diff --git a/README.md b/README.md
index 0a309ebe2d828fc1934570b857d24fb289fcb832..e1a50c87e26d493ba3ac760f357905d89aa40dab 100644
--- a/README.md
+++ b/README.md
@@ -7,14 +7,14 @@
 
 | **`Documentation`** | **`Linux CPU`** | **`Linux GPU`** | **`Mac OS CPU`** | **`Windows CPU`** | **`Android`** |
 |-----------------|---------------------|------------------|-------------------|---------------|---------------|
-| [![Documentation](https://img.shields.io/badge/api-reference-blue.svg)](https://www.tensorflow.org/api_docs/) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-cpu)](https://ci.tensorflow.org/job/tensorflow-master-cpu) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-linux-gpu)](https://ci.tensorflow.org/job/tensorflow-master-linux-gpu) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-mac)](https://ci.tensorflow.org/job/tensorflow-master-mac) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-win-cmake-py)](https://ci.tensorflow.org/job/tensorflow-master-win-cmake-py) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-android)](https://ci.tensorflow.org/job/tensorflow-master-android) [ ![Download](https://api.bintray.com/packages/google/tensorflow/tensorflow/images/download.svg) ](https://bintray.com/google/tensorflow/tensorflow/_latestVersion)
+| [![Documentation](https://img.shields.io/badge/api-reference-blue.svg)](https://www.tensorflow.org/api_docs/) | ![Build Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/ubuntu-cc.png) | ![Build Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/ubuntu-gpu-cc.png) | ![Build Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/macos-py2-cc.png) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-win-cmake-py)](https://ci.tensorflow.org/job/tensorflow-master-win-cmake-py) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-android)](https://ci.tensorflow.org/job/tensorflow-master-android) [ ![Download](https://api.bintray.com/packages/google/tensorflow/tensorflow/images/download.svg) ](https://bintray.com/google/tensorflow/tensorflow/_latestVersion)
 
 **TensorFlow** is an open source software library for numerical computation using
 data flow graphs.  The graph nodes represent mathematical operations, while
 the graph edges represent the multidimensional data arrays (tensors) that flow
 between them.  This flexible architecture enables you to deploy computation to one
 or more CPUs or GPUs in a desktop, server, or mobile device without rewriting
-code.  TensorFlow also includes TensorBoard, a data visualization toolkit.
+code.  TensorFlow also includes [TensorBoard](https://www.tensorflow.org/programmers_guide/summaries_and_tensorboard), a data visualization toolkit.
 
 TensorFlow was originally developed by researchers and engineers
 working on the Google Brain team within Google's Machine Intelligence Research
@@ -86,6 +86,7 @@ The TensorFlow project strives to abide by generally accepted best practices in
 
 * [TensorFlow Website](https://www.tensorflow.org)
 * [TensorFlow White Papers](https://www.tensorflow.org/about/bib)
+* [TensorFlow YouTube Channel](https://www.youtube.com/channel/UC0rqucBdTuFTjJiefW5t-IQ)
 * [TensorFlow Model Zoo](https://github.com/tensorflow/models)
 * [TensorFlow MOOC on Udacity](https://www.udacity.com/course/deep-learning--ud730)
 * [TensorFlow Course at Stanford](https://web.stanford.edu/class/cs20si)
diff --git a/RELEASE.md b/RELEASE.md
index c63d9f20c9a842ceed97afc25690073d082c42cb..e8459531748628fd822d876d79625fdd65798791 100644
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -9,6 +9,8 @@
 * Distributed Mutex / CriticalSection added to `tf.contrib.framework.CriticalSection`.
 * Better text processing with `tf.regex_replace`.
 * Easy, efficient sequence input with `tf.contrib.data.bucket_by_sequence_length`
+* Initial support for `tf.contrib.tensorrt` that enables native TensorRT in
+  TensorFlow.
 
 ## Bug Fixes and Other Changes
 * Accelerated Linear Algebra (XLA):
@@ -50,6 +52,15 @@
   * Support `float16` `dtype` in `tf.linalg.*`.
   * Add `tf.estimator.export.TensorServingInputReceiver` that allows `tf.estimator.Estimator.export_savedmodel` to pass raw tensors to model functions.
 
+## Deprecations
+
+* TensorFlow 1.7 may be the last time we support Cuda versions below 8.0.
+  Starting with TensorFlow 1.8 release, 8.0 will be the minimum supported
+  version.
+* TensorFlow 1.7 may be the last time we support cuDNN versions below 6.0.
+  Starting with TensorFlow 1.8 release, 6.0 will be the minimum supported
+  version.
+
 ## Thanks to our Contributors
 
 This release contains contributions from many people at Google, as well as:
diff --git a/configure.py b/configure.py
index 6744082d5d55c3a039b7a4efa7a539e77185cabd..8fb8979111627b9b25be80c77c611932880e011d 100644
--- a/configure.py
+++ b/configure.py
@@ -35,6 +35,7 @@ except ImportError:
 
 _DEFAULT_CUDA_VERSION = '9.0'
 _DEFAULT_CUDNN_VERSION = '7'
+_DEFAULT_NCCL_VERSION = '1.3'
 _DEFAULT_CUDA_COMPUTE_CAPABILITIES = '3.5,5.2'
 _DEFAULT_CUDA_PATH = '/usr/local/cuda'
 _DEFAULT_CUDA_PATH_LINUX = '/opt/cuda'
@@ -484,6 +485,8 @@ def set_cc_opt_flags(environ_cp):
   if is_ppc64le():
     # gcc on ppc64le does not support -march, use mcpu instead
     default_cc_opt_flags = '-mcpu=native'
+  elif is_windows():
+    default_cc_opt_flags = '/arch:AVX'
   else:
     default_cc_opt_flags = '-march=native'
   question = ('Please specify optimization flags to use during compilation when'
@@ -494,7 +497,7 @@ def set_cc_opt_flags(environ_cp):
   for opt in cc_opt_flags.split():
     write_to_bazelrc('build:opt --copt=%s' % opt)
   # It should be safe on the same build host.
-  if not is_ppc64le():
+  if not is_ppc64le() and not is_windows():
     write_to_bazelrc('build:opt --host_copt=-march=native')
   write_to_bazelrc('build:opt --define with_default_optimizations=true')
   # TODO(mikecase): Remove these default defines once we are able to get
@@ -1102,6 +1105,81 @@ def set_tf_tensorrt_install_path(environ_cp):
   write_action_env_to_bazelrc('TF_TENSORRT_VERSION', tf_tensorrt_version)
 
 
+def set_tf_nccl_install_path(environ_cp):
+  """Set NCCL_INSTALL_PATH and TF_NCCL_VERSION.
+
+  Args:
+    environ_cp: copy of the os.environ.
+
+  Raises:
+    ValueError: if this method was called under non-Linux platform.
+    UserInputError: if user has provided invalid input multiple times.
+  """
+  if not is_linux():
+    raise ValueError('Currently NCCL is only supported on Linux platforms.')
+
+  ask_nccl_version = (
+      'Please specify the NCCL version you want to use. '
+      '[Leave empty to default to NCCL %s]: ') % _DEFAULT_NCCL_VERSION
+
+  for _ in range(_DEFAULT_PROMPT_ASK_ATTEMPTS):
+    tf_nccl_version = get_from_env_or_user_or_default(
+        environ_cp, 'TF_NCCL_VERSION', ask_nccl_version, _DEFAULT_NCCL_VERSION)
+    tf_nccl_version = reformat_version_sequence(str(tf_nccl_version), 1)
+
+    if tf_nccl_version == '1':
+      break  # No need to get install path, NCCL 1 is a GitHub repo.
+
+    # TODO(csigg): Look with ldconfig first if we can find the library in paths
+    # like /usr/lib/x86_64-linux-gnu and the header file in the corresponding
+    # include directory. This is where the NCCL .deb packages install them.
+    # Then ask the user if we should use that. Instead of a single
+    # NCCL_INSTALL_PATH, pass separate NCCL_LIB_PATH and NCCL_HDR_PATH to
+    # nccl_configure.bzl
+    default_nccl_path = environ_cp.get('CUDA_TOOLKIT_PATH')
+    ask_nccl_path = (r'Please specify the location where NCCL %s library is '
+                     'installed. Refer to README.md for more details. [Default '
+                     'is %s]:') % (tf_nccl_version, default_nccl_path)
+    nccl_install_path = get_from_env_or_user_or_default(
+        environ_cp, 'NCCL_INSTALL_PATH', ask_nccl_path, default_nccl_path)
+
+    # Result returned from "read" will be used unexpanded. That make "~"
+    # unusable. Going through one more level of expansion to handle that.
+    nccl_install_path = os.path.realpath(os.path.expanduser(nccl_install_path))
+    if is_windows() or is_cygwin():
+      nccl_install_path = cygpath(nccl_install_path)
+
+    if is_windows():
+      nccl_lib_path = 'lib/x64/nccl.lib'
+    elif is_linux():
+      nccl_lib_path = 'lib/libnccl.so.%s' % tf_nccl_version
+    elif is_macos():
+      nccl_lib_path = 'lib/libnccl.%s.dylib' % tf_nccl_version
+
+    nccl_lib_path = os.path.join(nccl_install_path, nccl_lib_path)
+    nccl_hdr_path = os.path.join(nccl_install_path, 'include/nccl.h')
+    if os.path.exists(nccl_lib_path) and os.path.exists(nccl_hdr_path):
+      # Set NCCL_INSTALL_PATH
+      environ_cp['NCCL_INSTALL_PATH'] = nccl_install_path
+      write_action_env_to_bazelrc('NCCL_INSTALL_PATH', nccl_install_path)
+      break
+
+    # Reset and Retry
+    print('Invalid path to NCCL %s toolkit, %s or %s not found. Please use the '
+          'O/S agnostic package of NCCL 2' % (tf_nccl_version, nccl_lib_path,
+                                              nccl_hdr_path))
+
+    environ_cp['TF_NCCL_VERSION'] = ''
+  else:
+    raise UserInputError('Invalid TF_NCCL setting was provided %d '
+                         'times in a row. Assuming to be a scripting mistake.' %
+                         _DEFAULT_PROMPT_ASK_ATTEMPTS)
+
+  # Set TF_NCCL_VERSION
+  environ_cp['TF_NCCL_VERSION'] = tf_nccl_version
+  write_action_env_to_bazelrc('TF_NCCL_VERSION', tf_nccl_version)
+
+
 def get_native_cuda_compute_capabilities(environ_cp):
   """Get native cuda compute capabilities.
 
@@ -1438,6 +1516,8 @@ def main():
     set_tf_cudnn_version(environ_cp)
     if is_linux():
       set_tf_tensorrt_install_path(environ_cp)
+      set_tf_nccl_install_path(environ_cp)
+
     set_tf_cuda_compute_capabilities(environ_cp)
     if 'LD_LIBRARY_PATH' in environ_cp and environ_cp.get(
         'LD_LIBRARY_PATH') != '1':
diff --git a/tensorflow/BUILD b/tensorflow/BUILD
index 31e64793de52a13530ebbf5ccc0e38cf570b16fd..f2ad16fa04f5beb6616c58c28d0f0c460c3e3a17 100644
--- a/tensorflow/BUILD
+++ b/tensorflow/BUILD
@@ -401,323 +401,6 @@ package_group(
     ],
 )
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-            "g3doc/sitemap.md",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
-py_library(
-    name = "tensorflow_py",
-    srcs = ["__init__.py"],
-    srcs_version = "PY2AND3",
-    visibility = ["//visibility:public"],
-    deps = ["//tensorflow/python"],
-)
-
-py_library(
-    name = "experimental_tensorflow_py",
-    srcs = ["experimental_api.py"],
-    srcs_version = "PY2AND3",
-    visibility = ["//tensorflow/tools/api/tests:__subpackages__"],
-    deps = [
-        "//tensorflow/python",
-        "//tensorflow/tools/api/generator:python_api",
-    ],
-)
-
-filegroup(
-    name = "all_opensource_files",
-    data = [
-        ":all_files",
-        "//tensorflow/c:all_files",
-        "//tensorflow/cc:all_files",
-        "//tensorflow/cc/saved_model:all_files",
-        "//tensorflow/cc/saved_model/python:all_files",
-        "//tensorflow/cc/tools:all_files",
-        "//tensorflow/compiler/aot:all_files",
-        "//tensorflow/compiler/aot/tests:all_files",
-        "//tensorflow/compiler/jit:all_files",
-        "//tensorflow/compiler/jit/graphcycles:all_files",
-        "//tensorflow/compiler/jit/kernels:all_files",
-        "//tensorflow/compiler/jit/legacy_flags:all_files",
-        "//tensorflow/compiler/jit/ops:all_files",
-        "//tensorflow/compiler/plugin:all_files",
-        "//tensorflow/compiler/tests:all_files",
-        "//tensorflow/compiler/tf2xla:all_files",
-        "//tensorflow/compiler/tf2xla/cc:all_files",
-        "//tensorflow/compiler/tf2xla/kernels:all_files",
-        "//tensorflow/compiler/tf2xla/lib:all_files",
-        "//tensorflow/compiler/tf2xla/ops:all_files",
-        "//tensorflow/compiler/xla:all_files",
-        "//tensorflow/compiler/xla/client:all_files",
-        "//tensorflow/compiler/xla/client/lib:all_files",
-        "//tensorflow/compiler/xla/client/xla_client:all_files",
-        "//tensorflow/compiler/xla/legacy_flags:all_files",
-        "//tensorflow/compiler/xla/python:all_files",
-        "//tensorflow/compiler/xla/service:all_files",
-        "//tensorflow/compiler/xla/service/cpu:all_files",
-        "//tensorflow/compiler/xla/service/gpu:all_files",
-        "//tensorflow/compiler/xla/service/gpu/llvm_gpu_backend:all_files",
-        "//tensorflow/compiler/xla/service/interpreter:all_files",
-        "//tensorflow/compiler/xla/service/llvm_ir:all_files",
-        "//tensorflow/compiler/xla/tests:all_files",
-        "//tensorflow/compiler/xla/tools:all_files",
-        "//tensorflow/compiler/xla/tools/parser:all_files",
-        "//tensorflow/contrib:all_files",
-        "//tensorflow/contrib/all_reduce:all_files",
-        "//tensorflow/contrib/android:all_files",
-        "//tensorflow/contrib/autograph:all_files",
-        "//tensorflow/contrib/autograph/converters:all_files",
-        "//tensorflow/contrib/autograph/impl:all_files",
-        "//tensorflow/contrib/autograph/pyct:all_files",
-        "//tensorflow/contrib/autograph/pyct/static_analysis:all_files",
-        "//tensorflow/contrib/autograph/utils:all_files",
-        "//tensorflow/contrib/batching:all_files",
-        "//tensorflow/contrib/bayesflow:all_files",
-        "//tensorflow/contrib/boosted_trees:all_files",
-        "//tensorflow/contrib/boosted_trees/estimator_batch:all_files",
-        "//tensorflow/contrib/boosted_trees/lib:all_files",
-        "//tensorflow/contrib/boosted_trees/proto:all_files",
-        "//tensorflow/contrib/boosted_trees/resources:all_files",
-        "//tensorflow/contrib/cloud:all_files",
-        "//tensorflow/contrib/cloud/kernels:all_files",
-        "//tensorflow/contrib/cluster_resolver:all_files",
-        "//tensorflow/contrib/coder:all_files",
-        "//tensorflow/contrib/compiler:all_files",
-        "//tensorflow/contrib/copy_graph:all_files",
-        "//tensorflow/contrib/crf:all_files",
-        "//tensorflow/contrib/cudnn_rnn:all_files",
-        "//tensorflow/contrib/data:all_files",
-        "//tensorflow/contrib/data/kernels:all_files",
-        "//tensorflow/contrib/data/python/kernel_tests:all_files",
-        "//tensorflow/contrib/data/python/ops:all_files",
-        "//tensorflow/contrib/decision_trees/proto:all_files",
-        "//tensorflow/contrib/deprecated:all_files",
-        "//tensorflow/contrib/distributions:all_files",
-        "//tensorflow/contrib/eager/proto:all_files",
-        "//tensorflow/contrib/eager/python:all_files",
-        "//tensorflow/contrib/estimator:all_files",
-        "//tensorflow/contrib/factorization:all_files",
-        "//tensorflow/contrib/factorization/examples:all_files",
-        "//tensorflow/contrib/factorization/kernels:all_files",
-        "//tensorflow/contrib/feature_column:all_files",
-        "//tensorflow/contrib/ffmpeg:all_files",
-        "//tensorflow/contrib/ffmpeg/default:all_files",
-        "//tensorflow/contrib/framework:all_files",
-        "//tensorflow/contrib/fused_conv:all_files",
-        "//tensorflow/contrib/gan:all_files",
-        "//tensorflow/contrib/gdr:all_files",
-        "//tensorflow/contrib/graph_editor:all_files",
-        "//tensorflow/contrib/grid_rnn:all_files",
-        "//tensorflow/contrib/hooks:all_files",
-        "//tensorflow/contrib/hvx/clock_cycle_profiling:all_files",
-        "//tensorflow/contrib/hvx/hvx_ops_support_checker:all_files",
-        "//tensorflow/contrib/image:all_files",
-        "//tensorflow/contrib/input_pipeline:all_files",
-        "//tensorflow/contrib/input_pipeline/kernels:all_files",
-        "//tensorflow/contrib/integrate:all_files",
-        "//tensorflow/contrib/keras:all_files",
-        "//tensorflow/contrib/kernel_methods:all_files",
-        "//tensorflow/contrib/kfac:all_files",
-        "//tensorflow/contrib/kfac/examples:all_files",
-        "//tensorflow/contrib/kfac/examples/tests:all_files",
-        "//tensorflow/contrib/kfac/python/kernel_tests:all_files",
-        "//tensorflow/contrib/kfac/python/ops:all_files",
-        "//tensorflow/contrib/labeled_tensor:all_files",
-        "//tensorflow/contrib/layers:all_files",
-        "//tensorflow/contrib/layers/kernels:all_files",
-        "//tensorflow/contrib/learn:all_files",
-        "//tensorflow/contrib/learn/python/learn/datasets:all_files",
-        "//tensorflow/contrib/legacy_seq2seq:all_files",
-        "//tensorflow/contrib/libsvm:all_files",
-        "//tensorflow/contrib/linalg:all_files",
-        "//tensorflow/contrib/linear_optimizer:all_files",
-        "//tensorflow/contrib/lite:all_files",
-        "//tensorflow/contrib/lite/java:all_files",
-        "//tensorflow/contrib/lite/java/demo/app/src/main:all_files",
-        "//tensorflow/contrib/lite/java/demo/app/src/main/assets:all_files",
-        "//tensorflow/contrib/lite/java/src/main/native:all_files",
-        "//tensorflow/contrib/lite/java/src/testhelper/java/org/tensorflow/lite:all_files",
-        "//tensorflow/contrib/lite/kernels:all_files",
-        "//tensorflow/contrib/lite/kernels/internal:all_files",
-        "//tensorflow/contrib/lite/models/smartreply:all_files",
-        "//tensorflow/contrib/lite/nnapi:all_files",
-        "//tensorflow/contrib/lite/python:all_files",
-        "//tensorflow/contrib/lite/schema:all_files",
-        "//tensorflow/contrib/lite/testing:all_files",
-        "//tensorflow/contrib/lite/toco:all_files",
-        "//tensorflow/contrib/lite/toco/graph_transformations/tests:all_files",
-        "//tensorflow/contrib/lite/toco/python:all_files",
-        "//tensorflow/contrib/lite/toco/tensorflow_graph_matching:all_files",
-        "//tensorflow/contrib/lite/toco/tflite:all_files",
-        "//tensorflow/contrib/lite/tools:all_files",
-        "//tensorflow/contrib/lookup:all_files",
-        "//tensorflow/contrib/losses:all_files",
-        "//tensorflow/contrib/makefile:all_files",
-        "//tensorflow/contrib/memory_stats:all_files",
-        "//tensorflow/contrib/meta_graph_transform:all_files",
-        "//tensorflow/contrib/metrics:all_files",
-        "//tensorflow/contrib/model_pruning:all_files",
-        "//tensorflow/contrib/model_pruning/examples/cifar10:all_files",
-        "//tensorflow/contrib/nccl:all_files",
-        "//tensorflow/contrib/nearest_neighbor:all_files",
-        "//tensorflow/contrib/nn:all_files",
-        "//tensorflow/contrib/opt:all_files",
-        "//tensorflow/contrib/periodic_resample:all_files",
-        "//tensorflow/contrib/predictor:all_files",
-        "//tensorflow/contrib/quantize:all_files",
-        "//tensorflow/contrib/receptive_field:all_files",
-        "//tensorflow/contrib/reduce_slice_ops:all_files",
-        "//tensorflow/contrib/remote_fused_graph/pylib:all_files",
-        "//tensorflow/contrib/resampler:all_files",
-        "//tensorflow/contrib/rnn:all_files",
-        "//tensorflow/contrib/saved_model:all_files",
-        "//tensorflow/contrib/saved_model/cc/saved_model:all_files",
-        "//tensorflow/contrib/seq2seq:all_files",
-        "//tensorflow/contrib/session_bundle:all_files",
-        "//tensorflow/contrib/session_bundle/example:all_files",
-        "//tensorflow/contrib/signal:all_files",
-        "//tensorflow/contrib/slim:all_files",
-        "//tensorflow/contrib/slim/python/slim/data:all_files",
-        "//tensorflow/contrib/slim/python/slim/nets:all_files",
-        "//tensorflow/contrib/solvers:all_files",
-        "//tensorflow/contrib/sparsemax:all_files",
-        "//tensorflow/contrib/specs:all_files",
-        "//tensorflow/contrib/staging:all_files",
-        "//tensorflow/contrib/stat_summarizer:all_files",
-        "//tensorflow/contrib/stateless:all_files",
-        "//tensorflow/contrib/summary:all_files",
-        "//tensorflow/contrib/tensor_forest:all_files",
-        "//tensorflow/contrib/tensor_forest/hybrid:all_files",
-        "//tensorflow/contrib/tensor_forest/kernels/v4:all_files",
-        "//tensorflow/contrib/tensor_forest/proto:all_files",
-        "//tensorflow/contrib/tensorboard:all_files",
-        "//tensorflow/contrib/tensorboard/db:all_files",
-        "//tensorflow/contrib/tensorrt:all_files",
-        "//tensorflow/contrib/testing:all_files",
-        "//tensorflow/contrib/text:all_files",
-        "//tensorflow/contrib/tfprof:all_files",
-        "//tensorflow/contrib/timeseries:all_files",
-        "//tensorflow/contrib/timeseries/examples:all_files",
-        "//tensorflow/contrib/timeseries/python/timeseries:all_files",
-        "//tensorflow/contrib/timeseries/python/timeseries/state_space_models:all_files",
-        "//tensorflow/contrib/tpu:all_files",
-        "//tensorflow/contrib/tpu/profiler:all_files",
-        "//tensorflow/contrib/tpu/proto:all_files",
-        "//tensorflow/contrib/training:all_files",
-        "//tensorflow/contrib/util:all_files",
-        "//tensorflow/contrib/verbs:all_files",
-        "//tensorflow/core:all_files",
-        "//tensorflow/core/api_def:all_files",
-        "//tensorflow/core/common_runtime/eager:all_files",
-        "//tensorflow/core/debug:all_files",
-        "//tensorflow/core/distributed_runtime:all_files",
-        "//tensorflow/core/distributed_runtime/rpc:all_files",
-        "//tensorflow/core/grappler:all_files",
-        "//tensorflow/core/grappler/clusters:all_files",
-        "//tensorflow/core/grappler/costs:all_files",
-        "//tensorflow/core/grappler/inputs:all_files",
-        "//tensorflow/core/grappler/optimizers:all_files",
-        "//tensorflow/core/grappler/utils:all_files",
-        "//tensorflow/core/kernels:all_files",
-        "//tensorflow/core/kernels/batching_util:all_files",
-        "//tensorflow/core/kernels/data:all_files",
-        "//tensorflow/core/kernels/data/sql:all_files",
-        "//tensorflow/core/kernels/fuzzing:all_files",
-        "//tensorflow/core/kernels/hexagon:all_files",
-        "//tensorflow/core/kernels/neon:all_files",
-        "//tensorflow/core/lib/db:all_files",
-        "//tensorflow/core/ops/compat:all_files",
-        "//tensorflow/core/platform/cloud:all_files",
-        "//tensorflow/core/platform/default/build_config:all_files",
-        "//tensorflow/core/platform/hadoop:all_files",
-        "//tensorflow/core/platform/s3:all_files",
-        "//tensorflow/core/profiler:all_files",
-        "//tensorflow/core/profiler/internal:all_files",
-        "//tensorflow/core/profiler/internal/advisor:all_files",
-        "//tensorflow/core/util/ctc:all_files",
-        "//tensorflow/core/util/tensor_bundle:all_files",
-        "//tensorflow/examples/adding_an_op:all_files",
-        "//tensorflow/examples/android:all_files",
-        "//tensorflow/examples/benchmark:all_files",
-        "//tensorflow/examples/get_started/regression:all_files",
-        "//tensorflow/examples/how_tos/reading_data:all_files",
-        "//tensorflow/examples/image_retraining:all_files",
-        "//tensorflow/examples/label_image:all_files",
-        "//tensorflow/examples/learn:all_files",
-        "//tensorflow/examples/multibox_detector:all_files",
-        "//tensorflow/examples/saved_model:all_files",
-        "//tensorflow/examples/speech_commands:all_files",
-        "//tensorflow/examples/tutorials/estimators:all_files",
-        "//tensorflow/examples/tutorials/layers:all_files",
-        "//tensorflow/examples/tutorials/mnist:all_files",
-        "//tensorflow/examples/tutorials/monitors:all_files",
-        "//tensorflow/examples/tutorials/word2vec:all_files",
-        "//tensorflow/examples/wav_to_spectrogram:all_files",
-        "//tensorflow/go:all_files",
-        "//tensorflow/java:all_files",
-        "//tensorflow/java/src/main/java/org/tensorflow/examples:all_files",
-        "//tensorflow/java/src/main/native:all_files",
-        "//tensorflow/python:all_files",
-        "//tensorflow/python/data:all_files",
-        "//tensorflow/python/data/kernel_tests:all_files",
-        "//tensorflow/python/data/ops:all_files",
-        "//tensorflow/python/data/util:all_files",
-        "//tensorflow/python/debug:all_files",
-        "//tensorflow/python/eager:all_files",
-        "//tensorflow/python/estimator:all_files",
-        "//tensorflow/python/feature_column:all_files",
-        "//tensorflow/python/keras:all_files",
-        "//tensorflow/python/kernel_tests:all_files",
-        "//tensorflow/python/kernel_tests/distributions:all_files",
-        "//tensorflow/python/kernel_tests/linalg:all_files",
-        "//tensorflow/python/kernel_tests/random:all_files",
-        "//tensorflow/python/kernel_tests/testdata:all_files",
-        "//tensorflow/python/ops/distributions:all_files",
-        "//tensorflow/python/ops/linalg:all_files",
-        "//tensorflow/python/ops/losses:all_files",
-        "//tensorflow/python/profiler:all_files",
-        "//tensorflow/python/profiler/internal:all_files",
-        "//tensorflow/python/saved_model:all_files",
-        "//tensorflow/python/tools:all_files",
-        "//tensorflow/tools/api/generator:all_files",
-        "//tensorflow/tools/api/golden:all_files",
-        "//tensorflow/tools/api/lib:all_files",
-        "//tensorflow/tools/api/tests:all_files",
-        "//tensorflow/tools/benchmark:all_files",
-        "//tensorflow/tools/build_info:all_files",
-        "//tensorflow/tools/ci_build/gpu_build:all_files",
-        "//tensorflow/tools/common:all_files",
-        "//tensorflow/tools/compatibility:all_files",
-        "//tensorflow/tools/dist_test/server:all_files",
-        "//tensorflow/tools/docker:all_files",
-        "//tensorflow/tools/docker/notebooks:all_files",
-        "//tensorflow/tools/docs:all_files",
-        "//tensorflow/tools/git:all_files",
-        "//tensorflow/tools/graph_transforms:all_files",
-        "//tensorflow/tools/mlpbtxt:all_files",
-        "//tensorflow/tools/proto_text:all_files",
-        "//tensorflow/tools/quantization:all_files",
-        "//tensorflow/tools/test:all_files",
-        "//tensorflow/user_ops:all_files",
-        "//third_party/eigen3:all_files",
-        "//third_party/fft2d:all_files",
-        "//third_party/flatbuffers:all_files",
-        "//third_party/hadoop:all_files",
-        "//third_party/sycl:all_files",
-        "//third_party/sycl/sycl:all_files",
-    ],
-    visibility = ["//visibility:public"],
-)
-
 load(
     "//third_party/mkl:build_defs.bzl",
     "if_mkl",
@@ -767,11 +450,12 @@ tf_cc_shared_object(
     linkstatic = 1,
     visibility = ["//visibility:public"],
     deps = [
+        "//tensorflow/core:core_cpu_impl",
         "//tensorflow/core:framework_internal_impl",
+        "//tensorflow/core:gpu_runtime_impl",
+        "//tensorflow/core/grappler/optimizers:custom_graph_optimizer_registry_impl",
         "//tensorflow/core:lib_internal_impl",
-        "//tensorflow/core:core_cpu_impl",
         "//tensorflow/stream_executor:stream_executor_impl",
-        "//tensorflow/core:gpu_runtime_impl",
     ] + tf_additional_binary_deps(),
 )
 
@@ -851,3 +535,14 @@ exports_files(
         "tf_exported_symbols.lds",
     ],
 )
+
+py_library(
+    name = "tensorflow_py",
+    srcs = ["__init__.py"],
+    srcs_version = "PY2AND3",
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/python",
+        "//tensorflow/tools/api/generator:python_api",
+    ],
+)
diff --git a/tensorflow/__init__.py b/tensorflow/__init__.py
index 78ad6aec19f3bbbfcb389012ac1577573b3e4901..c8683e3976c90add3f1f54d8e575c798327e9273 100644
--- a/tensorflow/__init__.py
+++ b/tensorflow/__init__.py
@@ -20,14 +20,19 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+# pylint: disable=g-bad-import-order
+from tensorflow.python import pywrap_tensorflow  # pylint: disable=unused-import
 # pylint: disable=wildcard-import
-from tensorflow.python import *  # pylint: disable=redefined-builtin
+from tensorflow.tools.api.generator.api import *  # pylint: disable=redefined-builtin
 # pylint: enable=wildcard-import
 
 from tensorflow.python.util.lazy_loader import LazyLoader
 contrib = LazyLoader('contrib', globals(), 'tensorflow.contrib')
 del LazyLoader
 
+from tensorflow.python.platform import flags  # pylint: disable=g-import-not-at-top
+app.flags = flags  # pylint: disable=undefined-variable
+
 del absolute_import
 del division
 del print_function
diff --git a/tensorflow/c/BUILD b/tensorflow/c/BUILD
index 426f97b84472ba475b7b16ea49b64b4671ba6e74..8a9301d584775cff3ae315e6fd856b00d1734248 100644
--- a/tensorflow/c/BUILD
+++ b/tensorflow/c/BUILD
@@ -34,6 +34,8 @@ filegroup(
         exclude = [
             "c_api_experimental.cc",
             "c_api_experimental.h",
+            "python_api.cc",
+            "python_api.h",
             "*test*",
         ],
     ),
@@ -120,6 +122,7 @@ tf_cuda_library(
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
+        "//tensorflow/core:lib_platform",
         "//tensorflow/core:protos_all_cc",
     ],
 )
@@ -281,20 +284,7 @@ tf_cuda_library(
     deps = [
         ":c_api",
         ":c_api_internal",
+        # TODO(b/74620627): remove when _USE_C_SHAPES is removed
+        "//tensorflow/python:cpp_shape_inference_proto_cc",
     ],
 )
-
-# -----------------------------------------------------------------------------
-# Google-internal targets.
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/c/c_api.h b/tensorflow/c/c_api.h
index b32f574628c4d1dc5c3bb3f1265a1b12adee28bc..fe85f8ee0ed2c58c3ba9201a9ca895c9ec48c022 100644
--- a/tensorflow/c/c_api.h
+++ b/tensorflow/c/c_api.h
@@ -1496,7 +1496,8 @@ TF_CAPI_EXPORT extern int TF_DeviceListCount(const TF_DeviceList* list);
 // If index is out of bounds, an error code will be set in the status object,
 // and a null pointer will be returned.
 TF_CAPI_EXPORT extern const char* TF_DeviceListName(const TF_DeviceList* list,
-                                                    int index, TF_Status*);
+                                                    int index,
+                                                    TF_Status* status);
 
 // Retrieves the type of the device at the given index.
 //
@@ -1506,14 +1507,15 @@ TF_CAPI_EXPORT extern const char* TF_DeviceListName(const TF_DeviceList* list,
 // If index is out of bounds, an error code will be set in the status object,
 // and a null pointer will be returned.
 TF_CAPI_EXPORT extern const char* TF_DeviceListType(const TF_DeviceList* list,
-                                                    int index, TF_Status*);
+                                                    int index,
+                                                    TF_Status* status);
 
 // Retrieve the amount of memory associated with a given device.
 //
 // If index is out of bounds, an error code will be set in the status object,
 // and -1 will be returned.
 TF_CAPI_EXPORT extern int64_t TF_DeviceListMemoryBytes(
-    const TF_DeviceList* list, int index, TF_Status*);
+    const TF_DeviceList* list, int index, TF_Status* status);
 
 // --------------------------------------------------------------------------
 // Load plugins containing custom ops and kernels
diff --git a/tensorflow/c/c_api_experimental.cc b/tensorflow/c/c_api_experimental.cc
index bea93785717e2161fcec941485ac3c3f7f3e3ed5..9678ee926fccc138cd69052107620fe5c5fda930 100644
--- a/tensorflow/c/c_api_experimental.cc
+++ b/tensorflow/c/c_api_experimental.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/graph/node_builder.h"
 #include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/platform.h"
 #include "tensorflow/core/protobuf/config.pb.h"
 
 using tensorflow::FunctionDef;
@@ -56,57 +57,6 @@ void TF_EnableXLACompilation(TF_SessionOptions* options, unsigned char enable) {
   }
 }
 
-void TF_InitializeTPU(TF_Session* session, TF_Status* status) {
-  VLOG(1) << "Initializing TPU";
-  TF_Operation* config_op =
-      TF_GraphOperationByName(session->graph, "ConfigureDistributedTPU");
-  if (config_op == nullptr) {
-    status->status = tensorflow::errors::Internal(
-        "Unable to find node ConfigureDistributedTPU in the TF graph.");
-    return;
-  }
-
-  TF_Output config_node{config_op, 0};
-
-  TF_Tensor* dummy_output;
-  TF_SessionRun(session, /*run_options*/ nullptr,
-                // input related parameters
-                /*inputs*/ nullptr, /*input_values*/ nullptr, /*ninputs*/ 0,
-                // output related parameters
-                /*outputs*/ &config_node, /*output_values*/ &dummy_output,
-                /*noutputs*/ 1,
-                /*targets*/ nullptr, /*ntargets*/ 0,
-                /*run_metadata*/ nullptr, status);
-  if (status->status.ok()) {
-    TF_DeleteTensor(dummy_output);
-  }
-}
-
-void TF_ShutdownTPU(TF_Session* session, TF_Status* status) {
-  {
-    tensorflow::mutex_lock c(session->graph->mu);
-    VLOG(1) << "Shutting down TPU, with input graph: "
-            << session->graph->graph.ToGraphDefDebug().DebugString();
-  }
-
-  TF_Operation* shutdown_op =
-      TF_GraphOperationByName(session->graph, "ShutdownDistributedTPU");
-  if (shutdown_op == nullptr) {
-    status->status = tensorflow::errors::Internal(
-        "Unable to find node ShutdownDistributedTPU in the TF graph.");
-    return;
-  }
-
-  TF_SessionRun(session, /*run_options*/ nullptr,
-                // input related parameters
-                /*inputs*/ nullptr, /*input_values*/ nullptr, /*ninputs*/ 0,
-                // output related parameters
-                /*outputs*/ nullptr, /*output_values*/ nullptr,
-                /*noutputs*/ 0,
-                /*targets*/ &shutdown_op, /*ntargets*/ 1,
-                /*run_metadata*/ nullptr, status);
-}
-
 const char* TF_GraphDebugString(TF_Graph* graph, size_t* len) {
   tensorflow::mutex_lock c(graph->mu);
   const auto& debug_str = graph->graph.ToGraphDefDebug().DebugString();
@@ -240,6 +190,12 @@ library {
 //  be deleted by calling TF_DeleteFunction.
 static std::vector<UniqueFuncPtr> CreateImagenetDatasetFunctions(
     const char* file_path, std::string* dataset_name, TF_Status* status) {
+#if defined(PLATFORM_WINDOWS)
+  status->status = tensorflow::errors::Unimplemented(
+      "TF_MakeFileBasedIteratorGetNextWithDatasets in the experimental C API "
+      "is not implemented for Windows");
+  return std::vector<UniqueFuncPtr>();
+#else
   const char* func_def = R"PREFIX(
 library {
   function {
@@ -7118,6 +7074,7 @@ library {
         DCHECK(found);
       };
   return CreateFunctionsFromTextProto(func_def, &mutate_proto_func, status);
+#endif
 }
 
 //  On success, returns a set of TF_Function instances encoding a dataset
@@ -7127,6 +7084,12 @@ library {
 static std::vector<UniqueFuncPtr> CreateMNISTDatasetFunctions(
     const char* file_path, int batch_size, std::string* dataset_name,
     TF_Status* status) {
+#if defined(PLATFORM_WINDOWS)
+  status->status = tensorflow::errors::Unimplemented(
+      "TF_MakeFileBasedIteratorGetNextWithDatasets in the experimental C API "
+      "is not implemented for Windows");
+  return nullptr;
+#else
   const char* func_def = R"PREFIX(
 library {
   function {
@@ -8256,6 +8219,7 @@ library {
         DCHECK(found_batch_size);
       };
   return CreateFunctionsFromTextProto(func_def, &mutate_proto_func, status);
+#endif
 }
 
 // Adds the input functions to `graph`.  On success, returns the created
diff --git a/tensorflow/c/c_api_experimental.h b/tensorflow/c/c_api_experimental.h
index ebcec8176b63f9a91c847ebe96fba3ff023fc599..666342974ee0a51b707955cf7468e914fead85b3 100644
--- a/tensorflow/c/c_api_experimental.h
+++ b/tensorflow/c/c_api_experimental.h
@@ -60,27 +60,6 @@ extern "C" {
 TF_CAPI_EXPORT extern void TF_EnableXLACompilation(TF_SessionOptions* options,
                                                    unsigned char enable);
 
-// Initializes TPU system. Must be called exactly once before TF_SessionRun() is
-// called on a TPU graph.
-//
-// The session graph must contain a node named ConfigureDistributedTPU.
-// TODO(b/74774824): Improve the API on initializing TPU system.
-TF_CAPI_EXPORT extern void TF_InitializeTPU(TF_Session* session,
-                                            TF_Status* status);
-
-// Shuts down TPU system. For any `session` where TF_InitializeTPU() has
-// been successfully called, this call must be made exactly once before the
-// session is closed.
-// The session graph must contain a node named ShutdownDistributedTPU.
-TF_CAPI_EXPORT extern void TF_ShutdownTPU(TF_Session* session,
-                                          TF_Status* status);
-
-// Returns the graph content in a human-readable format, with length set in
-// `len`. The format is subject to change in the future.
-// The returned string is heap-allocated, and caller should call free() on it.
-TF_CAPI_EXPORT extern const char* TF_GraphDebugString(TF_Graph* graph,
-                                                      size_t* len);
-
 // Returns the graph content in a human-readable format, with length set in
 // `len`. The format is subject to change in the future.
 // The returned string is heap-allocated, and caller should call free() on it.
diff --git a/tensorflow/c/c_api_test.cc b/tensorflow/c/c_api_test.cc
index 028f146be31790b211e546978302e81afe26b231..ca80db23ed3ccbbdc49c61db6cd03ff735470512 100644
--- a/tensorflow/c/c_api_test.cc
+++ b/tensorflow/c/c_api_test.cc
@@ -53,7 +53,7 @@ Status TF_TensorToTensor(const TF_Tensor* src, Tensor* dst);
 namespace {
 
 static void ExpectHasSubstr(StringPiece s, StringPiece expected) {
-  EXPECT_TRUE(StringPiece(s).contains(expected))
+  EXPECT_TRUE(str_util::StrContains(s, expected))
       << "'" << s << "' does not contain '" << expected << "'";
 }
 
diff --git a/tensorflow/c/eager/BUILD b/tensorflow/c/eager/BUILD
index 8df7b5662353e98eb82a13b9e65819a8f4d6261a..a2d96357ac8a55be7fe03bf58e33ff1733967dd1 100644
--- a/tensorflow/c/eager/BUILD
+++ b/tensorflow/c/eager/BUILD
@@ -30,6 +30,8 @@ tf_cuda_library(
             "//tensorflow/core:core_cpu",
             "//tensorflow/core/common_runtime/eager:context",
             "//tensorflow/core/common_runtime/eager:eager_executor",
+            "//tensorflow/core/common_runtime/eager:execute",
+            "//tensorflow/core/common_runtime/eager:execute_node",
             "//tensorflow/core/common_runtime/eager:kernel_and_device",
             "//tensorflow/core/common_runtime/eager:tensor_handle",
             "//tensorflow/core/common_runtime/eager:copy_to_device_node",
diff --git a/tensorflow/c/eager/c_api.cc b/tensorflow/c/eager/c_api.cc
index eaeb2fd07a3fdc2bfca97afc799bd65609955609..c96a38dec3ed7bcbbd77415ec3b158390def797e 100644
--- a/tensorflow/c/eager/c_api.cc
+++ b/tensorflow/c/eager/c_api.cc
@@ -33,6 +33,8 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/device_mgr.h"
 #include "tensorflow/core/common_runtime/device_set.h"
 #include "tensorflow/core/common_runtime/eager/copy_to_device_node.h"
+#include "tensorflow/core/common_runtime/eager/execute.h"
+#include "tensorflow/core/common_runtime/eager/execute_node.h"
 #include "tensorflow/core/common_runtime/function.h"
 #include "tensorflow/core/common_runtime/rendezvous_mgr.h"
 #include "tensorflow/core/framework/node_def_util.h"
@@ -199,18 +201,24 @@ TF_Tensor* TFE_TensorHandleResolve(TFE_TensorHandle* h, TF_Status* status) {
   const tensorflow::Tensor* t = nullptr;
   status->status = h->handle->TensorAndDevice(&t, &d, &op_device);
   if (!status->status.ok()) return nullptr;
+  tensorflow::TensorHandle* h_cpu = nullptr;
   if (!IsCPU(d)) {
-    TF_SetStatus(status, TF_UNIMPLEMENTED,
-                 tensorflow::strings::StrCat(
-                     "TFE_TensorHandle can be resolved iff it is on CPU (this "
-                     "handle is on ",
-                     d->name(),
-                     "). Consider using TFE_TensorHandleCopyToDevice to get a "
-                     "copy of the tensor on CPU")
-                     .c_str());
-    return nullptr;
+    status->status = h->handle->CopyToDevice(
+        h->handle->Context(), h->handle->Context()->HostCPU(), &h_cpu);
+    if (!status->status.ok()) {
+      return nullptr;
+    }
+    status->status = h_cpu->TensorAndDevice(&t, &d, &op_device);
+    if (!status->status.ok()) {
+      h_cpu->Unref();
+      return nullptr;
+    }
   }
-  return tensorflow::TF_TensorFromTensor(*t, status);
+  TF_Tensor* retval = tensorflow::TF_TensorFromTensor(*t, status);
+  if (h_cpu != nullptr) {
+    h_cpu->Unref();
+  }
+  return retval;
 }
 }  // extern "C"
 
@@ -256,17 +264,6 @@ void TFE_OpSetXLACompilation(TFE_Op* op, unsigned char enable) {
 }
 
 void TFE_OpAddInput(TFE_Op* op, TFE_TensorHandle* h, TF_Status* status) {
-  if (op->device == nullptr) {
-    // Questionable heuristic ...
-    // - If a device was explicitly set on the op, always use that.
-    // - If not, place on the first non-host device seen.
-    tensorflow::Device* d = nullptr;
-    // TODO(agarwal): This call may block if h is not ready. Avoid this if
-    // possible.
-    status->status = h->handle->Device(&d);
-    if (!status->status.ok()) return;
-    if (!IsCPU(d)) op->device = d;
-  }
   h->handle->Ref();
   op->inputs.push_back(h->handle);
   op->attrs.NumInputs(op->inputs.size());
@@ -434,41 +431,39 @@ void TFE_OpSetAttrFunctionList(TFE_Op* op, const char* attr_name,
 
 namespace {
 
-// TODO(apassos) move to TensorHandle
-tensorflow::TensorHandle* TFE_TensorHandleCopyToDevice_Internal(
-    tensorflow::TensorHandle* h, TFE_Context* ctx, const char* device_name,
-    TF_Status* status) {
-  status->status = ctx->context.GetStatus();
-  if (!status->status.ok()) {
-    return nullptr;
+// Initializes the step stats if needed.
+void MaybeInitializeStepStats(tensorflow::StepStats* step_stats,
+                              tensorflow::EagerContext* ctx) {
+  // Lazily initialize the RunMetadata with information about all devices if
+  // this is the first call.
+  while (step_stats->dev_stats_size() < ctx->devices()->size()) {
+    int device_idx = step_stats->dev_stats_size();
+    auto* dev_stats = step_stats->add_dev_stats();
+    dev_stats->set_device(ctx->devices()->at(device_idx)->name());
   }
-  tensorflow::Device* dstd = ctx->context.HostCPU();
-  if (device_name != nullptr && strlen(device_name) > 0) {
-    status->status =
-        ctx->context.device_mgr()->LookupDevice(device_name, &dstd);
-    if (!status->status.ok()) return nullptr;
+}
+
+int StepStatsDeviceIndex(tensorflow::StepStats* step_stats,
+                         tensorflow::EagerContext* ctx,
+                         tensorflow::Device* device) {
+  // Find the current device's index.
+  if (device == nullptr) {
+    device = ctx->HostCPU();
   }
-  if (ctx->context.Async()) {
-    // Note that `h` may not be currently ready. However execution order will
-    // make sure that `h` is ready before the copy is actually done.
-    tensorflow::CopyToDeviceNode* node =
-        new tensorflow::CopyToDeviceNode(h, dstd, &ctx->context);
-    tensorflow::TensorHandle* output = node->dst();
-    // Note that calling Add makes `node` accessible by the EagerExecutor
-    // thread. So further accesses need to be thread-safe.
-    ctx->context.ExecutorAdd(node);
-    return output;
-  } else {
-    tensorflow::TensorHandle* output = nullptr;
-    status->status = h->CopyToDevice(&ctx->context, dstd, &output);
-    return output;
+  for (int i = 0; i < ctx->devices()->size(); ++i) {
+    if (ctx->devices()->at(i) == device ||
+        ctx->devices()->at(i)->name() == device->name()) {
+      return i;
+    }
   }
+  // TODO(apassos) do not fall back to host CPU if device is unknown.
+  return 0;
 }
 
 tensorflow::Status ValidateInputTypeAndPlacement(
-    TFE_Context* ctx, tensorflow::Device* host_device,
-    tensorflow::Device* op_device, TFE_Op* op,
-    const tensorflow::OpKernel* kernel) {
+    tensorflow::EagerContext* ctx, tensorflow::Device* op_device, TFE_Op* op,
+    const tensorflow::OpKernel* kernel, tensorflow::RunMetadata* run_metadata) {
+  tensorflow::Device* host_device = ctx->HostCPU();
   const tensorflow::MemoryTypeVector& memtypes = kernel->input_memory_types();
   if (memtypes.size() != op->inputs.size()) {
     return tensorflow::errors::InvalidArgument(
@@ -483,8 +478,8 @@ tensorflow::Status ValidateInputTypeAndPlacement(
     const tensorflow::Device* actual_device =
         handle_device == nullptr ? host_device : handle_device;
     if (expected_device != actual_device) {
-      switch (TFE_ContextGetDevicePlacementPolicy(ctx)) {
-        case TFE_DEVICE_PLACEMENT_SILENT_FOR_INT32:
+      switch (ctx->GetDevicePlacementPolicy()) {
+        case tensorflow::DEVICE_PLACEMENT_SILENT_FOR_INT32:
           // TODO(xpan): See if we could bubble python related error up
           // to python level.
           if (handle->dtype == tensorflow::DT_INT32) {
@@ -493,7 +488,7 @@ tensorflow::Status ValidateInputTypeAndPlacement(
             break;
           }
           TF_FALLTHROUGH_INTENDED;
-        case TFE_DEVICE_PLACEMENT_EXPLICIT:
+        case tensorflow::DEVICE_PLACEMENT_EXPLICIT:
           return tensorflow::errors::InvalidArgument(
               "Tensors on conflicting devices:"
               " cannot compute ",
@@ -501,11 +496,13 @@ tensorflow::Status ValidateInputTypeAndPlacement(
               expected_device->name(), " but is actually on ",
               actual_device->name(), " (operation running on ",
               op_device->name(), ")",
-              " Tensors can be copied explicitly using .gpu() or .cpu(),"
-              " or transparently copied by using tfe.enable_eager_execution("
-              "tfe.DEVICE_PLACEMENT_SILENT). Copying tensors between devices"
+              " Tensors can be copied explicitly using .gpu() or .cpu() "
+              "methods,"
+              " or transparently copied by using tf.enable_eager_execution("
+              "device_policy=tfe.DEVICE_PLACEMENT_SILENT). Copying tensors "
+              "between devices"
               " may slow down your model");
-        case TFE_DEVICE_PLACEMENT_WARN:
+        case tensorflow::DEVICE_PLACEMENT_WARN:
           LOG(WARNING) << "before computing " << op->name << " input #" << i
                        << " was expected to be on " << expected_device->name()
                        << " but is actually on " << actual_device->name()
@@ -513,17 +510,27 @@ tensorflow::Status ValidateInputTypeAndPlacement(
                        << "). This triggers a copy which can be a performance "
                           "bottleneck.";
           break;
-        case TFE_DEVICE_PLACEMENT_SILENT:  // Do nothing.
+        case tensorflow::DEVICE_PLACEMENT_SILENT:  // Do nothing.
           break;
       }
       // We are only here if the policy is warn or silent copies, so we should
       // trigger a copy.
-      TF_Status* s = TF_NewStatus();
-      tensorflow::TensorHandle* copied_tensor =
-          TFE_TensorHandleCopyToDevice_Internal(
-              handle, ctx, expected_device->name().c_str(), s);
-      tensorflow::Status status = s->status;
-      TF_DeleteStatus(s);
+      auto pre_time = tensorflow::Env::Default()->NowMicros();
+      tensorflow::TensorHandle* copied_tensor = nullptr;
+      tensorflow::Status status = tensorflow::EagerCopyToDevice(
+          handle, ctx, expected_device->name().c_str(), &copied_tensor);
+      if (run_metadata != nullptr) {
+        auto* step_stats = run_metadata->mutable_step_stats();
+        MaybeInitializeStepStats(step_stats, ctx);
+        // Record the sending on the source device for now.
+        int device_idx = StepStatsDeviceIndex(step_stats, ctx, handle_device);
+        auto* dev_stats = step_stats->mutable_dev_stats(device_idx);
+        auto* node_stats = dev_stats->add_node_stats();
+        node_stats->set_node_name("_Send");
+        node_stats->set_all_start_micros(pre_time);
+        node_stats->set_op_end_rel_micros(
+            tensorflow::Env::Default()->NowMicros() - pre_time);
+      }
       if (!status.ok()) {
         if (copied_tensor != nullptr) copied_tensor->Unref();
         return tensorflow::errors::Internal(
@@ -574,145 +581,6 @@ tensorflow::Device* SelectDevice(const tensorflow::NodeDef& ndef,
   return nullptr;
 }
 
-tensorflow::Status Execute(
-    TFE_Context* ctx, tensorflow::Device* device,
-    const tensorflow::gtl::InlinedVector<tensorflow::TensorHandle*, 4>&
-        op_inputs,
-    tensorflow::KernelAndDevice* kernel, tensorflow::NodeExecStats* maybe_stats,
-    tensorflow::TensorHandle** retvals, int num_retvals) {
-  if (!ctx->context.SoftPlacement() && device == nullptr) {
-    device = ctx->context.HostCPU();
-  }
-
-  if (device == nullptr) {
-    // TODO(apassos) debug how the assignment below might return a different
-    // device from the one requested above.
-    device = kernel->device();
-  }
-
-  std::vector<tensorflow::Tensor> outputs(1);
-  const tensorflow::MemoryTypeVector* output_memory_types = nullptr;
-  output_memory_types = &kernel->kernel()->output_memory_types();
-  std::vector<tensorflow::Tensor> inputs(op_inputs.size());
-  for (int i = 0; i < op_inputs.size(); ++i) {
-    const tensorflow::Tensor* input_tensor = nullptr;
-    TF_RETURN_IF_ERROR(op_inputs[i]->Tensor(&input_tensor));
-    inputs[i] = *input_tensor;
-  }
-  // WARNING: kernel->Run utilizes the FunctionLibraryRuntime
-  // (ctx->func_lib(device)), which in turn holds a pointer to func_lib_def.
-  // But knowledge of the implementation
-  // of FunctionLibraryRuntime tells us that func_lib_def is not accessed by
-  // FunctionLibraryRuntime::Run(), so there is no thread-safety concern here.
-  // This is quite subtle. Re-work things to make this better?  (Would it make
-  // sense for FunctionLibraryRuntime to ensure thread-safe access to
-  // FunctionLibraryDefinition?).  TODO(apassos) figure out how to record stats
-  // for ops which are a part of functions.
-  // TODO(agarwal): change Run to take vector of handles ?
-  TF_RETURN_IF_ERROR(kernel->Run(&inputs, &outputs, maybe_stats));
-  if (maybe_stats != nullptr) {
-    maybe_stats->set_op_end_rel_micros(tensorflow::Env::Default()->NowMicros() -
-                                       maybe_stats->all_start_micros());
-    tensorflow::mutex_lock ml(*ctx->context.MetadataMu());
-    if (ctx->context.ShouldStoreMetadata()) {
-      auto* step_stats = ctx->context.RunMetadataProto()->mutable_step_stats();
-      // Lazily initialize the RunMetadata with information about all devices if
-      // this is the first call.
-      while (step_stats->dev_stats_size() < ctx->context.devices()->size()) {
-        step_stats->add_dev_stats();
-      }
-      // Find the current device's index.
-      int device_idx = 0;
-      for (int i = 0; i < ctx->context.devices()->size(); ++i) {
-        if (ctx->context.devices()->at(i) == device) {
-          device_idx = i;
-          break;
-        }
-      }
-      // Populate the device stats for this device.
-      auto* dev_stats = step_stats->mutable_dev_stats(device_idx);
-      dev_stats->set_device(device->name());
-      *dev_stats->add_node_stats() = *maybe_stats;
-    }
-  }
-  DCHECK_EQ(num_retvals, outputs.size());
-  tensorflow::Device* op_device = IsCPU(device) ? nullptr : device;
-  for (int i = 0; i < num_retvals; ++i) {
-    tensorflow::Device* d = op_device;
-    if (d != nullptr && output_memory_types != nullptr &&
-        (*output_memory_types)[i] == tensorflow::HOST_MEMORY) {
-      d = nullptr;
-    }
-    if (retvals[i] == nullptr) {
-      retvals[i] = new tensorflow::TensorHandle(outputs[i], d, op_device);
-    } else {
-      retvals[i]->SetTensorAndDevice(outputs[i], d, op_device);
-    }
-  }
-  return tensorflow::Status::OK();
-}
-
-// TODO(agarwal): move EagerExecutor and EagerNode related code to a separate
-// file.
-class ExecuteNode : public tensorflow::EagerNode {
- public:
-  ExecuteNode(TFE_Op* op, tensorflow::KernelAndDevice* kernel,
-              tensorflow::NodeExecStats* maybe_stats,
-              const tensorflow::DataTypeVector& output_dtypes,
-              TFE_TensorHandle** retvals, int num_retvals)
-      : tensorflow::EagerNode(op->ctx->context.NextId()),
-        ctx_(op->ctx),
-        op_device_(op->device),
-        inputs_(op->inputs),
-        kernel_(kernel),
-        maybe_stats_(maybe_stats),
-        retvals_(num_retvals) {
-    for (auto handle : inputs_) {
-      handle->Ref();
-    }
-    TFE_Context* ctx = op->ctx;
-    for (int i = 0; i < num_retvals; ++i) {
-      tensorflow::TensorHandle* h =
-          new tensorflow::TensorHandle(id, output_dtypes[i], &ctx->context);
-      h->Ref();
-      retvals[i] = new TFE_TensorHandle(h);
-      retvals_[i] = h;
-    }
-  }
-
-  ~ExecuteNode() override {
-    for (auto handle : inputs_) {
-      handle->Unref();
-    }
-    for (auto handle : retvals_) {
-      handle->Unref();
-    }
-  }
-
-  tensorflow::Status Run() override {
-    const tensorflow::Status status =
-        Execute(ctx_, op_device_, inputs_, kernel_, maybe_stats_.get(),
-                retvals_.begin(), retvals_.size());
-    if (status.ok()) {
-      return status;
-    } else {
-      return tensorflow::Status(
-          status.code(),
-          tensorflow::strings::StrCat("Got error, \"", status.error_message(),
-                                      "\" while executing kernel ",
-                                      kernel_->kernel()->def().DebugString()));
-    }
-  }
-
- private:
-  TFE_Context* ctx_;
-  tensorflow::Device* op_device_;
-  tensorflow::gtl::InlinedVector<tensorflow::TensorHandle*, 4> inputs_;
-  tensorflow::KernelAndDevice* kernel_;
-  std::unique_ptr<tensorflow::NodeExecStats> maybe_stats_;
-  tensorflow::gtl::InlinedVector<tensorflow::TensorHandle*, 2> retvals_;
-};
-
 
 #ifdef TENSORFLOW_EAGER_USE_XLA
 // Synthesizes and returns a wrapper function over `op`, which must be a
@@ -956,8 +824,12 @@ void TFE_Execute(TFE_Op* op, TFE_TensorHandle** retvals, int* num_retvals,
     tensorflow::Device* input_op_device = nullptr;
     status->status = op->inputs[i]->OpDevice(&input_op_device);
     if (!status->status.ok()) return;
+    VLOG(2) << "for op " << op->name << " input " << i << " "
+            << tensorflow::DataTypeString(op->inputs[i]->dtype) << " "
+            << (input_op_device == nullptr ? "cpu" : input_op_device->name())
+            << " " << (op->device == nullptr ? "cpu" : op->device->name());
     if (op->inputs[i]->dtype == tensorflow::DT_RESOURCE &&
-        input_op_device != op->device) {
+        (input_op_device != op->device || input_op_device == nullptr)) {
       tensorflow::Device* d =
           input_op_device == nullptr ? ctx->context.HostCPU() : input_op_device;
       VLOG(1) << "Changing device of operation " << op->name << " to "
@@ -967,16 +839,13 @@ void TFE_Execute(TFE_Op* op, TFE_TensorHandle** retvals, int* num_retvals,
     }
   }
   tensorflow::Device* device = op->device;
-  if (!ctx->context.SoftPlacement() && device == nullptr) {
-    device = ctx->context.HostCPU();
-  }
 
   tensorflow::Fprint128 cache_key =
       op->attrs.CacheKey(device == nullptr ? "unspecified" : device->name());
   tensorflow::KernelAndDevice* kernel = ctx->context.GetCachedKernel(cache_key);
   if (kernel == nullptr) {
     const tensorflow::NodeDef& ndef = op->attrs.BuildNodeDef();
-    if (ctx->context.SoftPlacement() && device == nullptr) {
+    if (device == nullptr) {
       device = SelectDevice(ndef, ctx, status);
       if (!status->status.ok()) {
         return;
@@ -1037,8 +906,10 @@ void TFE_Execute(TFE_Op* op, TFE_TensorHandle** retvals, int* num_retvals,
     // device from the one requested above.
     device = kernel->device();
   }
-  status->status = ValidateInputTypeAndPlacement(ctx, ctx->context.HostCPU(),
-                                                 device, op, kernel->kernel());
+  status->status = ValidateInputTypeAndPlacement(
+      &ctx->context, device, op, kernel->kernel(),
+      ctx->context.ShouldStoreMetadata() ? ctx->context.RunMetadataProto()
+                                         : nullptr);
   if (!status->status.ok()) return;
   std::unique_ptr<tensorflow::NodeExecStats> maybe_stats;
   if (ctx->context.ShouldStoreMetadata()) {
@@ -1053,18 +924,27 @@ void TFE_Execute(TFE_Op* op, TFE_TensorHandle** retvals, int* num_retvals,
     // Note that for async mode, execution order will make sure that all
     // input handles are ready before executing them.
     // TODO(agarwal): Consider executing "cheap" kernels inline for performance.
-    tensorflow::EagerNode* node =
-        new ExecuteNode(op, kernel, maybe_stats.release(), output_dtypes,
-                        retvals, *num_retvals);
+    tensorflow::gtl::InlinedVector<tensorflow::TensorHandle*, 2> handle_retvals(
+        *num_retvals);
+    tensorflow::uint64 id = op->ctx->context.NextId();
+    for (int i = 0; i < *num_retvals; ++i) {
+      tensorflow::TensorHandle* h =
+          new tensorflow::TensorHandle(id, output_dtypes[i], &op->ctx->context);
+      retvals[i] = new TFE_TensorHandle(h);
+      handle_retvals[i] = h;
+    }
+    tensorflow::EagerNode* node = new tensorflow::ExecuteNode(
+        id, &op->ctx->context, op->device, op->inputs, kernel,
+        maybe_stats.release(), output_dtypes, handle_retvals);
     ctx->context.ExecutorAdd(node);
   } else {
     // Execute checks if retvals[i] is nullptr or not to figure if it needs to
     // allocate it.
     std::vector<tensorflow::TensorHandle*> handle_retvals(*num_retvals,
                                                           nullptr);
-    status->status =
-        Execute(op->ctx, op->device, op->inputs, kernel, maybe_stats.get(),
-                handle_retvals.data(), *num_retvals);
+    status->status = tensorflow::EagerExecute(
+        &op->ctx->context, op->device, op->inputs, kernel, maybe_stats.get(),
+        handle_retvals.data(), *num_retvals);
     for (int i = 0; i < *num_retvals; ++i) {
       retvals[i] = new TFE_TensorHandle(handle_retvals[i]);
     }
@@ -1075,8 +955,9 @@ TFE_TensorHandle* TFE_TensorHandleCopyToDevice(TFE_TensorHandle* h,
                                                TFE_Context* ctx,
                                                const char* device_name,
                                                TF_Status* status) {
-  tensorflow::TensorHandle* handle = TFE_TensorHandleCopyToDevice_Internal(
-      h->handle, ctx, device_name, status);
+  tensorflow::TensorHandle* handle;
+  status->status = tensorflow::EagerCopyToDevice(h->handle, &ctx->context,
+                                                 device_name, &handle);
   if (status->status.ok()) {
     return new TFE_TensorHandle(handle);
   }
diff --git a/tensorflow/c/eager/c_api.h b/tensorflow/c/eager/c_api.h
index a5029bf2115c7dac54d03b8bc6397bc63349c068..3926c22ce1f9e194b1452c796c83944d10cfdc64 100644
--- a/tensorflow/c/eager/c_api.h
+++ b/tensorflow/c/eager/c_api.h
@@ -61,17 +61,15 @@ TF_CAPI_EXPORT extern void TFE_ContextOptionsSetConfig(
 // Controls how to act when we try to run an operation on a given device but
 // some input tensors are not on that device.
 typedef enum TFE_ContextDevicePlacementPolicy {
-  // Running operations with input tensors on the wrong device will fail. When
-  // soft placement is enabled acts like TFE_DEVICE_PLACEMENT_SILENT.
+  // Running operations with input tensors on the wrong device will fail.
   TFE_DEVICE_PLACEMENT_EXPLICIT = 0,
   // Copy the tensor to the right device but log a warning.
   TFE_DEVICE_PLACEMENT_WARN = 1,
-  // Silently copy the tensor, which has a performance cost since the
-  // operation will be blocked till the copy completes.
+  // Silently copy the tensor, which has a performance cost since the operation
+  // will be blocked till the copy completes. This is the default placement
+  // policy.
   TFE_DEVICE_PLACEMENT_SILENT = 2,
-  // Default placement policy which silently copies int32 tensors but not other
-  // dtypes.  When soft placement is enabled acts like
-  // TFE_DEVICE_PLACEMENT_SILENT.
+  // Placement policy which silently copies int32 tensors but not other dtypes.
   TFE_DEVICE_PLACEMENT_SILENT_FOR_INT32 = 3,
 } TFE_ContextDevicePlacementPolicy;
 
@@ -162,7 +160,11 @@ TF_CAPI_EXPORT extern int64_t TFE_TensorHandleDim(TFE_TensorHandle* h,
 TF_CAPI_EXPORT extern const char* TFE_TensorHandleDeviceName(
     TFE_TensorHandle* h, TF_Status* status);
 
-// This function will block till the operation that produces `h` has completed.
+// This function will block till the operation that produces `h` has
+// completed. The memory returned might alias the internal memory used by
+// TensorFlow. Hence, callers should not mutate this memory (for example by
+// modifying the memory region pointed to by TF_TensorData() on the returned
+// TF_Tensor).
 TF_CAPI_EXPORT extern TF_Tensor* TFE_TensorHandleResolve(TFE_TensorHandle* h,
                                                          TF_Status* status);
 
diff --git a/tensorflow/c/eager/c_api_internal.h b/tensorflow/c/eager/c_api_internal.h
index e6d2ab75ffd2849d7fafb630eb452122ef36339b..05dc64f521735f944559392f470a37590e93f17c 100644
--- a/tensorflow/c/eager/c_api_internal.h
+++ b/tensorflow/c/eager/c_api_internal.h
@@ -50,8 +50,7 @@ struct TFE_ContextOptions {
   TF_SessionOptions session_options;
   // true if async execution is enabled.
   bool async = false;
-  TFE_ContextDevicePlacementPolicy policy{
-      TFE_DEVICE_PLACEMENT_SILENT_FOR_INT32};
+  TFE_ContextDevicePlacementPolicy policy{TFE_DEVICE_PLACEMENT_SILENT};
 };
 
 struct TFE_Context {
@@ -71,7 +70,7 @@ struct TFE_Context {
 struct TFE_TensorHandle {
   TFE_TensorHandle(const tensorflow::Tensor& t, tensorflow::Device* d,
                    tensorflow::Device* op_device)
-      : handle(new tensorflow::TensorHandle(t, d, op_device)) {}
+      : handle(new tensorflow::TensorHandle(t, d, op_device, nullptr)) {}
 
   TFE_TensorHandle(tensorflow::uint64 node_id, tensorflow::DataType dtype,
                    tensorflow::EagerContext* ctx)
diff --git a/tensorflow/c/eager/c_api_test.cc b/tensorflow/c/eager/c_api_test.cc
index 2268aba90d60b7b2f10e99f64fd7aa3ae719badb..701175e4943d1d23532fe595319f67711316ed4d 100644
--- a/tensorflow/c/eager/c_api_test.cc
+++ b/tensorflow/c/eager/c_api_test.cc
@@ -590,7 +590,13 @@ void Execute_MatMul_CPU_Runtime_Error(bool async) {
   TFE_TensorHandle* m1 = TestMatrixTensorHandle();
   TFE_TensorHandle* m2 = TestMatrixTensorHandle3X2();
   TFE_Op* matmul = MatMulOp(ctx, m1, m2);
+  TFE_OpSetDevice(matmul, "/job:localhost/replica:0/task:0/device:CPU:0",
+                  status);
+  ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
   TFE_Op* matmul2 = MatMulOp(ctx, m1, m1);
+  TFE_OpSetDevice(matmul2, "/job:localhost/replica:0/task:0/device:CPU:0",
+                  status);
+  ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
   TFE_TensorHandle* retvals[1] = {nullptr};
   int num_retvals = 1;
   TFE_Execute(matmul, &retvals[0], &num_retvals, status);
@@ -688,19 +694,19 @@ TEST(CAPI, Execute_Min_CPU) {
   TFE_DeleteOp(minOp);
   TFE_DeleteTensorHandle(input);
   TFE_DeleteTensorHandle(axis);
-  TFE_DeleteContext(ctx, status);
-  ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
   ASSERT_EQ(1, num_retvals);
 
   TF_Tensor* t = TFE_TensorHandleResolve(retvals[0], status);
-  TFE_DeleteTensorHandle(retvals[0]);
   ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  TFE_DeleteTensorHandle(retvals[0]);
   float output[2] = {0};
   EXPECT_EQ(sizeof(output), TF_TensorByteSize(t));
   memcpy(&output[0], TF_TensorData(t), TF_TensorByteSize(t));
   TF_DeleteTensor(t);
   EXPECT_EQ(1, output[0]);
   EXPECT_EQ(3, output[1]);
+  TFE_DeleteContext(ctx, status);
+  ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
   TF_DeleteStatus(status);
 }
 
diff --git a/tensorflow/c/eager/tape.h b/tensorflow/c/eager/tape.h
index c7bd3bdafd787e5c72625b190ea8bf8b8264d22d..97c323b87228039ba10f4ed5e434aa83621b1220 100644
--- a/tensorflow/c/eager/tape.h
+++ b/tensorflow/c/eager/tape.h
@@ -601,23 +601,28 @@ Status GradientTape<Gradient, BackwardFunction>::ComputeGradient(
   }
   CHECK(state.op_tape.empty());
   result->reserve(source_tensor_ids.size());
+  gtl::FlatSet<int64> used_gradient_ids(source_tensor_ids.size());
   for (auto is : source_tensor_ids) {
     auto grad_it = gradients.find(is);
     if (grad_it == gradients.end()) {
       result->push_back(nullptr);
     } else {
-      if (grad_it->second.size() == 1) {
-        result->push_back(grad_it->second[0]);
-      } else {
-        result->push_back(vspace.AggregateGradients(grad_it->second));
+      if (grad_it->second.size() > 1) {
+        Gradient* grad = vspace.AggregateGradients(grad_it->second);
+        grad_it->second.clear();
+        grad_it->second.push_back(grad);
       }
-      gradients.erase(grad_it);
+      result->push_back(grad_it->second[0]);
+      used_gradient_ids.insert(is);
     }
   }
-  VLOG(1) << "Final gradients size: " << gradients.size();
+  VLOG(1) << "Final gradients size: "
+          << gradients.size() - used_gradient_ids.size();
   for (auto grad_pair : gradients) {
-    for (const auto& g : grad_pair.second) {
-      vspace.DeleteGradient(g);
+    if (used_gradient_ids.find(grad_pair.first) == used_gradient_ids.end()) {
+      for (const auto& g : grad_pair.second) {
+        vspace.DeleteGradient(g);
+      }
     }
   }
   return Status::OK();
diff --git a/tensorflow/c/python_api.cc b/tensorflow/c/python_api.cc
index cd604538f1fa142c6fe6a76624c048baddaa52fb..93155998b86d59ec78c7ff25f146b8e3c8eac380 100644
--- a/tensorflow/c/python_api.cc
+++ b/tensorflow/c/python_api.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/c/python_api.h"
 
 #include "tensorflow/c/c_api_internal.h"
+#include "tensorflow/python/framework/cpp_shape_inference.pb.h"
 
 namespace tensorflow {
 
@@ -109,4 +110,29 @@ void ExtendSession(TF_Session* session, TF_Status* status) {
   session->extend_before_run = false;
 }
 
+std::string ResourceHandleShapeAndType(TF_Graph* graph, TF_Output output) {
+  Node* node = &output.oper->node;
+  CppShapeInferenceResult::HandleData handle_data;
+  handle_data.set_is_set(true);
+  {
+    mutex_lock l(graph->mu);
+    tensorflow::shape_inference::InferenceContext* ic =
+        graph->refiner.GetContext(node);
+    CHECK(ic != nullptr);
+    CHECK_LT(output.index, ic->num_outputs());
+    const auto* shapes_and_types =
+        ic->output_handle_shapes_and_types(output.index);
+    if (shapes_and_types == nullptr) return "";
+
+    for (const auto& p : *shapes_and_types) {
+      auto* out_shape_and_type = handle_data.add_shape_and_type();
+      ic->ShapeHandleToProto(p.shape, out_shape_and_type->mutable_shape());
+      out_shape_and_type->set_dtype(p.dtype);
+    }
+  }
+  string result;
+  handle_data.SerializeToString(&result);
+  return result;
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/c/python_api.h b/tensorflow/c/python_api.h
index 13b680b3a24afa2d285ea18207578aff4350f6d5..2d4c8cd9ed7bc926f448dab1f6b50ed74179ea14 100644
--- a/tensorflow/c/python_api.h
+++ b/tensorflow/c/python_api.h
@@ -16,6 +16,8 @@ limitations under the License.
 #ifndef TENSORFLOW_C_PYTHON_API_H_
 #define TENSORFLOW_C_PYTHON_API_H_
 
+#include <string>
+
 #include "tensorflow/c/c_api.h"
 
 // These functions can be removed without notice. They exist to facilitate some
@@ -51,6 +53,11 @@ void SetRequireShapeInferenceFns(TF_Graph* graph, bool require);
 // the graph after the session has been made aware of them.
 void ExtendSession(TF_Session* session, TF_Status* status);
 
+// Returns the serialized CppShapeInferenceResult::HandleData proto for
+// `output` if its a resource tensor, or otherwise returns the empty string.
+// TODO(b/74620627): remove when _USE_C_SHAPES is removed
+std::string ResourceHandleShapeAndType(TF_Graph* graph, TF_Output output);
+
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_C_PYTHON_API_H_
diff --git a/tensorflow/cc/BUILD b/tensorflow/cc/BUILD
index 9060c19e9d2cf965c2b9be07be07c42017da45a8..079e063d3e3fbdaf833e9031f5f9438853c14099 100644
--- a/tensorflow/cc/BUILD
+++ b/tensorflow/cc/BUILD
@@ -620,18 +620,6 @@ tf_cc_binary(
     ],
 )
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 cc_library(
     name = "queue_runner",
     srcs = ["training/queue_runner.cc"],
diff --git a/tensorflow/cc/framework/cc_op_gen_test.cc b/tensorflow/cc/framework/cc_op_gen_test.cc
index 1e0f2d241bb350897a840dda90d6d0c009b1daad..5d9dfd95a5538ae0f3d2d111a1f989552c3363b8 100644
--- a/tensorflow/cc/framework/cc_op_gen_test.cc
+++ b/tensorflow/cc/framework/cc_op_gen_test.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include "tensorflow/core/framework/op_gen_lib.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
@@ -61,12 +62,12 @@ op {
 )";
 
 void ExpectHasSubstr(StringPiece s, StringPiece expected) {
-  EXPECT_TRUE(s.contains(expected))
+  EXPECT_TRUE(str_util::StrContains(s, expected))
       << "'" << s << "' does not contain '" << expected << "'";
 }
 
 void ExpectDoesNotHaveSubstr(StringPiece s, StringPiece expected) {
-  EXPECT_FALSE(s.contains(expected))
+  EXPECT_FALSE(str_util::StrContains(s, expected))
       << "'" << s << "' contains '" << expected << "'";
 }
 
diff --git a/tensorflow/cc/framework/scope.cc b/tensorflow/cc/framework/scope.cc
index 71642492627422e09c19b7bcb4dc522846cf08b1..c143b978338815ebc7134eb0a07867c5d8b13dca 100644
--- a/tensorflow/cc/framework/scope.cc
+++ b/tensorflow/cc/framework/scope.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/graph/node_builder.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 
 namespace tensorflow {
 
@@ -218,7 +219,7 @@ std::unordered_set<string> Scope::Impl::GetColocationConstraints(
   if (GetNodeAttr(attrs, kColocationAttrName, &node_constraints).ok()) {
     for (const string& entry : node_constraints) {
       StringPiece s(entry);
-      if (s.Consume(kColocationGroupPrefix)) {
+      if (str_util::ConsumePrefix(&s, kColocationGroupPrefix)) {
         current_constraints.insert(s.ToString());
       }
     }
diff --git a/tensorflow/cc/profiler/BUILD b/tensorflow/cc/profiler/BUILD
index 00799526fce572e7bb80199ccb8ce1cc89874031..cf65fe1ab99b49207a64e86310178141b30d07d7 100644
--- a/tensorflow/cc/profiler/BUILD
+++ b/tensorflow/cc/profiler/BUILD
@@ -9,6 +9,9 @@ load("//tensorflow:tensorflow.bzl", "tf_cuda_cc_test")
 tf_cuda_cc_test(
     name = "profiler_test",
     srcs = ["profiler_test.cc"],
+    tags = [
+        "noguitar",  # b/77649654
+    ],
     deps = [
         ":profiler",
         "//tensorflow/cc:cc_ops",
diff --git a/tensorflow/cc/saved_model/BUILD b/tensorflow/cc/saved_model/BUILD
index d29ad3ebcbe29087d5572b51c7713e0c98d0d840..06a3be18e08f611d3ecf9804908d791d15fdab13 100644
--- a/tensorflow/cc/saved_model/BUILD
+++ b/tensorflow/cc/saved_model/BUILD
@@ -94,18 +94,3 @@ filegroup(
         "testdata/half_plus_two/**",
     ]),
 )
-
-# -----------------------------------------------------------------------------
-# Google-internal targets.
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/cc/saved_model/loader_test.cc b/tensorflow/cc/saved_model/loader_test.cc
index 4c64d2cfe3c10e6c7ed82a2d72460a0b34283bb2..72b8bc18710b0ee77cb01ed3ad0c2abb5183efb2 100644
--- a/tensorflow/cc/saved_model/loader_test.cc
+++ b/tensorflow/cc/saved_model/loader_test.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
@@ -133,9 +134,9 @@ TEST_F(LoaderTest, NoTagMatch) {
   Status st = LoadSavedModel(session_options, run_options, export_dir,
                              {"missing-tag"}, &bundle);
   EXPECT_FALSE(st.ok());
-  EXPECT_TRUE(StringPiece(st.error_message())
-                  .contains("Could not find meta graph def matching supplied "
-                            "tags: { missing-tag }"))
+  EXPECT_TRUE(str_util::StrContains(
+      st.error_message(),
+      "Could not find meta graph def matching supplied tags: { missing-tag }"))
       << st.error_message();
 }
 
@@ -149,9 +150,9 @@ TEST_F(LoaderTest, NoTagMatchMultiple) {
   Status st = LoadSavedModel(session_options, run_options, export_dir,
                              {kSavedModelTagServe, "missing-tag"}, &bundle);
   EXPECT_FALSE(st.ok());
-  EXPECT_TRUE(
-      StringPiece(st.error_message())
-          .contains("Could not find meta graph def matching supplied tags: "))
+  EXPECT_TRUE(str_util::StrContains(
+      st.error_message(),
+      "Could not find meta graph def matching supplied tags: "))
       << st.error_message();
 }
 
@@ -169,7 +170,7 @@ TEST_F(LoaderTest, SessionCreationFailure) {
   Status st = LoadSavedModel(session_options, run_options, export_dir,
                              {kSavedModelTagServe}, &bundle);
   EXPECT_FALSE(st.ok());
-  EXPECT_TRUE(StringPiece(st.error_message()).contains(kInvalidTarget))
+  EXPECT_TRUE(str_util::StrContains(st.error_message(), kInvalidTarget))
       << st.error_message();
 }
 
diff --git a/tensorflow/cc/saved_model/python/BUILD b/tensorflow/cc/saved_model/python/BUILD
index f5fbc75edcba9d5ae9ef7432de224df766bcab9e..6f04ebdc55cda329527c95f62efc37c8dfbb4ae5 100644
--- a/tensorflow/cc/saved_model/python/BUILD
+++ b/tensorflow/cc/saved_model/python/BUILD
@@ -7,18 +7,6 @@ package(
     default_visibility = ["//visibility:public"],
 )
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 load("//tensorflow/core:platform/default/build_config.bzl", "tf_py_clif_cc")
 
 tf_py_clif_cc(
diff --git a/tensorflow/cc/tools/BUILD b/tensorflow/cc/tools/BUILD
index f413a5cc52e9eb4bc393b8186f5b591681fa2e5e..6f1c87354076565af22f7ba0610a5c6bb999d25c 100644
--- a/tensorflow/cc/tools/BUILD
+++ b/tensorflow/cc/tools/BUILD
@@ -41,18 +41,3 @@ tf_cc_test(
         "//tensorflow/core:testlib",
     ],
 )
-
-# -----------------------------------------------------------------------------
-# Google-internal targets.
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/cc/tutorials/example_trainer.cc b/tensorflow/cc/tutorials/example_trainer.cc
index 3675d72ee354533a7d84b5e8783cde452d8d60c9..5dbc4f5f6aa389978e55ca2656c17ff97202203d 100644
--- a/tensorflow/cc/tutorials/example_trainer.cc
+++ b/tensorflow/cc/tutorials/example_trainer.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/core/graph/default_device.h"
 #include "tensorflow/core/graph/graph_def_builder.h"
 #include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/platform/logging.h"
@@ -166,7 +167,8 @@ namespace {
 
 bool ParseInt32Flag(tensorflow::StringPiece arg, tensorflow::StringPiece flag,
                     int32* dst) {
-  if (arg.Consume(flag) && arg.Consume("=")) {
+  if (tensorflow::str_util::ConsumePrefix(&arg, flag) &&
+      tensorflow::str_util::ConsumePrefix(&arg, "=")) {
     char extra;
     return (sscanf(arg.data(), "%d%c", dst, &extra) == 1);
   }
@@ -176,7 +178,7 @@ bool ParseInt32Flag(tensorflow::StringPiece arg, tensorflow::StringPiece flag,
 
 bool ParseBoolFlag(tensorflow::StringPiece arg, tensorflow::StringPiece flag,
                    bool* dst) {
-  if (arg.Consume(flag)) {
+  if (tensorflow::str_util::ConsumePrefix(&arg, flag)) {
     if (arg.empty()) {
       *dst = true;
       return true;
diff --git a/tensorflow/compiler/aot/BUILD b/tensorflow/compiler/aot/BUILD
index ffa2d088295375bbbcd2cdd9365982907f2bf480..fa03b1f3c2dfc334d4a3871e6a1bf5503fa8d5f8 100644
--- a/tensorflow/compiler/aot/BUILD
+++ b/tensorflow/compiler/aot/BUILD
@@ -250,17 +250,3 @@ exports_files([
     "benchmark_main.template",  # used by tf_library(...,gen_benchmark=True)
     "test.cc",  # used by tf_library(...,gen_test=True)
 ])
-
-# -----------------------------------------------------------------------------
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/compiler/aot/codegen_test.cc b/tensorflow/compiler/aot/codegen_test.cc
index 972b7d51ecb3798e61757ac55e973075a23b433a..2642536c4f67eba8eedf315f24d800e7913d62a0 100644
--- a/tensorflow/compiler/aot/codegen_test.cc
+++ b/tensorflow/compiler/aot/codegen_test.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/test.h"
 
@@ -33,7 +34,7 @@ namespace {
 
 void ExpectErrorContains(const Status& status, StringPiece str) {
   EXPECT_NE(Status::OK(), status);
-  EXPECT_TRUE(StringPiece(status.error_message()).contains(str))
+  EXPECT_TRUE(str_util::StrContains(status.error_message(), str))
       << "expected error: " << status.error_message() << " to contain: " << str;
 }
 
diff --git a/tensorflow/compiler/aot/tests/BUILD b/tensorflow/compiler/aot/tests/BUILD
index 28aab6eb614ca7123d9e00f7f5cc3661b62e23f7..b053dad1b57c258b7cb0d6831923e6a0f30f5e7e 100644
--- a/tensorflow/compiler/aot/tests/BUILD
+++ b/tensorflow/compiler/aot/tests/BUILD
@@ -182,17 +182,3 @@ tf_cc_test(
         "//third_party/eigen3",
     ],
 )
-
-# -----------------------------------------------------------------------------
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/compiler/aot/tfcompile_main.cc b/tensorflow/compiler/aot/tfcompile_main.cc
index e2f01179d4e2e4f6ef72b2761d06e130ffa3a94f..8ea014c2eede2cb7a9cede9dd4ade8b970bd519c 100644
--- a/tensorflow/compiler/aot/tfcompile_main.cc
+++ b/tensorflow/compiler/aot/tfcompile_main.cc
@@ -55,7 +55,7 @@ const char kUsageHeader[] =
     "\n";
 
 Status ReadProtoFile(const string& fname, protobuf::Message* proto) {
-  if (StringPiece(fname).ends_with(".pbtxt")) {
+  if (str_util::EndsWith(fname, ".pbtxt")) {
     return ReadTextProto(Env::Default(), fname, proto);
   } else {
     return ReadBinaryProto(Env::Default(), fname, proto);
diff --git a/tensorflow/compiler/jit/BUILD b/tensorflow/compiler/jit/BUILD
index 8e505da6221b23b0130548405f12a61dcda100d7..4cefc08645a589614c39178f3259ebd9d2c77575 100644
--- a/tensorflow/compiler/jit/BUILD
+++ b/tensorflow/compiler/jit/BUILD
@@ -120,11 +120,13 @@ cc_library(
 )
 
 cc_library(
-    name = "xla_tensor_info",
-    srcs = ["xla_tensor_info.cc"],
-    hdrs = ["xla_tensor_info.h"],
+    name = "xla_tensor",
+    srcs = ["xla_tensor.cc"],
+    hdrs = ["xla_tensor.h"],
     deps = [
         ":common",
+        "//tensorflow/compiler/tf2xla:common",
+        "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/service:shaped_buffer",
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:core_cpu_internal",
@@ -154,7 +156,7 @@ cc_library(
         ":common",
         ":jit_compilation_passes",
         ":xla_launch_util",
-        ":xla_tensor_info",
+        ":xla_tensor",
         "//tensorflow/compiler/jit/ops:xla_ops",
         "//tensorflow/compiler/tf2xla:common",
         "//tensorflow/compiler/tf2xla:dump_graph",
@@ -201,15 +203,15 @@ cc_library(
     deps = [
         ":common",
         ":xla_compilation_cache",
-        ":xla_tensor_info",
+        ":xla_tensor",
+        "//tensorflow/compiler/tf2xla:common",
         "//tensorflow/compiler/tf2xla:xla_compiler",
-        "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla/client:client_library",
         "//tensorflow/compiler/xla/client:local_client",
-        "//tensorflow/core:core_cpu",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
+        "//tensorflow/core:gpu_runtime",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
@@ -316,6 +318,25 @@ cc_library(
     hdrs = ["union_find.h"],
 )
 
+cc_library(
+    name = "producer_consumer_queue",
+    hdrs = ["producer_consumer_queue.h"],
+    deps = ["//tensorflow/core:lib"],
+)
+
+tf_cc_test(
+    name = "producer_consumer_queue_test",
+    size = "small",
+    srcs = ["producer_consumer_queue_test.cc"],
+    deps = [
+        ":producer_consumer_queue",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+    ],
+)
+
 tf_cc_test(
     name = "graph_to_functiondef_test",
     size = "small",
@@ -359,26 +380,13 @@ tf_cc_test(
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:framework",
         "//tensorflow/core:framework_internal",
+        "//tensorflow/core:lib",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
     ],
 )
 
-# -----------------------------------------------------------------------------
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 # This target can be used by XLA device plugins to prevent circular dependencies, and provides access to all of the required headers for building a device library.
 cc_header_only_library(
     name = "xla_jit_headers_lib",
diff --git a/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc b/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc
index 7fc43fb26318335909d52d5bbd83ebf61f42a703..b04b333141a616e7c4db2751c14ec6eb0b7725b5 100644
--- a/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc
+++ b/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc
@@ -254,7 +254,8 @@ class Encapsulator {
 
     // Adds _RecvAtHost and _SendFromHost nodes, where needed, to graph_out.
     Status AddOutsideCompilationHostIONodes(
-        const string& subgraph_name,
+        const string& group_attribute, const string& subgraph_name,
+        const string& outside_compilation_attribute,
         const std::unordered_map<const Node*, Node*>& node_images,
         Graph* graph_out);
 
@@ -405,7 +406,9 @@ class Encapsulator {
 
     // Builds a _RecvAtHost node producing all the inputs of an
     // outside_compilation subgraph and stores it in oc_subgraph.recv_at_host.
-    Status AddRecvAtHostNode(const string& subgraph_name,
+    Status AddRecvAtHostNode(const string& group_attribute,
+                             const string& subgraph_name,
+                             const string& outside_compilation_attribute,
                              const string& oc_subgraph_name,
                              OutsideCompilationSubgraph* oc_subgraph,
                              Graph* graph_out);
@@ -414,8 +417,10 @@ class Encapsulator {
     // outside_compilation subgraph and stores it in oc_subgraph.send_from_host.
     Status AddSendFromHostNode(
         const std::unordered_map<const Node*, Node*>& node_images,
-        const string& subgraph_name, const string& oc_subgraph_name,
-        OutsideCompilationSubgraph* oc_subgraph, Graph* graph_out);
+        const string& group_attribute, const string& subgraph_name,
+        const string& outside_compilation_attribute,
+        const string& oc_subgraph_name, OutsideCompilationSubgraph* oc_subgraph,
+        Graph* graph_out);
 
     // The subgraph extracted from the input graph, suitable for being turned
     // into a FunctionDef. Inputs are fed by _Arg nodes, and outputs are
@@ -820,6 +825,7 @@ Status Encapsulator::Subgraph::AddHostComputes(
       builder.Attr("key",
                    strings::StrCat("host_compute_channel_", subgraph_name, "_",
                                    oc_subgraph_name));
+      builder.Attr("_outside_compilation_subgraph", oc_subgraph_name);
       Status s = builder.Finalize(&host_compute_def);
       if (!s.ok()) return s;
 
@@ -1114,7 +1120,8 @@ Status Encapsulator::Subgraph::AddHostComputeKeyPlaceholder(
 }
 
 Status Encapsulator::Subgraph::AddRecvAtHostNode(
-    const string& subgraph_name, const string& oc_subgraph_name,
+    const string& group_attribute, const string& subgraph_name,
+    const string& outside_compilation_attribute, const string& oc_subgraph_name,
     OutsideCompilationSubgraph* oc_subgraph, Graph* graph_out) {
   if (host_compute_key_placeholder_ == nullptr) {
     TF_RETURN_IF_ERROR(AddHostComputeKeyPlaceholder(oc_subgraph, graph_out));
@@ -1135,14 +1142,15 @@ Status Encapsulator::Subgraph::AddRecvAtHostNode(
   NodeDefBuilder builder(strings::StrCat("outside_compilation_", subgraph_name,
                                          "_", oc_subgraph_name, "_recv"),
                          kRecvAtHostOp);
-  // TODO(misard) When we add replication the device placement will have to be
-  // redone.
   builder.Device(device_);
   builder.Attr("Toutputs", dtypes);
-  // TODO(misard) For now we only support TPU device 0.
+  // The correct device_ordinal will be inserted during replication in a
+  // subsequent rewrite.
   builder.Attr("device_ordinal", 0);
   builder.Attr("key", strings::StrCat("host_compute_channel_", subgraph_name,
                                       "_", oc_subgraph_name));
+  builder.Attr(group_attribute, subgraph_name);
+  builder.Attr(outside_compilation_attribute, oc_subgraph_name);
   builder.Input(host_compute_key_placeholder_->name(), 0, DT_STRING);
   Status s = builder.Finalize(&recv_def);
   if (!s.ok()) return s;
@@ -1163,7 +1171,8 @@ Status Encapsulator::Subgraph::AddRecvAtHostNode(
 
 Status Encapsulator::Subgraph::AddSendFromHostNode(
     const std::unordered_map<const Node*, Node*>& node_images,
-    const string& subgraph_name, const string& oc_subgraph_name,
+    const string& group_attribute, const string& subgraph_name,
+    const string& outside_compilation_attribute, const string& oc_subgraph_name,
     OutsideCompilationSubgraph* oc_subgraph, Graph* graph_out) {
   if (host_compute_key_placeholder_ == nullptr) {
     TF_RETURN_IF_ERROR(AddHostComputeKeyPlaceholder(oc_subgraph, graph_out));
@@ -1188,14 +1197,15 @@ Status Encapsulator::Subgraph::AddSendFromHostNode(
   NodeDefBuilder builder(strings::StrCat("outside_compilation_", subgraph_name,
                                          "_", oc_subgraph_name, "_send"),
                          kSendFromHostOp);
-  // TODO(misard) When we add replication the device placement will have to be
-  // redone.
   builder.Device(device_);
   builder.Attr("Tinputs", dtypes);
   builder.Attr("key", strings::StrCat("host_compute_channel_", subgraph_name,
                                       "_", oc_subgraph_name));
-  // TODO(misard) For now we only support TPU device 0.
+  // The correct device_ordinal will be inserted during replication in a
+  // subsequent rewrite.
   builder.Attr("device_ordinal", 0);
+  builder.Attr(group_attribute, subgraph_name);
+  builder.Attr(outside_compilation_attribute, oc_subgraph_name);
   builder.Input(inputs);
   builder.Input(host_compute_key_placeholder_->name(), 0, DT_STRING);
   Status s = builder.Finalize(&send_def);
@@ -1216,7 +1226,8 @@ Status Encapsulator::Subgraph::AddSendFromHostNode(
 }
 
 Status Encapsulator::Subgraph::AddOutsideCompilationHostIONodes(
-    const string& subgraph_name,
+    const string& group_attribute, const string& subgraph_name,
+    const string& outside_compilation_attribute,
     const std::unordered_map<const Node*, Node*>& node_images,
     Graph* graph_out) {
   for (auto& outside_compilation_subgraph_entry :
@@ -1226,14 +1237,16 @@ Status Encapsulator::Subgraph::AddOutsideCompilationHostIONodes(
         outside_compilation_subgraph_entry.second;
 
     if (!oc_subgraph.inputs.empty() || !oc_subgraph.control_inputs.empty()) {
-      TF_RETURN_IF_ERROR(
-          AddRecvAtHostNode(subgraph_name, oc_name, &oc_subgraph, graph_out));
+      TF_RETURN_IF_ERROR(AddRecvAtHostNode(group_attribute, subgraph_name,
+                                           outside_compilation_attribute,
+                                           oc_name, &oc_subgraph, graph_out));
     }
 
     if (!oc_subgraph.outputs_by_src.empty() ||
         !oc_subgraph.control_outputs.empty()) {
-      TF_RETURN_IF_ERROR(AddSendFromHostNode(node_images, subgraph_name,
-                                             oc_name, &oc_subgraph, graph_out));
+      TF_RETURN_IF_ERROR(AddSendFromHostNode(
+          node_images, group_attribute, subgraph_name,
+          outside_compilation_attribute, oc_name, &oc_subgraph, graph_out));
     }
   }
   return Status::OK();
@@ -1450,8 +1463,6 @@ Status Encapsulator::CopyNodesToOutputGraph(
             "Parallel checking is not supported when outside_compilation "
             "clusters are present.");
       }
-      image->ClearAttr(group_attribute_);
-      image->ClearAttr(outside_compilation_attribute_);
     }
     (*node_images)[node] = image;
   }
@@ -1477,7 +1488,8 @@ Status Encapsulator::AddOutsideCompilationHostIONodes(
     const string& subgraph_name = subgraph_entry.first;
     Subgraph& subgraph = subgraph_entry.second;
     TF_RETURN_IF_ERROR(subgraph.AddOutsideCompilationHostIONodes(
-        subgraph_name, node_images, graph_out));
+        group_attribute_, subgraph_name, outside_compilation_attribute_,
+        node_images, graph_out));
   }
   return Status::OK();
 }
diff --git a/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc b/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc
index 94481a1fde986b705764f6f0c6de14fb28002496..8599a7038af9663e5af6f3231429cb7f6ea5f69b 100644
--- a/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc
+++ b/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc
@@ -25,6 +25,7 @@ limitations under the License.
 #include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/graph/graph_def_builder.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/util/equal_graph_def.h"
 
@@ -382,24 +383,36 @@ Node* KeyPlaceholder(const string& call_node,
       .FinalizeBuilder(&node_builder);
 }
 
-Node* RecvAtHost(ops::NodeOut key_input, const string& key,
+Node* RecvAtHost(ops::NodeOut key_input, const string& cluster,
+                 const string& oc_cluster,
                  const gtl::ArraySlice<DataType>& dtypes,
                  const GraphDefBuilder::Options& opts) {
   if (opts.HaveError()) return nullptr;
-  NodeBuilder node_builder(opts.GetNameForOp("_XlaRecvAtHost"),
+  string key =
+      strings::StrCat("host_compute_channel_", cluster, "_", oc_cluster);
+  string name = strings::StrCat("outside_compilation_", cluster, "_",
+                                oc_cluster, "_recv");
+  NodeBuilder node_builder(opts.WithName(name).GetNameForOp("_XlaRecvAtHost"),
                            "_XlaRecvAtHost", opts.op_registry());
   node_builder.Input(std::move(key_input));
   return opts.WithAttr("Toutputs", dtypes)
       .WithAttr("key", key)
       .WithAttr("device_ordinal", 0)
+      .WithAttr("_encapsulate", cluster)
+      .WithAttr("_outside", oc_cluster)
       .FinalizeBuilder(&node_builder);
 }
 
-Node* SendFromHost(ops::NodeOut key_input, const string& key,
+Node* SendFromHost(ops::NodeOut key_input, const string& cluster,
+                   const string& oc_cluster,
                    const std::vector<ops::NodeOut>& inputs,
                    const GraphDefBuilder::Options& opts) {
   if (opts.HaveError()) return nullptr;
-  NodeBuilder node_builder(opts.GetNameForOp("_XlaSendFromHost"),
+  string key =
+      strings::StrCat("host_compute_channel_", cluster, "_", oc_cluster);
+  string name = strings::StrCat("outside_compilation_", cluster, "_",
+                                oc_cluster, "_send");
+  NodeBuilder node_builder(opts.WithName(name).GetNameForOp("_XlaSendFromHost"),
                            "_XlaSendFromHost", opts.op_registry());
   node_builder.Input(inputs);
   node_builder.Input(std::move(key_input));
@@ -410,6 +423,8 @@ Node* SendFromHost(ops::NodeOut key_input, const string& key,
   return opts.WithAttr("Tinputs", dtypes)
       .WithAttr("key", key)
       .WithAttr("device_ordinal", 0)
+      .WithAttr("_encapsulate", cluster)
+      .WithAttr("_outside", oc_cluster)
       .FinalizeBuilder(&node_builder);
 }
 
@@ -757,7 +772,7 @@ TEST(EncapsulateSubgraphsWithGuaranteeConstOpTest, Simple) {
         Graph* graph = graph_ptr->get();
         for (const Node* n : graph->nodes()) {
           if (n->type_string() == "_Arg" &&
-              StringPiece(n->name()).starts_with("const")) {
+              str_util::StartsWith(n->name(), "const")) {
             ++guaranteed_consts;
             EXPECT_TRUE(HasGuaranteeConstAttr(*n));
           } else {
@@ -802,7 +817,7 @@ TEST(EncapsulateSubgraphsWithGuaranteeConstOpTest, Add) {
         Graph* graph = graph_ptr->get();
         for (const Node* n : graph->nodes()) {
           if (n->type_string() == "_Arg" &&
-              StringPiece(n->name()).starts_with("const")) {
+              str_util::StartsWith(n->name(), "const")) {
             ++guaranteed_consts;
             EXPECT_TRUE(HasGuaranteeConstAttr(*n));
           } else {
@@ -856,14 +871,14 @@ TEST(EncapsulateSubgraphsTest, OneFunctionOneOutside) {
     GraphDefBuilder shape(GraphDefBuilder::kFailImmediately);
     Node* key_constant =
         KeyPlaceholderShape(shape.opts().WithName("KnownShape/_0"));
-    Node* recv =
-        RecvAtHost(ops::NodeOut(key_constant, 0), "host_compute_channel_F1_O1",
-                   {DT_FLOAT, DT_FLOAT},
-                   shape.opts().WithName("outside_compilation_F1_O1_recv"));
+    Node* recv = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1",
+                            {DT_FLOAT, DT_FLOAT}, shape.opts());
     Node* e = Binary(ops::NodeOut(recv, 0), ops::NodeOut(recv, 1),
-                     shape.opts().WithName("E"));
-    SendFromHost(ops::NodeOut(key_constant, 0), "host_compute_channel_F1_O1",
-                 {e}, shape.opts().WithName("outside_compilation_F1_O1_send"));
+                     shape.opts()
+                         .WithName("E")
+                         .WithAttr("_encapsulate", "F1")
+                         .WithAttr("_outside", "O1"));
+    SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {e}, shape.opts());
     TF_EXPECT_OK(
         AddGraphDefToFunctionLibrary(shape, "F1_O1", &library_expected));
   }
@@ -887,7 +902,8 @@ TEST(EncapsulateSubgraphsTest, OneFunctionOneOutside) {
             {"key", "host_compute_channel_F1_O1"},
             {"shape_inference_graph",
              "_outside_compilation_shape_inference_F1_O1"},
-            {"shapes", gtl::ArraySlice<DataType>({})}},
+            {"shapes", gtl::ArraySlice<DataType>({})},
+            {"_outside_compilation_subgraph", "O1"}},
            {"c"}},
       },
       {{"f_0_retval", "F:o:0"}});
@@ -901,17 +917,16 @@ TEST(EncapsulateSubgraphsTest, OneFunctionOneOutside) {
 
     Node* key_constant =
         KeyPlaceholder("F1", b2.opts().WithName("F1_key_placeholder"));
-    Node* recv =
-        RecvAtHost(ops::NodeOut(key_constant, 0), "host_compute_channel_F1_O1",
-                   {DT_FLOAT, DT_FLOAT},
-                   b2.opts().WithName("outside_compilation_F1_O1_recv"));
+    Node* recv = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1",
+                            {DT_FLOAT, DT_FLOAT}, b2.opts());
     Node* e = Binary(ops::NodeOut(recv, 0), ops::NodeOut(recv, 1),
-                     b2.opts().WithName("E").WithControlInputs({recv, b}));
-    Node* send = SendFromHost(ops::NodeOut(key_constant, 0),
-                              "host_compute_channel_F1_O1", {e},
-                              b2.opts()
-                                  .WithName("outside_compilation_F1_O1_send")
-                                  .WithControlInput(e));
+                     b2.opts()
+                         .WithName("E")
+                         .WithControlInputs({recv, b})
+                         .WithAttr("_encapsulate", "F1")
+                         .WithAttr("_outside", "O1"));
+    Node* send = SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {e},
+                              b2.opts().WithControlInput(e));
 
     Node* s = Sequencer(
         b2.opts().WithName("F1_sequencer").WithControlInputs({recv, send}),
@@ -976,14 +991,14 @@ TEST(EncapsulateSubgraphsTest, OneFunctionTwoOutside) {
     GraphDefBuilder shape1(GraphDefBuilder::kFailImmediately);
     Node* key_constant =
         KeyPlaceholderShape(shape1.opts().WithName("KnownShape/_0"));
-    Node* recv =
-        RecvAtHost(ops::NodeOut(key_constant, 0), "host_compute_channel_F1_O1",
-                   {DT_FLOAT, DT_FLOAT},
-                   shape1.opts().WithName("outside_compilation_F1_O1_recv"));
+    Node* recv = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1",
+                            {DT_FLOAT, DT_FLOAT}, shape1.opts());
     Node* e = Binary(ops::NodeOut(recv, 0), ops::NodeOut(recv, 1),
-                     shape1.opts().WithName("E"));
-    SendFromHost(ops::NodeOut(key_constant, 0), "host_compute_channel_F1_O1",
-                 {e}, shape1.opts().WithName("outside_compilation_F1_O1_send"));
+                     shape1.opts()
+                         .WithName("E")
+                         .WithAttr("_encapsulate", "F1")
+                         .WithAttr("_outside", "O1"));
+    SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {e}, shape1.opts());
     TF_EXPECT_OK(
         AddGraphDefToFunctionLibrary(shape1, "F1_O1", &library_expected));
   }
@@ -992,19 +1007,21 @@ TEST(EncapsulateSubgraphsTest, OneFunctionTwoOutside) {
     GraphDefBuilder shape2(GraphDefBuilder::kFailImmediately);
     Node* key_constant =
         KeyPlaceholderShape(shape2.opts().WithName("KnownShape/_0"));
-    Node* recv1 =
-        RecvAtHost(ops::NodeOut(key_constant, 0), "host_compute_channel_F1_O1",
-                   {DT_FLOAT, DT_FLOAT},
-                   shape2.opts().WithName("outside_compilation_F1_O1_recv"));
+    Node* recv1 = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1",
+                             {DT_FLOAT, DT_FLOAT}, shape2.opts());
     Node* e = Binary(ops::NodeOut(recv1, 0), ops::NodeOut(recv1, 1),
-                     shape2.opts().WithName("E"));
-    Node* recv2 =
-        RecvAtHost(ops::NodeOut(key_constant, 0), "host_compute_channel_F1_O2",
-                   {DT_FLOAT, DT_FLOAT},
-                   shape2.opts().WithName("outside_compilation_F1_O2_recv"));
-    Node* h = Binary(ops::NodeOut(recv2, 0), e, shape2.opts().WithName("H"));
-    SendFromHost(ops::NodeOut(key_constant, 0), "host_compute_channel_F1_O2",
-                 {h}, shape2.opts().WithName("outside_compilation_F1_O2_send"));
+                     shape2.opts()
+                         .WithName("E")
+                         .WithAttr("_encapsulate", "F1")
+                         .WithAttr("_outside", "O1"));
+    Node* recv2 = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O2",
+                             {DT_FLOAT, DT_FLOAT}, shape2.opts());
+    Node* h = Binary(ops::NodeOut(recv2, 0), e,
+                     shape2.opts()
+                         .WithName("H")
+                         .WithAttr("_encapsulate", "F1")
+                         .WithAttr("_outside", "O2"));
+    SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O2", {h}, shape2.opts());
     TF_EXPECT_OK(
         AddGraphDefToFunctionLibrary(shape2, "F1_O2", &library_expected));
   }
@@ -1030,7 +1047,8 @@ TEST(EncapsulateSubgraphsTest, OneFunctionTwoOutside) {
             {"key", "host_compute_channel_F1_O2"},
             {"shape_inference_graph",
              "_outside_compilation_shape_inference_F1_O2"},
-            {"shapes", gtl::ArraySlice<DataType>({})}},
+            {"shapes", gtl::ArraySlice<DataType>({})},
+            {"_outside_compilation_subgraph", "O2"}},
            {"F"}},
           {{"outside_compilation_O1_host_compute"},
            "XlaHostCompute",
@@ -1040,7 +1058,8 @@ TEST(EncapsulateSubgraphsTest, OneFunctionTwoOutside) {
             {"key", "host_compute_channel_F1_O1"},
             {"shape_inference_graph",
              "_outside_compilation_shape_inference_F1_O1"},
-            {"shapes", gtl::ArraySlice<DataType>({})}},
+            {"shapes", gtl::ArraySlice<DataType>({})},
+            {"_outside_compilation_subgraph", "O1"}},
            {"D"}},
       },
       {{"i_0_retval", "I:o:0"}});
@@ -1054,28 +1073,32 @@ TEST(EncapsulateSubgraphsTest, OneFunctionTwoOutside) {
 
     Node* key_constant =
         KeyPlaceholder("F1", b2.opts().WithName("F1_key_placeholder"));
-    Node* recv1 =
-        RecvAtHost(ops::NodeOut(key_constant, 0), "host_compute_channel_F1_O1",
-                   {DT_FLOAT, DT_FLOAT},
-                   b2.opts().WithName("outside_compilation_F1_O1_recv"));
+    Node* recv1 = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1",
+                             {DT_FLOAT, DT_FLOAT}, b2.opts());
     Node* e = Binary(ops::NodeOut(recv1, 0), ops::NodeOut(recv1, 1),
-                     b2.opts().WithName("E").WithControlInputs({recv1, b}));
-    Node* send1 = SendFromHost(ops::NodeOut(key_constant, 0),
-                               "host_compute_channel_F1_O1", {e},
-                               b2.opts()
-                                   .WithName("outside_compilation_F1_O1_send")
-                                   .WithControlInput(e));
-
-    Node* recv2 =
-        RecvAtHost(ops::NodeOut(key_constant, 0), "host_compute_channel_F1_O2",
-                   {DT_FLOAT, DT_FLOAT},
-                   b2.opts().WithName("outside_compilation_F1_O2_recv"));
+                     b2.opts()
+                         .WithName("E")
+                         .WithControlInputs({recv1, b})
+                         .WithAttr("_encapsulate", "F1")
+                         .WithAttr("_outside", "O1"));
+    Node* send1 = SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {e},
+                               b2.opts().WithControlInput(e));
+
+    Node* recv2 = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O2",
+                             {DT_FLOAT, DT_FLOAT}, b2.opts());
     Node* g = Binary(e, ops::NodeOut(recv2, 1),
-                     b2.opts().WithName("G").WithControlInputs({recv2, e}));
-    Node* h = Binary(ops::NodeOut(recv2, 0), e, b2.opts().WithName("H"));
-    Node* send2 = SendFromHost(
-        ops::NodeOut(key_constant, 0), "host_compute_channel_F1_O2", {h},
-        b2.opts().WithName("outside_compilation_F1_O2_send"));
+                     b2.opts()
+                         .WithName("G")
+                         .WithControlInputs({recv2, e})
+                         .WithAttr("_encapsulate", "F1")
+                         .WithAttr("_outside", "O2"));
+    Node* h = Binary(ops::NodeOut(recv2, 0), e,
+                     b2.opts()
+                         .WithName("H")
+                         .WithAttr("_encapsulate", "F1")
+                         .WithAttr("_outside", "O2"));
+    Node* send2 =
+        SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O2", {h}, b2.opts());
 
     Node* s = Sequencer(b2.opts()
                             .WithName("F1_sequencer")
@@ -1139,14 +1162,14 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutside) {
     GraphDefBuilder shape(GraphDefBuilder::kFailImmediately);
     Node* key_constant =
         KeyPlaceholderShape(shape.opts().WithName("KnownShape/_0"));
-    Node* recv =
-        RecvAtHost(ops::NodeOut(key_constant, 0), "host_compute_channel_F1_O1",
-                   {DT_FLOAT, DT_FLOAT},
-                   shape.opts().WithName("outside_compilation_F1_O1_recv"));
+    Node* recv = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1",
+                            {DT_FLOAT, DT_FLOAT}, shape.opts());
     Node* e = Binary(ops::NodeOut(recv, 0), ops::NodeOut(recv, 1),
-                     shape.opts().WithName("E"));
-    SendFromHost(ops::NodeOut(key_constant, 0), "host_compute_channel_F1_O1",
-                 {e}, shape.opts().WithName("outside_compilation_F1_O1_send"));
+                     shape.opts()
+                         .WithName("E")
+                         .WithAttr("_encapsulate", "F1")
+                         .WithAttr("_outside", "O1"));
+    SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {e}, shape.opts());
     TF_EXPECT_OK(
         AddGraphDefToFunctionLibrary(shape, "F1_O1", &library_expected));
   }
@@ -1173,7 +1196,8 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutside) {
             {"key", "host_compute_channel_F1_O1"},
             {"shape_inference_graph",
              "_outside_compilation_shape_inference_F1_O1"},
-            {"shapes", gtl::ArraySlice<DataType>({})}},
+            {"shapes", gtl::ArraySlice<DataType>({})},
+            {"_outside_compilation_subgraph", "O1"}},
            {"D"}},
       },
       {{"d_0_retval", "D:o:0"}, {"f_0_retval", "F:o:0"}});
@@ -1194,7 +1218,8 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutside) {
             {"key", "host_compute_channel_F2_O1"},
             {"shape_inference_graph", ""},
             {"shapes",
-             gtl::ArraySlice<TensorShapeProto>({shape_proto_expected})}}},
+             gtl::ArraySlice<TensorShapeProto>({shape_proto_expected})},
+            {"_outside_compilation_subgraph", "O1"}}},
       },
       {{"g_0_retval", "G:o:0"}, {"i_0_retval", "I:o:0"}});
 
@@ -1207,17 +1232,16 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutside) {
 
     Node* key_constant1 =
         KeyPlaceholder("F1", b2.opts().WithName("F1_key_placeholder"));
-    Node* recv1 =
-        RecvAtHost(ops::NodeOut(key_constant1, 0), "host_compute_channel_F1_O1",
-                   {DT_FLOAT, DT_FLOAT},
-                   b2.opts().WithName("outside_compilation_F1_O1_recv"));
+    Node* recv1 = RecvAtHost(ops::NodeOut(key_constant1, 0), "F1", "O1",
+                             {DT_FLOAT, DT_FLOAT}, b2.opts());
     Node* e = Binary(ops::NodeOut(recv1, 0), ops::NodeOut(recv1, 1),
-                     b2.opts().WithName("E").WithControlInputs({recv1, b}));
-    Node* send1 = SendFromHost(ops::NodeOut(key_constant1, 0),
-                               "host_compute_channel_F1_O1", {e},
-                               b2.opts()
-                                   .WithName("outside_compilation_F1_O1_send")
-                                   .WithControlInput(e));
+                     b2.opts()
+                         .WithName("E")
+                         .WithControlInputs({recv1, b})
+                         .WithAttr("_encapsulate", "F1")
+                         .WithAttr("_outside", "O1"));
+    Node* send1 = SendFromHost(ops::NodeOut(key_constant1, 0), "F1", "O1", {e},
+                               b2.opts().WithControlInput(e));
     Node* s1 = Sequencer(
         b2.opts().WithName("F1_sequencer").WithControlInputs({recv1, send1}),
         "F1");
@@ -1229,13 +1253,15 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutside) {
 
     Node* key_constant2 =
         KeyPlaceholder("F2", b2.opts().WithName("F2_key_placeholder"));
-    Node* recv2 = RecvAtHost(
-        ops::NodeOut(key_constant2, 0), "host_compute_channel_F2_O1",
-        {DT_FLOAT}, b2.opts().WithName("outside_compilation_F2_O1_recv"));
-    Node* h = Binary(ops::NodeOut(call1, 1), recv2, b2.opts().WithName("H"));
-    Node* send2 = SendFromHost(
-        ops::NodeOut(key_constant2, 0), "host_compute_channel_F2_O1", {h},
-        b2.opts().WithName("outside_compilation_F2_O1_send"));
+    Node* recv2 = RecvAtHost(ops::NodeOut(key_constant2, 0), "F2", "O1",
+                             {DT_FLOAT}, b2.opts());
+    Node* h = Binary(ops::NodeOut(call1, 1), recv2,
+                     b2.opts()
+                         .WithName("H")
+                         .WithAttr("_encapsulate", "F2")
+                         .WithAttr("_outside", "O1"));
+    Node* send2 = SendFromHost(ops::NodeOut(key_constant2, 0), "F2", "O1", {h},
+                               b2.opts());
 
     Node* s2 = Sequencer(
         b2.opts().WithName("F2_sequencer").WithControlInputs({recv2, send2}),
@@ -1300,7 +1326,8 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationNoInputs) {
             {"key", "host_compute_channel_F1_O1"},
             {"shape_inference_graph", ""},
             {"shapes",
-             gtl::ArraySlice<TensorShapeProto>({shape_proto_expected})}}},
+             gtl::ArraySlice<TensorShapeProto>({shape_proto_expected})},
+            {"_outside_compilation_subgraph", "O1"}}},
       },
       {{"f_0_retval", "F:o:0"}});
 
@@ -1311,12 +1338,14 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationNoInputs) {
     Node* a = InputShaped(b2.opts().WithName("A"));
     Node* b = Input(b2.opts().WithName("B"));
 
-    Node* e = Unary(a, b2.opts().WithName("E"));
+    Node* e = Unary(a, b2.opts()
+                           .WithName("E")
+                           .WithAttr("_encapsulate", "F1")
+                           .WithAttr("_outside", "O1"));
     Node* key_constant =
         KeyPlaceholder("F1", b2.opts().WithName("F1_key_placeholder"));
-    Node* send1 = SendFromHost(
-        ops::NodeOut(key_constant, 0), "host_compute_channel_F1_O1", {e},
-        b2.opts().WithName("outside_compilation_F1_O1_send"));
+    Node* send1 =
+        SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {e}, b2.opts());
     Node* s1 = Sequencer(
         b2.opts().WithName("F1_sequencer").WithControlInput(send1), "F1");
     NodeBuilder node_builder1("F1", "F1", lib_def.get());
@@ -1380,7 +1409,8 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationControlInput) {
             {"key", "host_compute_channel_F1_O1"},
             {"shape_inference_graph", ""},
             {"shapes",
-             gtl::ArraySlice<TensorShapeProto>({shape_proto_expected})}},
+             gtl::ArraySlice<TensorShapeProto>({shape_proto_expected})},
+            {"_outside_compilation_subgraph", "O1"}},
            {"D"}},
       },
       {{"f_0_retval", "F:o:0"}});
@@ -1395,12 +1425,14 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationControlInput) {
     Node* key_constant =
         KeyPlaceholder("F1", b2.opts().WithName("F1_key_placeholder"));
     Node* recv1 =
-        RecvAtHost(ops::NodeOut(key_constant, 0), "host_compute_channel_F1_O1",
-                   {}, b2.opts().WithName("outside_compilation_F1_O1_recv"));
-    Node* e = Unary(a, b2.opts().WithName("E").WithControlInput(recv1));
-    Node* send1 = SendFromHost(
-        ops::NodeOut(key_constant, 0), "host_compute_channel_F1_O1", {e},
-        b2.opts().WithName("outside_compilation_F1_O1_send"));
+        RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1", {}, b2.opts());
+    Node* e = Unary(a, b2.opts()
+                           .WithName("E")
+                           .WithControlInput(recv1)
+                           .WithAttr("_encapsulate", "F1")
+                           .WithAttr("_outside", "O1"));
+    Node* send1 =
+        SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {e}, b2.opts());
     Node* s1 = Sequencer(
         b2.opts().WithName("F1_sequencer").WithControlInputs({recv1, send1}),
         "F1");
@@ -1457,7 +1489,8 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationNoOutputs) {
             {"Toutputs", gtl::ArraySlice<DataType>({})},
             {"key", "host_compute_channel_F1_O1"},
             {"shape_inference_graph", ""},
-            {"shapes", gtl::ArraySlice<TensorShapeProto>({})}}},
+            {"shapes", gtl::ArraySlice<TensorShapeProto>({})},
+            {"_outside_compilation_subgraph", "O1"}}},
       },
       {{"f_0_retval", "F:o:0"}});
 
@@ -1470,10 +1503,12 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationNoOutputs) {
 
     Node* key_constant =
         KeyPlaceholder("F1", b2.opts().WithName("F1_key_placeholder"));
-    Node* recv1 = RecvAtHost(
-        ops::NodeOut(key_constant, 0), "host_compute_channel_F1_O1", {DT_FLOAT},
-        b2.opts().WithName("outside_compilation_F1_O1_recv"));
-    Node* e = Unary(recv1, b2.opts().WithName("E"));
+    Node* recv1 = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1",
+                             {DT_FLOAT}, b2.opts());
+    Node* e = Unary(recv1, b2.opts()
+                               .WithName("E")
+                               .WithAttr("_encapsulate", "F1")
+                               .WithAttr("_outside", "O1"));
     Node* s1 = Sequencer(
         b2.opts().WithName("F1_sequencer").WithControlInput(recv1), "F1");
     NodeBuilder node_builder1("F1", "F1", lib_def.get());
@@ -1534,7 +1569,8 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationControlOutput) {
             {"Toutputs", gtl::ArraySlice<DataType>({})},
             {"key", "host_compute_channel_F1_O1"},
             {"shape_inference_graph", ""},
-            {"shapes", gtl::ArraySlice<TensorShapeProto>({})}}},
+            {"shapes", gtl::ArraySlice<TensorShapeProto>({})},
+            {"_outside_compilation_subgraph", "O1"}}},
       },
       {{"f_0_retval", "F:o:0"}});
 
@@ -1547,15 +1583,14 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationControlOutput) {
 
     Node* key_constant =
         KeyPlaceholder("F1", b2.opts().WithName("F1_key_placeholder"));
-    Node* recv1 = RecvAtHost(
-        ops::NodeOut(key_constant, 0), "host_compute_channel_F1_O1", {DT_FLOAT},
-        b2.opts().WithName("outside_compilation_F1_O1_recv"));
-    Node* e = Unary(recv1, b2.opts().WithName("E"));
-    Node* send1 = SendFromHost(ops::NodeOut(key_constant, 0),
-                               "host_compute_channel_F1_O1", {},
-                               b2.opts()
-                                   .WithName("outside_compilation_F1_O1_send")
-                                   .WithControlInput(e));
+    Node* recv1 = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1",
+                             {DT_FLOAT}, b2.opts());
+    Node* e = Unary(recv1, b2.opts()
+                               .WithName("E")
+                               .WithAttr("_encapsulate", "F1")
+                               .WithAttr("_outside", "O1"));
+    Node* send1 = SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {},
+                               b2.opts().WithControlInput(e));
     Node* s1 = Sequencer(
         b2.opts().WithName("F1_sequencer").WithControlInputs({recv1, send1}),
         "F1");
@@ -1615,7 +1650,10 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationNoInputsOrOutputs) {
     Node* a = Input(b2.opts().WithName("A"));
     Node* b = Input(b2.opts().WithName("B"));
 
-    Node* e = Unary(a, b2.opts().WithName("E"));
+    Node* e = Unary(a, b2.opts()
+                           .WithName("E")
+                           .WithAttr("_encapsulate", "F1")
+                           .WithAttr("_outside", "O1"));
     NodeBuilder node_builder1("F1", "F1", lib_def.get());
     node_builder1.Input(a).Input(b);
     Node* call1 = b2.opts().FinalizeBuilder(&node_builder1);
@@ -1666,12 +1704,14 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationShapeInference) {
     Node* key_constant =
         KeyPlaceholderShape(shape.opts().WithName("KnownShape/_0"));
     Node* known = KnownShape({2}, shape.opts().WithName("KnownShape/_1"));
-    Node* recv = RecvAtHost(
-        ops::NodeOut(key_constant, 0), "host_compute_channel_F1_O1", {DT_FLOAT},
-        shape.opts().WithName("outside_compilation_F1_O1_recv"));
-    Node* e = BinaryUnknownShape(known, recv, shape.opts().WithName("E"));
-    SendFromHost(ops::NodeOut(key_constant, 0), "host_compute_channel_F1_O1",
-                 {e}, shape.opts().WithName("outside_compilation_F1_O1_send"));
+    Node* recv = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1",
+                            {DT_FLOAT}, shape.opts());
+    Node* e = BinaryUnknownShape(known, recv,
+                                 shape.opts()
+                                     .WithName("E")
+                                     .WithAttr("_encapsulate", "F1")
+                                     .WithAttr("_outside", "O1"));
+    SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {e}, shape.opts());
     TF_EXPECT_OK(
         AddGraphDefToFunctionLibrary(shape, "F1_O1", &library_expected));
   }
@@ -1694,7 +1734,8 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationShapeInference) {
             {"key", "host_compute_channel_F1_O1"},
             {"shape_inference_graph",
              "_outside_compilation_shape_inference_F1_O1"},
-            {"shapes", gtl::ArraySlice<DataType>({})}},
+            {"shapes", gtl::ArraySlice<DataType>({})},
+            {"_outside_compilation_subgraph", "O1"}},
            {"c"}},
       },
       {{"f_0_retval", "F:o:0"}});
@@ -1709,17 +1750,16 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationShapeInference) {
 
     Node* key_constant =
         KeyPlaceholder("F1", b2.opts().WithName("F1_key_placeholder"));
-    Node* recv = RecvAtHost(
-        ops::NodeOut(key_constant, 0), "host_compute_channel_F1_O1", {DT_FLOAT},
-        b2.opts().WithName("outside_compilation_F1_O1_recv"));
-    Node* e = BinaryUnknownShape(
-        c, ops::NodeOut(recv, 0),
-        b2.opts().WithName("E").WithControlInputs({recv, b}));
-    Node* send = SendFromHost(ops::NodeOut(key_constant, 0),
-                              "host_compute_channel_F1_O1", {e},
-                              b2.opts()
-                                  .WithName("outside_compilation_F1_O1_send")
-                                  .WithControlInput(e));
+    Node* recv = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O1",
+                            {DT_FLOAT}, b2.opts());
+    Node* e = BinaryUnknownShape(c, ops::NodeOut(recv, 0),
+                                 b2.opts()
+                                     .WithName("E")
+                                     .WithControlInputs({recv, b})
+                                     .WithAttr("_encapsulate", "F1")
+                                     .WithAttr("_outside", "O1"));
+    Node* send = SendFromHost(ops::NodeOut(key_constant, 0), "F1", "O1", {e},
+                              b2.opts().WithControlInput(e));
 
     Node* s = Sequencer(
         b2.opts().WithName("F1_sequencer").WithControlInputs({recv, send}),
diff --git a/tensorflow/compiler/jit/graph_to_functiondef.cc b/tensorflow/compiler/jit/graph_to_functiondef.cc
index 6fa21fa6204dcc9446081d07e2a59ccace216713..8f5e11dfa47956f1fdaa4d1ff115affa375c5c73 100644
--- a/tensorflow/compiler/jit/graph_to_functiondef.cc
+++ b/tensorflow/compiler/jit/graph_to_functiondef.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/graph/graph.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 
 namespace tensorflow {
@@ -229,7 +230,7 @@ Status GraphToFunctionDef(const Graph& graph, const string& name,
   for (int n_index = 0; n_index < fdef->node_def_size(); ++n_index) {
     NodeDef* node_def = fdef->mutable_node_def(n_index);
     for (int i = 0; i < node_def->input_size(); ++i) {
-      if (StringPiece(node_def->input(i)).starts_with("^")) {
+      if (str_util::StartsWith(node_def->input(i), "^")) {
         // Control input
         const string normalized =
             node_names.Renormalize(node_def->input(i).substr(1));
diff --git a/tensorflow/compiler/jit/graphcycles/BUILD b/tensorflow/compiler/jit/graphcycles/BUILD
index 15507b3851751c681044a744c07c247410fb3e2d..676f71a75aede2a7720ae0c8a579d64cc184509a 100644
--- a/tensorflow/compiler/jit/graphcycles/BUILD
+++ b/tensorflow/compiler/jit/graphcycles/BUILD
@@ -27,17 +27,3 @@ tf_cc_test(
         "//tensorflow/core:test_main",
     ],
 )
-
-# -----------------------------------------------------------------------------
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/compiler/jit/kernels/BUILD b/tensorflow/compiler/jit/kernels/BUILD
index 616a7f8f1541d3debff97a90bd390c76c665d196..00a6f4075f9a18efc3895b033eb6d08e36088a53 100644
--- a/tensorflow/compiler/jit/kernels/BUILD
+++ b/tensorflow/compiler/jit/kernels/BUILD
@@ -41,17 +41,3 @@ cc_library(
     ],
     alwayslink = 1,
 )
-
-# -----------------------------------------------------------------------------
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/compiler/jit/kernels/xla_launch_op.cc b/tensorflow/compiler/jit/kernels/xla_launch_op.cc
index 8a8e8bb8df1a8d0a40af054e6713616745224cc8..f48941fce329313e4484b3c2dd900eeac884ed34 100644
--- a/tensorflow/compiler/jit/kernels/xla_launch_op.cc
+++ b/tensorflow/compiler/jit/kernels/xla_launch_op.cc
@@ -116,11 +116,7 @@ void XlaLocalLaunchOp::Compute(OpKernelContext* ctx) {
 
   const XlaDevice::Metadata* metadata;
   Status s = XlaDevice::GetMetadata(ctx, &metadata);
-
-  XlaTensorInfoManager* tensor_info_manager = nullptr;
-  if (s.ok()) {
-    tensor_info_manager = &metadata->tensor_info_manager();
-  }
+  bool allocate_xla_tensors = s.ok();
 
   // Get the platform_id_ for XLA_* devices.
   if (platform_id_ == nullptr) {
@@ -134,8 +130,23 @@ void XlaLocalLaunchOp::Compute(OpKernelContext* ctx) {
 
   xla::LocalClient* client = static_cast<xla::LocalClient*>(cache->client());
 
-  // Builds an XLA allocator for the device.
-  XlaAllocator xla_allocator(client->platform(), ctx);
+  XlaAllocator local_xla_allocator(client->backend().platform(),
+                                   ctx->device()->GetAllocator({}));
+  xla::DeviceMemoryAllocator* xla_allocator;
+  // If we are on an XlaDevice, use the underlying XLA platform's allocator
+  // directly. We could use the StreamExecutor's allocator which may
+  // theoretically be more correct, but XLA returns a nice OOM message in a
+  // Status and StreamExecutor does not.
+  //
+  // Importantly we can't use ctx->device()->GetAllocator() as the allocator
+  // (which local_xla_allocator above uses) as on an XlaDevice, this is a
+  // dummy allocator that returns XlaTensor objects. The XlaCompiler needs a
+  // real allocator to allocate real buffers.
+  if (allocate_xla_tensors) {
+    xla_allocator = client->backend().memory_allocator();
+  } else {
+    xla_allocator = &local_xla_allocator;
+  }
 
   XlaCompiler::Options options;
   options.client = client;
@@ -143,7 +154,10 @@ void XlaLocalLaunchOp::Compute(OpKernelContext* ctx) {
   options.flib_def = ctx->function_library()->GetFunctionLibraryDefinition();
   options.graph_def_version = ctx->function_library()->graph_def_version();
   options.allow_cpu_custom_calls = (platform_id_ == gpu::host::kHostPlatformId);
-  options.device_allocator = &xla_allocator;
+  options.device_allocator = xla_allocator;
+  // TODO(b/77671268): We don't set variable_representation_shape_fn here. This
+  // is restricted to Variables, but we need something like this to apply to
+  // normal Tensors too.
 
   const XlaCompiler::CompilationResult* kernel;
   xla::LocalExecutable* executable;
@@ -159,17 +173,19 @@ void XlaLocalLaunchOp::Compute(OpKernelContext* ctx) {
   VLOG(1) << "Executing XLA Computation...";
 
   XlaComputationLaunchContext launch_context(
-      num_resource_args_, client, &xla_allocator, tensor_info_manager);
+      num_resource_args_, client, xla_allocator, allocate_xla_tensors);
   launch_context.PopulateInputs(ctx, kernel, variables);
 
   // Execute the computation.
   VLOG(2) << "Executing computation.";
   xla::ExecutableRunOptions run_options;
   run_options.set_stream(stream);
-  run_options.set_allocator(&xla_allocator);
+  run_options.set_allocator(xla_allocator);
   run_options.set_intra_op_thread_pool(&ctx->eigen_cpu_device());
+  run_options.set_rng_seed(ctx->step_id());
   Env* env = Env::Default();
   auto start_time = env->NowMicros();
+
   auto run_result = executable->Run(launch_context.arguments(), run_options);
   OP_REQUIRES(ctx, run_result.ok(), run_result.status());
 
diff --git a/tensorflow/compiler/jit/legacy_flags/BUILD b/tensorflow/compiler/jit/legacy_flags/BUILD
index 9cd66fc13c9e0658fdf105d5d9d92f0320ddd179..5d211f4d733d8d807426e62dd116092799184f35 100644
--- a/tensorflow/compiler/jit/legacy_flags/BUILD
+++ b/tensorflow/compiler/jit/legacy_flags/BUILD
@@ -63,17 +63,3 @@ cc_library(
             "//tensorflow/core:lib",
         ],
 )
-
-# -----------------------------------------------------------------------------
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass.cc b/tensorflow/compiler/jit/mark_for_compilation_pass.cc
index f651768a67278628e40445291d7fb271bb1ae611..386240ff8d1a562ff4894c40ac79022b8c61fb15 100644
--- a/tensorflow/compiler/jit/mark_for_compilation_pass.cc
+++ b/tensorflow/compiler/jit/mark_for_compilation_pass.cc
@@ -35,6 +35,7 @@ limitations under the License.
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/graph/algorithm.h"
 #include "tensorflow/core/graph/control_flow.h"
+#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/public/version.h"
 
@@ -432,6 +433,9 @@ string DescribeCycle(const GraphCycles& cycles, const Graph& graph, int src,
   }
 
   auto node_name = [&cycles, &graph](int node_id) {
+    if (!FastBoundsCheck(node_id, graph.num_node_ids())) {
+      return string("(null)");
+    }
     auto* node = graph.FindNodeId(node_id);
     if (node == nullptr) {
       return string("(null)");
@@ -728,11 +732,15 @@ Status MarkForCompilationPass::RunImpl(
     }
   }
 
-  // Count the number of elements in each cluster.
-  std::vector<int> cluster_sizes(graph->num_node_ids());
+  // Count the number of non-trivial elements in each cluster.
+  std::vector<int> effective_cluster_sizes(graph->num_node_ids());
   for (const Node* n : compilation_candidates) {
     int cluster = clusters[n->id()].Get().representative;
-    cluster_sizes[cluster]++;
+    // Identity nodes will be removed if the node gets marked for compilation.
+    // Therefore we don't want to count them towards the effective cluster size.
+    if (n->def().op() != "Identity") {
+      effective_cluster_sizes[cluster]++;
+    }
   }
 
   // Names for each cluster.
@@ -765,9 +773,12 @@ Status MarkForCompilationPass::RunImpl(
     const XlaOpRegistry::DeviceRegistration* registration;
     XlaOpRegistry::GetCompilationDevice(device_type.type(), &registration);
 
-    // Or compile if this is a cluster of >= min_cluster_size compilable
-    // operators.
-    if (cluster_sizes[cluster] >= min_cluster_size || marked_for_compilation ||
+    // Compile if this is a cluster of >= min_cluster_size compilable operators.
+    // Also, always compile if the operator is placed on a device that requires
+    // compilation, or if it contains at least one op that is marked for
+    // compilation that is not an Identity op.
+    if (effective_cluster_sizes[cluster] >= min_cluster_size ||
+        (effective_cluster_sizes[cluster] > 0 && marked_for_compilation) ||
         registration->requires_compilation) {
       string& name = cluster_names[cluster];
 
diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc b/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc
index 1a8858cccef623185709ab5dc2187a313dd130f7..80edaf28b83348e3a8071a6e9696bc3ebad5d70f 100644
--- a/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc
+++ b/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc
@@ -16,7 +16,9 @@ limitations under the License.
 #include "tensorflow/compiler/jit/mark_for_compilation_pass.h"
 
 #include "tensorflow/cc/framework/ops.h"
+#include "tensorflow/cc/ops/array_ops.h"
 #include "tensorflow/cc/ops/control_flow_ops_internal.h"
+#include "tensorflow/cc/ops/function_ops.h"
 #include "tensorflow/cc/ops/standard_ops.h"
 #include "tensorflow/compiler/jit/defs.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
@@ -27,6 +29,7 @@ limitations under the License.
 #include "tensorflow/core/graph/graph_def_builder.h"
 #include "tensorflow/core/graph/graph_def_builder_util.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
@@ -137,7 +140,7 @@ TEST(XlaCompilationTest, CompilableCycles) {
   EXPECT_EQ(clusters["A"], clusters["C"]);
 }
 
-TEST(XlaCompilationTest, UnsupportedTypes) {
+TEST(XlaCompilationTest, Complex128Unsupported) {
   std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
   GraphDef graphdef;
   {
@@ -157,6 +160,27 @@ TEST(XlaCompilationTest, UnsupportedTypes) {
   EXPECT_TRUE(clusters.empty());
 }
 
+TEST(XlaCompilationTest, HalfSupported) {
+  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
+  GraphDef graphdef;
+  {
+    GraphDefBuilder builder(GraphDefBuilder::kFailImmediately);
+    Tensor t(DT_HALF, TensorShape());
+    t.scalar<Eigen::half>()() = static_cast<Eigen::half>(0.0f);
+    Node* a = ops::SourceOp("Const", builder.opts()
+                                         .WithName("A")
+                                         .WithAttr("dtype", DT_HALF)
+                                         .WithAttr("value", t));
+    Node* b = ops::UnaryOp("Neg", a, builder.opts().WithName("B"));
+    ops::BinaryOp("MatMul", a, b, builder.opts().WithName("C"));
+    TF_EXPECT_OK(GraphDefBuilderToGraph(builder, graph.get()));
+  }
+
+  TF_ASSERT_OK(MarkForCompilation(&graph));
+  auto clusters = GetClusters(*graph);
+  EXPECT_FALSE(clusters.empty());
+}
+
 TEST(XlaCompilationTest, ConcatWithConstArg) {
   std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
   GraphDef graphdef;
@@ -519,11 +543,11 @@ TEST(XlaCompilationTest, IllegalCycle_UsefulErrorMessage) {
 
   Status status = MarkForCompilation(&graph);
   EXPECT_FALSE(status.ok());
-  EXPECT_TRUE(StringPiece(status.ToString())
-                  .contains("Edge from c to a would create a cycle.\n"
-                            "+-> a\n"
-                            "|   b\n"
-                            "+-- c\n"));
+  EXPECT_TRUE(str_util::StrContains(status.ToString(),
+                                    "Edge from c to a would create a cycle.\n"
+                                    "+-> a\n"
+                                    "|   b\n"
+                                    "+-- c\n"));
 }
 
 TEST(XlaCompilationTest, Retval) {
@@ -553,5 +577,37 @@ TEST(XlaCompilationTest, Retval) {
   EXPECT_EQ(clusters["A"], clusters["B"]);
 }
 
+TEST(XlaCompilationTest, DontCountIdentityOps) {
+  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
+  Scope root = Scope::NewRootScope().ExitOnError();
+  {
+    auto a = ops::_Arg(root.WithOpName("A"), DT_INT32, 0);
+    auto b = ops::Identity(root.WithOpName("B"), a);
+    auto c = ops::Identity(root.WithOpName("C"), b);
+    auto r = ops::_Retval(root.WithOpName("R"), c, 0);
+  }
+  TF_ASSERT_OK(root.ToGraph(graph.get()));
+  TF_ASSERT_OK(MarkForCompilation(&graph));
+  auto clusters = GetClusters(*graph);
+
+  EXPECT_TRUE(clusters.empty());
+}
+
+TEST(XlaCompilationTest, DontCountIdentityOpsWithLocalJit) {
+  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
+  Scope root = Scope::NewRootScope().ExitOnError();
+  {
+    auto a = ops::_Arg(root.WithOpName("A"), DT_INT32, 0);
+    auto b = ops::Identity(root.WithOpName("B"), a);
+    b.node()->AddAttr(kXlaCompileAttr, true);
+    auto r = ops::_Retval(root.WithOpName("R"), b, 0);
+  }
+  TF_ASSERT_OK(root.ToGraph(graph.get()));
+  TF_ASSERT_OK(MarkForCompilation(&graph));
+  auto clusters = GetClusters(*graph);
+
+  EXPECT_TRUE(clusters.empty());
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/ops/BUILD b/tensorflow/compiler/jit/ops/BUILD
index e5787ca4c8cff436e4404b8488970248b24a5eda..c9e46bc1475aed0e35a48765ad70eef4362e8281 100644
--- a/tensorflow/compiler/jit/ops/BUILD
+++ b/tensorflow/compiler/jit/ops/BUILD
@@ -17,17 +17,3 @@ cc_library(
     deps = ["//tensorflow/core:framework"],
     alwayslink = 1,
 )
-
-# -----------------------------------------------------------------------------
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/compiler/jit/producer_consumer_queue.h b/tensorflow/compiler/jit/producer_consumer_queue.h
new file mode 100644
index 0000000000000000000000000000000000000000..7c8c04152d2f3a0fd46711df24756b7e68b967ea
--- /dev/null
+++ b/tensorflow/compiler/jit/producer_consumer_queue.h
@@ -0,0 +1,132 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_JIT_PRODUCER_CONSUMER_QUEUE_H_
+#define TENSORFLOW_COMPILER_JIT_PRODUCER_CONSUMER_QUEUE_H_
+
+#include <deque>
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/mutex.h"
+
+namespace tensorflow {
+
+// A thread-safe, first-in-first-out queue.
+template <typename T>
+class ProducerConsumerQueue {
+ public:
+  ProducerConsumerQueue()
+      : capacity_(std::numeric_limits<std::size_t>::max()) {}
+  ~ProducerConsumerQueue() = default;
+
+  // Wait until the queue is non-full, then append a copy of v.
+  void Put(const T &v);
+
+  // Wait until the queue is non-empty, then remove and return the head value.
+  T Get();
+
+  // If the queue is non-empty, remove the head value, placing it in *pv, and
+  // return true; otherwise return false.
+  bool TryGet(T *pv);
+
+  // Set the capacity of the queue; the queue is full whenever count() >=
+  // capacity().  The initial value is the maximum size_t.  Requires size > 0.
+  void set_capacity(std::size_t size);
+
+  // Return the capacity of the queue.
+  std::size_t capacity() const;
+
+  // Return the number of elements in the queue.
+  std::size_t count() const;
+
+  // Implementation details follow.  Clients should ignore.
+ private:
+  mutable tensorflow::mutex mu_;  // protects all fields below
+  tensorflow::condition_variable non_empty_ GUARDED_BY(mu_);
+  tensorflow::condition_variable non_full_ GUARDED_BY(mu_);
+  std::size_t capacity_ GUARDED_BY(mu_);
+  std::deque<T> queue_ GUARDED_BY(mu_);
+
+  TF_DISALLOW_COPY_AND_ASSIGN(ProducerConsumerQueue);
+};
+
+// ------------------------------------------------------
+// Implementation details follow.  Clients should ignore.
+
+// Wait until the queue is non-full, then append a copy of v.
+template <typename T>
+void ProducerConsumerQueue<T>::Put(const T &v) {
+  mutex_lock lock(mu_);
+  while (queue_.size() >= capacity_) {
+    non_full_.wait(lock);
+  }
+  queue_.push_back(v);
+  non_empty_.notify_one();
+}
+
+// Wait until the queue is non-empty, then remove and return the head value.
+template <typename T>
+T ProducerConsumerQueue<T>::Get() {
+  mutex_lock lock(mu_);
+  while (queue_.empty()) {
+    non_empty_.wait(lock);
+  }
+  non_full_.notify_one();
+  T result_value = queue_.front();
+  queue_.pop_front();
+  return result_value;
+}
+
+// If the queue is non-empty, remove the head value, placing it in *pv, and
+// return true; otherwise return false.
+template <typename T>
+bool ProducerConsumerQueue<T>::TryGet(T *pv) {
+  mutex_lock lock(mu_);
+  bool got_element = !queue_.empty();
+  if (got_element) {
+    non_full_.notify_one();
+    *pv = queue_.front();
+    queue_.pop_front();
+  }
+  return got_element;
+}
+
+// Set the capacity of the queue; the queue is full whenever count() >=
+// capacity().  The initial value is the maximum size_t.  Requires size > 0.
+template <typename T>
+void ProducerConsumerQueue<T>::set_capacity(std::size_t size) {
+  mutex_lock lock(mu_);
+  CHECK_NE(size, 0);
+  capacity_ = size;
+  non_full_.notify_all();
+}
+
+// Return the capacity of the queue.
+template <typename T>
+std::size_t ProducerConsumerQueue<T>::capacity() const {
+  mutex_lock lock(mu_);
+  std::size_t max_elements = capacity_;
+  return max_elements;
+}
+
+// Return the number of elements in the queue.
+template <typename T>
+std::size_t ProducerConsumerQueue<T>::count() const {
+  mutex_lock lock(mu_);
+  std::size_t num_elements = queue_.size();
+  return num_elements;
+}
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_COMPILER_JIT_PRODUCER_CONSUMER_QUEUE_H_
diff --git a/tensorflow/compiler/jit/producer_consumer_queue_test.cc b/tensorflow/compiler/jit/producer_consumer_queue_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f61260c6e52756ee039829afdc7452f5f760c221
--- /dev/null
+++ b/tensorflow/compiler/jit/producer_consumer_queue_test.cc
@@ -0,0 +1,139 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/jit/producer_consumer_queue.h"
+
+#include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace {
+
+typedef ProducerConsumerQueue<int> IntQueue;
+
+// Insert integers between low inclusive and high exclusive into q.
+void PushRange(IntQueue *q, int low, int high) {
+  while (low != high) {
+    q->Put(low);
+    VLOG(2) << "Pushing " << low;
+    ++low;
+  }
+}
+
+// Push the numbers between 0 and 999 inclusive from several threads in the
+// pool.
+void PushRanges(IntQueue *queue, thread::ThreadPool *pool) {
+  VLOG(1) << "Adding 20-36";
+  pool->Schedule([queue] { PushRange(queue, 20, 36); });
+  VLOG(1) << "Adding 7-20";
+  pool->Schedule([queue] { PushRange(queue, 7, 20); });
+  VLOG(1) << "Adding 36-501";
+  pool->Schedule([queue] { PushRange(queue, 36, 501); });
+  VLOG(1) << "Adding 501-1000";
+  pool->Schedule([queue] { PushRange(queue, 501, 1000); });
+  VLOG(1) << "Adding 0-5";
+  pool->Schedule([queue] { PushRange(queue, 0, 5); });
+  VLOG(1) << "Adding 5-7";
+  pool->Schedule([queue] { PushRange(queue, 5, 7); });
+}
+
+// Pop elements from queue using Get().  Make sure that exactly <high> elements
+// were present and their values are all integers between 0 and high-1
+// inclusive.
+void GetRange(IntQueue *queue, int high) {
+  VLOG(1) << "Testing Wait";
+  std::vector<int> results;
+  for (int i = 0; i != high; ++i) {
+    int r = queue->Get();
+    VLOG(2) << "Waited and got " << r;
+    results.push_back(r);
+  }
+  CHECK_EQ(queue->count(), 0);
+  std::sort(results.begin(), results.end());
+  for (int i = 0; i != high; ++i) {
+    CHECK(results[i] == i);
+  }
+}
+
+// Pop elements from queue using TryGet().  Make sure that exactly <high>
+// elements were present and their values are all integers between 0 and high-1
+// inclusive.
+void TryGetRange(IntQueue *queue, int high) {
+  std::vector<int> results;
+  // Give up if we don't get all the elements back from the queue
+  // in 10 seconds.
+  int timeout = 10;
+  int r;
+  for (int i = 0; i != high; ++i) {
+    while (!queue->TryGet(&r)) {
+      if (!timeout--) {
+        LOG(FATAL) << "Can't find all elements in the queue";
+      }
+      VLOG(1) << "Sleeping for a second...";
+      sleep(1);
+    }
+    VLOG(2) << "Popped " << r;
+    results.push_back(r);
+  }
+  CHECK_EQ(queue->count(), 0);
+  CHECK(!queue->TryGet(&r));
+  std::sort(results.begin(), results.end());
+  for (int i = 0; i != high; ++i) {
+    CHECK_EQ(i, results[i]);
+  }
+}
+
+const int kNumThreads = 15;
+
+TEST(ProducerConsumerQueue, GetRange) {
+  IntQueue queue;
+  {
+    thread::ThreadPool pool(Env::Default(), "test", kNumThreads);
+    PushRanges(&queue, &pool);
+  }
+  GetRange(&queue, 1000);
+}
+
+TEST(ProducerConsumerQueue, TryGetRange) {
+  IntQueue queue;
+  {
+    thread::ThreadPool pool(Env::Default(), "test", kNumThreads);
+    PushRanges(&queue, &pool);
+  }
+  TryGetRange(&queue, 1000);
+}
+
+TEST(ProducerConsumerQueue, ParallelGetRange) {
+  IntQueue queue;
+  {
+    thread::ThreadPool pool(Env::Default(), "test", kNumThreads);
+    pool.Schedule([&queue] { GetRange(&queue, 1000); });
+    PushRanges(&queue, &pool);
+  }
+}
+
+TEST(ProducerConsumerQueue, ParallelTryGetRange) {
+  IntQueue queue;
+  {
+    thread::ThreadPool pool(Env::Default(), "test", kNumThreads);
+    pool.Schedule([&queue] { TryGetRange(&queue, 1000); });
+    PushRanges(&queue, &pool);
+  }
+}
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/xla_compilation_cache.h b/tensorflow/compiler/jit/xla_compilation_cache.h
index 5c0c79b880c474969464f23b4485734c404cef07..be1043d8c3fc0573922837e541615114a6d7a1a5 100644
--- a/tensorflow/compiler/jit/xla_compilation_cache.h
+++ b/tensorflow/compiler/jit/xla_compilation_cache.h
@@ -52,13 +52,14 @@ class XlaCompilationCache : public ResourceBase {
   // Compiles a function into a XlaCompiler::CompilationResult that can be used
   // to execute an XLA Computation. Compilation results are cached.
   // `function` is the name of a Tensorflow function to compile.
-  // `constant_args` is a maps of tensorflow argument number to constant value.
+  // `constant_args` is a map of tensorflow argument number to its constant
+  //  value.
   // `variable_args` is a snapshot of the current values of the
   // resource variable arguments to `function`; uninitialized variables are
   // represented by an absent OptionalTensor.
   // The result of compilation is written to `*compilation_result`, which must
   // be non-null. If `executable` is non-null, also builds an
-  // xla::LocalExecutable and sets `executable to point to it. The resulting
+  // xla::LocalExecutable and sets `executable` to point to it. The resulting
   // executable pointer may be null if the computation has no non-constant
   // outputs.
   Status Compile(const XlaCompiler::Options& options,
@@ -96,6 +97,7 @@ class XlaCompilationCache : public ResourceBase {
                      xla::LocalExecutable** executable,
                      const XlaCompiler::CompileOptions* compile_options,
                      bool compile_single_op);
+
   // Takes `result` which has been compiled from a Tensorflow subgraph to a
   // XLA computation already, and generates an XLA LocalExecutable `executable`.
   Status BuildExecutable(const XlaCompiler::Options& options,
diff --git a/tensorflow/compiler/jit/xla_compile_on_demand_op.cc b/tensorflow/compiler/jit/xla_compile_on_demand_op.cc
index 915b9ce84ab8268ef4e652351bc981aa5bf7b10c..682d6ea8ccc4a54912ccad4666cf0a7a03a7a698 100644
--- a/tensorflow/compiler/jit/xla_compile_on_demand_op.cc
+++ b/tensorflow/compiler/jit/xla_compile_on_demand_op.cc
@@ -51,12 +51,10 @@ Status XlaCompileOnDemandOp::Run(OpKernelContext* ctx,
   int64 num_resource_args = variables.size();
 
   xla::LocalClient* client = metadata.client();
-  XlaTensorInfoManager* tensor_info_manager = &metadata.tensor_info_manager();
 
   // Builds an XLA allocator for the device.
-  XlaAllocator xla_allocator(client->platform(), ctx);
   XlaComputationLaunchContext launch_context(
-      num_resource_args, client, &xla_allocator, tensor_info_manager);
+      num_resource_args, client, client->backend().memory_allocator(), true);
 
   launch_context.PopulateInputs(ctx, result, variables);
 
@@ -67,7 +65,7 @@ Status XlaCompileOnDemandOp::Run(OpKernelContext* ctx,
   VLOG(2) << "Executing computation.";
   xla::ExecutableRunOptions run_options;
   run_options.set_stream(stream);
-  run_options.set_allocator(&xla_allocator);
+  run_options.set_allocator(client->backend().memory_allocator());
   run_options.set_intra_op_thread_pool(&ctx->eigen_cpu_device());
 
   auto run_result = executable->Run(launch_context.arguments(), run_options);
@@ -106,16 +104,13 @@ Status XlaCompileOnDemandOp::Compile(
     OpKernelContext* ctx, const XlaDevice::Metadata& metadata,
     const XlaCompiler::CompilationResult** result,
     xla::LocalExecutable** executable) {
-  XlaTensorInfoManager* tensor_info_manager = &metadata.tensor_info_manager();
-
   std::map<int, Tensor> constant_arguments;
   for (int64 i = 0; i < ctx->num_inputs(); ++i) {
     const Tensor& device_tensor = ctx->input(i);
-    if (const XlaTensorInfo* tensor_info =
-            tensor_info_manager->GetTensorInfo(device_tensor)) {
-      if (tensor_info->has_host_tensor() &&
+    if (const XlaTensor* xla_tensor = XlaTensor::FromTensor(&device_tensor)) {
+      if (xla_tensor->has_host_tensor() &&
           ShouldArgumentBeConstant(&ctx->op_kernel(), i)) {
-        constant_arguments[i] = tensor_info->host_tensor();
+        constant_arguments[i] = xla_tensor->host_tensor();
       }
     }
     if (constant_arguments.count(i) == 0 &&
@@ -123,8 +118,10 @@ Status XlaCompileOnDemandOp::Compile(
       // Slow path; the argument is not available as a host constant so we must
       // fetch it synchronously.
       Tensor host_tensor;
+      AllocatorAttributes attrs;
+      attrs.set_on_host(true);
       TF_RETURN_IF_ERROR(ctx->allocate_temp(
-          device_tensor.dtype(), device_tensor.shape(), &host_tensor));
+          device_tensor.dtype(), device_tensor.shape(), &host_tensor, attrs));
       Notification n;
       ctx->op_device_context()->CopyDeviceTensorToCPU(
           &device_tensor, "ConstantArgument",
diff --git a/tensorflow/compiler/jit/xla_cpu_device.cc b/tensorflow/compiler/jit/xla_cpu_device.cc
index d2dfdeea68129b536477aa75f66c9d267f5a9434..bc07dbd7bdf005fde781f7a1e6775080e363abfb 100644
--- a/tensorflow/compiler/jit/xla_cpu_device.cc
+++ b/tensorflow/compiler/jit/xla_cpu_device.cc
@@ -62,8 +62,8 @@ REGISTER_LOCAL_DEVICE_FACTORY(DEVICE_XLA_CPU, XlaCpuDeviceFactory);
 
 // Kernel registrations
 
-constexpr std::array<DataType, 6> kAllXlaCpuTypes = {
-    {DT_INT32, DT_INT64, DT_FLOAT, DT_DOUBLE, DT_COMPLEX64, DT_BOOL}};
+constexpr std::array<DataType, 7> kAllXlaCpuTypes = {
+    {DT_INT32, DT_INT64, DT_HALF, DT_FLOAT, DT_DOUBLE, DT_COMPLEX64, DT_BOOL}};
 
 REGISTER_XLA_LAUNCH_KERNEL(DEVICE_XLA_CPU, XlaLocalLaunchOp, kAllXlaCpuTypes);
 REGISTER_XLA_DEVICE_KERNELS(DEVICE_XLA_CPU, kAllXlaCpuTypes);
diff --git a/tensorflow/compiler/jit/xla_device.cc b/tensorflow/compiler/jit/xla_device.cc
index 82048f5d78957dfeaf9656d332374ba86a5e920b..12f471735f68394a3079541e9ac8532e329bd694 100644
--- a/tensorflow/compiler/jit/xla_device.cc
+++ b/tensorflow/compiler/jit/xla_device.cc
@@ -100,7 +100,7 @@ XlaDeviceAllocator* XlaDeviceAllocatorState::GetOrCreateXlaDeviceAllocator(
   }
 
   std::unique_ptr<XlaDeviceAllocator> alloc =
-      xla::MakeUnique<XlaDeviceAllocator>(backend, device_ordinal);
+      xla::MakeUnique<XlaDeviceAllocator>();
   XlaDeviceAllocator* alloc_ptr = alloc.get();
   state.allocators_[{backend, device_ordinal}] = std::move(alloc);
   return alloc_ptr;
@@ -136,13 +136,11 @@ XlaDeviceAllocator* XlaDeviceAllocatorState::GetOrCreateXlaDeviceAllocator(
   return Status::OK();
 }
 
-XlaDevice::Metadata::Metadata(
-    int device_ordinal, se::Platform* platform, const DeviceType& device_type,
-    std::unique_ptr<XlaTensorInfoManager>* tensor_info_manager)
+XlaDevice::Metadata::Metadata(int device_ordinal, se::Platform* platform,
+                              const DeviceType& device_type)
     : device_ordinal_(device_ordinal),
       device_type_(device_type),
-      platform_(platform),
-      tensor_info_manager_(*tensor_info_manager) {}
+      platform_(platform) {}
 
 int XlaDevice::Metadata::device_ordinal() const { return device_ordinal_; }
 
@@ -157,12 +155,9 @@ const DeviceType& XlaDevice::Metadata::jit_device_type() const {
   return device_type_;
 }
 
-XlaTensorInfoManager& XlaDevice::Metadata::tensor_info_manager() const {
-  return *tensor_info_manager_;
-}
-
 /* static */ Status XlaDevice::GetMetadata(OpKernelContext* ctx,
                                            const Metadata** metadata) {
+  *metadata = nullptr;
   XlaDevice* xla_device =
       dynamic_cast<XlaDevice*>(ctx->device()->UnderlyingDevice());
   if (xla_device == nullptr) {
@@ -181,15 +176,11 @@ XlaDevice::XlaDevice(const SessionOptions& options,
                      const DeviceType& jit_device_name, se::Platform* platform,
                      bool transfer_as_literal)
     : LocalDevice(options, attrs),
-      xla_metadata_(
-          device_ordinal, platform, jit_device_name,
-          // Pass tensor_info_manager_ by reference as it is initialized lazily.
-          &tensor_info_manager_),
+      xla_metadata_(device_ordinal, platform, jit_device_name),
       device_ordinal_(device_ordinal),
       jit_device_name_(jit_device_name),
       xla_allocator_(nullptr),
       platform_(platform),
-      tensor_info_manager_(nullptr),
       transfer_as_literal_(transfer_as_literal) {}
 
 XlaDevice::~XlaDevice() {}
@@ -215,7 +206,6 @@ Allocator* XlaDevice::GetAllocator(AllocatorAttributes attr) {
     xla::Backend* backend = client()->mutable_backend();
     xla_allocator_ = XlaDeviceAllocatorState::GetOrCreateXlaDeviceAllocator(
         backend, device_ordinal_);
-    tensor_info_manager_.reset(new XlaTensorInfoManager(xla_allocator_));
   }
   return xla_allocator_;
 }
@@ -236,8 +226,7 @@ Status XlaDevice::FillContextMap(const Graph* graph,
   // Call GetAllocator for the side-effect of ensuring the allocator and
   // XlaTensorInfoManager is created.
   (void)GetAllocator({});
-  auto ctx = new XlaDeviceContext(stream, tensor_info_manager_.get(),
-                                  transfer_as_literal_);
+  auto ctx = new XlaDeviceContext(stream, client(), transfer_as_literal_);
   for (Node* n : graph->nodes()) {
     VLOG(2) << n->id() << " : " << n->type_string() << " : " << n->name();
     ctx->Ref();
@@ -285,8 +274,7 @@ Status XlaDevice::MakeTensorFromProto(const TensorProto& tensor_proto,
     Tensor copy(GetAllocator(alloc_attrs), parsed.dtype(), parsed.shape());
     Notification n;
     TF_ASSIGN_OR_RETURN(se::Stream * stream, GetStream());
-    XlaTransferManager manager(stream, tensor_info_manager_.get(),
-                               transfer_as_literal_);
+    XlaTransferManager manager(stream, client(), transfer_as_literal_);
     manager.CopyCPUTensorToDevice(&parsed, this, &copy,
                                   [&n, &status](const Status& s) {
                                     status = s;
diff --git a/tensorflow/compiler/jit/xla_device.h b/tensorflow/compiler/jit/xla_device.h
index 9cd9167e523961c0ddd99fbc9ca9bdc20b9be7b5..4fe7dd8c9fa9eb954804555e9615160dc4bc3e8a 100644
--- a/tensorflow/compiler/jit/xla_device.h
+++ b/tensorflow/compiler/jit/xla_device.h
@@ -26,7 +26,7 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_JIT_XLA_DEVICE_H_
 #define TENSORFLOW_COMPILER_JIT_XLA_DEVICE_H_
 
-#include "tensorflow/compiler/jit/xla_tensor_info.h"
+#include "tensorflow/compiler/jit/xla_tensor.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/core/common_runtime/device_factory.h"
@@ -50,8 +50,7 @@ class XlaDevice : public LocalDevice {
   class Metadata {
    public:
     Metadata(int device_ordinal, perftools::gputools::Platform* platform,
-             const DeviceType& device_type,
-             std::unique_ptr<XlaTensorInfoManager>* tensor_info_manager);
+             const DeviceType& device_type);
 
     // The index of the device on this host.
     int device_ordinal() const;
@@ -59,13 +58,11 @@ class XlaDevice : public LocalDevice {
     perftools::gputools::Platform* platform() const;
     xla::LocalClient* client() const;
     const DeviceType& jit_device_type() const;
-    XlaTensorInfoManager& tensor_info_manager() const;
 
    private:
     const int device_ordinal_;
     const DeviceType device_type_;
     perftools::gputools::Platform* platform_;  // Not owned.
-    std::unique_ptr<XlaTensorInfoManager>& tensor_info_manager_;
 
     TF_DISALLOW_COPY_AND_ASSIGN(Metadata);
   };
@@ -123,13 +120,6 @@ class XlaDevice : public LocalDevice {
   // copying back and forth between CPU and the device, and
   // computations enqueued by XLA.
   xla::Backend::StreamPtr stream_;
-  // Manages sideband data about tensors, in particular the on-device shape tree
-  // if the tensor requires multiple device buffers to represent (for example,
-  // tuple shapes).
-  // This is a unique_ptr because XlaTensorInfoManager is non-copy-constructible
-  // and we need to initialize this lazily (as we also lazily initialize the
-  // underlying allocator).
-  std::unique_ptr<XlaTensorInfoManager> tensor_info_manager_;
   // Must we use XLA's transfer manager for correct host<->device transfers? if
   // false, we can use ThenMemcpy() instead.
   bool transfer_as_literal_;
diff --git a/tensorflow/compiler/jit/xla_device_context.cc b/tensorflow/compiler/jit/xla_device_context.cc
index 88f7c15f0b74a8c99935647f75352e7dec4689fc..43eb164012610723214cf39360698010c9dbdbd4 100644
--- a/tensorflow/compiler/jit/xla_device_context.cc
+++ b/tensorflow/compiler/jit/xla_device_context.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/literal_util.h"
 #include "tensorflow/compiler/tf2xla/shape_util.h"
 #include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/common_runtime/dma_helper.h"
 #include "tensorflow/core/platform/mem.h"
 
@@ -27,39 +28,59 @@ namespace se = ::perftools::gputools;
 namespace tensorflow {
 
 // The allocator used for Tensors assigned to the XLA device.
-XlaDeviceAllocator::XlaDeviceAllocator(const xla::Backend* backend,
-                                       int device_ordinal)
-    : backend_(backend), device_ordinal_(device_ordinal) {}
-
+XlaDeviceAllocator::XlaDeviceAllocator() {}
 XlaDeviceAllocator::~XlaDeviceAllocator() = default;
 
 string XlaDeviceAllocator::Name() { return "xla"; }
 
 void* XlaDeviceAllocator::AllocateRaw(size_t alignment, size_t num_bytes) {
-  se::DeviceMemoryBase dmem =
-      backend_->memory_allocator()
-          ->Allocate(device_ordinal_, num_bytes, /*retry_on_failure=*/false)
-          .ValueOrDie();
-  VLOG(2) << "Allocated XLA device tensor " << dmem.opaque() << "(" << num_bytes
-          << ")";
-  return dmem.opaque();
+  // We always return an empty XlaTensor object, encoded as an opaque tagged
+  // pointer. We can return an empty object and ignore num_bytes here because we
+  // have control over all of the uses of this device tensor, and can lazily
+  // allocate memory when used. This allows us to also know the shape of the
+  // allocated Tensor, which is useful if the device's tensor representation
+  // differs from the host.
+  return XlaTensor::ToOpaquePointer(new XlaTensor());
 }
 
 void XlaDeviceAllocator::DeallocateRaw(void* ptr) {
-  se::DeviceMemoryBase dmem(ptr);
-  TF_CHECK_OK(backend_->memory_allocator()->Deallocate(device_ordinal_, &dmem));
-  VLOG(2) << "Deallocated XLA device tensor " << ptr;
+  delete XlaTensor::FromOpaquePointer(ptr);
 }
 
 void XlaDeviceAllocator::GetStats(AllocatorStats* stats) { stats->Clear(); }
 
-XlaTransferManager::XlaTransferManager(
-    se::Stream* stream, XlaTensorInfoManager* tensor_info_manager,
-    bool transfer_as_literal)
+XlaTransferManager::XlaTransferManager(se::Stream* stream,
+                                       xla::LocalClient* client,
+                                       bool transfer_as_literal)
     : stream_(stream),
-      tensor_info_manager_(tensor_info_manager),
+      client_(client),
+      transfer_manager_(client->backend().transfer_manager()),
       transfer_as_literal_(transfer_as_literal) {}
 
+Status XlaTransferManager::TransferLiteralToDevice(
+    const Tensor& host_tensor, Tensor* device_tensor) const {
+  xla::Literal literal;
+  TF_RETURN_IF_ERROR(HostTensorToLiteral(host_tensor, &literal));
+  VLOG(1) << "Transfer to device as literal: " << literal.ToString();
+
+  const xla::ShapedBuffer& shaped_buffer =
+      XlaTensor::FromTensor(device_tensor)->shaped_buffer();
+  return transfer_manager_->TransferLiteralToDevice(stream_->parent(), literal,
+                                                    shaped_buffer);
+}
+
+Status XlaTransferManager::TransferLiteralFromDevice(
+    Tensor* host_tensor, const Tensor& device_tensor) const {
+  const xla::ShapedBuffer& shaped_buffer =
+      XlaTensor::FromTensor(&device_tensor)->shaped_buffer();
+
+  TF_ASSIGN_OR_RETURN(std::unique_ptr<xla::Literal> literal,
+                      transfer_manager_->TransferLiteralFromDevice(
+                          stream_->parent(), shaped_buffer));
+  VLOG(1) << "Transfer from device as literal: " << literal->ToString();
+  return LiteralToHostTensor(*literal, host_tensor->dtype(), host_tensor);
+}
+
 void XlaTransferManager::CopyCPUTensorToDevice(const Tensor* cpu_tensor,
                                                Device* device,
                                                Tensor* device_tensor,
@@ -74,14 +95,24 @@ void XlaTransferManager::CopyCPUTensorToDevice(const Tensor* cpu_tensor,
 
     void* src_ptr = const_cast<void*>(DMAHelper::base(cpu_tensor));
     const int64 total_bytes = cpu_tensor->TotalBytes();
-    void* dst_ptr = DMAHelper::base(device_tensor);
-    se::DeviceMemoryBase dev_dst_ptr(dst_ptr, total_bytes);
 
+    XlaTensor* xla_tensor = XlaTensor::FromTensor(device_tensor);
+    CHECK(xla_tensor);
+    if (!xla_tensor->has_shaped_buffer()) {
+      Status s = xla_tensor->AllocateShapedBuffer(
+          device_tensor->dtype(), device_tensor->shape(), client_,
+          stream_->parent()->device_ordinal());
+      if (!s.ok()) {
+        done(s);
+        return;
+      }
+    }
+
+    se::DeviceMemoryBase dev_dst_ptr =
+        XlaTensor::DeviceMemoryFromTensor(*device_tensor);
     Status status;
     if (transfer_as_literal_) {
-      status = xla::Unimplemented(
-          "XlaTransferManager::CopyCPUTensorToDevice not implemented for "
-          "literals");
+      status = TransferLiteralToDevice(*cpu_tensor, device_tensor);
     } else {
       stream_->ThenMemcpy(&dev_dst_ptr, src_ptr, total_bytes);
       // TODO(hpucha): Make this asynchronous.
@@ -92,10 +123,7 @@ void XlaTransferManager::CopyCPUTensorToDevice(const Tensor* cpu_tensor,
             block_status.error_message().c_str());
       }
     }
-
-    XlaTensorInfo* tensor_info =
-        tensor_info_manager_->GetOrCreateTensorInfo(*device_tensor);
-    tensor_info->set_host_tensor(*cpu_tensor);
+    xla_tensor->set_host_tensor(*cpu_tensor);
 
     done(status);
     return;
@@ -119,15 +147,13 @@ void XlaTransferManager::CopyDeviceTensorToCPU(const Tensor* device_tensor,
             << device_tensor->NumElements();
 
     const int64 total_bytes = cpu_tensor->TotalBytes();
-    void* src_ptr = const_cast<void*>(DMAHelper::base(device_tensor));
-    se::DeviceMemoryBase dev_src_ptr(src_ptr, total_bytes);
+    se::DeviceMemoryBase dev_src_ptr =
+        XlaTensor::DeviceMemoryFromTensor(*device_tensor);
     void* dst_ptr = DMAHelper::base(cpu_tensor);
 
     Status status;
     if (transfer_as_literal_) {
-      status = xla::Unimplemented(
-          "XlaTransferManager::CopyDeviceTensorToCPU not implemented for "
-          "literals");
+      status = TransferLiteralFromDevice(cpu_tensor, *device_tensor);
     } else {
       stream_->ThenMemcpy(dst_ptr, dev_src_ptr, total_bytes);
       // TODO(hpucha): Make this asynchronous.
@@ -147,10 +173,9 @@ void XlaTransferManager::CopyDeviceTensorToCPU(const Tensor* device_tensor,
   done(Status::OK());
 }
 
-XlaDeviceContext::XlaDeviceContext(se::Stream* stream,
-                                   XlaTensorInfoManager* tensor_info_manager,
+XlaDeviceContext::XlaDeviceContext(se::Stream* stream, xla::LocalClient* client,
                                    bool transfer_as_literal)
-    : manager_(stream, tensor_info_manager, transfer_as_literal) {}
+    : manager_(stream, client, transfer_as_literal) {}
 
 void XlaDeviceContext::CopyCPUTensorToDevice(const Tensor* cpu_tensor,
                                              Device* device,
diff --git a/tensorflow/compiler/jit/xla_device_context.h b/tensorflow/compiler/jit/xla_device_context.h
index df02f4eac482f385f8864476d11c5430971f00c8..ad914a1c23b5f2ea7063722f85e027a99fdb68f9 100644
--- a/tensorflow/compiler/jit/xla_device_context.h
+++ b/tensorflow/compiler/jit/xla_device_context.h
@@ -18,7 +18,7 @@ limitations under the License.
 
 #include <memory>
 
-#include "tensorflow/compiler/jit/xla_tensor_info.h"
+#include "tensorflow/compiler/jit/xla_tensor.h"
 #include "tensorflow/compiler/xla/client/global_data.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/core/framework/allocator.h"
@@ -27,11 +27,12 @@ limitations under the License.
 
 namespace tensorflow {
 
-// The allocator used for Tensors assigned to the XLA device. It uses
-// XLA backend's allocator.
+// The allocator used for Tensors assigned to the XLA device. The allocator
+// ignores the alignment and size of the request and always returns a new,
+// empty, XlaTensor.
 class XlaDeviceAllocator : public Allocator {
  public:
-  XlaDeviceAllocator(const xla::Backend* backend, int device_ordinal);
+  XlaDeviceAllocator();
   ~XlaDeviceAllocator() override;
 
   string Name() override;
@@ -39,19 +40,13 @@ class XlaDeviceAllocator : public Allocator {
   void* AllocateRaw(size_t alignment, size_t num_bytes) override;
   void DeallocateRaw(void* ptr) override;
   void GetStats(AllocatorStats* stats) override;
-
- private:
-  // Which backend in the client this allocator belongs to.
-  const xla::Backend* backend_;
-  // Which hardware device in the client's backend this allocator belongs to.
-  const int device_ordinal_;
 };
 
 // Helper class for managing data transfers between host and XLA devices.
 class XlaTransferManager {
  public:
   explicit XlaTransferManager(perftools::gputools::Stream* stream,
-                              XlaTensorInfoManager* tensor_info_manager,
+                              xla::LocalClient* client,
                               bool transfer_as_literal);
 
   void CopyCPUTensorToDevice(const Tensor* cpu_tensor, Device* device,
@@ -62,11 +57,18 @@ class XlaTransferManager {
   perftools::gputools::Stream* stream() const { return stream_; }
 
  private:
+  Status TransferLiteralToDevice(const Tensor& host_tensor,
+                                 Tensor* device_tensor) const;
+  Status TransferLiteralFromDevice(Tensor* host_tensor,
+                                   const Tensor& device_tensor) const;
+
   // Stream obtained from a Device, used to transfer tensors between
   // CPU and device.
   perftools::gputools::Stream* stream_;
-  // The tensor info manager, for access to sideband information about tensors.
-  XlaTensorInfoManager* tensor_info_manager_;
+  // For the underlying memory allocator and XLA's TransferManager.
+  xla::LocalClient* client_;
+  // Transfer manager, for marshalling data to and from the device.
+  xla::TransferManager* transfer_manager_;
   // True if we must use XLA's TransferManager for correct device transfers.
   bool transfer_as_literal_;
 };
@@ -77,8 +79,7 @@ class XlaTransferManager {
 class XlaDeviceContext : public DeviceContext {
  public:
   explicit XlaDeviceContext(perftools::gputools::Stream* stream,
-                            XlaTensorInfoManager* tensor_info_manager,
-                            bool transfer_as_literal);
+                            xla::LocalClient* client, bool transfer_as_literal);
 
   void CopyCPUTensorToDevice(const Tensor* cpu_tensor, Device* device,
                              Tensor* device_tensor,
diff --git a/tensorflow/compiler/jit/xla_gpu_device.cc b/tensorflow/compiler/jit/xla_gpu_device.cc
index 5a1db817745f56d6bcc26ff6fc441b7c902ee2b5..ac60423d959ca44e7d92e2d965cf731287b1f83f 100644
--- a/tensorflow/compiler/jit/xla_gpu_device.cc
+++ b/tensorflow/compiler/jit/xla_gpu_device.cc
@@ -62,8 +62,9 @@ REGISTER_LOCAL_DEVICE_FACTORY(DEVICE_XLA_GPU, XlaGpuDeviceFactory);
 
 // Kernel registrations
 
-constexpr std::array<DataType, 6> kAllXlaGpuTypes = {
-    {DT_INT32, DT_INT64, DT_FLOAT, DT_DOUBLE, DT_COMPLEX64, DT_BOOL}};
+constexpr std::array<DataType, 8> kAllXlaGpuTypes = {
+    {DT_INT32, DT_INT64, DT_HALF, DT_FLOAT, DT_DOUBLE, DT_COMPLEX64, DT_BOOL,
+     DT_BFLOAT16}};
 
 REGISTER_XLA_LAUNCH_KERNEL(DEVICE_XLA_GPU, XlaLocalLaunchOp, kAllXlaGpuTypes);
 REGISTER_XLA_DEVICE_KERNELS(DEVICE_XLA_GPU, kAllXlaGpuTypes);
diff --git a/tensorflow/compiler/jit/xla_launch_util.cc b/tensorflow/compiler/jit/xla_launch_util.cc
index bb7316c60c61f8755b6cdd575676fab343f26d11..50b0061d692f2a8c5ea475c0b00c4cb42a1a84e6 100644
--- a/tensorflow/compiler/jit/xla_launch_util.cc
+++ b/tensorflow/compiler/jit/xla_launch_util.cc
@@ -16,12 +16,14 @@ limitations under the License.
 #include "tensorflow/compiler/jit/xla_launch_util.h"
 
 #include "tensorflow/compiler/jit/defs.h"
+#include "tensorflow/compiler/tf2xla/shape_util.h"
 #include "tensorflow/compiler/tf2xla/xla_compiler.h"
 #include "tensorflow/compiler/xla/client/client_library.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/core/common_runtime/dma_helper.h"
 #include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/common_runtime/gpu_device_context.h"
 #include "tensorflow/core/framework/allocator.h"
 #include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/framework/op.h"
@@ -52,60 +54,66 @@ std::map<int, OptionalTensor> SnapshotResourceVariables(OpKernelContext* ctx,
   return snapshot;
 }
 
-XlaAllocator::XlaAllocator(const gpu::Platform* platform,
-                           OpKernelContext* op_context)
-    : xla::DeviceMemoryAllocator(platform), op_context_(op_context) {}
+XlaAllocator::XlaAllocator(const gpu::Platform* platform, Allocator* wrapped)
+    : xla::DeviceMemoryAllocator(platform), wrapped_(wrapped) {}
 
-XlaAllocator::~XlaAllocator() { CHECK(allocated_.empty()); }
+XlaAllocator::~XlaAllocator() {}
 
 xla::StatusOr<gpu::DeviceMemoryBase> XlaAllocator::Allocate(
     int device_ordinal, uint64 size, bool retry_on_failure) {
-  void* data = op_context_->device()->GetAllocator({})->AllocateRaw(
-      Allocator::kAllocatorAlignment, size);
-  allocated_.insert(data);
-  return gpu::DeviceMemoryBase(data, size);
+  void* data = wrapped_->AllocateRaw(Allocator::kAllocatorAlignment, size);
+  if (data == nullptr) {
+    return errors::ResourceExhausted("Out of memory while trying to allocate ",
+                                     size, " bytes.");
+  } else {
+    return gpu::DeviceMemoryBase(data, size);
+  }
 }
 
-void XlaAllocator::Release(void* ptr) { allocated_.erase(ptr); }
-
 Status XlaAllocator::Deallocate(int device_ordinal,
                                 gpu::DeviceMemoryBase* mem) {
-  if (allocated_.count(mem->opaque())) {
-    op_context_->device()->GetAllocator({})->DeallocateRaw(mem->opaque());
-    allocated_.erase(mem->opaque());
-  }
+  wrapped_->DeallocateRaw(mem->opaque());
   return Status::OK();
 }
 
 namespace {
-// Return the 'index''th subtree of the given ShapedBuffer as a ShapedBuffer.
-xla::ShapedBuffer ExtractSubShapedBuffer(const xla::ShapedBuffer& shaped_buffer,
-                                         int index) {
+// Return the 'index''th subtree of the given ShapedBuffer as a
+// ScopedShapedBuffer. The returned ScopedShapedBuffer takes ownership of the
+// subtree, and sets the input's buffer pointers to nullptr for the subtree.
+std::unique_ptr<xla::ScopedShapedBuffer> ExtractSubShapedBuffer(
+    xla::ShapedBuffer* shaped_buffer, int index,
+    xla::DeviceMemoryAllocator* allocator) {
   xla::Shape on_host_shape = xla::ShapeUtil::GetTupleElementShape(
-      shaped_buffer.on_host_shape(), index);
+      shaped_buffer->on_host_shape(), index);
   xla::Shape on_device_shape = xla::ShapeUtil::GetTupleElementShape(
-      shaped_buffer.on_device_shape(), index);
+      shaped_buffer->on_device_shape(), index);
 
   xla::ShapedBuffer sub_shaped_buffer(on_host_shape, on_device_shape,
-                                      shaped_buffer.platform(),
-                                      shaped_buffer.device_ordinal());
+                                      shaped_buffer->platform(),
+                                      shaped_buffer->device_ordinal());
 
-  auto& shape_tree = shaped_buffer.buffers();
+  auto& shape_tree = shaped_buffer->buffers();
   auto& sub_shape_tree = sub_shaped_buffer.buffers();
   sub_shape_tree.CopySubtreeFrom(shape_tree,
                                  /*source_base_index=*/{index},
                                  /*target_base_index=*/{});
-  return sub_shaped_buffer;
+  for (auto& index_to_buffer : shape_tree) {
+    if (!index_to_buffer.first.empty() && index_to_buffer.first[0] == index) {
+      index_to_buffer.second = gpu::DeviceMemoryBase(nullptr, 0);
+    }
+  }
+  return xla::ScopedShapedBuffer::MakeScoped(&sub_shaped_buffer, allocator)
+      .ValueOrDie();
 }
 }  // namespace
 
 XlaComputationLaunchContext::XlaComputationLaunchContext(
     int64 num_resource_args, xla::LocalClient* client,
-    XlaAllocator* xla_allocator, XlaTensorInfoManager* tensor_info_manager)
+    xla::DeviceMemoryAllocator* xla_allocator, bool allocate_xla_tensors)
     : num_resource_args_(num_resource_args),
       client_(client),
       xla_allocator_(xla_allocator),
-      tensor_info_manager_(tensor_info_manager) {}
+      allocate_xla_tensors_(allocate_xla_tensors) {}
 
 void XlaComputationLaunchContext::PopulateInputs(
     OpKernelContext* ctx, const XlaCompiler::CompilationResult* kernel,
@@ -130,20 +138,17 @@ void XlaComputationLaunchContext::PopulateInputs(
     const xla::Shape on_device_shape =
         client_->backend().transfer_manager()->HostShapeToDeviceShape(shape);
     if (xla::ShapeUtil::IsTuple(on_device_shape)) {
-      CHECK(tensor_info_manager_);
-      const XlaTensorInfo* tensor_info =
-          tensor_info_manager_->GetTensorInfo(*t);
-      CHECK(tensor_info && tensor_info->has_shaped_buffer());
+      const XlaTensor* xla_tensor = XlaTensor::FromTensor(t);
+      CHECK(xla_tensor && xla_tensor->has_shaped_buffer());
       arg_ptrs_[i] =
-          const_cast<xla::ShapedBuffer*>(&tensor_info->shaped_buffer());
+          const_cast<xla::ShapedBuffer*>(&xla_tensor->shaped_buffer());
     } else {
       CHECK(xla::ShapeUtil::Equal(shape, on_device_shape))
           << "On-device shape "
           << xla::ShapeUtil::HumanStringWithLayout(on_device_shape)
           << " not the same as on-host shape "
           << xla::ShapeUtil::HumanStringWithLayout(shape);
-      gpu::DeviceMemoryBase dmem = gpu::DeviceMemoryBase(
-          const_cast<char*>(t->tensor_data().data()), t->tensor_data().size());
+      gpu::DeviceMemoryBase dmem = XlaTensor::DeviceMemoryFromTensor(*t);
       arg_buffers_[i] = xla::MakeUnique<xla::ShapedBuffer>(
           /*on_host_shape=*/shape, /*on_device_shape=*/shape,
           client_->platform(), client_->default_device_ordinal());
@@ -162,17 +167,15 @@ void XlaComputationLaunchContext::PopulateOutputs(
   // Computation output should always be a tuple.
   if (VLOG_IS_ON(2)) {
     VLOG(2) << "Result tuple shape: " << output->on_host_shape().DebugString();
+    VLOG(2) << "Result tuple shape (on device): "
+            << output->on_device_shape().DebugString();
   }
   CHECK_EQ(ctx->num_outputs(), kernel->outputs.size());
 
   // Copy XLA results to the OpOutputList.
   int output_num = 0;
   for (int i = 0; i < ctx->num_outputs(); ++i) {
-    AllocatorAttributes alloc_attrs = ctx->output_alloc_attr(i);
     Allocator* allocator = ctx->device()->GetAllocator({});
-    if (tensor_info_manager_ && !alloc_attrs.on_host()) {
-      allocator = tensor_info_manager_;
-    }
     if (kernel->outputs[i].is_constant) {
       // Output is a constant.
       const Tensor& const_tensor = kernel->outputs[i].constant_value;
@@ -180,49 +183,63 @@ void XlaComputationLaunchContext::PopulateOutputs(
       const size_t total_bytes = const_tensor.TotalBytes();
       if (stream && total_bytes > 0) {
         // Copy host -> device. (Empty tensors don't have backing buffers.)
+        // Manually allocate memory using an XlaTensorBuffer so we can allocate
+        // as much memory as the device requires (as given by
+        // GetByteSizeRequirement). This avoids XlaTransferManager having to
+        // reallocate the device buffer later.
         VLOG(1) << "Constant output tensor on device";
 
-        TF_CHECK_OK(
-            ctx->allocate_output(i, const_tensor.shape(), &output_tensor));
-
-        const void* src_ptr = DMAHelper::base(&const_tensor);
-        void* dst_ptr = DMAHelper::base(output_tensor);
-        gpu::DeviceMemoryBase gpu_dst_ptr(dst_ptr, total_bytes);
-        // Memcpying asynchronously is safe for the GPU, but the CPU uses a
-        // shared allocator so hold a reference to the copied-to buffer until
-        // complete.
-        TensorReference ref(*output_tensor);
-        stream->ThenMemcpy(&gpu_dst_ptr, src_ptr, total_bytes);
-        stream->ThenDoHostCallback([ref] { ref.Unref(); });
+        OP_REQUIRES_OK(
+            ctx, ctx->allocate_output(i, const_tensor.shape(), &output_tensor));
+        if (XlaTensor* xla_tensor = XlaTensor::FromTensor(output_tensor)) {
+          OP_REQUIRES_OK(ctx, xla_tensor->AllocateShapedBuffer(
+                                  const_tensor.dtype(), const_tensor.shape(),
+                                  client_, stream->parent()->device_ordinal()));
+        }
+
+        Device* device = dynamic_cast<Device*>(ctx->device());
+        OP_REQUIRES(ctx, device != nullptr,
+                    errors::Internal("DeviceBase was not a Device."));
+        ctx->op_device_context()->CopyCPUTensorToDevice(
+            &const_tensor, device, output_tensor,
+            [&](Status status) { TF_CHECK_OK(status); });
+
+        if (device->device_type() == DEVICE_GPU) {
+          // The GPUDeviceContext enqueues the host->device transfer in a
+          // separate stream from the main compute stream. We must ensure the
+          // compute stream is synchronized with the host->device transfer
+          // stream now otherwise we will create a race condition.
+          auto* gpu_device_context =
+              static_cast<GPUDeviceContext*>(ctx->op_device_context());
+          gpu_device_context->stream()->ThenWaitFor(
+              gpu_device_context->host_to_device_stream());
+        }
       } else {
         // No copy required.
         ctx->set_output(i, const_tensor);
         output_tensor = ctx->mutable_output(i);
       }
-      if (tensor_info_manager_) {
-        XlaTensorInfo* tensor_info =
-            tensor_info_manager_->GetOrCreateTensorInfo(*output_tensor);
-        tensor_info->set_host_tensor(const_tensor);
+      if (XlaTensor* xla_tensor = XlaTensor::FromTensor(output_tensor)) {
+        xla_tensor->set_host_tensor(const_tensor);
       }
     } else {
       const TensorShape& shape = kernel->outputs[i].shape;
       VLOG(2) << "Retval " << i << " shape " << shape.DebugString();
 
       gpu::DeviceMemoryBase buffer = output->buffer({output_num});
-      Tensor output_tensor = XlaTensorBuffer::MakeTensor(
-          ctx->expected_output_dtype(i), shape, buffer, allocator);
-      xla_allocator_->Release(buffer.opaque());
-
-      xla::Shape output_shape = xla::ShapeUtil::GetTupleElementShape(
-          output->on_device_shape(), output_num);
-      if (xla::ShapeUtil::IsTuple(output_shape)) {
-        CHECK(tensor_info_manager_);
-        XlaTensorInfo* tensor_info =
-            tensor_info_manager_->GetOrCreateTensorInfo(output_tensor);
-        tensor_info->set_shaped_buffer(
-            ExtractSubShapedBuffer(*output, output_num));
+      if (allocate_xla_tensors_) {
+        Tensor* output_tensor;
+        OP_REQUIRES_OK(ctx, ctx->allocate_output(i, shape, &output_tensor));
+        XlaTensor* xla_tensor = XlaTensor::FromTensor(output_tensor);
+        CHECK(xla_tensor);
+        xla_tensor->set_shaped_buffer(
+            ExtractSubShapedBuffer(output.get(), output_num, xla_allocator_));
+      } else {
+        Tensor output_tensor = XlaTensorBuffer::MakeTensor(
+            ctx->expected_output_dtype(i), shape, buffer, allocator);
+        output->set_buffer(gpu::DeviceMemoryBase(nullptr, 0), {output_num});
+        ctx->set_output(i, output_tensor);
       }
-      ctx->set_output(i, output_tensor);
       ++output_num;
     }
 
@@ -235,9 +252,6 @@ void XlaComputationLaunchContext::PopulateOutputs(
   VLOG(2) << "Applying variable updates";
   for (int i = 0; i < kernel->resource_updates.size(); ++i) {
     Allocator* allocator = ctx->device()->GetAllocator({});
-    if (tensor_info_manager_) {
-      allocator = tensor_info_manager_;
-    }
     const XlaCompiler::ResourceUpdate& write = kernel->resource_updates[i];
     OP_REQUIRES(ctx,
                 write.input_index >= 0 && write.input_index < ctx->num_inputs(),
@@ -260,18 +274,21 @@ void XlaComputationLaunchContext::PopulateOutputs(
     mutex_lock ml(*variable->mu());
     OP_REQUIRES(ctx, variable->tensor()->dtype() == write.type,
                 errors::Internal("Mismatched type in variable write"));
-    *variable->tensor() =
-        XlaTensorBuffer::MakeTensor(write.type, write.shape, buffer, allocator);
-    xla_allocator_->Release(buffer.opaque());
-
-    xla::Shape output_shape = xla::ShapeUtil::GetTupleElementShape(
-        output->on_device_shape(), output_num);
-    if (xla::ShapeUtil::IsTuple(output_shape)) {
-      CHECK(tensor_info_manager_);
-      XlaTensorInfo* tensor_info =
-          tensor_info_manager_->GetOrCreateTensorInfo(*variable->tensor());
-      tensor_info->set_shaped_buffer(
-          ExtractSubShapedBuffer(*output, output_num));
+
+    if (allocate_xla_tensors_) {
+      Tensor output_tensor;
+      OP_REQUIRES_OK(
+          ctx, ctx->allocate_temp(write.type, write.shape, &output_tensor));
+      XlaTensor* xla_tensor = XlaTensor::FromTensor(&output_tensor);
+      CHECK(xla_tensor);
+      xla_tensor->set_shaped_buffer(
+          ExtractSubShapedBuffer(output.get(), output_num, xla_allocator_));
+      *variable->tensor() = output_tensor;
+    } else {
+      Tensor output_tensor = XlaTensorBuffer::MakeTensor(
+          write.type, write.shape, buffer, allocator);
+      output->set_buffer(gpu::DeviceMemoryBase(nullptr, 0), {output_num});
+      *variable->tensor() = output_tensor;
     }
     ++output_num;
   }
diff --git a/tensorflow/compiler/jit/xla_launch_util.h b/tensorflow/compiler/jit/xla_launch_util.h
index 8694f6ce58b72ca188bf831528db30daf93b905d..14f70fe35891040ff3460567adb223be0f1c910f 100644
--- a/tensorflow/compiler/jit/xla_launch_util.h
+++ b/tensorflow/compiler/jit/xla_launch_util.h
@@ -19,7 +19,7 @@ limitations under the License.
 #define TENSORFLOW_COMPILER_JIT_XLA_LAUNCH_UTIL_H_
 
 #include "tensorflow/compiler/jit/xla_compilation_cache.h"
-#include "tensorflow/compiler/jit/xla_tensor_info.h"
+#include "tensorflow/compiler/jit/xla_tensor.h"
 #include "tensorflow/compiler/tf2xla/xla_compiler.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/core/framework/allocation_description.pb.h"
@@ -47,16 +47,13 @@ std::map<int, OptionalTensor> SnapshotResourceVariables(OpKernelContext* ctx,
 class XlaAllocator : public xla::DeviceMemoryAllocator {
  public:
   XlaAllocator(const perftools::gputools::Platform* platform,
-               OpKernelContext* op_context);
+               Allocator* wrapped);
   ~XlaAllocator() override;
   xla::StatusOr<perftools::gputools::DeviceMemoryBase> Allocate(
       int device_ordinal, uint64 size, bool retry_on_failure) override;
   Status Deallocate(int device_ordinal,
                     perftools::gputools::DeviceMemoryBase* mem) override;
 
-  // Un-track 'ptr' - do not delete it on destruction.
-  void Release(void* ptr);
-
   // The Tensorflow BFC allocator used on GPU allows host-side deallocation
   // before GPU execution takes place. Tensorflow uses the ordering of the main
   // compute stream to enforce a happens-before relationship between a memory
@@ -67,17 +64,19 @@ class XlaAllocator : public xla::DeviceMemoryAllocator {
   bool AllowsAsynchronousDeallocation() const override { return true; }
 
  private:
-  OpKernelContext* const op_context_;
-  std::unordered_set<void*> allocated_;
+  Allocator* wrapped_;
 };
 
 // Helper class to perform the marshalling of TensorFlow inputs and outputs to
 // ShapedBuffers suitable for passing to an XLA computation.
 class XlaComputationLaunchContext {
  public:
+  // Create a new launch context. 'allocate_xla_tensors' is true if allocated
+  // output tensors and variables are always XlaTensors. If false they are
+  // assumed to be "normal" device pointers.
   XlaComputationLaunchContext(int64 num_resource_args, xla::LocalClient* client,
-                              XlaAllocator* xla_allocator,
-                              XlaTensorInfoManager* tensor_info_manager);
+                              xla::DeviceMemoryAllocator* xla_allocator,
+                              bool allocate_xla_tensors);
 
   // Add all inputs within `ctx` as XLA arguments (returned by arguments()).
   // `variables` is a map from TensorFlow argument number to resource variable.
@@ -97,8 +96,8 @@ class XlaComputationLaunchContext {
  private:
   int64 num_resource_args_;
   xla::LocalClient* client_;
-  XlaAllocator* xla_allocator_;
-  XlaTensorInfoManager* tensor_info_manager_;
+  xla::DeviceMemoryAllocator* xla_allocator_;
+  bool allocate_xla_tensors_;
   std::vector<std::unique_ptr<xla::ShapedBuffer>> arg_buffers_;
   std::vector<xla::ShapedBuffer*> arg_ptrs_;
 };
diff --git a/tensorflow/compiler/jit/xla_tensor.cc b/tensorflow/compiler/jit/xla_tensor.cc
new file mode 100644
index 0000000000000000000000000000000000000000..956328e6757f4c903e3995a54635682d19052794
--- /dev/null
+++ b/tensorflow/compiler/jit/xla_tensor.cc
@@ -0,0 +1,98 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/jit/xla_tensor.h"
+#include "tensorflow/compiler/tf2xla/shape_util.h"
+
+namespace tensorflow {
+
+/*static*/ XlaTensor* XlaTensor::FromTensor(Tensor* tensor) {
+  if (tensor->NumElements() == 0) {
+    return nullptr;
+  }
+  XlaTensor* xla_tensor =
+      FromOpaquePointer(const_cast<char*>(tensor->tensor_data().data()));
+  return xla_tensor;
+}
+
+/*static*/ const XlaTensor* XlaTensor::FromTensor(const Tensor* tensor) {
+  return FromTensor(const_cast<Tensor*>(tensor));
+}
+
+/*static*/ perftools::gputools::DeviceMemoryBase
+XlaTensor::DeviceMemoryFromTensor(const Tensor& tensor) {
+  const XlaTensor* xla_tensor = FromTensor(&tensor);
+  if (xla_tensor) {
+    CHECK(xla_tensor->has_shaped_buffer());
+    return xla_tensor->shaped_buffer().root_buffer();
+  } else {
+    return perftools::gputools::DeviceMemoryBase(
+        const_cast<char*>(tensor.tensor_data().data()),
+        tensor.tensor_data().size());
+  }
+}
+
+Status XlaTensor::AllocateShapedBuffer(DataType dtype, const TensorShape& shape,
+                                       xla::LocalClient* client,
+                                       int device_ordinal) {
+  xla::Shape on_host_shape;
+  TF_RETURN_IF_ERROR(TensorShapeToXLAShape(dtype, shape, &on_host_shape));
+  xla::Shape on_device_shape =
+      client->backend().transfer_manager()->HostShapeToDeviceShape(
+          on_host_shape);
+
+  xla::ShapedBuffer buffer(on_host_shape, on_device_shape, client->platform(),
+                           device_ordinal);
+  for (auto& index_to_buffer : buffer.buffers()) {
+    xla::Shape subshape =
+        xla::ShapeUtil::GetSubshape(on_device_shape, index_to_buffer.first);
+    uint64 size =
+        client->backend().transfer_manager()->GetByteSizeRequirement(subshape);
+    TF_ASSIGN_OR_RETURN(index_to_buffer.second,
+                        client->backend().memory_allocator()->Allocate(
+                            device_ordinal, size, /*retry_on_failure=*/false));
+  }
+
+  TF_ASSIGN_OR_RETURN(auto scoped_buffer,
+                      xla::ScopedShapedBuffer::MakeScoped(
+                          &buffer, client->backend().memory_allocator()));
+  set_shaped_buffer(std::move(scoped_buffer));
+  return Status::OK();
+}
+
+// The pointer tag, OR-ed into the XlaTensor's address to distinguish it from
+// device-side tensors, which are either CPU or GPU memory pointers. This works
+// because we're guaranteed that CPU and GPU pointers are aligned to > 1 bits.
+namespace {
+constexpr uintptr_t kTag = 0x1ULL;
+}
+
+/*static*/ XlaTensor* XlaTensor::FromOpaquePointer(void* ptr) {
+  uintptr_t value = reinterpret_cast<uintptr_t>(ptr);
+  if (value & kTag) {
+    return reinterpret_cast<XlaTensor*>(value & ~kTag);
+  } else {
+    return nullptr;
+  }
+}
+
+/*static*/ void* XlaTensor::ToOpaquePointer(XlaTensor* tensor) {
+  uintptr_t value = reinterpret_cast<uintptr_t>(tensor);
+  CHECK_EQ(value & kTag, 0);
+  value |= kTag;
+  return reinterpret_cast<XlaTensor*>(value);
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/xla_tensor_info.h b/tensorflow/compiler/jit/xla_tensor.h
similarity index 51%
rename from tensorflow/compiler/jit/xla_tensor_info.h
rename to tensorflow/compiler/jit/xla_tensor.h
index fbd6ad770fbf9b80829ca80f1a85704e3288a680..5ff2fb08f03548260215c6aeded2c124f8d28f43 100644
--- a/tensorflow/compiler/jit/xla_tensor_info.h
+++ b/tensorflow/compiler/jit/xla_tensor.h
@@ -13,9 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_COMPILER_JIT_XLA_TENSOR_INFO_H_
-#define TENSORFLOW_COMPILER_JIT_XLA_TENSOR_INFO_H_
+#ifndef TENSORFLOW_COMPILER_JIT_XLA_TENSOR_H_
+#define TENSORFLOW_COMPILER_JIT_XLA_TENSOR_H_
 
+#include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/service/shaped_buffer.h"
 #include "tensorflow/core/framework/allocator.h"
 #include "tensorflow/core/framework/device_base.h"
@@ -24,11 +25,32 @@ limitations under the License.
 
 namespace tensorflow {
 
-// Information about a tensor. The XlaTensorInfoManager can maintain one of
-// these per device Tensor.
-class XlaTensorInfo {
+// The implementation of a Tensor for an XlaDevice. All device tensors are
+// actually one of these.
+//
+// To distinguish between "normal" device tensors and XlaTensors, the raw
+// pointer data stored in the TensorBuffer is a tagged pointer.
+class XlaTensor {
  public:
-  XlaTensorInfo() {}
+  // Downcast from a Tensor to an XlaTensor. Return nullptr if the downcast
+  // fails.
+  static XlaTensor* FromTensor(Tensor* tensor);
+  // Downcast from a Tensor to an XlaTensor. Return nullptr if the downcast
+  // fails.
+  static const XlaTensor* FromTensor(const Tensor* tensor);
+
+  // Create a DeviceMemoryBase from a Tensor. The Tensor can be an XlaTensor, in
+  // which case the returned value is shaped_buffer()->root_buffer(), or a
+  // normal Tensor in which case the returned value is
+  // {tensor.tensor_data().data(), tensor.tensor_data().size}.
+  static perftools::gputools::DeviceMemoryBase DeviceMemoryFromTensor(
+      const Tensor& tensor);
+
+  // Assign the internal ShapedBuffer to new memory for the given dtype and
+  // shape. If a ShapedBuffer exists already (has_shaped_buffer() == true), it
+  // is replaced and the managed memory deallocated.
+  Status AllocateShapedBuffer(DataType dtype, const TensorShape& shape,
+                              xla::LocalClient* client, int device_ordinal);
 
   // Some Tensors can have complex on-device shapes, including tuple shapes. To
   // manage the memory for these tensors a ShapedBuffer may be required.
@@ -37,10 +59,14 @@ class XlaTensorInfo {
   bool has_shaped_buffer() const { return shaped_buffer_ != nullptr; }
   // Return the contained ShapedBuffer.
   // REQUIRES: has_shaped_buffer()
-  const xla::ShapedBuffer& shaped_buffer() const { return *shaped_buffer_; }
+  const xla::ShapedBuffer& shaped_buffer() const {
+    CHECK(has_shaped_buffer());
+    return *shaped_buffer_;
+  }
   // Mutates the TensorInfo to set the ShapedBuffer.
-  void set_shaped_buffer(xla::ShapedBuffer shaped_buffer) {
-    shaped_buffer_.reset(new xla::ShapedBuffer(std::move(shaped_buffer)));
+  void set_shaped_buffer(
+      std::unique_ptr<xla::ScopedShapedBuffer> shaped_buffer) {
+    shaped_buffer_ = std::move(shaped_buffer);
   }
 
   // Some tensors on the device may have known values on the host. We use these
@@ -57,45 +83,18 @@ class XlaTensorInfo {
     host_tensor_.reset(new Tensor(tensor));
   }
 
+  // Convert from a raw pointer to an XlaTensor, removing the pointer tag.
+  static XlaTensor* FromOpaquePointer(void* ptr);
+  // Convert to a raw pointer from an XlaTensor, adding the pointer tag.
+  static void* ToOpaquePointer(XlaTensor* tensor);
+
  private:
   // The optional contained ShapedBuffer.
-  std::unique_ptr<xla::ShapedBuffer> shaped_buffer_;
+  std::unique_ptr<xla::ScopedShapedBuffer> shaped_buffer_;
   // An optional host tensor value.
   std::unique_ptr<Tensor> host_tensor_;
 };
 
-// Manages XlaTensorInfo objects. This class is also an Allocator, so that
-// XlaTensorInfo objects can be deleted when their Tensor is deallocated.
-class XlaTensorInfoManager : public AllocatorWrapper {
- public:
-  // Creates a new XlaTensorInfoManager, delegating all DeallocateRaw calls to
-  // allocator.
-  XlaTensorInfoManager(Allocator* allocator) : AllocatorWrapper(allocator) {}
-
-  // Returns the XlaTensorInfo for the given device memory pointer or nullptr if
-  // none exists.
-  const XlaTensorInfo* GetTensorInfo(const void* device_ptr) const;
-  // Returns the XlaTensorInfo for the device memory pointer extracted from
-  // tensor or nullptr if none exists.
-  const XlaTensorInfo* GetTensorInfo(const Tensor& tensor);
-
-  // Returns the XlaTensorInfo for the given device memory pointer, creating one
-  // if necessary.
-  XlaTensorInfo* GetOrCreateTensorInfo(const Tensor& tensor);
-  // Returns the XlaTensorInfo for the device memory pointer extracted from
-  // tensor, creating one if necessary.
-  XlaTensorInfo* GetOrCreateTensorInfo(const void* device_ptr);
-
-  // Allocator interface
-  void DeallocateRaw(void* ptr) override;
-
- private:
-  mutable mutex lock_;
-  // The managed tensor infos. The mapped value is a unique_ptr so that returned
-  // references are stable over rehashes.
-  std::unordered_map<const void*, std::unique_ptr<XlaTensorInfo>> tensor_infos_
-      GUARDED_BY(lock_);
-};
 }  // namespace tensorflow
 
 #endif
diff --git a/tensorflow/compiler/jit/xla_tensor_info.cc b/tensorflow/compiler/jit/xla_tensor_info.cc
deleted file mode 100644
index 0ce18c27cbe1d46eb61f8000506396fedc509e9c..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/jit/xla_tensor_info.cc
+++ /dev/null
@@ -1,56 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/compiler/jit/xla_tensor_info.h"
-
-namespace tensorflow {
-
-const XlaTensorInfo* XlaTensorInfoManager::GetTensorInfo(
-    const void* device_ptr) const {
-  mutex_lock lock(lock_);
-  auto iterator = tensor_infos_.find(device_ptr);
-  return (iterator == tensor_infos_.end()) ? nullptr
-                                           : tensor_infos_.at(device_ptr).get();
-}
-
-XlaTensorInfo* XlaTensorInfoManager::GetOrCreateTensorInfo(
-    const void* device_ptr) {
-  mutex_lock lock(lock_);
-  auto iterator = tensor_infos_.find(device_ptr);
-  if (iterator != tensor_infos_.end()) {
-    return iterator->second.get();
-  }
-  auto iterator_and_inserted =
-      tensor_infos_.emplace(device_ptr, MakeUnique<XlaTensorInfo>());
-  CHECK(iterator_and_inserted.second);
-  return iterator_and_inserted.first->second.get();
-}
-
-const XlaTensorInfo* XlaTensorInfoManager::GetTensorInfo(const Tensor& tensor) {
-  return GetTensorInfo(tensor.tensor_data().data());
-}
-
-XlaTensorInfo* XlaTensorInfoManager::GetOrCreateTensorInfo(
-    const Tensor& tensor) {
-  return GetOrCreateTensorInfo(tensor.tensor_data().data());
-}
-
-void XlaTensorInfoManager::DeallocateRaw(void* ptr) {
-  wrapped()->DeallocateRaw(ptr);
-  mutex_lock lock(lock_);
-  tensor_infos_.erase(ptr);
-}
-
-}  // namespace tensorflow
diff --git a/tensorflow/compiler/plugin/BUILD b/tensorflow/compiler/plugin/BUILD
index da4bc44c7a75c9f8faf16c537a17a1f2d16d5d61..238fd15166c0b08ee109d6a3888e16c39f87a603 100644
--- a/tensorflow/compiler/plugin/BUILD
+++ b/tensorflow/compiler/plugin/BUILD
@@ -49,17 +49,3 @@ cc_library(
         "//tensorflow/compiler/jit:xla_device",
     ],
 )
-
-#-----------------------------------------------------------------------------
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/compiler/tests/BUILD b/tensorflow/compiler/tests/BUILD
index 1c5a8f8e695cb2922f118f231082ebb53cb2bc9b..47c6ab58c09ed31643b95851ac9ecd9ac0364933 100644
--- a/tensorflow/compiler/tests/BUILD
+++ b/tensorflow/compiler/tests/BUILD
@@ -124,6 +124,7 @@ tf_xla_py_test(
     name = "categorical_op_test",
     size = "small",
     srcs = ["categorical_op_test.py"],
+    tags = ["optonly"],
     deps = [
         ":xla_test",
         "//tensorflow/python:framework_for_generated_wrappers",
@@ -191,6 +192,31 @@ tf_xla_py_test(
     ],
 )
 
+tf_xla_py_test(
+    name = "oom_test",
+    size = "medium",
+    srcs = ["oom_test.py"],
+    disabled_backends = [
+        "cpu",
+        "cpu_ondemand",
+    ],
+    tags = [
+        # Allocates very large amounts of memory and does not work under TSAN.
+        "notsan",
+        "optonly",  # Times out frequently in fastbuild.
+    ],
+    deps = [
+        ":xla_test",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:array_ops_gen",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:gradient_checker",
+        "//tensorflow/python:gradients",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
 tf_xla_py_test(
     name = "conv2d_test",
     size = "medium",
@@ -835,17 +861,3 @@ tf_xla_py_test(
         "//tensorflow/python:platform_test",
     ],
 )
-
-# -----------------------------------------------------------------------------
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/compiler/tests/binary_ops_test.py b/tensorflow/compiler/tests/binary_ops_test.py
index ba7b9bacd2b794c74409d517a9c05bfbb14a845f..1e4dd32916c3a40282735fb8f75670b0e9ef0dc9 100644
--- a/tensorflow/compiler/tests/binary_ops_test.py
+++ b/tensorflow/compiler/tests/binary_ops_test.py
@@ -190,19 +190,24 @@ class BinaryOpsTest(XLATestCase):
           ],
           equality_test=self.ListsAreClose)
 
-      self._testBinary(
-          gen_nn_ops.sparse_softmax_cross_entropy_with_logits,
-          np.array([[0.1, 0.2, 0.3, 0.4], [0.5, 0.6, 0.7, 0.8],
-                    [0.9, 1.0, 1.1, 1.2]], dtype=dtype),
-          np.array([2, 1, 7], dtype=np.int32),
-          expected=[
-              np.array([1.342536, 1.442536, np.nan], dtype=dtype),
-              np.array([[0.213838, 0.236328, -0.738817, 0.288651],
-                        [0.213838, -0.763672, 0.261183, 0.288651],
-                        [np.nan, np.nan, np.nan, np.nan]],
-                       dtype=dtype),
-          ],
-          equality_test=self.ListsAreClose)
+      # TODO(b/68813416): Fails with bfloat16.
+      if dtype != dtypes.bfloat16.as_numpy_dtype:
+        self._testBinary(
+            gen_nn_ops.sparse_softmax_cross_entropy_with_logits,
+            np.array(
+                [[0.1, 0.2, 0.3, 0.4], [0.5, 0.6, 0.7, 0.8],
+                 [0.9, 1.0, 1.1, 1.2]],
+                dtype=dtype),
+            np.array([2, 1, 7], dtype=np.int32),
+            expected=[
+                np.array([1.342536, 1.442536, np.nan], dtype=dtype),
+                np.array(
+                    [[0.213838, 0.236328, -0.738817, 0.288651], [
+                        0.213838, -0.763672, 0.261183, 0.288651
+                    ], [np.nan, np.nan, np.nan, np.nan]],
+                    dtype=dtype),
+            ],
+            equality_test=self.ListsAreClose)
 
   def testIntOps(self):
     for dtype in self.int_types:
@@ -260,12 +265,6 @@ class BinaryOpsTest(XLATestCase):
           np.array([[1], [2]], dtype=dtype),
           dtype(7),
           expected=np.array([[8], [9]], dtype=dtype))
-      self._testBinary(
-          math_ops.add,
-          np.array([0xffffffff, 0xfffffffff, 1, 1], dtype=np.int64),
-          np.array([1, 1, 0xffffffff, 0xfffffffff], dtype=np.int64),
-          expected=np.array(
-              [1 << 32, 1 << 36, 1 << 32, 1 << 36], dtype=np.int64))
 
       self._testBinary(
           math_ops.subtract,
@@ -361,6 +360,14 @@ class BinaryOpsTest(XLATestCase):
           np.array([2, -1], dtype=dtype),
           expected=np.array([[[[3, 1], [5, 3]]]], dtype=dtype))
 
+    if np.int64 in self.numeric_types:
+      self._testBinary(
+          math_ops.add,
+          np.array([0xffffffff, 0xfffffffff, 1, 1], dtype=np.int64),
+          np.array([1, 1, 0xffffffff, 0xfffffffff], dtype=np.int64),
+          expected=np.array([1 << 32, 1 << 36, 1 << 32, 1 << 36],
+                            dtype=np.int64))
+
   def testComplexOps(self):
     for dtype in self.complex_types:
       ctypes = {np.complex64: np.float32}
diff --git a/tensorflow/compiler/tests/build_defs.bzl b/tensorflow/compiler/tests/build_defs.bzl
index 0528a5415d579a844e68403ace1bb8982a10a841..45b6a6eb86f8ec3d20e06efa103bec2944b9b095 100644
--- a/tensorflow/compiler/tests/build_defs.bzl
+++ b/tensorflow/compiler/tests/build_defs.bzl
@@ -51,12 +51,12 @@ def tf_xla_py_test(name, srcs=[], deps=[], tags=[], data=[], main=None,
     if backend == "cpu":
       backend_args += [
           "--test_device=XLA_CPU",
-          "--types=DT_FLOAT,DT_DOUBLE,DT_INT32,DT_INT64,DT_BOOL,DT_COMPLEX64"
+          "--types=DT_HALF,DT_FLOAT,DT_DOUBLE,DT_INT32,DT_INT64,DT_BOOL,DT_COMPLEX64"
       ]
     elif backend == "gpu":
       backend_args += [
           "--test_device=XLA_GPU",
-          "--types=DT_FLOAT,DT_DOUBLE,DT_INT32,DT_INT64,DT_BOOL,DT_COMPLEX64"
+          "--types=DT_FLOAT,DT_DOUBLE,DT_INT32,DT_INT64,DT_BOOL,DT_COMPLEX64,DT_BFLOAT16"
       ]
       backend_tags += ["requires-gpu-sm35"]
     elif backend in plugins:
@@ -89,4 +89,3 @@ def generate_backend_suites(backends=[]):
     backends = all_backends()
   for backend in backends:
     native.test_suite(name="%s_tests" % backend, tags=["tf_xla_%s" % backend])
-
diff --git a/tensorflow/compiler/tests/cholesky_op_test.py b/tensorflow/compiler/tests/cholesky_op_test.py
index 5010fe5e21d0782e68d4e6d5bf6b4df1b44793a3..1a8989d7c2f617525c301f30fd899a01362310bf 100644
--- a/tensorflow/compiler/tests/cholesky_op_test.py
+++ b/tensorflow/compiler/tests/cholesky_op_test.py
@@ -34,6 +34,13 @@ from tensorflow.python.platform import test
 
 class CholeskyOpTest(XLATestCase):
 
+  # Cholesky defined for float64, float32, complex64, complex128
+  # (https://www.tensorflow.org/api_docs/python/tf/cholesky)
+  @property
+  def float_types(self):
+    return set(super(CholeskyOpTest, self).float_types).intersection(
+        (np.float64, np.float32, np.complex64, np.complex128))
+
   def _verifyCholeskyBase(self, sess, placeholder, x, chol, verification, atol):
     chol_np, verification_np = sess.run([chol, verification], {placeholder: x})
     self.assertAllClose(x, verification_np, atol=atol)
diff --git a/tensorflow/compiler/tests/jit_test.py b/tensorflow/compiler/tests/jit_test.py
index f9d87c2d1cfe5c1a7487e124c971a54ffcfede15..1f7da659e5590b86c96964bbd14a4175341783c8 100644
--- a/tensorflow/compiler/tests/jit_test.py
+++ b/tensorflow/compiler/tests/jit_test.py
@@ -23,6 +23,7 @@ import numpy as np
 
 from tensorflow.contrib.compiler import jit
 from tensorflow.core.protobuf import config_pb2
+from tensorflow.core.protobuf import rewriter_config_pb2
 from tensorflow.python.client import session as session_lib
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -38,6 +39,18 @@ from tensorflow.python.platform import test
 jit_scope = jit.experimental_jit_scope
 
 
+# Disable rewrites to make sure we don't end up having to update this test
+# whenever we implement new ones.
+def NoRewriteSessionConfig():
+  rewriter_config = rewriter_config_pb2.RewriterConfig(
+      disable_model_pruning=True,
+      arithmetic_optimization=rewriter_config_pb2.RewriterConfig.OFF,
+      dependency_optimization=rewriter_config_pb2.RewriterConfig.OFF,
+      function_optimization=rewriter_config_pb2.RewriterConfig.OFF)
+  graph_options = config_pb2.GraphOptions(rewrite_options=rewriter_config)
+  return config_pb2.ConfigProto(graph_options=graph_options)
+
+
 def CompiledKernel(fn, *inputs, **kwargs):
   """Execute 'fn' as a compiled XLA kernel, with 'inputs'."""
   name = kwargs.pop("name", None)
@@ -81,7 +94,7 @@ class JitLaunchTest(test.TestCase):
   # actually ran. However, it is sometimes possible for _XlaLaunch ops to be
   # constant-folded away, so the check is optional.
   def _compare(self, fn, args, require_kernel_launch=True, noinline=None):
-    with session_lib.Session() as sess:
+    with session_lib.Session(config=NoRewriteSessionConfig()) as sess:
       placeholders = []
       feeds = {}
       for arg in args:
@@ -258,7 +271,7 @@ class XlaCompilationTest(test.TestCase):
   def testReshape(self):
     """Tests an operator with compile-time constant and non-constant inputs."""
 
-    with self.test_session() as sess:
+    with self.test_session(config=NoRewriteSessionConfig()) as sess:
       x = array_ops.placeholder(dtypes.float32)
       y = array_ops.placeholder(dtypes.int32)
       with jit_scope():
@@ -282,7 +295,7 @@ class XlaCompilationTest(test.TestCase):
   def testIgnoredArguments(self):
     """Tests that JIT computations can ignore formal parameters."""
 
-    with self.test_session() as sess:
+    with self.test_session(config=NoRewriteSessionConfig()) as sess:
       x = array_ops.placeholder(dtypes.int32)
       y = array_ops.placeholder(dtypes.int32)
       with jit_scope():
@@ -306,7 +319,7 @@ class XlaCompilationTest(test.TestCase):
   def testLoops(self):
     """Tests that compilation accepts computations containing loops."""
 
-    with self.test_session() as session:
+    with self.test_session(config=NoRewriteSessionConfig()) as session:
       x = array_ops.placeholder(dtypes.float32)
       with jit_scope():
         c = lambda i, _: math_ops.less(i, 5)
@@ -324,7 +337,7 @@ class XlaCompilationTest(test.TestCase):
   def testCond(self):
     """Tests that compilation handles switch operators."""
 
-    with self.test_session() as session:
+    with self.test_session(config=NoRewriteSessionConfig()) as session:
       x = array_ops.placeholder(dtypes.float32)
       y = array_ops.placeholder(dtypes.float32)
       c = array_ops.placeholder(dtypes.bool)
@@ -365,7 +378,8 @@ class XlaCompilationTest(test.TestCase):
       inp = array_ops.placeholder(dtypes.float32)
       out = Entry(inp)
 
-    with self.test_session(graph=g, use_gpu=True) as sess:
+    with self.test_session(
+        config=NoRewriteSessionConfig(), graph=g, use_gpu=True) as sess:
       run_metadata = config_pb2.RunMetadata()
       val = sess.run(out,
                      feed_dict={inp: [2., 10.]},
@@ -377,7 +391,7 @@ class XlaCompilationTest(test.TestCase):
   def testLoopDeadlock(self):
     """Regression test for bug that caused deadlocks in graphs with loops."""
 
-    with self.test_session() as session:
+    with self.test_session(config=NoRewriteSessionConfig()) as session:
       x = array_ops.placeholder(dtypes.float32)
       with jit_scope():
         y = x + 1.0
@@ -404,10 +418,10 @@ class XlaCompilationTest(test.TestCase):
         y = Forward(x)
         dx, = gradients_impl.gradients(y, [x], 1.0)
 
-      cfg = config_pb2.ConfigProto(graph_options=config_pb2.GraphOptions(
-          optimizer_options=config_pb2.OptimizerOptions(
-              opt_level=config_pb2.OptimizerOptions.L1,
-              do_function_inlining=True)))
+      cfg = NoRewriteSessionConfig()
+      cfg.graph_options.optimizer_options.opt_level = (
+          config_pb2.OptimizerOptions.L1)
+      cfg.graph_options.optimizer_options.do_function_inlining = True
       with session_lib.Session(graph=g, config=cfg) as sess:
         run_metadata = config_pb2.RunMetadata()
         dx_val = sess.run(dx,
diff --git a/tensorflow/compiler/tests/matrix_triangular_solve_op_test.py b/tensorflow/compiler/tests/matrix_triangular_solve_op_test.py
index cccb7f5789dce39ef8c3d4b3a7573aaa983b3fbd..5819b2bf2b55b9213a039c0ba82dd0bf1c738b00 100644
--- a/tensorflow/compiler/tests/matrix_triangular_solve_op_test.py
+++ b/tensorflow/compiler/tests/matrix_triangular_solve_op_test.py
@@ -37,6 +37,14 @@ def MakePlaceholder(x):
 
 class MatrixTriangularSolveOpTest(XLATestCase):
 
+  #  MatrixTriangularSolve defined for float64, float32, complex64, complex128
+  # (https://www.tensorflow.org/api_docs/python/tf/matrix_triangular_solve)
+  @property
+  def float_types(self):
+    return set(super(MatrixTriangularSolveOpTest,
+                     self).float_types).intersection(
+                         (np.float64, np.float32, np.complex64, np.complex128))
+
   def _VerifyTriangularSolveBase(self, sess, placeholder_a, placeholder_ca,
                                  placeholder_b, a, clean_a, b, verification,
                                  atol):
diff --git a/tensorflow/compiler/tests/oom_test.py b/tensorflow/compiler/tests/oom_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..1434e965e3d7eaeca94ad0fa97498f884e30e115
--- /dev/null
+++ b/tensorflow/compiler/tests/oom_test.py
@@ -0,0 +1,61 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Functional tests for out-of-memory conditions."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.compiler.tests import xla_test
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import googletest
+
+
+class OutOfMemoryTest(xla_test.XLATestCase):
+
+  def testOutputOutOfMemory(self):
+    """Allocates tensors until out of memory.
+
+    Generates a large rank-1 tensor. The tensor is an output of an XLA
+    computation, not constant.
+
+    Check that a ResourceExhaustedError is raised and can be caught.
+
+    We spin in a loop generating larger and larger tensors until an OOM event
+    happens. We may be running sandboxed, so have a small host memory limit, so
+    any hardcoded value is unlikely to land in the sweet spot between device
+    memory size and host memory size with stability.
+    """
+
+    def test_loop():
+      size = 2e8
+      while True:
+        with self.test_session():
+          # Force the compiled code to not be constant by feeding in an addend.
+          p = array_ops.placeholder(dtypes.float32, shape=[])
+          with self.test_scope():
+            # Create a large R1 tensor.
+            c = array_ops.zeros([size, 1]) + p
+
+            c.eval(feed_dict={p: 1.0})
+            size *= 2
+
+    self.assertRaises(errors.ResourceExhaustedError, test_loop)
+
+
+if __name__ == "__main__":
+  googletest.main()
diff --git a/tensorflow/compiler/tests/spacetobatch_op_test.py b/tensorflow/compiler/tests/spacetobatch_op_test.py
index 92518aadc4bf5c601cfb4192c093799784b6aa72..ef47187477545d019ed6e8f12ebe4a771248e607 100644
--- a/tensorflow/compiler/tests/spacetobatch_op_test.py
+++ b/tensorflow/compiler/tests/spacetobatch_op_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.compiler.tests.xla_test import XLATestCase
+from tensorflow.python.framework import dtypes
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_array_ops
 from tensorflow.python.platform import test
@@ -156,6 +157,15 @@ class SpaceToBatchNDTest(XLATestCase):
     paddings = np.array(paddings).reshape((len(block_shape), 2))
     with self.test_session() as sess, self.test_scope():
       for dtype in self.float_types:
+        # TODO(b/68813416): Skip bfloat16's as the input type for direct is
+        # float32 and results in a mismatch, while making testDirect provide the
+        # correctly typed input results in 'no fill-function for data-type'
+        # error.
+        if dtype == dtypes.bfloat16.as_numpy_dtype:
+          continue
+        # TODO(b/77694432): Half test failed on CPU, last ran on 04-06-2018.
+        if dtype == np.float16 and self.device == "XLA_CPU":
+          continue
         placeholder = array_ops.placeholder(dtype)
         # outputs = space_to_batch(inputs)
         x_tf = array_ops.space_to_batch_nd(placeholder, block_shape, paddings)
diff --git a/tensorflow/compiler/tests/unary_ops_test.py b/tensorflow/compiler/tests/unary_ops_test.py
index 3d3e112f4821ea8e57ea9589a5b4433647ad294b..ba79f393a8f9b24ac506d2130957c38ecd442509 100644
--- a/tensorflow/compiler/tests/unary_ops_test.py
+++ b/tensorflow/compiler/tests/unary_ops_test.py
@@ -154,6 +154,9 @@ class UnaryOpsTest(XLATestCase):
 
   def testFloatOps(self):
     for dtype in self.float_types:
+      # TODO(b/77694432): Half test failed on CPU, last ran on 04-06-2018.
+      if dtype == np.float16 and self.device == "XLA_CPU":
+        continue
       x = np.arange(-0.90, 0.90, 0.25)
       self._assertOpOutputMatchesExpected(
           math_ops.acos,
@@ -600,6 +603,20 @@ class UnaryOpsTest(XLATestCase):
               src,
               expected=dst)
 
+  def testBitcast(self):
+    self._assertOpOutputMatchesExpected(
+        lambda x: array_ops.bitcast(x, dtypes.int32),
+        np.array([1, 0x3f800000], np.int32),
+        expected=np.array([1, 0x3f800000], np.int32))
+    self._assertOpOutputMatchesExpected(
+        lambda x: array_ops.bitcast(x, dtypes.float32),
+        np.array([1, 0x3f800000], np.int32),
+        expected=np.array([1e-45, 1.0], np.float32))
+    self._assertOpOutputMatchesExpected(
+        lambda x: array_ops.bitcast(x, dtypes.int32),
+        np.array([1e-45, 1.0], np.float32),
+        expected=np.array([1, 0x3f800000], np.int32))
+
   def testInvertPermutation(self):
     self._assertOpOutputMatchesExpected(
         array_ops.invert_permutation,
@@ -779,7 +796,10 @@ class UnaryOpsTest(XLATestCase):
       self._assertSoftplusMatchesExpected([[-2, 0, 8]], dtype)
       self._assertSoftplusMatchesExpected(
           [[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]], dtype)
-      log_eps = np.log(np.finfo(dtype).eps)
+      if dtype == dtypes.bfloat16.as_numpy_dtype:
+        log_eps = np.log(np.finfo(np.float32).eps)
+      else:
+        log_eps = np.log(np.finfo(dtype).eps)
       one = dtype(1)
       ten = dtype(10)
       self._assertSoftplusMatchesExpected([
diff --git a/tensorflow/compiler/tests/variable_ops_test.py b/tensorflow/compiler/tests/variable_ops_test.py
index b08d6ab21e0746558cb3d4818d4c822c45d2e9ee..8ecad00f6e23b3a7746bbb473102ac847bf4cbfd 100644
--- a/tensorflow/compiler/tests/variable_ops_test.py
+++ b/tensorflow/compiler/tests/variable_ops_test.py
@@ -230,7 +230,10 @@ class SliceAssignTest(XLATestCase):
       # shrink shape changes
       checker[1:2, 1] = [66]
       checker[1, 1:2] = [66]
-      checker[1, 1] = 66
+      if dtype != dtypes.bfloat16.as_numpy_dtype:
+        # TODO(b/68813416): valnp call above results in an ndarray and not a
+        # number for bfloat16s.
+        checker[1, 1] = 66
       # newaxis shape changes
       checker[:, None, :] = [[[10, 20, 30]], [[40, 50, 50]]]
       # shrink and newaxis
@@ -243,8 +246,11 @@ class SliceAssignTest(XLATestCase):
 
       # Assign vector to scalar (rank-0) using newaxis
       checker2 = StridedSliceAssignChecker(self, 222, dtype=dtype)
-      checker2[()] = 6  # no indices
-      checker2[...] = 6  # ellipsis
+      if dtype != dtypes.bfloat16.as_numpy_dtype:
+        # TODO(b/68813416): valnp call above results in an ndarray and not a
+        # number for bfloat16s.
+        checker2[()] = 6  # no indices
+        checker2[...] = 6  # ellipsis
       checker2[None] = [6]  # new axis
 
   def testUninitialized(self):
diff --git a/tensorflow/compiler/tf2xla/BUILD b/tensorflow/compiler/tf2xla/BUILD
index eb20ca501c80b01c76198e1ad54173f1c601714d..e7daf4e01c45c3705216fce7dd3db5baa0c261fc 100644
--- a/tensorflow/compiler/tf2xla/BUILD
+++ b/tensorflow/compiler/tf2xla/BUILD
@@ -332,6 +332,7 @@ tf_cc_test(
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
         "//tensorflow/core:tensor_testutil",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
@@ -462,17 +463,3 @@ cc_library(
         "//tensorflow/core:protos_all_cc",
     ],
 )
-
-# -----------------------------------------------------------------------------
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/compiler/tf2xla/cc/BUILD b/tensorflow/compiler/tf2xla/cc/BUILD
index 311dddca94c458a60fd00afe5532840e0dbf0437..c30bb9cacd48fb93ac359a6a25699ba6a74183c5 100644
--- a/tensorflow/compiler/tf2xla/cc/BUILD
+++ b/tensorflow/compiler/tf2xla/cc/BUILD
@@ -51,17 +51,3 @@ cc_library(
         "//tensorflow/core:protos_all_cc",
     ],
 )
-
-# -----------------------------------------------------------------------------
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/compiler/tf2xla/functionalize_control_flow.cc b/tensorflow/compiler/tf2xla/functionalize_control_flow.cc
index 8b7beef83ec2ed0df780d6a9cb2a4bcf737d008b..16b9142cbf7d2afe99c22acbc32fb17c09b00081 100644
--- a/tensorflow/compiler/tf2xla/functionalize_control_flow.cc
+++ b/tensorflow/compiler/tf2xla/functionalize_control_flow.cc
@@ -901,6 +901,14 @@ FunctionalizeCond::DeterminePredicateSwitchOrder() {
       int src_depth = switch_depth[src_id];
       if (!e->IsControlEdge() || new_switch_depth == src_depth) {
         if (src_depth != new_switch_depth) {
+          // TODO(b/77601805) remove this when outside_compilation supports
+          // control flow.
+          if (str_util::StrContains(src->name(), "outside_compilation") ||
+              str_util::StrContains(n->name(), "outside_compilation")) {
+            return errors::InvalidArgument(
+                "outside_compilation is not yet supported within TensorFlow "
+                "control flow constructs b/77601805");
+          }
           return errors::InvalidArgument(
               "Unable to functionalize control flow in graph: Operand ('",
               src->name(), "') and operator ('", n->name(),
diff --git a/tensorflow/compiler/tf2xla/kernels/BUILD b/tensorflow/compiler/tf2xla/kernels/BUILD
index 0bbfe86de389ff6063b1f9604003f35b41d28e3b..f1bc7d6af49a09f84ef251eaa1c3d684792d0c1e 100644
--- a/tensorflow/compiler/tf2xla/kernels/BUILD
+++ b/tensorflow/compiler/tf2xla/kernels/BUILD
@@ -217,17 +217,3 @@ cc_library(
     ],
     alwayslink = 1,
 )
-
-# -----------------------------------------------------------------------------
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/compiler/tf2xla/kernels/cast_op.cc b/tensorflow/compiler/tf2xla/kernels/cast_op.cc
index 43a6a747c6bcc441f33f276fde4a66f367d99731..c52b2dcb7e9ef81fd52565dfbda05e33a52ed43a 100644
--- a/tensorflow/compiler/tf2xla/kernels/cast_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/cast_op.cc
@@ -62,5 +62,50 @@ class CastOp : public XlaOpKernel {
 
 REGISTER_XLA_OP(Name("Cast"), CastOp);
 
+class BitcastOp : public XlaOpKernel {
+ public:
+  explicit BitcastOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("T", &src_dtype_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("type", &dst_dtype_));
+    OP_REQUIRES_OK(ctx, DataTypeToPrimitiveType(src_dtype_, &src_type_));
+    OP_REQUIRES_OK(ctx, DataTypeToPrimitiveType(dst_dtype_, &dst_type_));
+  }
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    xla::ComputationBuilder* builder = ctx->builder();
+    xla::ComputationDataHandle input = ctx->Input(0);
+    xla::ComputationDataHandle output;
+
+    if (src_dtype_ == dst_dtype_) {
+      output = input;
+    } else {
+      // The only complex type in XLA is C64, so error out if the bitcast has a
+      // complex source or destination type and the bitcast is not trivial.
+      OP_REQUIRES(ctx,
+                  !xla::primitive_util::IsComplexType(src_type_) &&
+                      !xla::primitive_util::IsComplexType(dst_type_),
+                  errors::Unimplemented("Complex types not supported."));
+      // XLA bitcast requires that the bit-width of the source and destination
+      // matches, and currently only the simple lowering is performed.
+      OP_REQUIRES(ctx,
+                  xla::primitive_util::BitWidth(src_type_) ==
+                      xla::primitive_util::BitWidth(dst_type_),
+                  errors::Unimplemented(
+                      "Only bitcasts between equally sized types supported."));
+      output = builder->BitcastConvertType(input, dst_type_);
+    }
+
+    ctx->SetOutput(0, output);
+  }
+
+ protected:
+  DataType src_dtype_, dst_dtype_;
+  xla::PrimitiveType src_type_, dst_type_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(BitcastOp);
+};
+
+REGISTER_XLA_OP(Name("Bitcast"), BitcastOp);
+
 }  // anonymous namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/softmax_op.cc b/tensorflow/compiler/tf2xla/kernels/softmax_op.cc
index aa47cb799f1f3d01f6fcb01ff9f2e410f7f0ac5a..463788b8b461c370a8e7ab4d79a94fc0143b8b45 100644
--- a/tensorflow/compiler/tf2xla/kernels/softmax_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/softmax_op.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 
 namespace tensorflow {
 namespace {
@@ -28,7 +29,7 @@ namespace {
 class SoftmaxOp : public XlaOpKernel {
  public:
   explicit SoftmaxOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
-    log_ = StringPiece(type_string()).starts_with("Log");
+    log_ = str_util::StartsWith(type_string(), "Log");
   }
 
   void Compile(XlaOpKernelContext* ctx) override {
diff --git a/tensorflow/compiler/tf2xla/lib/BUILD b/tensorflow/compiler/tf2xla/lib/BUILD
index 488fda74bf7b5c1d66f8d706a1be3cc1fc29a492..344773c8c5f8e1a552d585d0317c62c56d9f9d46 100644
--- a/tensorflow/compiler/tf2xla/lib/BUILD
+++ b/tensorflow/compiler/tf2xla/lib/BUILD
@@ -140,17 +140,3 @@ cc_library(
         "//tensorflow/core:lib",
     ],
 )
-
-# -----------------------------------------------------------------------------
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/compiler/tf2xla/lib/while_loop.cc b/tensorflow/compiler/tf2xla/lib/while_loop.cc
index 86c02ac2e65c12d3527c4022df0cc603e522ef7a..495d9c60780b0a728e8dbfb4537d33d92b4bb5b7 100644
--- a/tensorflow/compiler/tf2xla/lib/while_loop.cc
+++ b/tensorflow/compiler/tf2xla/lib/while_loop.cc
@@ -54,7 +54,6 @@ xla::StatusOr<std::vector<xla::ComputationDataHandle>> XlaWhileLoop(
         auto result,
         condition_function(unpack_tuple(parameter, arity, cond_builder.get()),
                            cond_builder.get()));
-    TF_RETURN_IF_ERROR(cond_builder->SetReturnValue(result));
   }
   TF_ASSIGN_OR_RETURN(auto cond, cond_builder->Build());
 
diff --git a/tensorflow/compiler/tf2xla/ops/BUILD b/tensorflow/compiler/tf2xla/ops/BUILD
index 98f72b3792eb147f5a1847c5e1ecef18bccbca5f..aeb743a6634673f2e8c4dee9ae1e5017944aae2c 100644
--- a/tensorflow/compiler/tf2xla/ops/BUILD
+++ b/tensorflow/compiler/tf2xla/ops/BUILD
@@ -39,17 +39,3 @@ tf_gen_op_wrapper_py(
         ":sendrecv_ops",
     ],
 )
-
-# -----------------------------------------------------------------------------
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/compiler/tf2xla/sharding_util.cc b/tensorflow/compiler/tf2xla/sharding_util.cc
index 1a0e09758f7cc6714793300c6ece14093a8ad246..5759c72af301785f3ca1110b58eeb2fe7dead713 100644
--- a/tensorflow/compiler/tf2xla/sharding_util.cc
+++ b/tensorflow/compiler/tf2xla/sharding_util.cc
@@ -16,6 +16,7 @@ limitations under the License.
 
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/util/device_name_utils.h"
 
@@ -65,8 +66,8 @@ ParseShardingFromDevice(
   if (explicit_sharding.has_value()) {
     return explicit_sharding;
   } else if (!parsed_device.has_type || !parsed_device.has_id ||
-             !StringPiece(parsed_device.type)
-                  .contains(kDeviceSuffixReplicatedCore)) {
+             !str_util::StrContains(parsed_device.type,
+                                    kDeviceSuffixReplicatedCore)) {
     return tensorflow::gtl::optional<xla::OpSharding>();
   } else {
     const int core = parsed_device.id;
diff --git a/tensorflow/compiler/tf2xla/tf2xla_test.cc b/tensorflow/compiler/tf2xla/tf2xla_test.cc
index a9978e697b091715ce120f0d18fdddd259e08b32..b813668a9edd3a704a9dca1eaa588c1eced6ac31 100644
--- a/tensorflow/compiler/tf2xla/tf2xla_test.cc
+++ b/tensorflow/compiler/tf2xla/tf2xla_test.cc
@@ -90,6 +90,11 @@ TEST(ConvertGraphDefToXla, Sum) {
   TF_EXPECT_OK(result_or.status());
   std::unique_ptr<xla::Literal> result = std::move(result_or.ValueOrDie());
   EXPECT_EQ("(s32[]) (\n42\n)", result->ToString());
+
+  config.mutable_feed(0)->mutable_id()->set_output_index(
+      123); /* invalid output_index */
+  EXPECT_TRUE(errors::IsInvalidArgument(
+      ConvertGraphDefToXla(graph_def, config, client, &computation)));
 }
 
 }  // namespace
diff --git a/tensorflow/compiler/tf2xla/tf2xla_util.cc b/tensorflow/compiler/tf2xla/tf2xla_util.cc
index f428a194328935fec1210ea96245344de859e611..2fc77cc4bc536a9e0f48f3933615985650c2cd84 100644
--- a/tensorflow/compiler/tf2xla/tf2xla_util.cc
+++ b/tensorflow/compiler/tf2xla/tf2xla_util.cc
@@ -151,8 +151,15 @@ Status AddPlaceholdersForFeeds(
       Status status;
       Node* feed_node = g.AddNode(gd.node(0), &status);
       TF_RETURN_IF_ERROR(status);
-      info.data_type =
-          BaseType(feed_node->output_type(info.feed->id().output_index()));
+
+      if (info.feed->id().output_index() < feed_node->num_outputs()) {
+        info.data_type =
+            BaseType(feed_node->output_type(info.feed->id().output_index()));
+      } else {
+        return errors::InvalidArgument(
+            "Invalid output_index ", info.feed->id().output_index(),
+            " for feed node ", info.feed->id().node_name());
+      }
     }
   }
 
diff --git a/tensorflow/compiler/tf2xla/tf2xla_util_test.cc b/tensorflow/compiler/tf2xla/tf2xla_util_test.cc
index ed10d80609641b090cf78bf2e17364fe2fa89c31..ae51446204baf14dc03fc6305641048dbf3872b0 100644
--- a/tensorflow/compiler/tf2xla/tf2xla_util_test.cc
+++ b/tensorflow/compiler/tf2xla/tf2xla_util_test.cc
@@ -25,6 +25,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/test.h"
 
@@ -33,7 +34,7 @@ namespace {
 
 void ExpectErrorContains(const Status& status, StringPiece str) {
   EXPECT_NE(Status::OK(), status);
-  EXPECT_TRUE(StringPiece(status.error_message()).contains(str))
+  EXPECT_TRUE(str_util::StrContains(status.error_message(), str))
       << "expected error: " << status.error_message() << " to contain: " << str;
 }
 
diff --git a/tensorflow/compiler/tf2xla/xla_compiler_test.cc b/tensorflow/compiler/tf2xla/xla_compiler_test.cc
index a18eeacd41808884fac9ec5d617cb0d274ea27d8..096dc7160bfc0a3a751f33e7d646471ebea56070 100644
--- a/tensorflow/compiler/tf2xla/xla_compiler_test.cc
+++ b/tensorflow/compiler/tf2xla/xla_compiler_test.cc
@@ -35,6 +35,7 @@ limitations under the License.
 #include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/public/version.h"
 
@@ -257,10 +258,10 @@ TEST_F(XlaCompilerTest, HasSaneErrorOnNonCompileTimeConstantInputToReshape) {
                             std::move(graph), args, &result);
   EXPECT_FALSE(status.ok());
   EXPECT_TRUE(
-      StringPiece(status.error_message()).contains("depends on a parameter"))
+      str_util::StrContains(status.error_message(), "depends on a parameter"))
       << status.error_message();
   EXPECT_TRUE(
-      StringPiece(status.error_message()).contains("[[Node: C = Reshape"))
+      str_util::StrContains(status.error_message(), "[[Node: C = Reshape"))
       << status.error_message();
 }
 
@@ -597,7 +598,8 @@ TEST_F(XlaCompilerTest, UndefinedFunctionFails) {
       compiler.CompileFunction(XlaCompiler::CompileOptions(), name_attr,
                                /*args=*/{}, &result);
   EXPECT_FALSE(status.ok());
-  EXPECT_TRUE(StringPiece(status.error_message()).contains("is not defined."))
+  EXPECT_TRUE(str_util::StrContains(StringPiece(status.error_message()),
+                                    "is not defined."))
       << status.error_message();
 }
 
@@ -676,11 +678,12 @@ TEST_F(XlaCompilerTest, LocalFunctionWithWrongArgumentsFail) {
 
   ASSERT_FALSE(status.ok());
   // Flib lookup failure.
-  EXPECT_TRUE(StringPiece(status.error_message()).contains("is not defined."))
+  EXPECT_TRUE(str_util::StrContains(StringPiece(status.error_message()),
+                                    "is not defined."))
       << status.error_message();
   // Local flib lookup failure.
-  EXPECT_TRUE(
-      StringPiece(status.error_message()).contains("Attr T is not found"))
+  EXPECT_TRUE(str_util::StrContains(StringPiece(status.error_message()),
+                                    "Attr T is not found"))
       << status.error_message();
 }
 
diff --git a/tensorflow/compiler/tf2xla/xla_helpers.cc b/tensorflow/compiler/tf2xla/xla_helpers.cc
index 3b0b2f06ebae4af918cbe6fb8a384004c1858998..62a5114837e07f35134ad99e28880d6a9233a213 100644
--- a/tensorflow/compiler/tf2xla/xla_helpers.cc
+++ b/tensorflow/compiler/tf2xla/xla_helpers.cc
@@ -122,6 +122,9 @@ xla::ComputationDataHandle XlaHelpers::One(xla::ComputationBuilder* b,
 xla::ComputationDataHandle XlaHelpers::Epsilon(xla::ComputationBuilder* b,
                                                DataType data_type) {
   switch (data_type) {
+    case DT_HALF:
+      return b->ConstantR0<Eigen::half>(
+          static_cast<Eigen::half>(Eigen::NumTraits<Eigen::half>::epsilon()));
     case DT_BFLOAT16:
       return b->ConstantR0<bfloat16>(bfloat16::epsilon());
     case DT_FLOAT:
diff --git a/tensorflow/compiler/tf2xla/xla_op_registry.h b/tensorflow/compiler/tf2xla/xla_op_registry.h
index ff7453194af3a85bded86a5ce298f8779422dccb..e255b01dd7fdcb095c7992d4352d2d9bb7d36ac3 100644
--- a/tensorflow/compiler/tf2xla/xla_op_registry.h
+++ b/tensorflow/compiler/tf2xla/xla_op_registry.h
@@ -51,13 +51,13 @@ constexpr std::array<DataType, 9> kNumericTypes = {
     {DT_UINT32, DT_UINT64, DT_INT32, DT_INT64, DT_HALF, DT_FLOAT, DT_DOUBLE,
      DT_COMPLEX64, DT_BFLOAT16}};
 
-constexpr std::array<DataType, 8> kCpuAllTypes = {
-    {DT_UINT32, DT_UINT64, DT_INT32, DT_INT64, DT_FLOAT, DT_DOUBLE,
+constexpr std::array<DataType, 9> kCpuAllTypes = {
+    {DT_UINT32, DT_UINT64, DT_INT32, DT_INT64, DT_HALF, DT_FLOAT, DT_DOUBLE,
      DT_COMPLEX64, DT_BOOL}};
 
-constexpr std::array<DataType, 8> kGpuAllTypes = {
-    {DT_UINT32, DT_UINT64, DT_INT32, DT_INT64, DT_FLOAT, DT_DOUBLE,
-     DT_COMPLEX64, DT_BOOL}};
+constexpr std::array<DataType, 10> kGpuAllTypes = {
+    {DT_UINT32, DT_UINT64, DT_INT32, DT_INT64, DT_HALF, DT_FLOAT, DT_DOUBLE,
+     DT_COMPLEX64, DT_BOOL, DT_BFLOAT16}};
 
 // Class that manages registrations of operators and devices for the XLA JIT.
 // Not thread-safe.
diff --git a/tensorflow/compiler/xla/BUILD b/tensorflow/compiler/xla/BUILD
index cd13db4d300bb5bba21a734173b6afb9223539d8..751777222fcc7ec073958349aa2677d5b4e6757d 100644
--- a/tensorflow/compiler/xla/BUILD
+++ b/tensorflow/compiler/xla/BUILD
@@ -654,18 +654,6 @@ tf_cc_test(
 
 # -----------------------------------------------------------------------------
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 # This is a headers target that extra XLA devices can use to prevent circular dependencies.  Devices that are compiled as separate shared objects can also use it to prevent linking of library code.
 cc_header_only_library(
     name = "xla_headers_lib",
diff --git a/tensorflow/compiler/xla/client/BUILD b/tensorflow/compiler/xla/client/BUILD
index 5094e5ce6786bb56da408ea6ec83f786be422b38..a299c2afd45aa6b785964b8a8e1400ddf54083a4 100644
--- a/tensorflow/compiler/xla/client/BUILD
+++ b/tensorflow/compiler/xla/client/BUILD
@@ -214,17 +214,3 @@ cc_library(
         "//tensorflow/compiler/xla:xla_data_proto",
     ],
 )
-
-# -----------------------------------------------------------------------------
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/compiler/xla/client/client.cc b/tensorflow/compiler/xla/client/client.cc
index 5ce3c45528cfa36315977f7feac920ffd2272894..f0f94298a05f7c4bdc41cbfb8572454fbedd371d 100644
--- a/tensorflow/compiler/xla/client/client.cc
+++ b/tensorflow/compiler/xla/client/client.cc
@@ -193,6 +193,34 @@ StatusOr<std::unique_ptr<Literal>> Client::ExecuteAndTransfer(
   return Transfer(*data, shape_with_output_layout);
 }
 
+StatusOr<std::unique_ptr<Literal>> Client::ComputeConstant(
+    const XlaComputation& computation, const Layout* output_layout) const {
+  ComputeConstantGraphRequest request;
+  *request.mutable_computation() = computation.proto();
+  if (output_layout != nullptr) {
+    *request.mutable_output_layout() = *output_layout;
+  }
+
+  ComputeConstantResponse response;
+
+  VLOG(2) << "making compute-constant-graph request";
+  Status s = stub_->ComputeConstantGraph(&request, &response);
+  VLOG(2) << "done with request";
+
+  if (!s.ok()) {
+    return s;
+  }
+
+  VLOG(3) << "ComputeConstant: {" << response.DebugString() << "}";
+
+  if (!response.has_literal()) {
+    return InternalError(
+        "no computed literal in the provided response in ComputeConstantGraph "
+        "request");
+  }
+  return Literal::CreateFromProto(response.literal());
+}
+
 StatusOr<Computation> Client::LoadSnapshot(const SessionModule& module) {
   LoadComputationSnapshotRequest request;
   *request.mutable_module() = module;
@@ -276,7 +304,12 @@ StatusOr<std::unique_ptr<GlobalData>> Client::Execute(
 
   if (execution_profile != nullptr) {
     *execution_profile = response.profile();
-    // TODO(b/74197823): Get execution stats for the graph and VLOG(1) them.
+    if (VLOG_IS_ON(1)) {
+      TF_ASSIGN_OR_RETURN(
+          auto execution_stats,
+          ExecutionStatsAsString(computation, response.profile()));
+      VLOG(1) << execution_stats;
+    }
   }
 
   return MakeUnique<GlobalData>(stub_, response.output());
@@ -317,6 +350,42 @@ StatusOr<std::vector<std::unique_ptr<GlobalData>>> Client::ExecuteParallel(
   return std::move(outputs);
 }
 
+StatusOr<std::vector<std::unique_ptr<GlobalData>>> Client::ExecuteParallel(
+    tensorflow::gtl::ArraySlice<XlaComputationInstance> computations) {
+  ExecuteGraphParallelRequest request;
+
+  for (const XlaComputationInstance& computation : computations) {
+    ExecuteGraphRequest single_request;
+    *single_request.mutable_computation() = computation.computation.proto();
+    for (GlobalData* argument : computation.arguments) {
+      *single_request.add_arguments() = argument->handle();
+    }
+    *single_request.mutable_execution_options() = computation.execution_options;
+    *request.add_requests() = single_request;
+  }
+
+  ExecuteParallelResponse response;
+  VLOG(1) << "making execute-graph-parallel request: "
+          << request.ShortDebugString();
+  tensorflow::Status s = stub_->ExecuteGraphParallel(&request, &response);
+  VLOG(1) << "done with request";
+
+  if (!s.ok()) {
+    return s;
+  }
+
+  std::vector<std::unique_ptr<GlobalData>> outputs;
+  for (size_t i = 0; i < computations.size(); ++i) {
+    outputs.push_back(
+        MakeUnique<GlobalData>(stub_, response.responses(i).output()));
+    if (computations[i].execution_profile != nullptr) {
+      *computations[i].execution_profile = response.responses(i).profile();
+    }
+  }
+
+  return std::move(outputs);
+}
+
 StatusOr<std::vector<DeviceHandle>> Client::GetDeviceHandles(
     int64 device_count) {
   if (device_count < 1) {
@@ -393,6 +462,27 @@ StatusOr<ComputationStats> Client::GetComputationStats(
   return response.stats();
 }
 
+StatusOr<ComputationStats> Client::GetComputationStats(
+    const XlaComputation& computation,
+    const DebugOptions& debug_options) const {
+  ComputationGraphStatsRequest request;
+
+  // TODO(b/74197823): Find a way to avoid the copy of the hlo proto.
+  *request.mutable_computation() = computation.proto();
+  *request.mutable_debug_options() = debug_options;
+  ComputationStatsResponse response;
+
+  VLOG(1) << "making computation graph stats request";
+  Status s = stub_->GetComputationGraphStats(&request, &response);
+  VLOG(1) << "done with request";
+
+  if (!s.ok()) {
+    return s;
+  }
+  CHECK(response.has_stats());
+  return response.stats();
+}
+
 StatusOr<std::unique_ptr<ProgramShape>> Client::GetComputationShape(
     const Computation& computation) {
   GetComputationShapeRequest request;
@@ -410,6 +500,12 @@ StatusOr<std::unique_ptr<ProgramShape>> Client::GetComputationShape(
   return WrapUnique(response.release_program_shape());
 }
 
+StatusOr<std::unique_ptr<ProgramShape>> Client::GetComputationShape(
+    const XlaComputation& computation) {
+  TF_ASSIGN_OR_RETURN(const auto& result, computation.GetProgramShape());
+  return MakeUnique<ProgramShape>(result);
+}
+
 StatusOr<Shape> Client::GetShape(const GlobalData& data) {
   GetShapeRequest request;
   *request.mutable_data() = data.handle();
@@ -448,6 +544,28 @@ StatusOr<string> Client::ExecutionStatsAsString(
   return string("[Execution Statistics] not available.");
 }
 
+StatusOr<string> Client::ExecutionStatsAsString(
+    const XlaComputation& computation, const ExecutionProfile& profile) {
+  TF_ASSIGN_OR_RETURN(
+      auto computation_stats,
+      GetComputationStats(computation,
+                          legacy_flags::GetDebugOptionsFromFlags()));
+  int64 total_flops =
+      computation_stats.flop_count() + computation_stats.transcendental_count();
+  if (profile.compute_time_ns() > 0) {
+    int64 nanoseconds = profile.compute_time_ns();
+    int64 cycle_count = profile.compute_cycle_count();
+    double gflops = total_flops / nanoseconds;
+    return tensorflow::strings::StrCat(
+        "[Execution Statistics] flop count: ", computation_stats.flop_count(),
+        ", transcendental count: ", computation_stats.transcendental_count(),
+        ", compute execution time: ", nanoseconds, " nsec",
+        ", compute cycles: ", cycle_count, ", performance: ", gflops,
+        "gflop/s");
+  }
+  return string("[Execution Statistics] not available.");
+}
+
 StatusOr<ChannelHandle> Client::CreateChannelHandle() {
   CreateChannelHandleRequest request;
   CreateChannelHandleResponse response;
diff --git a/tensorflow/compiler/xla/client/client.h b/tensorflow/compiler/xla/client/client.h
index ec87646ebf3bfffc70aa1a8597fb2053a7fbe059..14c685d94ea31c382d84223ca4e2eba544420d78 100644
--- a/tensorflow/compiler/xla/client/client.h
+++ b/tensorflow/compiler/xla/client/client.h
@@ -99,6 +99,36 @@ class Client {
   StatusOr<std::vector<std::unique_ptr<GlobalData>>> ExecuteParallel(
       tensorflow::gtl::ArraySlice<ComputationInstance> computations);
 
+  // A struct to represent a computation instance to be executed.
+  // * If execution_options.device_handles is not empty, the computation is
+  //   executed on the devices associated with the handles by partitioning the
+  //   computation based on the attached sharding attributes. Otherwise, a
+  //   device is chosen by the service.
+  //
+  // TODO(b/74197823): This is a part of a NOT YET ready refactor.
+  struct XlaComputationInstance {
+    const XlaComputation& computation;
+    std::vector<GlobalData*> arguments;
+    ExecutionOptions execution_options;
+    ExecutionProfile* execution_profile;
+
+    XlaComputationInstance(const XlaComputation& computation,
+                           std::vector<GlobalData*> arguments,
+                           ExecutionOptions execution_options,
+                           ExecutionProfile* execution_profile)
+        : computation(computation),
+          arguments(std::move(arguments)),
+          execution_options(execution_options),
+          execution_profile(execution_profile) {}
+  };
+
+  // Executes a list XlaComputationInstances and returns global data produced
+  // from each computation.
+  //
+  // TODO(b/74197823): This is a part of a NOT YET ready refactor.
+  StatusOr<std::vector<std::unique_ptr<GlobalData>>> ExecuteParallel(
+      tensorflow::gtl::ArraySlice<XlaComputationInstance> computations);
+
   // Requests device_count device handles available on the target. The returned
   // device handles are used to specify the devices to execute the computations
   // (see ExecuteParallel) or to transfer data (see TransferToServer or
@@ -164,6 +194,27 @@ class Client {
       const ExecutionOptions* execution_options = nullptr,
       ExecutionProfile* execution_profile = nullptr);
 
+  // Computes the value of the given computation using a non-optimized
+  // interpreter on the host.
+  //
+  // The computation must not depend on any parameters, or on stateful operators
+  // such as `RngNormal` or `Infeed`.
+  //
+  // This functionality can be useful when translating a computation into XLA
+  // where something that looked dynamic is required by XLA to be specified as a
+  // constant. E.g. the source computation (outside of XLA) may include a
+  // dynamic computation of the shape of something and ComputeConstant lets you
+  // determine what the value of that computation is in the case where the value
+  // can be determined at compile time.
+  //
+  // If output_layout is non-null, then the output of the computation will be
+  // stored using that layout.
+  //
+  // TODO(b/74197823): This is a part of a NOT YET ready refactor.
+  StatusOr<std::unique_ptr<Literal>> ComputeConstant(
+      const XlaComputation& computation,
+      const Layout* output_layout = nullptr) const;
+
   // Unregister the memory for the given GlobalData on the device.
   Status Unregister(const GlobalData& data);
 
@@ -175,6 +226,13 @@ class Client {
   StatusOr<ComputationStats> GetComputationStats(
       const Computation& computation, const DebugOptions& debug_options) const;
 
+  // Retrieves the statistics of the given computation.
+  //
+  // TODO(b/74197823): This is a part of a NOT YET ready refactor.
+  StatusOr<ComputationStats> GetComputationStats(
+      const XlaComputation& computation,
+      const DebugOptions& debug_options) const;
+
   // Returns the Shape of the given array specified by 'data'. The shape
   // includes the Layout of the array as it is stored on the service.
   StatusOr<Shape> GetShape(const GlobalData& data);
@@ -184,6 +242,13 @@ class Client {
   StatusOr<std::unique_ptr<ProgramShape>> GetComputationShape(
       const Computation& computation);
 
+  // As above, but returns the shape of the provided computation (parameter
+  // types/names and return type).
+  //
+  // TODO(b/74197823): This is a part of a NOT YET ready refactor.
+  StatusOr<std::unique_ptr<ProgramShape>> GetComputationShape(
+      const XlaComputation& computation);
+
   // Creates a channel handle that can be used to transfer data between
   // two computations via a pair of Send and Recv instructions.
   StatusOr<ChannelHandle> CreateChannelHandle();
@@ -197,6 +262,8 @@ class Client {
   // ExecutionProfile returned from an execution of the computation.
   StatusOr<string> ExecutionStatsAsString(const Computation& computation,
                                           const ExecutionProfile& profile);
+  StatusOr<string> ExecutionStatsAsString(const XlaComputation& computation,
+                                          const ExecutionProfile& profile);
 
   ServiceInterface* stub_;  // Stub that this client is connected on.
 
diff --git a/tensorflow/compiler/xla/client/computation_builder.cc b/tensorflow/compiler/xla/client/computation_builder.cc
index 39d02f0863f78d4094f2cc4805f534713fb7e929..4d3b0ee0d6e9ba82cfa09af0fbff0ae1efa0ac64 100644
--- a/tensorflow/compiler/xla/client/computation_builder.cc
+++ b/tensorflow/compiler/xla/client/computation_builder.cc
@@ -253,26 +253,6 @@ StatusOr<ProgramShape> ComputationBuilder::GetProgramShape() {
   return std::move(*response.mutable_program_shape());
 }
 
-ComputationDataHandle ComputationBuilder::CheckShape(
-    const ComputationDataHandle& operand, const Shape& expected_shape) {
-  std::unique_ptr<Shape> actual_shape = GetShape(operand).ConsumeValueOrDie();
-  CHECK(ShapeUtil::Equal(expected_shape, *actual_shape))
-      << "want " << ShapeUtil::HumanString(expected_shape) << " got "
-      << ShapeUtil::HumanString(*actual_shape);
-  return operand;
-}
-
-void ComputationBuilder::CheckSameShape(const ComputationDataHandle& lhs,
-                                        const ComputationDataHandle& rhs) {
-  std::unique_ptr<Shape> lhs_shape = GetShape(lhs).ConsumeValueOrDie();
-  std::unique_ptr<Shape> rhs_shape = GetShape(rhs).ConsumeValueOrDie();
-  VLOG(2) << "checking " << ShapeUtil::HumanString(*lhs_shape) << " equals "
-          << ShapeUtil::HumanString(*rhs_shape);
-  CHECK(ShapeUtil::Equal(*lhs_shape, *rhs_shape))
-      << "lhs " << ShapeUtil::HumanString(*lhs_shape) << " rhs "
-      << ShapeUtil::HumanString(*rhs_shape);
-}
-
 ComputationDataHandle ComputationBuilder::Slice(
     const ComputationDataHandle& operand,
     tensorflow::gtl::ArraySlice<int64> start_indices,
diff --git a/tensorflow/compiler/xla/client/computation_builder.h b/tensorflow/compiler/xla/client/computation_builder.h
index 2141ebc2065a1a80d2fe820a7b6fe15434c89e28..019c6f3afb5d57bfe453988ded19120a4483cf36 100644
--- a/tensorflow/compiler/xla/client/computation_builder.h
+++ b/tensorflow/compiler/xla/client/computation_builder.h
@@ -104,15 +104,6 @@ class ComputationBuilder {
   // Retrieves the (inferred) result for the current computation's shape.
   StatusOr<ProgramShape> GetProgramShape();
 
-  // Checks that the operand has the given expected shape. Returns the operand
-  // if yes, fails with a CHECK error if no.
-  ComputationDataHandle CheckShape(const ComputationDataHandle& operand,
-                                   const Shape& expected_shape);
-
-  // Checks that the lhs and rhs results have the same shape.
-  void CheckSameShape(const ComputationDataHandle& lhs,
-                      const ComputationDataHandle& rhs);
-
   // Enqueues a constant with the value of the given literal onto the
   // computation.
   ComputationDataHandle ConstantLiteral(const Literal& literal);
diff --git a/tensorflow/compiler/xla/client/lib/BUILD b/tensorflow/compiler/xla/client/lib/BUILD
index fca2bf2688cd21b44f099da3bae3b890cbb069ab..f4673a8204f27e93441c73f6dcc9130d96cfcebc 100644
--- a/tensorflow/compiler/xla/client/lib/BUILD
+++ b/tensorflow/compiler/xla/client/lib/BUILD
@@ -24,6 +24,8 @@ cc_library(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:computation",
         "//tensorflow/compiler/xla/client:computation_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/core:lib",
     ],
 )
@@ -48,17 +50,3 @@ cc_library(
         "//tensorflow/core:lib",
     ],
 )
-
-# -----------------------------------------------------------------------------
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/compiler/xla/client/lib/arithmetic.cc b/tensorflow/compiler/xla/client/lib/arithmetic.cc
index 24048a1e5a782661ba577ba50e3b5b2914f17c0a..63df449e0b3bdd642d548319dd7d621ca2f59b1d 100644
--- a/tensorflow/compiler/xla/client/lib/arithmetic.cc
+++ b/tensorflow/compiler/xla/client/lib/arithmetic.cc
@@ -26,6 +26,7 @@ limitations under the License.
 
 namespace xla {
 namespace {
+
 using InstructionGenerator =
     ComputationDataHandle (*)(ComputationBuilder*, const ComputationDataHandle&,
                               const ComputationDataHandle&);
@@ -47,6 +48,27 @@ Computation CreateScalarComputation(const string& name, PrimitiveType type,
   generator(b.get(), lhs, rhs);
   return b->BuildAndNoteError();
 }
+
+using XlaOpGenerator = XlaOp (*)(XlaBuilder*, const XlaOp&, const XlaOp&);
+
+XlaComputation CreateScalarComputation(const string& name, PrimitiveType type,
+                                       XlaBuilder* builder,
+                                       XlaOpGenerator generator) {
+  std::unique_ptr<XlaBuilder> b;
+  if (type == PRED) {
+    b = builder->CreateSubBuilder(name);
+  } else {
+    b = builder->CreateSubBuilder(
+        tensorflow::strings::StrCat(name, "_", PrimitiveType_Name(type)));
+  }
+
+  const Shape scalar = ShapeUtil::MakeShape(type, {});
+  auto lhs = b->Parameter(0, scalar, "lhs");
+  auto rhs = b->Parameter(1, scalar, "rhs");
+  generator(b.get(), lhs, rhs);
+  return b->BuildAndNoteError();
+}
+
 }  // namespace
 
 Computation CreateScalarAddComputation(PrimitiveType type,
@@ -60,7 +82,7 @@ Computation CreateScalarAddComputation(PrimitiveType type,
 Computation CreateScalarMultiplyComputation(PrimitiveType type,
                                             ComputationBuilder* builder) {
   return CreateScalarComputation(
-      "add", type, builder,
+      "mul", type, builder,
       [](ComputationBuilder* b, const ComputationDataHandle& lhs,
          const ComputationDataHandle& rhs) { return b->Mul(lhs, rhs); });
 }
@@ -114,4 +136,75 @@ StatusOr<ComputationDataHandle> Any(const ComputationDataHandle& predicates,
   return builder->Reduce(predicates, f, logical_or, all_dimensions);
 }
 
+XlaComputation CreateScalarAddComputation(PrimitiveType type,
+                                          XlaBuilder* builder) {
+  return CreateScalarComputation(
+      "add", type, builder,
+      [](XlaBuilder* b, const XlaOp& lhs, const XlaOp& rhs) {
+        return b->Add(lhs, rhs);
+      });
+}
+
+XlaComputation CreateScalarMultiplyComputation(PrimitiveType type,
+                                               XlaBuilder* builder) {
+  return CreateScalarComputation(
+      "mul", type, builder,
+      [](XlaBuilder* b, const XlaOp& lhs, const XlaOp& rhs) {
+        return b->Mul(lhs, rhs);
+      });
+}
+
+XlaComputation CreateScalarGeComputation(PrimitiveType type,
+                                         XlaBuilder* builder) {
+  return CreateScalarComputation(
+      "ge", type, builder,
+      [](XlaBuilder* b, const XlaOp& lhs, const XlaOp& rhs) {
+        return b->Ge(lhs, rhs);
+      });
+}
+
+XlaComputation CreateScalarMaxComputation(PrimitiveType type,
+                                          XlaBuilder* builder) {
+  return CreateScalarComputation(
+      "max", type, builder,
+      [](XlaBuilder* b, const XlaOp& lhs, const XlaOp& rhs) {
+        return b->Max(lhs, rhs);
+      });
+}
+
+XlaComputation CreateScalarMinComputation(PrimitiveType type,
+                                          XlaBuilder* builder) {
+  return CreateScalarComputation(
+      "min", type, builder,
+      [](XlaBuilder* b, const XlaOp& lhs, const XlaOp& rhs) {
+        return b->Min(lhs, rhs);
+      });
+}
+
+XlaComputation CreateScalarAndComputation(XlaBuilder* builder) {
+  return CreateScalarComputation(
+      "and", PRED, builder,
+      [](XlaBuilder* b, const XlaOp& lhs, const XlaOp& rhs) {
+        return b->And(lhs, rhs);
+      });
+}
+
+XlaComputation CreateScalarOrComputation(XlaBuilder* builder) {
+  return CreateScalarComputation(
+      "or", PRED, builder,
+      [](XlaBuilder* b, const XlaOp& lhs, const XlaOp& rhs) {
+        return b->Or(lhs, rhs);
+      });
+}
+
+StatusOr<XlaOp> Any(const XlaOp& predicates, XlaBuilder* builder) {
+  auto f = builder->ConstantR0<bool>(false);
+  XlaComputation logical_or = CreateScalarOrComputation(builder);
+  TF_ASSIGN_OR_RETURN(const Shape& predicates_shape,
+                      builder->GetShape(predicates));
+  std::vector<int64> all_dimensions(ShapeUtil::Rank(predicates_shape));
+  std::iota(all_dimensions.begin(), all_dimensions.end(), 0);
+  return builder->Reduce(predicates, f, logical_or, all_dimensions);
+}
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/client/lib/arithmetic.h b/tensorflow/compiler/xla/client/lib/arithmetic.h
index ae89784bc227d837cf15f0a89687dd00dccc2745..f4d3fc801590fedbb84ed3d6283e62f47c56d5c7 100644
--- a/tensorflow/compiler/xla/client/lib/arithmetic.h
+++ b/tensorflow/compiler/xla/client/lib/arithmetic.h
@@ -20,6 +20,8 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/client/computation.h"
 #include "tensorflow/compiler/xla/client/computation_builder.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 
 namespace xla {
@@ -56,6 +58,48 @@ Computation CreateScalarOrComputation(ComputationBuilder* builder);
 StatusOr<ComputationDataHandle> Any(const ComputationDataHandle& predicates,
                                     ComputationBuilder* builder);
 
+// TODO(b/74197823): This is a part of a NOT YET ready refactor.
+//
+// Creates a scalar add computation and returns it.
+XlaComputation CreateScalarAddComputation(PrimitiveType type,
+                                          XlaBuilder* builder);
+// TODO(b/74197823): This is a part of a NOT YET ready refactor.
+//
+// Creates a scalar multiply computation and returns it.
+XlaComputation CreateScalarMultiplyComputation(PrimitiveType type,
+                                               XlaBuilder* builder);
+// TODO(b/74197823): This is a part of a NOT YET ready refactor.
+//
+// Creates a scalar ge computation and returns it.
+XlaComputation CreateScalarGeComputation(PrimitiveType type,
+                                         XlaBuilder* builder);
+// TODO(b/74197823): This is a part of a NOT YET ready refactor.
+//
+// Creates a scalar max computation and returns it.
+XlaComputation CreateScalarMaxComputation(PrimitiveType type,
+                                          XlaBuilder* builder);
+// TODO(b/74197823): This is a part of a NOT YET ready refactor.
+//
+// Creates a scalar min computation and returns it.
+XlaComputation CreateScalarMinComputation(PrimitiveType type,
+                                          XlaBuilder* builder);
+// TODO(b/74197823): This is a part of a NOT YET ready refactor.
+//
+// Creates a scalar logical AND computation and returns it.
+XlaComputation CreateScalarAndComputation(XlaBuilder* builder);
+
+// TODO(b/74197823): This is a part of a NOT YET ready refactor.
+//
+// Creates a scalar logical OR computation and returns it.
+XlaComputation CreateScalarOrComputation(XlaBuilder* builder);
+
+// TODO(b/74197823): This is a part of a NOT YET ready refactor.
+//
+// Returns whether any predicate in "predicates" is set.
+//
+// Note: if predicates is zero-sized, Any() vacuously returns false.
+StatusOr<XlaOp> Any(const XlaOp& predicates, XlaBuilder* builder);
+
 }  // namespace xla
 
 #endif  // TENSORFLOW_COMPILER_XLA_CLIENT_LIB_ARITHMETIC_H_
diff --git a/tensorflow/compiler/xla/client/xla_client/BUILD b/tensorflow/compiler/xla/client/xla_client/BUILD
index cc5f551c9c1a7b59426f3490e5e671f341543f34..31fa1241ee474a31575c45cf7652063dfc818fac 100644
--- a/tensorflow/compiler/xla/client/xla_client/BUILD
+++ b/tensorflow/compiler/xla/client/xla_client/BUILD
@@ -44,6 +44,7 @@ cc_library(
     hdrs = ["xla_builder.h"],
     deps = [
         ":xla_computation",
+        "//tensorflow/compiler/xla:execution_options_util",
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
@@ -70,22 +71,9 @@ tf_cc_test(
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/service:hlo_matchers",
         "//tensorflow/core:test",
     ],
 )
-
-# -----------------------------------------------------------------------------
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/compiler/xla/client/xla_client/xla_builder.cc b/tensorflow/compiler/xla/client/xla_client/xla_builder.cc
index fcaf393b6b1db6e8335eb84cf00a19c543df1087..7ccdc2ded2c099690bc9187936db6491ef4142dd 100644
--- a/tensorflow/compiler/xla/client/xla_client/xla_builder.cc
+++ b/tensorflow/compiler/xla/client/xla_client/xla_builder.cc
@@ -15,13 +15,17 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
 
+#include <functional>
 #include <numeric>
+#include <queue>
 #include <string>
 #include <utility>
 
+#include "tensorflow/compiler/xla/execution_options_util.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/service/shape_inference.h"
 #include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/core/lib/gtl/flatset.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/mutex.h"
 
@@ -44,6 +48,7 @@ int64 GetUniqueId() {
 bool CanBeRoot(HloOpcode opcode) {
   switch (opcode) {
     case HloOpcode::kSend:
+    case HloOpcode::kSendDone:
     case HloOpcode::kOutfeed:
     case HloOpcode::kTrace:
       return false;
@@ -52,15 +57,30 @@ bool CanBeRoot(HloOpcode opcode) {
   }
 }
 
+StatusOr<std::vector<Shape>> GetOperandShapes(
+    tensorflow::gtl::ArraySlice<XlaOp> operands) {
+  std::vector<Shape> operand_shapes;
+  for (const XlaOp& operand : operands) {
+    TF_ASSIGN_OR_RETURN(const Shape& shape, operand.GetShape());
+    operand_shapes.push_back(shape);
+  }
+  return operand_shapes;
+}
+
 }  // namespace
 
 StatusOr<Shape> XlaBuilder::GetShape(const XlaOp& op) const {
+  TF_RETURN_IF_ERROR(first_error_);
+
   TF_ASSIGN_OR_RETURN(auto instr, LookUpInstruction(op));
   return instr->shape();
 }
 
 StatusOr<Shape> XlaOp::GetShape() const {
-  TF_RET_CHECK(builder_ != nullptr);
+  if (builder_ == nullptr) {
+    return InvalidArgument(
+        "cannot GetShape for an invalid XlaOp with handle %lld", handle());
+  }
   return builder_->GetShape(*this);
 }
 
@@ -81,8 +101,24 @@ void XlaBuilder::NoteError(const Status& error) {
   }
 }
 
-StatusOr<ProgramShape> XlaBuilder::GetProgramShape(int64* root_id) {
+XlaOp XlaBuilder::NoteErrorOrReturn(
+    const std::function<StatusOr<XlaOp>()>& op_creator) {
+  if (!first_error_.ok()) {
+    return {};
+  }
+  auto op = op_creator();
+  if (!op.ok()) {
+    NoteError(op.status());
+    return {};
+  }
+  return op.ConsumeValueOrDie();
+}
+
+StatusOr<ProgramShape> XlaBuilder::GetProgramShape(int64* root_id) const {
+  TF_RETURN_IF_ERROR(first_error_);
+
   TF_RET_CHECK(root_id != nullptr);
+
   ProgramShape program_shape;
 
   // Not all instructions can be roots. Walk backwards from the last added
@@ -123,9 +159,68 @@ StatusOr<ProgramShape> XlaBuilder::GetProgramShape(int64* root_id) {
   return program_shape;
 }
 
-StatusOr<ProgramShape> XlaBuilder::GetProgramShape() {
-  int64 root_id;
-  return GetProgramShape(&root_id);
+StatusOr<ProgramShape> XlaBuilder::GetProgramShape() const {
+  int64 root;
+  return GetProgramShape(&root);
+}
+
+void XlaBuilder::IsConstantVisitor(const int64 op_handle,
+                                   std::set<int64>* visited,
+                                   bool* is_constant) const {
+  if (visited->count(op_handle) != 0 || !*is_constant) {
+    return;
+  }
+
+  CHECK(op_handle < instructions_.size() && op_handle >= 0);
+
+  const HloInstructionProto& instr = instructions_[op_handle];
+  const HloOpcode opcode = StringToHloOpcode(instr.opcode()).ValueOrDie();
+  switch (opcode) {
+    default:
+      for (const int64 operand_id : instr.operand_ids()) {
+        IsConstantVisitor(operand_id, visited, is_constant);
+      }
+      // TODO(b/32495713): We aren't checking the called computations.
+      break;
+
+    // Non functional ops.
+    case HloOpcode::kRng:
+    case HloOpcode::kCrossReplicaSum:
+      // TODO(b/33009255): Implmement constant folding for cross replica sum.
+    case HloOpcode::kInfeed:
+    case HloOpcode::kOutfeed:
+    case HloOpcode::kHostCompute:
+    case HloOpcode::kCall:
+      // TODO(b/32495713): We aren't checking the to_apply computation itself,
+      // so we conservatively say that computations containing the Call op
+      // cannot be constant.  We cannot set is_functional=false in other similar
+      // cases since we're already relying on IsConstant to return true.
+    case HloOpcode::kCustomCall:
+    case HloOpcode::kWhile:
+      // TODO(b/32495713): We aren't checking the condition and body
+      // computations themselves.
+    case HloOpcode::kSend:
+    case HloOpcode::kRecv:
+    case HloOpcode::kParameter:
+      *is_constant = false;
+      break;
+  }
+  if (!*is_constant) {
+    VLOG(1) << "Non-constant: " << instr.name();
+  }
+  visited->insert(op_handle);
+}
+
+XlaComputation XlaBuilder::BuildAndNoteError() {
+  DCHECK(parent_builder_ != nullptr);
+  auto build_status = Build();
+  if (!build_status.ok()) {
+    parent_builder_->NoteError(
+        AddStatus(build_status.status(),
+                  tensorflow::strings::StrCat("error from: ", name_)));
+    return {};
+  }
+  return build_status.ConsumeValueOrDie();
 }
 
 StatusOr<XlaComputation> XlaBuilder::Build() {
@@ -136,23 +231,25 @@ StatusOr<XlaComputation> XlaBuilder::Build() {
   }
 
   HloComputationProto entry;
-  entry.set_name(name_);
+  entry.set_id(GetUniqueId());  // Give the computation a global unique id.
+  entry.set_name(StrCat(name_, entry.id()));  // Ensure that the name is unique.
 
   {
     int64 root_id;
-    ProgramShape program_shape;
-    TF_ASSIGN_OR_RETURN(program_shape, GetProgramShape(&root_id));
-    entry.mutable_program_shape()->Swap(&program_shape);
+    TF_ASSIGN_OR_RETURN(*entry.mutable_program_shape(),
+                        GetProgramShape(&root_id));
     entry.set_root_id(root_id);
   }
 
   for (auto& instruction : instructions_) {
+    // Ensures that the instruction names are unique among the whole graph.
+    const string& new_name =
+        StrCat(instruction.name(), ".", entry.id(), ".", instruction.id());
+    instruction.set_name(new_name);
     entry.add_instructions()->Swap(&instruction);
   }
 
-  const int64 id = GetUniqueId();
-  entry.set_id(id);
-  XlaComputation computation(id);
+  XlaComputation computation(entry.id());
   HloModuleProto* module = computation.mutable_proto();
   module->set_name(entry.name());
   module->set_id(entry.id());
@@ -175,6 +272,8 @@ StatusOr<XlaComputation> XlaBuilder::Build() {
 StatusOr<XlaOp> XlaBuilder::InDimBroadcast(
     const Shape& shape, const XlaOp& operand,
     tensorflow::gtl::ArraySlice<int64> broadcast_dimensions) {
+  TF_RETURN_IF_ERROR(first_error_);
+
   HloInstructionProto instr;
   *instr.mutable_shape() = shape;
   for (int64 dim : broadcast_dimensions) {
@@ -185,6 +284,8 @@ StatusOr<XlaOp> XlaBuilder::InDimBroadcast(
 
 StatusOr<XlaOp> XlaBuilder::AddBroadcastSequence(const Shape& output_shape,
                                                  const XlaOp& operand) {
+  TF_RETURN_IF_ERROR(first_error_);
+
   TF_ASSIGN_OR_RETURN(const Shape& operand_shape, operand.GetShape());
 
   CHECK(ShapeUtil::IsScalar(operand_shape) ||
@@ -228,7 +329,7 @@ XlaOp XlaBuilder::UnaryOp(HloOpcode unop, const XlaOp& operand) {
     TF_ASSIGN_OR_RETURN(*instr.mutable_shape(),
                         ShapeInference::InferUnaryOpShape(unop, operand_shape));
     return AddInstruction(std::move(instr), unop, {operand});
-  }());
+  });
 }
 
 XlaOp XlaBuilder::BinaryOp(
@@ -285,7 +386,7 @@ XlaOp XlaBuilder::BinaryOp(
     }
 
     return AddInstruction(std::move(instr), binop, {updated_lhs, updated_rhs});
-  }());
+  });
 }
 
 XlaOp XlaBuilder::TernaryOp(HloOpcode triop, const XlaOp& lhs, const XlaOp& rhs,
@@ -323,7 +424,7 @@ XlaOp XlaBuilder::TernaryOp(HloOpcode triop, const XlaOp& lhs, const XlaOp& rhs,
     }
     return AddInstruction(std::move(instr), triop,
                           {updated_lhs, updated_rhs, updated_ehs});
-  }());
+  });
 }
 
 XlaOp XlaBuilder::Add(const XlaOp& lhs, const XlaOp& rhs,
@@ -342,7 +443,7 @@ XlaOp XlaBuilder::ConstantLiteral(const Literal& literal) {
     *instr.mutable_shape() = literal.shape();
     *instr.mutable_literal() = literal.ToProto();
     return AddInstruction(std::move(instr), HloOpcode::kConstant);
-  }());
+  });
 }
 
 XlaOp XlaBuilder::Call(const XlaComputation& computation,
@@ -350,43 +451,35 @@ XlaOp XlaBuilder::Call(const XlaComputation& computation,
   return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
     HloInstructionProto instr;
     std::vector<const Shape*> operand_shape_ptrs;
-    std::vector<Shape> operand_shapes;
-    for (const auto& operand : operands) {
-      TF_ASSIGN_OR_RETURN(const Shape& shape, operand.GetShape());
-      operand_shapes.push_back(shape);
-    }
+    TF_ASSIGN_OR_RETURN(const auto& operand_shapes, GetOperandShapes(operands));
     c_transform(operand_shapes, std::back_inserter(operand_shape_ptrs),
                 [](const Shape& shape) { return &shape; });
-    TF_ASSIGN_OR_RETURN(*instr.mutable_shape(),
-                        ShapeInference::InferCallShape(
-                            operand_shape_ptrs,
-                            /*to_apply=*/computation.GetProgramShape()));
+    TF_ASSIGN_OR_RETURN(const ProgramShape& called_program_shape,
+                        computation.GetProgramShape());
+    TF_ASSIGN_OR_RETURN(
+        *instr.mutable_shape(),
+        ShapeInference::InferCallShape(operand_shape_ptrs,
+                                       /*to_apply=*/called_program_shape));
 
-    // Add called computation.
-    instr.add_called_computation_ids(
-        computation.proto().entry_computation_id());
-    for (const HloComputationProto& e : computation.proto().computations()) {
-      embedded_.insert({e.id(), e});
-    }
+    AddCalledComputation(computation, &instr);
 
     return AddInstruction(std::move(instr), HloOpcode::kCall, operands);
-  }());
+  });
 }
 
 XlaOp XlaBuilder::Parameter(int64 parameter_number, const Shape& shape,
                             const string& name) {
   return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
     HloInstructionProto instr;
-    if (parameter_numbers_.find(parameter_number) != parameter_numbers_.end()) {
+    if (!parameter_numbers_.insert(parameter_number).second) {
       return InvalidArgument("parameter %lld already registered",
                              parameter_number);
     }
-    parameter_numbers_.insert(parameter_number);
     instr.set_parameter_number(parameter_number);
     instr.set_name(name);
     *instr.mutable_shape() = shape;
     return AddInstruction(std::move(instr), HloOpcode::kParameter);
-  }());
+  });
 }
 
 XlaOp XlaBuilder::Broadcast(
@@ -410,10 +503,12 @@ XlaOp XlaBuilder::Broadcast(
       dimensions[i] = i + ShapeUtil::Rank(shape) - operand_rank;
     }
     return InDimBroadcast(shape, operand, dimensions);
-  }());
+  });
 }
 
 StatusOr<XlaOp> XlaBuilder::Reshape(const Shape& shape, const XlaOp& operand) {
+  TF_RETURN_IF_ERROR(first_error_);
+
   HloInstructionProto instr;
   *instr.mutable_shape() = shape;
   return AddInstruction(std::move(instr), HloOpcode::kReshape, {operand});
@@ -423,32 +518,115 @@ XlaOp XlaBuilder::Slice(const XlaOp& operand,
                         tensorflow::gtl::ArraySlice<int64> start_indices,
                         tensorflow::gtl::ArraySlice<int64> limit_indices,
                         tensorflow::gtl::ArraySlice<int64> strides) {
-  return UnimplementedOp();
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+    TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
+    TF_ASSIGN_OR_RETURN(
+        *instr.mutable_shape(),
+        ShapeInference::InferSliceShape(operand_shape, start_indices,
+                                        limit_indices, strides));
+    for (int i = 0; i < start_indices.size(); i++) {
+      auto* slice_config = instr.add_slice_dimensions();
+      slice_config->set_start(start_indices[i]);
+      slice_config->set_limit(limit_indices[i]);
+      slice_config->set_stride(strides[i]);
+    }
+
+    return AddInstruction(std::move(instr), HloOpcode::kSlice, {operand});
+  });
 }
 
 XlaOp XlaBuilder::SliceInDim(const XlaOp& operand, int64 start_index,
                              int64 limit_index, int64 stride, int64 dimno) {
-  return UnimplementedOp();
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_ASSIGN_OR_RETURN(const Shape& shape, GetShape(operand));
+    std::vector<int64> starts(ShapeUtil::Rank(shape), 0);
+    std::vector<int64> limits(shape.dimensions().begin(),
+                              shape.dimensions().end());
+    std::vector<int64> strides(ShapeUtil::Rank(shape), 1);
+    starts[dimno] = start_index;
+    limits[dimno] = limit_index;
+    strides[dimno] = stride;
+    return Slice(operand, starts, limits, strides);
+  });
 }
 
 XlaOp XlaBuilder::DynamicSlice(const XlaOp& operand, const XlaOp& start_indices,
                                tensorflow::gtl::ArraySlice<int64> slice_sizes) {
-  return UnimplementedOp();
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+
+    TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
+    TF_ASSIGN_OR_RETURN(const Shape& start_indices_shape,
+                        GetShape(start_indices));
+    TF_ASSIGN_OR_RETURN(*instr.mutable_shape(),
+                        ShapeInference::InferDynamicSliceShape(
+                            operand_shape, start_indices_shape, slice_sizes));
+
+    for (int64 size : slice_sizes) {
+      instr.add_dynamic_slice_sizes(size);
+    }
+
+    return AddInstruction(std::move(instr), HloOpcode::kDynamicSlice,
+                          {operand, start_indices});
+  });
 }
 
 XlaOp XlaBuilder::DynamicUpdateSlice(const XlaOp& operand, const XlaOp& update,
                                      const XlaOp& start_indices) {
-  return UnimplementedOp();
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+
+    TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
+    TF_ASSIGN_OR_RETURN(const Shape& update_shape, GetShape(update));
+    TF_ASSIGN_OR_RETURN(const Shape& start_indices_shape,
+                        GetShape(start_indices));
+    TF_ASSIGN_OR_RETURN(*instr.mutable_shape(),
+                        ShapeInference::InferDynamicUpdateSliceShape(
+                            operand_shape, update_shape, start_indices_shape));
+
+    return AddInstruction(std::move(instr), HloOpcode::kDynamicUpdateSlice,
+                          {operand, update, start_indices});
+  });
 }
 
 XlaOp XlaBuilder::ConcatInDim(tensorflow::gtl::ArraySlice<XlaOp> operands,
                               int64 dimension) {
-  return UnimplementedOp();
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+
+    std::vector<const Shape*> operand_shape_ptrs;
+    TF_ASSIGN_OR_RETURN(const auto& operand_shapes, GetOperandShapes(operands));
+    c_transform(operand_shapes, std::back_inserter(operand_shape_ptrs),
+                [](const Shape& shape) { return &shape; });
+    TF_ASSIGN_OR_RETURN(
+        *instr.mutable_shape(),
+        ShapeInference::InferConcatOpShape(operand_shape_ptrs, dimension));
+
+    instr.add_dimensions(dimension);
+
+    return AddInstruction(std::move(instr), HloOpcode::kConcatenate, operands);
+  });
 }
 
 XlaOp XlaBuilder::Pad(const XlaOp& operand, const XlaOp& padding_value,
                       const PaddingConfig& padding_config) {
-  return UnimplementedOp();
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+
+    TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
+    TF_ASSIGN_OR_RETURN(const Shape& padding_value_shape,
+                        GetShape(padding_value));
+    TF_ASSIGN_OR_RETURN(
+        *instr.mutable_shape(),
+        ShapeInference::InferPadShape(operand_shape, padding_value_shape,
+                                      padding_config));
+
+    *instr.mutable_padding_config() = padding_config;
+
+    return AddInstruction(std::move(instr), HloOpcode::kPad,
+                          {operand, padding_value});
+  });
 }
 
 XlaOp XlaBuilder::Reshape(const XlaOp& operand,
@@ -463,7 +641,7 @@ XlaOp XlaBuilder::Reshape(const XlaOp& operand,
                            ? operand
                            : Transpose(operand, dimensions);
     return Reshape(shape, transposed);
-  }());
+  });
 }
 
 XlaOp XlaBuilder::Reshape(const XlaOp& operand,
@@ -473,16 +651,59 @@ XlaOp XlaBuilder::Reshape(const XlaOp& operand,
     std::vector<int64> dimensions(shape.dimensions_size());
     std::iota(dimensions.begin(), dimensions.end(), 0);
     return Reshape(operand, dimensions, new_sizes);
-  }());
+  });
 }
 
 XlaOp XlaBuilder::Collapse(const XlaOp& operand,
                            tensorflow::gtl::ArraySlice<int64> dimensions) {
-  return UnimplementedOp();
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    if (dimensions.size() <= 1) {
+      // Not collapsing anything, trivially we can return the operand versus
+      // enqueueing a trivial reshape.
+      return operand;
+    }
+
+    // Out-of-order collapse is not supported.
+    // Checks that the collapsed dimensions are in order and consecutive.
+    for (tensorflow::gtl::ArraySlice<int64>::size_type i = 1;
+         i < dimensions.size(); ++i) {
+      if (dimensions[i] - 1 != dimensions[i - 1]) {
+        return InvalidArgument(
+            "Collapsed dimensions are not in consecutive order.");
+      }
+    }
+
+    // Create a new sizes vector from the old shape, replacing the collapsed
+    // dimensions by the product of their sizes.
+    TF_ASSIGN_OR_RETURN(const Shape& original_shape, GetShape(operand));
+
+    VLOG(3) << "original shape: " << ShapeUtil::HumanString(original_shape);
+    VLOG(3) << "dims to collapse: "
+            << tensorflow::str_util::Join(dimensions, ",");
+
+    std::vector<int64> new_sizes;
+    for (int i = 0; i < ShapeUtil::Rank(original_shape); ++i) {
+      if (i <= dimensions.front() || i > dimensions.back()) {
+        new_sizes.push_back(original_shape.dimensions(i));
+      } else {
+        new_sizes.back() *= original_shape.dimensions(i);
+      }
+    }
+
+    VLOG(3) << "new sizes: [" << tensorflow::str_util::Join(new_sizes, ",")
+            << "]";
+
+    return Reshape(operand, new_sizes);
+  });
 }
 
 void XlaBuilder::Trace(const string& tag, const XlaOp& operand) {
-  UnimplementedOp();
+  NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+    *instr.mutable_shape() = ShapeUtil::MakeNil();
+    *instr.mutable_literal() = Literal::CreateR1U8(tag)->ToProto();
+    return AddInstruction(std::move(instr), HloOpcode::kTrace, {operand});
+  });
 }
 
 XlaOp XlaBuilder::Select(const XlaOp& pred, const XlaOp& on_true,
@@ -491,11 +712,36 @@ XlaOp XlaBuilder::Select(const XlaOp& pred, const XlaOp& on_true,
 }
 
 XlaOp XlaBuilder::Tuple(tensorflow::gtl::ArraySlice<XlaOp> elements) {
-  return UnimplementedOp();
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+    std::vector<const Shape*> operand_shape_ptrs;
+    TF_ASSIGN_OR_RETURN(const auto& operand_shapes, GetOperandShapes(elements));
+    c_transform(operand_shapes, std::back_inserter(operand_shape_ptrs),
+                [](const Shape& shape) { return &shape; });
+    TF_ASSIGN_OR_RETURN(*instr.mutable_shape(),
+                        ShapeInference::InferVariadicOpShape(
+                            HloOpcode::kTuple, operand_shape_ptrs));
+    return AddInstruction(std::move(instr), HloOpcode::kTuple, elements);
+  });
 }
 
 XlaOp XlaBuilder::GetTupleElement(const XlaOp& tuple_data, int64 index) {
-  return UnimplementedOp();
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+    TF_ASSIGN_OR_RETURN(const Shape& tuple_shape, GetShape(tuple_data));
+    if (!ShapeUtil::IsTuple(tuple_shape)) {
+      return InvalidArgument(
+          "Operand to GetTupleElement() is not a tuple; got %s",
+          ShapeUtil::HumanString(tuple_shape).c_str());
+    }
+    *instr.mutable_shape() =
+        ShapeUtil::GetTupleElementShape(tuple_shape, index);
+
+    instr.set_tuple_index(index);
+
+    return AddInstruction(std::move(instr), HloOpcode::kGetTupleElement,
+                          {tuple_data});
+  });
 }
 
 XlaOp XlaBuilder::Eq(const XlaOp& lhs, const XlaOp& rhs,
@@ -529,32 +775,126 @@ XlaOp XlaBuilder::Lt(const XlaOp& lhs, const XlaOp& rhs,
 }
 
 XlaOp XlaBuilder::Dot(const XlaOp& lhs, const XlaOp& rhs) {
-  return UnimplementedOp();
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_ASSIGN_OR_RETURN(const Shape& lhs_shape, GetShape(lhs));
+
+    DotDimensionNumbers dimension_numbers;
+    dimension_numbers.add_lhs_contracting_dimensions(
+        lhs_shape.dimensions_size() == 1 ? 0 : 1);
+    dimension_numbers.add_rhs_contracting_dimensions(0);
+    return DotGeneral(lhs, rhs, dimension_numbers);
+  });
 }
 
 XlaOp XlaBuilder::DotGeneral(const XlaOp& lhs, const XlaOp& rhs,
                              const DotDimensionNumbers& dimension_numbers) {
-  return UnimplementedOp();
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+    TF_ASSIGN_OR_RETURN(const Shape& lhs_shape, GetShape(lhs));
+    TF_ASSIGN_OR_RETURN(const Shape& rhs_shape, GetShape(rhs));
+    TF_ASSIGN_OR_RETURN(*instr.mutable_shape(),
+                        ShapeInference::InferDotOpShape(lhs_shape, rhs_shape,
+                                                        dimension_numbers));
+    *instr.mutable_dot_dimension_numbers() = dimension_numbers;
+    return AddInstruction(std::move(instr), HloOpcode::kDot, {lhs, rhs});
+  });
+}
+
+Status XlaBuilder::VerifyConvolution(
+    const Shape& lhs_shape, const Shape& rhs_shape,
+    const ConvolutionDimensionNumbers& dimension_numbers) const {
+  if (ShapeUtil::Rank(lhs_shape) != ShapeUtil::Rank(rhs_shape)) {
+    return InvalidArgument(
+        "Convolution arguments must have same number of "
+        "dimensions. Got: %s and %s",
+        ShapeUtil::HumanString(lhs_shape).c_str(),
+        ShapeUtil::HumanString(rhs_shape).c_str());
+  }
+  int num_dims = ShapeUtil::Rank(lhs_shape);
+  if (num_dims < 2) {
+    return InvalidArgument(
+        "Convolution expects argument arrays with >= 3 dimensions. "
+        "Got: %s and %s",
+        ShapeUtil::HumanString(lhs_shape).c_str(),
+        ShapeUtil::HumanString(rhs_shape).c_str());
+  }
+  int num_spatial_dims = num_dims - 2;
+
+  const auto check_spatial_dimensions =
+      [&](const char* const field_name,
+          const tensorflow::protobuf::RepeatedField<tensorflow::protobuf_int64>&
+              numbers) {
+        if (numbers.size() != num_spatial_dims) {
+          return InvalidArgument("Expected %d elements for %s, but got %d.",
+                                 num_spatial_dims, field_name, numbers.size());
+        }
+        for (int i = 0; i < numbers.size(); ++i) {
+          if (numbers.Get(i) < 0 || numbers.Get(i) >= num_dims) {
+            return InvalidArgument("Convolution %s[%d] is out of bounds: %lld",
+                                   field_name, i, numbers.Get(i));
+          }
+        }
+        return Status::OK();
+      };
+  TF_RETURN_IF_ERROR(
+      check_spatial_dimensions("input_spatial_dimensions",
+                               dimension_numbers.input_spatial_dimensions()));
+  TF_RETURN_IF_ERROR(
+      check_spatial_dimensions("kernel_spatial_dimensions",
+                               dimension_numbers.kernel_spatial_dimensions()));
+  return check_spatial_dimensions(
+      "output_spatial_dimensions",
+      dimension_numbers.output_spatial_dimensions());
 }
 
 XlaOp XlaBuilder::Conv(const XlaOp& lhs, const XlaOp& rhs,
                        tensorflow::gtl::ArraySlice<int64> window_strides,
                        Padding padding) {
-  return UnimplementedOp();
+  return ConvWithGeneralDimensions(
+      lhs, rhs, window_strides, padding,
+      CreateDefaultConvDimensionNumbers(window_strides.size()));
 }
 
 XlaOp XlaBuilder::ConvWithGeneralPadding(
     const XlaOp& lhs, const XlaOp& rhs,
     tensorflow::gtl::ArraySlice<int64> window_strides,
     tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding) {
-  return UnimplementedOp();
+  return ConvGeneral(lhs, rhs, window_strides, padding,
+                     CreateDefaultConvDimensionNumbers(window_strides.size()));
 }
 
 XlaOp XlaBuilder::ConvWithGeneralDimensions(
     const XlaOp& lhs, const XlaOp& rhs,
     tensorflow::gtl::ArraySlice<int64> window_strides, Padding padding,
     const ConvolutionDimensionNumbers& dimension_numbers) {
-  return UnimplementedOp();
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_ASSIGN_OR_RETURN(const Shape& lhs_shape, GetShape(lhs));
+    TF_ASSIGN_OR_RETURN(const Shape& rhs_shape, GetShape(rhs));
+
+    TF_RETURN_IF_ERROR(
+        VerifyConvolution(lhs_shape, rhs_shape, dimension_numbers));
+
+    std::vector<int64> base_area_dimensions(
+        dimension_numbers.input_spatial_dimensions_size());
+    for (std::vector<int64>::size_type i = 0; i < base_area_dimensions.size();
+         ++i) {
+      base_area_dimensions[i] =
+          lhs_shape.dimensions(dimension_numbers.input_spatial_dimensions(i));
+    }
+
+    std::vector<int64> window_dimensions(
+        dimension_numbers.kernel_spatial_dimensions_size());
+    for (std::vector<int64>::size_type i = 0; i < window_dimensions.size();
+         ++i) {
+      window_dimensions[i] =
+          rhs_shape.dimensions(dimension_numbers.kernel_spatial_dimensions(i));
+    }
+
+    return ConvGeneral(lhs, rhs, window_strides,
+                       MakePadding(base_area_dimensions, window_dimensions,
+                                   window_strides, padding),
+                       dimension_numbers);
+  });
 }
 
 XlaOp XlaBuilder::ConvGeneral(
@@ -562,7 +902,8 @@ XlaOp XlaBuilder::ConvGeneral(
     tensorflow::gtl::ArraySlice<int64> window_strides,
     tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding,
     const ConvolutionDimensionNumbers& dimension_numbers) {
-  return UnimplementedOp();
+  return ConvGeneralDilated(lhs, rhs, window_strides, padding, {}, {},
+                            dimension_numbers);
 }
 
 XlaOp XlaBuilder::ConvGeneralDilated(
@@ -572,33 +913,174 @@ XlaOp XlaBuilder::ConvGeneralDilated(
     tensorflow::gtl::ArraySlice<int64> lhs_dilation,
     tensorflow::gtl::ArraySlice<int64> rhs_dilation,
     const ConvolutionDimensionNumbers& dimension_numbers) {
-  return UnimplementedOp();
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+    TF_ASSIGN_OR_RETURN(const Shape& lhs_shape, GetShape(lhs));
+    TF_ASSIGN_OR_RETURN(const Shape& rhs_shape, GetShape(rhs));
+    TF_RETURN_IF_ERROR(
+        VerifyConvolution(lhs_shape, rhs_shape, dimension_numbers));
+
+    std::vector<int64> window_dimensions(
+        dimension_numbers.kernel_spatial_dimensions_size());
+    for (std::vector<int64>::size_type i = 0; i < window_dimensions.size();
+         ++i) {
+      window_dimensions[i] =
+          rhs_shape.dimensions(dimension_numbers.kernel_spatial_dimensions(i));
+    }
+    TF_ASSIGN_OR_RETURN(*instr.mutable_window(),
+                        MakeWindow(window_dimensions, window_strides, padding,
+                                   lhs_dilation, rhs_dilation));
+
+    TF_ASSIGN_OR_RETURN(
+        *instr.mutable_shape(),
+        ShapeInference::InferConvolveShape(lhs_shape, rhs_shape, instr.window(),
+                                           dimension_numbers));
+
+    *instr.mutable_convolution_dimension_numbers() = dimension_numbers;
+
+    return AddInstruction(std::move(instr), HloOpcode::kConvolution,
+                          {lhs, rhs});
+  });
+}
+
+StatusOr<Window> XlaBuilder::MakeWindow(
+    tensorflow::gtl::ArraySlice<int64> window_dimensions,
+    tensorflow::gtl::ArraySlice<int64> window_strides,
+    tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding,
+    tensorflow::gtl::ArraySlice<int64> lhs_dilation,
+    tensorflow::gtl::ArraySlice<int64> rhs_dilation) const {
+  const auto verify_size = [&](const size_t x, const char* x_name) {
+    if (x == 0 || x == window_dimensions.size()) {
+      return Status::OK();
+    } else {
+      return InvalidArgument(
+          "%s", tensorflow::strings::StrCat(
+                    "Window has different number of window dimensions than of ",
+                    x_name,
+                    "\nNumber of window dimensions: ", window_dimensions.size(),
+                    "\nNumber of ", x_name, ": ", x, "\n")
+                    .c_str());
+    }
+  };
+  TF_RETURN_IF_ERROR(verify_size(window_strides.size(), "window strides"));
+  TF_RETURN_IF_ERROR(verify_size(padding.size(), "padding entries"));
+  TF_RETURN_IF_ERROR(verify_size(lhs_dilation.size(), "lhs dilation factors"));
+  TF_RETURN_IF_ERROR(verify_size(rhs_dilation.size(), "rhs dilation factors"));
+
+  Window window;
+  for (size_t i = 0; i < window_dimensions.size(); i++) {
+    auto dim = window.add_dimensions();
+    dim->set_size(window_dimensions[i]);
+    if (!window_strides.empty()) {
+      dim->set_stride(window_strides[i]);
+    } else {
+      dim->set_stride(1);
+    }
+    if (!padding.empty()) {
+      dim->set_padding_low(padding[i].first);
+      dim->set_padding_high(padding[i].second);
+    } else {
+      dim->set_padding_low(0);
+      dim->set_padding_high(0);
+    }
+    if (!lhs_dilation.empty()) {
+      dim->set_base_dilation(lhs_dilation[i]);
+    } else {
+      dim->set_base_dilation(1);
+    }
+    if (!rhs_dilation.empty()) {
+      dim->set_window_dilation(rhs_dilation[i]);
+    } else {
+      dim->set_window_dilation(1);
+    }
+    dim->set_window_reversal(false);
+  }
+  return window;
 }
 
 XlaOp XlaBuilder::Fft(const XlaOp& operand, const FftType fft_type,
                       const tensorflow::gtl::ArraySlice<int64> fft_length) {
-  return UnimplementedOp();
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+    TF_ASSIGN_OR_RETURN(const Shape& operand_shape, operand.GetShape());
+    TF_ASSIGN_OR_RETURN(
+        *instr.mutable_shape(),
+        ShapeInference::InferFftShape(operand_shape, fft_type, fft_length));
+
+    instr.set_fft_type(fft_type);
+    for (int64 i : fft_length) {
+      instr.add_fft_length(i);
+    }
+
+    return AddInstruction(std::move(instr), HloOpcode::kFft, {operand});
+  });
 }
 
 XlaOp XlaBuilder::Infeed(const Shape& shape, const string& config) {
-  return UnimplementedOp();
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+    if (!LayoutUtil::HasLayout(shape)) {
+      return InvalidArgument("Given shape to Infeed must have a layout");
+    }
+    *instr.mutable_shape() = shape;
+    instr.set_infeed_config(config);
+    return AddInstruction(std::move(instr), HloOpcode::kInfeed);
+  });
 }
 
 void XlaBuilder::Outfeed(const XlaOp& operand, const Shape& shape_with_layout,
                          const string& outfeed_config) {
-  UnimplementedOp();
+  NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+
+    *instr.mutable_shape() = ShapeUtil::MakeNil();
+
+    // Check and set outfeed shape.
+    if (!LayoutUtil::HasLayout(shape_with_layout)) {
+      return InvalidArgument("Given shape to Outfeed must have a layout");
+    }
+    TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
+    if (!ShapeUtil::Compatible(operand_shape, shape_with_layout)) {
+      return InvalidArgument(
+          "Outfeed shape %s must be compatible with operand shape %s",
+          ShapeUtil::HumanStringWithLayout(shape_with_layout).c_str(),
+          ShapeUtil::HumanStringWithLayout(operand_shape).c_str());
+    }
+    *instr.mutable_outfeed_shape() = shape_with_layout;
+
+    instr.set_outfeed_config(outfeed_config);
+
+    return AddInstruction(std::move(instr), HloOpcode::kOutfeed, {operand});
+  });
 }
 
 XlaOp XlaBuilder::CustomCall(const string& call_target_name,
                              tensorflow::gtl::ArraySlice<XlaOp> operands,
                              const Shape& shape) {
-  return UnimplementedOp();
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+    if (tensorflow::str_util::StartsWith(call_target_name, "$")) {
+      return InvalidArgument(
+          "Invalid custom_call_target \"%s\": Call targets that start with '$' "
+          "are reserved for internal use.",
+          call_target_name.c_str());
+    }
+    *instr.mutable_shape() = shape;
+    instr.set_custom_call_target(call_target_name);
+    return AddInstruction(std::move(instr), HloOpcode::kCustomCall, operands);
+  });
 }
 
 XlaOp XlaBuilder::HostCompute(tensorflow::gtl::ArraySlice<XlaOp> operands,
                               const string& channel_name,
                               int64 cost_estimate_ns, const Shape& shape) {
-  return UnimplementedOp();
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+    *instr.mutable_shape() = shape;
+    instr.set_channel_name(channel_name);
+    instr.set_cost_estimate_ns(cost_estimate_ns);
+    return AddInstruction(std::move(instr), HloOpcode::kHostCompute, operands);
+  });
 }
 
 XlaOp XlaBuilder::Complex(
@@ -607,7 +1089,9 @@ XlaOp XlaBuilder::Complex(
   return BinaryOp(HloOpcode::kComplex, real, imag, broadcast_dimensions);
 }
 
-XlaOp XlaBuilder::Conj(const XlaOp& operand) { return UnimplementedOp(); }
+XlaOp XlaBuilder::Conj(const XlaOp& operand) {
+  return Complex(Real(operand), Neg(Imag(operand)));
+}
 
 XlaOp XlaBuilder::Sub(const XlaOp& lhs, const XlaOp& rhs,
                       tensorflow::gtl::ArraySlice<int64> broadcast_dimensions) {
@@ -745,12 +1229,22 @@ XlaOp XlaBuilder::Transpose(const XlaOp& operand,
       instr.add_dimensions(dim);
     }
     return AddInstruction(std::move(instr), HloOpcode::kTranspose, {operand});
-  }());
+  });
 }
 
 XlaOp XlaBuilder::Rev(const XlaOp& operand,
                       tensorflow::gtl::ArraySlice<int64> dimensions) {
-  return UnimplementedOp();
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+    TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
+    TF_ASSIGN_OR_RETURN(
+        *instr.mutable_shape(),
+        ShapeInference::InferReverseShape(operand_shape, dimensions));
+    for (int64 dim : dimensions) {
+      instr.add_dimensions(dim);
+    }
+    return AddInstruction(std::move(instr), HloOpcode::kReverse, {operand});
+  });
 }
 
 XlaOp XlaBuilder::Sort(const XlaOp& operand) {
@@ -769,12 +1263,27 @@ XlaOp XlaBuilder::Pow(const XlaOp& lhs, const XlaOp& rhs,
 
 XlaOp XlaBuilder::ConvertElementType(const XlaOp& operand,
                                      PrimitiveType new_element_type) {
-  return UnimplementedOp();
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+    TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
+    TF_ASSIGN_OR_RETURN(
+        *instr.mutable_shape(),
+        ShapeInference::InferConvertShape(operand_shape, new_element_type));
+    return AddInstruction(std::move(instr), HloOpcode::kConvert, {operand});
+  });
 }
 
 XlaOp XlaBuilder::BitcastConvertType(const XlaOp& operand,
                                      PrimitiveType new_element_type) {
-  return UnimplementedOp();
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+    TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
+    TF_ASSIGN_OR_RETURN(
+        *instr.mutable_shape(),
+        ShapeInference::InferConvertShape(operand_shape, new_element_type));
+    return AddInstruction(std::move(instr), HloOpcode::kBitcastConvert,
+                          {operand});
+  });
 }
 
 XlaOp XlaBuilder::SquareF32(const XlaOp& operand) {
@@ -800,47 +1309,181 @@ XlaOp XlaBuilder::Map(tensorflow::gtl::ArraySlice<XlaOp> operands,
                       const XlaComputation& computation,
                       tensorflow::gtl::ArraySlice<int64> dimensions,
                       tensorflow::gtl::ArraySlice<XlaOp> static_operands) {
-  return UnimplementedOp();
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    if (!static_operands.empty()) {
+      return Unimplemented("static_operands is not supported in Map");
+    }
+
+    HloInstructionProto instr;
+
+    std::vector<const Shape*> operand_shape_ptrs;
+    TF_ASSIGN_OR_RETURN(const auto& operand_shapes, GetOperandShapes(operands));
+    c_transform(operand_shapes, std::back_inserter(operand_shape_ptrs),
+                [](const Shape& shape) { return &shape; });
+    TF_ASSIGN_OR_RETURN(const ProgramShape& called_program_shape,
+                        computation.GetProgramShape());
+    TF_ASSIGN_OR_RETURN(
+        *instr.mutable_shape(),
+        ShapeInference::InferMapShape(operand_shape_ptrs, called_program_shape,
+                                      dimensions));
+
+    AddCalledComputation(computation, &instr);
+
+    return AddInstruction(std::move(instr), HloOpcode::kMap, operands);
+  });
+}
+
+XlaOp XlaBuilder::RngOp(RandomDistribution distribution,
+                        tensorflow::gtl::ArraySlice<XlaOp> parameters,
+                        const Shape& shape) {
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+
+    // Check the number of parameters per RNG distribution.
+    switch (distribution) {
+      case RandomDistribution::RNG_NORMAL:
+      case RandomDistribution::RNG_UNIFORM:
+        if (parameters.size() != 2) {
+          return InvalidArgument(
+              "RNG distribution (%s) expects 2 parameters, but got %ld",
+              RandomDistribution_Name(distribution).c_str(), parameters.size());
+        }
+        break;
+      default:
+        LOG(FATAL) << "unhandled distribution " << distribution;
+    }
+
+    TF_RETURN_IF_ERROR(ShapeUtil::ValidateShapeWithOptionalLayout(shape));
+    *instr.mutable_shape() = shape;
+
+    instr.set_distribution(distribution);
+
+    return AddInstruction(std::move(instr), HloOpcode::kRng, parameters);
+  });
 }
 
 XlaOp XlaBuilder::RngNormal(const XlaOp& mu, const XlaOp& sigma,
                             const Shape& shape) {
-  return UnimplementedOp();
+  return RngOp(RandomDistribution::RNG_NORMAL, {mu, sigma}, shape);
 }
 
 XlaOp XlaBuilder::RngUniform(const XlaOp& a, const XlaOp& b,
                              const Shape& shape) {
-  return UnimplementedOp();
+  return RngOp(RandomDistribution::RNG_UNIFORM, {a, b}, shape);
 }
 
 XlaOp XlaBuilder::While(const XlaComputation& condition,
                         const XlaComputation& body, const XlaOp& init) {
-  return UnimplementedOp();
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+
+    // Infer shape.
+    TF_ASSIGN_OR_RETURN(const auto& body_program_shape, body.GetProgramShape());
+    TF_ASSIGN_OR_RETURN(const auto& condition_program_shape,
+                        condition.GetProgramShape());
+    TF_ASSIGN_OR_RETURN(const Shape& init_shape, GetShape(init));
+    TF_ASSIGN_OR_RETURN(
+        *instr.mutable_shape(),
+        ShapeInference::InferWhileShape(condition_program_shape,
+                                        body_program_shape, init_shape));
+    // Body comes before condition computation in the vector.
+    AddCalledComputation(body, &instr);
+    AddCalledComputation(condition, &instr);
+    return AddInstruction(std::move(instr), HloOpcode::kWhile, {init});
+  });
 }
 
 XlaOp XlaBuilder::Gather(const XlaOp& input, const XlaOp& gather_indices,
                          const GatherDimensionNumbers& dimension_numbers,
                          tensorflow::gtl::ArraySlice<int64> window_bounds) {
-  return UnimplementedOp();
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+
+    TF_ASSIGN_OR_RETURN(const Shape& input_shape, GetShape(input));
+    TF_ASSIGN_OR_RETURN(const Shape& gather_indices_shape,
+                        GetShape(gather_indices));
+    TF_ASSIGN_OR_RETURN(
+        *instr.mutable_shape(),
+        ShapeInference::InferGatherShape(input_shape, gather_indices_shape,
+                                         dimension_numbers, window_bounds));
+
+    *instr.mutable_gather_dimension_numbers() = dimension_numbers;
+    for (int64 bound : window_bounds) {
+      instr.add_gather_window_bounds(bound);
+    }
+
+    return AddInstruction(std::move(instr), HloOpcode::kGather,
+                          {input, gather_indices});
+  });
 }
 
 XlaOp XlaBuilder::Conditional(const XlaOp& predicate, const XlaOp& true_operand,
                               const XlaComputation& true_computation,
                               const XlaOp& false_operand,
                               const XlaComputation& false_computation) {
-  return UnimplementedOp();
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+
+    TF_ASSIGN_OR_RETURN(const Shape& predicate_shape, GetShape(predicate));
+    TF_ASSIGN_OR_RETURN(const Shape& true_operand_shape,
+                        GetShape(true_operand));
+    TF_ASSIGN_OR_RETURN(const ProgramShape& true_computation_shape,
+                        true_computation.GetProgramShape());
+    TF_ASSIGN_OR_RETURN(const Shape& false_operand_shape,
+                        GetShape(false_operand));
+    TF_ASSIGN_OR_RETURN(const ProgramShape& false_computation_shape,
+                        false_computation.GetProgramShape());
+    TF_ASSIGN_OR_RETURN(
+        *instr.mutable_shape(),
+        ShapeInference::InferConditionalShape(
+            predicate_shape, true_operand_shape, false_operand_shape,
+            true_computation_shape, false_computation_shape));
+
+    // The index of true_computation must be 0 and that of false computation
+    // must be 1.
+    AddCalledComputation(true_computation, &instr);
+    AddCalledComputation(false_computation, &instr);
+
+    return AddInstruction(std::move(instr), HloOpcode::kConditional,
+                          {predicate, true_operand, false_operand});
+  });
 }
 
 XlaOp XlaBuilder::Reduce(
     const XlaOp& operand, const XlaOp& init_value,
     const XlaComputation& computation,
     tensorflow::gtl::ArraySlice<int64> dimensions_to_reduce) {
-  return UnimplementedOp();
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+
+    TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
+    TF_ASSIGN_OR_RETURN(const Shape& init_shape, GetShape(init_value));
+    TF_ASSIGN_OR_RETURN(const ProgramShape& called_program_shape,
+                        computation.GetProgramShape());
+    TF_ASSIGN_OR_RETURN(*instr.mutable_shape(),
+                        ShapeInference::InferReduceShape(
+                            operand_shape, init_shape, dimensions_to_reduce,
+                            called_program_shape));
+
+    for (int64 dim : dimensions_to_reduce) {
+      instr.add_dimensions(dim);
+    }
+
+    AddCalledComputation(computation, &instr);
+
+    return AddInstruction(std::move(instr), HloOpcode::kReduce,
+                          {operand, init_value});
+  });
 }
 
 XlaOp XlaBuilder::ReduceAll(const XlaOp& operand, const XlaOp& init_value,
                             const XlaComputation& computation) {
-  return UnimplementedOp();
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
+    std::vector<int64> all_dimnos(ShapeUtil::Rank(operand_shape));
+    std::iota(all_dimnos.begin(), all_dimnos.end(), 0);
+    return Reduce(operand, init_value, computation, all_dimnos);
+  });
 }
 
 XlaOp XlaBuilder::ReduceWindow(
@@ -848,7 +1491,21 @@ XlaOp XlaBuilder::ReduceWindow(
     const XlaComputation& computation,
     tensorflow::gtl::ArraySlice<int64> window_dimensions,
     tensorflow::gtl::ArraySlice<int64> window_strides, Padding padding) {
-  return UnimplementedOp();
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+
+    TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
+    TF_RETURN_IF_ERROR(
+        ValidatePaddingValues(AsInt64Slice(operand_shape.dimensions()),
+                              window_dimensions, window_strides));
+
+    std::vector<std::pair<int64, int64>> padding_values =
+        MakePadding(AsInt64Slice(operand_shape.dimensions()), window_dimensions,
+                    window_strides, padding);
+    return ReduceWindowWithGeneralPadding(operand, init_value, computation,
+                                          window_dimensions, window_strides,
+                                          padding_values);
+  });
 }
 
 XlaOp XlaBuilder::ReduceWindowWithGeneralPadding(
@@ -857,31 +1514,111 @@ XlaOp XlaBuilder::ReduceWindowWithGeneralPadding(
     tensorflow::gtl::ArraySlice<int64> window_dimensions,
     tensorflow::gtl::ArraySlice<int64> window_strides,
     tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding) {
-  return UnimplementedOp();
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+
+    TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
+    TF_ASSIGN_OR_RETURN(const Shape& init_shape, GetShape(init_value));
+    TF_ASSIGN_OR_RETURN(const ProgramShape& to_apply_shape,
+                        computation.GetProgramShape());
+    TF_ASSIGN_OR_RETURN(*instr.mutable_window(),
+                        MakeWindow(window_dimensions, window_strides, padding,
+                                   /*lhs_dilation=*/{}, /*rhs_dilation=*/{}));
+    TF_ASSIGN_OR_RETURN(
+        *instr.mutable_shape(),
+        ShapeInference::InferReduceWindowShape(operand_shape, init_shape,
+                                               instr.window(), to_apply_shape));
+
+    AddCalledComputation(computation, &instr);
+    return AddInstruction(std::move(instr), HloOpcode::kReduceWindow,
+                          {operand, init_value});
+  });
 }
 
 XlaOp XlaBuilder::BatchNormTraining(const XlaOp& operand, const XlaOp& scale,
                                     const XlaOp& offset, float epsilon,
                                     int64 feature_index) {
-  return UnimplementedOp();
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+
+    TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
+    TF_ASSIGN_OR_RETURN(const Shape& scale_shape, GetShape(scale));
+    TF_ASSIGN_OR_RETURN(const Shape& offset_shape, GetShape(offset));
+    TF_ASSIGN_OR_RETURN(
+        *instr.mutable_shape(),
+        ShapeInference::InferBatchNormTrainingShape(
+            operand_shape, scale_shape, offset_shape, feature_index));
+
+    instr.set_epsilon(epsilon);
+    instr.set_feature_index(feature_index);
+
+    return AddInstruction(std::move(instr), HloOpcode::kBatchNormTraining,
+                          {operand, scale, offset});
+  });
 }
 
 XlaOp XlaBuilder::BatchNormInference(const XlaOp& operand, const XlaOp& scale,
                                      const XlaOp& offset, const XlaOp& mean,
                                      const XlaOp& variance, float epsilon,
                                      int64 feature_index) {
-  return UnimplementedOp();
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+
+    TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
+    TF_ASSIGN_OR_RETURN(const Shape& scale_shape, GetShape(scale));
+    TF_ASSIGN_OR_RETURN(const Shape& offset_shape, GetShape(offset));
+    TF_ASSIGN_OR_RETURN(const Shape& mean_shape, GetShape(mean));
+    TF_ASSIGN_OR_RETURN(const Shape& variance_shape, GetShape(variance));
+    TF_ASSIGN_OR_RETURN(*instr.mutable_shape(),
+                        ShapeInference::InferBatchNormInferenceShape(
+                            operand_shape, scale_shape, offset_shape,
+                            mean_shape, variance_shape, feature_index));
+
+    instr.set_epsilon(epsilon);
+    instr.set_feature_index(feature_index);
+
+    return AddInstruction(std::move(instr), HloOpcode::kBatchNormInference,
+                          {operand, scale, offset, mean, variance});
+  });
 }
 
 XlaOp XlaBuilder::BatchNormGrad(const XlaOp& operand, const XlaOp& scale,
                                 const XlaOp& batch_mean, const XlaOp& batch_var,
                                 const XlaOp& grad_output, float epsilon,
                                 int64 feature_index) {
-  return UnimplementedOp();
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+
+    TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
+    TF_ASSIGN_OR_RETURN(const Shape& scale_shape, GetShape(scale));
+    TF_ASSIGN_OR_RETURN(const Shape& batch_mean_shape, GetShape(batch_mean));
+    TF_ASSIGN_OR_RETURN(const Shape& batch_var_shape, GetShape(batch_var));
+    TF_ASSIGN_OR_RETURN(const Shape& grad_output_shape, GetShape(grad_output));
+    TF_ASSIGN_OR_RETURN(*instr.mutable_shape(),
+                        ShapeInference::InferBatchNormGradShape(
+                            operand_shape, scale_shape, batch_mean_shape,
+                            batch_var_shape, grad_output_shape, feature_index));
+
+    instr.set_epsilon(epsilon);
+    instr.set_feature_index(feature_index);
+
+    return AddInstruction(std::move(instr), HloOpcode::kBatchNormGrad,
+                          {operand, scale, batch_mean, batch_var, grad_output});
+  });
 }
 
 XlaOp XlaBuilder::CrossReplicaSum(const XlaOp& operand) {
-  return UnimplementedOp();
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+
+    TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
+    TF_ASSIGN_OR_RETURN(
+        *instr.mutable_shape(),
+        ShapeInference::InferCrossReplicaSumShape({&operand_shape}));
+
+    return AddInstruction(std::move(instr), HloOpcode::kCrossReplicaSum,
+                          {operand});
+  });
 }
 
 XlaOp XlaBuilder::SelectAndScatter(
@@ -890,7 +1627,14 @@ XlaOp XlaBuilder::SelectAndScatter(
     tensorflow::gtl::ArraySlice<int64> window_strides, Padding padding,
     const XlaOp& source, const XlaOp& init_value,
     const XlaComputation& scatter) {
-  return UnimplementedOp();
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
+    return SelectAndScatterWithGeneralPadding(
+        operand, select, window_dimensions, window_strides,
+        MakePadding(AsInt64Slice(operand_shape.dimensions()), window_dimensions,
+                    window_strides, padding),
+        source, init_value, scatter);
+  });
 }
 
 XlaOp XlaBuilder::SelectAndScatterWithGeneralPadding(
@@ -900,39 +1644,280 @@ XlaOp XlaBuilder::SelectAndScatterWithGeneralPadding(
     tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding,
     const XlaOp& source, const XlaOp& init_value,
     const XlaComputation& scatter) {
-  return UnimplementedOp();
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+
+    TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
+    TF_ASSIGN_OR_RETURN(const Shape& source_shape, GetShape(source));
+    TF_ASSIGN_OR_RETURN(const Shape& init_shape, GetShape(init_value));
+    TF_ASSIGN_OR_RETURN(const ProgramShape& select_shape,
+                        select.GetProgramShape());
+    TF_ASSIGN_OR_RETURN(const ProgramShape& scatter_shape,
+                        scatter.GetProgramShape());
+    TF_ASSIGN_OR_RETURN(*instr.mutable_window(),
+                        MakeWindow(window_dimensions, window_strides, padding,
+                                   /*lhs_dilation=*/{}, /*rhs_dilation=*/{}));
+    TF_ASSIGN_OR_RETURN(*instr.mutable_shape(),
+                        ShapeInference::InferSelectAndScatterShape(
+                            operand_shape, select_shape, instr.window(),
+                            source_shape, init_shape, scatter_shape));
+
+    AddCalledComputation(select, &instr);
+    AddCalledComputation(scatter, &instr);
+
+    return AddInstruction(std::move(instr), HloOpcode::kSelectAndScatter,
+                          {operand, source, init_value});
+  });
 }
 
 XlaOp XlaBuilder::ReducePrecision(const XlaOp& operand, const int exponent_bits,
                                   const int mantissa_bits) {
-  return UnimplementedOp();
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+    TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
+    TF_ASSIGN_OR_RETURN(*instr.mutable_shape(),
+                        ShapeInference::InferReducePrecisionShape(
+                            operand_shape, exponent_bits, mantissa_bits));
+    instr.set_exponent_bits(exponent_bits);
+    instr.set_mantissa_bits(mantissa_bits);
+    return AddInstruction(std::move(instr), HloOpcode::kReducePrecision,
+                          {operand});
+  });
 }
 
 void XlaBuilder::Send(const XlaOp& operand, const ChannelHandle& handle) {
-  UnimplementedOp();
+  NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+
+    // Send instruction produces a tuple of {aliased operand, U32 context}.
+    TF_ASSIGN_OR_RETURN(const Shape& shape, GetShape(operand));
+    *instr.mutable_shape() =
+        ShapeUtil::MakeTupleShape({shape, ShapeUtil::MakeShape(U32, {})});
+    instr.set_channel_id(handle.handle());
+    TF_ASSIGN_OR_RETURN(
+        XlaOp send,
+        AddInstruction(std::move(instr), HloOpcode::kSend, {operand}));
+
+    HloInstructionProto send_done_instr;
+    *send_done_instr.mutable_shape() = ShapeUtil::MakeNil();
+    send_done_instr.set_channel_id(handle.handle());
+    return AddInstruction(std::move(send_done_instr), HloOpcode::kSendDone,
+                          {send});
+  });
 }
 
 XlaOp XlaBuilder::Recv(const Shape& shape, const ChannelHandle& handle) {
-  return UnimplementedOp();
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    HloInstructionProto instr;
+
+    // Recv instruction produces a tuple of {receive buffer, U32 context}.
+    *instr.mutable_shape() =
+        ShapeUtil::MakeTupleShape({shape, ShapeUtil::MakeShape(U32, {})});
+    instr.set_channel_id(handle.handle());
+    TF_ASSIGN_OR_RETURN(XlaOp recv,
+                        AddInstruction(std::move(instr), HloOpcode::kRecv, {}));
+
+    HloInstructionProto recv_done_instr;
+    *recv_done_instr.mutable_shape() = shape;
+    recv_done_instr.set_channel_id(handle.handle());
+    return AddInstruction(std::move(recv_done_instr), HloOpcode::kRecvDone,
+                          {recv});
+  });
+}
+
+StatusOr<bool> XlaBuilder::IsConstant(const XlaOp& operand) const {
+  TF_RETURN_IF_ERROR(first_error_);
+
+  // Verify that the handle is valid.
+  TF_RETURN_IF_ERROR(LookUpInstruction(operand).status());
+
+  bool is_constant = true;
+  std::set<int64> visited;
+  IsConstantVisitor(operand.handle(), &visited, &is_constant);
+  return is_constant;
+}
+
+StatusOr<XlaComputation> XlaBuilder::BuildConstantSubGraph(
+    const XlaOp& root_op) const {
+  TF_ASSIGN_OR_RETURN(bool is_constant, IsConstant(root_op));
+  if (!is_constant) {
+    auto op_status = LookUpInstruction(root_op);
+    string op_string =
+        op_status.ok() ? op_status.ValueOrDie()->name() : "<unknown operation>";
+    return InvalidArgument(
+        "Operand to BuildConstantSubGraph depends on a parameter.\n\n"
+        "  op requested for constant subgraph: %s\n\n"
+        "This is an internal error that typically happens when the XLA user "
+        "(e.g. TensorFlow) is attempting to determine a value that must be a "
+        "compile-time constant (e.g. an array dimension) but it is not capable "
+        "of being evaluated at XLA compile time.\n\n"
+        "Please file a usability bug with the framework being used (e.g. "
+        "TensorFlow).",
+        op_string.c_str());
+  }
+
+  TF_ASSIGN_OR_RETURN(const HloInstructionProto* root,
+                      LookUpInstruction(root_op));
+  TF_ASSIGN_OR_RETURN(HloOpcode opcode, StringToHloOpcode(root->opcode()));
+  if (!CanBeRoot(opcode)) {
+    return InvalidArgument("the operand with opcode %s cannot be root",
+                           root->opcode().c_str());
+  }
+
+  HloComputationProto entry;
+  entry.set_id(GetUniqueId());  // Give the computation a global unique id.
+  entry.set_name(StrCat(name_, entry.id(), "_compute_constant"));
+  entry.set_root_id(root->id());
+  ProgramShape* program_shape = entry.mutable_program_shape();
+  *program_shape->mutable_result() = root->shape();
+
+  // We use std::set to keep the instruction ids in ascending order (which is
+  // also a valid denpendency order). The related ops will be added to the
+  // subgraph in the same order.
+  std::set<int64> related_ops;
+  tensorflow::gtl::FlatSet<int64> related_calls;  // Related computations.
+  std::queue<int64> worklist;
+  worklist.push(root->id());
+  related_ops.insert(root->id());
+  while (!worklist.empty()) {
+    int64 node = worklist.front();
+    worklist.pop();
+    for (int64 id : instructions_[node].operand_ids()) {
+      if (related_ops.insert(id).second) {
+        worklist.push(id);
+      }
+    }
+    for (int64 called_id : instructions_[node].called_computation_ids()) {
+      related_calls.insert(called_id);
+    }
+  }
+
+  // Add related ops to the computation.
+  for (int64 id : related_ops) {
+    auto* instr = entry.add_instructions();
+    *instr = instructions_[id];
+    // Ensures that the instruction names are unique among the graph.
+    const string& new_name =
+        StrCat(instr->name(), ".", entry.id(), ".", instr->id());
+    instr->set_name(new_name);
+  }
+
+  XlaComputation computation(entry.id());
+  HloModuleProto* module = computation.mutable_proto();
+  module->set_name(entry.name());
+  module->set_id(entry.id());
+  module->set_entry_computation_name(entry.name());
+  module->set_entry_computation_id(entry.id());
+  *module->mutable_program_shape() = *program_shape;
+  for (auto& e : embedded_) {
+    if (related_calls.find(e.second.id()) != related_calls.end()) {
+      *module->add_computations() = e.second;
+    }
+  }
+  *module->add_computations() = std::move(entry);
+
+  return std::move(computation);
+}
+
+std::unique_ptr<XlaBuilder> XlaBuilder::CreateSubBuilder(
+    const string& computation_name) {
+  auto sub_builder = MakeUnique<XlaBuilder>(computation_name);
+  sub_builder->parent_builder_ = this;
+  sub_builder->die_immediately_on_error_ = this->die_immediately_on_error_;
+  return sub_builder;
+}
+
+/* static */ ConvolutionDimensionNumbers
+XlaBuilder::CreateDefaultConvDimensionNumbers(int num_spatial_dims) {
+  ConvolutionDimensionNumbers dimension_numbers;
+  dimension_numbers.set_input_batch_dimension(kConvBatchDimension);
+  dimension_numbers.set_input_feature_dimension(kConvFeatureDimension);
+  dimension_numbers.set_output_batch_dimension(kConvBatchDimension);
+  dimension_numbers.set_output_feature_dimension(kConvFeatureDimension);
+  dimension_numbers.set_kernel_output_feature_dimension(
+      kConvKernelOutputDimension);
+  dimension_numbers.set_kernel_input_feature_dimension(
+      kConvKernelInputDimension);
+  for (int i = 0; i < num_spatial_dims; ++i) {
+    dimension_numbers.add_input_spatial_dimensions(i + 2);
+    dimension_numbers.add_kernel_spatial_dimensions(i + 2);
+    dimension_numbers.add_output_spatial_dimensions(i + 2);
+  }
+  return dimension_numbers;
+}
+
+/* static */ Status XlaBuilder::Validate(
+    const ConvolutionDimensionNumbers& dnum) {
+  if (dnum.input_spatial_dimensions_size() < 2) {
+    return FailedPrecondition("input spacial dimension < 2: %d",
+                              dnum.input_spatial_dimensions_size());
+  }
+  if (dnum.kernel_spatial_dimensions_size() < 2) {
+    return FailedPrecondition("kernel spacial dimension < 2: %d",
+                              dnum.kernel_spatial_dimensions_size());
+  }
+  if (dnum.output_spatial_dimensions_size() < 2) {
+    return FailedPrecondition("output spacial dimension < 2: %d",
+                              dnum.output_spatial_dimensions_size());
+  }
+
+  if (std::set<int64>(
+          {dnum.input_batch_dimension(), dnum.input_feature_dimension(),
+           dnum.input_spatial_dimensions(0), dnum.input_spatial_dimensions(1)})
+          .size() != 4) {
+    return FailedPrecondition(
+        "dimension numbers for the input are not unique: (%lld, %lld, %lld, "
+        "%lld)",
+        dnum.input_batch_dimension(), dnum.input_feature_dimension(),
+        dnum.input_spatial_dimensions(0), dnum.input_spatial_dimensions(1));
+  }
+  if (std::set<int64>({dnum.kernel_output_feature_dimension(),
+                       dnum.kernel_input_feature_dimension(),
+                       dnum.kernel_spatial_dimensions(0),
+                       dnum.kernel_spatial_dimensions(1)})
+          .size() != 4) {
+    return FailedPrecondition(
+        "dimension numbers for the weight are not unique: (%lld, %lld, %lld, "
+        "%lld)",
+        dnum.kernel_output_feature_dimension(),
+        dnum.kernel_input_feature_dimension(),
+        dnum.kernel_spatial_dimensions(0), dnum.kernel_spatial_dimensions(1));
+  }
+  if (std::set<int64>({dnum.output_batch_dimension(),
+                       dnum.output_feature_dimension(),
+                       dnum.output_spatial_dimensions(0),
+                       dnum.output_spatial_dimensions(1)})
+          .size() != 4) {
+    return FailedPrecondition(
+        "dimension numbers for the output are not unique: (%lld, %lld, %lld, "
+        "%lld)",
+        dnum.output_batch_dimension(), dnum.output_feature_dimension(),
+        dnum.output_spatial_dimensions(0), dnum.output_spatial_dimensions(1));
+  }
+  return Status::OK();
 }
 
 StatusOr<XlaOp> XlaBuilder::AddInstruction(
     HloInstructionProto&& instr, HloOpcode opcode,
     tensorflow::gtl::ArraySlice<XlaOp> operands) {
+  TF_RETURN_IF_ERROR(first_error_);
+
   const int64 handle = instructions_.size();
   instr.set_id(handle);
   instr.set_opcode(HloOpcodeString(opcode));
   if (instr.name().empty()) {
-    instr.set_name(StrCat(instr.opcode(), ".", handle));
-  } else {
-    // Append the handle to make sure the name is unique.
-    instr.set_name(StrCat(instr.name(), ".", handle));
+    instr.set_name(StrCat(instr.opcode()));
   }
   for (const auto& operand : operands) {
-    TF_RET_CHECK(operand.builder_ != nullptr);
-    TF_RET_CHECK(operand.builder_ == this)
-        << "Do not add XlaOp from builder " << operand.builder_->name()
-        << " to builder " << this->name();
+    if (operand.builder_ == nullptr) {
+      return InvalidArgument("invalid XlaOp with handle %lld",
+                             operand.handle());
+    }
+    if (operand.builder_ != this) {
+      return InvalidArgument("Do not add XlaOp from builder %s to builder %s",
+                             operand.builder_->name().c_str(),
+                             this->name().c_str());
+    }
     instr.add_operand_ids(operand.handle());
   }
 
@@ -947,8 +1932,22 @@ StatusOr<XlaOp> XlaBuilder::AddInstruction(
   return op;
 }
 
+void XlaBuilder::AddCalledComputation(const XlaComputation& computation,
+                                      HloInstructionProto* instr) {
+  instr->add_called_computation_ids(computation.proto().entry_computation_id());
+  for (const HloComputationProto& e : computation.proto().computations()) {
+    embedded_.insert({e.id(), e});
+  }
+}
+
 StatusOr<const HloInstructionProto*> XlaBuilder::LookUpInstruction(
     const XlaOp& op) const {
+  TF_RETURN_IF_ERROR(first_error_);
+
+  if (op.builder_ != this) {
+    return InvalidArgument("invalid XlaOp with handle %lld", op.handle());
+  }
+
   TF_RET_CHECK(op.builder_ == this);
   if (op.handle() >= instructions_.size() || op.handle() < 0) {
     return InvalidArgument("no XlaOp value %lld", op.handle());
@@ -957,7 +1956,7 @@ StatusOr<const HloInstructionProto*> XlaBuilder::LookUpInstruction(
 }
 
 XlaOp XlaBuilder::UnimplementedOp() {
-  NoteError(Unimplemented("Op not yet implemented"));
+  NoteError(Unimplemented("Op not implemented"));
   return {};
 }
 
diff --git a/tensorflow/compiler/xla/client/xla_client/xla_builder.h b/tensorflow/compiler/xla/client/xla_client/xla_builder.h
index c5c35159e06e1cc2d9f75a5b41f025773c3d685d..24e0be2ac165fb74503a4991987ebe0c4064c73f 100644
--- a/tensorflow/compiler/xla/client/xla_client/xla_builder.h
+++ b/tensorflow/compiler/xla/client/xla_client/xla_builder.h
@@ -335,6 +335,26 @@ class XlaBuilder {
   XlaOp DotGeneral(const XlaOp& lhs, const XlaOp& rhs,
                    const DotDimensionNumbers& dimension_numbers);
 
+  // Default dimension numbers used for a 2D convolution.
+  static constexpr int64 kConvBatchDimension = 0;
+  static constexpr int64 kConvFeatureDimension = 1;
+  static constexpr int64 kConvFirstSpatialDimension = 2;
+  static constexpr int64 kConvSecondSpatialDimension = 3;
+  static constexpr int64 kConvKernelOutputDimension = 0;
+  static constexpr int64 kConvKernelInputDimension = 1;
+  static constexpr int64 kConvKernelFirstSpatialDimension = 2;
+  static constexpr int64 kConvKernelSecondSpatialDimension = 3;
+
+  // Creates a default ConvolutionDimensionNumbers. For a 2D convolution, for
+  // the input operand {batch, feature, height, width} = {0, 1, 2, 3} and for
+  // the kernel operand
+  // {output_feature, input_feature, height, width} = {0, 1, 2, 3}.
+  static ConvolutionDimensionNumbers CreateDefaultConvDimensionNumbers(
+      int num_spatial_dims = 2);
+
+  // Returns an error if the convolution dimension numbers have conflicts.
+  static Status Validate(const ConvolutionDimensionNumbers& dnum);
+
   // Enqueues a convolution instruction onto the computation, which uses the
   // default convolution dimension numbers.
   XlaOp Conv(const XlaOp& lhs, const XlaOp& rhs,
@@ -667,11 +687,12 @@ class XlaBuilder {
   XlaOp Recv(const Shape& shape, const ChannelHandle& handle);
 
   // Returns true if 'operand' is a compile-time constant. A compile-time
-  // constant does not depend on parameters with index greater than or equal to
-  // `num_parameters`, or on stateful operators such as `RngNormal` or `Infeed`.
-  // Unlike `ComputeConstant`, `IsConstant` tests whether a computation is a
-  // compile-time constant without evaluating the computation.
-  StatusOr<bool> IsConstant(const XlaOp& operand, int64 num_parameters = 0);
+  // constant does not depend on any parameters, or on stateful operators such
+  // as `RngNormal` or `Infeed`.
+  //
+  // This tests whether a computation is a compile-time constant without
+  // evaluating the computation.
+  StatusOr<bool> IsConstant(const XlaOp& operand) const;
 
   // Normalizes operand across spatial and batch dimensions for each feature.
   //
@@ -711,10 +732,32 @@ class XlaBuilder {
                       const XlaOp& grad_output, float epsilon,
                       int64 feature_index);
 
+  // Returns a new XlaBuilder whose resultant Computation is used only by this
+  // XlaBuilder. The sub-XlaBuilder has the same die_immediately_on_error
+  // behavior as the parent.
+  std::unique_ptr<XlaBuilder> CreateSubBuilder(const string& computation_name);
+
   // Builds the computation with the requested operations, or returns a non-ok
-  // status.
+  // status. Note that all ops that have been enqueued will be moved to the
+  // computation being returned.
   StatusOr<XlaComputation> Build();
 
+  // Builds the computation with the requested operations, or notes an error in
+  // the parent XlaBuilder and returns an empty computation if building failed.
+  // This function is intended to be used where the returned XlaComputation is
+  // only used by the parent XlaBuilder and hence further operation on the
+  // returned XlaComputation will simply be error'ed out if an error occurred
+  // while building this computation. If the built computation is to be used by
+  // a XlaBuilder other than the parent XlaBuilder then Build() should be used
+  // instead.
+  XlaComputation BuildAndNoteError();
+
+  // Returns a subgraph that roots on the given root. If the root is not a
+  // compile-time constant (see `IsConstant`), returns an error.
+  //
+  // This will copy the needed ops/computations to the subgraph.
+  StatusOr<XlaComputation> BuildConstantSubGraph(const XlaOp& root_op) const;
+
   // Returns the first error that was encountered while building the
   // computation. When an error is encountered, by default we return a vacuous
   // XlaOp and inform the user of the error that occurred while
@@ -727,26 +770,23 @@ class XlaBuilder {
   StatusOr<Shape> GetShape(const XlaOp& op) const;
 
   // Returns the (inferred) result for the current computation's shape.
-  StatusOr<ProgramShape> GetProgramShape();
+  StatusOr<ProgramShape> GetProgramShape() const;
 
  private:
   StatusOr<XlaOp> AddInstruction(
       HloInstructionProto&& instr, HloOpcode opcode,
       tensorflow::gtl::ArraySlice<XlaOp> operands = {});
 
+  void AddCalledComputation(const XlaComputation& computation,
+                            HloInstructionProto* instr);
+
   // Notes that the error occurred by:
   // * storing it internally and capturing a backtrace if it's the first error
   //   (this deferred value will be produced on the call to Build())
   // * dying if die_immediately_on_error_ is true
   void NoteError(const Status& error);
 
-  XlaOp NoteErrorOrReturn(StatusOr<XlaOp>&& op) {
-    if (!op.ok()) {
-      NoteError(op.status());
-      return XlaOp();
-    }
-    return op.ConsumeValueOrDie();
-  }
+  XlaOp NoteErrorOrReturn(const std::function<StatusOr<XlaOp>()>& op_creator);
 
   // Helper method that creates an empty op and notes error.
   XlaOp UnimplementedOp();
@@ -766,6 +806,10 @@ class XlaBuilder {
   XlaOp TernaryOp(HloOpcode triop, const XlaOp& lhs, const XlaOp& rhs,
                   const XlaOp& ehs);
 
+  XlaOp RngOp(RandomDistribution distribution,
+              tensorflow::gtl::ArraySlice<XlaOp> parameters,
+              const Shape& shape);
+
   StatusOr<XlaOp> InDimBroadcast(
       const Shape& shape, const XlaOp& operand,
       tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
@@ -781,7 +825,29 @@ class XlaBuilder {
 
   // Returns the (inferred) result for the program shape for the current
   // computation and fills the root_id in the pointer.
-  StatusOr<ProgramShape> GetProgramShape(int64* root_id);
+  StatusOr<ProgramShape> GetProgramShape(int64* root_id) const;
+
+  // A visitor which checks whether an operation is a compile-time constant,
+  // meaning that it doesn't depend on any parameters, or on any stateful
+  // operation such as `RngNormal` or `Infeed`. The visitor walks the
+  // computation starting at a given operation and sets is_constant to false iff
+  // a parameter or stateful operation is encountered.
+  void IsConstantVisitor(const int64 op_handle, std::set<int64>* visited,
+                         bool* is_constant) const;
+
+  // Checks bounds for convolution parameters.
+  Status VerifyConvolution(
+      const Shape& lhs_shape, const Shape& rhs_shape,
+      const ConvolutionDimensionNumbers& dimension_numbers) const;
+
+  // Helper function for creating a Window proto from user-supplied data.
+  // Returns error if the user-supplied data was invalid.
+  StatusOr<Window> MakeWindow(
+      tensorflow::gtl::ArraySlice<int64> window_dimensions,
+      tensorflow::gtl::ArraySlice<int64> window_strides,
+      tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding,
+      tensorflow::gtl::ArraySlice<int64> lhs_dilation,
+      tensorflow::gtl::ArraySlice<int64> rhs_dilation) const;
 
   string name_;  // Name to use for the built computation.
 
@@ -814,6 +880,8 @@ class XlaBuilder {
 
   // Mode bit that indicates whether to die when a first error is encountered.
   bool die_immediately_on_error_ = false;
+
+  XlaBuilder* parent_builder_{nullptr};
 };
 
 template <typename NativeT>
diff --git a/tensorflow/compiler/xla/client/xla_client/xla_builder_test.cc b/tensorflow/compiler/xla/client/xla_client/xla_builder_test.cc
index 85d4227ba4d8d04b1d2ba8b1d24922b13bd9cae5..ce984564d016ce65fa6c932f3cda290cc0d75a4a 100644
--- a/tensorflow/compiler/xla/client/xla_client/xla_builder_test.cc
+++ b/tensorflow/compiler/xla/client/xla_client/xla_builder_test.cc
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include <string>
 
+#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
 #include "tensorflow/compiler/xla/service/hlo_matchers.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/shape_util.h"
@@ -39,7 +40,8 @@ class XlaBuilderTest : public ::testing::Test {
     TF_ASSIGN_OR_RETURN(XlaComputation computation, b->Build());
     const HloModuleProto& proto = computation.proto();
     TF_ASSIGN_OR_RETURN(const auto& config,
-                        HloModule::CreateModuleConfigFromProto(proto));
+                        HloModule::CreateModuleConfigFromProto(
+                            proto, legacy_flags::GetDebugOptionsFromFlags()));
     return HloModule::CreateFromProto(proto, config);
   }
 
diff --git a/tensorflow/compiler/xla/client/xla_client/xla_computation.cc b/tensorflow/compiler/xla/client/xla_client/xla_computation.cc
index 3681792eeea081f87ee055e79ba841b4917a428d..a6752c601026518825c7994f6b6fa20d20f34f24 100644
--- a/tensorflow/compiler/xla/client/xla_client/xla_computation.cc
+++ b/tensorflow/compiler/xla/client/xla_client/xla_computation.cc
@@ -17,9 +17,12 @@ limitations under the License.
 
 #include <utility>
 
+#include "tensorflow/compiler/xla/status_macros.h"
+
 namespace xla {
 
-const ProgramShape& XlaComputation::GetProgramShape() const {
+StatusOr<ProgramShape> XlaComputation::GetProgramShape() const {
+  TF_RET_CHECK(proto_.has_program_shape());
   return proto_.program_shape();
 }
 
diff --git a/tensorflow/compiler/xla/client/xla_client/xla_computation.h b/tensorflow/compiler/xla/client/xla_client/xla_computation.h
index 5b89747fdd4f91e82c7ebc7aa10c5a914100a0c8..2a3c6952667a434b68ca0c5e4e9874397da173d3 100644
--- a/tensorflow/compiler/xla/client/xla_client/xla_computation.h
+++ b/tensorflow/compiler/xla/client/xla_client/xla_computation.h
@@ -29,6 +29,8 @@ namespace xla {
 // TODO(b/74197823): Replace xla::Computation with this one.
 class XlaComputation {
  public:
+  XlaComputation() : unique_id_(-1) {}
+
   XlaComputation(const XlaComputation&) = delete;
   XlaComputation& operator=(const XlaComputation&) = delete;
 
@@ -38,7 +40,8 @@ class XlaComputation {
 
   // Returns the "program shape" (parameter and return shapes) for this
   // computation.
-  const ProgramShape& GetProgramShape() const;
+  StatusOr<ProgramShape> GetProgramShape() const;
+
   const HloModuleProto& proto() const { return proto_; }
 
  private:
diff --git a/tensorflow/compiler/xla/executable_run_options.cc b/tensorflow/compiler/xla/executable_run_options.cc
index 392ad9010ab81923a089c7b00a79ddc281af92bb..1700c977189a9e4aedf6a6a75923c13678dae667 100644
--- a/tensorflow/compiler/xla/executable_run_options.cc
+++ b/tensorflow/compiler/xla/executable_run_options.cc
@@ -87,4 +87,11 @@ const DeviceAssignment* ExecutableRunOptions::device_assignment() const {
   return device_assignment_;
 }
 
+ExecutableRunOptions& ExecutableRunOptions::set_rng_seed(int rng_seed) {
+  rng_seed_ = rng_seed;
+  return *this;
+}
+
+int ExecutableRunOptions::rng_seed() const { return rng_seed_; }
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/executable_run_options.h b/tensorflow/compiler/xla/executable_run_options.h
index d4fcbf0493c936ebcd0639a432e56b62ee15672c..2c1d9ffff10ed26410898ad258aa6b5b2cd37518 100644
--- a/tensorflow/compiler/xla/executable_run_options.h
+++ b/tensorflow/compiler/xla/executable_run_options.h
@@ -84,6 +84,9 @@ class ExecutableRunOptions {
       DeviceAssignment* device_assignment);
   const DeviceAssignment* device_assignment() const;
 
+  ExecutableRunOptions& set_rng_seed(int rng_seed);
+  int rng_seed() const;
+
  private:
   DeviceMemoryAllocator* allocator_ = nullptr;
   int device_ordinal_ = -1;
@@ -92,6 +95,7 @@ class ExecutableRunOptions {
   tensorflow::thread::ThreadPool* inter_op_thread_pool_ = nullptr;
   const Eigen::ThreadPoolDevice* intra_op_thread_pool_ = nullptr;
   ExecutionProfile* execution_profile_ = nullptr;
+  int rng_seed_ = 0;
 };
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/legacy_flags/BUILD b/tensorflow/compiler/xla/legacy_flags/BUILD
index 0a9725db0a4fcf963cadcacf2cbc1d95d2c7239d..89353448e29ec3d97275dac288e23aa8e96e31b2 100644
--- a/tensorflow/compiler/xla/legacy_flags/BUILD
+++ b/tensorflow/compiler/xla/legacy_flags/BUILD
@@ -75,17 +75,3 @@ tf_cc_test(
             "//tensorflow/core:test",
         ],
 )
-
-# -----------------------------------------------------------------------------
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/compiler/xla/legacy_flags/debug_options_flags.cc b/tensorflow/compiler/xla/legacy_flags/debug_options_flags.cc
index c8ed3e3a2b009ddffdfb79a9a6ced8d5e736bee6..70ae95bf47398589e3c20f72c1f2084a738f253a 100644
--- a/tensorflow/compiler/xla/legacy_flags/debug_options_flags.cc
+++ b/tensorflow/compiler/xla/legacy_flags/debug_options_flags.cc
@@ -40,7 +40,10 @@ void SetDebugOptionsDefaults(DebugOptions* flags) {
   flags->set_xla_cpu_multi_thread_eigen(true);
   flags->set_xla_gpu_cuda_data_dir("./cuda_sdk_lib");
   flags->set_xla_eliminate_hlo_implicit_broadcast(true);
-
+#ifdef INTEL_MKL
+  flags->set_xla_cpu_use_mkl_dnn(true);
+#endif  // INTEL_MKL
+  flags->set_xla_gpu_max_kernel_unroll_factor(1);
   // Set cudnn batchnorm off by default; it does not provide a performance win
   // on average.
   flags->set_xla_gpu_use_cudnn_batchnorm(false);
@@ -220,6 +223,11 @@ void AllocateFlags() {
           bool_setter_for(&DebugOptions::set_xla_gpu_disable_multi_streaming),
           flag_values->xla_gpu_disable_multi_streaming(),
           "If true, multi-streaming in the GPU backend is disabled."),
+      tensorflow::Flag(
+          "xla_gpu_max_kernel_unroll_factor",
+          int32_setter_for(&DebugOptions::set_xla_gpu_max_kernel_unroll_factor),
+          flag_values->xla_gpu_max_kernel_unroll_factor(),
+          "Specify the maximum kernel unroll factor for the GPU backend."),
       tensorflow::Flag(
           "xla_dump_optimized_hlo_proto_to",
           flag_values->mutable_xla_dump_optimized_hlo_proto_to(),
@@ -288,6 +296,10 @@ void AllocateFlags() {
           flag_values->xla_gpu_use_cudnn_batchnorm(),
           "Allows the GPU backend to implement batchnorm HLOs using cudnn, "
           "rather than expanding them to a soup of HLOs."),
+      tensorflow::Flag("xla_cpu_use_mkl_dnn",
+                       bool_setter_for(&DebugOptions::set_xla_cpu_use_mkl_dnn),
+                       flag_values->xla_cpu_use_mkl_dnn(),
+                       "Generate calls to MKL-DNN in the CPU backend."),
   });
   ParseFlagsFromEnv(*flag_objects);
 }
diff --git a/tensorflow/compiler/xla/literal_util.cc b/tensorflow/compiler/xla/literal_util.cc
index 13675b7d0074592043b7e12de0aad948a3e9848f..c315b4ff30059147ee33dcdd5b0858a1c39e5999 100644
--- a/tensorflow/compiler/xla/literal_util.cc
+++ b/tensorflow/compiler/xla/literal_util.cc
@@ -97,11 +97,18 @@ Literal::Literal(const Shape& shape, bool allocate_arrays)
     const Shape& subshape = piece.subshape();
     if (ShapeUtil::IsArray(subshape)) {
       if (allocate_arrays) {
-        piece.set_buffer(new char[piece.size_bytes()]);
         if (LayoutUtil::IsSparseArray(subshape)) {
+          // For sparse arrays, the buffer must be of the size of the maximum
+          // number of sparse elements possible.
+          const int64 max_sparse_elements =
+              LayoutUtil::MaxSparseElements(subshape.layout());
+          piece.set_buffer(
+              new char[max_sparse_elements * ShapeUtil::ByteSizeOfPrimitiveType(
+                                                 subshape.element_type())]);
           piece.set_sparse_indices(new SparseIndexArray(
-              LayoutUtil::MaxSparseElements(subshape.layout()),
-              ShapeUtil::Rank(subshape)));
+              max_sparse_elements, ShapeUtil::Rank(subshape)));
+        } else {
+          piece.set_buffer(new char[piece.size_bytes()]);
         }
       } else {
         piece.set_buffer(nullptr);
@@ -1409,6 +1416,28 @@ std::unique_ptr<Literal> ConvertBetweenNativeTypes(const Literal& src_literal) {
       src_literal, converter);
 }
 
+template <typename NativeSrcT, typename NativeDestT>
+typename std::enable_if<(sizeof(NativeSrcT) == sizeof(NativeDestT)),
+                        std::unique_ptr<Literal>>::type
+BitcastBetweenNativeTypes(const Literal& src_literal) {
+  auto converter = [](NativeSrcT src) {
+    return tensorflow::bit_cast<NativeDestT>(src);
+  };
+  return ConvertBetweenNativeTypesWithConverter<NativeSrcT, NativeDestT>(
+      src_literal, converter);
+}
+
+// This template specialization is here to make the compiler happy. bit_cast has
+// a static check that the types are the same size. This specialization should
+// never be used because the source and destination types are checked for
+// identical sizes higher up.
+template <typename NativeSrcT, typename NativeDestT>
+typename std::enable_if<(sizeof(NativeSrcT) != sizeof(NativeDestT)),
+                        std::unique_ptr<Literal>>::type
+BitcastBetweenNativeTypes(const Literal& src_literal) {
+  LOG(FATAL) << "Invalid bitcast between types of different sizes.";
+}
+
 template <PrimitiveType primitive_src_type>
 std::unique_ptr<Literal> ConvertToC64(const Literal& src_literal) {
   CHECK(ShapeUtil::IsArray(src_literal.shape()));
@@ -1428,21 +1457,33 @@ std::unique_ptr<Literal> ConvertToC64(const Literal& src_literal) {
 }
 
 template <PrimitiveType primitive_src_type, PrimitiveType primitive_dest_type>
-std::unique_ptr<Literal> ConvertIfTypesMatch(const Literal& src_literal) {
+std::unique_ptr<Literal> ConvertIfTypesMatch(const Literal& src_literal,
+                                             bool bitcast) {
   CHECK_EQ(primitive_src_type, src_literal.shape().element_type());
-  return ConvertBetweenNativeTypes<
-      typename primitive_util::PrimitiveTypeToNative<primitive_src_type>::type,
-      typename primitive_util::PrimitiveTypeToNative<
-          primitive_dest_type>::type>(src_literal);
+  if (bitcast) {
+    return BitcastBetweenNativeTypes<
+        typename primitive_util::PrimitiveTypeToNative<
+            primitive_src_type>::type,
+        typename primitive_util::PrimitiveTypeToNative<
+            primitive_dest_type>::type>(src_literal);
+  } else {
+    return ConvertBetweenNativeTypes<
+        typename primitive_util::PrimitiveTypeToNative<
+            primitive_src_type>::type,
+        typename primitive_util::PrimitiveTypeToNative<
+            primitive_dest_type>::type>(src_literal);
+  }
 }
 
 template <PrimitiveType primitive_src_type>
 StatusOr<std::unique_ptr<Literal>> ConvertIfDestTypeMatches(
-    const Literal& src_literal, PrimitiveType primitive_dest_type) {
+    const Literal& src_literal, PrimitiveType primitive_dest_type,
+    bool bitcast) {
   switch (primitive_dest_type) {
-#define CONVERT_IF_TYPES_MATCH(type) \
-  case (type):                       \
-    return ConvertIfTypesMatch<primitive_src_type, (type)>(src_literal);
+#define CONVERT_IF_TYPES_MATCH(type)                                    \
+  case (type):                                                          \
+    return ConvertIfTypesMatch<primitive_src_type, (type)>(src_literal, \
+                                                           bitcast);
     CONVERT_IF_TYPES_MATCH(PRED)
     CONVERT_IF_TYPES_MATCH(S8)
     CONVERT_IF_TYPES_MATCH(S32)
@@ -1456,28 +1497,31 @@ StatusOr<std::unique_ptr<Literal>> ConvertIfDestTypeMatches(
     CONVERT_IF_TYPES_MATCH(BF16)
 #undef CONVERT_IF_TYPES_MATCH
     case C64:
-      return ConvertToC64<primitive_src_type>(src_literal);
+      if (!bitcast) {
+        return ConvertToC64<primitive_src_type>(src_literal);
+      }
+      break;
     // Other types are not yet supported.
     default:
-      return Unimplemented(
-          "Converting from type %s to type %s is not implemented.",
-          PrimitiveType_Name(src_literal.shape().element_type()).c_str(),
-          PrimitiveType_Name(primitive_dest_type).c_str());
-  }
-}
-
-}  // namespace
-
-StatusOr<std::unique_ptr<Literal>> Literal::Convert(
-    PrimitiveType primitive_dest_type) const {
-  TF_RET_CHECK(ShapeUtil::IsArray(shape()));
-  if (shape().element_type() == primitive_dest_type) {
-    return CloneToUnique();
+      break;
   }
-  switch (shape().element_type()) {
-#define CONVERT_IF_DEST_TYPE_MATCHES(type) \
-  case (type):                             \
-    return ConvertIfDestTypeMatches<(type)>(*this, primitive_dest_type);
+  return Unimplemented(
+      "Converting from type %s to type %s is not implemented.",
+      PrimitiveType_Name(src_literal.shape().element_type()).c_str(),
+      PrimitiveType_Name(primitive_dest_type).c_str());
+}
+
+StatusOr<std::unique_ptr<Literal>> ConvertSwitch(
+    const Literal& literal, PrimitiveType primitive_dest_type, bool bitcast) {
+  TF_RET_CHECK(ShapeUtil::IsArray(literal.shape()));
+  if (literal.shape().element_type() == primitive_dest_type) {
+    return literal.CloneToUnique();
+  }
+  switch (literal.shape().element_type()) {
+#define CONVERT_IF_DEST_TYPE_MATCHES(type)                                \
+  case (type):                                                            \
+    return ConvertIfDestTypeMatches<(type)>(literal, primitive_dest_type, \
+                                            bitcast);
     CONVERT_IF_DEST_TYPE_MATCHES(PRED)
     CONVERT_IF_DEST_TYPE_MATCHES(S8)
     CONVERT_IF_DEST_TYPE_MATCHES(S32)
@@ -1493,12 +1537,35 @@ StatusOr<std::unique_ptr<Literal>> Literal::Convert(
       // Other types are not yet supported.
     default:
       return Unimplemented(
-          "Converting from type %s to type %s is not implemented.",
-          PrimitiveType_Name(shape().element_type()).c_str(),
+          "%s from type %s to type %s is not implemented.",
+          (bitcast ? "Bitcast converting" : "Converting"),
+          PrimitiveType_Name(literal.shape().element_type()).c_str(),
           PrimitiveType_Name(primitive_dest_type).c_str());
   }
 }
 
+}  // namespace
+
+StatusOr<std::unique_ptr<Literal>> Literal::Convert(
+    PrimitiveType primitive_dest_type) const {
+  return ConvertSwitch(*this, primitive_dest_type, /*bitcast=*/false);
+}
+
+StatusOr<std::unique_ptr<Literal>> Literal::BitcastConvert(
+    PrimitiveType primitive_dest_type) const {
+  if (primitive_util::BitWidth(shape().element_type()) !=
+      primitive_util::BitWidth(primitive_dest_type)) {
+    return InvalidArgument(
+        "Cannot bitcast convert from %s to %s, bit widths are different: %d != "
+        "%d",
+        PrimitiveType_Name(shape().element_type()).c_str(),
+        PrimitiveType_Name(primitive_dest_type).c_str(),
+        primitive_util::BitWidth(shape().element_type()),
+        primitive_util::BitWidth(primitive_dest_type));
+  }
+  return ConvertSwitch(*this, primitive_dest_type, /*bitcast=*/true);
+}
+
 StatusOr<std::unique_ptr<Literal>> Literal::ConvertToShape(
     const Shape& dest_shape, bool round_f32_to_bf16) const {
   if (!ShapeUtil::IsTuple(dest_shape)) {
diff --git a/tensorflow/compiler/xla/literal_util.h b/tensorflow/compiler/xla/literal_util.h
index a96a76fbb4e1a46e225d33b715f073c05fe6275a..8aa19222dc4b9175ec72128dfdad448f65c23e91 100644
--- a/tensorflow/compiler/xla/literal_util.h
+++ b/tensorflow/compiler/xla/literal_util.h
@@ -333,11 +333,19 @@ class Literal {
   template <typename NativeT>
   std::unique_ptr<Literal> Replicate(int64 times) const;
 
-  // Converts this literal to another primitive type. Returns an error if the
-  // conversion is not possible. This literal must be array-shaped.
+  // Converts this literal to another primitive type using
+  // static_cast<>. Returns an error if the conversion is not possible. This
+  // literal must be array-shaped.
   StatusOr<std::unique_ptr<Literal>> Convert(
       PrimitiveType primitive_dest_type) const;
 
+  // Converts this literal to another primitive type using a bitcast
+  // conversion. The to and from primitive types must have the same bit
+  // width. Returns an error if the conversion is not possible. This literal
+  // must be array-shaped.
+  StatusOr<std::unique_ptr<Literal>> BitcastConvert(
+      PrimitiveType primitive_dest_type) const;
+
   // Converts this literal to the given shape. Returns an error is the
   // conversion is not possible.
   //
@@ -587,6 +595,12 @@ class Literal {
   template <typename NativeT, typename FnType>
   Status Populate(const FnType& generator);
 
+  // A parallel version of Populate(). This can be used if the generator is
+  // thread-safe and the values for the shape's different elements are
+  // independent.
+  template <typename NativeT, typename FnType>
+  Status PopulateParallel(const FnType& generator);
+
   // Fills this literal with the given value.
   template <typename NativeT>
   void PopulateWithValue(NativeT value);
@@ -727,7 +741,13 @@ class Literal {
     int64 size_bytes() const { return ShapeUtil::ByteSizeOf(subshape()); }
 
     // Returns the number of elements in this piece's array.
-    int64 element_count() const { return ShapeUtil::ElementsIn(subshape()); }
+    int64 element_count() const {
+      // If this is a sparse array, use the number of elements represented by
+      // the indices in the associated SparseIndexArray.
+      return LayoutUtil::IsSparseArray(subshape())
+                 ? sparse_indices()->index_count()
+                 : ShapeUtil::ElementsIn(subshape());
+    }
 
     // Copy the data from 'src' into this piece's buffer. Shapes of this piece
     // and src must be compatible.
@@ -785,6 +805,10 @@ class Literal {
   // buffer).
   void DeallocateBuffers();
 
+  // Implementation details shared between Populate() and PopulateParallel()
+  template <typename NativeT, typename FnType>
+  Status PopulateInternal(const FnType& generator, bool parallel);
+
   Shape shape_;
   ShapeTree<Piece> pieces_;
 
@@ -835,8 +859,7 @@ tensorflow::gtl::ArraySlice<NativeT> Literal::Piece::data() const {
       << " type, but literal element type is "
       << PrimitiveType_Name(subshape().element_type());
   return tensorflow::gtl::ArraySlice<NativeT>(
-      reinterpret_cast<const NativeT*>(buffer()),
-      ShapeUtil::ElementsIn(subshape()));
+      reinterpret_cast<const NativeT*>(buffer()), element_count());
 }
 
 template <typename NativeT>
@@ -849,7 +872,7 @@ tensorflow::gtl::MutableArraySlice<NativeT> Literal::Piece::data() {
       << " type, but literal element type is "
       << PrimitiveType_Name(subshape().element_type());
   return tensorflow::gtl::MutableArraySlice<NativeT>(
-      reinterpret_cast<NativeT*>(buffer()), ShapeUtil::ElementsIn(subshape()));
+      reinterpret_cast<NativeT*>(buffer()), element_count());
 }
 
 template <typename NativeT>
@@ -1264,19 +1287,20 @@ void Literal::PopulateSparse(SparseIndexArray indices,
   CHECK_LE(num_elements, max_elements);
   CHECK_EQ(num_elements, indices.index_count());
   auto root_data = root_piece().data<NativeT>();
-  root_data.remove_suffix(max_elements - values.size());
+  // Piece::data() returns an ArraySlice of size equal to the number of indices
+  // in the SparseIndexArray. So there is no need to adjust the size of the data
+  // here. It is enough to just copy the incoming values into the data buffer.
   std::copy(values.begin(), values.end(), root_data.begin());
   *this->root_piece().sparse_indices() = std::move(indices);
   if (sort) {
     auto root_data = this->root_piece().data<NativeT>();
-    root_data.remove_suffix(root_data.size() - num_elements);
     this->root_piece().sparse_indices()->SortWithValues(root_data);
   }
   DCHECK(this->root_piece().sparse_indices()->Validate(shape()));
 }
 
 template <typename NativeT, typename FnType>
-Status Literal::Populate(const FnType& generator) {
+Status Literal::PopulateInternal(const FnType& generator, bool parallel) {
   const Shape& this_shape = shape();
   const int64 rank = ShapeUtil::Rank(this_shape);
   TF_RET_CHECK(LayoutUtil::IsDenseArray(this_shape));
@@ -1286,11 +1310,11 @@ Status Literal::Populate(const FnType& generator) {
   if (rank > 0) {
     StrideConfig stride_config(this_shape, this_shape,
                                AsInt64Slice(this_shape.dimensions()));
-    DimensionVector minor_scan_indexes(rank, 0);
     int64 minor_dimension_size =
         ShapeUtil::GetDimension(this_shape, stride_config.minor_dimension);
 
     auto init_function = [&](tensorflow::gtl::ArraySlice<int64> indexes) {
+      DimensionVector minor_scan_indexes(rank, 0);
       const int64 index =
           IndexUtil::MultidimensionalIndexToLinearIndex(shape(), indexes);
       std::copy(indexes.begin(), indexes.end(), minor_scan_indexes.begin());
@@ -1298,17 +1322,35 @@ Status Literal::Populate(const FnType& generator) {
         minor_scan_indexes[stride_config.minor_dimension] = i;
         literal_data.at(index + i) = generator(minor_scan_indexes);
       }
-      return true;
     };
-    ShapeUtil::ForEachIndex(this_shape, stride_config.base,
-                            stride_config.dimensions, stride_config.step,
-                            init_function);
+    if (parallel) {
+      ShapeUtil::ForEachIndexParallel(this_shape, stride_config.base,
+                                      stride_config.dimensions,
+                                      stride_config.step, init_function);
+    } else {
+      ShapeUtil::ForEachIndex(
+          this_shape, stride_config.base, stride_config.dimensions,
+          stride_config.step,
+          [&init_function](tensorflow::gtl::ArraySlice<int64> indexes) {
+            init_function(indexes);
+            return true;
+          });
+    }
   } else {
     // For scalars.
     literal_data.at(0) = generator({});
   }
   return Status::OK();
 }
+template <typename NativeT, typename FnType>
+Status Literal::Populate(const FnType& generator) {
+  return PopulateInternal<NativeT>(generator, /*parallel=*/false);
+}
+
+template <typename NativeT, typename FnType>
+Status Literal::PopulateParallel(const FnType& generator) {
+  return PopulateInternal<NativeT>(generator, /*parallel=*/true);
+}
 
 template <typename NativeT>
 void Literal::PopulateWithValue(NativeT value) {
diff --git a/tensorflow/compiler/xla/literal_util_test.cc b/tensorflow/compiler/xla/literal_util_test.cc
index 7627762074b6132655c58690a7fffbaf2717e279..61046784e05623cd3117c24ecc6d6c474739bbd5 100644
--- a/tensorflow/compiler/xla/literal_util_test.cc
+++ b/tensorflow/compiler/xla/literal_util_test.cc
@@ -23,6 +23,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/core/lib/core/casts.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/types.h"
@@ -217,9 +218,7 @@ TEST_F(LiteralUtilTest, CreateSparse) {
   EXPECT_EQ(literal->sparse_indices()->data(),
             ArraySlice<int64>(expected_indices.data(),
                               expected_indices.num_elements()));
-  EXPECT_EQ(
-      ArraySlice<int64>(literal->data<int64>().data(), expected_values.size()),
-      ArraySlice<int64>(expected_values));
+  EXPECT_EQ(literal->data<int64>(), ArraySlice<int64>(expected_values));
 }
 
 TEST_F(LiteralUtilTest, LiteralR4F32ProjectedStringifies) {
@@ -1090,6 +1089,48 @@ TEST_F(LiteralUtilTest, Populate) {
   }
 }
 
+TEST_F(LiteralUtilTest, PopulateParallel) {
+  struct PopulateData {
+    std::vector<int64> dimensions;
+    std::vector<int64> layout;
+  } populate_data[] = {
+      {{}, {}},
+      {{0}, {0}},
+      {{16}, {0}},
+      {{2, 0}, {1, 0}},
+      {{4, 16}, {1, 0}},
+      {{21, 12}, {0, 1}},
+      {{6, 11, 17}, {2, 0, 1}},
+      {{6, 11, 5, 17}, {3, 2, 0, 1}},
+  };
+  for (const auto& data : populate_data) {
+    Shape shape = ShapeUtil::MakeShapeWithLayout(
+        primitive_util::NativeToPrimitiveType<uint32>(), data.dimensions,
+        data.layout);
+    auto literal = Literal::CreateFromShape(shape);
+    auto generator = [&](ArraySlice<int64> indexes) -> uint32 {
+      // Offsets from linear index just to avoid R0 literals to be initialized
+      // with zero.
+      return IndexUtil::MultidimensionalIndexToLinearIndex(literal->shape(),
+                                                           indexes) +
+             17;
+    };
+    TF_EXPECT_OK(literal->PopulateParallel<uint32>(generator));
+
+    std::vector<int64> zero_base(data.dimensions.size(), 0);
+    std::vector<int64> step(data.dimensions.size(), 1);
+    bool matched = true;
+    auto check_function = [&](ArraySlice<int64> indexes) {
+      auto value = literal->Get<uint32>(indexes);
+      matched = matched && (value == generator(indexes));
+      return matched;
+    };
+    ShapeUtil::ForEachIndex(literal->shape(), zero_base, data.dimensions, step,
+                            check_function);
+    EXPECT_TRUE(matched);
+  }
+}
+
 TEST_F(LiteralUtilTest, ConvertR4) {
   // clang-format off
   auto original = Literal::CreateR4WithLayout<int8>({{
@@ -1243,6 +1284,25 @@ TEST_F(LiteralUtilTest, ConvertIfTypesMatch) {
             tensorflow::error::UNIMPLEMENTED);
 }
 
+TEST_F(LiteralUtilTest, BitcastConvert) {
+  auto original =
+      Literal::CreateR1<uint32>({tensorflow::bit_cast<uint32>(2.5f),
+                                 tensorflow::bit_cast<uint32>(-42.25f),
+                                 tensorflow::bit_cast<uint32>(100.f), 0xbeef});
+  auto expected = Literal::CreateR1<float>(
+      {2.5f, -42.25f, 100.0f, tensorflow::bit_cast<float>(0xbeef)});
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Literal> converted,
+                          original->BitcastConvert(F32));
+}
+
+TEST_F(LiteralUtilTest, BitcastConvertBetweenInvalidTypes) {
+  auto literal = Literal::CreateR0<uint32>(1234);
+  Status status = literal->BitcastConvert(F64).status();
+  EXPECT_NE(Status::OK(), status);
+  EXPECT_TRUE(tensorflow::str_util::StrContains(status.error_message(),
+                                                "bit widths are different"));
+}
+
 TEST_F(LiteralUtilTest, CopyFromProto_Bool) {
   LiteralProto p;
   p.mutable_shape()->set_element_type(PRED);
diff --git a/tensorflow/compiler/xla/python/BUILD b/tensorflow/compiler/xla/python/BUILD
index e2972f06016ab3555c4fc0cc4616993fe6764b1e..0517a5502e686def4ffea59f929aef225186a8aa 100644
--- a/tensorflow/compiler/xla/python/BUILD
+++ b/tensorflow/compiler/xla/python/BUILD
@@ -72,15 +72,3 @@ tf_py_wrap_cc(
         "//tensorflow/compiler/xla/service:cpu_plugin",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/compiler/xla/python/local_computation_builder.cc b/tensorflow/compiler/xla/python/local_computation_builder.cc
index b21ab3044fae7136071f50bdba6e74b799a309d5..2bacc6a9142971f6d14b3929fb1a69e2a40052e2 100644
--- a/tensorflow/compiler/xla/python/local_computation_builder.cc
+++ b/tensorflow/compiler/xla/python/local_computation_builder.cc
@@ -521,6 +521,17 @@ ComputationDataHandle LocalComputationBuilder::Conditional(
                               false_computation.computation());
 }
 
+StatusOr<bool> LocalComputationBuilder::IsConstant(
+    const ComputationDataHandle& operand, int64 num_parameters) {
+  return builder_.IsConstant(operand, num_parameters);
+}
+
+StatusOr<std::unique_ptr<Literal>> LocalComputationBuilder::ComputeConstant(
+    const ComputationDataHandle& operand, const Layout* output_layout,
+    tensorflow::gtl::ArraySlice<Literal> parameters) {
+  return builder_.ComputeConstant(operand, output_layout, parameters);
+}
+
 #define _FORWARD(method_name, return_sig, args_sig, args)    \
   return_sig LocalComputationBuilder::method_name args_sig { \
     return builder_.method_name args;                        \
diff --git a/tensorflow/compiler/xla/python/local_computation_builder.h b/tensorflow/compiler/xla/python/local_computation_builder.h
index a7375c8965e9041226ffee08dab6ffafa25312af..31046e60f11af9cc89ddec4c5fd16babfc8eb231 100644
--- a/tensorflow/compiler/xla/python/local_computation_builder.h
+++ b/tensorflow/compiler/xla/python/local_computation_builder.h
@@ -268,6 +268,13 @@ class LocalComputationBuilder {
                                     const ComputationDataHandle& false_operand,
                                     const LocalComputation& false_computation);
 
+  StatusOr<bool> IsConstant(const ComputationDataHandle& operand,
+                            int64 num_parameters);
+
+  StatusOr<std::unique_ptr<Literal> > ComputeConstant(
+      const ComputationDataHandle& operand, const Layout* output_layout,
+      tensorflow::gtl::ArraySlice<Literal> parameters);
+
 #define _FORWARD(method_name, return_sig, args_sig) \
   return_sig method_name args_sig;
 
diff --git a/tensorflow/compiler/xla/python/local_computation_builder.i b/tensorflow/compiler/xla/python/local_computation_builder.i
index 8f231d1a12d92ecd93908771019c1440da6855e3..ac792e8189bda9eda472e7d282db86ac988c57b9 100644
--- a/tensorflow/compiler/xla/python/local_computation_builder.i
+++ b/tensorflow/compiler/xla/python/local_computation_builder.i
@@ -182,7 +182,7 @@ tensorflow::ImportNumpy();
 %typemap(in) const ComputationDataHandle& (ComputationDataHandle temp) {
   const int64 handle = numpy::PyIntOrPyLongToLong($input);
   if (handle == -1 && PyErr_Occurred()) {
-    return NULL;
+    SWIG_fail;
   }
   temp.set_handle(handle);
   $1 = &temp;
@@ -201,7 +201,7 @@ tensorflow::ImportNumpy();
     }
   } else {
     PyErr_SetString(PyExc_RuntimeError, $1.status().ToString().c_str());
-    return NULL;
+    SWIG_fail;
   }
 }
 
@@ -211,7 +211,7 @@ tensorflow::ImportNumpy();
     $result = numpy::PyObjectFromXlaLiteral(*value);
   } else {
     PyErr_SetString(PyExc_RuntimeError, $1.status().ToString().c_str());
-    return NULL;
+    SWIG_fail;
   }
 }
 
@@ -224,7 +224,7 @@ tensorflow::ImportNumpy();
     }
   } else {
     PyErr_SetString(PyExc_RuntimeError, $1.status().ToString().c_str());
-    return NULL;
+    SWIG_fail;
   }
 }
 
@@ -233,7 +233,16 @@ tensorflow::ImportNumpy();
     $result = numpy::PyShapeInfoFromXlaShape($1.ConsumeValueOrDie());
   } else {
     PyErr_SetString(PyExc_RuntimeError, $1.status().ToString().c_str());
-    return NULL;
+    SWIG_fail;
+  }
+}
+
+%typemap(out) StatusOr<bool> {
+  if ($1.ok()) {
+    $result = PyBool_FromLong($1.ConsumeValueOrDie());
+  } else {
+    PyErr_SetString(PyExc_RuntimeError, $1.status().ToString().c_str());
+    SWIG_fail;
   }
 }
 
@@ -241,7 +250,7 @@ tensorflow::ImportNumpy();
   if (!$1.ok()) {
     PyErr_SetString(
         PyExc_RuntimeError, $1.ToString().c_str());
-    return NULL;
+    SWIG_fail;
   }
   Py_INCREF(Py_None);
   $result = Py_None;
@@ -253,7 +262,7 @@ tensorflow::ImportNumpy();
     (std::vector<int64> temps) {
   if (!PySequence_Check($input)) {
     PyErr_SetString(PyExc_TypeError, "Argument is not a sequence");
-    return NULL;
+    SWIG_fail;
   }
   const int size = PySequence_Size($input);
   temps.resize(size);
@@ -265,13 +274,13 @@ tensorflow::ImportNumpy();
           PyExc_TypeError,
           "Argument sequence element cannot be converted to int");
       Py_DECREF(o);
-      return NULL;
+      SWIG_fail;
     }
     temps[i] = numpy::PyIntOrPyLongToLong(py_int);
     if (temps[i] == -1 && PyErr_Occurred()) {
       Py_DECREF(py_int);
       Py_DECREF(o);
-      return NULL;
+      SWIG_fail;
     }
     Py_DECREF(py_int);
     Py_DECREF(o);
@@ -285,7 +294,7 @@ tensorflow::ImportNumpy();
     (std::vector<ComputationDataHandle> temps) {
   if (!PySequence_Check($input)) {
     PyErr_SetString(PyExc_TypeError, "Argument is not a sequence");
-    return NULL;
+    SWIG_fail;
   }
   const int size = PySequence_Size($input);
   temps.resize(size);
@@ -296,13 +305,13 @@ tensorflow::ImportNumpy();
       PyErr_SetString(
           PyExc_TypeError,
           "Argument sequence element cannot be converted to int");
-      return NULL;
+      SWIG_fail;
     }
     const int64 handle = numpy::PyIntOrPyLongToLong(py_int);
     if (handle == -1 && PyErr_Occurred()) {
       Py_DECREF(py_int);
       Py_DECREF(o);
-      return NULL;
+      SWIG_fail;
     }
     temps[i].set_handle(handle);
     Py_DECREF(py_int);
@@ -317,7 +326,7 @@ tensorflow::ImportNumpy();
     (std::vector<LocalShapedBuffer*> temps) {
   if (!PySequence_Check($input)) {
     PyErr_SetString(PyExc_TypeError, "Argument is not a sequence");
-    return NULL;
+    SWIG_fail;
   }
   const int size = PySequence_Size($input);
   temps.reserve(size);
@@ -326,7 +335,7 @@ tensorflow::ImportNumpy();
     LocalShapedBuffer* lsbp;
     if ((SWIG_ConvertPtr(o, (void**) &lsbp, $descriptor(xla::swig::LocalShapedBuffer*),
                          SWIG_POINTER_EXCEPTION)) == -1) {
-      return NULL;
+      SWIG_fail;
     }
     temps.push_back(lsbp);
     Py_DECREF(o);
@@ -340,7 +349,7 @@ tensorflow::ImportNumpy();
   literal_status = numpy::XlaLiteralFromPyObject($input);
   if (!literal_status.ok()) {
     PyErr_SetString(PyExc_RuntimeError, literal_status.status().ToString().c_str());
-    return NULL;
+    SWIG_fail;
   }
   $1 = literal_status.ValueOrDie().get();
 }
@@ -352,7 +361,7 @@ tensorflow::ImportNumpy();
 %typemap(out) StatusOr< std::unique_ptr<Literal> > {
   if (!$1.ok()) {
     PyErr_SetString(PyExc_RuntimeError, $1.status().ToString().c_str());
-    return NULL;
+    SWIG_fail;
   }
   $result = numpy::PyObjectFromXlaLiteral(*$1.ValueOrDie());
 }
@@ -360,7 +369,7 @@ tensorflow::ImportNumpy();
 %typemap(in) const std::vector<Literal>& (std::vector<Literal> temps) {
   if (!PySequence_Check($input)) {
     PyErr_SetString(PyExc_TypeError, "Argument is not a sequence");
-    return NULL;
+    SWIG_fail;
   }
   const int size = PySequence_Size($input);
   for (int i = 0; i < size; ++i) {
@@ -369,7 +378,7 @@ tensorflow::ImportNumpy();
     if (!literal_status.ok()) {
       PyErr_SetString(PyExc_RuntimeError, literal_status.status().ToString().c_str());
       Py_DECREF(o);
-      return NULL;
+      SWIG_fail;
     }
     temps.push_back(std::move(*literal_status.ConsumeValueOrDie()));
     Py_DECREF(o);
@@ -383,7 +392,7 @@ tensorflow::ImportNumpy();
   StatusOr<OpMetadata> statusor = numpy::OpMetadataFromPyObject($input);
   if (!statusor.ok()) {
     PyErr_SetString(PyExc_RuntimeError, statusor.status().ToString().c_str());
-    return NULL;
+    SWIG_fail;
   }
   temp = std::move(statusor).ValueOrDie();
   $1 = &temp;
@@ -395,7 +404,7 @@ tensorflow::ImportNumpy();
   StatusOr<Shape> statusor = numpy::XlaShapeFromPyShape($input);
   if (!statusor.ok()) {
     PyErr_SetString(PyExc_RuntimeError, statusor.status().ToString().c_str());
-    return NULL;
+    SWIG_fail;
   }
   temp = std::move(statusor).ValueOrDie();
   $1 = &temp;
@@ -410,7 +419,7 @@ tensorflow::ImportNumpy();
     StatusOr<Shape> statusor = numpy::XlaShapeFromPyShape($input);
     if (!statusor.ok()) {
       PyErr_SetString(PyExc_RuntimeError, statusor.status().ToString().c_str());
-      return NULL;
+      SWIG_fail;
     }
     temp = std::move(statusor).ValueOrDie();
     $1 = &temp;
@@ -424,7 +433,7 @@ tensorflow::ImportNumpy();
 %typemap(in) const std::vector<Shape>& (std::vector<Shape> temps) {
   if (!PySequence_Check($input)) {
     PyErr_SetString(PyExc_TypeError, "Argument is not a sequence");
-    return NULL;
+    SWIG_fail;
   }
   const int size = PySequence_Size($input);
   for (int i = 0; i < size; ++i) {
@@ -433,7 +442,7 @@ tensorflow::ImportNumpy();
     Py_DECREF(o);
     if (!statusor.ok()) {
       PyErr_SetString(PyExc_RuntimeError, statusor.status().ToString().c_str());
-      return NULL;
+      SWIG_fail;
     }
     temps.push_back(statusor.ConsumeValueOrDie());
   }
@@ -444,7 +453,7 @@ tensorflow::ImportNumpy();
     std::vector<tensorflow::gtl::optional<Shape> > temps) {
   if (!PySequence_Check($input)) {
     PyErr_SetString(PyExc_TypeError, "Argument is not a sequence");
-    return NULL;
+    SWIG_fail;
   }
   const int size = PySequence_Size($input);
   for (int i = 0; i < size; ++i) {
@@ -456,7 +465,7 @@ tensorflow::ImportNumpy();
       Py_DECREF(o);
       if (!statusor.ok()) {
         PyErr_SetString(PyExc_RuntimeError, statusor.status().ToString().c_str());
-        return NULL;
+        SWIG_fail;
       }
       temps.push_back(statusor.ConsumeValueOrDie());
     }
@@ -470,18 +479,18 @@ tensorflow::ImportNumpy();
   PyObject* py_int = numpy::PyNumberToPyInt($input);
   if (!py_int) {
     PyErr_SetString(PyExc_TypeError, "Argument cannot be converted to int");
-    return NULL;
+    SWIG_fail;
   }
   const long value = numpy::PyIntOrPyLongToLong(py_int);
   if (value == -1 && PyErr_Occurred()) {
     Py_DECREF(py_int);
-    return NULL;
+    SWIG_fail;
   }
   if (!PrimitiveType_IsValid(value)) {
     PyErr_SetString(
         PyExc_TypeError, "Argument not valid for PrimitiveType enum");
     Py_DECREF(py_int);
-    return NULL;
+    SWIG_fail;
   }
   $1 = static_cast<PrimitiveType>(value);
 }
@@ -492,19 +501,19 @@ tensorflow::ImportNumpy();
     (std::vector<std::pair<int64, int64> > temps) {
   if (!PySequence_Check($input)) {
     PyErr_SetString(PyExc_TypeError, "Argument is not a sequence");
-    return NULL;
+    SWIG_fail;
   }
   const int size = PySequence_Size($input);
   temps.reserve(size);
   for (int i = 0; i < size; ++i) {
     PyObject* o = PySequence_GetItem($input, i);
     if (!o) {
-      return NULL;
+      SWIG_fail;
     }
     PyObject* first = PyTuple_GetItem(o, 0);
     if (!first) {
       Py_DECREF(o);
-      return NULL;
+      SWIG_fail;
     }
     PyObject* first_pyint = numpy::PyNumberToPyInt(first);
     if (!first_pyint) {
@@ -512,13 +521,13 @@ tensorflow::ImportNumpy();
           PyExc_TypeError,
           "First pair item cannot be converted to int");
       Py_DECREF(o);
-      return NULL;
+      SWIG_fail;
     }
     PyObject* second = PyTuple_GetItem(o, 1);
     if (!second) {
       Py_DECREF(o);
       Py_DECREF(first_pyint);
-      return NULL;
+      SWIG_fail;
     }
     PyObject* second_pyint = numpy::PyNumberToPyInt(second);
     if (!second_pyint) {
@@ -527,21 +536,21 @@ tensorflow::ImportNumpy();
           "Second pair item cannot be converted to int");
       Py_DECREF(o);
       Py_DECREF(first_pyint);
-      return NULL;
+      SWIG_fail;
     }
     const int64 first_value = numpy::PyIntOrPyLongToLong(first_pyint);
     if (first_value == -1 && PyErr_Occurred()) {
       Py_DECREF(o);
       Py_DECREF(first_pyint);
       Py_DECREF(second_pyint);
-      return NULL;
+      SWIG_fail;
     }
     const int64 second_value = numpy::PyIntOrPyLongToLong(second_pyint);
     if (second_value == -1 && PyErr_Occurred()) {
       Py_DECREF(o);
       Py_DECREF(first_pyint);
       Py_DECREF(second_pyint);
-      return NULL;
+      SWIG_fail;
     }
     temps.push_back(std::make_pair(first_value, second_value));
     Py_DECREF(o);
@@ -559,26 +568,26 @@ tensorflow::ImportNumpy();
   PyObject* lhs_contracting_dimensions = PyObject_GetAttrString(
       $input, "lhs_contracting_dimensions");
   if (!lhs_contracting_dimensions) {
-    return NULL;
+    SWIG_fail;
   }
 
   length = PySequence_Size(lhs_contracting_dimensions);
   if (length == -1) {
     Py_DECREF(lhs_contracting_dimensions);
-    return NULL;
+    SWIG_fail;
   }
 
   for (int i = 0; i < length; ++i) {
     PyObject* item = PySequence_GetItem(lhs_contracting_dimensions, i);
     if (!item) {
       Py_DECREF(lhs_contracting_dimensions);
-      return NULL;
+      SWIG_fail;
     }
     const int64 dimension = numpy::PyIntOrPyLongToLong(item);
     if (dimension == -1 && PyErr_Occurred()) {
       Py_DECREF(item);
       Py_DECREF(lhs_contracting_dimensions);
-      return NULL;
+      SWIG_fail;
     }
     dimension_numbers.add_lhs_contracting_dimensions(dimension);
     Py_DECREF(item);
@@ -589,26 +598,26 @@ tensorflow::ImportNumpy();
   PyObject* rhs_contracting_dimensions = PyObject_GetAttrString(
       $input, "rhs_contracting_dimensions");
   if (!lhs_contracting_dimensions) {
-    return NULL;
+    SWIG_fail;
   }
 
   length = PySequence_Size(rhs_contracting_dimensions);
   if (length == -1) {
     Py_DECREF(rhs_contracting_dimensions);
-    return NULL;
+    SWIG_fail;
   }
 
   for (int i = 0; i < length; ++i) {
     PyObject* item = PySequence_GetItem(rhs_contracting_dimensions, i);
     if (!item) {
       Py_DECREF(rhs_contracting_dimensions);
-      return NULL;
+      SWIG_fail;
     }
     const int64 dimension = numpy::PyIntOrPyLongToLong(item);
     if (dimension == -1 && PyErr_Occurred()) {
       Py_DECREF(item);
       Py_DECREF(rhs_contracting_dimensions);
-      return NULL;
+      SWIG_fail;
     }
     dimension_numbers.add_rhs_contracting_dimensions(dimension);
     Py_DECREF(item);
@@ -619,26 +628,26 @@ tensorflow::ImportNumpy();
   PyObject* lhs_batch_dimensions = PyObject_GetAttrString(
       $input, "lhs_batch_dimensions");
   if (!lhs_batch_dimensions) {
-    return NULL;
+    SWIG_fail;
   }
 
   length = PySequence_Size(lhs_batch_dimensions);
   if (length == -1) {
     Py_DECREF(lhs_batch_dimensions);
-    return NULL;
+    SWIG_fail;
   }
 
   for (int i = 0; i < length; ++i) {
     PyObject* item = PySequence_GetItem(lhs_batch_dimensions, i);
     if (!item) {
       Py_DECREF(lhs_batch_dimensions);
-      return NULL;
+      SWIG_fail;
     }
     const int64 dimension = numpy::PyIntOrPyLongToLong(item);
     if (dimension == -1 && PyErr_Occurred()) {
       Py_DECREF(item);
       Py_DECREF(lhs_batch_dimensions);
-      return NULL;
+      SWIG_fail;
     }
     dimension_numbers.add_lhs_batch_dimensions(dimension);
     Py_DECREF(item);
@@ -649,26 +658,26 @@ tensorflow::ImportNumpy();
   PyObject* rhs_batch_dimensions = PyObject_GetAttrString(
       $input, "rhs_batch_dimensions");
   if (!rhs_batch_dimensions) {
-    return NULL;
+    SWIG_fail;
   }
 
   length = PySequence_Size(rhs_batch_dimensions);
   if (length == -1) {
     Py_DECREF(rhs_batch_dimensions);
-    return NULL;
+    SWIG_fail;
   }
 
   for (int i = 0; i < length; ++i) {
     PyObject* item = PySequence_GetItem(rhs_batch_dimensions, i);
     if (!item) {
       Py_DECREF(rhs_batch_dimensions);
-      return NULL;
+      SWIG_fail;
     }
     const int64 dimension = numpy::PyIntOrPyLongToLong(item);
     if (dimension == -1 && PyErr_Occurred()) {
       Py_DECREF(item);
       Py_DECREF(rhs_batch_dimensions);
-      return NULL;
+      SWIG_fail;
     }
     dimension_numbers.add_rhs_batch_dimensions(dimension);
     Py_DECREF(item);
@@ -684,20 +693,20 @@ tensorflow::ImportNumpy();
     (PaddingConfig padding_config) {
   PyObject* dimensions = PyObject_GetAttrString($input, "dimensions");
   if (!dimensions) {
-    return NULL;
+    SWIG_fail;
   }
 
   int length = PySequence_Size(dimensions);
   if (length == -1) {
     Py_DECREF(dimensions);
-    return NULL;
+    SWIG_fail;
   }
 
   for (int i = 0; i < length; ++i) {
     PyObject* item = PySequence_GetItem(dimensions, i);
     if (!item) {
       Py_DECREF(dimensions);
-      return NULL;
+      SWIG_fail;
     }
     int64 edge_padding_low, edge_padding_high, interior_padding;
     if (!GetIntAttr(item, "edge_padding_low", &edge_padding_low)
@@ -705,7 +714,7 @@ tensorflow::ImportNumpy();
         || !GetIntAttr(item, "interior_padding", &interior_padding)) {
       Py_DECREF(item);
       Py_DECREF(dimensions);
-      return NULL;
+      SWIG_fail;
     }
     Py_DECREF(item);
 
@@ -727,32 +736,32 @@ tensorflow::ImportNumpy();
   int64 value;
 
   if (!GetIntAttr($input, "input_batch_dimension", &value)) {
-    return NULL;
+    SWIG_fail;
   }
   dimension_numbers.set_input_batch_dimension(value);
 
   if (!GetIntAttr($input, "input_feature_dimension", &value)) {
-    return NULL;
+    SWIG_fail;
   }
   dimension_numbers.set_input_feature_dimension(value);
 
   if (!GetIntAttr($input, "output_batch_dimension", &value)) {
-    return NULL;
+    SWIG_fail;
   }
   dimension_numbers.set_output_batch_dimension(value);
 
   if (!GetIntAttr($input, "output_feature_dimension", &value)) {
-    return NULL;
+    SWIG_fail;
   }
   dimension_numbers.set_output_feature_dimension(value);
 
   if (!GetIntAttr($input, "kernel_output_feature_dimension", &value)) {
-    return NULL;
+    SWIG_fail;
   }
   dimension_numbers.set_kernel_output_feature_dimension(value);
 
   if (!GetIntAttr($input, "kernel_input_feature_dimension", &value)) {
-    return NULL;
+    SWIG_fail;
   }
   dimension_numbers.set_kernel_input_feature_dimension(value);
 
@@ -761,24 +770,24 @@ tensorflow::ImportNumpy();
 
   o = PyObject_GetAttrString($input, "input_spatial_dimensions");
   if (!o) {
-    return NULL;
+    SWIG_fail;
   }
   length = PySequence_Size(o);
   if (length == -1) {
     Py_DECREF(o);
-    return NULL;
+    SWIG_fail;
   }
   for (int i = 0; i < length; ++i) {
     PyObject* item = PySequence_GetItem(o, i);
     if (!item) {
       Py_DECREF(o);
-      return NULL;
+      SWIG_fail;
     }
     const int64 dimension = numpy::PyIntOrPyLongToLong(item);
     if (dimension == -1 && PyErr_Occurred()) {
       Py_DECREF(item);
       Py_DECREF(o);
-      return NULL;
+      SWIG_fail;
     }
     dimension_numbers.add_input_spatial_dimensions(dimension);
     Py_DECREF(item);
@@ -787,24 +796,24 @@ tensorflow::ImportNumpy();
 
   o = PyObject_GetAttrString($input, "kernel_spatial_dimensions");
   if (!o) {
-    return NULL;
+    SWIG_fail;
   }
   length = PySequence_Size(o);
   if (length == -1) {
     Py_DECREF(o);
-    return NULL;
+    SWIG_fail;
   }
   for (int i = 0; i < length; ++i) {
     PyObject* item = PySequence_GetItem(o, i);
     if (!item) {
       Py_DECREF(o);
-      return NULL;
+      SWIG_fail;
     }
     const int64 dimension = numpy::PyIntOrPyLongToLong(item);
     if (dimension == -1 && PyErr_Occurred()) {
       Py_DECREF(item);
       Py_DECREF(o);
-      return NULL;
+      SWIG_fail;
     }
     dimension_numbers.add_kernel_spatial_dimensions(dimension);
     Py_DECREF(item);
@@ -813,24 +822,24 @@ tensorflow::ImportNumpy();
 
   o = PyObject_GetAttrString($input, "output_spatial_dimensions");
   if (!o) {
-    return NULL;
+    SWIG_fail;
   }
   length = PySequence_Size(o);
   if (length == -1) {
     Py_DECREF(o);
-    return NULL;
+    SWIG_fail;
   }
   for (int i = 0; i < length; ++i) {
     PyObject* item = PySequence_GetItem(o, i);
     if (!item) {
       Py_DECREF(o);
-      return NULL;
+      SWIG_fail;
     }
     const int64 dimension = numpy::PyIntOrPyLongToLong(item);
     if (dimension == -1 && PyErr_Occurred()) {
       Py_DECREF(item);
       Py_DECREF(o);
-      return NULL;
+      SWIG_fail;
     }
     dimension_numbers.add_output_spatial_dimensions(dimension);
     Py_DECREF(item);
@@ -865,12 +874,12 @@ tensorflow::ImportNumpy();
 
     PyObject* o = PyObject_GetAttrString($input, "hlo_profile");
     if (o == NULL) {
-      return NULL;
+      SWIG_fail;
     }
     if (o != Py_None) {
       if (!PyBool_Check(o)) {
         PyErr_SetString(PyExc_TypeError, "ExecutableBuildOptions.hlo_profile must be a bool or None.");
-        return NULL;
+        SWIG_fail;
       }
       build_options.set_hlo_profile(o == Py_True);
     }
@@ -885,7 +894,7 @@ tensorflow::ImportNumpy();
       if (!statusor.ok()) {
         PyErr_SetString(PyExc_TypeError, tensorflow::strings::StrCat("ExecutableBuildOptions.result_shape could not be created from Python shape value: ", statusor.status().ToString()).c_str());
         Py_DECREF(o);
-        return NULL;
+        SWIG_fail;
       }
       build_options.set_result_layout(statusor.ValueOrDie());
     }
@@ -951,6 +960,7 @@ tensorflow::ImportNumpy();
 %unignore xla::swig::LocalComputationBuilder::RngBernoulli;
 %unignore xla::swig::LocalComputationBuilder::While;
 %unignore xla::swig::LocalComputationBuilder::Conditional;
+%unignore xla::swig::LocalComputationBuilder::IsConstant;
 %unignore xla::swig::LocalComputationBuilder::Eq;
 %unignore xla::swig::LocalComputationBuilder::Ne;
 %unignore xla::swig::LocalComputationBuilder::Ge;
diff --git a/tensorflow/compiler/xla/python/xla_client.py b/tensorflow/compiler/xla/python/xla_client.py
index e548d420f4614d3b3fff6034f9a174d553ebea66..9c81f6439d0d9f0a0f0d1d3402e9c1ada46e8691 100644
--- a/tensorflow/compiler/xla/python/xla_client.py
+++ b/tensorflow/compiler/xla/python/xla_client.py
@@ -1028,6 +1028,20 @@ class ComputationBuilder(object):
             _unwrap_data_handle(false_operand),
             false_computation.c_local_computation))
 
+  def IsConstant(self, operand, num_parameters=0):
+    """Enqueues an IsConstant operation onto the computation.
+
+    Args:
+      operand: a ComputationDataHandle to test.
+      num_parameters: optional int, number of computation parameters to treat as
+        constant (default 0).
+
+    Returns: bool indicating whether `operand` is a compile-time constant,
+      meaning its value does not depend on parameters with index greater than or
+      equal to `num_parameters`.
+    """
+    return self._client.IsConstant(_unwrap_data_handle(operand), num_parameters)
+
   def Dot(self, lhs, rhs):
     """Enqueues a dot operation onto the computation.
 
diff --git a/tensorflow/compiler/xla/python/xla_client_test.py b/tensorflow/compiler/xla/python/xla_client_test.py
index 4c16c1f8b07a28d8098e92e27f81a126ed9bdf0c..d97264ea640787ab865f3cd64867addedd73cc1d 100644
--- a/tensorflow/compiler/xla/python/xla_client_test.py
+++ b/tensorflow/compiler/xla/python/xla_client_test.py
@@ -855,6 +855,17 @@ class SingleOpTest(LocalComputationTest):
     self.assertTrue(np.all(lo <= result))
     self.assertTrue(np.all(result < hi))
 
+  def testIsConstant(self):
+    c = self._NewComputation()
+    a = c.ConstantS32Scalar(3)
+    b = c.ConstantS32Scalar(1)
+    x = c.ParameterFromNumpy(NumpyArrayS32(0))
+    const_expr = c.Sub(b, a)
+    non_const_expr = c.Mul(const_expr, x)
+    self.assertTrue(c.IsConstant(const_expr))
+    self.assertFalse(c.IsConstant(non_const_expr))
+    # self.assertTrue(c.IsConstant(c.Sub(c.Add(x, a), x)))  # TODO(b/77245564)
+
 
 class EmbeddedComputationsTest(LocalComputationTest):
   """Tests for XLA graphs with embedded computations (such as maps)."""
diff --git a/tensorflow/compiler/xla/rpc/BUILD b/tensorflow/compiler/xla/rpc/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..977f8637873a4b6555798f533010a28ff36e8679
--- /dev/null
+++ b/tensorflow/compiler/xla/rpc/BUILD
@@ -0,0 +1,79 @@
+licenses(["notice"])  # Apache 2.0
+
+package(default_visibility = ["//tensorflow:internal"])
+
+load("//tensorflow:tensorflow.bzl", "tf_cc_test")
+load("//tensorflow:tensorflow.bzl", "tf_cc_binary")
+load(
+    "//tensorflow/compiler/xla:xla.bzl",
+    "xla_proto_library",
+    "xla_py_grpc_library",
+)
+
+xla_proto_library(
+    name = "xla_service_proto",
+    srcs = ["xla_service.proto"],
+    use_grpc_plugin = True,
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla:xla_proto",
+    ],
+)
+
+cc_library(
+    name = "grpc_stub",
+    srcs = ["grpc_stub.cc"],
+    hdrs = ["grpc_stub.h"],
+    deps = [
+        ":xla_service_proto",
+        "//tensorflow/compiler/xla:service_interface",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/core:lib",
+        "//tensorflow/core/distributed_runtime/rpc:grpc_util",
+    ],
+)
+
+tf_cc_binary(
+    name = "grpc_service_main_cpu",
+    srcs = ["grpc_service_main.cc"],
+    deps = [
+        ":grpc_service",
+        "//tensorflow/compiler/xla/service:cpu_plugin",
+        "//tensorflow/core:framework_internal",
+        "//tensorflow/core:lib",
+        "@grpc//:grpc++_unsecure",
+    ],
+)
+
+tf_cc_test(
+    name = "grpc_client_test",
+    srcs = ["grpc_client_test.cc"],
+    data = [
+        "//tensorflow/compiler/xla/rpc:grpc_service_main_cpu",
+    ],
+    deps = [
+        ":grpc_stub",
+        "//tensorflow/compiler/xla/client",
+        "//tensorflow/compiler/xla/client:computation_builder",
+        "//tensorflow/compiler/xla/tests:literal_test_util",
+        "//tensorflow/core:framework_internal",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "@grpc//:grpc++_unsecure",
+    ],
+)
+
+cc_library(
+    name = "grpc_service",
+    srcs = ["grpc_service.cc"],
+    hdrs = ["grpc_service.h"],
+    deps = [
+        ":xla_service_proto",
+        "//tensorflow/compiler/xla/service",
+        "//tensorflow/compiler/xla/service:platform_util",
+        "//tensorflow/core/distributed_runtime/rpc:grpc_util",
+        "@grpc//:grpc++_unsecure",
+    ],
+)
diff --git a/tensorflow/compiler/xla/rpc/grpc_client_test.cc b/tensorflow/compiler/xla/rpc/grpc_client_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b559ee4b5a345dbb2cc481b571562a0a630b3294
--- /dev/null
+++ b/tensorflow/compiler/xla/rpc/grpc_client_test.cc
@@ -0,0 +1,109 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Simple C++ test to exercise the GRPC capabilities of XLA.
+//
+// Launches an RPC service in a subprocess and connects to it over a socket
+// using an RPCStub.
+#include <memory>
+#include <vector>
+
+#include "grpc++/create_channel.h"
+#include "grpc++/security/credentials.h"
+
+#include "tensorflow/compiler/xla/client/client.h"
+#include "tensorflow/compiler/xla/client/computation_builder.h"
+#include "tensorflow/compiler/xla/rpc/grpc_stub.h"
+#include "tensorflow/compiler/xla/tests/literal_test_util.h"
+#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/strings/stringprintf.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/net.h"
+#include "tensorflow/core/platform/subprocess.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace xla {
+namespace {
+
+class GRPCClientTestBase : public ::testing::Test {
+ protected:
+  GRPCClientTestBase() {
+    string test_srcdir = tensorflow::testing::TensorFlowSrcRoot();
+    string service_main_path = tensorflow::io::JoinPath(
+        test_srcdir, "compiler/xla/rpc/grpc_service_main_cpu");
+    int port = tensorflow::internal::PickUnusedPortOrDie();
+    subprocess_.SetProgram(
+        service_main_path,
+        {service_main_path, tensorflow::strings::Printf("--port=%d", port)});
+    subprocess_.SetChannelAction(tensorflow::CHAN_STDOUT,
+                                 tensorflow::ACTION_DUPPARENT);
+    subprocess_.SetChannelAction(tensorflow::CHAN_STDERR,
+                                 tensorflow::ACTION_DUPPARENT);
+    CHECK(subprocess_.Start());
+    LOG(INFO) << "Launched subprocess";
+
+    auto channel =
+        ::grpc::CreateChannel(tensorflow::strings::Printf("localhost:%d", port),
+                              ::grpc::InsecureChannelCredentials());
+    channel->WaitForConnected(gpr_time_add(
+        gpr_now(GPR_CLOCK_REALTIME), gpr_time_from_seconds(10, GPR_TIMESPAN)));
+    LOG(INFO) << "Channel to server is connected on port " << port;
+
+    xla_service_ = grpc::XlaService::NewStub(channel);
+    stub_.reset(new GRPCStub(xla_service_.get()));
+    client_.reset(new Client(stub_.get()));
+  }
+
+  ~GRPCClientTestBase() override {
+    LOG(INFO) << "Killing subprocess";
+    subprocess_.Kill(SIGKILL);
+  }
+
+  tensorflow::SubProcess subprocess_;
+  std::unique_ptr<grpc::XlaService::Stub> xla_service_;
+  std::unique_ptr<GRPCStub> stub_;
+  std::unique_ptr<Client> client_;
+};
+
+TEST_F(GRPCClientTestBase, ItsAlive) {
+  ASSERT_NE(xla_service_, nullptr);
+  ASSERT_NE(stub_, nullptr);
+  ASSERT_NE(client_, nullptr);
+}
+
+TEST_F(GRPCClientTestBase, AxpyTenValues) {
+  ComputationBuilder builder(client_.get(), "axpy_10");
+  auto alpha = builder.ConstantR0<float>(3.1415926535);
+  auto x = builder.ConstantR1<float>(
+      {-1.0, 1.0, 2.0, -2.0, -3.0, 3.0, 4.0, -4.0, -5.0, 5.0});
+  auto y = builder.ConstantR1<float>(
+      {5.0, -5.0, -4.0, 4.0, 3.0, -3.0, -2.0, 2.0, 1.0, -1.0});
+  auto ax = builder.Mul(alpha, x);
+  auto axpy = builder.Add(ax, y);
+
+  std::vector<float> expected = {
+      1.85840735, -1.85840735, 2.28318531,   -2.28318531,  -6.42477796,
+      6.42477796, 10.56637061, -10.56637061, -14.70796327, 14.70796327};
+  std::unique_ptr<Literal> expected_literal =
+      Literal::CreateR1<float>(expected);
+  TF_ASSERT_OK_AND_ASSIGN(auto computation, builder.Build());
+  TF_ASSERT_OK_AND_ASSIGN(auto result_literal, client_->ExecuteAndTransfer(
+                                                   computation, {}, nullptr));
+  LiteralTestUtil::ExpectNear(*expected_literal, *result_literal,
+                              ErrorSpec(0.0001));
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/rpc/grpc_service.cc b/tensorflow/compiler/xla/rpc/grpc_service.cc
new file mode 100644
index 0000000000000000000000000000000000000000..414829d6e76354672c7c1998d1fb1bd185043d78
--- /dev/null
+++ b/tensorflow/compiler/xla/rpc/grpc_service.cc
@@ -0,0 +1,192 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/rpc/grpc_service.h"
+#include "tensorflow/compiler/xla/service/platform_util.h"
+#include "tensorflow/core/distributed_runtime/rpc/grpc_util.h"
+
+namespace xla {
+
+/* static */ StatusOr<std::unique_ptr<GRPCService>> GRPCService::NewService(
+    perftools::gputools::Platform* platform) {
+  std::unique_ptr<GRPCService> grpc_service(new GRPCService());
+  TF_ASSIGN_OR_RETURN(grpc_service->service_,
+                      ::xla::Service::NewService(platform));
+  return std::move(grpc_service);
+}
+
+::grpc::Status DelegateRPC(std::function<tensorflow::Status()> op) {
+  tensorflow::Status s = op();
+  return tensorflow::ToGrpcStatus(s);
+}
+
+::grpc::Status GRPCService::Computation(::grpc::ServerContext* context,
+                                        const ComputationRequest* arg,
+                                        ComputationResponse* result) {
+  return DelegateRPC(
+      [this, arg, result]() { return service_->Computation(arg, result); });
+}
+
+::grpc::Status GRPCService::CreateOp(::grpc::ServerContext* context,
+                                     const OpRequest* arg, OpResponse* result) {
+  return DelegateRPC(
+      [this, arg, result]() { return service_->Op(arg, result); });
+}
+
+::grpc::Status GRPCService::Unregister(::grpc::ServerContext* context,
+                                       const UnregisterRequest* arg,
+                                       UnregisterResponse* result) {
+  return DelegateRPC(
+      [this, arg, result]() { return service_->Unregister(arg, result); });
+}
+
+::grpc::Status GRPCService::DeconstructTuple(::grpc::ServerContext* context,
+                                             const DeconstructTupleRequest* arg,
+                                             DeconstructTupleResponse* result) {
+  return DelegateRPC([this, arg, result]() {
+    return service_->DeconstructTuple(arg, result);
+  });
+}
+
+::grpc::Status GRPCService::SetReturnValue(::grpc::ServerContext* context,
+                                           const SetReturnValueRequest* arg,
+                                           SetReturnValueResponse* results) {
+  return DelegateRPC([this, arg, results]() {
+    return service_->SetReturnValue(arg, results);
+  });
+}
+
+::grpc::Status GRPCService::Execute(::grpc::ServerContext* context,
+                                    const ExecuteRequest* arg,
+                                    ExecuteResponse* result) {
+  return DelegateRPC(
+      [this, arg, result]() { return service_->Execute(arg, result); });
+}
+
+::grpc::Status GRPCService::ExecuteAsync(::grpc::ServerContext* context,
+                                         const ExecuteAsyncRequest* arg,
+                                         ExecuteAsyncResponse* result) {
+  return DelegateRPC(
+      [this, arg, result]() { return service_->ExecuteAsync(arg, result); });
+}
+
+::grpc::Status GRPCService::WaitForExecution(::grpc::ServerContext* context,
+                                             const WaitForExecutionRequest* arg,
+                                             WaitForExecutionResponse* result) {
+  return DelegateRPC([this, arg, result]() {
+    return service_->WaitForExecution(arg, result);
+  });
+}
+
+::grpc::Status GRPCService::TransferToClient(::grpc::ServerContext* context,
+                                             const TransferToClientRequest* arg,
+                                             TransferToClientResponse* result) {
+  return DelegateRPC([this, arg, result]() {
+    return service_->TransferToClient(arg, result);
+  });
+}
+
+::grpc::Status GRPCService::TransferToServer(::grpc::ServerContext* context,
+                                             const TransferToServerRequest* arg,
+                                             TransferToServerResponse* result) {
+  return DelegateRPC([this, arg, result]() {
+    return service_->TransferToServer(arg, result);
+  });
+}
+
+::grpc::Status GRPCService::TransferToInfeed(::grpc::ServerContext* context,
+                                             const TransferToInfeedRequest* arg,
+                                             TransferToInfeedResponse* result) {
+  return DelegateRPC([this, arg, result]() {
+    return service_->TransferToInfeed(arg, result);
+  });
+}
+
+::grpc::Status GRPCService::TransferFromOutfeed(
+    ::grpc::ServerContext* context, const TransferFromOutfeedRequest* arg,
+    TransferFromOutfeedResponse* result) {
+  return DelegateRPC([this, arg, result]() {
+    return service_->TransferFromOutfeed(arg, result);
+  });
+}
+
+::grpc::Status GRPCService::ResetDevice(::grpc::ServerContext* context,
+                                        const ResetDeviceRequest* arg,
+                                        ResetDeviceResponse* result) {
+  return DelegateRPC(
+      [this, arg, result]() { return service_->ResetDevice(arg, result); });
+}
+
+::grpc::Status GRPCService::IsConstant(::grpc::ServerContext* context,
+                                       const IsConstantRequest* arg,
+                                       IsConstantResponse* result) {
+  return DelegateRPC(
+      [this, arg, result]() { return service_->IsConstant(arg, result); });
+}
+
+::grpc::Status GRPCService::ComputeConstant(::grpc::ServerContext* context,
+                                            const ComputeConstantRequest* arg,
+                                            ComputeConstantResponse* result) {
+  return DelegateRPC(
+      [this, arg, result]() { return service_->ComputeConstant(arg, result); });
+}
+
+::grpc::Status GRPCService::GetShape(::grpc::ServerContext* context,
+                                     const GetShapeRequest* arg,
+                                     GetShapeResponse* result) {
+  return DelegateRPC(
+      [this, arg, result]() { return service_->GetShape(arg, result); });
+}
+
+::grpc::Status GRPCService::GetComputationShape(
+    ::grpc::ServerContext* context, const GetComputationShapeRequest* arg,
+    GetComputationShapeResponse* result) {
+  return DelegateRPC([this, arg, result]() {
+    return service_->GetComputationShape(arg, result);
+  });
+}
+
+::grpc::Status GRPCService::GetLocalShape(::grpc::ServerContext* context,
+                                          const GetLocalShapeRequest* arg,
+                                          GetLocalShapeResponse* result) {
+  return DelegateRPC(
+      [this, arg, result]() { return service_->GetLocalShape(arg, result); });
+}
+
+::grpc::Status GRPCService::GetComputationStats(
+    ::grpc::ServerContext* context, const ComputationStatsRequest* arg,
+    ComputationStatsResponse* result) {
+  return DelegateRPC([this, arg, result]() {
+    return service_->GetComputationStats(arg, result);
+  });
+}
+
+::grpc::Status GRPCService::SnapshotComputation(
+    ::grpc::ServerContext* context, const SnapshotComputationRequest* arg,
+    SnapshotComputationResponse* result) {
+  return DelegateRPC([this, arg, result]() {
+    return service_->SnapshotComputation(arg, result);
+  });
+}
+
+::grpc::Status GRPCService::LoadComputationSnapshot(
+    ::grpc::ServerContext* context, const LoadComputationSnapshotRequest* arg,
+    LoadComputationSnapshotResponse* result) {
+  return DelegateRPC([this, arg, result]() {
+    return service_->LoadComputationSnapshot(arg, result);
+  });
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/rpc/grpc_service.h b/tensorflow/compiler/xla/rpc/grpc_service.h
new file mode 100644
index 0000000000000000000000000000000000000000..7c9e484517e9ced45c40dda78a2bd427a24c2722
--- /dev/null
+++ b/tensorflow/compiler/xla/rpc/grpc_service.h
@@ -0,0 +1,126 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_RPC_GRPC_SERVICE_H_
+#define TENSORFLOW_COMPILER_XLA_RPC_GRPC_SERVICE_H_
+
+#include "grpc++/server_context.h"
+#include "tensorflow/compiler/xla/rpc/xla_service.grpc.pb.h"
+#include "tensorflow/compiler/xla/service/service.h"
+
+namespace xla {
+
+// Service implementation which wraps a XLA Service with a GRPC interface.
+class GRPCService : public grpc::XlaService::Service {
+ public:
+  // Factory for creating a RPCService. The parameter platform is the platform
+  // that the service should target. If platform is null then the default
+  // platform is used.
+  static StatusOr<std::unique_ptr<GRPCService>> NewService(
+      perftools::gputools::Platform* platform = nullptr);
+
+  ::grpc::Status Computation(::grpc::ServerContext* context,
+                             const ComputationRequest* arg,
+                             ComputationResponse* result) override;
+
+  ::grpc::Status CreateOp(::grpc::ServerContext* context, const OpRequest* arg,
+                          OpResponse* result) override;
+
+  ::grpc::Status Unregister(::grpc::ServerContext* context,
+                            const UnregisterRequest* arg,
+                            UnregisterResponse* result) override;
+
+  ::grpc::Status DeconstructTuple(::grpc::ServerContext* context,
+                                  const DeconstructTupleRequest* arg,
+                                  DeconstructTupleResponse* result) override;
+
+  ::grpc::Status SetReturnValue(::grpc::ServerContext* context,
+                                const SetReturnValueRequest* arg,
+                                SetReturnValueResponse* results) override;
+
+  ::grpc::Status Execute(::grpc::ServerContext* context,
+                         const ExecuteRequest* arg,
+                         ExecuteResponse* result) override;
+
+  ::grpc::Status ExecuteAsync(::grpc::ServerContext* context,
+                              const ExecuteAsyncRequest* arg,
+                              ExecuteAsyncResponse* result) override;
+
+  ::grpc::Status WaitForExecution(::grpc::ServerContext* context,
+                                  const WaitForExecutionRequest* arg,
+                                  WaitForExecutionResponse* result) override;
+
+  ::grpc::Status TransferToClient(::grpc::ServerContext* context,
+                                  const TransferToClientRequest* arg,
+                                  TransferToClientResponse* result) override;
+
+  ::grpc::Status TransferToServer(::grpc::ServerContext* context,
+                                  const TransferToServerRequest* arg,
+                                  TransferToServerResponse* result) override;
+
+  ::grpc::Status TransferToInfeed(::grpc::ServerContext* context,
+                                  const TransferToInfeedRequest* arg,
+                                  TransferToInfeedResponse* result) override;
+
+  ::grpc::Status TransferFromOutfeed(
+      ::grpc::ServerContext* context, const TransferFromOutfeedRequest* arg,
+      TransferFromOutfeedResponse* result) override;
+
+  ::grpc::Status ResetDevice(::grpc::ServerContext* context,
+                             const ResetDeviceRequest* arg,
+                             ResetDeviceResponse* result) override;
+
+  ::grpc::Status IsConstant(::grpc::ServerContext* context,
+                            const IsConstantRequest* arg,
+                            IsConstantResponse* result) override;
+
+  ::grpc::Status ComputeConstant(::grpc::ServerContext* context,
+                                 const ComputeConstantRequest* arg,
+                                 ComputeConstantResponse* result) override;
+
+  ::grpc::Status GetShape(::grpc::ServerContext* context,
+                          const GetShapeRequest* arg,
+                          GetShapeResponse* result) override;
+
+  ::grpc::Status GetComputationShape(
+      ::grpc::ServerContext* context, const GetComputationShapeRequest* arg,
+      GetComputationShapeResponse* result) override;
+
+  ::grpc::Status GetLocalShape(::grpc::ServerContext* context,
+                               const GetLocalShapeRequest* arg,
+                               GetLocalShapeResponse* result) override;
+
+  ::grpc::Status GetComputationStats(::grpc::ServerContext* context,
+                                     const ComputationStatsRequest* arg,
+                                     ComputationStatsResponse* result) override;
+
+  ::grpc::Status SnapshotComputation(
+      ::grpc::ServerContext* context, const SnapshotComputationRequest* arg,
+      SnapshotComputationResponse* result) override;
+
+  ::grpc::Status LoadComputationSnapshot(
+      ::grpc::ServerContext* context, const LoadComputationSnapshotRequest* arg,
+      LoadComputationSnapshotResponse* result) override;
+
+ private:
+  std::unique_ptr<::xla::Service> service_;
+
+  GRPCService() {}
+  GRPCService(const GRPCService&) = delete;
+  void operator=(const GRPCService&) = delete;
+};
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_RPC_GRPC_SERVICE_H_
diff --git a/tensorflow/compiler/xla/rpc/grpc_service_main.cc b/tensorflow/compiler/xla/rpc/grpc_service_main.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e29908ccec80db76e3b5b856e57382c56430c379
--- /dev/null
+++ b/tensorflow/compiler/xla/rpc/grpc_service_main.cc
@@ -0,0 +1,62 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Basic server binary that exposes a xla::Service through a GRPC interface
+// on a configurable port.
+#include "grpc++/security/server_credentials.h"
+#include "grpc++/server.h"
+#include "grpc++/server_builder.h"
+#include "tensorflow/compiler/xla/rpc/grpc_service.h"
+#include "tensorflow/core/lib/strings/stringprintf.h"
+#include "tensorflow/core/platform/init_main.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/util/command_line_flags.h"
+
+namespace xla {
+namespace {
+
+int RealMain(int argc, char** argv) {
+  int32 port = 1685;
+  std::vector<tensorflow::Flag> flag_list = {
+      tensorflow::Flag("port", &port, "port to listen on"),
+  };
+  string usage = tensorflow::Flags::Usage(argv[0], flag_list);
+  bool parsed_values_ok = tensorflow::Flags::Parse(&argc, argv, flag_list);
+  if (!parsed_values_ok) {
+    LOG(ERROR) << usage;
+    return 2;
+  }
+  tensorflow::port::InitMain(argv[0], &argc, &argv);
+
+  std::unique_ptr<xla::GRPCService> service =
+      xla::GRPCService::NewService().ConsumeValueOrDie();
+
+  ::grpc::ServerBuilder builder;
+  string server_address(tensorflow::strings::Printf("localhost:%d", port));
+
+  builder.AddListeningPort(server_address, ::grpc::InsecureServerCredentials());
+  builder.RegisterService(service.get());
+  std::unique_ptr<::grpc::Server> server(builder.BuildAndStart());
+
+  LOG(INFO) << "Server listening on " << server_address;
+  server->Wait();
+
+  return 0;
+}
+
+}  // namespace
+}  // namespace xla
+
+int main(int argc, char** argv) { return xla::RealMain(argc, argv); }
diff --git a/tensorflow/compiler/xla/rpc/grpc_stub.cc b/tensorflow/compiler/xla/rpc/grpc_stub.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e1f2b0abe39b10dd82b700941748bc4f4e8cb2f8
--- /dev/null
+++ b/tensorflow/compiler/xla/rpc/grpc_stub.cc
@@ -0,0 +1,244 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/rpc/grpc_stub.h"
+#include "tensorflow/core/distributed_runtime/rpc/grpc_util.h"
+
+namespace xla {
+
+GRPCStub::~GRPCStub() = default;
+
+tensorflow::Status MakeRPC(
+    const std::function<::grpc::Status(::grpc::ClientContext*)>& rpc_method) {
+  ::grpc::ClientContext context;
+  ::grpc::Status s = rpc_method(&context);
+  return tensorflow::FromGrpcStatus(s);
+}
+
+tensorflow::Status GRPCStub::TransferToClient(
+    const TransferToClientRequest* request,
+    TransferToClientResponse* response) {
+  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
+    return grpc_stub_->TransferToClient(context, *request, response);
+  });
+}
+
+tensorflow::Status GRPCStub::TransferToServer(
+    const TransferToServerRequest* request,
+    TransferToServerResponse* response) {
+  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
+    return grpc_stub_->TransferToServer(context, *request, response);
+  });
+}
+
+tensorflow::Status GRPCStub::TransferToInfeed(
+    const TransferToInfeedRequest* request,
+    TransferToInfeedResponse* response) {
+  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
+    return grpc_stub_->TransferToInfeed(context, *request, response);
+  });
+}
+
+tensorflow::Status GRPCStub::TransferFromOutfeed(
+    const TransferFromOutfeedRequest* request,
+    TransferFromOutfeedResponse* response) {
+  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
+    return grpc_stub_->TransferFromOutfeed(context, *request, response);
+  });
+}
+
+tensorflow::Status GRPCStub::ResetDevice(const ResetDeviceRequest* request,
+                                         ResetDeviceResponse* response) {
+  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
+    return grpc_stub_->ResetDevice(context, *request, response);
+  });
+}
+
+tensorflow::Status GRPCStub::LoadComputationSnapshot(
+    const LoadComputationSnapshotRequest* request,
+    LoadComputationSnapshotResponse* response) {
+  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
+    return grpc_stub_->LoadComputationSnapshot(context, *request, response);
+  });
+}
+
+tensorflow::Status GRPCStub::Execute(const ExecuteRequest* request,
+                                     ExecuteResponse* response) {
+  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
+    return grpc_stub_->Execute(context, *request, response);
+  });
+}
+
+tensorflow::Status GRPCStub::ExecuteGraph(const ExecuteGraphRequest* request,
+                                          ExecuteResponse* response) {
+  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
+    return grpc_stub_->ExecuteGraph(context, *request, response);
+  });
+}
+
+tensorflow::Status GRPCStub::ExecuteParallel(
+    const ExecuteParallelRequest* request, ExecuteParallelResponse* response) {
+  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
+    return grpc_stub_->ExecuteParallel(context, *request, response);
+  });
+}
+
+tensorflow::Status GRPCStub::ExecuteGraphParallel(
+    const ExecuteGraphParallelRequest* request,
+    ExecuteParallelResponse* response) {
+  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
+    return grpc_stub_->ExecuteGraphParallel(context, *request, response);
+  });
+}
+
+tensorflow::Status GRPCStub::ExecuteAsync(const ExecuteAsyncRequest* request,
+                                          ExecuteAsyncResponse* response) {
+  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
+    return grpc_stub_->ExecuteAsync(context, *request, response);
+  });
+}
+
+tensorflow::Status GRPCStub::WaitForExecution(
+    const WaitForExecutionRequest* request,
+    WaitForExecutionResponse* response) {
+  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
+    return grpc_stub_->WaitForExecution(context, *request, response);
+  });
+}
+
+tensorflow::Status GRPCStub::DeconstructTuple(
+    const DeconstructTupleRequest* request,
+    DeconstructTupleResponse* response) {
+  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
+    return grpc_stub_->DeconstructTuple(context, *request, response);
+  });
+}
+
+tensorflow::Status GRPCStub::GetComputationStats(
+    const ComputationStatsRequest* request,
+    ComputationStatsResponse* response) {
+  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
+    return grpc_stub_->GetComputationStats(context, *request, response);
+  });
+}
+
+tensorflow::Status GRPCStub::GetComputationGraphStats(
+    const ComputationGraphStatsRequest* request,
+    ComputationStatsResponse* response) {
+  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
+    return grpc_stub_->GetComputationGraphStats(context, *request, response);
+  });
+}
+
+tensorflow::Status GRPCStub::GetComputationShape(
+    const GetComputationShapeRequest* request,
+    GetComputationShapeResponse* response) {
+  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
+    return grpc_stub_->GetComputationShape(context, *request, response);
+  });
+}
+
+tensorflow::Status GRPCStub::GetShape(const GetShapeRequest* request,
+                                      GetShapeResponse* response) {
+  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
+    return grpc_stub_->GetShape(context, *request, response);
+  });
+}
+
+tensorflow::Status GRPCStub::GetDeviceHandles(
+    const GetDeviceHandlesRequest* request,
+    GetDeviceHandlesResponse* response) {
+  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
+    return grpc_stub_->GetDeviceHandles(context, *request, response);
+  });
+}
+
+tensorflow::Status GRPCStub::CreateChannelHandle(
+    const CreateChannelHandleRequest* request,
+    CreateChannelHandleResponse* response) {
+  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
+    return grpc_stub_->CreateChannelHandle(context, *request, response);
+  });
+}
+
+// Methods used by ComputationBuilder.
+tensorflow::Status GRPCStub::Computation(const ComputationRequest* request,
+                                         ComputationResponse* response) {
+  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
+    return grpc_stub_->Computation(context, *request, response);
+  });
+}
+
+tensorflow::Status GRPCStub::Op(const OpRequest* request,
+                                OpResponse* response) {
+  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
+    return grpc_stub_->CreateOp(context, *request, response);
+  });
+}
+
+tensorflow::Status GRPCStub::GetLocalShape(const GetLocalShapeRequest* request,
+                                           GetLocalShapeResponse* response) {
+  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
+    return grpc_stub_->GetLocalShape(context, *request, response);
+  });
+}
+
+tensorflow::Status GRPCStub::SetReturnValue(
+    const SetReturnValueRequest* request, SetReturnValueResponse* responses) {
+  return MakeRPC([this, request, responses](::grpc::ClientContext* context) {
+    return grpc_stub_->SetReturnValue(context, *request, responses);
+  });
+}
+
+tensorflow::Status GRPCStub::IsConstant(const IsConstantRequest* request,
+                                        IsConstantResponse* response) {
+  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
+    return grpc_stub_->IsConstant(context, *request, response);
+  });
+}
+
+tensorflow::Status GRPCStub::ComputeConstant(
+    const ComputeConstantRequest* request, ComputeConstantResponse* response) {
+  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
+    return grpc_stub_->ComputeConstant(context, *request, response);
+  });
+}
+
+tensorflow::Status GRPCStub::ComputeConstantGraph(
+    const ComputeConstantGraphRequest* request,
+    ComputeConstantResponse* response) {
+  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
+    return grpc_stub_->ComputeConstantGraph(context, *request, response);
+  });
+}
+
+// Methods used by Computation.
+tensorflow::Status GRPCStub::SnapshotComputation(
+    const SnapshotComputationRequest* request,
+    SnapshotComputationResponse* response) {
+  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
+    return grpc_stub_->SnapshotComputation(context, *request, response);
+  });
+}
+
+// Methods used by GlobalData.
+tensorflow::Status GRPCStub::Unregister(const UnregisterRequest* request,
+                                        UnregisterResponse* response) {
+  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
+    return grpc_stub_->Unregister(context, *request, response);
+  });
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/rpc/grpc_stub.h b/tensorflow/compiler/xla/rpc/grpc_stub.h
new file mode 100644
index 0000000000000000000000000000000000000000..fd9810d4f1a5e084b73e83007ea7f9f8b0462c72
--- /dev/null
+++ b/tensorflow/compiler/xla/rpc/grpc_stub.h
@@ -0,0 +1,141 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_RPC_GRPC_STUB_H_
+#define TENSORFLOW_COMPILER_XLA_RPC_GRPC_STUB_H_
+
+#include "tensorflow/compiler/xla/rpc/xla_service.grpc.pb.h"
+#include "tensorflow/compiler/xla/service_interface.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/platform/macros.h"
+
+namespace xla {
+
+class GRPCStub : public ServiceInterface {
+ public:
+  explicit GRPCStub(grpc::XlaService::Stub* stub) : grpc_stub_(stub) {}
+  ~GRPCStub() override;
+
+  tensorflow::Status TransferToClient(
+      const TransferToClientRequest* arg,
+      TransferToClientResponse* result) override;
+
+  tensorflow::Status TransferToServer(
+      const TransferToServerRequest* arg,
+      TransferToServerResponse* result) override;
+
+  tensorflow::Status TransferToInfeed(
+      const TransferToInfeedRequest* arg,
+      TransferToInfeedResponse* result) override;
+
+  tensorflow::Status TransferFromOutfeed(
+      const TransferFromOutfeedRequest* arg,
+      TransferFromOutfeedResponse* result) override;
+
+  tensorflow::Status ResetDevice(const ResetDeviceRequest* arg,
+                                 ResetDeviceResponse* result) override;
+
+  tensorflow::Status LoadComputationSnapshot(
+      const LoadComputationSnapshotRequest* request,
+      LoadComputationSnapshotResponse* result) override;
+
+  tensorflow::Status Execute(const ExecuteRequest* arg,
+                             ExecuteResponse* result) override;
+
+  tensorflow::Status ExecuteGraph(const ExecuteGraphRequest* request,
+                                  ExecuteResponse* response) override;
+
+  tensorflow::Status ExecuteParallel(const ExecuteParallelRequest* arg,
+                                     ExecuteParallelResponse* result) override;
+
+  tensorflow::Status ExecuteGraphParallel(
+      const ExecuteGraphParallelRequest* request,
+      ExecuteParallelResponse* response) override;
+
+  tensorflow::Status ExecuteAsync(const ExecuteAsyncRequest* arg,
+                                  ExecuteAsyncResponse* result) override;
+
+  tensorflow::Status WaitForExecution(
+      const WaitForExecutionRequest* arg,
+      WaitForExecutionResponse* result) override;
+
+  tensorflow::Status DeconstructTuple(
+      const DeconstructTupleRequest* arg,
+      DeconstructTupleResponse* result) override;
+
+  tensorflow::Status GetComputationStats(
+      const ComputationStatsRequest* arg,
+      ComputationStatsResponse* result) override;
+
+  tensorflow::Status GetComputationGraphStats(
+      const ComputationGraphStatsRequest* request,
+      ComputationStatsResponse* response) override;
+
+  tensorflow::Status GetComputationShape(
+      const GetComputationShapeRequest* arg,
+      GetComputationShapeResponse* result) override;
+
+  tensorflow::Status GetShape(const GetShapeRequest* arg,
+                              GetShapeResponse* result) override;
+
+  tensorflow::Status GetDeviceHandles(
+      const GetDeviceHandlesRequest* arg,
+      GetDeviceHandlesResponse* result) override;
+
+  tensorflow::Status CreateChannelHandle(
+      const CreateChannelHandleRequest* arg,
+      CreateChannelHandleResponse* result) override;
+
+  // Methods used by ComputationBuilder.
+  tensorflow::Status Computation(const ComputationRequest* arg,
+                                 ComputationResponse* result) override;
+
+  tensorflow::Status Op(const OpRequest* arg, OpResponse* result) override;
+  tensorflow::Status GetLocalShape(const GetLocalShapeRequest* arg,
+                                   GetLocalShapeResponse* result) override;
+
+  tensorflow::Status SetReturnValue(const SetReturnValueRequest* arg,
+                                    SetReturnValueResponse* results) override;
+
+  tensorflow::Status IsConstant(const IsConstantRequest* arg,
+                                IsConstantResponse* result) override;
+
+  tensorflow::Status ComputeConstant(const ComputeConstantRequest* arg,
+                                     ComputeConstantResponse* result) override;
+
+  tensorflow::Status ComputeConstantGraph(
+      const ComputeConstantGraphRequest* arg,
+      ComputeConstantResponse* result) override;
+
+  // Methods used by Computation.
+  tensorflow::Status SnapshotComputation(
+      const SnapshotComputationRequest* ag,
+      SnapshotComputationResponse* result) override;
+
+  // Methods used by GlobalData.
+  tensorflow::Status Unregister(const UnregisterRequest* arg,
+                                UnregisterResponse* result) override;
+
+  grpc::XlaService::Stub* service() { return grpc_stub_; }
+
+ private:
+  grpc::XlaService::Stub* grpc_stub_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(GRPCStub);
+};
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_RPC_GRPC_STUB_H_
diff --git a/tensorflow/compiler/xla/rpc/xla_service.proto b/tensorflow/compiler/xla/rpc/xla_service.proto
new file mode 100644
index 0000000000000000000000000000000000000000..c47164ee1b7657ae378a053f553442bee751753e
--- /dev/null
+++ b/tensorflow/compiler/xla/rpc/xla_service.proto
@@ -0,0 +1,225 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// XLA service API.
+//
+// Users 1) build up computations and 2) create allocations via this API.
+// Computations are composed of data flowing between arbitrarily-sized
+// vector-oriented operations.
+//
+// Users build up computations using a ComputationHandle, and talk about
+// allocations using GlobalDataHandles.
+//
+// There are currently no checkpointing capabilities or distribution/replication
+// guarantees. The service runs on a single machine (e.g. one task) and that is
+// its failure domain.
+//
+// Canonical example of "alpha * X + Y":
+// * Make a computation.
+// * Add alpha and X and Y as parameters.
+// * Request the multiplication of alpha and X.
+// * Request the addition of that result and Y.
+//
+// Then, pass the computation and appropriately shaped inputs to the XLA
+// service's Execute method, which provides a result as a GlobalDataHandle.
+//
+// All data in XLA computations are conceptually immutable.
+//
+// Note: this API is subject to change / refinement over time -- use the
+// provided client libraries to insulate code from changes to this service API.
+
+syntax = "proto3";
+
+import "tensorflow/compiler/xla/xla.proto";
+import "tensorflow/compiler/xla/xla_data.proto";
+
+package xla;
+
+service XlaService {
+  /////////////////////////
+  // Global data requests
+
+  // Unregisters a global allocation.
+  //
+  // If the handle given is not currently allocated, a NOT_FOUND status is
+  // returned.
+  rpc Unregister(UnregisterRequest) returns (UnregisterResponse) {
+  }
+
+  // Deconstructs a tuple. Returns a newly created GlobalDataHandle for each
+  // element in the tuple.
+  rpc DeconstructTuple(DeconstructTupleRequest)
+      returns (DeconstructTupleResponse) {
+  }
+
+  // Unpack requests that a global data handle, with a tuple shape, has global
+  // data handles created for each of its constituent members. This is the
+  // equivalent of the "destructuring assignment" present in various programming
+  // languages.
+  rpc Unpack(UnpackRequest) returns (UnpackResponse) {
+  }
+
+  // Requests the shape of the referenced global data.
+  rpc GetShape(GetShapeRequest) returns (GetShapeResponse) {
+  }
+
+  // Requests the program shape of the referenced computation.
+  rpc GetComputationShape(GetComputationShapeRequest)
+      returns (GetComputationShapeResponse) {
+  }
+
+  // Requests the statistics of the given computation.
+  rpc GetComputationStats(ComputationStatsRequest)
+      returns (ComputationStatsResponse) {
+  }
+
+  // Requests the statistics of the given computation.
+  //
+  // TODO(b/74197823): This is a part of a NOT YET ready refactor.
+  rpc GetComputationGraphStats(ComputationGraphStatsRequest)
+      returns (ComputationStatsResponse) {
+  }
+
+  // Loads a variable number of values with a given element type from ColumnIO.
+  rpc LoadData(LoadDataRequest) returns (LoadDataResponse) {
+  }
+
+  // Transfers the given global data to the client in the form of a Literal.
+  rpc TransferToClient(TransferToClientRequest)
+      returns (TransferToClientResponse) {
+  }
+
+  // Transfers the given literal to the server to be stored in a global
+  // allocation, which is returned.
+  rpc TransferToServer(TransferToServerRequest)
+      returns (TransferToServerResponse) {
+  }
+
+  // Transfers the given literal to the Infeed buffer of the device.
+  rpc TransferToInfeed(TransferToInfeedRequest)
+      returns (TransferToInfeedResponse) {
+  }
+
+  // Transferred literal from the Outfeed buffer of the device.
+  rpc TransferFromOutfeed(TransferFromOutfeedRequest)
+      returns (TransferFromOutfeedResponse) {
+  }
+
+  // Resets the device, clearing all existing state on the device.
+  rpc ResetDevice(ResetDeviceRequest) returns (ResetDeviceResponse) {
+  }
+
+  // Tests if an expression is a compile-time constant.
+  rpc IsConstant(IsConstantRequest) returns (IsConstantResponse) {
+  }
+
+  // Computes the value of a constant expression.
+  rpc ComputeConstant(ComputeConstantRequest)
+      returns (ComputeConstantResponse) {
+  }
+
+  // Computes the value of a constant expression. The request contains the
+  // computation graph for the constant expression.
+  rpc ComputeConstantGraph(ComputeConstantGraphRequest)
+      returns (ComputeConstantResponse) {
+  }
+
+  // Retrieves the inferred shape for a value within a computation.
+  rpc GetLocalShape(GetLocalShapeRequest) returns (GetLocalShapeResponse) {
+  }
+
+  // Requests one or more device handles from the target. The returned device
+  // handles can be used to specify the device on which to execute computations
+  // or transfer data.
+  rpc GetDeviceHandles(GetDeviceHandlesRequest)
+      returns (GetDeviceHandlesResponse) {
+  }
+
+  // Creates a channel handle that can be used to transfer data between
+  // two computations via a pair of Send and Recv instructions.
+  rpc CreateChannelHandle(CreateChannelHandleRequest)
+      returns (CreateChannelHandleResponse) {
+  }
+
+  // Requests that the referenced computation be specialized for the provided
+  // arguments for subsequent execution. This permits things such as value
+  // specialization.
+  rpc Specialize(SpecializeRequest) returns (SpecializeResponse) {
+  }
+
+  // Modifies the provided computation so that subsequent executions
+  // will compute the provided ComputationDataHandle, rather than the
+  // last expression enqueued on that Computation.
+  rpc SetReturnValue(SetReturnValueRequest) returns (SetReturnValueResponse) {
+  }
+
+  // Computation creates a new computation with the given name.
+  // A unique ComputationHandle is returned.
+  rpc Computation(ComputationRequest) returns (ComputationResponse) {
+  }
+
+  // Adds a new op to a computation.
+  rpc CreateOp(OpRequest) returns (OpResponse) {
+  }
+
+  // Invokes the provided computation with the provided global data passed as
+  // immutable arguments. Returns global data output and execution timing.
+  rpc Execute(ExecuteRequest) returns (ExecuteResponse) {
+  }
+
+  // Invokes the provided computation with the provided global data passed as
+  // immutable arguments. The request contains the whole computation graph.
+  // Returns global data output and execution timing.
+  rpc ExecuteGraph(ExecuteGraphRequest) returns (ExecuteResponse) {
+  }
+
+  // Invokes the provided list of computations in parallel with the provided
+  // global data for each computation. Returns a list of global data output and
+  // execution timing.
+  rpc ExecuteParallel(ExecuteParallelRequest)
+      returns (ExecuteParallelResponse) {
+  }
+
+  // Invokes the provided list of computations in parallel with the provided
+  // global data for each computation. Returns a list of global data output and
+  // execution timing.
+  //
+  // TODO(b/74197823): This is a part of a NOT YET ready refactor.
+  rpc ExecuteGraphParallel(ExecuteGraphParallelRequest)
+      returns (ExecuteParallelResponse) {
+  }
+
+  // Invokes the provided computation with the provided global data passed as
+  // immutable arguments. Returns a handle to the execution.
+  rpc ExecuteAsync(ExecuteAsyncRequest) returns (ExecuteAsyncResponse) {
+  }
+
+  // Waits until the given execution (aysnchronously launched) is complete, and
+  // returns the global data output.
+  rpc WaitForExecution(WaitForExecutionRequest)
+      returns (WaitForExecutionResponse) {
+  }
+
+  // Serializes a computation to proto form, so it can be loaded via
+  // LoadComputationSnapshot.
+  rpc SnapshotComputation(SnapshotComputationRequest)
+      returns (SnapshotComputationResponse) {
+  }
+
+  // Loads a computation from a captured snapshot.
+  rpc LoadComputationSnapshot(LoadComputationSnapshotRequest)
+      returns (LoadComputationSnapshotResponse) {
+  }
+}
diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD
index da16976d06ad516644113e8e727ce6b24b6bb26a..db91e804072676c609d4d1fa3110bd587f5f2bc0 100644
--- a/tensorflow/compiler/xla/service/BUILD
+++ b/tensorflow/compiler/xla/service/BUILD
@@ -285,6 +285,23 @@ cc_library(
     ],
 )
 
+tf_cc_test(
+    name = "dfs_hlo_visitor_with_default_test",
+    srcs = ["dfs_hlo_visitor_with_default_test.cc"],
+    deps = [
+        ":hlo",
+        ":hlo_runner",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla:test_helpers",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/core:test",
+    ],
+)
+
 cc_library(
     name = "hlo_reachability",
     srcs = ["hlo_reachability.cc"],
@@ -713,7 +730,6 @@ cc_library(
         ":computation_layout",
         ":device_memory_allocator",
         ":hlo",
-        ":hlo_cost_analysis",
         ":hlo_execution_profile",
         ":hlo_graph_dumper",
         ":pool",
@@ -1580,6 +1596,7 @@ cc_library(
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:util",
+        "//tensorflow/compiler/xla:window_util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
@@ -2622,6 +2639,21 @@ tf_cc_test(
     ],
 )
 
+cc_library(
+    name = "despecializer",
+    srcs = ["despecializer.cc"],
+    hdrs = ["despecializer.h"],
+    deps = [
+        ":bfloat16_normalization",
+        ":defuser",
+        ":hlo",
+        ":hlo_pass",
+        ":hlo_pass_pipeline",
+        ":implicit_broadcast_remover",
+        "//tensorflow/compiler/xla:statusor",
+    ],
+)
+
 cc_library(
     name = "source_map_util",
     srcs = ["source_map_util.cc"],
@@ -2633,17 +2665,3 @@ cc_library(
         "//tensorflow/core:lib",
     ],
 )
-
-# -----------------------------------------------------------------------------
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier.cc b/tensorflow/compiler/xla/service/algebraic_simplifier.cc
index f9fabd8a35bcee2253b30fc5ad9e5fee545f06eb..6cb1bd56695772a38c377280da4ea357027519e5 100644
--- a/tensorflow/compiler/xla/service/algebraic_simplifier.cc
+++ b/tensorflow/compiler/xla/service/algebraic_simplifier.cc
@@ -1424,6 +1424,7 @@ Status AlgebraicSimplifierVisitor::HandlePower(HloInstruction* power) {
   return Status::OK();
 }
 
+// TODO(b/74536353): do this simplification for BroadcastDimOne as well.
 StatusOr<bool> AlgebraicSimplifierVisitor::
     TryToSinkReshapeOrBroadcastAfterOpWithUniqueNonScalarOperand(
         HloInstruction* reshape_or_broadcast) {
@@ -1731,18 +1732,29 @@ Status AlgebraicSimplifierVisitor::HandleReduceWindow(
                                   function));
   }
 
-  VLOG(10) << "Considering folding Pad: " << operand->ToString()
-           << "\ninto reduce-window: " << reduce_window->ToString();
-
   // This optimization folds a pad op into reduce_window.
-  if (operand->opcode() != HloOpcode::kPad) {
+  HloInstruction* pad;
+  const HloInstruction* convert = nullptr;
+  if (operand->opcode() == HloOpcode::kPad) {
+    pad = operand;
+  } else if (operand->opcode() == HloOpcode::kConvert &&
+             operand->operand(0)->opcode() == HloOpcode::kPad) {
+    convert = operand;
+    pad = operand->mutable_operand(0);
+  } else {
     VLOG(10) << "Not folding pad into reduce-window as there is no pad.";
     return Status::OK();
   }
 
+  VLOG(10) << "Considering folding Pad: " << pad->ToString()
+           << "\ninto reduce-window: " << reduce_window->ToString()
+           << (convert != nullptr ? tensorflow::strings::StrCat(
+                                        "\nvia convert: ", convert->ToString())
+                                  : "");
+
   // Do not fold interior padding into ReduceWindow since the backends do not
   // support it.
-  const PaddingConfig& pad_config = operand->padding_config();
+  const PaddingConfig& pad_config = pad->padding_config();
   if (HasInteriorPadding(pad_config)) {
     VLOG(10) << "Not folding pad into reduce-window due to interior padding.";
     return Status::OK();
@@ -1750,14 +1762,27 @@ Status AlgebraicSimplifierVisitor::HandleReduceWindow(
 
   // If reduce_window already has padding, the pad value of the pad op and the
   // init value of reduce_window must match to allow folding the pad.
-  const HloInstruction* pad_value = operand->operand(1);
+  const HloInstruction* pad_value = pad->operand(1);
   const HloInstruction* reduce_init_value = reduce_window->operand(1);
   if (pad_value != reduce_init_value) {
+    auto literals_are_equivalent = [&] {
+      auto& pad_literal = pad_value->literal();
+      auto& reduce_init_literal = reduce_init_value->literal();
+      if (pad_literal == reduce_init_literal) {
+        return true;
+      }
+      auto converted_pad_literal = pad_literal.ConvertToShape(
+          reduce_init_value->shape(), /*round_f32_to_bf16=*/true);
+      if (!converted_pad_literal.ok()) {
+        return false;
+      }
+      return *converted_pad_literal.ValueOrDie() == reduce_init_literal;
+    };
     // The pad value is usually a constant, so we handle that case and do not
     // try to get more fancy about proving equivalence in cases beyond that.
     if (pad_value->opcode() != HloOpcode::kConstant ||
         reduce_init_value->opcode() != HloOpcode::kConstant ||
-        pad_value->literal() != reduce_init_value->literal()) {
+        !literals_are_equivalent()) {
       VLOG(10) << "Not folding pad into reduce-window due to different pad "
                   "values.";
       return Status::OK();
@@ -1766,7 +1791,7 @@ Status AlgebraicSimplifierVisitor::HandleReduceWindow(
 
   // If the pad puts a single non-identity value in each window that we're
   // reducing, then this is a broadcast.
-  HloInstruction* pad_operand = operand->mutable_operand(0);
+  HloInstruction* pad_operand = pad->mutable_operand(0);
   auto is_effective_broadcast = [&] {
     if (window_util::HasStride(window)) {
       VLOG(10) << "Window has stride.";
@@ -1810,6 +1835,18 @@ Status AlgebraicSimplifierVisitor::HandleReduceWindow(
     VLOG(10) << "Found window covers a single unpadded element.";
     return true;
   };
+
+  HloInstruction* new_reduce_window_operand;
+  if (convert != nullptr) {
+    new_reduce_window_operand =
+        computation_->AddInstruction(HloInstruction::CreateConvert(
+            ShapeUtil::ChangeElementType(pad_operand->shape(),
+                                         convert->shape().element_type()),
+            pad_operand));
+  } else {
+    new_reduce_window_operand = pad_operand;
+  }
+
   if (is_effective_broadcast()) {
     VLOG(10) << "Replacing pad/reduce-window with (implicit) broadcast.";
     auto fadd = [this](std::unique_ptr<HloInstruction> x) {
@@ -1818,7 +1855,7 @@ Status AlgebraicSimplifierVisitor::HandleReduceWindow(
     return ReplaceWithNewInstruction(
         reduce_window, HloInstruction::CreateBroadcastSequence(
                            /*output_shape=*/reduce_window->shape(),
-                           /*operand=*/pad_operand, fadd));
+                           /*operand=*/new_reduce_window_operand, fadd));
   }
 
   // Carry out the folding of the pad into reduce_window.
@@ -1835,10 +1872,11 @@ Status AlgebraicSimplifierVisitor::HandleReduceWindow(
     window_dim.set_padding_high(window_dim.padding_high() +
                                 pad_dim.edge_padding_high());
   }
+
   return ReplaceWithNewInstruction(
       reduce_window, HloInstruction::CreateReduceWindow(
                          /*shape=*/reduce_window->shape(),
-                         /*operand=*/pad_operand,
+                         /*operand=*/new_reduce_window_operand,
                          /*init_value=*/reduce_window->mutable_operand(1),
                          /*window=*/new_window,
                          /*reduce_computation=*/function));
diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc
index 3b80a827bf0b5f1041c7351be0943bf1ad8c8afe..20c549562d5153c802c1e675a8ff1c92426b8832 100644
--- a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc
+++ b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc
@@ -2338,6 +2338,91 @@ TEST_F(AlgebraicSimplifierTest, FoldPadIntoReduceWindow) {
   EXPECT_EQ(root->window().dimensions(3).padding_high(), 102);
 }
 
+// Test that ReduceWindow(Convert(Pad(op, x)), y) can simplify to
+// ReduceWindow(Convert(op), x).
+TEST_F(AlgebraicSimplifierTest, FoldConvertedPadIntoReduceWindow) {
+  HloModule module(TestName());
+  HloComputation::Builder builder(TestName());
+
+  // Create operand to the pad.
+  HloInstruction* parameter =
+      builder.AddInstruction(HloInstruction::CreateParameter(
+          0, ShapeUtil::MakeShape(BF16, {1, 2, 3, 4}), "p0"));
+
+  // Create the pad.
+  PaddingConfig padding = MakeNoPaddingConfig(4);
+  padding.mutable_dimensions(1)->set_edge_padding_low(1);
+  padding.mutable_dimensions(3)->set_edge_padding_high(2);
+
+  HloInstruction* pad_value = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(5.0f)));
+  HloInstruction* pad = builder.AddInstruction(HloInstruction::CreatePad(
+      ShapeUtil::MakeShape(BF16, {1, 3, 3, 5}), parameter, pad_value, padding));
+
+  HloInstruction* convert =
+      builder.AddInstruction(HloInstruction::CreateConvert(
+          ShapeUtil::ChangeElementType(pad->shape(), F32), pad));
+
+  // Create add computation.
+  HloComputation* add_computation = nullptr;
+  {
+    HloComputation::Builder builder(TestName() + ".add");
+    const Shape scalar_shape = ShapeUtil::MakeShape(F32, {});
+    HloInstruction* p0 = builder.AddInstruction(
+        HloInstruction::CreateParameter(0, scalar_shape, "p0"));
+    HloInstruction* p1 = builder.AddInstruction(
+        HloInstruction::CreateParameter(1, scalar_shape, "p1"));
+    builder.AddInstruction(
+        HloInstruction::CreateBinary(scalar_shape, HloOpcode::kAdd, p0, p1));
+    add_computation = module.AddEmbeddedComputation(builder.Build());
+  }
+
+  // Create the reduce-window.
+  Window window;
+  for (int64 i = 0; i < ShapeUtil::Rank(pad->shape()); ++i) {
+    auto* dim = window.add_dimensions();
+    dim->set_size(1);
+    dim->set_padding_low(10);
+    dim->set_padding_high(100);
+    dim->set_window_dilation(1);
+    dim->set_base_dilation(1);
+  }
+  const Shape reduce_window_shape =
+      ShapeUtil::MakeShape(F32, {111, 113, 113, 115});
+  HloInstruction* reduce_init_value = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(5.0f)));
+  HloInstruction* reduce_window =
+      builder.AddInstruction(HloInstruction::CreateReduceWindow(
+          reduce_window_shape, convert, reduce_init_value, window,
+          add_computation));
+
+  // Build the computation and run the simplifier.
+  auto computation = module.AddEntryComputation(builder.Build());
+  HloInstruction* root = computation->root_instruction();
+  EXPECT_EQ(root, reduce_window);
+  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
+                                 non_bitcasting_callback());
+  ASSERT_TRUE(simplifier.Run(&module).ValueOrDie());
+
+  // Running simplification again should not result in any further changes.
+  ASSERT_FALSE(simplifier.Run(&module).ValueOrDie());
+
+  // Verify the result
+  root = computation->root_instruction();
+  EXPECT_THAT(root, op::ReduceWindow(op::Convert(parameter), op::Constant()));
+  EXPECT_TRUE(ShapeUtil::Equal(root->shape(), reduce_window_shape))
+      << ShapeUtil::HumanString(root->shape()) << " vs "
+      << ShapeUtil::HumanString(reduce_window_shape);
+  EXPECT_EQ(root->window().dimensions(0).padding_low(), 10);
+  EXPECT_EQ(root->window().dimensions(1).padding_low(), 11);
+  EXPECT_EQ(root->window().dimensions(2).padding_low(), 10);
+  EXPECT_EQ(root->window().dimensions(3).padding_low(), 10);
+  EXPECT_EQ(root->window().dimensions(0).padding_high(), 100);
+  EXPECT_EQ(root->window().dimensions(1).padding_high(), 100);
+  EXPECT_EQ(root->window().dimensions(2).padding_high(), 100);
+  EXPECT_EQ(root->window().dimensions(3).padding_high(), 102);
+}
+
 TEST_F(AlgebraicSimplifierTest, ReversalOfTrivialDimensionsToBitcast) {
   HloComputation::Builder builder(TestName());
   const Shape shape = ShapeUtil::MakeShape(F32, {448, 2048, 1, 1});
diff --git a/tensorflow/compiler/xla/service/call_graph.cc b/tensorflow/compiler/xla/service/call_graph.cc
index 13eb02ca012f44b2b5ed7c6f5becb7d54b07c33c..a8053d15e124319c5c898f0034b9aaa95a007a89 100644
--- a/tensorflow/compiler/xla/service/call_graph.cc
+++ b/tensorflow/compiler/xla/service/call_graph.cc
@@ -51,8 +51,8 @@ std::ostream& operator<<(std::ostream& out, const CallContext& context) {
   return out;
 }
 
-CallContext GetInstructionCallContext(const HloInstruction* instruction) {
-  switch (instruction->opcode()) {
+CallContext GetInstructionCallContext(HloOpcode opcode) {
+  switch (opcode) {
     case HloOpcode::kCall:
     case HloOpcode::kConditional:
     case HloOpcode::kWhile:
@@ -101,7 +101,7 @@ void CallGraphNode::AddCallerCallSite(const CallSite& caller_callsite) {
 
 void CallGraphNode::AddCallSiteForInstruction(HloInstruction* instruction) {
   CHECK_EQ(instruction->parent(), computation());
-  const CallContext context = GetInstructionCallContext(instruction);
+  const CallContext context = GetInstructionCallContext(instruction->opcode());
   if (!instruction->called_computations().empty()) {
     CHECK(context == CallContext::kSequential ||
           context == CallContext::kParallel);
diff --git a/tensorflow/compiler/xla/service/call_graph.h b/tensorflow/compiler/xla/service/call_graph.h
index 688c4085dfb4f47d3e08a4abee5e7b645f595b11..97d3811508adee1bf2d0942bcc69e3e34a41c8c3 100644
--- a/tensorflow/compiler/xla/service/call_graph.h
+++ b/tensorflow/compiler/xla/service/call_graph.h
@@ -53,7 +53,7 @@ enum class CallContext {
 string CallContextToString(CallContext context);
 std::ostream& operator<<(std::ostream& out, const CallContext& context);
 
-CallContext GetInstructionCallContext(const HloInstruction* instruction);
+CallContext GetInstructionCallContext(HloOpcode opcode);
 
 // Represents an HLO instruction which calls one or more computations.
 class CallSite {
diff --git a/tensorflow/compiler/xla/service/cpu/BUILD b/tensorflow/compiler/xla/service/cpu/BUILD
index 0faa9e9c41063c5f7576ef5cbd873e8a84a73c28..246b80286189286dd29a306dd0bda495df9dad3e 100644
--- a/tensorflow/compiler/xla/service/cpu/BUILD
+++ b/tensorflow/compiler/xla/service/cpu/BUILD
@@ -18,6 +18,10 @@ load(":build_defs.bzl", "runtime_copts")
 load("//tensorflow:tensorflow.bzl", "tf_cc_test")
 load("//tensorflow:tensorflow.bzl", "tf_cc_binary")
 load("//tensorflow/compiler/xla:xla.bzl", "ORC_JIT_MEMORY_MAPPER_TARGETS")
+load(
+    "//third_party/mkl:build_defs.bzl",
+    "if_mkl",
+)
 
 # Filegroup used to collect source files for dependency checking.
 filegroup(
@@ -170,6 +174,7 @@ cc_library(
         ":runtime_fft",
         ":runtime_fork_join",
         ":runtime_matmul",
+        ":runtime_matmul_mkl",
         ":runtime_single_threaded_conv2d",
         ":runtime_single_threaded_matmul",
         "@llvm//:execution_engine",
@@ -538,6 +543,22 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "runtime_matmul_mkl",
+    srcs = ["runtime_matmul_mkl.cc"],
+    hdrs = ["runtime_matmul_mkl.h"],
+    copts = runtime_copts(),
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/compiler/xla:executable_run_options",
+        "//tensorflow/core:framework_lite",
+        "//third_party/eigen3",
+    ] + if_mkl([
+        "//third_party/mkl:intel_binary_blob",
+        "@mkl_dnn",
+    ]),
+)
+
 cc_library(
     name = "runtime_single_threaded_conv2d",
     srcs = [
@@ -584,10 +605,12 @@ cc_library(
 tf_cc_test(
     name = "cpu_runtime_test",
     srcs = ["cpu_runtime_test.cc"],
+    shard_count = 10,
     tags = ["optonly"],
     deps = [
         ":cpu_runtime",
         ":runtime_matmul",
+        ":runtime_matmul_mkl",
         ":runtime_single_threaded_matmul",
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:types",
@@ -916,17 +939,3 @@ tf_cc_test(
         "//tensorflow/core:test",
     ],
 )
-
-# -----------------------------------------------------------------------------
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_executable.h b/tensorflow/compiler/xla/service/cpu/cpu_executable.h
index 267b89a10b3c038dc2048f0ad5b5b343c88ef0f9..d3502b3a03e27c8f90ed74c4d826dfab1c4e8b75 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_executable.h
+++ b/tensorflow/compiler/xla/service/cpu/cpu_executable.h
@@ -71,11 +71,6 @@ class CpuExecutable : public Executable {
     ir_module_string_ = ir_module_string;
   }
 
-  const Status EqualOrFail(const Executable& executable) {
-    // TODO(b/62952745) Implement equality test on CPU executable.
-    return Unimplemented("Equality test on CPU executable is not implemented.");
-  }
-
   static int64 ShapeSizeBytes(const Shape& shape);
 
   // Type of the computation function we expect in the JIT.
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc b/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc
index 9a3bd68c80c6e8bcdb231c63ba025d1f73619eb7..872b0be1f8a8ec317bf059fd1c4d2550e2ad161a 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc
@@ -37,6 +37,14 @@ extern const char* const kEigenMatMulF32SymbolName =
     "__xla_cpu_runtime_EigenMatMulF32";
 extern const char* const kEigenMatMulF64SymbolName =
     "__xla_cpu_runtime_EigenMatMulF64";
+extern const char* const kMKLMatMulF32SymbolName =
+    "__xla_cpu_runtime_MKLMatMulF32";
+extern const char* const kMKLMatMulF64SymbolName =
+    "__xla_cpu_runtime_MKLMatMulF64";
+extern const char* const kMKLSingleThreadedMatMulF32SymbolName =
+    "__xla_cpu_runtime_MKLSingleThreadedMatMulF32";
+extern const char* const kMKLSingleThreadedMatMulF64SymbolName =
+    "__xla_cpu_runtime_MKLSingleThreadedMatMulF64";
 extern const char* const kEigenConvF16SymbolName =
     "__xla_cpu_runtime_EigenConvF16";
 extern const char* const kEigenConvF32SymbolName =
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_runtime.h b/tensorflow/compiler/xla/service/cpu/cpu_runtime.h
index e61d6ea28b633398863357541e056ee887582f9c..e392e231b4c71b2e206640a47b712de70a148582 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_runtime.h
+++ b/tensorflow/compiler/xla/service/cpu/cpu_runtime.h
@@ -44,6 +44,10 @@ namespace runtime {
 extern const char* const kEigenMatMulF16SymbolName;
 extern const char* const kEigenMatMulF32SymbolName;
 extern const char* const kEigenMatMulF64SymbolName;
+extern const char* const kMKLMatMulF32SymbolName;
+extern const char* const kMKLMatMulF64SymbolName;
+extern const char* const kMKLSingleThreadedMatMulF32SymbolName;
+extern const char* const kMKLSingleThreadedMatMulF64SymbolName;
 extern const char* const kEigenConvF16SymbolName;
 extern const char* const kEigenConvF32SymbolName;
 extern const char* const kEigenFftSymbolName;
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_runtime_test.cc b/tensorflow/compiler/xla/service/cpu/cpu_runtime_test.cc
index f385829cdf5cafbd35e083f47106734cdd5dde88..2ac950e6d93ade315808f2ca1d0bdd7bc85f53b9 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_runtime_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_runtime_test.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/service/cpu/runtime_matmul.h"
+#include "tensorflow/compiler/xla/service/cpu/runtime_matmul_mkl.h"
 #include "tensorflow/compiler/xla/service/cpu/runtime_single_threaded_matmul.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/core/common_runtime/eigen_thread_pool.h"
@@ -130,25 +131,23 @@ MatMulShape MatMulShapes[] = {
 // * transpose_lhs
 // * transpose_rhs
 // * single_threaded
-using EigenMatMulTestParam = std::tuple<MatMulShape, bool, bool, bool>;
+using MatMulTestParam = std::tuple<MatMulShape, bool, bool, bool>;
 
-class EigenMatMulTest
-    : public CpuRuntimeTest,
-      public ::testing::WithParamInterface<EigenMatMulTestParam> {
+class EigenMatMulTest : public CpuRuntimeTest,
+                        public ::testing::WithParamInterface<MatMulTestParam> {
  public:
-  static string Name(
-      const ::testing::TestParamInfo<EigenMatMulTestParam>& info) {
+  static string Name(const ::testing::TestParamInfo<MatMulTestParam>& info) {
     MatMulShape shape = std::get<0>(info.param);
     bool transpose_lhs = std::get<1>(info.param);
     bool transpose_rhs = std::get<2>(info.param);
     bool single_threaded = std::get<3>(info.param);
 
     return tensorflow::strings::Printf(
-        "MatMul_%lld_%lld_%lld_%s%s%s_threaded", shape.m, shape.k, shape.n,
+        "EigenMatMul_%lld_%lld_%lld_%s%s%s_threaded", shape.m, shape.k, shape.n,
         transpose_lhs ? "Tlhs_" : "", transpose_rhs ? "Trhs_" : "",
         single_threaded ? "single" : "multi");
   }
-};  // namespace xla
+};
 
 TEST_P(EigenMatMulTest, DoIt) {
   MatMulShape shape = std::get<0>(GetParam());
@@ -169,5 +168,74 @@ INSTANTIATE_TEST_CASE_P(EigenMatMulTestInstantiaion, EigenMatMulTest,
                                            ::testing::Bool()),
                         EigenMatMulTest::Name);
 
+#ifdef INTEL_MKL
+class MKLMatMulTest : public CpuRuntimeTest,
+                      public ::testing::WithParamInterface<MatMulTestParam> {
+ public:
+  static string Name(const ::testing::TestParamInfo<MatMulTestParam>& info) {
+    MatMulShape shape = std::get<0>(info.param);
+    bool transpose_lhs = std::get<1>(info.param);
+    bool transpose_rhs = std::get<2>(info.param);
+    bool single_threaded = std::get<3>(info.param);
+
+    return tensorflow::strings::Printf(
+        "MKLMatMul_%lld_%lld_%lld_%s%s%s_threaded", shape.m, shape.k, shape.n,
+        transpose_lhs ? "Tlhs_" : "", transpose_rhs ? "Trhs_" : "",
+        single_threaded ? "single" : "multi");
+  }
+};
+
+std::unique_ptr<Array2D<float>> MKLMatrixMultiply(const Array2D<float>& a,
+                                                  const Array2D<float>& b,
+                                                  bool transpose_lhs,
+                                                  bool transpose_rhs,
+                                                  bool single_threaded) {
+  CHECK_EQ(a.width(), b.height());
+  int64 m = a.height();
+  int64 n = b.width();
+  int64 k = a.width();
+
+  // The MKL matmul runtime function expects the matrix to be in column major
+  // order and array2d is in row-major order. Create transposes of a and b. The
+  // 'data' buffer in the transposed array is the original array in column major
+  // order.
+  auto a_transpose = MaybeTransposeArray2D(a, !transpose_lhs);
+  auto b_transpose = MaybeTransposeArray2D(b, !transpose_rhs);
+
+  // Since we're going to transpose c before returning it, swap the order of the
+  // dimension sizes to ensure the returned array is properly dimensioned.
+  auto c_transpose = MakeUnique<Array2D<float>>(n, m);
+  if (single_threaded) {
+    __xla_cpu_runtime_MKLSingleThreadedMatMulF32(
+        nullptr, c_transpose->data(), a_transpose->data(), b_transpose->data(),
+        m, n, k, transpose_lhs, transpose_rhs);
+  } else {
+    __xla_cpu_runtime_MKLMatMulF32(nullptr, c_transpose->data(),
+                                   a_transpose->data(), b_transpose->data(), m,
+                                   n, k, transpose_lhs, transpose_rhs);
+  }
+  return MaybeTransposeArray2D(*c_transpose, true);
+}
+
+TEST_P(MKLMatMulTest, DoIt) {
+  MatMulShape shape = std::get<0>(GetParam());
+  bool transpose_lhs = std::get<1>(GetParam());
+  bool transpose_rhs = std::get<2>(GetParam());
+  bool single_threaded = std::get<3>(GetParam());
+
+  auto a = MakeLinspaceArray2D(0.0, 1.0, shape.m, shape.k);
+  auto b = MakeLinspaceArray2D(-2.0, 2.0, shape.k, shape.n);
+  auto c =
+      MKLMatrixMultiply(*a, *b, transpose_lhs, transpose_rhs, single_threaded);
+  CheckMatrixMultiply(*a, *b, *c);
+}
+
+INSTANTIATE_TEST_CASE_P(MKLMatMulTestInstantiaion, MKLMatMulTest,
+                        ::testing::Combine(::testing::ValuesIn(MatMulShapes),
+                                           ::testing::Bool(), ::testing::Bool(),
+                                           ::testing::Bool()),
+                        MKLMatMulTest::Name);
+#endif  // INTEL_MKL
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc
index 8b1e20d79e90fcc32e985ffb855a1a10cdd2f2b9..29afd8ea5f9822ea9ae969ae035511a58de4888e 100644
--- a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc
+++ b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc
@@ -918,28 +918,35 @@ tensorflow::Status DotOpEmitter::EmitCallToRuntime() {
   // The two transpose_... parameters are actually booleans, but we use int32
   // to avoid target-dependent calling convention details.
 
-  bool multi_threaded_eigen =
+  bool multi_threaded =
       hlo_module_config_.debug_options().xla_cpu_multi_thread_eigen();
+  bool use_mkl_dnn = hlo_module_config_.debug_options().xla_cpu_use_mkl_dnn();
   PrimitiveType type = target_array_.GetShape().element_type();
   llvm::Type* float_type;
   const char* fn_name;
   switch (type) {
     case F16:
-      fn_name = multi_threaded_eigen
+      fn_name = multi_threaded
                     ? runtime::kEigenMatMulF16SymbolName
                     : runtime::kEigenSingleThreadedMatMulF16SymbolName;
       float_type = ir_builder_->getHalfTy();
       break;
     case F32:
-      fn_name = multi_threaded_eigen
-                    ? runtime::kEigenMatMulF32SymbolName
-                    : runtime::kEigenSingleThreadedMatMulF32SymbolName;
+      fn_name = multi_threaded
+                    ? (use_mkl_dnn ? runtime::kMKLMatMulF32SymbolName
+                                   : runtime::kEigenMatMulF32SymbolName)
+                    : (use_mkl_dnn
+                           ? runtime::kMKLSingleThreadedMatMulF32SymbolName
+                           : runtime::kEigenSingleThreadedMatMulF32SymbolName);
       float_type = ir_builder_->getFloatTy();
       break;
     case F64:
-      fn_name = multi_threaded_eigen
-                    ? runtime::kEigenMatMulF64SymbolName
-                    : runtime::kEigenSingleThreadedMatMulF64SymbolName;
+      fn_name = multi_threaded
+                    ? (use_mkl_dnn ? runtime::kMKLMatMulF64SymbolName
+                                   : runtime::kEigenMatMulF64SymbolName)
+                    : (use_mkl_dnn
+                           ? runtime::kMKLSingleThreadedMatMulF64SymbolName
+                           : runtime::kEigenSingleThreadedMatMulF64SymbolName);
       float_type = ir_builder_->getDoubleTy();
       break;
     default:
diff --git a/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.h b/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.h
index c393e9b8ea39bfb4c605ebba8e2cd29726bc4af9..87c0a3df458eb4b3f217192597e0de1576304367 100644
--- a/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.h
+++ b/tensorflow/compiler/xla/service/cpu/parallel_cpu_executable.h
@@ -83,12 +83,6 @@ class ParallelCpuExecutable : public Executable {
     return ShapeUtil::ByteSizeOf(shape, sizeof(void*));
   }
 
-  const Status EqualOrFail(const Executable& executable) {
-    // TODO(b/62952745) Implement equality test on CPU parallel executable.
-    return Unimplemented(
-        "Equality test on CPU parallel executable is not implemented.");
-  }
-
  private:
   // Allocate buffers required for execution and assign them to the elements of
   // "buffers". "buffers" should be sized to the number of buffers in buffer
diff --git a/tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.cc b/tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.cc
index 1e439cde11cf74272101b80c867a308e51ab26a6..54af40506dab48b3c2a3a44eb0b5f5fb213a32ec 100644
--- a/tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.cc
+++ b/tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.cc
@@ -29,7 +29,8 @@ ParallelLoopEmitter::ParallelLoopEmitter(
     : LoopEmitter(target_element_generator, target_array, ir_builder),
       dynamic_loop_bounds_(dynamic_loop_bounds) {}
 
-llvm_ir::IrArray::Index ParallelLoopEmitter::EmitIndexAndSetExitBasicBlock(
+std::vector<llvm_ir::IrArray::Index>
+ParallelLoopEmitter::EmitIndexAndSetExitBasicBlock(
     tensorflow::StringPiece loop_name) {
   CHECK(!ShapeUtil::IsTuple(shape_));
   CHECK(!ShapeUtil::IsScalar(shape_));
@@ -69,7 +70,7 @@ llvm_ir::IrArray::Index ParallelLoopEmitter::EmitIndexAndSetExitBasicBlock(
   exit_bb_ = loop_nest.GetOuterLoopExitBasicBlock();
   CHECK(exit_bb_ != nullptr);
 
-  return array_index;
+  return {array_index};
 }
 
 }  // namespace cpu
diff --git a/tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.h b/tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.h
index ce92e36a944de33b991d97460f0b2e859ad56081..755715634aa70a822b21d25dcae20a8fe053477a 100644
--- a/tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.h
+++ b/tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.h
@@ -60,7 +60,7 @@ class ParallelLoopEmitter : public llvm_ir::LoopEmitter {
   ParallelLoopEmitter& operator=(const ParallelLoopEmitter&) = delete;
   ~ParallelLoopEmitter() override = default;
 
-  llvm_ir::IrArray::Index EmitIndexAndSetExitBasicBlock(
+  std::vector<llvm_ir::IrArray::Index> EmitIndexAndSetExitBasicBlock(
       tensorflow::StringPiece loop_name) override;
 
  private:
diff --git a/tensorflow/compiler/xla/service/cpu/runtime_matmul_mkl.cc b/tensorflow/compiler/xla/service/cpu/runtime_matmul_mkl.cc
new file mode 100644
index 0000000000000000000000000000000000000000..92da5f71c23d5e1450b39ea8b7bb8345f6fabb3b
--- /dev/null
+++ b/tensorflow/compiler/xla/service/cpu/runtime_matmul_mkl.cc
@@ -0,0 +1,128 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifdef INTEL_MKL
+#include "tensorflow/compiler/xla/service/cpu/runtime_matmul_mkl.h"
+#include "third_party/intel_mkl_ml/include/mkl_cblas.h"
+#include "third_party/intel_mkl_ml/include/mkl_service.h"
+
+#include "tensorflow/compiler/xla/executable_run_options.h"
+#include "tensorflow/core/platform/types.h"
+
+#define EIGEN_USE_THREADS
+#include "third_party/eigen3/unsupported/Eigen/CXX11/ThreadPool"
+
+using tensorflow::int32;
+using tensorflow::int64;
+
+namespace {
+// BLAS GEMM API for 32-bit Matrix Multiplication.
+
+// MatMul function is defined as: c = alpha * op(a) * op(b) + beta * c.
+// Since XLA MatMul does not used alpha, beta, we set them to 1.0 and 0.0.
+// Matrix lhs, rhs and out are all colum-major.
+void MatMulF32(const void* run_options_ptr, float* out, float* lhs, float* rhs,
+               int64 m, int64 n, int64 k, int32 transpose_lhs,
+               int32 transpose_rhs) {
+  const float alpha = 1.0f, beta = 0.0f;
+  // lda, ldb, and ldc are the leading dimensions of matrices a, b, and c,
+  // respectively. For column-major matrices, the leading dimension is the
+  // stride between consecutive columns (which equals the number of rows). If
+  // the matrix is transposed, the leading dimension is the stride between
+  // consecutive rows (which equals the number of columns).
+  int lda = transpose_lhs ? k : m;
+  int ldb = transpose_rhs ? n : k;
+  int ldc = m;
+  cblas_sgemm(CblasColMajor, transpose_lhs ? CblasTrans : CblasNoTrans,
+              transpose_rhs ? CblasTrans : CblasNoTrans, m, n, k, alpha, lhs,
+              lda, rhs, ldb, beta, out, ldc);
+}
+
+// BLAS GEMM API for 64-bit Matrix Multiplication.
+
+// MatMul function is defined as: c = alpha * op(a) * op(b) + beta * c.
+// Since XLA MatMul does not used alpha, beta, we set them to 1.0 and 0.0.
+// Matrix lhs, rhs and out are all colum-major.
+void MatMulF64(const void* run_options_ptr, double* out, double* lhs,
+               double* rhs, int64 m, int64 n, int64 k, int32 transpose_lhs,
+               int32 transpose_rhs) {
+  const float alpha = 1.0f, beta = 0.0f;
+  // lda, ldb, and ldc are the leading dimensions of matrices a, b, and c,
+  // respectively. For a column-major matrix, the leading dimension is the
+  // stride between consecutive columns (which equals the number of rows). If
+  // the matrix is transposed, the leading dimension is the stride between
+  // consecutive rows (which equals the number of columns).
+  int lda = transpose_lhs ? k : m;
+  int ldb = transpose_rhs ? n : k;
+  int ldc = m;
+  cblas_dgemm(CblasColMajor, transpose_lhs ? CblasTrans : CblasNoTrans,
+              transpose_rhs ? CblasTrans : CblasNoTrans, m, n, k, alpha, lhs,
+              lda, rhs, ldb, beta, out, ldc);
+}
+
+}  // namespace
+
+void __xla_cpu_runtime_MKLMatMulF32(const void* run_options_ptr, float* out,
+                                    float* lhs, float* rhs, int64 m, int64 n,
+                                    int64 k, int32 transpose_lhs,
+                                    int32 transpose_rhs) {
+  const xla::ExecutableRunOptions* run_options =
+      static_cast<const xla::ExecutableRunOptions*>(run_options_ptr);
+  // BLAS GEMM MatMul uses OpenMP for parallelization, so we pass the thread
+  // number specified in intra_op_thread_pool to MKL.
+  int prev_num_threads = mkl_set_num_threads_local(
+      run_options->intra_op_thread_pool()->numThreads());
+  MatMulF32(nullptr, out, lhs, rhs, m, n, k, transpose_lhs, transpose_rhs);
+  // Set thread number back to the previous number.
+  mkl_set_num_threads_local(prev_num_threads);
+}
+// BLAS GEMM API for 64-bit Matrix Multiplication
+void __xla_cpu_runtime_MKLMatMulF64(const void* run_options_ptr, double* out,
+                                    double* lhs, double* rhs, int64 m, int64 n,
+                                    int64 k, int32 transpose_lhs,
+                                    int32 transpose_rhs) {
+  const xla::ExecutableRunOptions* run_options =
+      static_cast<const xla::ExecutableRunOptions*>(run_options_ptr);
+  // BLAS GEMM MatMul uses OpenMP for parallelization, so we pass the thread
+  // number specified in intra_op_thread_pool to MKL.
+  int prev_num_threads = mkl_set_num_threads_local(
+      run_options->intra_op_thread_pool()->numThreads());
+  MatMulF64(nullptr, out, lhs, rhs, m, n, k, transpose_lhs, transpose_rhs);
+  // Set thread number back to the previous number.
+  mkl_set_num_threads_local(prev_num_threads);
+}
+void __xla_cpu_runtime_MKLSingleThreadedMatMulF32(const void* run_options_ptr,
+                                                  float* out, float* lhs,
+                                                  float* rhs, int64 m, int64 n,
+                                                  int64 k, int32 transpose_lhs,
+                                                  int32 transpose_rhs) {
+  // Set the thread number to 1 for single threaded excution.
+  int prev_num_threads = mkl_set_num_threads_local(1);
+  MatMulF32(nullptr, out, lhs, rhs, m, n, k, transpose_lhs, transpose_rhs);
+  // Set thread number back to the previous number.
+  mkl_set_num_threads_local(prev_num_threads);
+}
+void __xla_cpu_runtime_MKLSingleThreadedMatMulF64(const void* run_options_ptr,
+                                                  double* out, double* lhs,
+                                                  double* rhs, int64 m, int64 n,
+                                                  int64 k, int32 transpose_lhs,
+                                                  int32 transpose_rhs) {
+  // Set the thread number to 1 for single threaded excution.
+  int prev_num_threads = mkl_set_num_threads_local(1);
+  MatMulF64(nullptr, out, lhs, rhs, m, n, k, transpose_lhs, transpose_rhs);
+  // Set thread number back to the previous number.
+  mkl_set_num_threads_local(prev_num_threads);
+}
+#endif  // INTEL_MKL
diff --git a/tensorflow/compiler/xla/service/cpu/runtime_matmul_mkl.h b/tensorflow/compiler/xla/service/cpu/runtime_matmul_mkl.h
new file mode 100644
index 0000000000000000000000000000000000000000..831b796efb971f6fb0170e2321c00ac415f2830f
--- /dev/null
+++ b/tensorflow/compiler/xla/service/cpu/runtime_matmul_mkl.h
@@ -0,0 +1,84 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_RUNTIME_MATMUL_MKL_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_RUNTIME_MATMUL_MKL_H_
+
+#include <iostream>
+#include "tensorflow/core/platform/types.h"
+#ifdef INTEL_MKL
+#include "third_party/intel_mkl_ml/include/mkl_cblas.h"
+
+extern void __xla_cpu_runtime_MKLMatMulF32(
+    const void* /* xla::ExecutableRunOptions* */ run_options_ptr, float* out,
+    float* lhs, float* rhs, tensorflow::int64 m, tensorflow::int64 n,
+    tensorflow::int64 k, tensorflow::int32 transpose_lhs,
+    tensorflow::int32 transpose_rhs);
+extern void __xla_cpu_runtime_MKLMatMulF64(
+    const void* /* xla::ExecutableRunOptions* */ run_options_ptr, double* out,
+    double* lhs, double* rhs, tensorflow::int64 m, tensorflow::int64 n,
+    tensorflow::int64 k, tensorflow::int32 transpose_lhs,
+    tensorflow::int32 transpose_rhs);
+extern void __xla_cpu_runtime_MKLSingleThreadedMatMulF32(
+    const void* /* xla::ExecutableRunOptions* */ run_options_ptr, float* out,
+    float* lhs, float* rhs, tensorflow::int64 m, tensorflow::int64 n,
+    tensorflow::int64 k, tensorflow::int32 transpose_lhs,
+    tensorflow::int32 transpose_rhs);
+extern void __xla_cpu_runtime_MKLSingleThreadedMatMulF64(
+    const void* /* xla::ExecutableRunOptions* */ run_options_ptr, double* out,
+    double* lhs, double* rhs, tensorflow::int64 m, tensorflow::int64 n,
+    tensorflow::int64 k, tensorflow::int32 transpose_lhs,
+    tensorflow::int32 transpose_rhs);
+
+#else
+extern void __xla_cpu_runtime_MKLMatMulF32(
+    const void* /* xla::ExecutableRunOptions* */ run_options_ptr, float* out,
+    float* lhs, float* rhs, tensorflow::int64 m, tensorflow::int64 n,
+    tensorflow::int64 k, tensorflow::int32 transpose_lhs,
+    tensorflow::int32 transpose_rhs) {
+  std::cerr << "Attempt to call MKL MatMul runtime library without defining "
+               "INTEL_MKL. Add --config=mkl to build with MKL.";
+  exit(1);
+}
+extern void __xla_cpu_runtime_MKLMatMulF64(
+    const void* /* xla::ExecutableRunOptions* */ run_options_ptr, double* out,
+    double* lhs, double* rhs, tensorflow::int64 m, tensorflow::int64 n,
+    tensorflow::int64 k, tensorflow::int32 transpose_lhs,
+    tensorflow::int32 transpose_rhs) {
+  std::cerr << "Attempt to call MKL MatMul runtime library without defining "
+               "INTEL_MKL. Add --config=mkl to build with MKL.";
+  exit(1);
+}
+extern void __xla_cpu_runtime_MKLSingleThreadedMatMulF32(
+    const void* /* xla::ExecutableRunOptions* */ run_options_ptr, float* out,
+    float* lhs, float* rhs, tensorflow::int64 m, tensorflow::int64 n,
+    tensorflow::int64 k, tensorflow::int32 transpose_lhs,
+    tensorflow::int32 transpose_rhs) {
+  std::cerr << "Attempt to call MKL MatMul runtime library without defining "
+               "INTEL_MKL. Add --config=mkl to build with MKL.";
+  exit(1);
+}
+extern void __xla_cpu_runtime_MKLSingleThreadedMatMulF64(
+    const void* /* xla::ExecutableRunOptions* */ run_options_ptr, double* out,
+    double* lhs, double* rhs, tensorflow::int64 m, tensorflow::int64 n,
+    tensorflow::int64 k, tensorflow::int32 transpose_lhs,
+    tensorflow::int32 transpose_rhs) {
+  std::cerr << "Attempt to call MKL MatMul runtime library without defining "
+               "INTEL_MKL. Add --config=mkl to build with MKL.";
+  exit(1);
+}
+
+#endif  // INTEL_MKL
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_RUNTIME_MATMUL_MKL_H_
diff --git a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc
index 80c24eaccfc2a83f8f3f311d60860715668d0c08..b7ce5bbe47482320bfb9524c8f366a463b9579ed 100644
--- a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc
+++ b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc
@@ -35,6 +35,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/cpu/runtime_fork_join.h"
 #include "tensorflow/compiler/xla/service/cpu/runtime_fp16.h"
 #include "tensorflow/compiler/xla/service/cpu/runtime_matmul.h"
+#include "tensorflow/compiler/xla/service/cpu/runtime_matmul_mkl.h"
 #include "tensorflow/compiler/xla/service/cpu/runtime_single_threaded_conv2d.h"
 #include "tensorflow/compiler/xla/service/cpu/runtime_single_threaded_matmul.h"
 #include "tensorflow/compiler/xla/service/cpu/windows_compatibility.h"
@@ -87,7 +88,6 @@ SimpleOrcJIT::SimpleOrcJIT(const llvm::TargetOptions& target_options,
                                 /*MAttrs=*/DetectMachineAttributes()))),
       disassembler_(*target_machine_),
       data_layout_(target_machine_->createDataLayout()),
-      execution_session_(string_pool_),
       symbol_resolver_(llvm::orc::createLegacyLookupResolver(
           [this](const std::string& name) -> llvm::JITSymbol {
             return this->ResolveRuntimeSymbol(name);
@@ -184,6 +184,10 @@ bool RegisterKnownJITSymbols() {
   REGISTER_CPU_RUNTIME_SYMBOL(EigenMatMulF16);
   REGISTER_CPU_RUNTIME_SYMBOL(EigenMatMulF32);
   REGISTER_CPU_RUNTIME_SYMBOL(EigenMatMulF64);
+  REGISTER_CPU_RUNTIME_SYMBOL(MKLMatMulF32);
+  REGISTER_CPU_RUNTIME_SYMBOL(MKLMatMulF64);
+  REGISTER_CPU_RUNTIME_SYMBOL(MKLSingleThreadedMatMulF32);
+  REGISTER_CPU_RUNTIME_SYMBOL(MKLSingleThreadedMatMulF64);
   REGISTER_CPU_RUNTIME_SYMBOL(EigenSingleThreadedConvF16);
   REGISTER_CPU_RUNTIME_SYMBOL(EigenSingleThreadedConvF32);
   REGISTER_CPU_RUNTIME_SYMBOL(EigenSingleThreadedMatMulF16);
diff --git a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.h b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.h
index aaeff2de8785b99d271f13b261c63118bcf7bd4a..f4260a95bc45557b6cd969f7d3fff01c8b392575 100644
--- a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.h
+++ b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.h
@@ -102,7 +102,6 @@ class SimpleOrcJIT {
   std::unique_ptr<llvm::TargetMachine> target_machine_;
   const Disassembler disassembler_;
   const llvm::DataLayout data_layout_;
-  llvm::orc::SymbolStringPool string_pool_;
   llvm::orc::ExecutionSession execution_session_;
   std::shared_ptr<llvm::orc::SymbolResolver> symbol_resolver_;
   ObjLayerT object_layer_;
diff --git a/tensorflow/compiler/xla/service/despecializer.cc b/tensorflow/compiler/xla/service/despecializer.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d938f3a2c4b5bfdd70d5a614b9890b4d7bf050f7
--- /dev/null
+++ b/tensorflow/compiler/xla/service/despecializer.cc
@@ -0,0 +1,35 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/despecializer.h"
+
+#include "tensorflow/compiler/xla/service/bfloat16_normalization.h"
+#include "tensorflow/compiler/xla/service/defuser.h"
+#include "tensorflow/compiler/xla/service/implicit_broadcast_remover.h"
+
+namespace xla {
+
+Despecializer::Despecializer() : pipeline_("despecializer") {
+  // TODO(b/70588125): Also deal with window reversal in a fast way.
+  pipeline_.AddPass<Defuser>();
+  pipeline_.AddPass<ImplicitBroadcastRemover>();
+  pipeline_.AddPass<BFloat16MixedPrecisionRemoval>();
+}
+
+StatusOr<bool> Despecializer::Run(HloModule* module) {
+  return pipeline_.Run(module);
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/despecializer.h b/tensorflow/compiler/xla/service/despecializer.h
new file mode 100644
index 0000000000000000000000000000000000000000..af48f4ab6e506d295251239fe92db68cfec6dcfa
--- /dev/null
+++ b/tensorflow/compiler/xla/service/despecializer.h
@@ -0,0 +1,45 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_DESPECIALIZER_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_DESPECIALIZER_H_
+
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
+#include "tensorflow/compiler/xla/service/hlo_pass_pipeline.h"
+#include "tensorflow/compiler/xla/statusor.h"
+
+namespace xla {
+
+// Creates an HloPassPipeline containing multiple HloPasses that can
+// despecialize an optimized HloModule. This is useful to run an HloModule
+// optimized for one specfic platform on a different platform (undoing platform
+// specific passes) with matching numerics for comparison.
+//
+// Current despecialization passes are Defuser, ImplicitBroadcastRemover,
+// and BFloat16MixedPrecisionRemoval.
+class Despecializer : public HloPassInterface {
+ public:
+  Despecializer();
+  tensorflow::StringPiece name() const override { return "despecializer"; }
+  StatusOr<bool> Run(HloModule* module) override;
+
+ private:
+  HloPassPipeline pipeline_;
+};
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_DESPECIALIZER_H_
diff --git a/tensorflow/compiler/xla/service/dfs_hlo_visitor.h b/tensorflow/compiler/xla/service/dfs_hlo_visitor.h
index 56723e765048698baedc50ae7b189d0287ee56b8..3f7089d6ca1e1a3b9bb42028327ba54ba4b93974 100644
--- a/tensorflow/compiler/xla/service/dfs_hlo_visitor.h
+++ b/tensorflow/compiler/xla/service/dfs_hlo_visitor.h
@@ -199,6 +199,7 @@ class DfsHloVisitorBase {
   virtual Status HandleReduce(HloInstructionPtr hlo) = 0;
   virtual Status HandleBitcast(HloInstructionPtr hlo) = 0;
   virtual Status HandleBroadcast(HloInstructionPtr hlo) = 0;
+  virtual Status HandleBroadcastDimOne(HloInstructionPtr hlo) = 0;
   virtual Status HandleReshape(HloInstructionPtr hlo) = 0;
   virtual Status HandleTranspose(HloInstructionPtr hlo) = 0;
   virtual Status HandleParameter(HloInstructionPtr hlo) = 0;
diff --git a/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h b/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h
index ecda5288ee17a3856ce95f0caa327c3524fd180b..e6680ee9b87e1a01782204047c3b2104995c11ed 100644
--- a/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h
+++ b/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h
@@ -35,6 +35,12 @@ class HloInstruction;
 // DfsHloVisitor with default action based on the HloInstruction being visited.
 // Users should not use this class directly, but use the type aliases
 // DfsHloVisitorWithDefault/ConstDfsHloVisitorWithDefault instead.
+//
+// Do *not* add an override to this class if the opcode is covered by
+// HandleElementwiseUnary/Binary. These opcode handlers dispatch to
+// HandleElementwiseUnary/Binary in DfsHloVisitorBase. Adding such a handler
+// here will break passes which rely on the HandleElementwiseUnary/Binary
+// handling these opcodes.
 template <typename HloInstructionPtr>
 class DfsHloVisitorWithDefaultBase
     : public DfsHloVisitorBase<HloInstructionPtr> {
@@ -70,12 +76,6 @@ class DfsHloVisitorWithDefaultBase
   Status HandleConcatenate(HloInstructionPtr concatenate) override {
     return DefaultAction(concatenate);
   }
-  Status HandleConvert(HloInstructionPtr convert) override {
-    return DefaultAction(convert);
-  }
-  Status HandleCopy(HloInstructionPtr copy) override {
-    return DefaultAction(copy);
-  }
   Status HandleSelect(HloInstructionPtr select) override {
     return DefaultAction(select);
   }
@@ -91,9 +91,6 @@ class DfsHloVisitorWithDefaultBase
   Status HandleCrossReplicaSum(HloInstructionPtr crs) override {
     return DefaultAction(crs);
   }
-  Status HandleCompare(HloInstructionPtr compare) override {
-    return DefaultAction(compare);
-  }
   Status HandleRng(HloInstructionPtr random) override {
     return DefaultAction(random);
   }
@@ -161,6 +158,9 @@ class DfsHloVisitorWithDefaultBase
   Status HandleBroadcast(HloInstructionPtr broadcast) override {
     return DefaultAction(broadcast);
   }
+  Status HandleBroadcastDimOne(HloInstructionPtr broadcastDimOne) override {
+    return DefaultAction(broadcastDimOne);
+  }
   Status HandlePad(HloInstructionPtr pad) override {
     return DefaultAction(pad);
   }
diff --git a/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default_test.cc b/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..825e1436f0ec6d49b555e5e3e9c2c7a19fb7b062
--- /dev/null
+++ b/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default_test.cc
@@ -0,0 +1,90 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h"
+
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/service/hlo_runner.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/test_helpers.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+
+namespace xla {
+namespace {
+
+class DfsHloVisitorWithDefaultTest : public HloTestBase {};
+
+TEST_F(DfsHloVisitorWithDefaultTest, DefaultElementwiseTest) {
+  // Verify that HandleElementwiseBinary and HandleElementwiseUnary are called
+  // on the appropriate HLO ops (elementwise binary/unary ops).
+
+  class ElementwiseTestVisitor : public DfsHloVisitorWithDefault {
+   public:
+    Status DefaultAction(HloInstruction* hlo) override {
+      // The HLO should be neither an elementwise unary nor binary op. These
+      // cases are handled in HandleElementwiseBinary/Unary.
+      TF_RET_CHECK(!(hlo->IsElementwise() && hlo->operand_count() == 2))
+          << hlo->ToString();
+      TF_RET_CHECK(!(hlo->IsElementwise() && hlo->operand_count() == 1))
+          << hlo->ToString();
+      return Status::OK();
+    }
+
+    Status HandleElementwiseBinary(HloInstruction* hlo) override {
+      // HLO should be elementwise binary.
+      TF_RET_CHECK(hlo->IsElementwise() && hlo->operand_count() == 2)
+          << hlo->ToString();
+      return Status::OK();
+    }
+    Status HandleElementwiseUnary(HloInstruction* hlo) override {
+      // HLO should be elementwise unary.
+      TF_RET_CHECK(hlo->IsElementwise() && hlo->operand_count() == 1)
+          << hlo->ToString();
+      return Status::OK();
+    }
+  };
+
+  // HLO module contains are arbitrary mix of elementwise and non-elementwise
+  // operations.
+  const string& hlo_string = R"(
+HloModule TestModule
+
+ENTRY TestComputation {
+  arg = f32[] parameter(0)
+  tuple = (f32[]) tuple(arg)
+  gte = f32[] get-tuple-element(tuple), index=0
+  abs = f32[] abs(arg)
+  add = f32[] add(arg, gte)
+  broadcast = f32[42] broadcast(add), dimensions={}
+  slice = f32[0] slice(broadcast), slice={[1:2]}
+  copy = f32[] copy(arg)
+  eq = pred[] equal-to(arg, gte)
+  neg = f32[] negate(arg)
+  ROOT convert = f64[] convert(f32[] arg)
+})";
+  std::unique_ptr<HloModule> module =
+      HloRunner::CreateModuleFromString(hlo_string, GetDebugOptionsForTest())
+          .ConsumeValueOrDie();
+  ElementwiseTestVisitor visitor;
+  TF_EXPECT_OK(module->entry_computation()->Accept(&visitor));
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/executable.cc b/tensorflow/compiler/xla/service/executable.cc
index be92b1629a2d8dae57b315751bd4f7f9ccddf171..471d2fd6cebcd7a00dfea4aca08da08af534b05f 100644
--- a/tensorflow/compiler/xla/service/executable.cc
+++ b/tensorflow/compiler/xla/service/executable.cc
@@ -80,6 +80,7 @@ StatusOr<std::unique_ptr<ShapedBuffer>> Executable::ExecuteOnStreamWrapper(
 
   StatusOr<std::unique_ptr<ShapedBuffer>> return_value =
       ExecuteOnStream(run_options, arguments, profile_ptr.get());
+  TF_RETURN_IF_ERROR(return_value.status());
 
   if (profile != nullptr) {
     VLOG(1) << "enqueueing 'stop timer' and blocking host until done...";
diff --git a/tensorflow/compiler/xla/service/executable.h b/tensorflow/compiler/xla/service/executable.h
index 0aee535ee780ef000bc5e9963ff48786b3a61eb2..a157235f8af6ea64a488510e427bbae502c46ca6 100644
--- a/tensorflow/compiler/xla/service/executable.h
+++ b/tensorflow/compiler/xla/service/executable.h
@@ -22,7 +22,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
 #include "tensorflow/compiler/xla/service/computation_layout.h"
 #include "tensorflow/compiler/xla/service/device_memory_allocator.h"
-#include "tensorflow/compiler/xla/service/hlo_cost_analysis.h"
 #include "tensorflow/compiler/xla/service/hlo_execution_profile.h"
 #include "tensorflow/compiler/xla/service/hlo_graph_dumper.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
@@ -109,14 +108,6 @@ class Executable {
     return execution_profile_;
   }
 
-  // Returns Status::ok() if the two executables are equal to each other.
-  //
-  // An error status is returned otherwise.
-  virtual const Status EqualOrFail(const Executable& executable) {
-    return Unimplemented(
-        "Equality test on this executable is not implemented.");
-  }
-
   const HloProfilePrinterData& hlo_profile_printer_data() const {
     CHECK(hlo_profiling_enabled());
     return *hlo_profile_printer_data_;
diff --git a/tensorflow/compiler/xla/service/flatten_call_graph.cc b/tensorflow/compiler/xla/service/flatten_call_graph.cc
index 2b6caa149439a86d6d047605099bc3ff7b295a8e..85409b330b11537158059dcce8c2a96c98d38f30 100644
--- a/tensorflow/compiler/xla/service/flatten_call_graph.cc
+++ b/tensorflow/compiler/xla/service/flatten_call_graph.cc
@@ -93,7 +93,7 @@ Status FlattenNode(const CallGraphNode& node) {
       auto current = worklist.back();
       worklist.pop_back();
       for (auto* instruction : current->instructions()) {
-        if (GetInstructionCallContext(instruction) !=
+        if (GetInstructionCallContext(instruction->opcode()) !=
             CallContext::kSequential) {
           continue;
         }
diff --git a/tensorflow/compiler/xla/service/gpu/BUILD b/tensorflow/compiler/xla/service/gpu/BUILD
index 93b2f2a4748932e50ce40e8a2f573af922dea8d1..f1707442fe3354d5183d905468810f3871146ff5 100644
--- a/tensorflow/compiler/xla/service/gpu/BUILD
+++ b/tensorflow/compiler/xla/service/gpu/BUILD
@@ -700,17 +700,3 @@ tf_cc_test(
         "//tensorflow/core:test",
     ],
 )
-
-# -----------------------------------------------------------------------------
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.cc b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.cc
index 1792893ae401bf16d2dd9e861607e8f3821a505e..d6b457a91b9e9edec222b9e25f5105d43b97aae8 100644
--- a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.cc
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.cc
@@ -94,11 +94,17 @@ se::port::StatusOr<se::DeviceMemory<uint8>> ScratchAllocator::AllocateBytes(
 // Determines whether we can safely perform a winograd non-fused convolution for
 // the given input and output shapes.  This works around b/68264959, an integer
 // overflow in cuDNNv5 and cuDNNv6.
-//
-// TODO(jlebar): We shouldn't need this check for cuDNNv7.
-bool ShouldIncludeWinogradNonfusedAlgo(
-    const Shape& input_shape, const Shape& output_shape,
-    const ConvolutionDimensionNumbers& dnums) {
+bool ShouldIncludeWinogradNonfusedAlgo(const Shape& input_shape,
+                                       const Shape& output_shape,
+                                       const ConvolutionDimensionNumbers& dnums,
+                                       se::StreamExecutor* stream_exec) {
+  // Skip this check for cudnn7 and newer.
+  se::port::StatusOr<std::tuple<int, int, int>> version =
+      stream_exec->AsDnn()->GetVersion();
+  if (version.ok() && std::get<0>(version.ValueOrDie()) >= 7) {
+    return true;
+  }
+
   int64 batch = input_shape.dimensions(dnums.input_batch_dimension());
   int64 in_depths = input_shape.dimensions(dnums.input_feature_dimension());
   int64 in_rows = input_shape.dimensions(dnums.input_spatial_dimensions(0));
@@ -118,20 +124,20 @@ bool ShouldIncludeWinogradNonfusedAlgo(
 
 std::vector<AlgorithmDesc> GetAlgorithms(CudnnConvKind kind,
                                          bool with_winograd_nonfused,
-                                         se::StreamExecutor* stream_exec_) {
+                                         se::StreamExecutor* stream_exec) {
   std::vector<AlgorithmDesc> algorithms;
   switch (kind) {
     case CudnnConvKind::kBackwardFilter:
-      CHECK(stream_exec_->GetConvolveBackwardFilterAlgorithms(
+      CHECK(stream_exec->GetConvolveBackwardFilterAlgorithms(
           with_winograd_nonfused, &algorithms));
       break;
     case CudnnConvKind::kBackwardInput:
-      CHECK(stream_exec_->GetConvolveBackwardDataAlgorithms(
+      CHECK(stream_exec->GetConvolveBackwardDataAlgorithms(
           with_winograd_nonfused, &algorithms));
       break;
     case CudnnConvKind::kForward:
-      CHECK(stream_exec_->GetConvolveAlgorithms(with_winograd_nonfused,
-                                                &algorithms));
+      CHECK(stream_exec->GetConvolveAlgorithms(with_winograd_nonfused,
+                                               &algorithms));
       break;
   }
 
@@ -209,8 +215,8 @@ CudnnConvolutionAlgorithmPicker::PickBestAlgorithm(
     return nullopt;
   }
 
-  const bool use_winograd_nonfused =
-      ShouldIncludeWinogradNonfusedAlgo(input_shape, output_shape, dnums);
+  const bool use_winograd_nonfused = ShouldIncludeWinogradNonfusedAlgo(
+      input_shape, output_shape, dnums, stream_exec_);
   se::dnn::ProfileResult best_result;
   int64 best_result_bytes_used = 0;
 
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
index 04b37d913e0bc8f8226057f107da05fd1e675010..28f93447953b90d8a7fa4386e2355066c0405aec 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
@@ -267,16 +267,22 @@ StatusOr<std::unique_ptr<ShapedBuffer>> GpuExecutable::ExecuteOnStream(
        ++i) {
     const BufferAllocation& allocation = assignment_->GetAllocation(i);
     if (allocation.is_entry_computation_parameter()) {
-      // The caller must give us a buffer for ShapeIndex {} of every parameter.
-      // It can optionally give us a buffer for other ShapeIndices, but we
-      // ignore them: Because we can't rely on these sub-buffers' addresses
-      // being available, our generated code can't use them.  Instead, it must
-      // chase pointers starting at the tuple root.
-      if (allocation.param_shape_index().empty()) {
-        auto param_no = allocation.parameter_number();
-        buffer_allocations_builder.RegisterBuffer(
-            i, arguments[param_no]->root_buffer());
+      auto param_no = allocation.parameter_number();
+      se::DeviceMemoryBase buffer =
+          arguments[param_no]->buffer(allocation.param_shape_index());
+
+      // All top-level buffers and sub-buffers must have an explicit, non-null
+      // pointer, except for zero-sized buffers, which may be null.
+      if (buffer.is_null() && buffer.size() > 0) {
+        return FailedPrecondition(
+            "Cannot run XLA computation because pointer to (sub-)buffer at "
+            "index %s of parameter %lld was null.  All pointers to "
+            "(sub-)buffers must not be null, unless the (sub-)buffer has zero "
+            "elements.",
+            allocation.param_shape_index().ToString().c_str(), param_no);
       }
+
+      buffer_allocations_builder.RegisterBuffer(i, buffer);
     }
   }
   se::StreamExecutor* executor = run_options->stream()->parent();
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.h b/tensorflow/compiler/xla/service/gpu/gpu_executable.h
index b19cfd43debd0a5490495d176fa2f1fcd625da07..dcb3991f41a31db84d8e9e555ae7d13c3ac84b97 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_executable.h
+++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.h
@@ -83,11 +83,6 @@ class GpuExecutable : public Executable {
       const ServiceExecutableRunOptions* run_options,
       tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments) override;
 
-  const Status EqualOrFail(const Executable& executable) {
-    // TODO(b/62952745) Implement equality test on GPU executable.
-    return Unimplemented("Equality test on GPU executable is not implemented.");
-  }
-
  private:
   // If `block_host_until_done` is false, execution will not block the host
   // until the kernels have completed. This is used as an optimization for
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
index 199e6b787413c5e0fb1435c62f1fc3b83fc6eba3..26e497762f2a6f23767c5b98f339eefdef0b7468 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
@@ -145,37 +145,6 @@ void UpdateLaunchDimensions(const LaunchDimensions& launch_dims, Thunk* thunk,
        llvm::ConstantAsMetadata::get(threads_per_block_ir_value)}));
 }
 
-// Tries to get a Slice for the given instruction at the given index, but
-// returns nullopt if we might not know the slice's address at runtime without
-// dereferencing a containing tuple.
-//
-// In particular, when XLA accepts a parameter of tuple type, the caller has the
-// option of telling XLA what are the values inside of the tuple, or just giving
-// XLA a pointer to the top-level tuple and letting us chase the pointers on the
-// GPU.  We therefore cannot rely having these pointers to parameter sub-buffers
-// being present when we run the program.
-optional<BufferAllocation::Slice> GetKnownAtRuntimeSlice(
-    const HloInstruction* instr, const ShapeIndex& index,
-    const BufferAssignment& buffer_assn) {
-  auto maybe_slice = buffer_assn.GetUniqueSlice(instr, index);
-  if (!maybe_slice.ok()) {
-    return nullopt;
-  }
-  // BufferAllocation gives a slice and alloc to every buffer accessed by XLA,
-  // but we don't necessarily know the runtime address of sub-buffers of input
-  // parameters.
-  const BufferAllocation::Slice& slice = maybe_slice.ValueOrDie();
-  const BufferAllocation* alloc = slice.allocation();
-  if (alloc->IsInputOrOutput() && !alloc->maybe_live_out() &&
-      !alloc->param_shape_index().empty()) {
-    return nullopt;
-  }
-
-  // Otherwise, we will know the address of this slice at runtime without having
-  // to dereference a tuple.
-  return slice;
-}
-
 }  // namespace
 
 IrEmitterUnnested::IrEmitterUnnested(const HloModuleConfig& hlo_module_config,
@@ -206,7 +175,7 @@ bool ImplementedAsHostToDeviceMemcpy(const BufferAssignment& buffer_assignment,
   return hlo.opcode() == HloOpcode::kCopy &&
          hlo.operand(0)->opcode() == HloOpcode::kConstant &&
          ShapeUtil::Equal(hlo.operand(0)->shape(), hlo.shape()) &&
-         GetKnownAtRuntimeSlice(&hlo, {}, buffer_assignment).has_value();
+         buffer_assignment.GetUniqueTopLevelSlice(&hlo).ok();
 }
 
 bool ImplementedAsDeviceToDeviceMemcpy(
@@ -216,13 +185,13 @@ bool ImplementedAsDeviceToDeviceMemcpy(
   //
   // 1. `hlo` is a kCopy instruction.
   // 2. `hlo` and its operand have the same shape (thus the same layout too).
-  // 3. The operand to `hlo` has a buffer assignment (constants do not, for
-  //    instance) which means the source buffer also resides on the device.
+  // 3. `hlo` and its operand have a statically-known buffer assignment
+  //     (constants do not, for instance), which means the source buffer also
+  //     resides on the device.
   return hlo.opcode() == HloOpcode::kCopy &&
          ShapeUtil::Equal(hlo.operand(0)->shape(), hlo.shape()) &&
-         GetKnownAtRuntimeSlice(&hlo, {}, buffer_assignment).has_value() &&
-         GetKnownAtRuntimeSlice(hlo.operand(0), {}, buffer_assignment)
-             .has_value();
+         buffer_assignment.GetUniqueTopLevelSlice(&hlo).ok() &&
+         buffer_assignment.GetUniqueTopLevelSlice(hlo.operand(0)).ok();
 }
 }  // namespace
 
@@ -567,7 +536,27 @@ Status IrEmitterUnnested::HandleFusion(HloInstruction* fusion) {
     thunk_sequence_->emplace_back(BuildGemmThunk(fusion));
     return Status::OK();
   }
-  thunk_sequence_->emplace_back(BuildKernelThunk(fusion));
+
+  int max_unroll_factor = fusion->GetModule()
+                              ->config()
+                              .debug_options()
+                              .xla_gpu_max_kernel_unroll_factor();
+
+  // Find the largest possible power of two to unroll by.
+  // TODO(kramerb): Make this smarter.
+  int unroll_factor = 1;
+  if (!fusion->IsMultiOutputFusion()) {
+    CHECK(fusion->fusion_kind() == HloInstruction::FusionKind::kLoop);
+    int64 num_elements = ShapeUtil::ElementsIn(fusion->shape());
+    for (int i = max_unroll_factor; i > 1; i /= 2) {
+      if (num_elements % i == 0) {
+        unroll_factor = i;
+        break;
+      }
+    }
+  }
+
+  thunk_sequence_->emplace_back(BuildKernelThunk(fusion, unroll_factor));
   return IrEmitter::HandleFusion(fusion);
 }
 
@@ -1959,49 +1948,54 @@ GetHloBufferSlices(const HloInstruction* hlo,
       -> optional<std::pair<BufferAllocation::Slice, ShapeIndex>> {
     // Simple, common case: Is the buffer for instr known at runtime?  If so,
     // we're done.
-    auto slice = GetKnownAtRuntimeSlice(instr, index, buffer_assn);
-    if (slice.has_value()) {
-      return {{*slice, ShapeIndex()}};
-    }
-
-    // If we don't know the buffer for instr at index, see if we know the buffer
-    // for instr at index without its last element.  If so, we can dynamically
-    // find the buffer for instr by dereferencing a pointer in that buffer.
-    // Continue looking this way until we run out of elements in 'index'.
-    ShapeIndex new_index = index;
-    ShapeIndex gte_indices;
-    while (!new_index.empty()) {
-      gte_indices.push_front(new_index.back());
-      new_index.pop_back();
-      auto slice = GetKnownAtRuntimeSlice(instr, new_index, buffer_assn);
-      if (slice.has_value()) {
-        return {{*slice, gte_indices}};
-      }
+    auto slice = buffer_assn.GetUniqueSlice(instr, index);
+    if (slice.ok()) {
+      return {{slice.ValueOrDie(), ShapeIndex()}};
     }
 
-    // If *that* didn't work, walk up any bitcasts that we might see.  These
-    // must appear before any GTE instructions, because it's illegal to bitcast
-    // to a tuple type.
+    // If that didn't work, walk up any bitcasts that we might see.  These must
+    // appear before any GTE instructions, because it's illegal to bitcast to a
+    // tuple type.
     const HloInstruction* parent = instr;
     while (parent->opcode() == HloOpcode::kBitcast) {
       parent = parent->operand(0);
 
-      auto slice = GetKnownAtRuntimeSlice(parent, {}, buffer_assn);
-      if (slice.has_value()) {
-        return {{*slice, gte_indices}};
+      auto slice = buffer_assn.GetUniqueSlice(parent, {});
+      if (slice.ok()) {
+        return {{slice.ValueOrDie(), ShapeIndex()}};
       }
     }
 
-    // Finally, check whether instr is a GTE instruction.  If it is, see if we
-    // can get a buffer for its parent, and continue walking up parents until we
-    // find a defined buffer or we hit something that's not a GTE.
+    // Check whether instr is a GTE instruction.  If it is, see if we can get a
+    // buffer for its parent, and continue walking up parents until we find a
+    // defined buffer or we hit something that's not a GTE.
+    ShapeIndex gte_indices;
     while (parent->opcode() == HloOpcode::kGetTupleElement) {
       gte_indices.push_front(parent->tuple_index());
       parent = parent->operand(0);
 
-      auto slice = GetKnownAtRuntimeSlice(parent, {}, buffer_assn);
-      if (slice.has_value()) {
-        return {{*slice, gte_indices}};
+      auto slice = buffer_assn.GetUniqueSlice(parent, {});
+      if (slice.ok()) {
+        return {{slice.ValueOrDie(), gte_indices}};
+      }
+    }
+
+    // Finally, if we don't know the buffer for instr at index, see if we know
+    // the buffer for instr at index without its last element.  If so, we can
+    // dynamically find the buffer for instr by dereferencing a pointer in that
+    // buffer.  Continue looking this way until we run out of elements in
+    // 'index'.
+    //
+    // We can almost always get a buffer without resorting to this.  The only
+    // exception is for cases where the relevant sub-buffer is truly unknowable,
+    // for example the sub-buffer of a tuple-shaped select.
+    ShapeIndex new_index = index;
+    while (!new_index.empty()) {
+      gte_indices.push_front(new_index.back());
+      new_index.pop_back();
+      auto slice = buffer_assn.GetUniqueSlice(instr, new_index);
+      if (slice.ok()) {
+        return {{slice.ValueOrDie(), gte_indices}};
       }
     }
 
@@ -2047,7 +2041,7 @@ Status IrEmitterUnnested::HandleGather(HloInstruction* gather) {
 }
 
 std::unique_ptr<KernelThunk> IrEmitterUnnested::BuildKernelThunk(
-    const HloInstruction* inst) {
+    const HloInstruction* inst, int unroll_factor) {
   const BufferAssignment& buffer_assn =
       ir_emitter_context_->buffer_assignment();
 
@@ -2139,7 +2133,7 @@ std::unique_ptr<KernelThunk> IrEmitterUnnested::BuildKernelThunk(
   }
 
   return MakeUnique<KernelThunk>(buffers, llvm_ir::AsString(kernel->getName()),
-                                 inst);
+                                 inst, unroll_factor);
 }
 
 std::unique_ptr<Thunk> IrEmitterUnnested::BuildHostToDeviceCopyThunk(
@@ -2511,21 +2505,28 @@ std::unique_ptr<Thunk> IrEmitterUnnested::BuildConditionalThunk(
 Status IrEmitterUnnested::EmitTargetElementLoopInThunk(
     const HloInstruction& hlo,
     const llvm_ir::ElementGenerator& element_generator, KernelThunk* thunk) {
+  int unroll_factor = thunk->unroll_factor();
   VLOG(3) << bindings_.ToString();
 
   const Shape& element_shape = hlo.IsMultiOutputFusion()
                                    ? ShapeUtil::GetSubshape(hlo.shape(), {0})
                                    : hlo.shape();
+  VLOG(3) << "EmitTargetElementLoopInThunk "
+          << ShapeUtil::HumanStringWithLayout(hlo.shape())
+          << " for unroll_factor " << unroll_factor;
   LaunchDimensions launch_dimensions = CalculateLaunchDimensions(
-      element_shape, ir_emitter_context_->device_description());
+      element_shape, ir_emitter_context_->device_description(), unroll_factor);
   UpdateLaunchDimensions(launch_dimensions, thunk,
                          ir_emitter_context_->llvm_module());
   if (!hlo.IsMultiOutputFusion()) {
     return ParallelLoopEmitter(element_generator, GetIrArray(hlo, hlo),
-                               launch_dimensions, &ir_builder_)
+                               launch_dimensions, &ir_builder_, unroll_factor)
         .EmitLoop(IrName(&hlo));
   }
 
+  CHECK_EQ(unroll_factor, 1)
+      << "multi-output fusion does not support unrolling";
+
   // For multiple outputs fusion, we need to emit each operand and the root.
   std::vector<llvm_ir::IrArray> output_arrays;
   for (int64 i = 0; i < ShapeUtil::TupleElementCount(hlo.shape()); ++i) {
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h
index 66c62e2d2de3ed1668271a21943dc73ed3d77651..b842f480c6257c1a8bee8cdac55e29c5db6801a0 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h
@@ -150,8 +150,10 @@ class IrEmitterUnnested : public IrEmitter {
 
   // Returns a KernelThunk that invokes the kernel emitted for `inst`. The
   // caller needs to make sure `inst` outlives the lifetime of the returned
-  // Thunk object.
-  std::unique_ptr<KernelThunk> BuildKernelThunk(const HloInstruction* inst);
+  // Thunk object. The kernel implementation will be unrolled if unroll_factor
+  // is greater than one.
+  std::unique_ptr<KernelThunk> BuildKernelThunk(const HloInstruction* inst,
+                                                int unroll_factor = 1);
 
   // Returns a FftThunk that calls cuFFT to implement `inst`.
   std::unique_ptr<Thunk> BuildFftThunk(const HloInstruction* inst);
diff --git a/tensorflow/compiler/xla/service/gpu/kernel_thunk.cc b/tensorflow/compiler/xla/service/gpu/kernel_thunk.cc
index c20a781a33fe89af4740ed31dd5bfb1a64473057..c24dc1457f83c7557430a69baf806ed05b45adca 100644
--- a/tensorflow/compiler/xla/service/gpu/kernel_thunk.cc
+++ b/tensorflow/compiler/xla/service/gpu/kernel_thunk.cc
@@ -30,10 +30,12 @@ namespace gpu {
 
 KernelThunk::KernelThunk(
     tensorflow::gtl::ArraySlice<const BufferAllocation*> args,
-    const string& kernel_name, const HloInstruction* hlo_instruction)
+    const string& kernel_name, const HloInstruction* hlo_instruction,
+    int unroll_factor)
     : Thunk(Kind::kKernel, hlo_instruction),
       args_(args.begin(), args.end()),
-      kernel_name_(kernel_name) {}
+      kernel_name_(kernel_name),
+      unroll_factor_(unroll_factor) {}
 
 tensorflow::Status KernelThunk::Initialize(const GpuExecutable& executable) {
   tensorflow::mutex_lock lock(mutex_);
diff --git a/tensorflow/compiler/xla/service/gpu/kernel_thunk.h b/tensorflow/compiler/xla/service/gpu/kernel_thunk.h
index 9ae455e2fcc253a7a08ff95764721048a16b0bf7..df8971b083fe70588f8c32f977981e365d78fdb8 100644
--- a/tensorflow/compiler/xla/service/gpu/kernel_thunk.h
+++ b/tensorflow/compiler/xla/service/gpu/kernel_thunk.h
@@ -47,12 +47,14 @@ class KernelThunk : public Thunk {
   //
   // `hlo_instruction` is as in Thunk. Other arguments are as the class members.
   KernelThunk(tensorflow::gtl::ArraySlice<const BufferAllocation*> args,
-              const string& kernel_name, const HloInstruction* hlo_instruction);
+              const string& kernel_name, const HloInstruction* hlo_instruction,
+              int unroll_factor);
   KernelThunk(const KernelThunk&) = delete;
   KernelThunk& operator=(const KernelThunk&) = delete;
   ~KernelThunk() override = default;
 
   const string& kernel_name() const { return kernel_name_; }
+  int unroll_factor() const { return unroll_factor_; }
   void SetLaunchDimensions(const LaunchDimensions& launch_dims);
 
   tensorflow::Status Initialize(const GpuExecutable& executable) override;
@@ -69,6 +71,10 @@ class KernelThunk : public Thunk {
   // Entry kernel name for the computation.
   const string kernel_name_;
 
+  // The number of times this kernel should be unrolled. This works as a
+  // multiplier on the number of elements produced by a GPU thread.
+  const int unroll_factor_;
+
   // The thread and block dimension used to launch the kernel.
   // Will be set by IrEmitterUnnested.
   LaunchDimensions launch_dimensions_;
diff --git a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/BUILD b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/BUILD
index f4c4dcdafd6cc0cd64da5a8d1f23c8c0e7b2a9cb..86c4ac18b0501c38aaaae5a007bddcf261ca338f 100644
--- a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/BUILD
+++ b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/BUILD
@@ -68,17 +68,3 @@ tf_cc_test(
         "@llvm//:support",
     ],
 )
-
-# -----------------------------------------------------------------------------
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.cc b/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.cc
index 388dcc008b07a76ff9ed07df04181e49a8734f51..d8c07dc3119fb81a3ef22822acb11b7c4d5bbca5 100644
--- a/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.cc
+++ b/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.cc
@@ -32,25 +32,32 @@ namespace gpu {
 
 ParallelLoopEmitter::ParallelLoopEmitter(
     BodyEmitter body_emitter, const Shape& shape,
-    const LaunchDimensions& launch_dimensions, llvm::IRBuilder<>* ir_builder)
+    const LaunchDimensions& launch_dimensions, llvm::IRBuilder<>* ir_builder,
+    int unroll_factor)
     : LoopEmitter(body_emitter, shape, ir_builder),
-      launch_dimensions_(launch_dimensions) {}
+      launch_dimensions_(launch_dimensions),
+      unroll_factor_(unroll_factor) {}
 
 ParallelLoopEmitter::ParallelLoopEmitter(
     const llvm_ir::ElementGenerator& target_element_generator,
     tensorflow::gtl::ArraySlice<llvm_ir::IrArray> target_arrays,
-    const LaunchDimensions& launch_dimensions, llvm::IRBuilder<>* ir_builder)
+    const LaunchDimensions& launch_dimensions, llvm::IRBuilder<>* ir_builder,
+    int unroll_factor)
     : LoopEmitter(target_element_generator, target_arrays, ir_builder),
-      launch_dimensions_(launch_dimensions) {}
+      launch_dimensions_(launch_dimensions),
+      unroll_factor_(unroll_factor) {}
 
 ParallelLoopEmitter::ParallelLoopEmitter(
     const llvm_ir::ElementGenerator& target_element_generator,
     const llvm_ir::IrArray& target_array,
-    const LaunchDimensions& launch_dimensions, llvm::IRBuilder<>* ir_builder)
+    const LaunchDimensions& launch_dimensions, llvm::IRBuilder<>* ir_builder,
+    int unroll_factor)
     : LoopEmitter(target_element_generator, target_array, ir_builder),
-      launch_dimensions_(launch_dimensions) {}
+      launch_dimensions_(launch_dimensions),
+      unroll_factor_(unroll_factor) {}
 
-llvm_ir::IrArray::Index ParallelLoopEmitter::EmitIndexAndSetExitBasicBlock(
+std::vector<llvm_ir::IrArray::Index>
+ParallelLoopEmitter::EmitIndexAndSetExitBasicBlock(
     tensorflow::StringPiece loop_name) {
   // Emit the following code in LLVM IR:
   //   linear_index = blockIdx.x * blockDim.x + threadIdx.x;
@@ -63,6 +70,9 @@ llvm_ir::IrArray::Index ParallelLoopEmitter::EmitIndexAndSetExitBasicBlock(
   //   "It is guaranteed that [...] 0  <=  %ctaid.x <  %nctaid.x"
   //
   // %nctaid.x is currently specified as 2147483647.
+  VLOG(3) << "EmitIndexAndSetExitBasicBlock unroll_factor " << unroll_factor_;
+  std::vector<llvm_ir::IrArray::Index> array_indices;
+
   llvm::Value* block_id = llvm_ir::EmitCallToIntrinsic(
       llvm::Intrinsic::nvvm_read_ptx_sreg_ctaid_x, {}, {}, ir_builder_);
   llvm_ir::AddRangeMetadata(0, launch_dimensions_.block_count(),
@@ -81,7 +91,7 @@ llvm_ir::IrArray::Index ParallelLoopEmitter::EmitIndexAndSetExitBasicBlock(
   thread_id = ir_builder_->CreateZExt(thread_id, ir_builder_->getInt64Ty(),
                                       "thread_id");
 
-  llvm::Value* linear_index = ir_builder_->CreateAdd(
+  llvm::Value* linear_index_base = ir_builder_->CreateAdd(
       ir_builder_->CreateMul(
           block_id,
           ir_builder_->getInt64(launch_dimensions_.threads_per_block()), "",
@@ -99,15 +109,30 @@ llvm_ir::IrArray::Index ParallelLoopEmitter::EmitIndexAndSetExitBasicBlock(
   llvm_ir::EmitCallToIntrinsic(
       llvm::Intrinsic::assume,
       {ir_builder_->CreateICmpULT(
-          linear_index,
+          linear_index_base,
           ir_builder_->getInt64(launch_dimensions_.threads_per_block() *
                                 launch_dimensions_.block_count()),
           "linear_index_in_range")},
       {}, ir_builder_);
 
+  if (unroll_factor_ > 1) {
+    linear_index_base = ir_builder_->CreateMul(
+        linear_index_base, ir_builder_->getInt64(unroll_factor_),
+        "linear_index_base", /*HasNUW=*/true, /*HasNSW=*/true);
+  }
+
+  array_indices.emplace_back(linear_index_base, shape_, ir_builder_);
+  for (int i = 1; i < unroll_factor_; ++i) {
+    llvm::Value* linear_index = ir_builder_->CreateAdd(
+        linear_index_base, ir_builder_->getInt64(i), "linear_index",
+        /*HasNUW=*/true, /*HasNSW=*/true);
+    array_indices.emplace_back(linear_index, shape_, ir_builder_);
+  }
+
   auto if_in_bounds = llvm_ir::EmitIfThenElse(
       ir_builder_->CreateICmpULT(
-          linear_index, ir_builder_->getInt64(ShapeUtil::ElementsIn(shape_))),
+          linear_index_base,
+          ir_builder_->getInt64(ShapeUtil::ElementsIn(shape_))),
       llvm_ir::IrName(loop_name, "in_bounds"), ir_builder_, false);
 
   // Set exit_bb_ to the exit block of the if structure.
@@ -116,7 +141,8 @@ llvm_ir::IrArray::Index ParallelLoopEmitter::EmitIndexAndSetExitBasicBlock(
 
   // Set IR builder insertion point to the body of the if structure.
   llvm_ir::SetToFirstInsertPoint(if_in_bounds.true_block, ir_builder_);
-  return llvm_ir::IrArray::Index(linear_index, shape_, ir_builder_);
+
+  return array_indices;
 }
 
 }  // namespace gpu
diff --git a/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.h b/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.h
index 8ed63a854a74fc06c3c389f40fe1f5970885deac..25318b3bed8bf4a2dfe3a4a974269d0405c3bfec 100644
--- a/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.h
+++ b/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.h
@@ -34,13 +34,13 @@ class ParallelLoopEmitter : public llvm_ir::LoopEmitter {
   // The meanings of other parameters are the same as LoopEmitter.
   ParallelLoopEmitter(BodyEmitter body_emitter, const Shape& shape,
                       const LaunchDimensions& launch_dimensions,
-                      llvm::IRBuilder<>* ir_builder);
+                      llvm::IRBuilder<>* ir_builder, int unroll_factor = 1);
   // Constructs a ParallelLoopEmitter from an element generator that generates
   // each element of the given target array.
   ParallelLoopEmitter(const llvm_ir::ElementGenerator& target_element_generator,
                       const llvm_ir::IrArray& target_array,
                       const LaunchDimensions& launch_dimensions,
-                      llvm::IRBuilder<>* ir_builder);
+                      llvm::IRBuilder<>* ir_builder, int unroll_factor = 1);
 
   // Constructs a loop emitter for a loop that generates on element of each of N
   // arrays on each iteration.
@@ -50,18 +50,20 @@ class ParallelLoopEmitter : public llvm_ir::LoopEmitter {
   ParallelLoopEmitter(
       const llvm_ir::ElementGenerator& target_element_generator,
       tensorflow::gtl::ArraySlice<llvm_ir::IrArray> target_arrays,
-      const LaunchDimensions& launch_dimensions, llvm::IRBuilder<>* ir_builder);
+      const LaunchDimensions& launch_dimensions, llvm::IRBuilder<>* ir_builder,
+      int unroll_factor = 1);
 
   ParallelLoopEmitter(const ParallelLoopEmitter&) = delete;
   ParallelLoopEmitter& operator=(const ParallelLoopEmitter&) = delete;
   ~ParallelLoopEmitter() override = default;
 
-  llvm_ir::IrArray::Index EmitIndexAndSetExitBasicBlock(
+  std::vector<llvm_ir::IrArray::Index> EmitIndexAndSetExitBasicBlock(
       tensorflow::StringPiece loop_name) override;
 
  private:
   // The thread and block dimension to parallelize the loop on.
   const LaunchDimensions launch_dimensions_;
+  const int unroll_factor_;
 };
 
 }  // namespace gpu
diff --git a/tensorflow/compiler/xla/service/gpu/partition_assignment.cc b/tensorflow/compiler/xla/service/gpu/partition_assignment.cc
index 6cf280df05496716a0780d61ded92efd9982734c..5283d51cd10668c43c5ad1c1fb11049555bff5d4 100644
--- a/tensorflow/compiler/xla/service/gpu/partition_assignment.cc
+++ b/tensorflow/compiler/xla/service/gpu/partition_assignment.cc
@@ -44,12 +44,16 @@ std::ostream& operator<<(std::ostream& out,
 
 // Calculates the launch dimensions used to invoke `hlo`.
 LaunchDimensions CalculateLaunchDimensions(
-    const Shape& shape, const se::DeviceDescription& device_desc) {
+    const Shape& shape, const se::DeviceDescription& device_desc,
+    int unroll_factor) {
   int64 num_elements = ShapeUtil::ElementsIn(shape);
   if (num_elements <= 1) {
     return LaunchDimensions();
   }
 
+  CHECK_EQ(num_elements % unroll_factor, 0);
+  num_elements = num_elements / unroll_factor;
+
   // Since we don't do any inter-warp communication, we're free to choose any
   // block size we want, subject to hardware constraints.  We choose the
   // smallest block size that allows the GPU to reach full occupancy (assuming
diff --git a/tensorflow/compiler/xla/service/gpu/partition_assignment.h b/tensorflow/compiler/xla/service/gpu/partition_assignment.h
index 0bf463a6ef95d5a32784838c08ad239752fd1acf..42d2d2af2e334da7c42419cb07a2bd5bb9d209d6 100644
--- a/tensorflow/compiler/xla/service/gpu/partition_assignment.h
+++ b/tensorflow/compiler/xla/service/gpu/partition_assignment.h
@@ -58,7 +58,8 @@ std::ostream& operator<<(std::ostream& out,
 
 LaunchDimensions CalculateLaunchDimensions(
     const Shape& shape,
-    const perftools::gputools::DeviceDescription& device_desc);
+    const perftools::gputools::DeviceDescription& device_desc,
+    int unroll_factor = 1);
 
 }  // namespace gpu
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo.proto b/tensorflow/compiler/xla/service/hlo.proto
index 0b446c654779db410ebbd91ef9a5bab14d08a278..8fd7f8945c7c36a451af30fcd5939a2498648e74 100644
--- a/tensorflow/compiler/xla/service/hlo.proto
+++ b/tensorflow/compiler/xla/service/hlo.proto
@@ -135,6 +135,10 @@ message HloInstructionProto {
   xla.GatherDimensionNumbers gather_dimension_numbers = 33;
   repeated int64 gather_window_bounds = 34;
 
+  // Compute Host.
+  string channel_name = 41;
+  int64 cost_estimate_ns = 42;
+
   // The id of this instruction.
   int64 id = 35;
 
diff --git a/tensorflow/compiler/xla/service/hlo_computation.cc b/tensorflow/compiler/xla/service/hlo_computation.cc
index 6f983d0b950435d43fe3a1e0fe84902b51bfe249..594413e88fb26e86b198d08b2e4db77fad671348 100644
--- a/tensorflow/compiler/xla/service/hlo_computation.cc
+++ b/tensorflow/compiler/xla/service/hlo_computation.cc
@@ -304,19 +304,15 @@ void ComputeComputationPostOrder(
     HloComputation* computation,
     tensorflow::gtl::FlatSet<HloComputation*>* visited,
     std::list<HloComputation*>* post_order) {
-  if (visited->count(computation) > 0) {
-    return;
-  }
-
-  for (auto* instruction : computation->instructions()) {
-    for (HloComputation* called_computation :
-         instruction->called_computations()) {
-      ComputeComputationPostOrder(called_computation, visited, post_order);
+  if (visited->insert(computation).second) {
+    for (auto* instruction : computation->instructions()) {
+      for (HloComputation* called_computation :
+           instruction->called_computations()) {
+        ComputeComputationPostOrder(called_computation, visited, post_order);
+      }
     }
+    post_order->push_back(computation);
   }
-
-  visited->insert(computation);
-  post_order->push_back(computation);
 }
 
 }  // namespace
diff --git a/tensorflow/compiler/xla/service/hlo_constant_folding.cc b/tensorflow/compiler/xla/service/hlo_constant_folding.cc
index 35ecd4428d0dfde2de445ea34472d2c78148c6c9..7aa38c6b79ed904bb4a518c4b7aaa1e079c27ea8 100644
--- a/tensorflow/compiler/xla/service/hlo_constant_folding.cc
+++ b/tensorflow/compiler/xla/service/hlo_constant_folding.cc
@@ -69,7 +69,8 @@ StatusOr<bool> HloConstantFolding::Run(HloModule* module) {
       // Broadcasts dramatically increase the size of constants, which is often
       // detrimental to performance and memory capacity, so do not fold
       // broadcasts.
-      if (instruction->opcode() == HloOpcode::kBroadcast) {
+      if (instruction->opcode() == HloOpcode::kBroadcast ||
+          instruction->opcode() == HloOpcode::kBroadcastDimOne) {
         continue;
       }
 
diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis.cc b/tensorflow/compiler/xla/service/hlo_cost_analysis.cc
index 4ec2ef27bf59b0c877ec38e55ef5c12debeec227..ea4dd62fdb5bb3be40987d1a6ea96b3a58b0053b 100644
--- a/tensorflow/compiler/xla/service/hlo_cost_analysis.cc
+++ b/tensorflow/compiler/xla/service/hlo_cost_analysis.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/compiler/xla/window_util.h"
 #include "tensorflow/core/lib/core/bits.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/gtl/map_util.h"
@@ -335,6 +336,11 @@ Status HloCostAnalysis::HandleBroadcast(const HloInstruction*) {
   return Status::OK();
 }
 
+Status HloCostAnalysis::HandleBroadcastDimOne(
+    const HloInstruction* broadcastDimOne) {
+  return Status::OK();
+}
+
 Status HloCostAnalysis::HandlePad(const HloInstruction*) {
   return Status::OK();
 }
@@ -379,20 +385,101 @@ Status HloCostAnalysis::HandleTranspose(const HloInstruction*) {
 }
 
 Status HloCostAnalysis::HandleConvolution(const HloInstruction* convolution) {
-  auto rhs_instruction = convolution->operand(1);
+  auto lhs = convolution->operand(0);
+  auto rhs = convolution->operand(1);
+  Window window = convolution->window();
+  const auto& result_shape = convolution->shape();
+  const Shape& lhs_shape = lhs->shape();
+  const Shape& rhs_shape = rhs->shape();
+
   const auto& dnums = convolution->convolution_dimension_numbers();
-  const int64 output_features =
-      convolution->shape().dimensions(dnums.output_feature_dimension());
-
-  // For each output element, we do one fma per element in the kernel at some
-  // given output feature index.
-  const int64 fmas_per_output_element =
-      output_features > 0
-          ? ShapeUtil::ElementsIn(rhs_instruction->shape()) / output_features
-          : 0;
-  const int64 output_elements = ShapeUtil::ElementsIn(convolution->shape());
-  current_properties_[kFlopsKey] =
-      output_elements * fmas_per_output_element * kFmaFlops;
+
+  const int64 input_batch_dim = dnums.input_batch_dimension();
+  const int64 input_feature_dim = dnums.input_feature_dimension();
+  const int64 output_feature_dim = dnums.output_feature_dimension();
+  const int64 input_feature =
+      ShapeUtil::GetDimension(lhs_shape, input_feature_dim);
+  const int64 output_feature =
+      ShapeUtil::GetDimension(result_shape, output_feature_dim);
+  const int64 batch = ShapeUtil::GetDimension(lhs_shape, input_batch_dim);
+
+  DimensionVector kernel_limits;
+  DimensionVector output_limits;
+  DimensionVector input_limits;
+  if (window.dimensions().empty()) {
+    window = window_util::MakeWindow({1});
+    kernel_limits.push_back(1);
+    output_limits.push_back(1);
+    input_limits.push_back(1);
+  } else {
+    for (int64 spatial_dimension = 0;
+         spatial_dimension < window.dimensions_size(); ++spatial_dimension) {
+      // Spatial dimension number for kernel (rhs).
+      const int64 kernel_spatial_dim =
+          dnums.kernel_spatial_dimensions(spatial_dimension);
+      const int64 kernel_limit = rhs_shape.dimensions(kernel_spatial_dim);
+      kernel_limits.push_back(kernel_limit);
+
+      // Spatial dimension number for output.
+      const int64 output_spatial_dim =
+          dnums.output_spatial_dimensions(spatial_dimension);
+      const int64 output_limit = result_shape.dimensions(output_spatial_dim);
+      output_limits.push_back(output_limit);
+
+      // Spatial dimension number for input (lhs).
+      const int64 input_spatial_dim =
+          dnums.input_spatial_dimensions(spatial_dimension);
+      const int64 input_limit = lhs_shape.dimensions(input_spatial_dim);
+      input_limits.push_back(input_limit);
+    }
+  }
+
+  DimensionVector valid_position_counts;
+
+  // Loop over each spatial dimension.
+  for (int64 spatial_dimension = 0;
+       spatial_dimension < window.dimensions_size(); ++spatial_dimension) {
+    int64 valid_position_count = 0;
+    // Loop over each point in the kernel.
+    for (int64 kernel_idx = 0; kernel_idx < kernel_limits[spatial_dimension];
+         ++kernel_idx) {
+      // Loop over each point in the output.
+      for (int64 output_idx = 0; output_idx < output_limits[spatial_dimension];
+           ++output_idx) {
+        // Calculate lhs (input) index without taking base dilation into
+        // account.
+        const auto& window_dim = window.dimensions(spatial_dimension);
+        const int64 undilated_index = output_idx * window_dim.stride() -
+                                      window_dim.padding_low() +
+                                      kernel_idx * window_dim.window_dilation();
+
+        // Calculate the actual lhs (input) index after dilation. Avoid the
+        // division as an optimization.
+        const int64 lhs_spatial_index =
+            window_dim.base_dilation() > 1
+                ? undilated_index / window_dim.base_dilation()
+                : undilated_index;
+
+        // Skip if the lhs (input) index is to be dilated.
+        if (undilated_index != lhs_spatial_index * window_dim.base_dilation()) {
+          continue;
+        }
+
+        // Skip if input index is not in bound.
+        if (lhs_spatial_index < 0 ||
+            lhs_spatial_index >= input_limits[spatial_dimension]) {
+          continue;
+        }
+
+        valid_position_count += 1;
+      }
+    }
+    valid_position_counts.push_back(valid_position_count);
+  }
+
+  const int64 fma_count =
+      input_feature * output_feature * batch * Product(valid_position_counts);
+  current_properties_[kFlopsKey] = fma_count * kFmaFlops;
   return Status::OK();
 }
 
diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis.h b/tensorflow/compiler/xla/service/hlo_cost_analysis.h
index d17678d20f2a23fd98d18b77d5fb25853901a789..a9f6845747aa2081df936d388551bbc0b75b787b 100644
--- a/tensorflow/compiler/xla/service/hlo_cost_analysis.h
+++ b/tensorflow/compiler/xla/service/hlo_cost_analysis.h
@@ -95,6 +95,7 @@ class HloCostAnalysis : public ConstDfsHloVisitor {
   Status HandleSelectAndScatter(const HloInstruction* instruction) override;
   Status HandleBitcast(const HloInstruction* bitcast) override;
   Status HandleBroadcast(const HloInstruction* broadcast) override;
+  Status HandleBroadcastDimOne(const HloInstruction* broadcastDimOne) override;
   Status HandlePad(const HloInstruction* pad) override;
   Status HandleReshape(const HloInstruction* reshape) override;
   Status HandleTranspose(const HloInstruction* transpose) override;
diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis_test.cc b/tensorflow/compiler/xla/service/hlo_cost_analysis_test.cc
index 3b289c240a45e8f3df8156ed89e879da2132d01a..3d055b327ee920dac9c0904c69e1461206b31203 100644
--- a/tensorflow/compiler/xla/service/hlo_cost_analysis_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_cost_analysis_test.cc
@@ -186,12 +186,14 @@ TEST_F(HloCostAnalysisTest, Map) {
 TEST_F(HloCostAnalysisTest, Convolution) {
   ComputationBuilder builder(client_, "convolution");
   auto input = builder.Parameter(
-      0, ShapeUtil::MakeShape(F32, {/*p_dim=*/1, /*z_dim=*/1, /*y_dim=*/10,
-                                    /*x_dim=*/20}),
+      0,
+      ShapeUtil::MakeShape(F32, {/*p_dim=*/1, /*z_dim=*/1, /*y_dim=*/10,
+                                 /*x_dim=*/20}),
       "input");
   auto kernel = builder.Parameter(
-      1, ShapeUtil::MakeShape(F32, {/*p_dim=*/1, /*z_dim=*/1, /*y_dim=*/3,
-                                    /*x_dim=*/3}),
+      1,
+      ShapeUtil::MakeShape(F32, {/*p_dim=*/1, /*z_dim=*/1, /*y_dim=*/3,
+                                 /*x_dim=*/3}),
       "kernel");
   auto result = builder.Conv(input, kernel, {1, 1}, Padding::kValid);
 
@@ -440,5 +442,32 @@ TEST_F(HloCostAnalysisTest, TupleCost) {
   EXPECT_EQ(analysis.bytes_accessed(), kPointerSize * 2);
 }
 
+TEST_F(HloCostAnalysisTest, BaseDilatedConvolution) {
+  ComputationBuilder builder(client_, "BaseDilatedConvolution");
+  auto input = builder.Parameter(
+      0,
+      ShapeUtil::MakeShape(F32, {/*p_dim=*/1, /*z_dim=*/1, /*y_dim=*/10,
+                                 /*x_dim=*/20}),
+      "input");
+  auto kernel = builder.Parameter(
+      1,
+      ShapeUtil::MakeShape(F32, {/*p_dim=*/1, /*z_dim=*/1, /*y_dim=*/3,
+                                 /*x_dim=*/3}),
+      "kernel");
+
+  auto result = builder.ConvGeneralDilated(
+      input, kernel, /*window_strides=*/{1, 1}, /*padding=*/{{1, 1}, {1, 1}},
+      /*lhs_dilation=*/{3, 5}, /*rhs_dilation=*/{7, 11},
+      ComputationBuilder::CreateDefaultConvDimensionNumbers(2));
+
+  // Run HLO cost analysis.
+  auto hlo_module = BuildHloGraph(&builder);
+  HloCostAnalysis analysis(ShapeSize);
+  ASSERT_IS_OK(
+      hlo_module->entry_computation()->root_instruction()->Accept(&analysis));
+
+  EXPECT_EQ(analysis.flop_count(), 1472);
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator.cc b/tensorflow/compiler/xla/service/hlo_evaluator.cc
index 693004d364114b1a25ce6b6791092665c861d13f..b4f9a9db9cbcae56fbf60ad9d2ef4b3e0ffe2a90 100644
--- a/tensorflow/compiler/xla/service/hlo_evaluator.cc
+++ b/tensorflow/compiler/xla/service/hlo_evaluator.cc
@@ -202,6 +202,25 @@ void IterateThroughWindow(
   } while (IndexUtil::BumpIndices(window_shape, &window_index));
 }
 
+// Creates a vector of multipliers which can be used to create a linear index
+// into shape.
+//
+// Given the multidimensional index {i1, ..., iN} and
+// M = MakeDimMultipliers(shape), the corresponding linear index LI is simply
+//
+//   LI = i1 * M[1] + i2 * M[2] + ... + iN * M[N].
+//
+// This lets you calculate LI given the multidimensional indices in any order.
+DimensionVector MakeDimMultipliers(const Shape& shape) {
+  DimensionVector v(ShapeUtil::Rank(shape));
+  int64 scale = 1;
+  for (auto dim : LayoutUtil::MinorToMajor(shape)) {
+    v[dim] = scale;
+    scale *= shape.dimensions(dim);
+  }
+  return v;
+}
+
 }  // namespace
 
 template <typename ReturnT, typename ElementwiseT>
@@ -380,6 +399,22 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
     return Status::OK();
   }
 
+  Status HandleBitcastConvert(HloInstruction* convert) override {
+    const HloInstruction* operand = convert->operand(0);
+    TF_RET_CHECK(ShapeUtil::SameDimensions(operand->shape(), convert->shape()));
+    TF_ASSIGN_OR_RETURN(std::unique_ptr<Literal> result,
+                        parent_->GetEvaluatedLiteralFor(operand).BitcastConvert(
+                            convert->shape().element_type()));
+
+    if (LayoutUtil::LayoutsInShapesEqual(result->shape(), convert->shape())) {
+      parent_->evaluated_[convert] = std::move(result);
+    } else {
+      parent_->evaluated_[convert] =
+          result->Relayout(convert->shape().layout());
+    }
+    return Status::OK();
+  }
+
   Status HandleExp(HloInstruction* exp) override {
     TF_ASSIGN_OR_RETURN(parent_->evaluated_[exp],
                         ElementWiseUnaryOp(exp, [](ElementwiseT elem_operand) {
@@ -979,18 +1014,6 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
     const Literal& lhs_literal = parent_->GetEvaluatedLiteralFor(lhs);
     const Literal& rhs_literal = parent_->GetEvaluatedLiteralFor(rhs);
 
-    // Dimension number applicable for input (lhs).
-    const int64 input_batch_dim = dnums.input_batch_dimension();
-    const int64 input_z_dim = dnums.input_feature_dimension();
-    // Dimension number applicable for kernel (rhs).
-    const int64 kernel_input_z_dim = dnums.kernel_input_feature_dimension();
-    const int64 kernel_output_z_dim = dnums.kernel_output_feature_dimension();
-    // Dimension number applicable for output.
-    const int64 output_batch_dim = dnums.output_batch_dimension();
-    const int64 output_z_dim = dnums.output_feature_dimension();
-
-    const int64 z_size = ShapeUtil::GetDimension(lhs_shape, input_z_dim);
-
     std::vector<int64> window_dimension_sizes;
     for (auto i : dnums.kernel_spatial_dimensions()) {
       window_dimension_sizes.push_back(ShapeUtil::GetDimension(rhs_shape, i));
@@ -999,25 +1022,43 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
     const Shape& window_shape =
         ShapeUtil::MakeShape(rhs_shape.element_type(), window_dimension_sizes);
 
-    DimensionVector lhs_index(lhs_rank);
-    DimensionVector rhs_index(rhs_rank);
-    DimensionVector rhs_spatial_index(dnums.kernel_spatial_dimensions_size());
+    DimensionVector lhs_dim_multipliers = MakeDimMultipliers(lhs_shape);
+    DimensionVector rhs_dim_multipliers = MakeDimMultipliers(rhs_shape);
 
-    auto func = [&](ArraySlice<int64> out_index) {
-      ElementwiseT result_val = static_cast<ElementwiseT>(0);
+    auto lhs_literal_data = lhs_literal.data<ReturnT>();
+    auto rhs_literal_data = rhs_literal.data<ReturnT>();
+
+    auto func = [&window_shape, &dnums, &lhs_shape, &rhs_shape, &window,
+                 &lhs_dim_multipliers, &rhs_dim_multipliers, lhs_literal_data,
+                 rhs_literal_data](ArraySlice<int64> out_index) {
+      // Dimension number applicable for input (lhs).
+      const int64 input_batch_dim = dnums.input_batch_dimension();
+      const int64 input_z_dim = dnums.input_feature_dimension();
+      // Dimension number applicable for kernel (rhs).
+      const int64 kernel_input_z_dim = dnums.kernel_input_feature_dimension();
+      const int64 kernel_output_z_dim = dnums.kernel_output_feature_dimension();
+      // Dimension number applicable for output.
+      const int64 output_batch_dim = dnums.output_batch_dimension();
+      const int64 output_z_dim = dnums.output_feature_dimension();
 
-      std::fill(lhs_index.begin(), lhs_index.end(), 0);
-      std::fill(rhs_index.begin(), rhs_index.end(), 0);
-      std::fill(rhs_spatial_index.begin(), rhs_spatial_index.end(), 0);
+      const int64 z_size = ShapeUtil::GetDimension(lhs_shape, input_z_dim);
 
-      lhs_index[input_batch_dim] = out_index[output_batch_dim];
-      rhs_index[kernel_output_z_dim] = out_index[output_z_dim];
+      ElementwiseT result_val = static_cast<ElementwiseT>(0);
+      DimensionVector rhs_spatial_index(dnums.kernel_spatial_dimensions_size(),
+                                        0);
 
       // Convolve input feature with kernel.
       do {
         for (int64 iz = 0; iz < z_size; ++iz) {
-          lhs_index[input_z_dim] = iz;
-          rhs_index[kernel_input_z_dim] = iz;
+          int64 lhs_linear_index = 0;
+          lhs_linear_index += out_index[output_batch_dim] *
+                              lhs_dim_multipliers[input_batch_dim];
+          lhs_linear_index += iz * lhs_dim_multipliers[input_z_dim];
+
+          int64 rhs_linear_index = 0;
+          rhs_linear_index += out_index[output_z_dim] *
+                              rhs_dim_multipliers[kernel_output_z_dim];
+          rhs_linear_index += iz * rhs_dim_multipliers[kernel_input_z_dim];
 
           // Find corresponding spatial dimension index for input (lhs).
           for (int64 ki = 0; ki < rhs_spatial_index.size(); ++ki) {
@@ -1042,29 +1083,32 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
 
             // Calculate the actual lhs (input) index after dilation.  As an
             // optimization, skip this integer divide if there's no dilation.
+            int64 lhs_spatial_index;
             if (window_dim.base_dilation() > 1) {
-              lhs_index[input_spatial_dim] =
-                  undilated_index / window_dim.base_dilation();
+              lhs_spatial_index = undilated_index / window_dim.base_dilation();
             } else {
-              lhs_index[input_spatial_dim] = undilated_index;
+              lhs_spatial_index = undilated_index;
             }
+            lhs_linear_index +=
+                lhs_spatial_index * lhs_dim_multipliers[input_spatial_dim];
 
-            // Skip if input index is not in bound.
-            if (!(lhs_index[input_spatial_dim] >= 0 &&
-                  lhs_index[input_spatial_dim] <
+            // Skip if input index is not in bounds.
+            if (!(lhs_spatial_index >= 0 &&
+                  lhs_spatial_index <
                       lhs_shape.dimensions(input_spatial_dim))) {
               goto cnt;
             }
 
-            rhs_index[dnums.kernel_spatial_dimensions(ki)] =
-                window_dim.window_reversal()
-                    ? ((window_dim.size() - 1) - rhs_spatial_index[ki])
-                    : rhs_spatial_index[ki];
+            rhs_linear_index +=
+                (window_dim.window_reversal()
+                     ? ((window_dim.size() - 1) - rhs_spatial_index[ki])
+                     : rhs_spatial_index[ki]) *
+                rhs_dim_multipliers[dnums.kernel_spatial_dimensions(ki)];
           }
 
           result_val +=
-              static_cast<ElementwiseT>(lhs_literal.Get<ReturnT>(lhs_index)) *
-              static_cast<ElementwiseT>(rhs_literal.Get<ReturnT>(rhs_index));
+              static_cast<ElementwiseT>(lhs_literal_data[lhs_linear_index]) *
+              static_cast<ElementwiseT>(rhs_literal_data[rhs_linear_index]);
         }
       cnt : {}
       } while (IndexUtil::BumpIndices(window_shape, &rhs_spatial_index));
@@ -1073,7 +1117,7 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
     };
 
     auto result = Literal::CreateFromShape(result_shape);
-    TF_RETURN_IF_ERROR(result->Populate<ReturnT>(func));
+    TF_RETURN_IF_ERROR(result->PopulateParallel<ReturnT>(func));
 
     parent_->evaluated_[conv] = std::move(result);
     return Status::OK();
@@ -1520,14 +1564,12 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
       arg_dim_counts[dim] = arg_dimensions[dim];
     }
 
-    // Create mapping from result index to arg index.
-    const int64 result_rank = ShapeUtil::Rank(result->shape());
-    int64 result_dim = 0;
-    std::vector<int64> result_to_arg_index(result_rank);
+    // Map each dimension in the result to a dimension in arg that isn't
+    // being reduced.
+    std::vector<int64> result_to_arg_index;
     for (int64 i = 0; i < arg_dimensions.size(); ++i) {
       if (arg_dim_steps[i] == 0) {
-        result_to_arg_index[result_dim] = i;
-        ++result_dim;
+        result_to_arg_index.push_back(i);
       }
     }
 
@@ -1542,6 +1584,20 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
             base[result_to_arg_index[i]] = multi_index[i];
           }
 
+          // When the reduction is addition of floats, accumulate in a double
+          // for better precision. Also, avoid creating Literals for the
+          // intermediate results; it's much faster.
+          if (ShapeUtil::ElementIsFloating(init_literal.shape()) &&
+              IsScalarAdd(function)) {
+            double computed_result = 0;
+            auto func = [&](ArraySlice<int64> input_index) {
+              computed_result += arg_literal.Get<float>(input_index);
+              return true;
+            };
+            ShapeUtil::ForEachIndex(arg_literal.shape(), base, arg_dim_counts,
+                                    arg_dim_steps, func);
+            return static_cast<ReturnT>(computed_result);
+          }
           auto func = [&](ArraySlice<int64> input_index) {
             auto curr_val = arg_literal.Get<ReturnT>(input_index);
 
@@ -1554,19 +1610,17 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
             std::unique_ptr<Literal> computed_result =
                 embedded_evaluator.Evaluate<const Literal*>(*function, args)
                     .ConsumeValueOrDie();
-            // Clear visit states so that the we can use the evaluate again on
+            // Clear visit states so that we can use the evaluator again on
             // the same computation.
             embedded_evaluator.ResetVisitStates();
-
             // Assign computed result to result_val.
             result_val = computed_result->Get<ReturnT>({});
-
             return true;
           };
-
+          // Computes one element of the result, reducing all dimensions that
+          // contribute to that element.
           ShapeUtil::ForEachIndex(arg_literal.shape(), base, arg_dim_counts,
                                   arg_dim_steps, func);
-
           return result_val;
         }));
 
@@ -1574,6 +1628,20 @@ class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault {
     return Status::OK();
   }
 
+  bool IsScalarAdd(HloComputation* computation) {
+    HloInstruction* instruction = computation->root_instruction();
+    if (instruction->opcode() == HloOpcode::kAdd &&
+        computation->num_parameters() == 2) {
+      const HloInstruction* lhs = instruction->operand(0);
+      const HloInstruction* rhs = instruction->operand(1);
+      return lhs->opcode() == HloOpcode::kParameter &&
+             ShapeUtil::IsScalar(lhs->shape()) &&
+             rhs->opcode() == HloOpcode::kParameter &&
+             ShapeUtil::IsScalar(rhs->shape()) && lhs != rhs;
+    }
+    return false;
+  }
+
   Status HandleSelectAndScatter(HloInstruction* select_and_scatter) override {
     auto operand = select_and_scatter->operand(0);
     auto source = select_and_scatter->operand(1);
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator_test.cc b/tensorflow/compiler/xla/service/hlo_evaluator_test.cc
index 685cacd7f74c00789296dee16f0a6a94c35a4393..dd14dd38537a83d0ee16cff9e3c22a38f544e208 100644
--- a/tensorflow/compiler/xla/service/hlo_evaluator_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_evaluator_test.cc
@@ -40,6 +40,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/test_benchmark.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace xla {
@@ -1205,6 +1206,80 @@ TEST_P(HloEvaluatorTest,
   LiteralTestUtil::ExpectEqual(*expected, *result);
 }
 
+class HloEvaluatorPreciseReduceTest : public HloVerifiedTestBase {};
+
+// Tests that Reduce doesn't lose precision when adding many numbers (because
+// it accumulates its result in a double).
+TEST_F(HloEvaluatorPreciseReduceTest, AddReductionPrecisionTest) {
+  HloComputation::Builder b(TestName());
+
+  constexpr int kNumElements = 1 << 25;  // float += 1 saturates at 1<<24
+  std::vector<float> v(kNumElements, 1.0f);
+  HloInstruction* arg_instruction = b.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR1<float>(v)));
+  HloInstruction* init_value = b.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(0.f)));
+
+  HloComputation::Builder add_computation("add");
+  Shape scalar_shape = ShapeUtil::MakeShape(F32, {});
+  auto param_lhs = add_computation.AddInstruction(
+      HloInstruction::CreateParameter(0, scalar_shape, "lhs"));
+  auto param_rhs = add_computation.AddInstruction(
+      HloInstruction::CreateParameter(1, scalar_shape, "rhs"));
+  add_computation.AddInstruction(HloInstruction::CreateBinary(
+      scalar_shape, HloOpcode::kAdd, param_lhs, param_rhs));
+  auto add_func = module().AddEmbeddedComputation(add_computation.Build());
+
+  HloInstruction* reduce_instruction = b.AddInstruction(
+      HloInstruction::CreateReduce(scalar_shape, arg_instruction, init_value,
+                                   /*dimensions_to_reduce=*/{0}, add_func));
+  module().AddEntryComputation(b.Build());
+
+  HloEvaluator hlo_eval;
+  std::unique_ptr<Literal> result =
+      hlo_eval.Evaluate(reduce_instruction).ConsumeValueOrDie();
+  LiteralTestUtil::ExpectR0Equal<float>(kNumElements, *result);
+}
+
+// Reducing many numbers should be fast because it doesn't create
+// intermediate Literals; the microbenchmark should finish in < 1 msec.
+void BM_ReducePrecisely(int num_iters) {
+  tensorflow::testing::StopTiming();
+  HloComputation::Builder b("BM_ReducePrecisely");
+  HloModuleConfig config;
+  config.set_debug_options(legacy_flags::GetDebugOptionsFromFlags());
+  HloModule module("BM_ReducePrecisely", VersionedComputationHandle(), config);
+
+  constexpr int kNumElements = 1 << 25;  // float += 1 saturates at 1<<24
+  std::vector<float> v(kNumElements, 1.0f);
+  HloInstruction* arg_instruction = b.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR1<float>(v)));
+  auto init_value = b.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(0.f)));
+
+  HloComputation::Builder add_computation("add");
+  Shape scalar_shape = ShapeUtil::MakeShape(F32, {});
+  auto param_lhs = add_computation.AddInstruction(
+      HloInstruction::CreateParameter(0, scalar_shape, "lhs"));
+  auto param_rhs = add_computation.AddInstruction(
+      HloInstruction::CreateParameter(1, scalar_shape, "rhs"));
+  add_computation.AddInstruction(HloInstruction::CreateBinary(
+      scalar_shape, HloOpcode::kAdd, param_lhs, param_rhs));
+  auto add_func = module.AddEmbeddedComputation(add_computation.Build());
+
+  HloInstruction* reduce_instruction = b.AddInstruction(
+      HloInstruction::CreateReduce(scalar_shape, arg_instruction, init_value,
+                                   /*dimensions_to_reduce=*/{0}, add_func));
+  module.AddEntryComputation(b.Build());
+
+  HloEvaluator hlo_eval;
+  tensorflow::testing::StartTiming();
+  hlo_eval.Evaluate(reduce_instruction).ConsumeValueOrDie();
+  tensorflow::testing::StopTiming();
+}
+
+BENCHMARK(BM_ReducePrecisely);
+
 TEST_P(HloEvaluatorTest, ReduceAdd) {
   HloComputation::Builder b(TestName());
 
diff --git a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
index 1dc72355cf179e996caab4d6b52068dc99d02244..c35783c456c63b9a651d1221cf9a3d70af38ba66 100644
--- a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
+++ b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
@@ -823,7 +823,7 @@ string HloDotDumper::GetInstructionNodeInlinedOperands(
 
     // Otherwise, print e.g. "%constant.42 (s32[100])".
     string constant_name;
-    if (tensorflow::StringPiece(constant->name()).starts_with("constant")) {
+    if (tensorflow::str_util::StartsWith(constant->name(), "constant")) {
       constant_name = constant->name();
     } else {
       constant_name = StrCat("constant ", constant->name());
@@ -956,6 +956,7 @@ ColorScheme HloDotDumper::GetInstructionColor(const HloInstruction* instr) {
     case HloOpcode::kTuple:
       return kWhite;
     case HloOpcode::kBroadcast:
+    case HloOpcode::kBroadcastDimOne:
       // De-emphasize nodes which broadcast a scalar within a fusion node --
       // these are essentially free.
       if (instr->IsFused() &&
@@ -1041,8 +1042,8 @@ string HloDotDumper::GetInstructionNodeLabel(const HloInstruction* instr) {
 
   // The HLO instruction name contains usually the opcode, e.g. "%add.42" is
   // an add instruction.  In this case we render just the name.
-  if (tensorflow::StringPiece(instr->name())
-          .starts_with(HloOpcodeString(instr->opcode()))) {
+  if (tensorflow::str_util::StartsWith(instr->name(),
+                                       HloOpcodeString(instr->opcode()))) {
     return Printf("<b>%s</b>", HtmlLikeStringSanitize(instr->name()));
   }
   string extended_opcode =
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc
index a2a2c1e615a7f2b226c712a75b1240b980fc8d3c..a986bbd511d6cfc9f14334f6edf624de8bcedfd7 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.cc
+++ b/tensorflow/compiler/xla/service/hlo_instruction.cc
@@ -98,6 +98,13 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
     }
   }
 
+  if (instruction->opcode() == HloOpcode::kTrace) {
+    TF_RET_CHECK(instruction->operands().size() == 1)
+        << "Trace instruction should have 1 operand but sees "
+        << instruction->operands().size();
+    instruction->mutable_operand(0)->set_tracing(instruction.get());
+  }
+
   TF_RET_CHECK(!proto.name().empty());
   instruction->name_ = proto.name();
 
@@ -152,6 +159,17 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
     instruction->fft_length_.push_back(fft_len);
   }
 
+  if (proto.has_gather_dimension_numbers()) {
+    instruction->gather_dimension_numbers_ =
+        MakeUnique<GatherDimensionNumbers>(proto.gather_dimension_numbers());
+  }
+  for (int64 bound : proto.gather_window_bounds()) {
+    instruction->gather_window_bounds_.push_back(bound);
+  }
+
+  instruction->channel_name_ = proto.channel_name();
+  instruction->cost_estimate_ns_ = proto.cost_estimate_ns();
+
   return std::move(instruction);
 }
 
@@ -170,6 +188,7 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
       WrapUnique(new HloInstruction(HloOpcode::kTrace, ShapeUtil::MakeNil()));
   instruction->operands_.push_back(operand);
   instruction->literal_ = Literal::CreateR1U8(tag);
+  operand->set_tracing(instruction.get());
   return instruction;
 }
 
@@ -675,6 +694,15 @@ HloInstruction::CreateSelectAndScatter(
   return instruction;
 }
 
+/* static */ std::unique_ptr<HloInstruction>
+HloInstruction::CreateBroadcastDimOne(const Shape& shape,
+                                      HloInstruction* operand) {
+  auto instruction =
+      WrapUnique(new HloInstruction(HloOpcode::kBroadcastDimOne, shape));
+  instruction->AppendOperand(operand);
+  return instruction;
+}
+
 /* static */ std::unique_ptr<HloInstruction>
 HloInstruction::CreateBroadcastSequence(
     const Shape& output_shape, HloInstruction* operand,
@@ -1267,6 +1295,10 @@ std::unique_ptr<HloInstruction> HloInstruction::CloneWithNewOperands(
       CHECK_EQ(new_operands.size(), 1);
       clone = CreateBroadcast(shape, new_operands[0], dimensions_);
       break;
+    case HloOpcode::kBroadcastDimOne:
+      CHECK_EQ(new_operands.size(), 1);
+      clone = CreateBroadcastDimOne(shape, new_operands[0]);
+      break;
     case HloOpcode::kCall:
       clone = CreateCall(shape, new_operands, to_apply());
       break;
@@ -1818,6 +1850,8 @@ bool HloInstruction::IdenticalSlowPath(
 
     // Remaining instructions with special values.
     case HloOpcode::kBitcast:
+    case HloOpcode::kBroadcastDimOne:
+    case HloOpcode::kDynamicUpdateSlice:
       return eq_shapes(shape(), other.shape());
     case HloOpcode::kBroadcast:
       return eq_shapes(shape(), other.shape()) &&
@@ -1836,8 +1870,6 @@ bool HloInstruction::IdenticalSlowPath(
     case HloOpcode::kDynamicSlice:
       return eq_shapes(shape(), other.shape()) &&
              dynamic_slice_sizes_ == other.dynamic_slice_sizes_;
-    case HloOpcode::kDynamicUpdateSlice:
-      return eq_shapes(shape(), other.shape());
     case HloOpcode::kCall:
     case HloOpcode::kMap:
       return eq_computations(to_apply(), other.to_apply());
@@ -2395,6 +2427,15 @@ HloInstructionProto HloInstruction::ToProto() const {
     proto.add_fft_length(fft_len);
   }
 
+  if (gather_dimension_numbers_ != nullptr) {
+    *proto.mutable_gather_dimension_numbers() = *gather_dimension_numbers_;
+  }
+  for (int64 bound : gather_window_bounds_) {
+    proto.add_gather_window_bounds(bound);
+  }
+  proto.set_channel_name(channel_name_);
+  proto.set_cost_estimate_ns(cost_estimate_ns_);
+
   return proto;
 }
 
@@ -2638,6 +2679,8 @@ Status HloInstruction::Visit(DfsHloVisitorBase<HloInstructionPtr>* visitor) {
       return visitor->HandleBitcast(this);
     case HloOpcode::kBroadcast:
       return visitor->HandleBroadcast(this);
+    case HloOpcode::kBroadcastDimOne:
+      return visitor->HandleBroadcastDimOne(this);
     case HloOpcode::kPad:
       return visitor->HandlePad(this);
     case HloOpcode::kReshape:
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.h b/tensorflow/compiler/xla/service/hlo_instruction.h
index a94ba145df792ade9bb7ce3e9a31b56b2f460cd2..a6cb19f331695da996769e8bb5ad16cb5cfd9bf1 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.h
+++ b/tensorflow/compiler/xla/service/hlo_instruction.h
@@ -401,6 +401,10 @@ class HloInstruction {
       const Shape& shape, HloInstruction* operand,
       tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
 
+  // Creates a broadcast-size-one-dimensions instruction.
+  static std::unique_ptr<HloInstruction> CreateBroadcastDimOne(
+      const Shape& shape, HloInstruction* operand);
+
   // Creates a sequence of instructions that performs an explicit broadcast of
   // the operand to the target shape.
   //
@@ -928,6 +932,13 @@ class HloInstruction {
   const HloSharding& sharding_or_default(const HloSharding& default_) const {
     return sharding_ ? *sharding_ : default_;
   }
+  // Returns the sharding unique device, if any.
+  tensorflow::gtl::optional<int64> sharding_unique_device() const {
+    if (sharding_ == nullptr || !sharding_->HasUniqueDevice()) {
+      return tensorflow::gtl::optional<int64>();
+    }
+    return sharding_->UniqueDevice().ValueOrDie();
+  }
   // Sets the sharding of this operator. Should only be called by HloModule or
   // HloComputation methods.
   void set_sharding(const HloSharding& sharding) {
diff --git a/tensorflow/compiler/xla/service/hlo_module.cc b/tensorflow/compiler/xla/service/hlo_module.cc
index 595c531ccff728f836cfaca2fafaa8a08e715b74..08b9a29aeda2ee612d49b0788acf8438a25eb6a3 100644
--- a/tensorflow/compiler/xla/service/hlo_module.cc
+++ b/tensorflow/compiler/xla/service/hlo_module.cc
@@ -295,12 +295,13 @@ StatusOr<std::unique_ptr<HloModule>> HloModule::CreateFromProto(
 
 /* static */
 StatusOr<HloModuleConfig> HloModule::CreateModuleConfigFromProto(
-    const HloModuleProto& module) {
+    const HloModuleProto& module, const DebugOptions& debug_options) {
   TF_RET_CHECK(module.has_program_shape())
       << "No program shape found in the proto";
   const auto& program_shape = module.program_shape();
 
   HloModuleConfig module_config(program_shape);
+  module_config.set_debug_options(debug_options);
 
   // The module config is constructed with default layouts regardless of what is
   // passed in via the ProgramShape. Set the layouts to the appropriate values.
diff --git a/tensorflow/compiler/xla/service/hlo_module.h b/tensorflow/compiler/xla/service/hlo_module.h
index 755bbd359f7b95e7f3f3cbee1b46df85908202c6..9f7f25202ba42b14e995ed5c47d1012dabc69332 100644
--- a/tensorflow/compiler/xla/service/hlo_module.h
+++ b/tensorflow/compiler/xla/service/hlo_module.h
@@ -172,7 +172,7 @@ class HloModule {
   // Creates and returns an HloModuleConfig with an appropriate program shape
   // for the HLO module in the given proto.
   static StatusOr<HloModuleConfig> CreateModuleConfigFromProto(
-      const HloModuleProto& module);
+      const HloModuleProto& module, const DebugOptions& debug_options);
 
   // Outlines the given expression from the given computation.
   // instructions_to_outline contains the instructions that form the expression.
diff --git a/tensorflow/compiler/xla/service/hlo_module_group_metadata.cc b/tensorflow/compiler/xla/service/hlo_module_group_metadata.cc
index fa5dcb0b369d17c70c64c67b9f11640c93fb4278..54c34ce116651608e6d91cdcba9c708ca3a5f75e 100644
--- a/tensorflow/compiler/xla/service/hlo_module_group_metadata.cc
+++ b/tensorflow/compiler/xla/service/hlo_module_group_metadata.cc
@@ -313,6 +313,27 @@ Status HloModuleGroupMetadata::VerifyChannelInstructions() {
     if (!ShapeUtil::Compatible(send_shape, recv_shape)) {
       return FailedPrecondition("send/recv shapes do not match");
     }
+    const HloModule* send_module = channel.send->parent()->parent();
+    const HloModule* send_done_module = channel.send_done->parent()->parent();
+    if (send_module != send_done_module) {
+      return FailedPrecondition(
+          "send and send-done (channel=%lld) must be on the same device: %lld "
+          "vs. %lld",
+          channel.id, GetModuleId(send_module), GetModuleId(send_done_module));
+    }
+    const HloModule* recv_module = channel.recv->parent()->parent();
+    const HloModule* recv_done_module = channel.recv_done->parent()->parent();
+    if (recv_module != recv_done_module) {
+      return FailedPrecondition(
+          "recv and recv-done (channel=%lld) must be on the same device: %lld "
+          "vs. %lld",
+          channel.id, GetModuleId(recv_module), GetModuleId(recv_done_module));
+    }
+    if (send_module == recv_module) {
+      return FailedPrecondition(
+          "send and recv (channel=%lld) must be on different devices: %lld",
+          channel.id, GetModuleId(send_module));
+    }
   }
 
   // Check if channel instructions are used only in allowed computations.
diff --git a/tensorflow/compiler/xla/service/hlo_opcode.h b/tensorflow/compiler/xla/service/hlo_opcode.h
index af24604c39b554f146793594958f373999844b4c..dddc72480f93c4c3cc29f41db99fa773dc8d6b68 100644
--- a/tensorflow/compiler/xla/service/hlo_opcode.h
+++ b/tensorflow/compiler/xla/service/hlo_opcode.h
@@ -54,6 +54,7 @@ namespace xla {
   V(kBitcast, "bitcast")                                     \
   V(kBitcastConvert, "bitcast-convert")                      \
   V(kBroadcast, "broadcast")                                 \
+  V(kBroadcastDimOne, "broadcast-dim-one")                   \
   V(kCall, "call", kHloOpcodeIsVariadic)                     \
   V(kCeil, "ceil")                                           \
   V(kClamp, "clamp")                                         \
diff --git a/tensorflow/compiler/xla/service/hlo_runner.cc b/tensorflow/compiler/xla/service/hlo_runner.cc
index e5b1c2efa3fc25d23531df298e125521c002dba1..ec7d8210a70ad7498f77fe807abd53544d4b0487 100644
--- a/tensorflow/compiler/xla/service/hlo_runner.cc
+++ b/tensorflow/compiler/xla/service/hlo_runner.cc
@@ -52,10 +52,9 @@ namespace {
 // Creates an HloModule from the given proto.
 StatusOr<std::unique_ptr<HloModule>> HloProtoToModule(
     const HloProto& proto, const DebugOptions& debug_options) {
-  TF_ASSIGN_OR_RETURN(
-      HloModuleConfig config,
-      HloModule::CreateModuleConfigFromProto(proto.hlo_module()));
-  config.set_debug_options(debug_options);
+  TF_ASSIGN_OR_RETURN(HloModuleConfig config,
+                      HloModule::CreateModuleConfigFromProto(proto.hlo_module(),
+                                                             debug_options));
   TF_ASSIGN_OR_RETURN(auto module,
                       HloModule::CreateFromProto(proto.hlo_module(), config));
   return std::move(module);
diff --git a/tensorflow/compiler/xla/service/hlo_sharding.h b/tensorflow/compiler/xla/service/hlo_sharding.h
index 18d406f3700da6dfdfcd16fb76bf9c1d2bc63141..06204acbca30648e73382cb4641139e852664b77 100644
--- a/tensorflow/compiler/xla/service/hlo_sharding.h
+++ b/tensorflow/compiler/xla/service/hlo_sharding.h
@@ -94,6 +94,10 @@ class HloSharding {
   // Create a new sharding from a protobuf OpSharding.
   static StatusOr<HloSharding> FromProto(const OpSharding& proto);
 
+  // Checks whether device is a reserved device number. A reserved device number
+  // has usually a special meaning, with dedicated handling logic.
+  static bool IsReservedDevice(int64 device) { return device < 0; }
+
   OpSharding ToProto() const;
   string ToString() const;
 
diff --git a/tensorflow/compiler/xla/service/hlo_verifier.cc b/tensorflow/compiler/xla/service/hlo_verifier.cc
index 8c875698eb1992719d504d272ca338b05b60e36b..63ec5964eb935239e86233c1ae94e2bcce3b0461 100644
--- a/tensorflow/compiler/xla/service/hlo_verifier.cc
+++ b/tensorflow/compiler/xla/service/hlo_verifier.cc
@@ -174,17 +174,34 @@ Status ShapeVerifier::HandleBroadcast(HloInstruction* broadcast) {
   TF_RETURN_IF_ERROR(CheckShape(broadcast, broadcast->shape()));
   TF_RET_CHECK(ShapeUtil::Rank(operand_shape) ==
                broadcast->dimensions().size());
-  for (int64 operand_dimension = 0;
-       operand_dimension < ShapeUtil::Rank(operand_shape);
-       ++operand_dimension) {
-    int64 output_dimension = broadcast->dimensions()[operand_dimension];
+  for (int64 i = 0; i < ShapeUtil::Rank(operand_shape); ++i) {
+    int64 output_dimension = broadcast->dimensions()[i];
     TF_RET_CHECK(broadcast->shape().dimensions(output_dimension) ==
-                 operand_shape.dimensions(operand_dimension))
+                 operand_shape.dimensions(i))
         << broadcast->ToString() << " operand shape " << operand_shape;
   }
   return tensorflow::Status::OK();
 }
 
+Status ShapeVerifier::HandleBroadcastDimOne(HloInstruction* broadcastDimOne) {
+  const Shape& operand_shape = broadcastDimOne->operand(0)->shape();
+  int64 operand_rank = ShapeUtil::Rank(operand_shape);
+  const Shape& output_shape = broadcastDimOne->shape();
+  // Check for mixed precision.
+  TF_RETURN_IF_ERROR(CheckShape(broadcastDimOne, output_shape));
+  TF_RET_CHECK(operand_rank == ShapeUtil::Rank(output_shape));
+  for (int64 i = 0; i < operand_rank; ++i) {
+    int64 operand_dimension = operand_shape.dimensions(i);
+    int64 output_dimension = output_shape.dimensions(i);
+    TF_RET_CHECK(operand_dimension == 1 ||
+                 operand_dimension == output_dimension)
+        << "Dimension " << i << " of broadcastDimOne "
+        << broadcastDimOne->ToString() << " is " << operand_dimension
+        << ", expected 1 or " << output_dimension;
+  }
+  return tensorflow::Status::OK();
+}
+
 Status ShapeVerifier::HandleReshape(HloInstruction* reshape) {
   // Check for mixed precision.
   TF_RETURN_IF_ERROR(CheckShape(reshape, reshape->shape()));
diff --git a/tensorflow/compiler/xla/service/hlo_verifier.h b/tensorflow/compiler/xla/service/hlo_verifier.h
index 1dd7ec3c51e18dcfe89bd478de87798ba3858119..a4dff977ba268137d8ab94c576b4b511e911806f 100644
--- a/tensorflow/compiler/xla/service/hlo_verifier.h
+++ b/tensorflow/compiler/xla/service/hlo_verifier.h
@@ -54,6 +54,7 @@ class ShapeVerifier : public DfsHloVisitor {
   Status HandleReduce(HloInstruction* reduce) override;
   Status HandleBitcast(HloInstruction* bitcast) override;
   Status HandleBroadcast(HloInstruction* broadcast) override;
+  Status HandleBroadcastDimOne(HloInstruction* broadcastDimOne) override;
   Status HandleReshape(HloInstruction* reshape) override;
   Status HandleTranspose(HloInstruction* transpose) override;
   Status HandleParameter(HloInstruction*) override;
diff --git a/tensorflow/compiler/xla/service/instruction_fusion.cc b/tensorflow/compiler/xla/service/instruction_fusion.cc
index d69ad80bdb4d2eab2d34228be026d7bc0b76efc0..3f4dbf897df7e1fd62f4229ed90c949c59da9d46 100644
--- a/tensorflow/compiler/xla/service/instruction_fusion.cc
+++ b/tensorflow/compiler/xla/service/instruction_fusion.cc
@@ -37,6 +37,7 @@ namespace xla {
     case HloOpcode::kBitcast:
     case HloOpcode::kBitcastConvert:
     case HloOpcode::kBroadcast:
+    case HloOpcode::kBroadcastDimOne:
     case HloOpcode::kCeil:
     case HloOpcode::kClamp:
     case HloOpcode::kComplex:
@@ -142,7 +143,8 @@ bool InstructionFusion::EffectivelyUnary(HloInstruction* hlo) {
       });
   return std::count_if(hlo->operands().begin(), hlo->operands().end(),
                        [output_rank](HloInstruction* operand) {
-                         if (operand->opcode() == HloOpcode::kBroadcast) {
+                         if (operand->opcode() == HloOpcode::kBroadcast ||
+                             operand->opcode() == HloOpcode::kBroadcastDimOne) {
                            return false;
                          }
                          if (operand->opcode() == HloOpcode::kConstant &&
@@ -247,7 +249,8 @@ StatusOr<bool> InstructionFusion::Run(HloModule* module) {
     auto reachability = computation->ComputeReachability();
 
     auto cheap_to_duplicate = [this](HloInstruction* producer) {
-      if (producer->opcode() == HloOpcode::kBroadcast) {
+      if (producer->opcode() == HloOpcode::kBroadcast ||
+          producer->opcode() == HloOpcode::kBroadcastDimOne) {
         return true;
       }
       if (producer->opcode() == HloOpcode::kConstant &&
diff --git a/tensorflow/compiler/xla/service/interpreter/BUILD b/tensorflow/compiler/xla/service/interpreter/BUILD
index 0db3863f2428cf0c9a66a928d54f774e39a18539..45505484951abfcee93a62fec7a99e86cbb9150c 100644
--- a/tensorflow/compiler/xla/service/interpreter/BUILD
+++ b/tensorflow/compiler/xla/service/interpreter/BUILD
@@ -120,14 +120,3 @@ cc_library(
         "//tensorflow/core:stream_executor_no_cuda",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-)
diff --git a/tensorflow/compiler/xla/service/llvm_ir/BUILD b/tensorflow/compiler/xla/service/llvm_ir/BUILD
index 37261ed1e665ebed9685751161a412ad114a9e96..f1e7fc29532ce7e6841010a5258f4000a7c70383 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/BUILD
+++ b/tensorflow/compiler/xla/service/llvm_ir/BUILD
@@ -169,17 +169,3 @@ cc_library(
         "@llvm//:core",
     ],
 )
-
-# -----------------------------------------------------------------------------
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc
index 2a282f3be79f847a6569416794d1a2a3fcd69148..ec04239b4f9112134ba876fdfbb3905a3baf1f72 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc
@@ -34,6 +34,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/casts.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/logging.h"
@@ -762,7 +763,7 @@ void InitializeLLVMCommandLineOptions(const HloModuleConfig& config) {
     fake_argv_storage.push_back("");
     for (const auto& it : options) {
       // Skip options the XLA backend itself consumes.
-      if (!tensorflow::StringPiece(it.first).starts_with("xla_")) {
+      if (!tensorflow::str_util::StartsWith(it.first, "xla_")) {
         if (it.second.empty()) {
           fake_argv_storage.push_back(it.first);
         } else {
diff --git a/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.cc b/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.cc
index b6b918ec78a27b90325f72eea14b97f9aee43c54..3978acc132f34b8b195d3772ccf71d0d467984db 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.cc
@@ -88,12 +88,12 @@ LoopEmitter::LoopEmitter(const ElementGenerator& target_element_generator,
   }
 }
 
-IrArray::Index LoopEmitter::EmitIndexAndSetExitBasicBlock(
+std::vector<IrArray::Index> LoopEmitter::EmitIndexAndSetExitBasicBlock(
     tensorflow::StringPiece loop_name) {
   if (ShapeUtil::IsScalar(shape_)) {
     // No loop needed, so set exit_bb_ to nullptr.
     exit_bb_ = nullptr;
-    return IrArray::Index();
+    return {IrArray::Index()};
   }
 
   // Create loop nest with one for-loop for each dimension of the target shape.
@@ -121,12 +121,14 @@ IrArray::Index LoopEmitter::EmitIndexAndSetExitBasicBlock(
   exit_bb_ = loop_nest.GetOuterLoopExitBasicBlock();
   CHECK_NOTNULL(exit_bb_);
 
-  return array_index;
+  return {array_index};
 }
 
 tensorflow::Status LoopEmitter::EmitLoop(tensorflow::StringPiece loop_name) {
-  IrArray::Index array_index = EmitIndexAndSetExitBasicBlock(loop_name);
-  TF_RETURN_IF_ERROR(body_emitter_(array_index));
+  for (const IrArray::Index& array_index :
+       EmitIndexAndSetExitBasicBlock(loop_name)) {
+    TF_RETURN_IF_ERROR(body_emitter_(array_index));
+  }
 
   // Set the insertion point of ir_builder_ to the loop exit, so that
   // code emitted for later instructions will be correctly placed.
diff --git a/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.h b/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.h
index 0fc528439a0d5bf8382dfcf2d8b3051f8900bf1d..9ff497aecd0bc964c929205c7fd410cca87d9b77 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.h
@@ -63,11 +63,12 @@ class LoopEmitter {
 
   // Emits a loop nest (with a yet-to-be-filled loop body) that iterates through
   // every element in the given shape. Returns the multi-dimensional index that
-  // specifies the element.
-  IrArray::Index EmitIndexAndSetExitBasicBlock() {
+  // specifies the element, will return multiple indices if the loop is
+  // unrolled.
+  std::vector<IrArray::Index> EmitIndexAndSetExitBasicBlock() {
     return EmitIndexAndSetExitBasicBlock(/*loop_name=*/"");
   }
-  virtual IrArray::Index EmitIndexAndSetExitBasicBlock(
+  virtual std::vector<IrArray::Index> EmitIndexAndSetExitBasicBlock(
       tensorflow::StringPiece loop_name);
 
   // Emits a complete loop nest for every element in the given shape.
diff --git a/tensorflow/compiler/xla/service/reshape_mover.cc b/tensorflow/compiler/xla/service/reshape_mover.cc
index f15117f45c689f2d717fbfe6191b510586449bc4..49ec38eb62c7b51c7a2d301d882cef032b288036 100644
--- a/tensorflow/compiler/xla/service/reshape_mover.cc
+++ b/tensorflow/compiler/xla/service/reshape_mover.cc
@@ -53,16 +53,8 @@ bool IsReshapeOrTranspose(const HloInstruction* instruction) {
          instruction->opcode() == HloOpcode::kTranspose;
 }
 
-// Returns true if `a` is a broadcast instruction to target shape `shape` and
-// its operand is a scalar.
-bool IsBroadcastScalarToShape(const HloInstruction* a, const Shape& shape) {
-  return a->opcode() == HloOpcode::kBroadcast &&
-         ShapeUtil::SameDimensions(a->shape(), shape) &&
-         ShapeUtil::IsScalar(a->operand(0)->shape());
-}
-
-// Returns true iff `instruction` can change its shape simply by adjusting
-// metadata.
+// Returns true if `instruction` can change its shape simply by adjusting
+// metadata or if `instruction` is a broadcast of a scalar value.
 bool CanTriviallyChangeShape(const HloInstruction* instruction) {
   // NOTE: Technically a sequence of reshape(reshape(constant)) is also
   // trivially reshapable, so we might be tempted to simply recurse if
@@ -97,19 +89,30 @@ bool CanTriviallyChangeShape(const HloInstruction* instruction) {
     return true;
   }
 
+  // A broadcase of scalar can trivially change its shape.
+  if (instruction->opcode() == HloOpcode::kBroadcast &&
+      ShapeUtil::IsScalar(instruction->operand(0)->shape())) {
+    return true;
+  }
+
   return false;
 }
 
-// Finds the first non-scalar operand of an instruction that is a non-trivial
-// reshape or transpose. Returns the operand if it is found or nullptr if not
-// found.
+// Returns true iff `instruction` is a reshape/transpose instruction for which
+// a shape change is nontrivial.
+bool IsNontrivialReshape(const HloInstruction* instruction) {
+  return !ShapeUtil::IsScalar(instruction->shape()) &&
+         IsReshapeOrTranspose(instruction) &&
+         !CanTriviallyChangeShape(instruction->operand(0));
+}
+
+// Finds the first operand of an instruction that is a non-trivial reshape or
+// transpose. Returns such an operand or nullptr if not found.
 HloInstruction* FirstNonScalarAndNonTrivialReshapeOperand(
     const HloInstruction* hlo) {
   for (HloInstruction* operand : hlo->operands()) {
-    if (!ShapeUtil::IsScalar(operand->shape()) &&
-        IsReshapeOrTranspose(operand) &&
-        !CanTriviallyChangeShape(operand->operand(0))) {
-      VLOG(5) << "Found first non-scalar and non-trivial reshape operand of "
+    if (IsNontrivialReshape(operand)) {
+      VLOG(5) << "Found first non-trivial reshape operand of "
               << hlo->ToString(HloPrintOptions().set_print_metadata(false))
               << ":\n\t"
               << operand->ToString(HloPrintOptions().set_print_metadata(false));
@@ -119,7 +122,7 @@ HloInstruction* FirstNonScalarAndNonTrivialReshapeOperand(
   return nullptr;
 }
 
-// Returns whether `a` and `b` are equivalent for the purposes of this pass.
+// Returns whether `a` and `b` are equivalent reshapes/transposes.
 bool AreEquivalentReshapes(const HloInstruction* a, const HloInstruction* b) {
   if (a->opcode() != b->opcode() ||
       !ShapeUtil::SameDimensions(a->shape(), b->shape())) {
@@ -136,85 +139,14 @@ bool AreEquivalentReshapes(const HloInstruction* a, const HloInstruction* b) {
   }
 }
 
-// Returns true if all operands of `instruction` can easily change shape.
-// Operands can easily change shape if they are all reshapes/transposes to and
-// from the same shape. Additionally, operands like constant, rng, and any
-// scalar change shape with only an adjustment of metadata.
-bool AllOperandsHaveEasyShapeChanges(
-    const HloInstruction* instruction,
-    const HloInstruction* first_reshape_operand) {
-  auto print_no_metadata = HloPrintOptions().set_print_metadata(false);
-  VLOG(3) << "** Checking whether all operands have easy shape changes: "
-          << instruction->ToString(print_no_metadata);
-  // Check whether all operands:
-  //    0. Have the same dimensions as the output -- if not, it may be
-  //       implicitly broadcast, which can confound the movement's
-  //       correctness.
-  //
-  // And one of the following:
-  //    1. Are reshapes or transposes that have the same input and
-  //       output shapes as all other reshaped or transposed operands.
-  //     or
-  //    2. Are one of kConstant, kRng, and scalars that can change shape
-  //    trivially,
-  //     or
-  //    3. Are broadcast with a scalar operand.
-  for (const HloInstruction* operand : instruction->operands()) {
-    if (!ShapeUtil::SameDimensions(operand->shape(), instruction->shape())) {
-      VLOG(5) << "Operand shape differs from output shape; may be "
-                 "implicitly broadcast, so preventing "
-                 "movement\n\toperand: "
-              << operand->ToString(print_no_metadata) << "\n\tinstruction: "
-              << instruction->ToString(print_no_metadata);
-      return false;
-    }
-
-    // Skip the rest checks if the current operand is first_reshape_operand
-    // itself.
-    if (first_reshape_operand == operand) {
-      continue;
-    }
-
-    if (AreEquivalentReshapes(first_reshape_operand, operand)) {
-      VLOG(5) << "Are equivalent reshapes:\n\tfirst_reshape_operand: "
-              << first_reshape_operand->ToString(print_no_metadata)
-              << "\n\toperand: " << operand->ToString(print_no_metadata);
-      continue;
-    }
-
-    if (CanTriviallyChangeShape(operand)) {
-      VLOG(5) << "Operand can trivially change shape: "
-              << operand->ToString(print_no_metadata);
-      continue;
-    }
-
-    if (IsBroadcastScalarToShape(operand, first_reshape_operand->shape())) {
-      VLOG(5) << "Broadcast scalar to shape: "
-              << operand->ToString(print_no_metadata);
-      continue;
-    }
-
-    // TODO(someone): Look into supporting general ops for the operands as
-    // well.
-    VLOG(5) << "Operand is neither equalivant to the first Reshape operand"
-               "nor can trivially change shape: "
-            << operand->ToString(print_no_metadata);
-    return false;
-  }
-
-  VLOG(3) << "All operands have easy shape changes: "
-          << instruction->ToString(print_no_metadata);
-  return true;
-}
-
 // This function is called once we've decided to sink reshape/transpose operands
 // across an instruction. It returns an updated `operand` with a shape that
 // plays nicely with `new_operand_shape`; either it has the same shape (of the
 // correct type), or it is a scalar that may be implicitly broadcast.
-HloInstruction* UpdateOperand(HloComputation* computation,
-                              const HloInstruction* first_reshape_operand,
+HloInstruction* UpdateOperand(const HloInstruction* first_reshape_operand,
                               const Shape& new_operand_shape,
                               HloInstruction* operand) {
+  HloComputation* computation = operand->parent();
   const PrimitiveType element_type = operand->shape().element_type();
   const Shape new_shape =
       ShapeUtil::ChangeElementType(new_operand_shape, element_type);
@@ -245,42 +177,24 @@ HloInstruction* UpdateOperand(HloComputation* computation,
       VLOG(5) << "Using existing operand of kReshape or kTranspose";
       return operand->mutable_operand(0);
     }
-    case HloOpcode::kBroadcast:
-      CHECK(IsBroadcastScalarToShape(operand, first_reshape_operand->shape()));
-      VLOG(5) << "Changing broadcast";
-      return computation->AddInstruction(
+    case HloOpcode::kBroadcast: {
+      CHECK(ShapeUtil::IsScalar(operand->operand(0)->shape()));
+      HloInstruction* inst = computation->AddInstruction(
           operand->CloneWithNewOperands(new_shape, operand->operands()));
+      VLOG(5) << "Changing broadcast from " << operand->ToString() << " to "
+              << inst->ToString();
+      return inst;
+    }
 
     default:
       LOG(FATAL) << "Unexpected operand opcode during update: " << operand;
   }
 }
 
-// Try to sink any reshape or transpose operands of `instruction` across it. We
-// do so if `instruction` is elementwise and all operands are either equivalent
-// reshapes/transposes or are trivially reshapable.
-StatusOr<bool> TrySinkReshapeOrTranspose(HloComputation* computation,
-                                         HloInstruction* instruction) {
-  // Only perform sinks for live elementwise instructions with operands.
-  const bool is_dead = instruction->user_count() == 0 &&
-                       instruction != computation->root_instruction();
-  if (!instruction->IsElementwise() || instruction->operands().empty() ||
-      is_dead) {
-    return false;
-  }
-
-  // Only perform sinks if there are any nontrivial reshape/transpose operands.
-  const HloInstruction* first_reshape_operand =
-      FirstNonScalarAndNonTrivialReshapeOperand(instruction);
-  if (!first_reshape_operand) {
-    return false;
-  }
-
-  // Only perform sinks if all operands can easily change shape.
-  if (!AllOperandsHaveEasyShapeChanges(instruction, first_reshape_operand)) {
-    return false;
-  }
-
+// Actually performs the reshape-move transformation -- that is, sinks the
+// reshape or transpose operands of `instruction` across it.
+StatusOr<bool> PerformSinkReshapeOrTranspose(
+    HloInstruction* instruction, const HloInstruction* first_reshape_operand) {
   auto print_no_metadata = HloPrintOptions().set_print_metadata(false);
   // At this point we've decided to sink reshape/transpose operands.
   const Shape& new_operand_shape = first_reshape_operand->operand(0)->shape();
@@ -301,8 +215,8 @@ StatusOr<bool> TrySinkReshapeOrTranspose(HloComputation* computation,
     }
     VLOG(3) << "Updating operand #" << i << ": "
             << operands[i]->ToString(print_no_metadata);
-    operands[i] = UpdateOperand(computation, first_reshape_operand,
-                                new_operand_shape, operands[i]);
+    operands[i] =
+        UpdateOperand(first_reshape_operand, new_operand_shape, operands[i]);
   }
   if (HloOpcode::kFusion == instruction->opcode()) {
     // Here we already know `instruction` is elementwise, and no operand is
@@ -314,6 +228,7 @@ StatusOr<bool> TrySinkReshapeOrTranspose(HloComputation* computation,
       *shape->mutable_layout() = new_operand_shape.layout();
     }
   }
+  HloComputation* computation = instruction->parent();
   HloInstruction* new_elementwise =
       computation->AddInstruction(instruction->CloneWithNewOperands(
           // `instruction` may change the element type, e.g., from
@@ -348,6 +263,141 @@ StatusOr<bool> TrySinkReshapeOrTranspose(HloComputation* computation,
   return true;
 }
 
+// Returns true if the instruction is a reshape-move candidate.
+//
+// An instruction is a reshape-move candidate if the instruction is elementwise,
+// has at least one nontrivial reshape/transpose operand, and its operands are
+// either trivially reshapable or are equivalent nontrivial reshapes/transposes.
+bool IsReshapeMoveCandidate(HloInstruction* instruction) {
+  auto print_no_metadata = HloPrintOptions().set_print_metadata(false);
+  VLOG(5) << "** Checking instruction: "
+          << instruction->ToString(print_no_metadata);
+
+  // Only perform reshape-move for live elementwise instructions with operands.
+  const bool is_dead = instruction->user_count() == 0 &&
+                       instruction != instruction->parent()->root_instruction();
+  if (!instruction->IsElementwise() || instruction->operands().empty() ||
+      is_dead) {
+    return false;
+  }
+
+  // Check whether all operands:
+  //    0. Have the same dimensions as the output -- if not, they may be
+  //       implicitly broadcast, which can confound the movement's
+  //       correctness.
+  //
+  // And one of the following:
+  //    1. Are reshapes or transposes that have the same input and
+  //       output shapes as all other reshaped or transposed operands.
+  //     or
+  //    2. Are one of kConstant, kRng, broadcast of a scalar value, and scalars
+  //     that can change shape trivially.
+  const HloInstruction* first_reshape_operand = nullptr;
+  for (const HloInstruction* operand : instruction->operands()) {
+    if (!ShapeUtil::SameDimensions(operand->shape(), instruction->shape())) {
+      VLOG(5) << "Operand shape differs from output shape; may be "
+                 "implicitly broadcast, so preventing "
+                 "movement\n\toperand: "
+              << operand->ToString(print_no_metadata) << "\n\tinstruction: "
+              << instruction->ToString(print_no_metadata);
+      return false;
+    }
+
+    if (CanTriviallyChangeShape(operand)) {
+      VLOG(5) << "Operand can trivially change shape: "
+              << operand->ToString(print_no_metadata);
+      continue;
+    }
+
+    if (!IsNontrivialReshape(operand)) {
+      VLOG(5) << "Operand can't trivially change shape: "
+              << operand->ToString(print_no_metadata);
+      return false;
+    }
+
+    if (first_reshape_operand == nullptr) {
+      first_reshape_operand = operand;
+      VLOG(5) << "First reshape operand "
+              << operand->ToString(print_no_metadata);
+    } else if (AreEquivalentReshapes(first_reshape_operand, operand)) {
+      VLOG(5)
+          << "Operand is an equivalent reshape of the first reshape operand "
+          << operand->ToString(print_no_metadata);
+    } else {
+      // TODO(someone): Look into supporting general ops for the operands as
+      // well.
+      VLOG(5) << "Operand is a reshape but is not equivalent to the first "
+                 "Reshape operand"
+              << operand->ToString(print_no_metadata);
+      return false;
+    }
+  }
+
+  if (first_reshape_operand) {
+    VLOG(5) << "All operands have easy shape changes: "
+            << instruction->ToString(print_no_metadata);
+  }
+
+  return first_reshape_operand != nullptr;
+}
+
+// Reshape-moves all qualifying instructions in reshape_candidates.  Returns
+// true if it makes changes.
+//
+// `reshape_candidates` is a set of HloInstructions with nontrivial reshape
+// operands, and a instruction in the set can be reshape-moved iff all the users
+// of its nontrivial reshape operands can also be reshaped-moved.
+//
+// The algorithm here iteratively finds the nontrivial operands with users that
+// are outside the set of `reshape_candidates`, and removes their users from
+// `reshape_candidates`, until either `reshape_candidates` becomes empty or none
+// of the remaining nontrivial operands have users outside `reshape_candidates`.
+// In the later case, all the remaining instructions in `reshape_candidates`
+// are reshape-moved and the routine returns true.
+StatusOr<bool> TryReshapeMoveOnCandidates(
+    HloInstructionSet* reshape_candidates) {
+  bool removed = true;
+  while (!reshape_candidates->empty() && removed) {
+    if (VLOG_IS_ON(5)) {
+      for (const HloInstruction* instruction : *reshape_candidates) {
+        VLOG(5) << "candidate " << instruction->ToString();
+      }
+    }
+    ConstHloInstructionSet nontrivial_operands;
+    for (const HloInstruction* instruction : *reshape_candidates) {
+      for (const auto* operand : instruction->operands()) {
+        if (IsNontrivialReshape(operand)) {
+          nontrivial_operands.insert(operand);
+        }
+      }
+    }
+
+    removed = false;
+    for (auto operand : nontrivial_operands) {
+      if (c_any_of(operand->users(), [&](HloInstruction* user) {
+            return !reshape_candidates->count(user);
+          })) {
+        for (auto* user : operand->users()) {
+          removed |= reshape_candidates->erase(user) > 0;
+        }
+      }
+    }
+  }
+
+  if (reshape_candidates->empty()) {
+    return false;
+  }
+  for (HloInstruction* instruction : *reshape_candidates) {
+    const HloInstruction* first_reshape_operand =
+        FirstNonScalarAndNonTrivialReshapeOperand(instruction);
+    TF_ASSIGN_OR_RETURN(
+        bool did_change,
+        PerformSinkReshapeOrTranspose(instruction, first_reshape_operand));
+    CHECK(did_change);
+  }
+  return true;
+}
+
 }  // namespace
 
 StatusOr<bool> ReshapeMover::Run(HloModule* module) {
@@ -355,11 +405,15 @@ StatusOr<bool> ReshapeMover::Run(HloModule* module) {
   VLOG(2) << "Pre ReshapeMover HLO:";
   XLA_VLOG_LINES(2, module->ToString());
   for (auto* comp : module->MakeNonfusionComputations()) {
-    for (HloInstruction* instruction : comp->MakeInstructionPostOrder()) {
-      TF_ASSIGN_OR_RETURN(bool did_change,
-                          TrySinkReshapeOrTranspose(comp, instruction));
-      changed |= did_change;
+    HloInstructionSet reshape_candidates;
+    for (HloInstruction* instruction : comp->instructions()) {
+      if (IsReshapeMoveCandidate(instruction)) {
+        reshape_candidates.insert(instruction);
+      }
     }
+    TF_ASSIGN_OR_RETURN(bool did_change,
+                        TryReshapeMoveOnCandidates(&reshape_candidates));
+    changed |= did_change;
   }
   VLOG(2) << "Post ReshapeMover HLO:";
   XLA_VLOG_LINES(2, module->ToString());
diff --git a/tensorflow/compiler/xla/service/reshape_mover_test.cc b/tensorflow/compiler/xla/service/reshape_mover_test.cc
index 4e0a0a8832379402edfc231ea84221448d70bac2..094f7319f462a71f4bfe972771a1de4aedbb8ee3 100644
--- a/tensorflow/compiler/xla/service/reshape_mover_test.cc
+++ b/tensorflow/compiler/xla/service/reshape_mover_test.cc
@@ -564,15 +564,15 @@ TEST_F(ReshapeMoverTest, SinkTransposeAcrossBroadcastScalar) {
   const string hlo_string = R"(
     HloModule TransposeMulInversedTransposeModule
     ENTRY TransposeMulInversedTranspose {
-      src0 = f32[1,20,8,32]{3,2,1,0} parameter(0)
-      transpose0 = f32[1,8,20,32]{3,2,1,0} transpose(src0), dimensions={0,2,1,3}
+      src0 = f32[20,8]{1,0} parameter(0)
+      transpose0 = f32[8,20]{1,0} transpose(src0), dimensions={1,0}
       src1 = f32[] parameter(1)
-      broadcast0 = f32[1,8,20,32]{3,2,1,0} broadcast(src1), dimensions={}
-      ROOT multiply0 = f32[1,8,20,32]{3,2,1,0} multiply(transpose0, broadcast0)
+      broadcast0 = f32[8,20]{1,0} broadcast(src1), dimensions={}
+      ROOT multiply0 = f32[8,20]{1,0} multiply(transpose0, broadcast0)
     }
   )";
 
-  ParseAndVerifyModule(hlo_string.c_str());
+  ParseAndVerifyModule(hlo_string);
   TF_ASSERT_OK_AND_ASSIGN(bool changed, ReshapeMover().Run(&module()));
   EXPECT_TRUE(changed);
 
@@ -580,5 +580,75 @@ TEST_F(ReshapeMoverTest, SinkTransposeAcrossBroadcastScalar) {
               op::Transpose(op::Multiply()));
 }
 
+TEST_F(ReshapeMoverTest, ReshapeWithUsersOutsideCandidatesNotSink) {
+  const string hlo_string = R"(
+    HloModule ReshapeWithUsersOutsideCandidates
+    ENTRY ReshapeWithMultipleUsers {
+      param0 = f32[20,8]{1,0} parameter(0)
+      reshape0 = f32[8,20]{1,0} reshape(param0)
+      param1 = f32[] parameter(1)
+      broadcast0 = f32[8,20]{1,0} broadcast(param1), dimensions={}
+      param2 = f32[20,8]{1,0} parameter(2)
+      reshape1 = f32[8,20]{1,0} reshape(param2)
+      param3 = f32[20,8]{1,0} parameter(3)
+      reshape2 = f32[8,20]{1,0} reshape(param3)
+      param4 = f32[8,20]{1,0} parameter(4)
+      add0 = f32[8,20]{1,0} add(reshape0, broadcast0)
+      add1 = f32[8,20]{1,0} add(reshape0, reshape1)
+      add2 = f32[8,20]{1,0} add(reshape1, param4)
+      ROOT tuple = (f32[8,20]{1,0},f32[8,20]{1,0},
+        f32[8,20]{1,0}) tuple(add0, add1, add2)
+    }
+  )";
+
+  ParseAndVerifyModule(hlo_string);
+  TF_ASSERT_OK_AND_ASSIGN(bool changed, ReshapeMover().Run(&module()));
+  EXPECT_FALSE(changed);
+}
+
+TEST_F(ReshapeMoverTest, ReshapeNoUsersOutsideCandidatesSink1) {
+  const string hlo_string = R"(
+    HloModule ReshapeNoUsersOutsideCandidates1
+    ENTRY ReshapeWithMultipleUsers1 {
+      param0 = f32[20,8]{1,0} parameter(0)
+      reshape0 = f32[8,20]{1,0} reshape(param0)
+      param1 = f32[] parameter(1)
+      broadcast0 = f32[8,20]{1,0} broadcast(param1), dimensions={}
+      param2 = f32[20,8]{1,0} parameter(2)
+      reshape1 = f32[8,20]{1,0} reshape(param2)
+      param3 = f32[20,8]{1,0} parameter(3)
+      reshape2 = f32[8,20]{1,0} reshape(param3)
+      add0 = f32[8,20]{1,0} add(reshape0, broadcast0)
+      add1 = f32[8,20]{1,0} add(reshape0, reshape1)
+      add2 = f32[8,20]{1,0} add(reshape1, reshape2)
+      ROOT tuple = (f32[8,20]{1,0},f32[8,20]{1,0},
+        f32[8,20]{1,0}) tuple(add0, add1, add2)
+    }
+  )";
+
+  ParseAndVerifyModule(hlo_string);
+  TF_ASSERT_OK_AND_ASSIGN(bool changed, ReshapeMover().Run(&module()));
+  EXPECT_TRUE(changed);
+  EXPECT_THAT(module().entry_computation()->root_instruction(),
+              op::Tuple(op::Reshape(), op::Reshape(), op::Reshape()));
+}
+
+TEST_F(ReshapeMoverTest, ReshapeNoUsersOutsideCandidatesSink2) {
+  const string hlo_string = R"(
+    HloModule ReshapeNoUsersOutsideCandidates2
+    ENTRY ReshapeWithMultipleUsers2 {
+      param0 = f32[20,8]{1,0} parameter(0)
+      reshape0 = f32[8,20]{1,0} reshape(param0)
+      ROOT add0 = f32[8,20]{1,0} add(reshape0, reshape0)
+    }
+  )";
+
+  ParseAndVerifyModule(hlo_string);
+  TF_ASSERT_OK_AND_ASSIGN(bool changed, ReshapeMover().Run(&module()));
+  EXPECT_TRUE(changed);
+  EXPECT_THAT(module().entry_computation()->root_instruction(),
+              op::Reshape(op::Add()));
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/service.cc b/tensorflow/compiler/xla/service/service.cc
index 1d379f0d03fa509173ffaf7a69f21da62e9b44e0..70af1c44ea97fd5982b2acda541f48163dcb2896 100644
--- a/tensorflow/compiler/xla/service/service.cc
+++ b/tensorflow/compiler/xla/service/service.cc
@@ -409,6 +409,37 @@ StatusOr<std::vector<std::unique_ptr<Executable>>> Service::BuildExecutables(
   return std::move(executables);
 }
 
+StatusOr<std::vector<std::unique_ptr<Executable>>> Service::BuildExecutables(
+    const std::vector<const HloModuleProto*>& module_protos,
+    std::vector<std::unique_ptr<HloModuleConfig>> module_configs,
+    Backend* backend,
+    std::vector<std::vector<perftools::gputools::StreamExecutor*>> executors,
+    DeviceMemoryAllocator* device_allocator) {
+  VLOG(1) << Printf("BuildExecutable on service %p", this);
+
+  VLOG(1) << "Computations:";
+  for (const HloModuleProto* proto : module_protos) {
+    VLOG(1) << proto->name();
+  }
+
+  CHECK_EQ(module_protos.size(), module_configs.size());
+  std::vector<std::unique_ptr<HloModule>> modules;
+  for (int64 i = 0; i < module_protos.size(); ++i) {
+    const HloModuleProto* proto = module_protos[i];
+    const HloModuleConfig& config = *module_configs[i];
+    TF_ASSIGN_OR_RETURN(auto module,
+                        HloModule::CreateFromProto(*proto, config));
+    modules.push_back(std::move(module));
+  }
+
+  TF_ASSIGN_OR_RETURN(
+      std::vector<std::unique_ptr<Executable>> executables,
+      backend->compiler()->Compile(std::move(modules), std::move(executors),
+                                   device_allocator));
+
+  return std::move(executables);
+}
+
 StatusOr<std::unique_ptr<Executable>> Service::BuildExecutable(
     const VersionedComputationHandle& versioned_handle,
     std::unique_ptr<HloModuleConfig> module_config, Backend* backend,
@@ -703,6 +734,47 @@ tensorflow::Status Service::SetReturnValue(const SetReturnValueRequest* arg,
   return computation->SetReturnValue(arg->operand());
 }
 
+StatusOr<std::vector<perftools::gputools::StreamExecutor*>>
+Service::GetExecutors(const ExecutionOptions& execution_options,
+                      int64 requests_size, int64 request_index) const {
+  if (execution_options.device_handles().empty()) {
+    return FailedPrecondition(
+        "device handles must be given to execute parallel computations");
+  }
+  if (requests_size > 1 && execution_options.device_handles_size() > 1) {
+    return InvalidArgument(
+        "Parallel requests with multiple device handles is not supported. "
+        "Found %lld parallel requests, with request %lld containing %d device "
+        "handles.",
+        requests_size, request_index, execution_options.device_handles_size());
+  }
+  std::vector<perftools::gputools::StreamExecutor*> executors;
+  for (const auto& device_handle : execution_options.device_handles()) {
+    TF_ASSIGN_OR_RETURN(auto replicas,
+                        Replicas(*execute_backend_, device_handle));
+    se::StreamExecutor* executor = replicas[0];
+    CHECK(executor != nullptr);
+    executors.push_back(executor);
+  }
+  return executors;
+}
+
+StatusOr<std::vector<std::vector<const ShapedBuffer*>>> Service::GetArguments(
+    const ExecutionOptions& execution_options,
+    tensorflow::gtl::ArraySlice<const GlobalDataHandle*> arguments) {
+  // Resolve the allocations for the arguments of the computation, and create
+  // a vector of device memory offsets for the arguments from the allocations.
+  // In the case of partitioned computations, assume all arguments go on the
+  // zeroth core.
+  TF_ASSIGN_OR_RETURN(
+      auto replicas,
+      Replicas(*execute_backend_, execution_options.device_handles(0)));
+  TF_ASSIGN_OR_RETURN(
+      std::vector<std::vector<const ShapedBuffer*>> replicated_arguments,
+      ResolveAndValidateArguments(arguments, replicas));
+  return replicated_arguments;
+}
+
 tensorflow::Status Service::ExecuteParallel(const ExecuteParallelRequest* arg,
                                             ExecuteParallelResponse* result) {
   VLOG(1) << "running execute-parallel request: " << arg->ShortDebugString();
@@ -731,26 +803,10 @@ tensorflow::Status Service::ExecuteParallel(const ExecuteParallelRequest* arg,
     // is one of the executors to run the replicated computation.
     const ExecutionOptions& execution_options =
         arg->requests(i).execution_options();
-    if (execution_options.device_handles().empty()) {
-      return FailedPrecondition(
-          "device handles must be given to execute parallel computations");
-    }
-    if (arg->requests_size() > 1 &&
-        execution_options.device_handles_size() > 1) {
-      return InvalidArgument(
-          "Parallel requests with multiple device handles is not supported. "
-          "Found %d parallel requests, with request %lld containing %d device "
-          "handles.",
-          arg->requests_size(), i, execution_options.device_handles_size());
-    }
-    std::vector<perftools::gputools::StreamExecutor*> executors;
-    for (const auto& device_handle : execution_options.device_handles()) {
-      TF_ASSIGN_OR_RETURN(auto replicas,
-                          Replicas(*execute_backend_, device_handle));
-      se::StreamExecutor* executor = replicas[0];
-      CHECK(executor != nullptr);
-      executors.push_back(executor);
-    }
+
+    // Get the executors.
+    TF_ASSIGN_OR_RETURN(auto executors, GetExecutors(execution_options,
+                                                     arg->requests_size(), i));
 
     // Resolve the UserComputation object associated with the requested
     // computation and compute the program shape.
@@ -767,16 +823,9 @@ tensorflow::Status Service::ExecuteParallel(const ExecuteParallelRequest* arg,
         std::shared_ptr<const ProgramShape> program_shape,
         user_computation->ComputeProgramShape(versioned_handle.version));
 
-    // Resolve the allocations for the arguments of the computation, and create
-    // a vector of device memory offsets for the arguments from the allocations.
-    // In the case of partitioned computations, assume all arguments go on the
-    // zeroth core.
-    TF_ASSIGN_OR_RETURN(
-        auto replicas,
-        Replicas(*execute_backend_, execution_options.device_handles(0)));
-    TF_ASSIGN_OR_RETURN(
-        std::vector<std::vector<const ShapedBuffer*>> replicated_arguments,
-        ResolveAndValidateArguments(request.arguments(), replicas));
+    // Get the replicated arguments.
+    TF_ASSIGN_OR_RETURN(auto replicated_arguments,
+                        GetArguments(execution_options, request.arguments()));
 
     // Create an HloModuleConfig object for the computation, given the shape of
     // the program and the argument allocations. Here, we care only about the
@@ -837,6 +886,107 @@ tensorflow::Status Service::ExecuteParallel(const ExecuteParallelRequest* arg,
   return tensorflow::Status::OK();
 }
 
+tensorflow::Status Service::ExecuteGraphParallel(
+    const ExecuteGraphParallelRequest* arg, ExecuteParallelResponse* result) {
+  VLOG(1) << "running execute-graph-parallel request";
+
+  std::vector<std::vector<std::vector<const ShapedBuffer*>>> all_arguments;
+  std::vector<std::vector<perftools::gputools::StreamExecutor*>> all_executors;
+  std::vector<const HloModuleProto*> module_protos;
+  std::vector<std::unique_ptr<HloModuleConfig>> module_configs;
+  std::vector<string> computation_names;
+  std::vector<DeviceHandle> device_handles;
+
+  int num_requested_devices =
+      std::accumulate(arg->requests().begin(), arg->requests().end(), 0,
+                      [](int a, const ExecuteGraphRequest& r) -> int {
+                        return a + r.execution_options().device_handles_size();
+                      });
+  if (num_requested_devices * options_.number_of_replicas() >
+      execute_backend_->device_count()) {
+    return FailedPrecondition(
+        "there are not enough stream executors to execute %d computations",
+        num_requested_devices);
+  }
+
+  for (int64 i = 0; i < arg->requests_size(); ++i) {
+    // Get the stream executor for the i'th computation. This stream executor
+    // is one of the executors to run the replicated computation.
+    const ExecutionOptions& execution_options =
+        arg->requests(i).execution_options();
+    const ExecuteGraphRequest& request = arg->requests(i);
+    TF_RET_CHECK(request.has_computation()) << "computations may not be empty";
+    TF_RET_CHECK(request.computation().has_program_shape())
+        << "programe shape may not be empty";
+
+    // Get the executors.
+    TF_ASSIGN_OR_RETURN(auto executors, GetExecutors(execution_options,
+                                                     arg->requests_size(), i));
+
+    // Get the replicated arguments.
+    TF_ASSIGN_OR_RETURN(auto replicated_arguments,
+                        GetArguments(execution_options, request.arguments()));
+
+    // Create an HloModuleConfig object for the computation, given the shape of
+    // the program and the argument allocations. Here, we care only about the
+    // shapes of the arguments, so, it is sufficient to use the arguments of
+    // replica 0.
+    TF_ASSIGN_OR_RETURN(
+        std::unique_ptr<HloModuleConfig> module_config,
+        CreateModuleConfig(request.computation().program_shape(),
+                           replicated_arguments.front(),
+                           request.execution_options(),
+                           /*user_computation=*/nullptr));
+    VLOG(3)
+        << "ExecuteGraphParallel created HloModuleConfig computation layout: "
+        << module_config->entry_computation_layout().ToString();
+
+    // Adds to the vectors to build and execute the computations after the loop.
+    all_arguments.push_back(replicated_arguments);
+    all_arguments.insert(all_arguments.end(), executors.size() - 1, {{}});
+    module_protos.push_back(&request.computation());
+    module_configs.push_back(std::move(module_config));
+    computation_names.insert(computation_names.end(), executors.size(),
+                             request.computation().name());
+    all_executors.push_back(executors);
+    device_handles.insert(device_handles.end(),
+                          execution_options.device_handles().begin(),
+                          execution_options.device_handles().end());
+  }
+
+  // Build the HloModules and compile to generate the executables.
+  //
+  // TODO(jlebar): There's currently no way to pass a device allocator to
+  // ExecuteGraphParallel, so we have to pass a null device_allocator below.
+  TF_ASSIGN_OR_RETURN(std::vector<std::unique_ptr<Executable>> executables,
+                      BuildExecutables(module_protos, std::move(module_configs),
+                                       execute_backend_.get(), all_executors,
+                                       /*device_allocator=*/nullptr));
+  std::vector<Executable*> executable_ptrs;
+  executable_ptrs.reserve(executables.size());
+  for (const auto& executable : executables) {
+    executable_ptrs.push_back(executable.get());
+  }
+
+  // Execute the generated executables in parallel and return the device
+  // handles for each computation's output.
+  ExecutionProfile profile;
+  TF_ASSIGN_OR_RETURN(
+      std::vector<GlobalDataHandle> outputs,
+      ExecuteParallelAndRegisterResult(executable_ptrs, all_arguments,
+                                       execute_backend_.get(), device_handles,
+                                       computation_names, &profile));
+  for (const GlobalDataHandle& output : outputs) {
+    ExecuteResponse response;
+    *response.mutable_output() = output;
+    *response.mutable_profile() = profile;
+    *result->add_responses() = response;
+  }
+
+  VLOG(1) << "successfully completed 'execute-graph-parallel' request";
+  return tensorflow::Status::OK();
+}
+
 tensorflow::Status Service::GetDeviceHandles(const GetDeviceHandlesRequest* arg,
                                              GetDeviceHandlesResponse* result) {
   const int64 available_device_count = execute_backend_->device_count();
@@ -867,6 +1017,20 @@ tensorflow::Status Service::ExecuteOneToN(const ExecuteRequest* arg,
   *parallel_arg.add_requests() = *arg;
   ExecuteParallelResponse parallel_result;
   TF_RETURN_IF_ERROR(ExecuteParallel(&parallel_arg, &parallel_result));
+  return PickParallelResponse(parallel_result, result);
+}
+
+tensorflow::Status Service::ExecuteOneToN(const ExecuteGraphRequest* arg,
+                                          ExecuteResponse* result) {
+  ExecuteGraphParallelRequest parallel_arg;
+  *parallel_arg.add_requests() = *arg;
+  ExecuteParallelResponse parallel_result;
+  TF_RETURN_IF_ERROR(ExecuteGraphParallel(&parallel_arg, &parallel_result));
+  return PickParallelResponse(parallel_result, result);
+}
+
+tensorflow::Status Service::PickParallelResponse(
+    const ExecuteParallelResponse& parallel_result, ExecuteResponse* result) {
   // The "result device" selection is a bit hacky, but better than assuming it
   // is device 0. We have b/76035356 for restructuring the client API to clean
   // up the current asymmetries and support more functionalities.
@@ -994,8 +1158,14 @@ tensorflow::Status Service::ExecuteGraph(const ExecuteGraphRequest* arg,
   if (!arg->has_computation()) {
     return InvalidArgument("computations may not be empty");
   }
+  if (!arg->computation().has_program_shape()) {
+    return InvalidArgument("programe shape may not be empty");
+  }
 
-  // TODO(b/74197823): Handle partitioning.
+  // If we received multiple device handles, we must partition the module.
+  if (arg->execution_options().device_handles_size() > 1) {
+    return ExecuteOneToN(arg, result);
+  }
 
   TF_ASSIGN_OR_RETURN(auto replicas, Replicas(*execute_backend_,
                                               SingleComputationDeviceHandle()));
@@ -1374,6 +1544,50 @@ tensorflow::Status Service::ComputeConstant(const ComputeConstantRequest* arg,
 
   // Since the shape_with_output_layout option in ExecutionOption is
   // non-effective to the Evaluator results, explicit relayout here.
+  //
+  // TODO(b/77824332): Make HloEvaluator take care of the re-layout.
+  if (arg->has_output_layout()) {
+    result_literal = result_literal->Relayout(arg->output_layout());
+  }
+  *result->mutable_literal() = result_literal->ToProto();
+
+  return tensorflow::Status::OK();
+}
+
+tensorflow::Status Service::ComputeConstantGraph(
+    const ComputeConstantGraphRequest* arg, ComputeConstantResponse* result) {
+  if (!arg->has_computation()) {
+    return InvalidArgument("computations may not be empty");
+  }
+  if (!arg->computation().has_program_shape()) {
+    return InvalidArgument("program shape may not be empty");
+  }
+  if (arg->computation().program_shape().parameters_size() != 0) {
+    return InvalidArgument(
+        "constant computation may not depend on any parameters.");
+  }
+
+  ProgramShape program_shape = arg->computation().program_shape();
+  TF_DCHECK_OK(ShapeUtil::ValidateShape(program_shape.result()));
+  if (arg->has_output_layout()) {
+    TF_RETURN_IF_ERROR(LayoutUtil::ValidateLayoutForShape(
+        arg->output_layout(), program_shape.result()));
+  }
+
+  HloModuleConfig config(program_shape);
+
+  TF_ASSIGN_OR_RETURN(std::unique_ptr<HloModule> module,
+                      HloModule::CreateFromProto(arg->computation(), config));
+
+  HloEvaluator evaluator;
+  TF_ASSIGN_OR_RETURN(auto result_literal,
+                      evaluator.Evaluate<std::unique_ptr<Literal>>(
+                          *module, /*arg_literals=*/{}));
+
+  // Since the result layout is non-effective to the Evaluator results, explicit
+  // relayout here.
+  //
+  // TODO(b/77824332): Make HloEvaluator take care of the re-layout.
   if (arg->has_output_layout()) {
     result_literal = result_literal->Relayout(arg->output_layout());
   }
@@ -1445,6 +1659,29 @@ tensorflow::Status Service::GetComputationStats(
   return tensorflow::Status::OK();
 }
 
+tensorflow::Status Service::GetComputationGraphStats(
+    const ComputationGraphStatsRequest* arg, ComputationStatsResponse* result) {
+  HloModuleConfig config;
+  config.set_debug_options(arg->debug_options());
+  TF_ASSIGN_OR_RETURN(std::unique_ptr<HloModule> module,
+                      HloModule::CreateFromProto(arg->computation(), config));
+
+  hlo_graph_dumper::MaybeDumpHloModule(*module,
+                                       "computation statistics subject");
+
+  // Run HLO analysis to get the computation statistics.
+  HloCostAnalysis analysis(
+      execute_backend_->compiler()->ShapeSizeBytesFunction());
+
+  TF_RETURN_IF_ERROR(module->entry_computation()->Accept(&analysis));
+
+  ComputationStats stats;
+  stats.set_flop_count(analysis.flop_count());
+  stats.set_transcendental_count(analysis.transcendental_count());
+  *result->mutable_stats() = stats;
+  return tensorflow::Status::OK();
+}
+
 template <typename RequestT, typename ResponseT>
 tensorflow::Status Service::AddInstruction(
     const RequestT* arg, ResponseT* result,
diff --git a/tensorflow/compiler/xla/service/service.h b/tensorflow/compiler/xla/service/service.h
index 773f0a642dc93899828ef7b2dd4e271fc3d50d05..e399f1ac1904f8d6145f43b0ed12d8018765d9a1 100644
--- a/tensorflow/compiler/xla/service/service.h
+++ b/tensorflow/compiler/xla/service/service.h
@@ -126,6 +126,15 @@ class Service : public ServiceInterface {
   tensorflow::Status ExecuteParallel(const ExecuteParallelRequest* arg,
                                      ExecuteParallelResponse* result) override;
 
+  // Executes one or more computations in parallel with the provided global data
+  // passed as immutable arguments. Returns global data output for each
+  // computation.
+  //
+  // TODO(b/74197823): This is a part of a NOT YET ready refactor.
+  tensorflow::Status ExecuteGraphParallel(
+      const ExecuteGraphParallelRequest* arg,
+      ExecuteParallelResponse* result) override;
+
   // Requests one or more device handles from the target.
   //
   // When N device handles are requested and the number of replicas is R, at
@@ -197,6 +206,9 @@ class Service : public ServiceInterface {
   // Computes the value of a constant expression.
   tensorflow::Status ComputeConstant(const ComputeConstantRequest* arg,
                                      ComputeConstantResponse* result) override;
+  tensorflow::Status ComputeConstantGraph(
+      const ComputeConstantGraphRequest* arg,
+      ComputeConstantResponse* result) override;
 
   // Returns the shape (with layout) of an array associated with a given data
   // handle.
@@ -224,6 +236,13 @@ class Service : public ServiceInterface {
       const ComputationStatsRequest* arg,
       ComputationStatsResponse* result) override;
 
+  // Retrieves the statistics of a computation.
+  //
+  // TODO(b/74197823): This is a part of a NOT YET ready refactor.
+  tensorflow::Status GetComputationGraphStats(
+      const ComputationGraphStatsRequest* arg,
+      ComputationStatsResponse* result) override;
+
   // Snapshots the current state of a computation handle into a serializable
   // protocol buffer form, so it can be loaded via
   // LoadComputationSnapshot.
@@ -262,6 +281,20 @@ class Service : public ServiceInterface {
       const ExecutionOptions& execution_options,
       const UserComputation* user_computation = nullptr);
 
+  // Picks a parallel response and fills the result.
+  Status PickParallelResponse(const ExecuteParallelResponse& parallel_result,
+                              ExecuteResponse* result);
+
+  // Prepare the executors for executing parallel.
+  StatusOr<std::vector<perftools::gputools::StreamExecutor*>> GetExecutors(
+      const ExecutionOptions& execution_options, int64 requests_size,
+      int64 request_index) const;
+
+  // Prepare the arguments for executing parallel.
+  StatusOr<std::vector<std::vector<const ShapedBuffer*>>> GetArguments(
+      const ExecutionOptions& execution_options,
+      tensorflow::gtl::ArraySlice<const GlobalDataHandle*> arguments);
+
  protected:
   friend class LocalExecutable;
 
@@ -270,8 +303,6 @@ class Service : public ServiceInterface {
   Service(const ServiceOptions& options,
           std::unique_ptr<Backend> execute_backend);
 
-  static StatusOr<std::unique_ptr<Backend>> CreateComputeConstantBackend();
-
   // Resolves the given argument handles in the allocation tracker and returns
   // the corresponding allocations for every replica. The function also verifies
   // that each allocation matches the execution platform and device ordinal of
@@ -318,6 +349,12 @@ class Service : public ServiceInterface {
       Backend* backend,
       std::vector<std::vector<perftools::gputools::StreamExecutor*>> executors,
       DeviceMemoryAllocator* device_allocator);
+  StatusOr<std::vector<std::unique_ptr<Executable>>> BuildExecutables(
+      const std::vector<const HloModuleProto*>& module_protos,
+      std::vector<std::unique_ptr<HloModuleConfig>> module_configs,
+      Backend* backend,
+      std::vector<std::vector<perftools::gputools::StreamExecutor*>> executors,
+      DeviceMemoryAllocator* device_allocator);
 
   // Similar to BuildExecutable, but look in the compilation cache for the
   // executable first. If the executable is not in the cache, it is built and
@@ -362,6 +399,8 @@ class Service : public ServiceInterface {
   // will be the result of this computation.
   tensorflow::Status ExecuteOneToN(const ExecuteRequest* arg,
                                    ExecuteResponse* result);
+  tensorflow::Status ExecuteOneToN(const ExecuteGraphRequest* arg,
+                                   ExecuteResponse* result);
 
   // Convenience function which checks whether the given shape_with_layout
   // (presumably passed by the client to set the result layout) is valid for the
@@ -399,8 +438,6 @@ class Service : public ServiceInterface {
   CompilationCache compilation_cache_;
 
   // Backend to compile and execute computations on.
-  //
-  // TODO(b/28616830): Support multiple backends for execution.
   std::unique_ptr<Backend> execute_backend_;
 
   TF_DISALLOW_COPY_AND_ASSIGN(Service);
diff --git a/tensorflow/compiler/xla/service/shape_inference.cc b/tensorflow/compiler/xla/service/shape_inference.cc
index 36456d552d1ed41e192308fec7489a44f8dd5051..77e12d36024dae56003ad4e59b54f9934dfc2c58 100644
--- a/tensorflow/compiler/xla/service/shape_inference.cc
+++ b/tensorflow/compiler/xla/service/shape_inference.cc
@@ -1070,6 +1070,12 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
   for (const HloInstruction* operand : operands) {
     operand_shapes.push_back(&operand->shape());
   }
+  return InferVariadicOpShape(opcode, operand_shapes);
+}
+
+/* static */ StatusOr<Shape> ShapeInference::InferVariadicOpShape(
+    HloOpcode opcode,
+    tensorflow::gtl::ArraySlice<const Shape*> operand_shapes) {
   return InferVariadicOpShape(OpcodeToVariadicOperation(opcode),
                               operand_shapes);
 }
diff --git a/tensorflow/compiler/xla/service/shape_inference.h b/tensorflow/compiler/xla/service/shape_inference.h
index 88830e6d2516cd664dd4e632adf0bdc72e451880..9da2c99b4177f08ece8daabaf2922ddd7e947a1b 100644
--- a/tensorflow/compiler/xla/service/shape_inference.h
+++ b/tensorflow/compiler/xla/service/shape_inference.h
@@ -83,6 +83,9 @@ class ShapeInference {
   static StatusOr<Shape> InferVariadicOpShape(
       VariadicOperation operation,
       tensorflow::gtl::ArraySlice<const Shape*> operand_shapes);
+  static StatusOr<Shape> InferVariadicOpShape(
+      HloOpcode opcode,
+      tensorflow::gtl::ArraySlice<const Shape*> operand_shapes);
   static StatusOr<Shape> InferVariadicOpShape(
       HloOpcode opcode,
       tensorflow::gtl::ArraySlice<const HloInstruction*> operands);
diff --git a/tensorflow/compiler/xla/service/user_computation.cc b/tensorflow/compiler/xla/service/user_computation.cc
index 0dca30a804005c6f536aca5b54af24eb08d4560b..532f7fd5bfc1dffa86638a6bc51832beebd74e1d 100644
--- a/tensorflow/compiler/xla/service/user_computation.cc
+++ b/tensorflow/compiler/xla/service/user_computation.cc
@@ -1284,8 +1284,8 @@ StatusOr<ComputationDataHandle> UserComputation::AddCustomCallInstruction(
     TF_RETURN_IF_ERROR(LookUpRequest(handle).status());
   }
 
-  if (tensorflow::StringPiece(custom_call_request.call_target_name())
-          .starts_with("$")) {
+  if (tensorflow::str_util::StartsWith(custom_call_request.call_target_name(),
+                                       "$")) {
     return InvalidArgument(
         "Invalid custom_call_target \"%s\": Call targets that start with '$' "
         "are reserved for internal use.",
@@ -3491,7 +3491,6 @@ void ComputationLowerer::Visit(
       HloInstruction* operand = lookup_instruction(trace_request.operand());
       hlo_instruction = add_instruction(
           HloInstruction::CreateTrace(trace_request.tag(), operand));
-      operand->set_tracing(hlo_instruction);
       break;
     }
 
diff --git a/tensorflow/compiler/xla/service_interface.h b/tensorflow/compiler/xla/service_interface.h
index d8235113dd800f7bab5ceb70272a598b9dcb1fbe..5b44c26b7c7b082556d9533cf3b3b1b98e5e4b09 100644
--- a/tensorflow/compiler/xla/service_interface.h
+++ b/tensorflow/compiler/xla/service_interface.h
@@ -60,6 +60,10 @@ class ServiceInterface {
   virtual tensorflow::Status ExecuteParallel(
       const ExecuteParallelRequest* arg, ExecuteParallelResponse* result) = 0;
 
+  virtual tensorflow::Status ExecuteGraphParallel(
+      const ExecuteGraphParallelRequest* arg,
+      ExecuteParallelResponse* result) = 0;
+
   virtual tensorflow::Status ExecuteAsync(const ExecuteAsyncRequest* arg,
                                           ExecuteAsyncResponse* result) = 0;
 
@@ -72,6 +76,10 @@ class ServiceInterface {
   virtual tensorflow::Status GetComputationStats(
       const ComputationStatsRequest* arg, ComputationStatsResponse* result) = 0;
 
+  virtual tensorflow::Status GetComputationGraphStats(
+      const ComputationGraphStatsRequest* arg,
+      ComputationStatsResponse* result) = 0;
+
   virtual tensorflow::Status GetComputationShape(
       const GetComputationShapeRequest* arg,
       GetComputationShapeResponse* result) = 0;
@@ -104,6 +112,10 @@ class ServiceInterface {
   virtual tensorflow::Status ComputeConstant(
       const ComputeConstantRequest* arg, ComputeConstantResponse* result) = 0;
 
+  virtual tensorflow::Status ComputeConstantGraph(
+      const ComputeConstantGraphRequest* arg,
+      ComputeConstantResponse* result) = 0;
+
   // Methods used by Computation.
   virtual tensorflow::Status SnapshotComputation(
       const SnapshotComputationRequest* ag,
diff --git a/tensorflow/compiler/xla/shape_util.cc b/tensorflow/compiler/xla/shape_util.cc
index 4f604e6f7cb18c1aaf844967d54e3b0e07e54b34..6825d2476587d037aace043230168f78f4e46344 100644
--- a/tensorflow/compiler/xla/shape_util.cc
+++ b/tensorflow/compiler/xla/shape_util.cc
@@ -502,11 +502,11 @@ namespace {
 StatusOr<Shape> ParseShapeStringInternal(tensorflow::StringPiece* s) {
   tensorflow::str_util::RemoveLeadingWhitespace(s);
 
-  if (s->Consume("(")) {  // Tuple.
+  if (tensorflow::str_util::ConsumePrefix(s, "(")) {  // Tuple.
     std::vector<Shape> shapes;
     bool must_end = false;
     while (true) {
-      if (s->Consume(")")) {
+      if (tensorflow::str_util::ConsumePrefix(s, ")")) {
         break;
       } else if (must_end) {
         return InvalidArgument("Expected end of tuple; got: \"%s\"",
@@ -515,7 +515,7 @@ StatusOr<Shape> ParseShapeStringInternal(tensorflow::StringPiece* s) {
       shapes.emplace_back();
       TF_ASSIGN_OR_RETURN(shapes.back(), ParseShapeStringInternal(s));
       tensorflow::str_util::RemoveLeadingWhitespace(s);
-      must_end = !s->Consume(",");
+      must_end = !tensorflow::str_util::ConsumePrefix(s, ",");
     }
     return ShapeUtil::MakeTupleShape(shapes);
   }
diff --git a/tensorflow/compiler/xla/shape_util.h b/tensorflow/compiler/xla/shape_util.h
index 3e130a02e2ce853ee157e46afb9760f5ff5a5026..1375f981a880392e72db9946c5f006d164447baf 100644
--- a/tensorflow/compiler/xla/shape_util.h
+++ b/tensorflow/compiler/xla/shape_util.h
@@ -28,8 +28,10 @@ limitations under the License.
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
 #include "tensorflow/core/lib/gtl/optional.h"
+#include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/types.h"
 
@@ -583,34 +585,7 @@ class ShapeUtil {
                                        tensorflow::gtl::ArraySlice<int64> count,
                                        tensorflow::gtl::ArraySlice<int64> incr,
                                        const FnType& visitor_function) {
-    if (ShapeUtil::HasZeroElements(shape)) {
-      return Status::OK();
-    }
-    CHECK_EQ(Rank(shape), base.size());
-    CHECK_EQ(incr.size(), base.size());
-    CHECK_EQ(count.size(), base.size());
-    const int64 rank = LayoutUtil::MinorToMajor(shape).size();
-    // Allows handling R0 arrays, such that the visitor function will be called
-    // once with the proper empty indexes.
-    int64 n = -1;
-    std::vector<int64> indexes(base.begin(), base.end());
-    while (n < rank) {
-      TF_ASSIGN_OR_RETURN(bool should_continue, visitor_function(indexes));
-      if (!should_continue) {
-        break;
-      }
-      // Increments dimensions in minor to major order.
-      for (n = 0; n < rank; ++n) {
-        int64 dim = LayoutUtil::Minor(shape.layout(), n);
-        indexes[dim] += incr[dim];
-        if (indexes[dim] < base[dim] + count[dim]) {
-          break;
-        }
-        indexes[dim] = base[dim];
-      }
-    }
-
-    return Status::OK();
+    return ForEachIndexInternal(shape, base, count, incr, visitor_function);
   }
 
   // Simple ergonomic wrapper around ShapeUtil::ForEachIndexWithStatus.
@@ -642,11 +617,83 @@ class ShapeUtil {
         .IgnoreError();
   }
 
+  // A parallel version of ForEachIndex(WithStatus). This can only be used if
+  // the visitor_function is thread-safe and the order of iteration does not
+  // matter.
+  //
+  // visitor_function must be a callable of type
+  // void(ArraySlice<int64>) or compatible.
+  template <typename FnType>
+  static void ForEachIndexParallel(const Shape& shape,
+                                   tensorflow::gtl::ArraySlice<int64> base,
+                                   tensorflow::gtl::ArraySlice<int64> count,
+                                   tensorflow::gtl::ArraySlice<int64> incr,
+                                   const FnType& visitor_function) {
+    // The parallel version of ForEachIndexInternal can never fail.
+    CHECK(ForEachIndexInternal(
+              shape, base, count, incr,
+              [&visitor_function](tensorflow::gtl::ArraySlice<int64> indexes)
+                  -> StatusOr<bool> {
+                visitor_function(indexes);
+                return true;
+              },
+              /*parallel=*/true)
+              .ok());
+  }
+
  private:
   // Validates all of the non-layout properties of the shape -- this is a helper
   // used by both the layout-optional and layout-required public method.
   static Status ValidateShapeWithOptionalLayoutInternal(const Shape& shape);
 
+  template <typename FnType>
+  static Status ForEachIndexInternal(const Shape& shape,
+                                     tensorflow::gtl::ArraySlice<int64> base,
+                                     tensorflow::gtl::ArraySlice<int64> count,
+                                     tensorflow::gtl::ArraySlice<int64> incr,
+                                     const FnType& visitor_function,
+                                     bool parallel = false) {
+    if (ShapeUtil::HasZeroElements(shape)) {
+      return Status::OK();
+    }
+    CHECK_EQ(Rank(shape), base.size());
+    CHECK_EQ(incr.size(), base.size());
+    CHECK_EQ(count.size(), base.size());
+    const int64 rank = LayoutUtil::MinorToMajor(shape).size();
+    // Allows handling R0 arrays, such that the visitor function will be called
+    // once with the proper empty indexes.
+    int64 n = -1;
+    std::vector<int64> indexes(base.begin(), base.end());
+    const int kNumThreads = tensorflow::port::NumSchedulableCPUs();
+    tensorflow::gtl::optional<tensorflow::thread::ThreadPool> pool;
+    if (parallel) {
+      pool.emplace(tensorflow::Env::Default(), "foreach", kNumThreads);
+    }
+
+    while (n < rank) {
+      if (pool != tensorflow::gtl::nullopt) {
+        pool->Schedule(
+            [indexes, &visitor_function] { visitor_function(indexes); });
+      } else {
+        TF_ASSIGN_OR_RETURN(bool should_continue, visitor_function(indexes));
+        if (!should_continue) {
+          break;
+        }
+      }
+      // Increments dimensions in minor to major order.
+      for (n = 0; n < rank; ++n) {
+        int64 dim = LayoutUtil::Minor(shape.layout(), n);
+        indexes[dim] += incr[dim];
+        if (indexes[dim] < base[dim] + count[dim]) {
+          break;
+        }
+        indexes[dim] = base[dim];
+      }
+    }
+
+    return Status::OK();
+  }
+
   TF_DISALLOW_COPY_AND_ASSIGN(ShapeUtil);
 };
 
diff --git a/tensorflow/compiler/xla/shape_util_test.cc b/tensorflow/compiler/xla/shape_util_test.cc
index 424cfe37ea44d64884e08695fd1f49ca1970ca62..13582a2a2678548dfc8e9c329dfb6def9d51fc9d 100644
--- a/tensorflow/compiler/xla/shape_util_test.cc
+++ b/tensorflow/compiler/xla/shape_util_test.cc
@@ -624,6 +624,24 @@ TEST(ShapeUtilTest, ForEachIndexWithStatus) {
   EXPECT_EQ(invocations, 5);
 }
 
+TEST(ShapeUtilTest, ForEachIndexParallel) {
+  Shape shape = ShapeUtil::MakeShape(F32, {10, 10});
+  int64 output[10][10];
+  int init = 5;
+  auto set_func = [&](tensorflow::gtl::ArraySlice<int64> indexes) {
+    output[indexes[0]][indexes[1]] = init + indexes[0] + indexes[1];
+  };
+
+  ShapeUtil::ForEachIndexParallel(shape, /*base=*/{0, 0}, /*count=*/{10, 10},
+                                  /*incr=*/{1, 1}, set_func);
+
+  for (int i = 0; i < 10; ++i) {
+    for (int j = 0; j < 10; ++j) {
+      EXPECT_EQ(output[i][j], init + i + j);
+    }
+  }
+}
+
 TEST(ShapeUtilTest, DimensionsUnmodifiedByReshape_1x1x1x1_to_1x1x1) {
   // All output dimensions should be unmodified. One of the input dimensions is
   // modified because the input rank is larger by one.
diff --git a/tensorflow/compiler/xla/tests/BUILD b/tensorflow/compiler/xla/tests/BUILD
index 5ab25f226415efb3736e2626173b0ebcc182f312..1f90a44d8ba725c1bc7d23b581161f8915ff74fd 100644
--- a/tensorflow/compiler/xla/tests/BUILD
+++ b/tensorflow/compiler/xla/tests/BUILD
@@ -191,6 +191,8 @@ cc_library(
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/service:interpreter_plugin",  # reference backend
+        "//tensorflow/compiler/xla/service:platform_util",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:test_utils",
         "//tensorflow/core:lib",
@@ -347,10 +349,10 @@ xla_test(
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:client_library",
-        "//tensorflow/compiler/xla/client:computation",
-        "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client/lib:arithmetic",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/service:platform_util",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
@@ -413,6 +415,8 @@ xla_test(
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client/lib:arithmetic",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:test_utils",
@@ -488,9 +492,10 @@ xla_test(
     tags = ["enable_for_xla_interpreter"],
     deps = [
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -639,9 +644,9 @@ xla_test(
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
         "//tensorflow/compiler/xla/service:reduce_precision_insertion",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
@@ -777,10 +782,10 @@ xla_test(
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client:padding",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -856,11 +861,11 @@ xla_test(
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/client:computation",
-        "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client/lib:arithmetic",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
@@ -937,8 +942,8 @@ xla_test(
     deps = [
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:reference_util",
-        "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -977,9 +982,8 @@ xla_test(
         "//tensorflow/compiler/xla:reference_util",
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla/client:client_library",
-        "//tensorflow/compiler/xla/client:computation",
-        "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
         "//tensorflow/compiler/xla/service:computation_placer",
         "//tensorflow/compiler/xla/service:device_memory_allocator",
         "//tensorflow/compiler/xla/service:local_service",
@@ -1011,6 +1015,8 @@ xla_test(
         "//tensorflow/compiler/xla/client:computation",
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
@@ -1062,6 +1068,8 @@ xla_test(
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client/lib:arithmetic",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -1084,10 +1092,11 @@ xla_test_library(
         "//tensorflow/compiler/xla:reference_util",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client:padding",
         "//tensorflow/compiler/xla/client/lib:arithmetic",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
@@ -1124,11 +1133,11 @@ xla_test(
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/client:computation",
-        "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client:padding",
         "//tensorflow/compiler/xla/client/lib:arithmetic",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -1265,9 +1274,9 @@ xla_test(
         "//tensorflow/compiler/xla:reference_util",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client/lib:arithmetic",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -1371,11 +1380,10 @@ xla_test(
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/client:computation",
-        "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -1393,8 +1401,8 @@ xla_test(
     deps = [
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:array4d",
-        "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -1442,9 +1450,9 @@ xla_test(
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:test_helpers",
-        "//tensorflow/compiler/xla/client:computation",
-        "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -1476,11 +1484,14 @@ xla_test(
 xla_test(
     name = "bitcast_convert_test",
     srcs = ["bitcast_convert_test.cc"],
+    tags = [
+        "enable_for_xla_interpreter",
+    ],
     deps = [
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -1544,6 +1555,8 @@ xla_test(
         "//tensorflow/compiler/xla/client:computation",
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:global_data",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:test_utils",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -1564,6 +1577,8 @@ xla_test(
         "//tensorflow/compiler/xla/client:computation_builder",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:test_utils",
@@ -1959,16 +1974,15 @@ tf_cc_test(
     ],
 )
 
-# -----------------------------------------------------------------------------
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
+xla_test(
+    name = "test_utils_test",
+    srcs = ["test_utils_test.cc"],
+    deps = [
+        ":local_client_test_base",
+        ":test_utils",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla/client:computation_builder",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/core:test",
+    ],
 )
diff --git a/tensorflow/compiler/xla/tests/batch_normalization_test.cc b/tensorflow/compiler/xla/tests/batch_normalization_test.cc
index 28ab9654997728fbafd6610af840e721e72cce5a..f3dac75a44b948c4b45b80b93e7462073010979e 100644
--- a/tensorflow/compiler/xla/tests/batch_normalization_test.cc
+++ b/tensorflow/compiler/xla/tests/batch_normalization_test.cc
@@ -19,10 +19,10 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/array4d.h"
-#include "tensorflow/compiler/xla/client/computation.h"
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/lib/arithmetic.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/reference_util.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
@@ -69,6 +69,15 @@ class BatchNormalizationTest
     CHECK_EQ(kY, input_array_.width());
   }
 
+  XlaOp CheckShape(XlaBuilder* b, const XlaOp& operand,
+                   const Shape& expected_shape) const {
+    Shape actual_shape = b->GetShape(operand).ConsumeValueOrDie();
+    CHECK(ShapeUtil::Equal(expected_shape, actual_shape))
+        << "want " << ShapeUtil::HumanString(expected_shape) << " got "
+        << ShapeUtil::HumanString(actual_shape);
+    return operand;
+  }
+
   static constexpr int64 kSamples = 3;
   static constexpr int64 kX = 1;
   static constexpr int64 kY = 1;
@@ -91,7 +100,7 @@ INSTANTIATE_TEST_CASE_P(BatchNormalizationTestInstance, BatchNormalizationTest,
 #endif
 
 XLA_TEST_P(BatchNormalizationTest, SubtractInZ) {
-  ComputationBuilder builder(client_, "subtract_in_z_one_sample");
+  XlaBuilder builder("subtract_in_z_one_sample");
   auto x = builder.ConstantLiteral(input_literal_);
   auto y = builder.ConstantR1<float>({3.14, 4.25});
   builder.Sub(x, y, /*broadcast_dimensions=*/{1});
@@ -107,7 +116,7 @@ XLA_TEST_P(BatchNormalizationTest, SubtractInZ) {
 }
 
 XLA_TEST_P(BatchNormalizationTest, SquareTesseractElementwise) {
-  ComputationBuilder builder(client_, "square_tesseract_elementwise");
+  XlaBuilder builder("square_tesseract_elementwise");
   auto x = builder.ConstantLiteral(input_literal_);
   builder.SquareF32(x);
 
@@ -124,9 +133,9 @@ XLA_TEST_P(BatchNormalizationTest, SquareTesseractElementwise) {
 }
 
 XLA_TEST_P(BatchNormalizationTest, SumToZ) {
-  ComputationBuilder builder(client_, "sum_to_z");
+  XlaBuilder builder("sum_to_z");
   auto input_activations = builder.ConstantLiteral(input_literal_);
-  Computation add = CreateScalarAddComputation(F32, &builder);
+  XlaComputation add = CreateScalarAddComputation(F32, &builder);
   // Reduce all but the Z dimension.
   builder.Reduce(input_activations, builder.ConstantR0<float>(0.0f), add,
                  {0, 2, 3});
@@ -136,24 +145,23 @@ XLA_TEST_P(BatchNormalizationTest, SumToZ) {
 }
 
 XLA_TEST_P(BatchNormalizationTest, SquareAndReduce) {
-  ComputationBuilder builder(client_, "square_and_reduce");
+  XlaBuilder builder("square_and_reduce");
   auto input_activations = builder.ConstantLiteral(input_literal_);
   auto set_means = builder.ConstantR1<float>({2.f, 4.2f});
   auto activation_deviations = builder.Sub(input_activations, set_means,
                                            /*broadcast_dimensions=*/{1});
-  Computation add = CreateScalarAddComputation(F32, &builder);
+  XlaComputation add = CreateScalarAddComputation(F32, &builder);
   auto dev_squares = builder.SquareF32(activation_deviations);
-  auto sum_of_squares = builder.Reduce(
-      dev_squares, builder.ConstantR0<float>(0.0f), add, {0, 2, 3});
+  builder.Reduce(dev_squares, builder.ConstantR0<float>(0.0f), add, {0, 2, 3});
 
   std::vector<float> expected = {18, 0.06};
   ComputeAndCompareR1<float>(&builder, expected, {}, error_spec_);
 }
 
 XLA_TEST_P(BatchNormalizationTest, VarianceToStddev) {
-  ComputationBuilder builder(client_, "variance_to_stddev");
+  XlaBuilder builder("variance_to_stddev");
   auto variance = builder.ConstantR1<float>({6.f, .02f});
-  auto sqrt = builder.SqrtF32(variance);
+  builder.SqrtF32(variance);
 
   std::vector<float> expected = {2.44948974f, 0.14142136f};
   ComputeAndCompareR1<float>(&builder, expected, {}, error_spec_);
@@ -162,23 +170,24 @@ XLA_TEST_P(BatchNormalizationTest, VarianceToStddev) {
 // Compare against a forward batch normalization example in the NN spec
 // reference.
 XLA_TEST_P(BatchNormalizationTest, SpecComparisonForward) {
-  ComputationBuilder builder(client_, "batch_normalize_per_spec");
+  XlaBuilder builder("batch_normalize_per_spec");
   auto input_activations =
-      builder.CheckShape(builder.ConstantLiteral(input_literal_),
-                         ShapeUtil::MakeShape(F32, {3, 2, 1, 1}));
+      CheckShape(&builder, builder.ConstantLiteral(input_literal_),
+                 ShapeUtil::MakeShape(F32, {3, 2, 1, 1}));
   auto gamma = builder.ConstantR1<float>({1.0, 1.0});
   auto beta = builder.ConstantR1<float>({0.0, 0.0});
-  Computation add = CreateScalarAddComputation(F32, &builder);
+  XlaComputation add = CreateScalarAddComputation(F32, &builder);
   // Reduce all dimensions except dimension 1.
   Shape TwoElementVectorF32 = ShapeUtil::MakeShape(F32, {2});
-  auto sum = builder.CheckShape(
+  auto sum = CheckShape(
+      &builder,
       builder.Reduce(input_activations, builder.ConstantR0<float>(0.0f), add,
                      /*dimensions_to_reduce=*/{0, 2, 3}),
       TwoElementVectorF32);
   auto input_shape = builder.GetShape(input_activations).ConsumeValueOrDie();
   auto sum_shape = builder.GetShape(sum).ConsumeValueOrDie();
-  auto count = builder.ConstantR0<float>(ShapeUtil::ElementsIn(*input_shape) /
-                                         ShapeUtil::ElementsIn(*sum_shape));
+  auto count = builder.ConstantR0<float>(ShapeUtil::ElementsIn(input_shape) /
+                                         ShapeUtil::ElementsIn(sum_shape));
   auto set_means = builder.Div(sum, count);
 
   const float kEpsilon = 1e-9f;
@@ -187,14 +196,16 @@ XLA_TEST_P(BatchNormalizationTest, SpecComparisonForward) {
   auto activation_deviations = builder.Sub(input_activations, set_means,
                                            /*broadcast_dimensions=*/{1});
   auto dev_squares = builder.SquareF32(activation_deviations);
-  auto sum_of_squares = builder.CheckShape(
+  auto sum_of_squares = CheckShape(
+      &builder,
       builder.Reduce(dev_squares, builder.ConstantR0<float>(0.0f), add,
                      /*dimensions_to_reduce=*/{0, 2, 3}),
       TwoElementVectorF32);
   auto variance = builder.Div(sum_of_squares, count);
   auto standard_deviation = builder.SqrtF32(variance);
-  auto standard_deviation_above_epsilon = builder.CheckShape(
-      builder.Gt(standard_deviation, epsilon), ShapeUtil::MakeShape(PRED, {2}));
+  auto standard_deviation_above_epsilon =
+      CheckShape(&builder, builder.Gt(standard_deviation, epsilon),
+                 ShapeUtil::MakeShape(PRED, {2}));
   auto gt_eps = builder.Select(standard_deviation_above_epsilon,
                                standard_deviation, epsilon2);
   auto normalization_factors = builder.ReciprocalF32(gt_eps);
@@ -219,7 +230,7 @@ XLA_TEST_P(BatchNormalizationTest, SpecComparisonForward) {
 
 XLA_TEST_P(BatchNormalizationTest, BasicTraining) {
   const int kFeatureIndex = 3;
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   auto operand = builder.ConstantR4FromArray4D<float>(
       {{{{1.f, 2.f}}, {{3.f, 4.f}}}, {{{5.f, 6.f}}, {{7.f, 8.f}}}});
@@ -228,8 +239,8 @@ XLA_TEST_P(BatchNormalizationTest, BasicTraining) {
 
   auto offset = builder.ConstantR1<float>({1.0f, 2.0f});
 
-  auto tuple = builder.BatchNormTraining(operand, scale, offset,
-                                         /*epsilon=*/0.001, kFeatureIndex);
+  builder.BatchNormTraining(operand, scale, offset,
+                            /*epsilon=*/0.001, kFeatureIndex);
 
   auto expected = Literal::MakeTuple(
       {Literal::CreateR4<float>({{{{-1.6f, -2.0f}}, {{0.1f, 0.6f}}},
@@ -243,7 +254,7 @@ XLA_TEST_P(BatchNormalizationTest, BasicTraining) {
 
 XLA_TEST_P(BatchNormalizationTest, BasicTrainingOnSublane) {
   const int kFeatureIndex = 2;
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   auto operand = builder.ConstantR4FromArray4D<float>(
       {{{{1.f}, {2.f}}, {{3.f}, {4.f}}}, {{{5.f}, {6.f}}, {{7.f}, {8.f}}}});
@@ -252,8 +263,8 @@ XLA_TEST_P(BatchNormalizationTest, BasicTrainingOnSublane) {
 
   auto offset = builder.ConstantR1<float>({1.0f, 2.0f});
 
-  auto tuple = builder.BatchNormTraining(operand, scale, offset,
-                                         /*epsilon=*/0.001, kFeatureIndex);
+  builder.BatchNormTraining(operand, scale, offset,
+                            /*epsilon=*/0.001, kFeatureIndex);
 
   auto expected = Literal::MakeTuple(
       {Literal::CreateR4<float>({{{{-1.6f}, {-2.0f}}, {{0.1f}, {0.6f}}},
@@ -268,23 +279,23 @@ XLA_TEST_P(BatchNormalizationTest, BasicTrainingOnSublane) {
 XLA_TEST_P(BatchNormalizationTest, TrainingWithFeatureOnLowDimension) {
   // Use 0 dimension as feature, tests layout analyzer.
   const int kFeatureIndex = 0;
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
-  ComputationDataHandle h0;
+  XlaOp h0;
   auto operand = CreateR3Parameter<float>(Array3D<float>(260, 2, 2, 1.0f),
                                           /*parameter_number=*/0, "operand",
                                           &builder, &h0);
-  ComputationDataHandle h1;
+  XlaOp h1;
   auto scale =
       CreateR1Parameter<float>(std::vector<float>(260, 1.0f),
                                /*parameter_number=*/1, "scale", &builder, &h1);
-  ComputationDataHandle h2;
+  XlaOp h2;
   auto offset =
       CreateR1Parameter<float>(std::vector<float>(260, 1.0f),
                                /*parameter_number=*/2, "offset", &builder, &h2);
 
-  auto tuple = builder.BatchNormTraining(h0, h1, h2,
-                                         /*epsilon=*/1, kFeatureIndex);
+  builder.BatchNormTraining(h0, h1, h2,
+                            /*epsilon=*/1, kFeatureIndex);
 
   auto expected = Literal::MakeTuple(
       {Literal::CreateR3FromArray3D<float>(Array3D<float>(260, 2, 2, 1.0f))
@@ -300,24 +311,24 @@ XLA_TEST_P(BatchNormalizationTest, TrainingWithFeatureOnLowDimension) {
 XLA_TEST_P(BatchNormalizationTest, LargeEpsilonTest) {
   // Test the correctness of choosing a large epsilon value.
   const int kFeatureIndex = 2;
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
-  ComputationDataHandle h0;
+  XlaOp h0;
   auto operand = CreateR3Parameter<float>({{{0.0f}, {10.0f}, {20.0f}, {30.0f}}},
                                           /*parameter_number=*/0, "operand",
                                           &builder, &h0);
-  ComputationDataHandle h1;
+  XlaOp h1;
   auto scale =
       CreateR1Parameter<float>(std::vector<float>(1, 1.0f),
                                /*parameter_number=*/1, "scale", &builder, &h1);
-  ComputationDataHandle h2;
+  XlaOp h2;
   auto offset =
       CreateR1Parameter<float>(std::vector<float>(1, 0.0f),
                                /*parameter_number=*/2, "offset", &builder, &h2);
 
   // var = 125, mean = 15, epsilon = -100
-  auto tuple = builder.BatchNormTraining(h0, h1, h2,
-                                         /*epsilon=*/-100, kFeatureIndex);
+  builder.BatchNormTraining(h0, h1, h2,
+                            /*epsilon=*/-100, kFeatureIndex);
 
   auto expected = Literal::MakeTuple(
       {Literal::CreateR3FromArray3D<float>({{{-3.0f}, {-1.0f}, {1.0f}, {3.0f}}})
@@ -332,7 +343,7 @@ XLA_TEST_P(BatchNormalizationTest, LargeEpsilonTest) {
 
 XLA_TEST_P(BatchNormalizationTest, BatchNormGradBasic) {
   const int kFeatureIndex = 2;
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   auto operand =
       builder.ConstantR4FromArray4D<float>(Array4D<float>(2, 2, 2, 1, 0.0f));
@@ -439,7 +450,7 @@ INSTANTIATE_TEST_CASE_P(BatchNormTest_Instantiation, BatchNormTestManySizes,
 
 XLA_TEST_P(BatchNormTestManySizes, RandomizedTrainingTests) {
   float epsilon = 0.001;
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   const std::vector<int64>& bounds = GetParam().bounds;
   Array4D<float> input_array(bounds[0], bounds[1], bounds[2], bounds[3]);
   input_array.FillRandom(GetParam().random_value_var,
@@ -539,7 +550,7 @@ XLA_TEST_P(BatchNormTestManySizes, RandomizedTrainingTests) {
 
 XLA_TEST_P(BatchNormTestManySizes, RandomizedInferencingTests) {
   float epsilon = 0.001;
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   const std::vector<int64>& bounds = GetParam().bounds;
   Array4D<float> input_array(bounds[0], bounds[1], bounds[2], bounds[3]);
   input_array.FillRandom(GetParam().random_value_var,
@@ -647,7 +658,7 @@ XLA_TEST_P(BatchNormTestManySizes, RandomizedInferencingTests) {
 
 XLA_TEST_P(BatchNormTestManySizes, RandomizedGradTests) {
   float epsilon = 0.001;
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   const std::vector<int64>& bounds = GetParam().bounds;
   Array4D<float> input_array(bounds[0], bounds[1], bounds[2], bounds[3]);
   input_array.FillRandom(GetParam().random_value_var,
@@ -814,9 +825,9 @@ XLA_TEST_P(BatchNormTestManySizes, RandomizedGradTests) {
   std::unique_ptr<GlobalData> grad_output_data =
       client_->TransferToServer(*grad_output_literal).ConsumeValueOrDie();
 
-  auto t = builder.BatchNormGrad(input_parameter, scale_parameter,
-                                 mean_parameter, var_parameter,
-                                 grad_output_parameter, epsilon, feature_index);
+  builder.BatchNormGrad(input_parameter, scale_parameter, mean_parameter,
+                        var_parameter, grad_output_parameter, epsilon,
+                        feature_index);
 
   auto expected =
       Literal::MakeTuple({expected_grad_activation.get(),
diff --git a/tensorflow/compiler/xla/tests/bitcast_convert_test.cc b/tensorflow/compiler/xla/tests/bitcast_convert_test.cc
index 0d94d65c1015fb54ada3fdfc95d0c31d0a0f158b..777ac167a3c38c38791e12541a5db3078c37595b 100644
--- a/tensorflow/compiler/xla/tests/bitcast_convert_test.cc
+++ b/tensorflow/compiler/xla/tests/bitcast_convert_test.cc
@@ -18,8 +18,8 @@ limitations under the License.
 #include <memory>
 #include <vector>
 
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
@@ -42,7 +42,7 @@ class BitcastConvertTest : public ClientLibraryTestBase {
 };
 
 TEST_F(BitcastConvertTest, ConvertR1S32ToR1S32) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<int32>({42, 64});
   builder.BitcastConvertType(a, S32);
 
@@ -51,7 +51,7 @@ TEST_F(BitcastConvertTest, ConvertR1S32ToR1S32) {
 }
 
 TEST_F(BitcastConvertTest, ConvertR1F32ToR1F32) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<float>({42.0f, 64.0f});
   builder.BitcastConvertType(a, F32);
 
@@ -60,7 +60,7 @@ TEST_F(BitcastConvertTest, ConvertR1F32ToR1F32) {
 }
 
 TEST_F(BitcastConvertTest, BitcastR1S32ToR1F32) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a =
       builder.ConstantR1<int32>({0, static_cast<int32>(0x80000000), 0x3F800000,
                                  static_cast<int32>(0xBF800000), 0x3F000000,
@@ -72,7 +72,7 @@ TEST_F(BitcastConvertTest, BitcastR1S32ToR1F32) {
 }
 
 XLA_TEST_F(BitcastConvertTest, ConvertR1S0S32ToR1S0F32) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<int32>({});
   builder.BitcastConvertType(a, F32);
 
@@ -81,7 +81,7 @@ XLA_TEST_F(BitcastConvertTest, ConvertR1S0S32ToR1S0F32) {
 }
 
 TEST_F(BitcastConvertTest, ConvertR1F32ToR1S32) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<float>({42.6, 64.4});
   builder.BitcastConvertType(a, S32);
 
@@ -90,7 +90,7 @@ TEST_F(BitcastConvertTest, ConvertR1F32ToR1S32) {
 }
 
 TEST_F(BitcastConvertTest, ConvertS32Extremes) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<int32>(
       {std::numeric_limits<int32>::min(), std::numeric_limits<int32>::max()});
   builder.BitcastConvertType(a, F32);
@@ -100,7 +100,7 @@ TEST_F(BitcastConvertTest, ConvertS32Extremes) {
 }
 
 TEST_F(BitcastConvertTest, ConvertMapToS32) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto b = builder.CreateSubBuilder("convert");
   auto param = b->Parameter(0, ShapeUtil::MakeShape(F32, {}), "in");
   b->BitcastConvertType(param, S32);
@@ -112,7 +112,7 @@ TEST_F(BitcastConvertTest, ConvertMapToS32) {
 }
 
 TEST_F(BitcastConvertTest, ConvertMapToF32) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto b = builder.CreateSubBuilder("convert");
   auto param = b->Parameter(0, ShapeUtil::MakeShape(S32, {}), "in");
   b->BitcastConvertType(param, F32);
@@ -129,7 +129,7 @@ TEST_F(BitcastConvertTest, ConvertMapToF32) {
 //   input -> convert -> reshape
 // the new convert should have the same element type as the old convert.
 TEST_F(BitcastConvertTest, ConvertReshape) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto input = builder.ConstantR1<int32>({0x42280000});
   auto reshape = builder.Reshape(input, /*dimensions=*/{0}, /*new_sizes=*/{});
   builder.BitcastConvertType(reshape, F32);
diff --git a/tensorflow/compiler/xla/tests/build_defs.bzl b/tensorflow/compiler/xla/tests/build_defs.bzl
index 610302ac1256a57db6ed6e18016a4136973e3891..eac2eb286c3f7a1cd33aed03686e99ef753b773a 100644
--- a/tensorflow/compiler/xla/tests/build_defs.bzl
+++ b/tensorflow/compiler/xla/tests/build_defs.bzl
@@ -137,7 +137,8 @@ def xla_test(name,
       backend_deps += ["//tensorflow/compiler/xla/tests:test_macros_gpu"]
       this_backend_tags += ["requires-gpu-sm35"]
     elif backend in plugins:
-      backend_deps = plugins[backend]["deps"]
+      backend_deps = []
+      backend_deps += plugins[backend]["deps"]
       this_backend_copts += plugins[backend]["copts"]
       this_backend_tags += plugins[backend]["tags"]
       this_backend_args += plugins[backend]["args"]
diff --git a/tensorflow/compiler/xla/tests/client_library_test_base.cc b/tensorflow/compiler/xla/tests/client_library_test_base.cc
index ec95a68ead055ae3ef301889806ef48982ed76f7..312d8f284d3421b4ef06b94c12949fc5fe4fa0b0 100644
--- a/tensorflow/compiler/xla/tests/client_library_test_base.cc
+++ b/tensorflow/compiler/xla/tests/client_library_test_base.cc
@@ -23,6 +23,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/execution_options_util.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/ptr_util.h"
+#include "tensorflow/compiler/xla/service/platform_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/statusor.h"
@@ -35,6 +36,10 @@ namespace se = ::perftools::gputools;
 
 namespace xla {
 namespace {
+
+// Name of the interpreter backend.
+constexpr char kInterpreter[] = "interpreter";
+
 // Wrapper function that creates a nicer error message (than a bare
 // ValueOrDie()) if the platform we intend to test is not available.
 Client* GetOrCreateLocalClientOrDie(const LocalClientOptions& client_options) {
@@ -43,6 +48,14 @@ Client* GetOrCreateLocalClientOrDie(const LocalClientOptions& client_options) {
   TF_CHECK_OK(result.status()) << " could not create local client for testing";
   return result.ValueOrDie();
 }
+
+// Helper functions to get the reference platform.
+se::Platform* GetReferencePlatform() {
+  auto result = PlatformUtil::GetPlatform(kInterpreter);
+  TF_CHECK_OK(result.status()) << "could not get interpreter platform";
+  return result.ValueOrDie();
+}
+
 }  // namespace
 
 ClientLibraryTestBase::ClientLibraryTestBase(
@@ -66,6 +79,11 @@ ClientLibraryTestBase::ClientLibraryTestBase(se::Platform* platform)
   LocalClientOptions default_options;
   default_options.set_platform(platform);
   client_ = GetOrCreateLocalClientOrDie(default_options);
+
+  LocalClientOptions ref_options;
+  ref_options.set_platform(GetReferencePlatform());
+  ref_client_ = GetOrCreateLocalClientOrDie(ref_options);
+
   execution_options_.mutable_debug_options()->add_xla_disable_hlo_passes(
       "constant_folding");
 }
@@ -74,9 +92,9 @@ string ClientLibraryTestBase::TestName() const {
   return ::testing::UnitTest::GetInstance()->current_test_info()->name();
 }
 
+template <typename BuilderT>
 StatusOr<std::unique_ptr<GlobalData>> ClientLibraryTestBase::Execute(
-    ComputationBuilder* builder,
-    tensorflow::gtl::ArraySlice<GlobalData*> arguments) {
+    BuilderT* builder, tensorflow::gtl::ArraySlice<GlobalData*> arguments) {
   // Build the computation, as a convenience.
   TF_ASSIGN_OR_RETURN(auto computation, builder->Build());
   return client_->Execute(computation, arguments, &execution_options_);
@@ -127,6 +145,20 @@ StatusOr<std::unique_ptr<Literal>> ClientLibraryTestBase::ExecuteAndTransfer(
   return ExecuteAndTransfer(computation, arguments, shape_with_output_layout);
 }
 
+StatusOr<std::unique_ptr<Literal>>
+ClientLibraryTestBase::ExecuteAndTransferReference(
+    const XlaComputation& computation,
+    tensorflow::gtl::ArraySlice<GlobalData*> arguments,
+    const Shape* shape_with_output_layout) {
+  ExecutionOptions execution_options = execution_options_;
+  if (shape_with_output_layout != nullptr) {
+    *execution_options.mutable_shape_with_output_layout() =
+        *shape_with_output_layout;
+  }
+  return ref_client_->ExecuteAndTransfer(computation, arguments,
+                                         &execution_options);
+}
+
 std::unique_ptr<GlobalData> ClientLibraryTestBase::ExecuteOrDie(
     ComputationBuilder* builder,
     tensorflow::gtl::ArraySlice<GlobalData*> arguments) {
@@ -441,8 +473,9 @@ void ClientLibraryTestBase::ComputeAndCompareR1U8(
   EXPECT_EQ(expected, actual->GetR1U8AsString());
 }
 
+template <typename BuilderT>
 void ClientLibraryTestBase::ComputeAndCompareTuple(
-    ComputationBuilder* builder, const Literal& expected,
+    BuilderT* builder, const Literal& expected,
     tensorflow::gtl::ArraySlice<GlobalData*> arguments) {
   auto actual_status = ExecuteAndTransfer(builder, arguments);
   EXPECT_IS_OK(actual_status.status());
@@ -453,8 +486,9 @@ void ClientLibraryTestBase::ComputeAndCompareTuple(
   LiteralTestUtil::ExpectEqual(expected, *actual);
 }
 
+template <typename BuilderT>
 void ClientLibraryTestBase::ComputeAndCompareTuple(
-    ComputationBuilder* builder, const Literal& expected,
+    BuilderT* builder, const Literal& expected,
     tensorflow::gtl::ArraySlice<GlobalData*> arguments, ErrorSpec error) {
   auto actual_status = ExecuteAndTransfer(builder, arguments);
   EXPECT_IS_OK(actual_status.status());
@@ -519,6 +553,69 @@ ClientLibraryTestBase::ComputeValueAndReference(
   return std::make_pair(std::move(reference), std::move(result));
 }
 
+void ClientLibraryTestBase::ComputeAndCompare(
+    XlaBuilder* builder, tensorflow::gtl::ArraySlice<Literal> arguments) {
+  auto status_or_data = ComputeValueAndReference(builder, arguments);
+  EXPECT_IS_OK(status_or_data);
+  if (!status_or_data.ok()) {
+    return;
+  }
+  std::unique_ptr<Literal> reference, result;
+  std::tie(reference, result) = status_or_data.ConsumeValueOrDie();
+  LiteralTestUtil::ExpectEqual(*reference, *result);
+}
+
+void ClientLibraryTestBase::ComputeAndCompare(
+    XlaBuilder* builder, tensorflow::gtl::ArraySlice<Literal> arguments,
+    ErrorSpec error) {
+  auto status_or_data = ComputeValueAndReference(builder, arguments);
+  EXPECT_IS_OK(status_or_data);
+  if (!status_or_data.ok()) {
+    return;
+  }
+  std::unique_ptr<Literal> reference, result;
+  std::tie(reference, result) = status_or_data.ConsumeValueOrDie();
+  LiteralTestUtil::ExpectNear(*reference, *result, error);
+}
+
+StatusOr<std::pair<std::unique_ptr<Literal>, std::unique_ptr<Literal>>>
+ClientLibraryTestBase::ComputeValueAndReference(
+    XlaBuilder* builder, tensorflow::gtl::ArraySlice<Literal> arguments) {
+  // Transfer the arguments to the executor service. We put the unique_ptr's
+  // into a vector to keep the data alive on the service until the end of this
+  // function.
+  std::vector<std::unique_ptr<GlobalData>> argument_data;
+  std::vector<std::unique_ptr<GlobalData>> ref_argument_data;
+  for (const auto& arg : arguments) {
+    TF_ASSIGN_OR_RETURN(auto data, client_->TransferToServer(arg.Clone()));
+    TF_ASSIGN_OR_RETURN(auto ref_data, ref_client_->TransferToServer(arg));
+    argument_data.push_back(std::move(data));
+    ref_argument_data.push_back(std::move(ref_data));
+  }
+
+  // Create raw pointers to the GlobalData for the rest of the call stack.
+  std::vector<GlobalData*> argument_data_ptr;
+  std::transform(
+      argument_data.begin(), argument_data.end(),
+      std::back_inserter(argument_data_ptr),
+      [](const std::unique_ptr<GlobalData>& data) { return data.get(); });
+  std::vector<GlobalData*> ref_argument_data_ptr;
+  std::transform(
+      ref_argument_data.begin(), ref_argument_data.end(),
+      std::back_inserter(ref_argument_data_ptr),
+      [](const std::unique_ptr<GlobalData>& data) { return data.get(); });
+
+  TF_ASSIGN_OR_RETURN(auto computation, builder->Build());
+
+  TF_ASSIGN_OR_RETURN(auto result,
+                      ExecuteAndTransfer(computation, argument_data_ptr));
+
+  TF_ASSIGN_OR_RETURN(auto reference, ExecuteAndTransferReference(
+                                          computation, ref_argument_data_ptr));
+
+  return std::make_pair(std::move(reference), std::move(result));
+}
+
 Computation ClientLibraryTestBase::CreateScalarRelu() {
   ComputationBuilder builder(client_, "relu");
   auto shape = ShapeUtil::MakeShape(use_bfloat16_ ? BF16 : F32, {});
@@ -593,12 +690,26 @@ ComputationDataHandle ClientLibraryTestBase::AddParam(
   return data_handle;
 }
 
+XlaOp ClientLibraryTestBase::AddParam(const Literal& argument,
+                                      XlaBuilder* builder) {
+  XlaOp data_handle;
+  arguments_.push_back(CreateParameterAndTransferLiteral(
+      arguments_.size(), argument, "", builder, &data_handle));
+  return data_handle;
+}
+
 ComputationDataHandle ClientLibraryTestBase::CreateConstantFromLiteral(
     const Literal& literal, ComputationBuilder* builder) {
   return builder->ConstantLiteral(
       use_bfloat16_ ? *LiteralTestUtil::ConvertF32ToBF16(literal) : literal);
 }
 
+XlaOp ClientLibraryTestBase::CreateConstantFromLiteral(const Literal& literal,
+                                                       XlaBuilder* builder) {
+  return builder->ConstantLiteral(
+      use_bfloat16_ ? *LiteralTestUtil::ConvertF32ToBF16(literal) : literal);
+}
+
 template void ClientLibraryTestBase::ComputeAndCompareLiteral(
     ComputationBuilder* builder, const Literal& expected,
     tensorflow::gtl::ArraySlice<GlobalData*> arguments,
@@ -619,4 +730,27 @@ template void ClientLibraryTestBase::ComputeAndCompareLiteral(
     tensorflow::gtl::ArraySlice<GlobalData*> arguments, ErrorSpec error,
     const Shape* shape_with_layout);
 
+template void ClientLibraryTestBase::ComputeAndCompareTuple(
+    ComputationBuilder* builder, const Literal& expected,
+    tensorflow::gtl::ArraySlice<GlobalData*> arguments);
+
+template void ClientLibraryTestBase::ComputeAndCompareTuple(
+    XlaBuilder* builder, const Literal& expected,
+    tensorflow::gtl::ArraySlice<GlobalData*> arguments);
+
+template void ClientLibraryTestBase::ComputeAndCompareTuple(
+    ComputationBuilder* builder, const Literal& expected,
+    tensorflow::gtl::ArraySlice<GlobalData*> arguments, ErrorSpec error);
+
+template void ClientLibraryTestBase::ComputeAndCompareTuple(
+    XlaBuilder* builder, const Literal& expected,
+    tensorflow::gtl::ArraySlice<GlobalData*> arguments, ErrorSpec error);
+
+template StatusOr<std::unique_ptr<GlobalData>> ClientLibraryTestBase::Execute(
+    ComputationBuilder* builder,
+    tensorflow::gtl::ArraySlice<GlobalData*> arguments);
+
+template StatusOr<std::unique_ptr<GlobalData>> ClientLibraryTestBase::Execute(
+    XlaBuilder* builder, tensorflow::gtl::ArraySlice<GlobalData*> arguments);
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/client_library_test_base.h b/tensorflow/compiler/xla/tests/client_library_test_base.h
index 5ff200be03ebd2aa76144644acc86f85037fff5a..b3212dd2282375367ce890e960278fc469a5ef52 100644
--- a/tensorflow/compiler/xla/tests/client_library_test_base.h
+++ b/tensorflow/compiler/xla/tests/client_library_test_base.h
@@ -92,9 +92,9 @@ class ClientLibraryTestBase : public ::testing::Test {
   // Convenience methods for building and running a computation with the member
   // execution options. Modify execution_options_ in your test if you want to
   // customize the options.
+  template <typename BuilderT>
   StatusOr<std::unique_ptr<GlobalData>> Execute(
-      ComputationBuilder* builder,
-      tensorflow::gtl::ArraySlice<GlobalData*> arguments);
+      BuilderT* builder, tensorflow::gtl::ArraySlice<GlobalData*> arguments);
 
   // TODO(b/74197823): Remove the template type 'BuilderT' in all methods once
   // the migration to XlaBuilder is complete.
@@ -114,6 +114,14 @@ class ClientLibraryTestBase : public ::testing::Test {
       tensorflow::gtl::ArraySlice<GlobalData*> arguments,
       const Shape* shape_with_output_layout = nullptr);
 
+  // This executes the computation via the reference client (which connects a
+  // interpreter backend). The result is used as the expected values of the
+  // computation.
+  StatusOr<std::unique_ptr<Literal>> ExecuteAndTransferReference(
+      const XlaComputation& computation,
+      tensorflow::gtl::ArraySlice<GlobalData*> arguments,
+      const Shape* shape_with_output_layout = nullptr);
+
   // Convenience OrDie variants of above methods.
   std::unique_ptr<GlobalData> ExecuteOrDie(
       ComputationBuilder* builder,
@@ -217,11 +225,13 @@ class ClientLibraryTestBase : public ::testing::Test {
 
   // Convenience method for running a built computation, transferring the
   // result, and comparing it to the expected tuple literal.
+  template <typename BuilderT>
   void ComputeAndCompareTuple(
-      ComputationBuilder* builder, const Literal& expected,
+      BuilderT* builder, const Literal& expected,
       tensorflow::gtl::ArraySlice<GlobalData*> arguments);
+  template <typename BuilderT>
   void ComputeAndCompareTuple(
-      ComputationBuilder* builder, const Literal& expected,
+      BuilderT* builder, const Literal& expected,
       tensorflow::gtl::ArraySlice<GlobalData*> arguments, ErrorSpec error);
 
   // Convenience method for running a built computation and comparing the result
@@ -234,6 +244,14 @@ class ClientLibraryTestBase : public ::testing::Test {
                          tensorflow::gtl::ArraySlice<Literal> arguments,
                          ErrorSpec error);
 
+  // Convenience method for running a built computation and comparing the result
+  // with the reference result.
+  void ComputeAndCompare(XlaBuilder* builder,
+                         tensorflow::gtl::ArraySlice<Literal> arguments);
+  void ComputeAndCompare(XlaBuilder* builder,
+                         tensorflow::gtl::ArraySlice<Literal> arguments,
+                         ErrorSpec error);
+
   // Create scalar operations for use in reductions.
   Computation CreateScalarRelu();
   Computation CreateScalarMax();
@@ -298,18 +316,24 @@ class ClientLibraryTestBase : public ::testing::Test {
   // set exactly once. The first added parameter gets index 0, then 1 and so on.
   ComputationDataHandle AddParam(const Literal& argument,
                                  ComputationBuilder* builder);
+  XlaOp AddParam(const Literal& argument, XlaBuilder* builder);
 
   template <class T>
   ComputationDataHandle AddParam(const Array<T>& argument,
                                  ComputationBuilder* builder) {
     return AddParam(*Literal::CreateFromArray(argument), builder);
   }
+  template <class T>
+  XlaOp AddParam(const Array<T>& argument, XlaBuilder* builder) {
+    return AddParam(*Literal::CreateFromArray(argument), builder);
+  }
 
   // Creates a constant instruction with the given literal. When the
   // use_bfloat16 flag is set but the literal has F32 elements, the elements
   // will be converted to BF16s.
   ComputationDataHandle CreateConstantFromLiteral(const Literal& literal,
                                                   ComputationBuilder* builder);
+  XlaOp CreateConstantFromLiteral(const Literal& literal, XlaBuilder* builder);
 
   // Creates a constant instruction with the given array. When the use_bfloat16
   // flag is set but the array has float elements, the elements will be
@@ -320,6 +344,12 @@ class ClientLibraryTestBase : public ::testing::Test {
     return CreateConstantFromLiteral(*Literal::CreateFromArray(array), builder);
   }
 
+  template <typename NativeT>
+  XlaOp CreateConstantFromArray(const Array<NativeT>& array,
+                                XlaBuilder* builder) {
+    return CreateConstantFromLiteral(*Literal::CreateFromArray(array), builder);
+  }
+
   // Same as CreateConstantFromArray, but for scalars.
   template <typename NativeT>
   ComputationDataHandle CreateConstantFromScalar(NativeT value,
@@ -328,6 +358,12 @@ class ClientLibraryTestBase : public ::testing::Test {
                                      builder);
   }
 
+  template <typename NativeT>
+  XlaOp CreateConstantFromScalar(NativeT value, XlaBuilder* builder) {
+    return CreateConstantFromLiteral(*Literal::CreateR0<NativeT>(value),
+                                     builder);
+  }
+
   // Creates a parameter instruction that wraps a given value and then stores
   // into "data_handle" the global handle for that parameter.
   //
@@ -393,6 +429,7 @@ class ClientLibraryTestBase : public ::testing::Test {
   PrimitiveType FloatType() const { return use_bfloat16_ ? BF16 : F32; }
 
   Client* client_;
+  Client* ref_client_;  // To compute reference result.
   ExecutionOptions execution_options_;
 
  private:
@@ -424,12 +461,19 @@ class ClientLibraryTestBase : public ::testing::Test {
       const Shape* output_with_layout = nullptr);
 
   // Executes the computation and calculates the expected reference value using
-  // the HloEvaluator. Returns two literal in the order of (expected, actual).
+  // the HloEvaluator. Returns two literals in the order of (expected, actual).
   StatusOr<std::pair<std::unique_ptr<Literal>, std::unique_ptr<Literal>>>
   ComputeValueAndReference(ComputationBuilder* builder,
                            const ComputationDataHandle& operand,
                            tensorflow::gtl::ArraySlice<Literal> arguments);
 
+  // Executes the computation and calculates the expected reference value using
+  // the reference client. Returns two literals in the order of (expected,
+  // actual).
+  StatusOr<std::pair<std::unique_ptr<Literal>, std::unique_ptr<Literal>>>
+  ComputeValueAndReference(XlaBuilder* builder,
+                           tensorflow::gtl::ArraySlice<Literal> arguments);
+
   // Whether to run tests with all float-type input/output converted to
   // bfloat16.
   bool use_bfloat16_ = false;
diff --git a/tensorflow/compiler/xla/tests/client_test.cc b/tensorflow/compiler/xla/tests/client_test.cc
index 045148cdd11da94ae4789a753efca95c6aaa1f27..32e2f2c0848407ec46a5ac52e2668ef27b92c426 100644
--- a/tensorflow/compiler/xla/tests/client_test.cc
+++ b/tensorflow/compiler/xla/tests/client_test.cc
@@ -19,6 +19,8 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/global_data.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/statusor.h"
@@ -109,14 +111,14 @@ XLA_TEST_F(ClientTest, ExecuteWithTupleLayout) {
 
 XLA_TEST_F(ClientTest,
         DISABLED_ON_CPU_PARALLEL(DISABLED_ON_GPU(ExecuteParallel))) {
-  Computation add_with_one_arg, mul_with_two_args, dot_with_one_arg;
+  XlaComputation add_with_one_arg, mul_with_two_args, dot_with_one_arg;
   Shape shape = ShapeUtil::MakeShape(S32, {2, 2});
 
   TF_ASSERT_OK_AND_ASSIGN(
       std::unique_ptr<GlobalData> const_arg,
       client_->TransferToServer(*Literal::CreateR2<int32>({{5, 6}, {7, 8}})));
 
-  ComputationBuilder b(client_, TestName() + ".add");
+  XlaBuilder b(TestName() + ".add");
   b.Add(b.Parameter(0, shape, "param_0"),
         b.ConstantR2<int32>({{1, 2}, {3, 4}}));
   TF_ASSERT_OK_AND_ASSIGN(add_with_one_arg, b.Build());
@@ -124,14 +126,14 @@ XLA_TEST_F(ClientTest,
   // We can't really test parallel execution on CPU since all of the cores in a
   // CPU are presented as a single device.  So for now we test "parallel"
   // execution on a single device.
-  std::vector<Client::ComputationInstance> computation_instances;
+  std::vector<Client::XlaComputationInstance> computation_instances;
   TF_ASSERT_OK_AND_ASSIGN(std::vector<xla::DeviceHandle> devices,
                           client_->GetDeviceHandles(1));
   ASSERT_EQ(devices.size(), 1);
 
   ExecutionOptions options = execution_options_;
   *options.add_device_handles() = devices[0];
-  computation_instances.push_back(Client::ComputationInstance(
+  computation_instances.push_back(Client::XlaComputationInstance(
       add_with_one_arg, {const_arg.get()}, options, nullptr));
 
   TF_ASSERT_OK_AND_ASSIGN(auto results,
diff --git a/tensorflow/compiler/xla/tests/compute_constant_test.cc b/tensorflow/compiler/xla/tests/compute_constant_test.cc
index ec2c580670cfac14ba42e8c9a836c86551af4b89..c15d808f1ddfb44a512fa395bb8e515bca3859b6 100644
--- a/tensorflow/compiler/xla/tests/compute_constant_test.cc
+++ b/tensorflow/compiler/xla/tests/compute_constant_test.cc
@@ -21,6 +21,8 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/computation.h"
 #include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/global_data.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
@@ -31,6 +33,8 @@ limitations under the License.
 #include "tensorflow/compiler/xla/tests/test_macros.h"
 #include "tensorflow/compiler/xla/tests/test_utils.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace xla {
@@ -70,28 +74,35 @@ class ComputeConstantTest : public ::testing::Test {
   }
 
   StatusOr<std::unique_ptr<Literal>> ComputeConstantLiteral(
-      Client* client, const ComputationDataHandle& operand,
-      ComputationBuilder* builder, Layout* output_layout = nullptr,
-      tensorflow::gtl::ArraySlice<Literal> parameters = {}) {
-    TF_ASSIGN_OR_RETURN(auto computed, builder->ComputeConstant(
-                                           operand, output_layout, parameters));
+      Client* client, const XlaOp& operand, XlaBuilder* builder,
+      Layout* output_layout = nullptr) {
+    TF_ASSIGN_OR_RETURN(auto subgraph, builder->BuildConstantSubGraph(operand));
+    TF_ASSIGN_OR_RETURN(auto computed,
+                        client->ComputeConstant(subgraph, output_layout));
     return std::move(computed);
   }
 
+  template <class Scalar>
+  StatusOr<Scalar> ComputeConstantScalar(Client* client, const XlaOp& operand,
+                                         XlaBuilder* builder) {
+    TF_ASSIGN_OR_RETURN(auto literal, ComputeConstantLiteral(client, operand,
+                                                             builder, nullptr));
+    return literal->Get<Scalar>({});
+  }
+
   template <class Scalar>
   StatusOr<Scalar> ComputeConstantScalar(
       Client* client, const ComputationDataHandle& operand,
       ComputationBuilder* builder,
       tensorflow::gtl::ArraySlice<Literal> parameters = {}) {
-    TF_ASSIGN_OR_RETURN(
-        auto literal,
-        ComputeConstantLiteral(client, operand, builder, nullptr, parameters));
+    TF_ASSIGN_OR_RETURN(auto literal,
+                        builder->ComputeConstant(
+                            operand, /*output_layout=*/nullptr, parameters));
     return literal->Get<Scalar>({});
   }
 
-  bool IsConstant(const ComputationDataHandle& operand,
-                  ComputationBuilder* builder, int64 num_parameters = 0) {
-    StatusOr<bool> result = builder->IsConstant(operand, num_parameters);
+  bool IsConstant(const XlaOp& operand, XlaBuilder* builder) {
+    StatusOr<bool> result = builder->IsConstant(operand);
     EXPECT_TRUE(result.ok()) << result.status();
     return result.ok() ? result.ValueOrDie() : false;
   }
@@ -102,7 +113,7 @@ class ComputeConstantTest : public ::testing::Test {
 TEST_F(ComputeConstantTest, ScalarInt32Literal) {
   for (ClientType client_type : client_types) {
     Client* client = ClientOrDie(platform_, client_type);
-    ComputationBuilder b(client, TestName());
+    XlaBuilder b(TestName());
     auto computation = b.ConstantR0<int32>(42);
     EXPECT_TRUE(IsConstant(computation, &b));
 
@@ -115,7 +126,7 @@ TEST_F(ComputeConstantTest, ScalarInt32Literal) {
 TEST_F(ComputeConstantTest, ScalarFloatAdd) {
   for (ClientType client_type : client_types) {
     Client* client = ClientOrDie(platform_, client_type);
-    ComputationBuilder b(client, TestName());
+    XlaBuilder b(TestName());
     auto computation =
         b.Add(b.ConstantR0<float>(42.5f), b.ConstantR0<float>(1.5f));
     EXPECT_TRUE(IsConstant(computation, &b));
@@ -129,7 +140,7 @@ TEST_F(ComputeConstantTest, ScalarFloatAdd) {
 TEST_F(ComputeConstantTest, ScalarRng) {
   for (ClientType client_type : client_types) {
     Client* client = ClientOrDie(platform_, client_type);
-    ComputationBuilder b(client, TestName());
+    XlaBuilder b(TestName());
     auto computation =
         b.RngUniform(b.ConstantR0<float>(1.1f), b.ConstantR0<float>(2.1f),
                      ShapeUtil::MakeShape(F32, {}));
@@ -150,25 +161,27 @@ TEST_F(ComputeConstantTest, Param) {
 
     std::vector<Literal> arguments;
     arguments.push_back(std::move(*Literal::CreateR0(42.5f)));
-    EXPECT_TRUE(IsConstant(computation, &b, arguments.size()));
-
-    auto value =
-        ComputeConstantScalar<float>(client, computation, &b, arguments);
-    ASSERT_TRUE(value.ok()) << value.status();
-    EXPECT_EQ(value.ValueOrDie(), 44.0f);
+    TF_ASSERT_OK_AND_ASSIGN(bool is_constant,
+                            b.IsConstant(computation, arguments.size()));
+    EXPECT_TRUE(is_constant);
+
+    TF_ASSERT_OK_AND_ASSIGN(
+        auto value,
+        ComputeConstantScalar<float>(client, computation, &b, arguments));
+    EXPECT_EQ(value, 44.0f);
   }
 }
 
 TEST_F(ComputeConstantTest, DirectParamMissing) {
   for (ClientType client_type : client_types) {
     Client* client = ClientOrDie(platform_, client_type);
-    ComputationBuilder b(client, TestName());
+    XlaBuilder b(TestName());
     auto computation = b.Parameter(0, ShapeUtil::MakeShape(F32, {}), "param");
     EXPECT_FALSE(IsConstant(computation, &b));
 
     auto value = ComputeConstantScalar<float>(client, computation, &b);
-    EXPECT_TRUE(tensorflow::StringPiece(value.status().ToString())
-                    .contains("depends on a parameter"))
+    EXPECT_TRUE(tensorflow::str_util::StrContains(value.status().ToString(),
+                                                  "depends on a parameter"))
         << value.status();
   }
 }
@@ -176,15 +189,15 @@ TEST_F(ComputeConstantTest, DirectParamMissing) {
 TEST_F(ComputeConstantTest, IndirectParamMissing) {
   for (ClientType client_type : client_types) {
     Client* client = ClientOrDie(platform_, client_type);
-    ComputationBuilder b(client, TestName());
+    XlaBuilder b(TestName());
     auto computation =
         b.Add(b.ConstantR0<float>(1.0f),
               b.Parameter(0, ShapeUtil::MakeShape(F32, {}), "param"));
     EXPECT_FALSE(IsConstant(computation, &b));
 
     auto value = ComputeConstantScalar<float>(client, computation, &b);
-    EXPECT_TRUE(tensorflow::StringPiece(value.status().ToString())
-                    .contains("depends on a parameter"))
+    EXPECT_TRUE(tensorflow::str_util::StrContains(value.status().ToString(),
+                                                  "depends on a parameter"))
         << value.status();
   }
 }
@@ -194,7 +207,7 @@ TEST_F(ComputeConstantTest, IndirectParamMissing) {
 TEST_F(ComputeConstantTest, UnrelatedParam) {
   for (ClientType client_type : client_types) {
     Client* client = ClientOrDie(platform_, client_type);
-    ComputationBuilder b(client, TestName());
+    XlaBuilder b(TestName());
 
     auto param_a = b.Parameter(10, ShapeUtil::MakeShape(F32, {}), "param0");
     auto constant_4 =
@@ -211,64 +224,64 @@ TEST_F(ComputeConstantTest, UnrelatedParam) {
 
     EXPECT_TRUE(IsConstant(constant_13, &b));
 
-    auto value = ComputeConstantScalar<float>(client, constant_13, &b);
-    ASSERT_TRUE(value.ok()) << value.status();
-    EXPECT_EQ(value.ValueOrDie(), 13.0f);
+    TF_ASSERT_OK_AND_ASSIGN(
+        auto value, ComputeConstantScalar<float>(client, constant_13, &b));
+    EXPECT_EQ(value, 13.0f);
   }
 }
 
 TEST_F(ComputeConstantTest, NonScalarAdd) {
   for (ClientType client_type : client_types) {
     Client* client = ClientOrDie(platform_, client_type);
-    ComputationBuilder b(client, TestName());
+    XlaBuilder b(TestName());
 
     auto computation =
         b.Add(b.ConstantR1<int32>({1, 2}), b.ConstantR1<int32>({3, 4}));
     EXPECT_TRUE(IsConstant(computation, &b));
 
-    auto computed = ComputeConstantLiteral(client, computation, &b);
-    ASSERT_TRUE(computed.ok()) << computed.status();
+    TF_ASSERT_OK_AND_ASSIGN(auto computed,
+                            ComputeConstantLiteral(client, computation, &b));
     std::unique_ptr<Literal> expected_literal =
         Literal::CreateR1<int32>({4, 6});
-    LiteralTestUtil::ExpectEqual(*expected_literal, *computed.ValueOrDie());
+    LiteralTestUtil::ExpectEqual(*expected_literal, *computed);
   }
 }
 
 TEST_F(ComputeConstantTest, IntegerDivide) {
   for (ClientType client_type : client_types) {
     Client* client = ClientOrDie(platform_, client_type);
-    ComputationBuilder b(client, TestName());
+    XlaBuilder b(TestName());
     auto computation = b.Div(b.ConstantR0<int32>(15), b.ConstantR0<int32>(3));
     EXPECT_TRUE(IsConstant(computation, &b));
 
-    auto computed = ComputeConstantLiteral(client, computation, &b);
-    ASSERT_TRUE(computed.ok()) << computed.status();
+    TF_ASSERT_OK_AND_ASSIGN(auto computed,
+                            ComputeConstantLiteral(client, computation, &b));
     std::unique_ptr<Literal> expected_literal = Literal::CreateR0<int32>(5);
-    LiteralTestUtil::ExpectEqual(*expected_literal, *computed.ValueOrDie());
+    LiteralTestUtil::ExpectEqual(*expected_literal, *computed);
   }
 }
 
 XLA_TEST_F(ComputeConstantTest, Layout) {
   for (ClientType client_type : client_types) {
     Client* client = ClientOrDie(platform_, client_type);
-    ComputationBuilder b(client, TestName());
+    XlaBuilder b(TestName());
 
     std::vector<std::vector<int64>> layouts = {{0, 1}, {1, 0}};
     for (const std::vector<int64>& layout : layouts) {
       auto layout_proto = LayoutUtil::MakeLayout(layout);
-      auto computed = ComputeConstantLiteral(
-          client,
-          b.Add(b.ConstantR2<int32>({{1, 2}, {3, 4}}),
-                b.ConstantR2<int32>({{10, 20}, {30, 40}})),
-          &b, &layout_proto);
-      ASSERT_TRUE(computed.ok()) << computed.status();
+      TF_ASSERT_OK_AND_ASSIGN(
+          auto computed, ComputeConstantLiteral(
+                             client,
+                             b.Add(b.ConstantR2<int32>({{1, 2}, {3, 4}}),
+                                   b.ConstantR2<int32>({{10, 20}, {30, 40}})),
+                             &b, &layout_proto));
 
       std::unique_ptr<Literal> expected_literal =
           Literal::CreateR2WithLayout<int32>({{11, 22}, {33, 44}},
                                              LayoutUtil::MakeLayout(layout));
-      LiteralTestUtil::AssertEqualShapesAndLayouts(
-          expected_literal->shape(), computed.ValueOrDie()->shape());
-      LiteralTestUtil::ExpectEqual(*expected_literal, *computed.ValueOrDie());
+      LiteralTestUtil::AssertEqualShapesAndLayouts(expected_literal->shape(),
+                                                   computed->shape());
+      LiteralTestUtil::ExpectEqual(*expected_literal, *computed);
     }
   }
 }
diff --git a/tensorflow/compiler/xla/tests/concat_test.cc b/tensorflow/compiler/xla/tests/concat_test.cc
index fb0e9c724a69b61801e6e0c2d07ef75b63a00465..a4c8a83eb15f7cc279b6c8f1bf1394c0afb9f7cf 100644
--- a/tensorflow/compiler/xla/tests/concat_test.cc
+++ b/tensorflow/compiler/xla/tests/concat_test.cc
@@ -18,9 +18,9 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/array3d.h"
-#include "tensorflow/compiler/xla/client/computation.h"
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
 #include "tensorflow/compiler/xla/reference_util.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/test.h"
@@ -38,9 +38,9 @@ using ::testing::HasSubstr;
 
 // Concatenate expects at least one argument.
 XLA_TEST_F(ConcatTest, Concat_Nothing) {
-  ComputationBuilder builder(client_, TestName());
-  auto concatenated = builder.ConcatInDim({}, 0);
-  StatusOr<Computation> computation_status = builder.Build();
+  XlaBuilder builder(TestName());
+  builder.ConcatInDim({}, 0);
+  StatusOr<XlaComputation> computation_status = builder.Build();
   ASSERT_FALSE(computation_status.ok());
   EXPECT_THAT(computation_status.status().ToString(),
               HasSubstr("Concatenate expects at least one argument"));
@@ -48,18 +48,18 @@ XLA_TEST_F(ConcatTest, Concat_Nothing) {
 
 // Concatenate with one argument works.
 XLA_TEST_F(ConcatTest, Concat_R1_With_Nothing) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<float>({42.0, 64.0});
-  auto concatenated = builder.ConcatInDim({a}, 0);
+  builder.ConcatInDim({a}, 0);
 
   std::vector<float> expected = {42, 64};
   ComputeAndCompareR1<float>(&builder, expected, {}, ErrorSpec(0.0001));
 }
 
 XLA_TEST_F(ConcatTest, Concat_R1_L0_With_Nothing) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<float>({});
-  auto concatenated = builder.ConcatInDim({a}, 0);
+  builder.ConcatInDim({a}, 0);
 
   std::vector<float> expected = {};
   ComputeAndCompareR1<float>(&builder, expected, {}, ErrorSpec(0.0001));
@@ -68,51 +68,51 @@ XLA_TEST_F(ConcatTest, Concat_R1_L0_With_Nothing) {
 // Show that we can't concatenate R0 with R0 because we can't name the dimension
 // to concatenate on.
 XLA_TEST_F(ConcatTest, CannotConcatR0WithR0) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR0<float>(42.0);
   auto b = builder.ConstantR0<float>(64.0);
-  auto concatenated = builder.ConcatInDim({a, b}, 0);
-  StatusOr<Computation> computation_status = builder.Build();
+  builder.ConcatInDim({a, b}, 0);
+  StatusOr<XlaComputation> computation_status = builder.Build();
   ASSERT_FALSE(computation_status.ok());
   EXPECT_THAT(computation_status.status().ToString(),
               HasSubstr("out of bounds: 0"));
 }
 
 XLA_TEST_F(ConcatTest, Concat_R1_L0_With_R1_L0) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<float>({});
   auto b = builder.ConstantR1<float>({});
-  auto concatenated = builder.ConcatInDim({a, b}, 0);
+  builder.ConcatInDim({a, b}, 0);
 
   std::vector<float> expected = {};
   ComputeAndCompareR1<float>(&builder, expected, {}, ErrorSpec(0.0001));
 }
 
 XLA_TEST_F(ConcatTest, Concat_R1_L0_With_R1_L1) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<float>({});
   auto b = builder.ConstantR1<float>({256.0});
-  auto concatenated = builder.ConcatInDim({a, b}, 0);
+  builder.ConcatInDim({a, b}, 0);
 
   std::vector<float> expected = {256};
   ComputeAndCompareR1<float>(&builder, expected, {}, ErrorSpec(0.0001));
 }
 
 XLA_TEST_F(ConcatTest, Concat_R1_L2_With_R1_L0) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<float>({42.0, 64.0});
   auto b = builder.ConstantR1<float>({});
-  auto concatenated = builder.ConcatInDim({a, b}, 0);
+  builder.ConcatInDim({a, b}, 0);
 
   std::vector<float> expected = {42, 64};
   ComputeAndCompareR1<float>(&builder, expected, {}, ErrorSpec(0.0001));
 }
 
 XLA_TEST_F(ConcatTest, Concat_R1_L2_With_R1_L1) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<float>({42.0, 64.0});
   auto b = builder.ConstantR1<float>({256.0});
-  auto concatenated = builder.ConcatInDim({a, b}, 0);
+  builder.ConcatInDim({a, b}, 0);
 
   std::vector<float> expected = {42, 64, 256};
   ComputeAndCompareR1<float>(&builder, expected, {}, ErrorSpec(0.0001));
@@ -129,20 +129,20 @@ XLA_TEST_F(ConcatTest, Concat_R1_L253_With_R1_L7) {
     expected[253 + i] = rhs[i] = 253 + i + 1;
   }
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<float>(lhs);
   auto b = builder.ConstantR1<float>(rhs);
-  auto concatenated = builder.ConcatInDim({a, b}, 0);
+  builder.ConcatInDim({a, b}, 0);
 
   ComputeAndCompareR1<float>(&builder, expected, {}, ErrorSpec(0.0001));
 }
 
 XLA_TEST_F(ConcatTest, Concat_0x0_With_0x0) {
   for (int dim : {0, 1}) {
-    ComputationBuilder builder(client_, TestName());
+    XlaBuilder builder(TestName());
     auto a = builder.ConstantR2FromArray2D(Array2D<float>(0, 0));
     auto b = builder.ConstantR2FromArray2D(Array2D<float>(0, 0));
-    auto concatenated = builder.ConcatInDim({a, b}, dim);
+    builder.ConcatInDim({a, b}, dim);
 
     ComputeAndCompareR2<float>(&builder, Array2D<float>(0, 0), {},
                                ErrorSpec(0.0001));
@@ -150,26 +150,27 @@ XLA_TEST_F(ConcatTest, Concat_0x0_With_0x0) {
 }
 
 XLA_TEST_F(ConcatTest, Concat_1x1_With_1x1_InDim0) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a_array = CreatePatternedMatrix(1, 1);
   auto b_array = CreatePatternedMatrix(1, 1, /*offset=*/64.0);
   auto a = builder.ConstantR2FromArray2D(*a_array);
   auto b = builder.ConstantR2FromArray2D(*b_array);
-  auto concatenated = builder.ConcatInDim({a, b}, 0);
+  builder.ConcatInDim({a, b}, 0);
 
   Array2D<float> expected({
-      {0}, {64},
+      {0},
+      {64},
   });
   ComputeAndCompareR2<float>(&builder, expected, {}, ErrorSpec(0.0001));
 }
 
 XLA_TEST_F(ConcatTest, Concat_1x1_With_1x1_InDim1) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a_array = CreatePatternedMatrix(1, 1);
   auto b_array = CreatePatternedMatrix(1, 1, /*offset=*/64.0);
   auto a = builder.ConstantR2FromArray2D(*a_array);
   auto b = builder.ConstantR2FromArray2D(*b_array);
-  auto concatenated = builder.ConcatInDim({a, b}, 1);
+  builder.ConcatInDim({a, b}, 1);
 
   Array2D<float> expected({
       {0, 64},
@@ -178,22 +179,22 @@ XLA_TEST_F(ConcatTest, Concat_1x1_With_1x1_InDim1) {
 }
 
 XLA_TEST_F(ConcatTest, Concat2x0With2x5) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto b_array = CreatePatternedMatrix(2, 5, /*offset=*/64.0);
   auto a = builder.ConstantR2FromArray2D(Array2D<float>(2, 0));
   auto b = builder.ConstantR2FromArray2D(*b_array);
-  auto concatenated = builder.ConcatInDim({a, b}, 1);
+  builder.ConcatInDim({a, b}, 1);
 
   ComputeAndCompareR2<float>(&builder, *b_array, {}, ErrorSpec(0.0001));
 }
 
 XLA_TEST_F(ConcatTest, Concat2x3With2x5) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a_array = CreatePatternedMatrix(2, 3);
   auto b_array = CreatePatternedMatrix(2, 5, /*offset=*/64.0);
   auto a = builder.ConstantR2FromArray2D(*a_array);
   auto b = builder.ConstantR2FromArray2D(*b_array);
-  auto concatenated = builder.ConcatInDim({a, b}, 1);
+  builder.ConcatInDim({a, b}, 1);
 
   Array2D<float> expected({
       {0, 1, 2, 64, 65, 66, 67, 68},
@@ -203,22 +204,22 @@ XLA_TEST_F(ConcatTest, Concat2x3With2x5) {
 }
 
 XLA_TEST_F(ConcatTest, Concat3x2With0x2) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a_array = CreatePatternedMatrix(3, 2);
   auto a = builder.ConstantR2FromArray2D(*a_array);
   auto b = builder.ConstantR2FromArray2D(Array2D<float>(0, 2));
-  auto concatenated = builder.ConcatInDim({a, b}, 0);
+  builder.ConcatInDim({a, b}, 0);
 
   ComputeAndCompareR2<float>(&builder, *a_array, {}, ErrorSpec(0.0001));
 }
 
 XLA_TEST_F(ConcatTest, Concat3x2With5x2) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a_array = CreatePatternedMatrix(3, 2);
   auto b_array = CreatePatternedMatrix(5, 2, /*offset=*/64.0);
   auto a = builder.ConstantR2FromArray2D(*a_array);
   auto b = builder.ConstantR2FromArray2D(*b_array);
-  auto concatenated = builder.ConcatInDim({a, b}, 0);
+  builder.ConcatInDim({a, b}, 0);
 
   Array2D<float> expected({
       {0, 1},
@@ -234,16 +235,16 @@ XLA_TEST_F(ConcatTest, Concat3x2With5x2) {
 }
 
 XLA_TEST_F(ConcatTest, Concat_R3_3x0x2_3x0x1) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR3FromArray3D(Array3D<float>(3, 0, 2));
   auto b = builder.ConstantR3FromArray3D(Array3D<float>(3, 0, 1));
-  auto concatenated = builder.ConcatInDim({a, b}, 2);
+  builder.ConcatInDim({a, b}, 2);
   ComputeAndCompareR3<float>(&builder, Array3D<float>(3, 0, 3), {},
                              ErrorSpec(0.0001));
 }
 
 XLA_TEST_F(ConcatTest, Concat_R3_3x1x2_3x1x1) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   Array3D<float> a_array({
       // 3x1x2
       {{0, 1}},
@@ -258,27 +259,29 @@ XLA_TEST_F(ConcatTest, Concat_R3_3x1x2_3x1x1) {
   });
   auto a = builder.ConstantR3FromArray3D(a_array);
   auto b = builder.ConstantR3FromArray3D(b_array);
-  auto concatenated = builder.ConcatInDim({a, b}, 2);
+  builder.ConcatInDim({a, b}, 2);
 
   Array3D<float> expected({
-      {{0, 1, 6}}, {{2, 3, 7}}, {{4, 5, 8}},
+      {{0, 1, 6}},
+      {{2, 3, 7}},
+      {{4, 5, 8}},
   });
   ComputeAndCompareR3<float>(&builder, expected, {}, ErrorSpec(0.0001));
 }
 
 XLA_TEST_F(ConcatTest, Concat_R1_1x1_1x1_1x1) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<float>({42.0});
   auto b = builder.ConstantR1<float>({64.0});
   auto c = builder.ConstantR1<float>({256.0});
-  auto concatenated = builder.ConcatInDim({a, b, c}, 0);
+  builder.ConcatInDim({a, b, c}, 0);
 
   std::vector<float> expected = {42, 64, 256};
   ComputeAndCompareR1<float>(&builder, expected, {}, ErrorSpec(0.0001));
 }
 
 XLA_TEST_F(ConcatTest, Concat_R3_3x1x2_3x1x1_3x1x1) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   Array3D<float> a_array({
       // 3x1x2
       {{0, 1}},
@@ -300,35 +303,35 @@ XLA_TEST_F(ConcatTest, Concat_R3_3x1x2_3x1x1_3x1x1) {
   auto a = builder.ConstantR3FromArray3D(a_array);
   auto b = builder.ConstantR3FromArray3D(b_array);
   auto c = builder.ConstantR3FromArray3D(c_array);
-  auto concatenated = builder.ConcatInDim({a, b, c}, 2);
+  builder.ConcatInDim({a, b, c}, 2);
 
   Array3D<float> expected({
-      {{0, 1, 2, 3}}, {{4, 5, 6, 7}}, {{8, 9, 10, 11}},
+      {{0, 1, 2, 3}},
+      {{4, 5, 6, 7}},
+      {{8, 9, 10, 11}},
   });
   ComputeAndCompareR3<float>(&builder, expected, {}, ErrorSpec(0.0001));
 }
 
 XLA_TEST_F(ConcatTest, DoubleConcatLeftAssociative) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<float>({42.0});
   auto b = builder.ConstantR1<float>({64.0});
   auto c = builder.ConstantR1<float>({256.0});
   // concatenated = (a concat b) concat c
-  auto concatenated =
-      builder.ConcatInDim({builder.ConcatInDim({a, b}, 0), c}, 0);
+  builder.ConcatInDim({builder.ConcatInDim({a, b}, 0), c}, 0);
 
   std::vector<float> expected = {42, 64, 256};
   ComputeAndCompareR1<float>(&builder, expected, {}, ErrorSpec(0.0001));
 }
 
 XLA_TEST_F(ConcatTest, DoubleConcatRightAssociative) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR1<float>({42.0});
   auto b = builder.ConstantR1<float>({64.0});
   auto c = builder.ConstantR1<float>({256.0});
   // concatenated = a concat (b concat c)
-  auto concatenated =
-      builder.ConcatInDim({a, builder.ConcatInDim({b, c}, 0)}, 0);
+  builder.ConcatInDim({a, builder.ConcatInDim({b, c}, 0)}, 0);
 
   std::vector<float> expected = {42, 64, 256};
   ComputeAndCompareR1<float>(&builder, expected, {}, ErrorSpec(0.0001));
@@ -342,7 +345,7 @@ XLA_TEST_F(ConcatTest, Concat_1x1024_With_1x1024_InDim0) {
     rhs(0, i) = i + 1024;
   }
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR2FromArray2D<float>(lhs);
   auto b = builder.ConstantR2FromArray2D<float>(rhs);
   builder.ConcatInDim({a, b}, 0);
@@ -363,7 +366,7 @@ XLA_TEST_F(ConcatTest, Concat_1x1024_With_1x1024_InDim1) {
     rhs(0, i) = i + 1024;
   }
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR2FromArray2D<float>(lhs);
   auto b = builder.ConstantR2FromArray2D<float>(rhs);
   builder.ConcatInDim({a, b}, 1);
@@ -388,7 +391,7 @@ XLA_TEST_F(ConcatTest, Concat_64x64_With_64x2) {
     }
   }
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR2FromArray2D<float>(lhs);
   auto b = builder.ConstantR2FromArray2D<float>(rhs);
   builder.ConcatInDim({a, b}, 1);
@@ -404,13 +407,13 @@ XLA_TEST_F(ConcatTest, Concat_64x64_With_64x2) {
 
 // Show that we can't concatenate with an opaques.
 XLA_TEST_F(ConcatTest, CannotConcatOpaques) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto opaque_shape = ShapeUtil::MakeOpaqueShape();
   auto r1f32 = xla::ShapeUtil::MakeShape(xla::F32, {1});
   auto x = builder.Parameter(0, r1f32, "x");
   auto y = builder.Parameter(1, opaque_shape, "y");
-  auto concatenated = builder.ConcatInDim({x, y}, 0);
-  StatusOr<Computation> computation_status = builder.Build();
+  builder.ConcatInDim({x, y}, 0);
+  StatusOr<XlaComputation> computation_status = builder.Build();
   ASSERT_FALSE(computation_status.ok());
   EXPECT_THAT(
       computation_status.status().ToString(),
@@ -418,23 +421,23 @@ XLA_TEST_F(ConcatTest, CannotConcatOpaques) {
 }
 
 XLA_TEST_F(ConcatTest, ConcatSeveralBoxedPredicates) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto p0 = builder.ConstantR1<bool>({true});
   auto p1 = builder.ConstantR1<bool>({false});
   auto p2 = builder.ConstantR1<bool>({true});
-  auto concatenated = builder.ConcatInDim({p0, p1, p2}, 0);
+  builder.ConcatInDim({p0, p1, p2}, 0);
 
   bool expected[] = {true, false, true};
   ComputeAndCompareR1<bool>(&builder, expected, {});
 }
 
 XLA_TEST_F(ConcatTest, ConcatSeveralR1S32s) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a0 = builder.ConstantR1<int32>({1});
   auto a1 = builder.ConstantR1<int32>({2, 3});
   auto a2 = builder.ConstantR1<int32>({4, 5, 6});
   auto a3 = builder.ConstantR1<int32>({7, 8, 9, 10});
-  auto concatenated = builder.ConcatInDim({a0, a1, a2, a3}, 0);
+  builder.ConcatInDim({a0, a1, a2, a3}, 0);
 
   std::vector<int32> expected(10);
   std::iota(expected.begin(), expected.end(), 1);
@@ -442,7 +445,7 @@ XLA_TEST_F(ConcatTest, ConcatSeveralR1S32s) {
 }
 
 XLA_TEST_F(ConcatTest, ConcatR3WeirdDims) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   Array3D<float> arr0(9, 17, 1);
   arr0.Fill(1);
@@ -462,14 +465,14 @@ XLA_TEST_F(ConcatTest, ConcatR3WeirdDims) {
     }
   }
 
-  ComputationDataHandle h0;
+  XlaOp h0;
   auto p0 = CreateR3Parameter<float>(arr0, /*parameter_number=*/0, "p0",
                                      &builder, &h0);
-  ComputationDataHandle h1;
+  XlaOp h1;
   auto p1 = CreateR3Parameter<float>(arr1, /*parameter_number=*/1, "p1",
                                      &builder, &h1);
 
-  auto concatenated = builder.ConcatInDim({h0, h1}, 2);
+  builder.ConcatInDim({h0, h1}, 2);
 
   ComputeAndCompareR3<float>(&builder, expected, {p0.get(), p1.get()});
 }
@@ -495,7 +498,7 @@ TEST_P(ConcatR2BinaryTest, DoIt) {
   Array2D<int32> rhs(spec.rhs_dim0, spec.rhs_dim1);
   rhs.FillUnique(1000);
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a0 = builder.ConstantR2FromArray2D<int32>(lhs);
   auto a1 = builder.ConstantR2FromArray2D<int32>(rhs);
   builder.ConcatInDim({a0, a1}, spec.concat_dimension);
@@ -521,7 +524,7 @@ XLA_TEST_F(ConcatTest, ConcatOperandsOfSameOperand) {
   auto x_data = client_->TransferToServer(*x_literal).ConsumeValueOrDie();
   auto y_data = client_->TransferToServer(*y_literal).ConsumeValueOrDie();
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto x = builder.Parameter(0, f32_scalar, "x");
   auto y = builder.Parameter(1, f32_scalar, "y");
   auto mul = builder.Mul(x, y);
@@ -545,7 +548,7 @@ XLA_TEST_F(ConcatTest, ConcatBroadcastArgument) {
   auto y_data = client_->TransferToServer(*y_literal).ConsumeValueOrDie();
   auto z_data = client_->TransferToServer(*z_literal).ConsumeValueOrDie();
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto x = builder.Parameter(0, x_literal->shape(), "x");
   auto y = builder.Parameter(1, f32_scalar, "y");
   auto z = builder.Parameter(2, f32_scalar, "z");
@@ -573,7 +576,7 @@ XLA_TEST_F(ConcatTest, ConcatBroadcastArgumentR3) {
   auto y_data = client_->TransferToServer(*y_literal).ConsumeValueOrDie();
   auto z_data = client_->TransferToServer(*z_literal).ConsumeValueOrDie();
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto x = builder.Parameter(0, x_literal->shape(), "x");
   auto y = builder.Parameter(1, f32_scalar, "y");
   auto z = builder.Parameter(2, f32_scalar, "y");
diff --git a/tensorflow/compiler/xla/tests/conditional_test.cc b/tensorflow/compiler/xla/tests/conditional_test.cc
index b917dee77b5400db8f2c0a6a86258fee64723d71..7ff6706935740c7d76ee5cd03eae292386760397 100644
--- a/tensorflow/compiler/xla/tests/conditional_test.cc
+++ b/tensorflow/compiler/xla/tests/conditional_test.cc
@@ -13,7 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/compiler/xla/client/computation_builder.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
 #include "tensorflow/compiler/xla/tests/test_macros.h"
@@ -23,8 +24,8 @@ namespace {
 
 class ConditionalOpTest : public ClientLibraryTestBase {
  protected:
-  Computation CreateR0ConstantComputation(float value) {
-    ComputationBuilder builder(client_, "Constant");
+  XlaComputation CreateR0ConstantComputation(float value) {
+    XlaBuilder builder("Constant");
     builder.Parameter(0, empty_tuple_, "tuple");
     builder.ConstantR0<float>(value);
     auto build_status = builder.Build();
@@ -32,16 +33,16 @@ class ConditionalOpTest : public ClientLibraryTestBase {
     return build_status.ConsumeValueOrDie();
   }
 
-  Computation CreateR0IdentityComputation() {
-    ComputationBuilder builder(client_, "Identity");
+  XlaComputation CreateR0IdentityComputation() {
+    XlaBuilder builder("Identity");
     builder.Parameter(0, r0f32_, "x");
     auto build_status = builder.Build();
     EXPECT_IS_OK(build_status.status());
     return build_status.ConsumeValueOrDie();
   }
 
-  Computation CreateCeilComputation(const Shape& shape) {
-    ComputationBuilder builder(client_, "Ceil");
+  XlaComputation CreateCeilComputation(const Shape& shape) {
+    XlaBuilder builder("Ceil");
     auto param = builder.Parameter(0, shape, "param");
     builder.Ceil(param);
     auto build_status = builder.Build();
@@ -49,16 +50,16 @@ class ConditionalOpTest : public ClientLibraryTestBase {
     return build_status.ConsumeValueOrDie();
   }
 
-  Computation CreateR0CeilComputation() {
+  XlaComputation CreateR0CeilComputation() {
     return CreateCeilComputation(r0f32_);
   }
 
-  Computation CreateR1CeilComputation() {
+  XlaComputation CreateR1CeilComputation() {
     return CreateCeilComputation(r1s2f32_);
   }
 
-  Computation CreateFloorComputation(const Shape& shape) {
-    ComputationBuilder builder(client_, "Floor");
+  XlaComputation CreateFloorComputation(const Shape& shape) {
+    XlaBuilder builder("Floor");
     auto param = builder.Parameter(0, shape, "param");
     builder.Floor(param);
     auto build_status = builder.Build();
@@ -66,17 +67,17 @@ class ConditionalOpTest : public ClientLibraryTestBase {
     return build_status.ConsumeValueOrDie();
   }
 
-  Computation CreateR0FloorComputation() {
+  XlaComputation CreateR0FloorComputation() {
     return CreateFloorComputation(r0f32_);
   }
 
-  Computation CreateR1FloorComputation() {
+  XlaComputation CreateR1FloorComputation() {
     return CreateFloorComputation(r1s2f32_);
   }
 
-  Computation CreateTupleCeilComputation(const string& computation_name,
-                                         const Shape& tuple_shape) {
-    ComputationBuilder builder(client_, computation_name);
+  XlaComputation CreateTupleCeilComputation(const string& computation_name,
+                                            const Shape& tuple_shape) {
+    XlaBuilder builder(computation_name);
     auto tuple = builder.Parameter(0, tuple_shape, "tuple");
     auto x = builder.GetTupleElement(tuple, 0);
     auto y = builder.GetTupleElement(tuple, 1);
@@ -88,17 +89,17 @@ class ConditionalOpTest : public ClientLibraryTestBase {
     return build_status.ConsumeValueOrDie();
   }
 
-  Computation CreateR0TupleCeilComputation() {
+  XlaComputation CreateR0TupleCeilComputation() {
     return CreateTupleCeilComputation("CeilR0", tuple_2_r0f32_);
   }
 
-  Computation CreateR1TupleCeilComputation() {
+  XlaComputation CreateR1TupleCeilComputation() {
     return CreateTupleCeilComputation("CeilR1", tuple_2_r1s2f32_);
   }
 
-  Computation CreateTupleFloorComputation(const string& computation_name,
-                                          const Shape& tuple_shape) {
-    ComputationBuilder builder(client_, computation_name);
+  XlaComputation CreateTupleFloorComputation(const string& computation_name,
+                                             const Shape& tuple_shape) {
+    XlaBuilder builder(computation_name);
     auto tuple = builder.Parameter(0, tuple_shape, "tuple");
     auto x = builder.GetTupleElement(tuple, 0);
     auto y = builder.GetTupleElement(tuple, 1);
@@ -110,17 +111,17 @@ class ConditionalOpTest : public ClientLibraryTestBase {
     return build_status.ConsumeValueOrDie();
   }
 
-  Computation CreateR0TupleFloorComputation() {
+  XlaComputation CreateR0TupleFloorComputation() {
     return CreateTupleFloorComputation("FloorR0", tuple_2_r0f32_);
   }
 
-  Computation CreateR1TupleFloorComputation() {
+  XlaComputation CreateR1TupleFloorComputation() {
     return CreateTupleFloorComputation("FloorR1", tuple_2_r1s2f32_);
   }
 
-  Computation CreateTupleAddComputation(const string& computation_name,
-                                        const Shape& tuple_shape) {
-    ComputationBuilder builder(client_, computation_name);
+  XlaComputation CreateTupleAddComputation(const string& computation_name,
+                                           const Shape& tuple_shape) {
+    XlaBuilder builder(computation_name);
     auto tuple = builder.Parameter(0, tuple_shape, "tuple");
     auto x = builder.GetTupleElement(tuple, 0);
     auto y = builder.GetTupleElement(tuple, 1);
@@ -130,17 +131,17 @@ class ConditionalOpTest : public ClientLibraryTestBase {
     return build_status.ConsumeValueOrDie();
   }
 
-  Computation CreateR0TupleAddComputation() {
+  XlaComputation CreateR0TupleAddComputation() {
     return CreateTupleAddComputation("AddR0", tuple_2_r0f32_);
   }
 
-  Computation CreateR1TupleAddComputation() {
+  XlaComputation CreateR1TupleAddComputation() {
     return CreateTupleAddComputation("AddR1", tuple_2_r1s2f32_);
   }
 
-  Computation CreateTupleSubComputation(const string& computation_name,
-                                        const Shape& tuple_shape) {
-    ComputationBuilder builder(client_, computation_name);
+  XlaComputation CreateTupleSubComputation(const string& computation_name,
+                                           const Shape& tuple_shape) {
+    XlaBuilder builder(computation_name);
     auto tuple = builder.Parameter(0, tuple_shape, "tuple");
     auto x = builder.GetTupleElement(tuple, 0);
     auto y = builder.GetTupleElement(tuple, 1);
@@ -150,11 +151,11 @@ class ConditionalOpTest : public ClientLibraryTestBase {
     return build_status.ConsumeValueOrDie();
   }
 
-  Computation CreateR0TupleSubComputation() {
+  XlaComputation CreateR0TupleSubComputation() {
     return CreateTupleSubComputation("SubR0", tuple_2_r0f32_);
   }
 
-  Computation CreateR1TupleSubComputation() {
+  XlaComputation CreateR1TupleSubComputation() {
     return CreateTupleSubComputation("SubR1", tuple_2_r1s2f32_);
   }
 
@@ -170,26 +171,25 @@ class ConditionalOpTest : public ClientLibraryTestBase {
 
 // Test true and false computations that do not take any parameters.
 XLA_TEST_F(ConditionalOpTest, Parameters0) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto pred = builder.ConstantR0<bool>(true);
   auto operands = builder.Tuple({});
   auto true_computation = CreateR0ConstantComputation(56.0f);
   auto false_computation = CreateR0ConstantComputation(12.0f);
-  auto result = builder.Conditional(pred, operands, true_computation, operands,
-                                    false_computation);
+  builder.Conditional(pred, operands, true_computation, operands,
+                      false_computation);
 
   ComputeAndCompareR0<float>(&builder, 56.0f, {}, error_spec_);
 }
 
 // Test true and false computations that take in 1 parameter.
 XLA_TEST_F(ConditionalOpTest, Parameters1) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto pred = builder.ConstantR0<bool>(false);
   auto operand1 = builder.ConstantR0<float>(56.0f);
   auto operand2 = builder.ConstantR0<float>(12.0f);
   auto identity = CreateR0IdentityComputation();
-  auto result =
-      builder.Conditional(pred, operand1, identity, operand2, identity);
+  builder.Conditional(pred, operand1, identity, operand2, identity);
 
   ComputeAndCompareR0<float>(&builder, 12.0f, {}, error_spec_);
 }
@@ -197,12 +197,12 @@ XLA_TEST_F(ConditionalOpTest, Parameters1) {
 // Test conditional with two different computations in the true and false cases
 // that take in different arguments.
 XLA_TEST_F(ConditionalOpTest, DiffComputationsDiffArgs) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto pred = builder.ConstantR0<bool>(false);
   auto operand1 = builder.ConstantR0<float>(56.4f);
   auto operand2 = builder.ConstantR0<float>(12.6f);
-  auto result = builder.Conditional(pred, operand1, CreateR0CeilComputation(),
-                                    operand2, CreateR0FloorComputation());
+  builder.Conditional(pred, operand1, CreateR0CeilComputation(), operand2,
+                      CreateR0FloorComputation());
 
   ComputeAndCompareR0<float>(&builder, 12.0f, {}, error_spec_);
 }
@@ -210,11 +210,11 @@ XLA_TEST_F(ConditionalOpTest, DiffComputationsDiffArgs) {
 // Test conditional with two different computations in the true and false cases
 // that take in the same arguments.
 XLA_TEST_F(ConditionalOpTest, DiffComputationsSameArg) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto pred = builder.ConstantR0<bool>(false);
   auto operand = builder.ConstantR0<float>(12.6f);
-  auto result = builder.Conditional(pred, operand, CreateR0CeilComputation(),
-                                    operand, CreateR0FloorComputation());
+  builder.Conditional(pred, operand, CreateR0CeilComputation(), operand,
+                      CreateR0FloorComputation());
 
   ComputeAndCompareR0<float>(&builder, 12.0f, {}, error_spec_);
 }
@@ -222,12 +222,12 @@ XLA_TEST_F(ConditionalOpTest, DiffComputationsSameArg) {
 // Test conditional with the same computation in the true and false cases but
 // take in different arguments.
 XLA_TEST_F(ConditionalOpTest, SameComputationDiffArgs) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto pred = builder.ConstantR0<bool>(false);
   auto operand1 = builder.ConstantR0<float>(56.4f);
   auto operand2 = builder.ConstantR0<float>(12.6f);
   auto floor = CreateR0FloorComputation();
-  auto result = builder.Conditional(pred, operand1, floor, operand2, floor);
+  builder.Conditional(pred, operand1, floor, operand2, floor);
 
   ComputeAndCompareR0<float>(&builder, 12.0f, {}, error_spec_);
 }
@@ -235,11 +235,11 @@ XLA_TEST_F(ConditionalOpTest, SameComputationDiffArgs) {
 // Test conditional with the same computation in the true and false cases that
 // take in the same arguments.
 XLA_TEST_F(ConditionalOpTest, SameComputationSameArg) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto pred = builder.ConstantR0<bool>(false);
   auto operand = builder.ConstantR0<float>(12.6f);
   auto floor = CreateR0FloorComputation();
-  auto result = builder.Conditional(pred, operand, floor, operand, floor);
+  builder.Conditional(pred, operand, floor, operand, floor);
 
   ComputeAndCompareR0<float>(&builder, 12.0f, {}, error_spec_);
 }
@@ -247,12 +247,12 @@ XLA_TEST_F(ConditionalOpTest, SameComputationSameArg) {
 // Test conditional with different instances of the same computation in the true
 // and false cases.
 XLA_TEST_F(ConditionalOpTest, SameComputationDiffInstances) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto pred = builder.ConstantR0<bool>(false);
   auto operand1 = builder.ConstantR0<float>(56.4f);
   auto operand2 = builder.ConstantR0<float>(12.6f);
-  auto result = builder.Conditional(pred, operand1, CreateR0FloorComputation(),
-                                    operand2, CreateR0FloorComputation());
+  builder.Conditional(pred, operand1, CreateR0FloorComputation(), operand2,
+                      CreateR0FloorComputation());
 
   ComputeAndCompareR0<float>(&builder, 12.0f, {}, error_spec_);
 }
@@ -260,7 +260,7 @@ XLA_TEST_F(ConditionalOpTest, SameComputationDiffInstances) {
 // Test the case when a call invokes a computation that contains a conditional.
 XLA_TEST_F(ConditionalOpTest, ConditionalWithCall) {
   Shape r0bool = ShapeUtil::MakeShape(PRED, {});
-  ComputationBuilder inner_builder(client_, TestName() + ".inner_conditional");
+  XlaBuilder inner_builder(TestName() + ".inner_conditional");
   auto pred_cond = inner_builder.Parameter(0, r0bool, "param0");
   auto true_operand = inner_builder.Parameter(1, r0f32_, "param1");
   auto false_operand = inner_builder.Parameter(2, r0f32_, "param2");
@@ -268,7 +268,7 @@ XLA_TEST_F(ConditionalOpTest, ConditionalWithCall) {
                             false_operand, CreateR0FloorComputation());
   auto inner_builder_result = inner_builder.Build();
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto pred = builder.ConstantR0<bool>(false);
   auto operand1 = builder.ConstantR0<float>(56.4f);
   auto operand2 = builder.ConstantR0<float>(12.6f);
@@ -281,14 +281,13 @@ XLA_TEST_F(ConditionalOpTest, ConditionalWithCall) {
 // Test true and false computations that take in 2 parameters and predicate is
 // true.
 XLA_TEST_F(ConditionalOpTest, Parameters2TrueBranch) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto pred = builder.ConstantR0<bool>(true);
   auto operand1 = builder.ConstantR0<float>(56.0f);
   auto operand2 = builder.ConstantR0<float>(12.0f);
   auto operands = builder.Tuple({operand1, operand2});
-  auto result =
-      builder.Conditional(pred, operands, CreateR0TupleAddComputation(),
-                          operands, CreateR0TupleSubComputation());
+  builder.Conditional(pred, operands, CreateR0TupleAddComputation(), operands,
+                      CreateR0TupleSubComputation());
 
   ComputeAndCompareR0<float>(&builder, 68.0f, {}, error_spec_);
 }
@@ -296,14 +295,13 @@ XLA_TEST_F(ConditionalOpTest, Parameters2TrueBranch) {
 // Test true and false computations that take in 2 parameters and predicate is
 // false.
 XLA_TEST_F(ConditionalOpTest, Parameters2FalseBranch) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto pred = builder.ConstantR0<bool>(false);
   auto operand1 = builder.ConstantR0<float>(56.0f);
   auto operand2 = builder.ConstantR0<float>(12.0f);
   auto operands = builder.Tuple({operand1, operand2});
-  auto result =
-      builder.Conditional(pred, operands, CreateR0TupleAddComputation(),
-                          operands, CreateR0TupleSubComputation());
+  builder.Conditional(pred, operands, CreateR0TupleAddComputation(), operands,
+                      CreateR0TupleSubComputation());
 
   ComputeAndCompareR0<float>(&builder, 44.0f, {}, error_spec_);
 }
@@ -311,14 +309,13 @@ XLA_TEST_F(ConditionalOpTest, Parameters2FalseBranch) {
 // Test true and false computations that take in 2 array parameters and
 // predicate is true.
 XLA_TEST_F(ConditionalOpTest, Parameters2ArrayTrueBranch) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto pred = builder.ConstantR0<bool>(true);
   auto operand1 = builder.ConstantR1<float>({24.0f, 56.0f});
   auto operand2 = builder.ConstantR1<float>({10.0f, 11.0f});
   auto operands = builder.Tuple({operand1, operand2});
-  auto result =
-      builder.Conditional(pred, operands, CreateR1TupleAddComputation(),
-                          operands, CreateR1TupleSubComputation());
+  builder.Conditional(pred, operands, CreateR1TupleAddComputation(), operands,
+                      CreateR1TupleSubComputation());
 
   ComputeAndCompareR1<float>(&builder, {34.0f, 67.0f}, {}, error_spec_);
 }
@@ -326,21 +323,20 @@ XLA_TEST_F(ConditionalOpTest, Parameters2ArrayTrueBranch) {
 // Test true and false computations that take in 2 array parameters and
 // predicate is false.
 XLA_TEST_F(ConditionalOpTest, Parameters2ArrayFalseBranch) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto pred = builder.ConstantR0<bool>(false);
   auto operand1 = builder.ConstantR1<float>({24.0f, 56.0f});
   auto operand2 = builder.ConstantR1<float>({10.0f, 11.0f});
   auto operands = builder.Tuple({operand1, operand2});
-  auto result =
-      builder.Conditional(pred, operands, CreateR1TupleAddComputation(),
-                          operands, CreateR1TupleSubComputation());
+  builder.Conditional(pred, operands, CreateR1TupleAddComputation(), operands,
+                      CreateR1TupleSubComputation());
 
   ComputeAndCompareR1<float>(&builder, {14.0f, 45.0f}, {}, error_spec_);
 }
 
 // Test true and false computations that return a tuple of scalars.
 XLA_TEST_F(ConditionalOpTest, ReturnTupleOfScalars) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto pred = builder.ConstantR0<bool>(false);
   auto operands = builder.Tuple(
       {builder.ConstantR0<float>(12.2f), builder.ConstantR0<float>(25.6f)});
@@ -356,7 +352,7 @@ XLA_TEST_F(ConditionalOpTest, ReturnTupleOfScalars) {
 
 // Test true and false computations that return a tuple of arrays.
 XLA_TEST_F(ConditionalOpTest, ReturnTupleOfArrays) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto pred = builder.ConstantR0<bool>(true);
   auto operands = builder.Tuple({builder.ConstantR1<float>({12.2f, 15.8f}),
                                  builder.ConstantR1<float>({25.6f, 29.2f})});
@@ -373,7 +369,7 @@ XLA_TEST_F(ConditionalOpTest, ReturnTupleOfArrays) {
 // Test true and false computations that return a tuple of a predicate, a
 // scalar, and an array.
 XLA_TEST_F(ConditionalOpTest, ReturnTupleofPredicateScalarArray) {
-  ComputationBuilder true_builder(client_, TestName() + ".true");
+  XlaBuilder true_builder(TestName() + ".true");
   {
     true_builder.Parameter(0, empty_tuple_, "tuple");
     auto true_pred = true_builder.ConstantR0<bool>(true);
@@ -384,7 +380,7 @@ XLA_TEST_F(ConditionalOpTest, ReturnTupleofPredicateScalarArray) {
   auto true_builder_result = true_builder.Build();
   EXPECT_IS_OK(true_builder_result.status());
 
-  ComputationBuilder false_builder(client_, TestName() + ".false");
+  XlaBuilder false_builder(TestName() + ".false");
   {
     false_builder.Parameter(0, empty_tuple_, "tuple");
     auto false_pred = false_builder.ConstantR0<bool>(false);
@@ -395,7 +391,7 @@ XLA_TEST_F(ConditionalOpTest, ReturnTupleofPredicateScalarArray) {
   auto false_builder_result = false_builder.Build();
   EXPECT_IS_OK(false_builder_result.status());
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto pred = builder.ConstantR0<bool>(true);
   auto operands = builder.Tuple({});
   builder.Conditional(pred, operands, true_builder_result.ConsumeValueOrDie(),
@@ -411,7 +407,7 @@ XLA_TEST_F(ConditionalOpTest, ReturnTupleofPredicateScalarArray) {
 
 // Test true and false computations that return a nested tuple.
 XLA_TEST_F(ConditionalOpTest, ReturnNestedTuple) {
-  ComputationBuilder true_builder(client_, TestName() + ".true");
+  XlaBuilder true_builder(TestName() + ".true");
   {
     true_builder.Parameter(0, empty_tuple_, "tuple");
     auto true_constant1 = true_builder.ConstantR0<float>(12.2f);
@@ -424,7 +420,7 @@ XLA_TEST_F(ConditionalOpTest, ReturnNestedTuple) {
   auto true_builder_result = true_builder.Build();
   EXPECT_IS_OK(true_builder_result.status());
 
-  ComputationBuilder false_builder(client_, TestName() + ".false");
+  XlaBuilder false_builder(TestName() + ".false");
   {
     false_builder.Parameter(0, empty_tuple_, "tuple");
     auto false_constant1 = false_builder.ConstantR0<float>(46.6f);
@@ -438,7 +434,7 @@ XLA_TEST_F(ConditionalOpTest, ReturnNestedTuple) {
   auto false_builder_result = false_builder.Build();
   EXPECT_IS_OK(false_builder_result.status());
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto pred = builder.ConstantR0<bool>(false);
   auto operands = builder.Tuple({});
   builder.Conditional(pred, operands, true_builder_result.ConsumeValueOrDie(),
@@ -460,16 +456,16 @@ XLA_TEST_F(ConditionalOpTest, ReturnNestedTuple) {
 // params.
 XLA_TEST_F(ConditionalOpTest, ScalarOperandsFromExternalParams) {
   Shape r0bool = ShapeUtil::MakeShape(PRED, {});
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
-  ComputationDataHandle pred, operand1, operand2;
+  XlaOp pred, operand1, operand2;
   auto pred_arg = CreateR0Parameter<bool>(true, 0, "pred", &builder, &pred);
   auto operand1_param =
       CreateR0Parameter<float>(56.3f, 1, "operand1", &builder, &operand1);
   auto operand2_param =
       CreateR0Parameter<float>(12.7f, 2, "operand2", &builder, &operand2);
-  auto result = builder.Conditional(pred, operand1, CreateR0CeilComputation(),
-                                    operand2, CreateR0FloorComputation());
+  builder.Conditional(pred, operand1, CreateR0CeilComputation(), operand2,
+                      CreateR0FloorComputation());
 
   ComputeAndCompareR0<float>(
       &builder, 57.0f,
@@ -480,16 +476,16 @@ XLA_TEST_F(ConditionalOpTest, ScalarOperandsFromExternalParams) {
 // Test conditional that takes in array operands in the form of external params.
 XLA_TEST_F(ConditionalOpTest, ArrayOperandsFromExternalParams) {
   Shape r0bool = ShapeUtil::MakeShape(PRED, {});
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
-  ComputationDataHandle pred, operand1, operand2;
+  XlaOp pred, operand1, operand2;
   auto pred_arg = CreateR0Parameter<bool>(false, 0, "pred", &builder, &pred);
   auto operand1_param = CreateR1Parameter<float>({24.3f, 56.7f}, 1, "operand1",
                                                  &builder, &operand1);
   auto operand2_param = CreateR1Parameter<float>({10.2f, 11.6f}, 2, "operand2",
                                                  &builder, &operand2);
-  auto result = builder.Conditional(pred, operand1, CreateR1CeilComputation(),
-                                    operand2, CreateR1FloorComputation());
+  builder.Conditional(pred, operand1, CreateR1CeilComputation(), operand2,
+                      CreateR1FloorComputation());
 
   ComputeAndCompareR1<float>(
       &builder, {10.0f, 11.0f},
@@ -499,7 +495,7 @@ XLA_TEST_F(ConditionalOpTest, ArrayOperandsFromExternalParams) {
 
 // Test the case where one conditional is nested within another.
 XLA_TEST_F(ConditionalOpTest, NestedConditionals) {
-  ComputationBuilder inner_builder(client_, TestName() + ".inner_conditional");
+  XlaBuilder inner_builder(TestName() + ".inner_conditional");
   {
     Shape r0bool = ShapeUtil::MakeShape(PRED, {});
     Shape tuple_shape = ShapeUtil::MakeTupleShape({r0bool, r0f32_, r0f32_});
@@ -514,7 +510,7 @@ XLA_TEST_F(ConditionalOpTest, NestedConditionals) {
   auto inner_builder_result = inner_builder.Build();
   EXPECT_IS_OK(inner_builder_result.status());
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto pred1 = builder.ConstantR0<bool>(true);
   auto pred2 = builder.ConstantR0<bool>(false);
   auto operand1 = builder.ConstantR0<float>(1.1f);
@@ -529,7 +525,7 @@ XLA_TEST_F(ConditionalOpTest, NestedConditionals) {
 }
 
 XLA_TEST_F(ConditionalOpTest, ConditionalInNestedComputation) {
-  ComputationBuilder inner_builder(client_, TestName() + ".inner_conditional");
+  XlaBuilder inner_builder(TestName() + ".inner_conditional");
   {
     Shape r0bool = ShapeUtil::MakeShape(PRED, {});
     Shape tuple_shape = ShapeUtil::MakeTupleShape({r0bool, r0f32_, r0f32_});
@@ -544,7 +540,7 @@ XLA_TEST_F(ConditionalOpTest, ConditionalInNestedComputation) {
   auto inner_builder_result = inner_builder.Build();
   EXPECT_IS_OK(inner_builder_result.status());
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto pred2 = builder.ConstantR0<bool>(false);
   auto operand1 = builder.ConstantR0<float>(1.1f);
   auto operand2 = builder.ConstantR0<float>(12.2f);
@@ -556,7 +552,7 @@ XLA_TEST_F(ConditionalOpTest, ConditionalInNestedComputation) {
 
 // Test a mismatch in the shape of the true operand and true computation.
 XLA_TEST_F(ConditionalOpTest, ShapeMismatch) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto pred = builder.ConstantR0<bool>(true);
   auto operand1 = builder.ConstantR0<float>(56.0f);
   auto operand2 = builder.ConstantR0<float>(12.0f);
@@ -573,27 +569,27 @@ XLA_TEST_F(ConditionalOpTest, ShapeMismatch) {
 
 XLA_TEST_F(ConditionalOpTest, SwappedInputsInSequentialConditionals) {
   Shape tuple_shape = ShapeUtil::MakeTupleShape({r0f32_, r0f32_});
-  Computation swapper;
+  XlaComputation swapper;
   {
-    ComputationBuilder builder(client_, TestName() + ".swapper");
+    XlaBuilder builder(TestName() + ".swapper");
     auto param0 = builder.Parameter(0, tuple_shape, "sp0");
     auto x = builder.GetTupleElement(param0, 0);
     auto y = builder.GetTupleElement(param0, 1);
     builder.Tuple({y, x});
     swapper = builder.Build().ConsumeValueOrDie();
   }
-  Computation forwarder;
+  XlaComputation forwarder;
   {
-    ComputationBuilder builder(client_, TestName() + ".forwarder");
+    XlaBuilder builder(TestName() + ".forwarder");
     auto param0 = builder.Parameter(0, tuple_shape, "fp0");
     auto x = builder.GetTupleElement(param0, 0);
     auto y = builder.GetTupleElement(param0, 1);
     builder.Tuple({x, y});
     forwarder = builder.Build().ConsumeValueOrDie();
   }
-  Computation main;
+  XlaComputation main;
   {
-    ComputationBuilder builder(client_, TestName() + ".main");
+    XlaBuilder builder(TestName() + ".main");
     auto param0 = builder.Parameter(0, tuple_shape, "mp0");
     auto x = builder.GetTupleElement(param0, 0);
     auto y = builder.GetTupleElement(param0, 1);
@@ -605,7 +601,7 @@ XLA_TEST_F(ConditionalOpTest, SwappedInputsInSequentialConditionals) {
   }
 
   auto test_swap = [&](float a, float b) {
-    ComputationBuilder builder(client_, TestName());
+    XlaBuilder builder(TestName());
     auto x = builder.ConstantR0<float>(a);
     auto y = builder.ConstantR0<float>(b);
     auto tuple_operand = builder.Tuple({x, y});
diff --git a/tensorflow/compiler/xla/tests/convert_test.cc b/tensorflow/compiler/xla/tests/convert_test.cc
index 9a899b79141fbc35fabd8d2e5d4195fb589dd84c..0842a8918bcfec037ab0f9aa24014c7d8296cdf8 100644
--- a/tensorflow/compiler/xla/tests/convert_test.cc
+++ b/tensorflow/compiler/xla/tests/convert_test.cc
@@ -230,6 +230,43 @@ XLA_TEST_F(ConvertTest, ConvertR1S32ToR1S64) {
   ComputeAndCompareR1<int64>(&builder, expected, {arg_data.get()});
 }
 
+XLA_TEST_F(ConvertTest, ConvertR1F32ToR1S64) {
+  ComputationBuilder builder(client_, TestName());
+  // Test cases from compiler_rt library.
+  std::vector<float> arg{0.0f,
+                         0.5f,
+                         0.99f,
+                         1.0f,
+                         1.5f,
+                         1.99f,
+                         2.0f,
+                         2.01f,
+                         2147483648.f,
+                         -0.5f,
+                         -0.99f,
+                         -1.0f,
+                         -1.5f,
+                         -1.99f,
+                         -2.0f,
+                         -2.01f,
+                         0x1.FFFFFEp+62F,
+                         0x1.FFFFFCp+62F,
+                         -0x1.FFFFFEp+62F,
+                         -0x1.FFFFFCp+62F};
+  std::unique_ptr<Literal> arg_literal = Literal::CreateR1<float>({arg});
+  auto arg_param = builder.Parameter(0, arg_literal->shape(), "arg_param");
+  std::unique_ptr<GlobalData> arg_data =
+      client_->TransferToServer(*arg_literal).ConsumeValueOrDie();
+
+  builder.ConvertElementType(arg_param, S64);
+
+  std::vector<int64> expected(arg.size());
+  for (int64 i = 0; i < arg.size(); ++i) {
+    expected[i] = static_cast<int64>(arg[i]);
+  }
+  ComputeAndCompareR1<int64>(&builder, expected, {arg_data.get()});
+}
+
 XLA_TEST_F(ConvertTest, ConvertR1U8ToR1F32) {
   ComputationBuilder builder(client_, TestName());
   auto a = builder.ConstantR1<uint8_t>({32, 64});
diff --git a/tensorflow/compiler/xla/tests/convolution_test.cc b/tensorflow/compiler/xla/tests/convolution_test.cc
index 72715398dea468d0000144759454c5f8d8673516..5eb3136abea35eb7bfa819c05b449ded9d8794a7 100644
--- a/tensorflow/compiler/xla/tests/convolution_test.cc
+++ b/tensorflow/compiler/xla/tests/convolution_test.cc
@@ -20,10 +20,10 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/array4d.h"
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/global_data.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/client/padding.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/ptr_util.h"
@@ -88,12 +88,12 @@ class ForwardPassConvolution_3x3x256_256_OutputZ_Iota : public ConvolutionTest {
     ASSERT_EQ(2, arhs->width());
     ASSERT_EQ(2, arhs->height());
 
-    ComputationBuilder builder(client_, TestName());
+    XlaBuilder builder(TestName());
     auto lhs = builder.ConstantR4FromArray4D<T>(*alhs);
     auto rhs = builder.ConstantR4FromArray4D<T>(*arhs);
-    auto conv = builder.Conv(lhs, rhs, {1, 1}, Padding::kValid);
+    builder.Conv(lhs, rhs, {1, 1}, Padding::kValid);
 
-    ComputeAndCompare(&builder, conv, {}, error_spec_);
+    ComputeAndCompare(&builder, {}, error_spec_);
   }
 };
 
@@ -106,12 +106,12 @@ template <typename T>
 class Convolve_1x1x1x2_1x1x1x2_Valid : public ConvolutionTest {
  public:
   void RunTest() {
-    ComputationBuilder builder(client_, TestName());
+    XlaBuilder builder(TestName());
     Shape input_shape = ShapeUtil::MakeShapeWithType<T>({1, 1, 1, 2});
     Shape filter_shape = ShapeUtil::MakeShapeWithType<T>({1, 1, 1, 2});
     auto input = builder.Parameter(0, input_shape, "input");
     auto filter = builder.Parameter(1, filter_shape, "filter");
-    auto conv = builder.Conv(input, filter, {1, 1}, Padding::kValid);
+    builder.Conv(input, filter, {1, 1}, Padding::kValid);
 
     Array4D<T> input_data(1, 1, 1, 2);
     input_data.FillWithYX(Array2D<T>({
@@ -122,7 +122,7 @@ class Convolve_1x1x1x2_1x1x1x2_Valid : public ConvolutionTest {
         {5.0f, 6.0f},
     }));
 
-    ComputeAndCompare(&builder, conv,
+    ComputeAndCompare(&builder,
                       {std::move(*Literal::CreateFromArray(input_data)),
                        std::move(*Literal::CreateFromArray(filter_data))},
                       error_spec_);
@@ -137,12 +137,12 @@ template <typename T>
 class Convolve_1x1x4x4_1x1x2x2_Valid : public ConvolutionTest {
  public:
   void RunTest() {
-    ComputationBuilder builder(client_, TestName());
+    XlaBuilder builder(TestName());
     Shape input_shape = ShapeUtil::MakeShapeWithType<T>({1, 1, 4, 4});
     Shape filter_shape = ShapeUtil::MakeShapeWithType<T>({1, 1, 2, 2});
     auto input = builder.Parameter(0, input_shape, "input");
     auto filter = builder.Parameter(1, filter_shape, "filter");
-    auto conv = builder.Conv(input, filter, {1, 1}, Padding::kValid);
+    builder.Conv(input, filter, {1, 1}, Padding::kValid);
 
     Array4D<T> input_data(1, 1, 4, 4);
     input_data.FillWithYX(Array2D<T>({
@@ -156,7 +156,7 @@ class Convolve_1x1x4x4_1x1x2x2_Valid : public ConvolutionTest {
         {5.0f, 6.0f},
         {7.0f, 8.0f},
     }));
-    ComputeAndCompare(&builder, conv,
+    ComputeAndCompare(&builder,
                       {std::move(*Literal::CreateFromArray(input_data)),
                        std::move(*Literal::CreateFromArray(filter_data))},
                       error_spec_);
@@ -171,12 +171,12 @@ template <typename T>
 class Convolve_1x1x4x4_1x1x2x2_Same : public ConvolutionTest {
  public:
   void RunTest() {
-    ComputationBuilder builder(client_, TestName());
+    XlaBuilder builder(TestName());
     Shape input_shape = ShapeUtil::MakeShapeWithType<T>({1, 1, 4, 4});
     Shape filter_shape = ShapeUtil::MakeShapeWithType<T>({1, 1, 2, 2});
     auto input = builder.Parameter(0, input_shape, "input");
     auto filter = builder.Parameter(1, filter_shape, "filter");
-    auto conv = builder.Conv(input, filter, {1, 1}, Padding::kSame);
+    builder.Conv(input, filter, {1, 1}, Padding::kSame);
 
     Array4D<T> input_data(1, 1, 4, 4);
     input_data.FillWithYX(Array2D<T>({
@@ -191,7 +191,7 @@ class Convolve_1x1x4x4_1x1x2x2_Same : public ConvolutionTest {
         {7.0f, 8.0f},
     }));
 
-    ComputeAndCompare(&builder, conv,
+    ComputeAndCompare(&builder,
                       {std::move(*Literal::CreateFromArray(input_data)),
                        std::move(*Literal::CreateFromArray(filter_data))},
                       error_spec_);
@@ -207,12 +207,12 @@ template <typename T>
 class Convolve_1x1x4x4_1x1x3x3_Same : public ConvolutionTest {
  public:
   void RunTest() {
-    ComputationBuilder builder(client_, TestName());
+    XlaBuilder builder(TestName());
     Shape input_shape = ShapeUtil::MakeShapeWithType<T>({1, 1, 4, 4});
     Shape filter_shape = ShapeUtil::MakeShapeWithType<T>({1, 1, 3, 3});
     auto input = builder.Parameter(0, input_shape, "input");
     auto filter = builder.Parameter(1, filter_shape, "filter");
-    auto conv = builder.Conv(input, filter, {1, 1}, Padding::kSame);
+    builder.Conv(input, filter, {1, 1}, Padding::kSame);
 
     Array4D<T> input_data(1, 1, 4, 4);
     input_data.FillWithYX(Array2D<T>({{1.0f, 2.0f, 3.0f, 4.0f},
@@ -223,7 +223,7 @@ class Convolve_1x1x4x4_1x1x3x3_Same : public ConvolutionTest {
     filter_data.FillWithYX(Array2D<T>(
         {{5.0f, 6.0f, 7.0f}, {8.0f, 9.0f, 10.0f}, {11.0f, 12.0f, 13.0f}}));
     // clang-format on
-    ComputeAndCompare(&builder, conv,
+    ComputeAndCompare(&builder,
                       {std::move(*Literal::CreateFromArray(input_data)),
                        std::move(*Literal::CreateFromArray(filter_data))},
                       error_spec_);
@@ -234,7 +234,7 @@ TYPED_TEST_CASE(Convolve_1x1x4x4_1x1x3x3_Same, TestTypes);
 TYPED_TEST(Convolve_1x1x4x4_1x1x3x3_Same, Types) { this->RunTest(); }
 
 XLA_TEST_F(ConvolutionTest, Convolve1D_1x2x5_1x2x2_Valid) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   {
     Shape input_shape = ShapeUtil::MakeShape(F32, {1, 2, 5});
     Shape filter_shape = ShapeUtil::MakeShape(F32, {1, 2, 2});
@@ -264,7 +264,7 @@ template <typename T>
 class Convolve1D_1x2x5_1x2x2_WithRHSDilation : public ConvolutionTest {
  public:
   void RunTest() {
-    ComputationBuilder builder(client_, TestName());
+    XlaBuilder builder(TestName());
     {
       Shape input_shape = ShapeUtil::MakeShapeWithType<T>({1, 2, 5});
       Shape filter_shape = ShapeUtil::MakeShapeWithType<T>({1, 2, 2});
@@ -300,7 +300,7 @@ TYPED_TEST_CASE(Convolve1D_1x2x5_1x2x2_WithRHSDilation, TestTypes);
 TYPED_TEST(Convolve1D_1x2x5_1x2x2_WithRHSDilation, Types) { this->RunTest(); }
 
 XLA_TEST_F(ConvolutionTest, Convolve1D_1x2x5_1x2x2_WithLHSDilation) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   {
     Shape input_shape = ShapeUtil::MakeShape(F32, {1, 2, 5});
     Shape filter_shape = ShapeUtil::MakeShape(F32, {1, 2, 2});
@@ -331,7 +331,7 @@ XLA_TEST_F(ConvolutionTest, Convolve1D_1x2x5_1x2x2_WithLHSDilation) {
 }
 
 XLA_TEST_F(ConvolutionTest, Convolve1D_1x2x5_1x2x2_WithLHSAndRHSDilation) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   {
     Shape input_shape = ShapeUtil::MakeShape(F32, {1, 2, 5});
     Shape filter_shape = ShapeUtil::MakeShape(F32, {1, 2, 2});
@@ -365,7 +365,7 @@ template <typename T>
 class Convolve1D_1x2x5_1x2x2_WithPadding : public ConvolutionTest {
  public:
   void RunTest() {
-    ComputationBuilder builder(client_, TestName());
+    XlaBuilder builder(TestName());
     {
       Shape input_shape = ShapeUtil::MakeShapeWithType<T>({1, 2, 5});
       Shape filter_shape = ShapeUtil::MakeShapeWithType<T>({1, 2, 2});
@@ -402,7 +402,7 @@ TYPED_TEST_CASE(Convolve1D_1x2x5_1x2x2_WithPadding, TestTypes);
 TYPED_TEST(Convolve1D_1x2x5_1x2x2_WithPadding, Types) { this->RunTest(); }
 
 XLA_TEST_F(ConvolutionTest, Convolve3D_1x4x2x3x3_2x2x2x3x3_Valid) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   std::vector<int64> input_dims = {1, 4, 2, 3, 3};
   std::vector<int64> filter_dims = {2, 2, 2, 3, 3};
   Shape input_shape = ShapeUtil::MakeShape(F32, input_dims);
@@ -469,7 +469,7 @@ template <typename T>
 class Convolve2D_1x3x3x5_3x3x5x5_Valid : public ConvolutionTest {
  public:
   void RunTest() {
-    ComputationBuilder builder(client_, TestName());
+    XlaBuilder builder(TestName());
     std::vector<int64> input_dims = {1, 3, 3, 5};
     std::vector<int64> filter_dims = {3, 3, 5, 3};
     Shape input_shape = ShapeUtil::MakeShapeWithType<T>(input_dims);
@@ -537,7 +537,7 @@ XLA_TEST_P(ConvolveWithAndWithoutCanonicalization,
     execution_options_.mutable_debug_options()->add_xla_disable_hlo_passes(
         "convolution-canonicalization");
   }
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   Shape input_shape = ShapeUtil::MakeShape(F32, {4, 29});
   Shape filter_shape = ShapeUtil::MakeShape(F32, {4, 10});
 
@@ -551,8 +551,7 @@ XLA_TEST_P(ConvolveWithAndWithoutCanonicalization,
   dnums.set_kernel_output_feature_dimension(1);
   dnums.set_output_batch_dimension(0);
   dnums.set_output_feature_dimension(1);
-  auto conv = builder.ConvWithGeneralDimensions(input, filter, {},
-                                                Padding::kValid, dnums);
+  builder.ConvWithGeneralDimensions(input, filter, {}, Padding::kValid, dnums);
 
   Array2D<float> param0(4, 29);
   param0.FillUnique();
@@ -563,7 +562,7 @@ XLA_TEST_P(ConvolveWithAndWithoutCanonicalization,
   Array2D<float> expected_result(29, 10);
   expected_result.Fill(0);
 
-  ComputeAndCompare(&builder, conv,
+  ComputeAndCompare(&builder,
                     {std::move(*Literal::CreateFromArray(param0)),
                      std::move(*Literal::CreateFromArray(param1))},
                     error_spec_);
@@ -587,7 +586,7 @@ class Convolve1D1WindowTestBase
  protected:
   template <typename T>
   void TestImpl() {
-    ComputationBuilder builder(client_, TestName());
+    XlaBuilder builder(TestName());
     int64 input_feature = GetParam().input_feature;
     int64 output_feature = GetParam().output_feature;
     int64 batch = GetParam().batch;
@@ -724,12 +723,12 @@ INSTANTIATE_TEST_CASE_P(
 #endif
 
 XLA_TEST_F(ConvolutionTest, Convolve_bf16_1x1x1x2_1x1x1x2_Valid) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   Shape input_shape = ShapeUtil::MakeShape(BF16, {1, 1, 1, 2});
   Shape filter_shape = ShapeUtil::MakeShape(BF16, {1, 1, 1, 2});
   auto input = builder.Parameter(0, input_shape, "input");
   auto filter = builder.Parameter(1, filter_shape, "filter");
-  auto conv = builder.Conv(input, filter, {1, 1}, Padding::kValid);
+  builder.Conv(input, filter, {1, 1}, Padding::kValid);
 
   Array4D<bfloat16> input_data(1, 1, 1, 2);
   input_data.FillWithYX(Array2D<bfloat16>({
@@ -740,7 +739,7 @@ XLA_TEST_F(ConvolutionTest, Convolve_bf16_1x1x1x2_1x1x1x2_Valid) {
       {bfloat16(5), bfloat16(6)},
   }));
 
-  ComputeAndCompare(&builder, conv,
+  ComputeAndCompare(&builder,
                     {std::move(*Literal::CreateFromArray(input_data)),
                      std::move(*Literal::CreateFromArray(filter_data))},
                     error_spec_);
diff --git a/tensorflow/compiler/xla/tests/custom_call_test.cc b/tensorflow/compiler/xla/tests/custom_call_test.cc
index 2d847a66b0ae7c8f09fa0cb181a4c84ea99be5b1..b43d5c9ff5d75ee0e1b3c9ceb2bc295e631ac107 100644
--- a/tensorflow/compiler/xla/tests/custom_call_test.cc
+++ b/tensorflow/compiler/xla/tests/custom_call_test.cc
@@ -134,9 +134,9 @@ class CustomCallClientAPITest : public ClientLibraryTestBase {};
 // When using the client API, CustomCall targets can't begin with '$' -- these
 // are reserved for internal use.
 XLA_TEST_F(CustomCallClientAPITest, IllegalCustomCallTarget) {
-  ComputationBuilder builder(client_, TestName());
-  auto call = builder.CustomCall("$illegal", /*operands=*/{},
-                                 ShapeUtil::MakeShape(F32, {1}));
+  XlaBuilder builder(TestName());
+  builder.CustomCall("$illegal", /*operands=*/{},
+                     ShapeUtil::MakeShape(F32, {1}));
 
   StatusOr<std::unique_ptr<GlobalData>> result =
       Execute(&builder, /*arguments=*/{});
diff --git a/tensorflow/compiler/xla/tests/dot_operation_test.cc b/tensorflow/compiler/xla/tests/dot_operation_test.cc
index 09b1dd283e4d026a2f0007240d88cd9ac38acb19..7b994a4c172cafee53ede9bfd4f30b0e0c9888d5 100644
--- a/tensorflow/compiler/xla/tests/dot_operation_test.cc
+++ b/tensorflow/compiler/xla/tests/dot_operation_test.cc
@@ -54,6 +54,25 @@ using TypesF16F32F64CF64 =
 #error "Situation not handled yet"
 #endif
 
+// Check that we can safely pass an input tuple's elements to a dot operation.
+TEST_F(DotOperationTest, DotOfInputTupleElem) {
+  ComputationBuilder builder(client_, TestName());
+
+  ComputationDataHandle param;
+  auto param_data = CreateParameterAndTransferLiteral(
+      0,
+      *Literal::MakeTuple({Literal::CreateR2<float>({{1, 2}, {3, 4}}).get(),
+                           Literal::CreateR2<float>({{5, 6}, {7, 8}}).get()}),
+      "arg0", &builder, &param);
+  auto lhs = builder.GetTupleElement(param, 0);
+  auto rhs = builder.GetTupleElement(param, 1);
+  builder.Dot(lhs, rhs);
+
+  ComputeAndCompareLiteral(&builder,
+                           *Literal::CreateR2<float>({{19, 22}, {43, 50}}),
+                           {param_data.get()});
+}
+
 template <typename T>
 class DotOperationTest_F16F32F64CF64 : public DotOperationTest {};
 TYPED_TEST_CASE(DotOperationTest_F16F32F64CF64, TypesF16F32F64CF64);
diff --git a/tensorflow/compiler/xla/tests/dynamic_ops_test.cc b/tensorflow/compiler/xla/tests/dynamic_ops_test.cc
index 4f354e6aefe70a51c09be1c0ca151af2bb9f0a2c..5f00c34002803553b9c17b4fce0abafda7369796 100644
--- a/tensorflow/compiler/xla/tests/dynamic_ops_test.cc
+++ b/tensorflow/compiler/xla/tests/dynamic_ops_test.cc
@@ -18,9 +18,8 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/client/client_library.h"
-#include "tensorflow/compiler/xla/client/computation.h"
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
 #include "tensorflow/compiler/xla/reference_util.h"
 #include "tensorflow/compiler/xla/service/device_memory_allocator.h"
 #include "tensorflow/compiler/xla/service/local_service.h"
@@ -112,10 +111,8 @@ class DynamicSliceTest : public ClientLibraryTestBase {
   void TestR3Wrap() {
     // Slice at dimension boundaries, but with sizes that cause indices to wrap.
     RunR3<IndexT, DataT>(
-      {{{1, 2}, {3, 4}, {5, 6}},
-       {{7, 8}, {9, 10}, {11, 12}}},
-      {0, 2, 1}, {2, 1, 2},
-      {{{6, 5}}, {{12, 11}}});
+        {{{1, 2}, {3, 4}, {5, 6}}, {{7, 8}, {9, 10}, {11, 12}}}, {0, 2, 1},
+        {2, 1, 2}, {{{6, 5}}, {{12, 11}}});
   }
 
   template <typename IndexT, typename DataT>
@@ -137,9 +134,9 @@ class DynamicSliceTest : public ClientLibraryTestBase {
                        ->Convert(primitive_util::NativeToPrimitiveType<DataT>())
                        .ValueOrDie());
 
-    ComputationBuilder builder(client_, TestName());
+    XlaBuilder builder(TestName());
     // Initialize and transfer dynamic slice start indices parameter.
-    ComputationDataHandle starts;
+    XlaOp starts;
     std::unique_ptr<GlobalData> start_data = CreateR1Parameter<IndexT>(
         slice_starts, 0, "slice_starts", &builder, &starts);
     // Build dynamic slice computation.
@@ -163,9 +160,9 @@ class DynamicSliceTest : public ClientLibraryTestBase {
                        ->Convert(primitive_util::NativeToPrimitiveType<DataT>())
                        .ValueOrDie());
 
-    ComputationBuilder builder(client_, TestName());
+    XlaBuilder builder(TestName());
     // Initialize and transfer dynamic slice start indices parameter.
-    ComputationDataHandle starts;
+    XlaOp starts;
     std::unique_ptr<GlobalData> start_data = CreateR1Parameter<IndexT>(
         slice_starts, 0, "slice_starts", &builder, &starts);
     // Build dynamic slice computation.
@@ -189,9 +186,9 @@ class DynamicSliceTest : public ClientLibraryTestBase {
                        ->Convert(primitive_util::NativeToPrimitiveType<DataT>())
                        .ValueOrDie());
 
-    ComputationBuilder builder(client_, TestName());
+    XlaBuilder builder(TestName());
     // Initialize and transfer dynamic slice start indices parameter.
-    ComputationDataHandle starts;
+    XlaOp starts;
     std::unique_ptr<GlobalData> start_data = CreateR1Parameter<IndexT>(
         slice_starts, 0, "slice_starts", &builder, &starts);
     // Build dynamic slice computation.
@@ -281,6 +278,15 @@ XLA_TEST_F(DynamicSliceTest, Int32R3Pred) {
 
 class DynamicUpdateSliceTest : public ClientLibraryTestBase {
  protected:
+  template <typename IndexT, typename DataT>
+  void TestR0() {
+    // Disable algebraic simplifier, otherwise the op will be replaced by a
+    // constant.
+    execution_options_.mutable_debug_options()->add_xla_disable_hlo_passes(
+        "algsimp");
+    RunR0<IndexT, DataT>(0, 123, {}, 123);
+  }
+
   template <typename IndexT, typename DataT>
   void TestR1() {
     // Slice at dimension start.
@@ -341,6 +347,35 @@ class DynamicUpdateSliceTest : public ClientLibraryTestBase {
         {1, 2, 1}, {{{1, 2}, {3, 4}, {5, 6}}, {{7, 15}, {9, 10}, {11, 13}}});
   }
 
+  template <typename IndexT, typename DataT>
+  void RunR0(int input_value_int, int update_value_int,
+             const std::vector<IndexT> slice_starts, int expected_value_int) {
+    Literal input_value =
+        std::move(*Literal::CreateR0(input_value_int)
+                       ->Convert(primitive_util::NativeToPrimitiveType<DataT>())
+                       .ValueOrDie());
+    Literal update_value =
+        std::move(*Literal::CreateR0(update_value_int)
+                       ->Convert(primitive_util::NativeToPrimitiveType<DataT>())
+                       .ValueOrDie());
+    Literal expected_value =
+        std::move(*Literal::CreateR0(expected_value_int)
+                       ->Convert(primitive_util::NativeToPrimitiveType<DataT>())
+                       .ValueOrDie());
+
+    ComputationBuilder builder(client_, TestName());
+    // Initialize and transfer dynamic slice start indices parameter.
+    ComputationDataHandle starts;
+    std::unique_ptr<GlobalData> start_data = CreateR1Parameter<IndexT>(
+        slice_starts, 0, "slice_starts", &builder, &starts);
+    // Build dynamic slice computation.
+    auto input = builder.ConstantLiteral(input_value);
+    auto update = builder.ConstantLiteral(update_value);
+    builder.DynamicUpdateSlice(input, update, starts);
+    // Run computation and compare against expected values.
+    ComputeAndCompareLiteral(&builder, expected_value, {start_data.get()});
+  }
+
   template <typename IndexT, typename DataT>
   void RunR1(tensorflow::gtl::ArraySlice<int> input_values_int,
              tensorflow::gtl::ArraySlice<int> update_values_int,
@@ -359,9 +394,9 @@ class DynamicUpdateSliceTest : public ClientLibraryTestBase {
                        ->Convert(primitive_util::NativeToPrimitiveType<DataT>())
                        .ValueOrDie());
 
-    ComputationBuilder builder(client_, TestName());
+    XlaBuilder builder(TestName());
     // Initialize and transfer dynamic slice start indices parameter.
-    ComputationDataHandle starts;
+    XlaOp starts;
     std::unique_ptr<GlobalData> start_data = CreateR1Parameter<IndexT>(
         slice_starts, 0, "slice_starts", &builder, &starts);
     // Build dynamic slice computation.
@@ -390,9 +425,9 @@ class DynamicUpdateSliceTest : public ClientLibraryTestBase {
                        ->Convert(primitive_util::NativeToPrimitiveType<DataT>())
                        .ValueOrDie());
 
-    ComputationBuilder builder(client_, TestName());
+    XlaBuilder builder(TestName());
     // Initialize and transfer dynamic slice start indices parameter.
-    ComputationDataHandle starts;
+    XlaOp starts;
     std::unique_ptr<GlobalData> start_data = CreateR1Parameter<IndexT>(
         slice_starts, 0, "slice_starts", &builder, &starts);
     // Build dynamic slice computation.
@@ -421,9 +456,9 @@ class DynamicUpdateSliceTest : public ClientLibraryTestBase {
                        ->Convert(primitive_util::NativeToPrimitiveType<DataT>())
                        .ValueOrDie());
 
-    ComputationBuilder builder(client_, TestName());
+    XlaBuilder builder(TestName());
     // Initialize and transfer dynamic slice start indices parameter.
-    ComputationDataHandle starts;
+    XlaOp starts;
     std::unique_ptr<GlobalData> start_data = CreateR1Parameter<IndexT>(
         slice_starts, 0, "slice_starts", &builder, &starts);
     // Build dynamic slice computation.
@@ -474,13 +509,13 @@ class DynamicUpdateSliceTest : public ClientLibraryTestBase {
     }
 
     // Build dynamic slice computation.
-    ComputationBuilder builder(client_, TestName());
+    XlaBuilder builder(TestName());
     // Initialize and transfer input parameter.
-    ComputationDataHandle input;
+    XlaOp input;
     std::unique_ptr<GlobalData> input_data =
         CreateR3Parameter<T>(input_values, 0, "input_values", &builder, &input);
     // Initialize and transfer update parameter.
-    ComputationDataHandle update;
+    XlaOp update;
     std::unique_ptr<GlobalData> update_data = CreateR3Parameter<T>(
         update_values, 1, "update_values", &builder, &update);
     auto starts = builder.ConstantR1<int32>({index, 0, 0});
@@ -500,6 +535,11 @@ class DynamicUpdateSliceTest : public ClientLibraryTestBase {
   }
 };
 
+XLA_TEST_F(DynamicUpdateSliceTest, Int32R0BF16) { TestR0<int32, bfloat16>(); }
+XLA_TEST_F(DynamicUpdateSliceTest, Int32R0) { TestR0<int32, float>(); }
+XLA_TEST_F(DynamicUpdateSliceTest, Int64R0) { TestR0<int64, float>(); }
+XLA_TEST_F(DynamicUpdateSliceTest, UInt64R0) { TestR0<uint64, float>(); }
+
 // TODO(b/71820067): The CPU parallel backend failed for this on 2018-01-10.
 XLA_TEST_F(DynamicUpdateSliceTest, DISABLED_ON_CPU_PARALLEL(Int32R1BF16)) {
   TestR1<int32, bfloat16>();
@@ -672,7 +712,7 @@ void BM_DynamicSlice(int num_iters) {
       TransferManager::GetForPlatform(platform).ValueOrDie();
   int device_ordinal = client->default_device_ordinal();
 
-  ComputationBuilder builder(client, "DynamicSlice");
+  XlaBuilder builder("DynamicSlice");
 
   // Create input as a constant: shape [1, 2, 3, 4]
   auto input_literal = Literal::CreateR4(
diff --git a/tensorflow/compiler/xla/tests/gather_operation_test.cc b/tensorflow/compiler/xla/tests/gather_operation_test.cc
index 9db68ff7a6dcbd9204fb2b3a37734a9aaed35dfd..90496d55e60b4f45fc2d46b2746f94d775cf9f94 100644
--- a/tensorflow/compiler/xla/tests/gather_operation_test.cc
+++ b/tensorflow/compiler/xla/tests/gather_operation_test.cc
@@ -405,7 +405,7 @@ class GatherClientLibraryTest : public ClientLibraryTestBase {};
 // GPU and CPU_PARALLEL.
 XLA_TEST_F(GatherClientLibraryTest,
            DISABLED_ON_CPU_PARALLEL(DISABLED_ON_GPU(Basic))) {
-  // We create this HLO, but using the ComputationBuilder API.
+  // We create this HLO, but using the XlaBuilder API.
   //
   // ENTRY main {
   //   operand = s32[3,3] parameter(0)
@@ -418,7 +418,7 @@ XLA_TEST_F(GatherClientLibraryTest,
   //       window_bounds={1, 3}
   // }
 
-  ComputationBuilder builder(client_, "gather_basic");
+  XlaBuilder builder("gather_basic");
 
   Shape operand_shape = ShapeUtil::MakeShape(S32, {3, 3});
   Shape indices_shape = ShapeUtil::MakeShape(S32, {2});
@@ -443,8 +443,8 @@ XLA_TEST_F(GatherClientLibraryTest,
                           client_->GetDeviceHandles(1));
   xla::ExecutionOptions execution_options = CreateDefaultExecutionOptions();
   *execution_options.add_device_handles() = devices[0];
-  TF_ASSERT_OK_AND_ASSIGN(Computation computation, builder.Build());
-  std::vector<xla::Client::ComputationInstance> computation_instances = {
+  TF_ASSERT_OK_AND_ASSIGN(XlaComputation computation, builder.Build());
+  std::vector<xla::Client::XlaComputationInstance> computation_instances = {
       {computation,
        {operand_arg.get(), indices_arg.get()},
        execution_options,
diff --git a/tensorflow/compiler/xla/tests/hlo_test_base.cc b/tensorflow/compiler/xla/tests/hlo_test_base.cc
index e574644dea7c1ba144ba87fbeb7f28cc52312e26..21f71fc91bb84540e5347811cb4643a8aeda445c 100644
--- a/tensorflow/compiler/xla/tests/hlo_test_base.cc
+++ b/tensorflow/compiler/xla/tests/hlo_test_base.cc
@@ -91,7 +91,7 @@ HloTestBase::HloTestBase()
 HloTestBase::HloTestBase(se::Platform* test_platform,
                          se::Platform* reference_platform)
     : test_runner_(test_platform), reference_runner_(reference_platform) {
-  hlo_verifier_ = MakeUnique<HloVerifier>();
+  hlo_verifier_ = MakeUnique<HloVerifier>(/*allow_mixed_precision=*/true);
 }
 
 /* static */
@@ -142,8 +142,7 @@ StatusOr<std::unique_ptr<HloModule>> HloTestBase::MakeReferenceModule(
           "reference preprocessor must not modify the program shape");
     }
   }
-  TF_RETURN_IF_ERROR(VerifyHloModule(*reference_runner_.backend().platform(),
-                                     reference_module.get()));
+  TF_RETURN_IF_ERROR(hlo_verifier_->Run(reference_module.get()).status());
   return std::move(reference_module);
 }
 
@@ -151,8 +150,7 @@ StatusOr<::testing::AssertionResult> HloTestBase::RunAndCompareInternal(
     std::unique_ptr<HloModule> module, const ArraySlice<Literal*> arguments,
     const optional<ErrorSpec>& error, bool run_hlo_passes,
     const std::function<void(HloModule*)>& reference_preprocessor) {
-  TF_RETURN_IF_ERROR(
-      VerifyHloModule(*test_runner_.backend().platform(), module.get()));
+  TF_RETURN_IF_ERROR(hlo_verifier_->Run(module.get()).status());
   TF_ASSIGN_OR_RETURN(auto reference_module,
                       MakeReferenceModule(*module, reference_preprocessor));
 
diff --git a/tensorflow/compiler/xla/tests/map_test.cc b/tensorflow/compiler/xla/tests/map_test.cc
index 0cd812fd1b4bc69c34b70d3ca0fd0aa6cf57fa4c..efe6cc67872713a8aeecc11aeafe4902676817a6 100644
--- a/tensorflow/compiler/xla/tests/map_test.cc
+++ b/tensorflow/compiler/xla/tests/map_test.cc
@@ -21,6 +21,8 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/global_data.h"
 #include "tensorflow/compiler/xla/client/lib/arithmetic.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/statusor.h"
@@ -50,18 +52,18 @@ class MapTest : public ClientLibraryTestBase {
   // x {R0F32} ----> (add)
   //                /
   // 1.0f ---------/
-  Computation CreateAdderToOne() {
-    ComputationBuilder mapped_builder(client_, TestName());
+  XlaComputation CreateAdderToOne() {
+    XlaBuilder mapped_builder(TestName());
     auto x = mapped_builder.Parameter(0, ShapeUtil::MakeShape(F32, {}), "x");
     auto one = mapped_builder.ConstantR0<float>(1.0);
-    auto adder_to_one = mapped_builder.Add(x, one);
+    mapped_builder.Add(x, one);
     auto computation_status = mapped_builder.Build();
     TF_CHECK_OK(computation_status.status());
     return computation_status.ConsumeValueOrDie();
   }
 
-  Computation CreateMax() {
-    ComputationBuilder b(client_, TestName());
+  XlaComputation CreateMax() {
+    XlaBuilder b(TestName());
     auto lhs = b.Parameter(0, ShapeUtil::MakeShape(F32, {}), "x");
     auto rhs = b.Parameter(1, ShapeUtil::MakeShape(F32, {}), "y");
     b.Max(lhs, rhs);
@@ -73,8 +75,8 @@ class MapTest : public ClientLibraryTestBase {
   // Creates a computation that accepts an F32 and returns T(1) (ignoring the
   // argument).
   template <class T>
-  Computation CreateScalarOne() {
-    ComputationBuilder mapped_builder(client_, "scalar_one");
+  XlaComputation CreateScalarOne() {
+    XlaBuilder mapped_builder("scalar_one");
     (void)mapped_builder.Parameter(0, ShapeUtil::MakeShape(F32, {}), "x");
     mapped_builder.ConstantR0<T>(1);
     auto computation_status = mapped_builder.Build();
@@ -87,11 +89,11 @@ class MapTest : public ClientLibraryTestBase {
   // x {R0F32} ----> (mul)
   //                /
   // 2.0f ---------/
-  Computation CreateMulByTwo() {
-    ComputationBuilder mapped_builder(client_, TestName());
+  XlaComputation CreateMulByTwo() {
+    XlaBuilder mapped_builder(TestName());
     auto x = mapped_builder.Parameter(0, ShapeUtil::MakeShape(F32, {}), "x");
     auto two = mapped_builder.ConstantR0<float>(2.0);
-    auto mul_by_two = mapped_builder.Mul(x, two);
+    mapped_builder.Mul(x, two);
     auto computation_status = mapped_builder.Build();
     TF_CHECK_OK(computation_status.status());
     return computation_status.ConsumeValueOrDie();
@@ -105,12 +107,12 @@ class MapTest : public ClientLibraryTestBase {
   // x {R0F32} ----> (add) ----> (mul)
   //                /
   // 1.0f ---------/
-  Computation CreateAdderToOneTimesItself() {
-    ComputationBuilder mapped_builder(client_, TestName());
+  XlaComputation CreateAdderToOneTimesItself() {
+    XlaBuilder mapped_builder(TestName());
     auto x = mapped_builder.Parameter(0, ShapeUtil::MakeShape(F32, {}), "x");
     auto one = mapped_builder.ConstantR0<float>(1.0);
     auto adder_to_one = mapped_builder.Add(x, one);
-    auto result = mapped_builder.Mul(x, adder_to_one);
+    mapped_builder.Mul(x, adder_to_one);
     auto computation_status = mapped_builder.Build();
     TF_CHECK_OK(computation_status.status());
     return computation_status.ConsumeValueOrDie();
@@ -122,12 +124,13 @@ class MapTest : public ClientLibraryTestBase {
   // x {R0F32} -----------> (map) ----> (add)
   //                         /           /
   // embedded_computation --/       n --/
-  Computation CreateMapPlusN(const Computation& embedded_computation, float n) {
-    ComputationBuilder builder(client_, TestName());
+  XlaComputation CreateMapPlusN(const XlaComputation& embedded_computation,
+                                float n) {
+    XlaBuilder builder(TestName());
     auto x = builder.Parameter(0, ShapeUtil::MakeShape(F32, {}), "x");
     auto map = builder.Map({x}, embedded_computation, {});
     auto constant_n = builder.ConstantR0<float>(n);
-    auto add = builder.Add(map, constant_n);
+    builder.Add(map, constant_n);
     auto computation_status = builder.Build();
     TF_CHECK_OK(computation_status.status());
     return computation_status.ConsumeValueOrDie();
@@ -135,11 +138,11 @@ class MapTest : public ClientLibraryTestBase {
 
   // Creates a binary function with signature (F32, F32) -> Pred
   // defined by (x, y) -> x > y.
-  Computation CreateGt() {
-    ComputationBuilder b(client_, "Gt");
+  XlaComputation CreateGt() {
+    XlaBuilder b("Gt");
     auto x = b.Parameter(0, ShapeUtil::MakeShape(F32, {}), "x");
     auto y = b.Parameter(1, ShapeUtil::MakeShape(F32, {}), "y");
-    auto gt = b.Gt(x, y);
+    b.Gt(x, y);
     auto computation_status = b.Build();
     TF_CHECK_OK(computation_status.status());
     return computation_status.ConsumeValueOrDie();
@@ -152,13 +155,13 @@ class MapTest : public ClientLibraryTestBase {
   // y {R0F32} ----> (add) ---> (add)
   //                           /
   // z {R0F32} ---------------/
-  Computation CreateTernaryAdder() {
-    ComputationBuilder mapped_builder(client_, "TernaryAdder");
+  XlaComputation CreateTernaryAdder() {
+    XlaBuilder mapped_builder("TernaryAdder");
     auto x = mapped_builder.Parameter(0, ShapeUtil::MakeShape(F32, {}), "x");
     auto y = mapped_builder.Parameter(1, ShapeUtil::MakeShape(F32, {}), "y");
     auto z = mapped_builder.Parameter(2, ShapeUtil::MakeShape(F32, {}), "z");
     auto xy = mapped_builder.Add(x, y);
-    auto xyz = mapped_builder.Add(xy, z);
+    mapped_builder.Add(xy, z);
     auto computation_status = mapped_builder.Build();
     TF_CHECK_OK(computation_status.status());
     return computation_status.ConsumeValueOrDie();
@@ -167,13 +170,13 @@ class MapTest : public ClientLibraryTestBase {
 
 TEST_F(MapTest, MapEachElemPlusOneR0) {
   // Applies lambda (x) (+ x 1)) to an input scalar.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   std::unique_ptr<Literal> param0_literal = Literal::CreateR0<float>(42.0);
   std::unique_ptr<GlobalData> param0_data =
       client_->TransferToServer(*param0_literal).ConsumeValueOrDie();
 
   auto param = builder.Parameter(0, param0_literal->shape(), "param0");
-  auto map = builder.Map({param}, CreateAdderToOne(), {});
+  builder.Map({param}, CreateAdderToOne(), {});
 
   ComputeAndCompareR0<float>(&builder, 43.0, {param0_data.get()},
                              ErrorSpec(0.01f));
@@ -181,13 +184,13 @@ TEST_F(MapTest, MapEachElemPlusOneR0) {
 
 XLA_TEST_F(MapTest, MapEachElemPlusOneR1S0) {
   // Maps (lambda (x) (+ x 1)) onto an input R1F32 vector of length 0.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   std::unique_ptr<Literal> param0_literal = Literal::CreateR1<float>({});
   std::unique_ptr<GlobalData> param0_data =
       client_->TransferToServer(*param0_literal).ConsumeValueOrDie();
 
   auto param = builder.Parameter(0, param0_literal->shape(), "param0");
-  auto map = builder.Map({param}, CreateAdderToOne(), {0});
+  builder.Map({param}, CreateAdderToOne(), {0});
 
   ComputeAndCompareR1<float>(&builder, {}, {param0_data.get()},
                              ErrorSpec(0.01f));
@@ -195,55 +198,55 @@ XLA_TEST_F(MapTest, MapEachElemPlusOneR1S0) {
 
 TEST_F(MapTest, MapEachElemPlusOneR1S4) {
   // Maps (lambda (x) (+ x 1)) onto an input R1F32 vector of length 4.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   std::unique_ptr<Literal> param0_literal =
       Literal::CreateR1<float>({2.2f, 3.3f, 4.4f, 5.5f});
   std::unique_ptr<GlobalData> param0_data =
       client_->TransferToServer(*param0_literal).ConsumeValueOrDie();
 
   auto param = builder.Parameter(0, param0_literal->shape(), "param0");
-  auto map = builder.Map({param}, CreateAdderToOne(), {0});
+  builder.Map({param}, CreateAdderToOne(), {0});
 
   ComputeAndCompareR1<float>(&builder, {3.2f, 4.3f, 5.4f, 6.5f},
                              {param0_data.get()}, ErrorSpec(0.01f));
 }
 
 TEST_F(MapTest, MapEachF32ElementToS32Constant) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   std::unique_ptr<Literal> param0_literal =
       Literal::CreateR1<float>({2.2f, 3.3f, 4.4f, 5.5f});
   std::unique_ptr<GlobalData> param0_data =
       client_->TransferToServer(*param0_literal).ConsumeValueOrDie();
 
   auto param = builder.Parameter(0, param0_literal->shape(), "param0");
-  auto map = builder.Map({param}, CreateScalarOne<int32>(), {0});
+  builder.Map({param}, CreateScalarOne<int32>(), {0});
 
   ComputeAndCompareR1<int32>(&builder, {1, 1, 1, 1}, {param0_data.get()});
 }
 
 TEST_F(MapTest, MapEachF32ElementToU32Constant) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   std::unique_ptr<Literal> param0_literal =
       Literal::CreateR1<float>({2.2f, 3.3f, 4.4f, 5.5f});
   std::unique_ptr<GlobalData> param0_data =
       client_->TransferToServer(*param0_literal).ConsumeValueOrDie();
 
   auto param = builder.Parameter(0, param0_literal->shape(), "param0");
-  auto map = builder.Map({param}, CreateScalarOne<uint32>(), {0});
+  builder.Map({param}, CreateScalarOne<uint32>(), {0});
 
   ComputeAndCompareR1<uint32>(&builder, {1, 1, 1, 1}, {param0_data.get()});
 }
 
 TEST_F(MapTest, MapEachElemLongerChainR1) {
   // Maps (lambda (x) (* (+ x 1) x)) onto an input R1F32 vector.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   std::unique_ptr<Literal> param0_literal =
       Literal::CreateR1<float>({2.6f, -5.1f, 0.1f, 0.2f, 999.0f, 255.5f});
   std::unique_ptr<GlobalData> param0_data =
       client_->TransferToServer(*param0_literal).ConsumeValueOrDie();
 
   auto param = builder.Parameter(0, param0_literal->shape(), "param0");
-  auto map = builder.Map({param}, CreateAdderToOneTimesItself(), {0});
+  builder.Map({param}, CreateAdderToOneTimesItself(), {0});
 
   ComputeAndCompareR1<float>(
       &builder, {9.36f, 20.91f, 0.11f, 0.24f, 999000.0f, 65535.75f},
@@ -253,14 +256,14 @@ TEST_F(MapTest, MapEachElemLongerChainR1) {
 XLA_TEST_F(MapTest, MapMultipleMapsR1S0) {
   // Maps (lambda (x) (+ x 1)) onto an input R1F32 vector of length 0, and then
   // maps (lambda (x) (* x 2)) on the result.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   std::unique_ptr<Literal> param0_literal = Literal::CreateR1<float>({});
   std::unique_ptr<GlobalData> param0_data =
       client_->TransferToServer(*param0_literal).ConsumeValueOrDie();
 
   auto param = builder.Parameter(0, param0_literal->shape(), "param0");
   auto map1 = builder.Map({param}, CreateAdderToOne(), {0});
-  auto map2 = builder.Map({map1}, CreateMulByTwo(), {0});
+  builder.Map({map1}, CreateMulByTwo(), {0});
 
   ComputeAndCompareR1<float>(&builder, {}, {param0_data.get()},
                              ErrorSpec(0.01f));
@@ -269,7 +272,7 @@ XLA_TEST_F(MapTest, MapMultipleMapsR1S0) {
 TEST_F(MapTest, MapMultipleMapsR1S4) {
   // Maps (lambda (x) (+ x 1)) onto an input R1F32 vector of length 4, and then
   // maps (lambda (x) (* x 2)) on the result.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   std::unique_ptr<Literal> param0_literal =
       Literal::CreateR1<float>({2.2f, 3.3f, 4.4f, 5.5f});
   std::unique_ptr<GlobalData> param0_data =
@@ -277,7 +280,7 @@ TEST_F(MapTest, MapMultipleMapsR1S4) {
 
   auto param = builder.Parameter(0, param0_literal->shape(), "param0");
   auto map1 = builder.Map({param}, CreateAdderToOne(), {0});
-  auto map2 = builder.Map({map1}, CreateMulByTwo(), {0});
+  builder.Map({map1}, CreateMulByTwo(), {0});
 
   ComputeAndCompareR1<float>(&builder, {6.4f, 8.6f, 10.8f, 13.0f},
                              {param0_data.get()}, ErrorSpec(0.01f));
@@ -285,14 +288,14 @@ TEST_F(MapTest, MapMultipleMapsR1S4) {
 
 TEST_F(MapTest, MapEachElemPlusOneR2) {
   // Maps (lambda (x) (+ x 1)) onto an input R2F32 vector.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   std::unique_ptr<Literal> param0_literal = Literal::CreateR2<float>(
       {{13.25f, 14.0f}, {-7.1f, -7.2f}, {-8.8f, 8.8f}});
   std::unique_ptr<GlobalData> param0_data =
       client_->TransferToServer(*param0_literal).ConsumeValueOrDie();
 
   auto param = builder.Parameter(0, param0_literal->shape(), "param0");
-  auto map = builder.Map({param}, CreateAdderToOne(), {0, 1});
+  builder.Map({param}, CreateAdderToOne(), {0, 1});
 
   Array2D<float> expected_array(
       {{14.25f, 15.0f}, {-6.1f, -6.2f}, {-7.8f, 9.8f}});
@@ -317,18 +320,18 @@ XLA_TEST_F(MapTest, ComplexNestedMaps) {
   auto embed2 = CreateMapPlusN(embed1, 2.0);
   auto embed3 = CreateMapPlusN(embed1, 4.0);
 
-  ComputationBuilder embed4_builder(client_, "embed4");
+  XlaBuilder embed4_builder("embed4");
   auto embed4_param = embed4_builder.Parameter(0, scalar_shape, "x");
   auto embed4_map_lhs = embed4_builder.Map({embed4_param}, embed2, {});
   auto embed4_map_rhs = embed4_builder.Map({embed4_param}, embed3, {});
-  auto embed4_add = embed4_builder.Add(embed4_map_lhs, embed4_map_rhs);
+  embed4_builder.Add(embed4_map_lhs, embed4_map_rhs);
   auto embed4_status = embed4_builder.Build();
   ASSERT_IS_OK(embed4_status.status());
   auto embed4 = embed4_status.ConsumeValueOrDie();
 
   auto embed5 = CreateMapPlusN(embed2, 6.0);
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto constant_42 = builder.ConstantR0<float>(42.0);
   auto constant_7 = builder.ConstantR0<float>(7.0);
   auto map_42 = builder.Map({constant_42}, embed5, {});
@@ -359,7 +362,8 @@ TEST_F(MapTest, VersionedEmbeddedComputation) {
 
   // Add another Add(1) operation to the existing embedded computation. This
   // requires using the stub interface because the ComputationBuilder does not
-  // allow modification to the Computation objects after they have been built.
+  // allow modification to the XlaComputation objects after they have been
+  // built.
   BinaryOpRequest request;
   request.set_binop(BINOP_ADD);
   *request.mutable_lhs() = adder_to_one;
@@ -381,7 +385,7 @@ TEST_F(MapTest, VersionedEmbeddedComputation) {
 
 TEST_F(MapTest, MapBinaryAdder) {
   // Maps (lambda (x y) (+ x y)) onto two R1F32 vectors.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   std::unique_ptr<Literal> param0_literal =
       Literal::CreateR1<float>({2.2f, 3.3f, 4.4f, 5.5f});
   std::unique_ptr<GlobalData> param0_data =
@@ -393,8 +397,7 @@ TEST_F(MapTest, MapBinaryAdder) {
 
   auto param0 = builder.Parameter(0, param0_literal->shape(), "param0");
   auto param1 = builder.Parameter(1, param1_literal->shape(), "param1");
-  auto map = builder.Map({param0, param1},
-                         CreateScalarAddComputation(F32, &builder), {0});
+  builder.Map({param0, param1}, CreateScalarAddComputation(F32, &builder), {0});
 
   ComputeAndCompareR1<float>(&builder, {7.3f, 7.7, 4.3f, 0},
                              {param0_data.get(), param1_data.get()},
@@ -404,7 +407,7 @@ TEST_F(MapTest, MapBinaryAdder) {
 // Adds two rank-2 arrays with different layouts. This test exercises a path
 // for Map that used to fail in shape inference (b/28989438).
 XLA_TEST_F(MapTest, AddWithMixedLayouts) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   std::unique_ptr<Literal> param0_literal = Literal::CreateR2WithLayout(
       {{1, 2}, {3, 4}}, LayoutUtil::MakeLayout({1, 0}));
   std::unique_ptr<GlobalData> param0_data =
@@ -417,8 +420,8 @@ XLA_TEST_F(MapTest, AddWithMixedLayouts) {
 
   auto param0 = builder.Parameter(0, param0_literal->shape(), "param0");
   auto param1 = builder.Parameter(1, param1_literal->shape(), "param1");
-  auto map = builder.Map({param0, param1},
-                         CreateScalarAddComputation(S32, &builder), {0, 1});
+  builder.Map({param0, param1}, CreateScalarAddComputation(S32, &builder),
+              {0, 1});
 
   Array2D<int32> expected(2, 2);
   expected(0, 0) = 11;
@@ -430,7 +433,7 @@ XLA_TEST_F(MapTest, AddWithMixedLayouts) {
 }
 
 XLA_TEST_F(MapTest, AddR3_3x0x2) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   std::unique_ptr<Literal> param0_literal =
       Literal::CreateR3FromArray3D<int32>(Array3D<int32>(3, 0, 2));
   std::unique_ptr<GlobalData> param0_data =
@@ -443,8 +446,8 @@ XLA_TEST_F(MapTest, AddR3_3x0x2) {
 
   auto param0 = builder.Parameter(0, param0_literal->shape(), "param0");
   auto param1 = builder.Parameter(1, param1_literal->shape(), "param1");
-  auto map = builder.Map({param0, param1},
-                         CreateScalarAddComputation(S32, &builder), {0, 1, 2});
+  builder.Map({param0, param1}, CreateScalarAddComputation(S32, &builder),
+              {0, 1, 2});
 
   ComputeAndCompareR3<int32>(&builder, Array3D<int32>(3, 0, 2),
                              {param0_data.get(), param1_data.get()});
@@ -452,7 +455,7 @@ XLA_TEST_F(MapTest, AddR3_3x0x2) {
 
 TEST_F(MapTest, MapTernaryAdder) {
   // Maps (lambda (x y z) (+ x y z)) onto three R1F32 vectors.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   std::unique_ptr<Literal> param0_literal =
       Literal::CreateR1<float>({2.2f, 3.3f, 4.4f, 5.5f});
   std::unique_ptr<GlobalData> param0_data =
@@ -469,7 +472,7 @@ TEST_F(MapTest, MapTernaryAdder) {
   auto param0 = builder.Parameter(0, param0_literal->shape(), "param0");
   auto param1 = builder.Parameter(1, param1_literal->shape(), "param1");
   auto param2 = builder.Parameter(2, param2_literal->shape(), "param2");
-  auto map = builder.Map({param0, param1, param2}, CreateTernaryAdder(), {0});
+  builder.Map({param0, param1, param2}, CreateTernaryAdder(), {0});
 
   ComputeAndCompareR1<float>(
       &builder, {-2.7f, -92.3f, -895.7f, -400.0f},
@@ -479,24 +482,24 @@ TEST_F(MapTest, MapTernaryAdder) {
 
 TEST_F(MapTest, MapGt) {
   // Maps (x,y) -> x > y onto two R1F32 vectors.
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   auto gt = CreateGt();
   b.Map({b.ConstantR1<float>({1, 20}), b.ConstantR1<float>({10, 2})}, gt, {0});
   ComputeAndCompareR1<bool>(&b, {false, true}, {});
 }
 
 TEST_F(MapTest, NestedBinaryMap) {
-  Computation max_with_square;
+  XlaComputation max_with_square;
   {
     // max_with_square(x) = do max(x, x^2) via a map.
-    ComputationBuilder b(client_, "max_with_square");
+    XlaBuilder b("max_with_square");
     auto x = b.Parameter(0, ShapeUtil::MakeShape(F32, {}), "x");
     b.Map({x, b.Mul(x, x)}, CreateMax(), {});
     auto computation_status = b.Build();
     ASSERT_IS_OK(computation_status.status());
     max_with_square = computation_status.ConsumeValueOrDie();
   }
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   auto input = b.ConstantR1<float>({0.1f, 0.5f, -0.5f, 1.0f, 2.0f});
   b.Map({input}, max_with_square, {0});
   ComputeAndCompareR1<float>(&b, {0.1f, 0.5f, 0.25f, 1.0f, 4.0f}, {});
@@ -505,13 +508,13 @@ TEST_F(MapTest, NestedBinaryMap) {
 TEST_F(MapTest, MapOperantionWithBuildError) {
   // Maps (lambda (x y) (+ x y)) onto two R1F32 vectors but uses an unsupported
   // type combination (F32 + U16) to test that the error is reported to the
-  // outermost ComputationBuilder.
-  ComputationBuilder builder(client_, TestName());
+  // outermost XlaBuilder.
+  XlaBuilder builder(TestName());
 
   auto sub_builder = builder.CreateSubBuilder("ErrorAdd");
   auto x = sub_builder->Parameter(0, ShapeUtil::MakeShape(F32, {}), "x");
   auto y = sub_builder->Parameter(1, ShapeUtil::MakeShape(U16, {}), "y");
-  auto adder = sub_builder->Add(x, y);
+  sub_builder->Add(x, y);
   auto error_add = sub_builder->BuildAndNoteError();
 
   std::unique_ptr<Literal> param0_literal =
@@ -525,9 +528,9 @@ TEST_F(MapTest, MapOperantionWithBuildError) {
 
   auto param0 = builder.Parameter(0, param0_literal->shape(), "param0");
   auto param1 = builder.Parameter(1, param1_literal->shape(), "param1");
-  auto map = builder.Map({param0, param1}, error_add, {0});
+  builder.Map({param0, param1}, error_add, {0});
 
-  StatusOr<Computation> computation_status = builder.Build();
+  StatusOr<XlaComputation> computation_status = builder.Build();
   ASSERT_TRUE(!computation_status.ok());
   EXPECT_THAT(
       computation_status.status().ToString(),
@@ -545,7 +548,7 @@ using MapTestWithFullOpt = ClientLibraryTestBase;
 // to have issues with such patterns and maybe invalidate the pointer to entry
 // computation.
 TEST_F(MapTestWithFullOpt, MapScalarPower) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   auto sub_builder = builder.CreateSubBuilder("power");
   auto x = sub_builder->Parameter(0, ShapeUtil::MakeShape(F32, {}), "x");
@@ -572,7 +575,7 @@ TEST_F(MapTestWithFullOpt, MapScalarPower) {
 // Regression test for b/35786417, where the inliner would not notice the change
 // of parameter order inside the map.
 TEST_F(MapTestWithFullOpt, MapSubtractOppositeOrder) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   auto sub_builder = builder.CreateSubBuilder("power");
   auto x = sub_builder->Parameter(0, ShapeUtil::MakeShape(F32, {}), "x");
@@ -598,7 +601,7 @@ TEST_F(MapTestWithFullOpt, MapSubtractOppositeOrder) {
 // Regression test for b/35786417, where the inliner would CHECK-fail due to the
 // mul inside the map having more parameters than the map does.
 TEST_F(MapTestWithFullOpt, MapSquare) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   auto sub_builder = builder.CreateSubBuilder("power");
   auto x = sub_builder->Parameter(0, ShapeUtil::MakeShape(F32, {}), "x");
diff --git a/tensorflow/compiler/xla/tests/pad_test.cc b/tensorflow/compiler/xla/tests/pad_test.cc
index 8cef8dd34dc7b16b1e58ded67d6b6a4ba79f20db..ce295b832d79e4f00656f2893c2ba1162693dd73 100644
--- a/tensorflow/compiler/xla/tests/pad_test.cc
+++ b/tensorflow/compiler/xla/tests/pad_test.cc
@@ -18,9 +18,9 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/array4d.h"
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/lib/arithmetic.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
 #include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/reference_util.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
@@ -85,7 +85,7 @@ class PadTestFloat : public PadTest,
 
 // Tests a Pad() with a zero-element input and output.
 XLA_TEST_P(PadTestFloat, Pad1DS0ToS0Array) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   // Set up the padding configuration {low: 0, high: 0, interior: 0}.
   PaddingConfig padding_config;
   auto dimension = padding_config.add_dimensions();
@@ -100,7 +100,7 @@ XLA_TEST_P(PadTestFloat, Pad1DS0ToS0Array) {
 
 // Tests a Pad() with a zero-element input but a non-zero-element output.
 XLA_TEST_P(PadTestFloat, Pad1DS0ToS5Array) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   // Set up the padding configuration {low: 3, high: 0, interior: 1}.
   PaddingConfig padding_config;
   auto dimension = padding_config.add_dimensions();
@@ -115,7 +115,7 @@ XLA_TEST_P(PadTestFloat, Pad1DS0ToS5Array) {
 }
 
 XLA_TEST_P(PadTestFloat, Pad1DS3Array) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   // Set up the padding configuration {low: 3, high: 0, interior: 1}.
   PaddingConfig padding_config;
   auto dimension = padding_config.add_dimensions();
@@ -130,7 +130,7 @@ XLA_TEST_P(PadTestFloat, Pad1DS3Array) {
 }
 
 XLA_TEST_P(PadTestFloat, Pad4D_2x0x3x2_FloatArray) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   b.Pad(AddParam(Array4D<float>(2, 0, 3, 2), &b),
         AddParam(*Literal::CreateR0<float>(1.5), &b), r4_padding_on_dim0_dim1_);
   ComputeAndCompareR4<float>(&b, Array4D<float>(5, 2, 3, 2, 1.5f), {},
@@ -138,7 +138,7 @@ XLA_TEST_P(PadTestFloat, Pad4D_2x0x3x2_FloatArray) {
 }
 
 TEST_P(PadTestFloat, Pad4DFloat_1x1x3x2_Array) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   auto input = MakeUnique<Array4D<float>>(1, 1, 3, 2);
   Array2D<float> input_xy({
       {1.0f, 2.0f},  // row 0
@@ -162,7 +162,7 @@ TEST_P(PadTestFloat, Pad4DFloat_1x1x3x2_Array) {
 }
 
 TEST_P(PadTestFloat, Pad4DFloatArrayWithInteriorPadding) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
 
   const float pad_value = 1.5f;
   Array4D<float> input(3, 2, 1, 1, {1, 2, 3, 4, 5, 6});
@@ -181,7 +181,7 @@ TEST_P(PadTestFloat, Pad4DFloatArrayWithInteriorPadding) {
 }
 
 TEST_P(PadTestFloat, Pad4DFloatArrayMinorFirstSmall) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
 
   PaddingConfig padding_config;
   auto dimension0 = padding_config.add_dimensions();
@@ -223,7 +223,7 @@ TEST_P(PadTestFloat, Pad4DFloatArrayMinorFirstSmall) {
 }
 
 XLA_TEST_P(PadTestFloat, Pad4DFloatArrayMinorFirstNonTrivialMinorDimensions) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
 
   PaddingConfig padding_config;
   auto dimension0 = padding_config.add_dimensions();
@@ -266,7 +266,7 @@ XLA_TEST_P(PadTestFloat, Pad4DFloatArrayMinorFirstNonTrivialMinorDimensions) {
 }
 
 XLA_TEST_F(PadTest, Pad4DU8Array) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   auto input = MakeUnique<Array4D<uint8>>(1, 1, 3, 2);
   Array2D<uint8> input_xy({
       {1, 2},  // row 0
@@ -290,7 +290,7 @@ XLA_TEST_F(PadTest, Pad4DU8Array) {
 }
 
 XLA_TEST_F(PadTest, Pad4DPredArray) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
 
   // Since bool is currently not well supported, use Broadcast operation to
   // create the operand for Pad.
@@ -317,7 +317,7 @@ XLA_TEST_F(PadTest, Pad4DPredArray) {
 }
 
 XLA_TEST_P(PadTestFloat, Large2DPad) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
 
   auto ones = MakeUnique<Array2D<float>>(4, 4);
   ones->Fill(1.0f);
@@ -329,15 +329,14 @@ XLA_TEST_P(PadTestFloat, Large2DPad) {
     padding_config.mutable_dimensions(dim)->set_edge_padding_high(58 +
                                                                   100 * dim);
   }
-  auto padded = b.Pad(input, AddParam(*Literal::CreateR0<float>(0.0f), &b),
-                      padding_config);
+  b.Pad(input, AddParam(*Literal::CreateR0<float>(0.0f), &b), padding_config);
 
   auto expected = ReferenceUtil::PadArray2D(*ones, padding_config, 0.0f);
   ComputeAndCompareR2<float>(&b, *expected, {}, DefaultErrorSpec());
 }
 
 XLA_TEST_P(PadTestFloat, AllTypes2DPad) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
 
   constexpr int64 in_rows = 35;
   constexpr int64 in_cols = 35;
@@ -352,15 +351,14 @@ XLA_TEST_P(PadTestFloat, AllTypes2DPad) {
   padding_config.mutable_dimensions(1)->set_edge_padding_low(6);
   padding_config.mutable_dimensions(1)->set_edge_padding_high(4);
   padding_config.mutable_dimensions(1)->set_interior_padding(2);
-  auto padded = b.Pad(input, AddParam(*Literal::CreateR0<float>(3.14f), &b),
-                      padding_config);
+  b.Pad(input, AddParam(*Literal::CreateR0<float>(3.14f), &b), padding_config);
 
   auto expected = ReferenceUtil::PadArray2D(*operand, padding_config, 3.14f);
   ComputeAndCompareR2<float>(&b, *expected, {}, DefaultErrorSpec());
 }
 
 XLA_TEST_P(PadTestFloat, High2DPad) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
 
   constexpr int64 in_rows = 129;
   constexpr int64 in_cols = 129;
@@ -378,8 +376,7 @@ XLA_TEST_P(PadTestFloat, High2DPad) {
     padding_config.mutable_dimensions(dim)->set_interior_padding(
         interior_padding);
   }
-  auto padded = b.Pad(input, AddParam(*Literal::CreateR0<float>(2.718f), &b),
-                      padding_config);
+  b.Pad(input, AddParam(*Literal::CreateR0<float>(2.718f), &b), padding_config);
 
   auto expected = ReferenceUtil::PadArray2D(*operand, padding_config, 2.718f);
 
@@ -387,7 +384,7 @@ XLA_TEST_P(PadTestFloat, High2DPad) {
 }
 
 XLA_TEST_P(PadTestFloat, NegativePadding2D) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
 
   constexpr int64 in_rows = 129;
   constexpr int64 in_cols = 129;
@@ -406,8 +403,7 @@ XLA_TEST_P(PadTestFloat, NegativePadding2D) {
     padding_config.mutable_dimensions(dim)->set_interior_padding(
         interior_padding);
   }
-  auto padded = b.Pad(input, AddParam(*Literal::CreateR0<float>(2.718f), &b),
-                      padding_config);
+  b.Pad(input, AddParam(*Literal::CreateR0<float>(2.718f), &b), padding_config);
 
   auto expected = ReferenceUtil::PadArray2D(*operand, padding_config, 2.718f);
 
@@ -415,7 +411,7 @@ XLA_TEST_P(PadTestFloat, NegativePadding2D) {
 }
 
 XLA_TEST_P(PadTestFloat, NegativeAndInteriorPadding2D) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
 
   constexpr int64 in_rows = 8;
   constexpr int64 in_cols = 11;
@@ -434,8 +430,7 @@ XLA_TEST_P(PadTestFloat, NegativeAndInteriorPadding2D) {
     padding_config.mutable_dimensions(dim)->set_interior_padding(
         interior_padding[dim]);
   }
-  auto padded = b.Pad(input, AddParam(*Literal::CreateR0<float>(2.718f), &b),
-                      padding_config);
+  b.Pad(input, AddParam(*Literal::CreateR0<float>(2.718f), &b), padding_config);
 
   auto expected = ReferenceUtil::PadArray2D(*operand, padding_config, 2.718f);
 
@@ -444,20 +439,19 @@ XLA_TEST_P(PadTestFloat, NegativeAndInteriorPadding2D) {
 
 // Regression test for b/31827337.
 XLA_TEST_P(PadTestFloat, ReducePad) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   auto ones = MakeUnique<Array4D<float>>(2, 2, 2, 2);
   ones->Fill(1.0);
   auto input = AddParam(*ones, &b);
 
-  Computation add = CreateScalarAddComputation(FloatType(), &b);
+  XlaComputation add = CreateScalarAddComputation(FloatType(), &b);
   auto reduce =
       b.Reduce(input, AddParam(*Literal::CreateR0<float>(0.0), &b), add, {0});
 
   PaddingConfig padding_config = MakeNoPaddingConfig(3);
   padding_config.mutable_dimensions(0)->set_edge_padding_low(1);
   padding_config.mutable_dimensions(0)->set_edge_padding_high(1);
-  auto padded = b.Pad(reduce, AddParam(*Literal::CreateR0<float>(0.0f), &b),
-                      padding_config);
+  b.Pad(reduce, AddParam(*Literal::CreateR0<float>(0.0f), &b), padding_config);
 
   Array3D<float> expected({{{0.0, 0.0}, {0.0, 0.0}},
                            {{2.0, 2.0}, {2.0, 2.0}},
diff --git a/tensorflow/compiler/xla/tests/reduce_precision_test.cc b/tensorflow/compiler/xla/tests/reduce_precision_test.cc
index dc7ce3253cee255a7949326fa5b49fc8917432b8..b311785449f1774c3bc1e4d7ad35c2866e3b4061 100644
--- a/tensorflow/compiler/xla/tests/reduce_precision_test.cc
+++ b/tensorflow/compiler/xla/tests/reduce_precision_test.cc
@@ -20,9 +20,9 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/compiler/xla/array2d.h"
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/global_data.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/service/reduce_precision_insertion.h"
@@ -228,15 +228,14 @@ XLA_TEST_P(ReducePrecisionAccuracyTest, ReducePrecisionF32) {
   // This is required for proper handling of NaN values.
   SetFastMathDisabled(true);
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::unique_ptr<Literal> a_literal = Literal::CreateR1<float>({input_values});
   std::unique_ptr<GlobalData> a_data =
       client_->TransferToServer(*a_literal).ConsumeValueOrDie();
   auto a = builder.Parameter(0, a_literal->shape(), "a");
 
-  auto reduce_precision =
-      builder.ReducePrecision(a, exponent_bits, mantissa_bits);
+  builder.ReducePrecision(a, exponent_bits, mantissa_bits);
 
   ComputeAndCompareR1<float>(&builder, expected_values, {a_data.get()});
 }
@@ -252,7 +251,7 @@ class ReducePrecisionInsertionTest : public ClientLibraryTestBase {};
 // The interpreter has no fusion pass, so skip this test.
 XLA_TEST_F(ReducePrecisionInsertionTest,
            DISABLED_ON_INTERPRETER(ReducePrecisionBeforeFusion)) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::unique_ptr<Literal> a_literal = Literal::CreateR1<float>({1.00001});
   std::unique_ptr<GlobalData> a_data =
@@ -265,7 +264,7 @@ XLA_TEST_F(ReducePrecisionInsertionTest,
   // Near 1.0, Log(x) approximates x - 1; this lets us confirm that the
   // reduce-precision operation showed up in the correct place in the
   // graph.
-  auto log = builder.Log(abs);
+  builder.Log(abs);
 
   // Insert precision-reduction after the Abs(x) operation, rounding that
   // result to exactly 1.0f.
@@ -281,7 +280,7 @@ XLA_TEST_F(ReducePrecisionInsertionTest,
 // The interpreter has no fusion pass, so skip this test.
 XLA_TEST_F(ReducePrecisionInsertionTest,
            DISABLED_ON_INTERPRETER(ReducePrecisionSkippedAfterFusion)) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::unique_ptr<Literal> a_literal = Literal::CreateR1<float>({1.00001});
   std::unique_ptr<GlobalData> a_data =
@@ -290,7 +289,7 @@ XLA_TEST_F(ReducePrecisionInsertionTest,
 
   // These two operations should be fused by any reasonable backend.
   auto abs = builder.Abs(a);
-  auto neg = builder.Neg(abs);
+  builder.Neg(abs);
 
   // Add a pass after operation fusion, suffixing kAbs operations.  This
   // should not see into the fusion nodes and thus should not affect the
@@ -307,7 +306,7 @@ XLA_TEST_F(ReducePrecisionInsertionTest,
 // The interpreter has no fusion pass, so skip this test.
 XLA_TEST_F(ReducePrecisionInsertionTest,
            DISABLED_ON_INTERPRETER(ReducePrecisionAddedAfterFusion)) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::unique_ptr<Literal> a_literal = Literal::CreateR1<float>({1.00001});
   std::unique_ptr<GlobalData> a_data =
@@ -316,7 +315,7 @@ XLA_TEST_F(ReducePrecisionInsertionTest,
 
   // These two operations should be fused by any reasonable backend.
   auto abs = builder.Abs(a);
-  auto neg = builder.Neg(abs);
+  builder.Neg(abs);
 
   // Add a pass after operation fusion, suffixing kFusion operations.
   auto reduce_precision_pass = execution_options_.mutable_debug_options()
@@ -331,7 +330,7 @@ XLA_TEST_F(ReducePrecisionInsertionTest,
 // The interpreter has no fusion pass, so skip this test.
 XLA_TEST_F(ReducePrecisionInsertionTest,
            DISABLED_ON_INTERPRETER(ReducePrecisionSkippedFusionContains)) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::unique_ptr<Literal> a_literal = Literal::CreateR1<float>({1.00001});
   std::unique_ptr<GlobalData> a_data =
@@ -340,7 +339,7 @@ XLA_TEST_F(ReducePrecisionInsertionTest,
 
   // These two operations should be fused by any reasonable backend.
   auto abs = builder.Abs(a);
-  auto neg = builder.Neg(abs);
+  builder.Neg(abs);
 
   // Add a pass suffixing fusion nodes containing kCos operations.  This
   // should have no effect.
@@ -356,7 +355,7 @@ XLA_TEST_F(ReducePrecisionInsertionTest,
 // The interpreter has no fusion pass, so skip this test.
 XLA_TEST_F(ReducePrecisionInsertionTest,
            DISABLED_ON_INTERPRETER(ReducePrecisionAddedFusionContains)) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::unique_ptr<Literal> a_literal = Literal::CreateR1<float>({1.00001});
   std::unique_ptr<GlobalData> a_data =
@@ -365,7 +364,7 @@ XLA_TEST_F(ReducePrecisionInsertionTest,
 
   // These two operations should be fused by any reasonable backend.
   auto abs = builder.Abs(a);
-  auto neg = builder.Neg(abs);
+  builder.Neg(abs);
 
   // Add a pass suffixing fusion nodes containing kAbs operations.  This
   // should see the kAbs operation within the above fusion node.
diff --git a/tensorflow/compiler/xla/tests/reduce_test.cc b/tensorflow/compiler/xla/tests/reduce_test.cc
index 3a097a01ab095b8a21a39f0d738a43c3d6a4d1d7..768beec15e7ec3b8e7d2b4ed8a5aae62fac9dd7a 100644
--- a/tensorflow/compiler/xla/tests/reduce_test.cc
+++ b/tensorflow/compiler/xla/tests/reduce_test.cc
@@ -39,6 +39,8 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/global_data.h"
 #include "tensorflow/compiler/xla/client/lib/arithmetic.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/reference_util.h"
@@ -57,6 +59,11 @@ limitations under the License.
 namespace xla {
 namespace {
 
+using FuncGeneratorForType = Computation (*)(PrimitiveType,
+                                             ComputationBuilder*);
+
+using FuncGenerator = Computation (*)(ComputationBuilder*);
+
 class ReduceTest : public ClientLibraryTestBase {
  protected:
   ReduceTest() {
@@ -497,21 +504,18 @@ XLA_TEST_F(ReduceTest, TransposeAndReduceElementwiseR2_111x50_To_R1) {
 // Test that algebraic simplifier does not incorrectly fold a transpose into a
 // reduction operation.
 XLA_TEST_F(ReduceTest, TransposeAndReduceR3_12x111x50_To_R2) {
-  ComputationBuilder builder(client_, TestName());
-  Computation add_f32 = CreateScalarAddComputation(F32, &builder);
+  XlaBuilder builder(TestName());
+  XlaComputation add_f32 = CreateScalarAddComputation(F32, &builder);
   const Shape input_shape = ShapeUtil::MakeShape(F32, {12, 111, 50});
-  ComputationDataHandle input = builder.Parameter(0, input_shape, "input");
-  ComputationDataHandle zero = builder.ConstantR0<float>(0.0);
-  ComputationDataHandle transpose =
-      builder.Transpose(input, /*permutation=*/{1, 0, 2});
-  ComputationDataHandle reduce =
-      builder.Reduce(transpose, zero, add_f32, /*dimensions_to_reduce=*/{0});
+  XlaOp input = builder.Parameter(0, input_shape, "input");
+  XlaOp zero = builder.ConstantR0<float>(0.0);
+  XlaOp transpose = builder.Transpose(input, /*permutation=*/{1, 0, 2});
+  builder.Reduce(transpose, zero, add_f32, /*dimensions_to_reduce=*/{0});
 
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Literal> input_data,
                           MakeFakeLiteral(input_shape));
 
-  ComputeAndCompare(&builder, reduce, {std::move(*input_data)},
-                    ErrorSpec(0.01, 1e-4));
+  ComputeAndCompare(&builder, {std::move(*input_data)}, ErrorSpec(0.01, 1e-4));
 }
 
 XLA_TEST_F(ReduceTest, Reshape_111x2x25Reduce_111x50_To_R1) {
@@ -755,53 +759,57 @@ XLA_TEST_F(ReduceTest, ReduceR3AmongDim2) {
 }
 
 XLA_TEST_F(ReduceTest, VectorizedReduce_Add) {
-  RunVectorizedReduceTest(CreateScalarAddComputation,
-                          [](float a, float b) { return a + b; },
-                          [](int32 a, int32 b) {
-                            return static_cast<int32>(static_cast<uint32>(a) +
-                                                      static_cast<uint32>(b));
-                          },
-                          [](uint32 a, uint32 b) { return a + b; }, 0.0, 0, 0);
+  RunVectorizedReduceTest(
+      static_cast<FuncGeneratorForType>(CreateScalarAddComputation),
+      [](float a, float b) { return a + b; },
+      [](int32 a, int32 b) {
+        return static_cast<int32>(static_cast<uint32>(a) +
+                                  static_cast<uint32>(b));
+      },
+      [](uint32 a, uint32 b) { return a + b; }, 0.0, 0, 0);
 }
 
 XLA_TEST_F(ReduceTest, VectorizedReduce_Multiply) {
-  RunVectorizedReduceTest(CreateScalarMultiplyComputation,
-                          [](float a, float b) { return a * b; },
-                          [](int32 a, int32 b) {
-                            return static_cast<int32>(static_cast<uint32>(a) *
-                                                      static_cast<uint32>(b));
-                          },
-                          [](uint32 a, uint32 b) { return a * b; }, 1.0, 1, 1);
+  RunVectorizedReduceTest(
+      static_cast<FuncGeneratorForType>(CreateScalarMultiplyComputation),
+      [](float a, float b) { return a * b; },
+      [](int32 a, int32 b) {
+        return static_cast<int32>(static_cast<uint32>(a) *
+                                  static_cast<uint32>(b));
+      },
+      [](uint32 a, uint32 b) { return a * b; }, 1.0, 1, 1);
 }
 
 XLA_TEST_F(ReduceTest, VectorizedReduce_Max) {
-  RunVectorizedReduceTest(CreateScalarMaxComputation,
-                          [](float a, float b) { return std::max(a, b); },
-                          [](int32 a, int32 b) { return std::max(a, b); },
-                          [](uint32 a, uint32 b) { return std::max(a, b); },
-                          std::numeric_limits<float>::min(),
-                          std::numeric_limits<int32>::min(),
-                          std::numeric_limits<uint32>::min());
+  RunVectorizedReduceTest(
+      static_cast<FuncGeneratorForType>(CreateScalarMaxComputation),
+      [](float a, float b) { return std::max(a, b); },
+      [](int32 a, int32 b) { return std::max(a, b); },
+      [](uint32 a, uint32 b) { return std::max(a, b); },
+      std::numeric_limits<float>::min(), std::numeric_limits<int32>::min(),
+      std::numeric_limits<uint32>::min());
 }
 
 XLA_TEST_F(ReduceTest, VectorizedReduce_Min) {
-  RunVectorizedReduceTest(CreateScalarMinComputation,
-                          [](float a, float b) { return std::min(a, b); },
-                          [](int32 a, int32 b) { return std::min(a, b); },
-                          [](uint32 a, uint32 b) { return std::min(a, b); },
-                          std::numeric_limits<float>::max(),
-                          std::numeric_limits<int32>::max(),
-                          std::numeric_limits<uint32>::max());
+  RunVectorizedReduceTest(
+      static_cast<FuncGeneratorForType>(CreateScalarMinComputation),
+      [](float a, float b) { return std::min(a, b); },
+      [](int32 a, int32 b) { return std::min(a, b); },
+      [](uint32 a, uint32 b) { return std::min(a, b); },
+      std::numeric_limits<float>::max(), std::numeric_limits<int32>::max(),
+      std::numeric_limits<uint32>::max());
 }
 
 XLA_TEST_F(ReduceTest, VectorizedReduce_BooleanAnd) {
   RunVectorizedReduceTestForType<bool>(
-      CreateScalarAndComputation, [](bool a, bool b) { return a && b; }, true);
+      static_cast<FuncGenerator>(CreateScalarAndComputation),
+      [](bool a, bool b) { return a && b; }, true);
 }
 
 XLA_TEST_F(ReduceTest, VectorizedReduce_BooleanOr) {
   RunVectorizedReduceTestForType<bool>(
-      CreateScalarOrComputation, [](bool a, bool b) { return a || b; }, false);
+      static_cast<FuncGenerator>(CreateScalarOrComputation),
+      [](bool a, bool b) { return a || b; }, false);
 }
 
 class ReduceR3ToR2Test : public ReduceTest,
diff --git a/tensorflow/compiler/xla/tests/reduce_window_test.cc b/tensorflow/compiler/xla/tests/reduce_window_test.cc
index 9c317fe579394c5b7a1d599169f471d484950199..8ef980ebd98bb3bd05d93537bda0a818811953ed 100644
--- a/tensorflow/compiler/xla/tests/reduce_window_test.cc
+++ b/tensorflow/compiler/xla/tests/reduce_window_test.cc
@@ -21,10 +21,11 @@ limitations under the License.
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/array3d.h"
 #include "tensorflow/compiler/xla/array4d.h"
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/lib/arithmetic.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/client/padding.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
 #include "tensorflow/compiler/xla/reference_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
@@ -63,11 +64,9 @@ class ReduceWindowTestBase : public ClientLibraryTestBase {
 class ReduceWindowTest : public ::testing::WithParamInterface<bool>,
                          public ReduceWindowTestBase {
  public:
-  ReduceWindowTest() : builder_(client_, TestName()) {
-    set_use_bfloat16(GetParam());
-  }
+  ReduceWindowTest() : builder_(TestName()) { set_use_bfloat16(GetParam()); }
 
-  void ReduceWindowAdd(const ComputationDataHandle& input,
+  void ReduceWindowAdd(const XlaOp& input,
                        tensorflow::gtl::ArraySlice<int64> window_dimensions,
                        tensorflow::gtl::ArraySlice<int64> window_strides,
                        Padding padding) {
@@ -78,16 +77,17 @@ class ReduceWindowTest : public ::testing::WithParamInterface<bool>,
                           window_dimensions, window_strides, padding);
   }
 
-  void ReduceWindowMax(const ComputationDataHandle& input,
+  void ReduceWindowMax(const XlaOp& input,
                        tensorflow::gtl::ArraySlice<int64> window_dimensions,
                        tensorflow::gtl::ArraySlice<int64> window_strides,
                        Padding padding) {
     auto init = CreateConstantFromLiteral(Literal::MinValue(F32), &builder_);
-    builder_.ReduceWindow(input, init, CreateScalarMax(), window_dimensions,
-                          window_strides, padding);
+    builder_.ReduceWindow(input, init,
+                          CreateScalarMaxComputation(FloatType(), &builder_),
+                          window_dimensions, window_strides, padding);
   }
 
-  void ReduceWindowMin(const ComputationDataHandle& input,
+  void ReduceWindowMin(const XlaOp& input,
                        tensorflow::gtl::ArraySlice<int64> window_dimensions,
                        tensorflow::gtl::ArraySlice<int64> window_strides,
                        Padding padding) {
@@ -97,7 +97,7 @@ class ReduceWindowTest : public ::testing::WithParamInterface<bool>,
                           window_dimensions, window_strides, padding);
   }
 
-  ComputationBuilder builder_;
+  XlaBuilder builder_;
 };
 
 TEST_P(ReduceWindowTest, MismatchedRanksGivesErrorStatus) {
@@ -252,6 +252,48 @@ TEST_P(ReduceWindowTest, AmongMajor2DimsMediumSize) {
                            DefaultErrorSpec());
 }
 
+// Tests the super windowing logic w.r.t handling prime number of windows in a
+// major dimension with reduction.
+TEST_P(ReduceWindowTest, PrimeWindowsInReductionDimension) {
+  Array4D<float> input_array(15, 15, 4, 128);
+  input_array.FillRandom(2.f, 4.f);
+
+  int win_len = 3;
+  int win_stride = 2;
+
+  const auto input_data_handle =
+      CreateConstantFromArray(input_array, &builder_);
+
+  Padding padding = Padding::kSame;
+  // Reduce only along the x and y dimensions, according to the win_len.
+  ReduceWindowAdd(input_data_handle, {win_len, win_len, 1, 1},
+                  {win_stride, win_stride, 1, 1}, padding);
+
+  auto result = ReferenceUtil::ReduceWindow4DAdd(
+      input_array, 0.0f, {win_len, win_len, 1, 1},
+      {win_stride, win_stride, 1, 1}, padding);
+
+  ComputeAndCompareLiteral(&builder_, *Literal::CreateFromArray(*result), {},
+                           DefaultErrorSpec());
+}
+
+TEST_P(ReduceWindowTest, ReduceAlongLaneDimension) {
+  Array4D<float> input_array(19, 17, 8, 256);
+  input_array.FillWithMinorDimNum();
+
+  const auto input_data_handle =
+      CreateConstantFromArray(input_array, &builder_);
+
+  Padding padding = Padding::kSame;
+  ReduceWindowAdd(input_data_handle, {1, 1, 1, 11}, {1, 1, 1, 1}, padding);
+
+  auto result = ReferenceUtil::ReduceWindow4DAdd(
+      input_array, 0.0f, {1, 1, 1, 11}, {1, 1, 1, 1}, padding);
+
+  ComputeAndCompareLiteral(&builder_, *Literal::CreateFromArray(*result), {},
+                           DefaultErrorSpec());
+}
+
 // Tests a reduction function that is not a simple add/min/max/etc.
 XLA_TEST_P(ReduceWindowTest, NonstandardReduceFunction) {
   Array4D<float> input_array(1, 2, 2, 1);
@@ -268,7 +310,7 @@ XLA_TEST_P(ReduceWindowTest, NonstandardReduceFunction) {
   auto rhs = b->Parameter(1, scalar, "rhs");
   b->Min(b->Add(lhs, rhs),
          CreateConstantFromLiteral(*Literal::CreateR0<float>(8.0f), b.get()));
-  Computation reduce_fn = b->BuildAndNoteError();
+  XlaComputation reduce_fn = b->BuildAndNoteError();
 
   builder_.ReduceWindow(
       input,
@@ -296,7 +338,7 @@ TEST_P(ReduceWindowTest, R4UnitWindow) {
   std::unique_ptr<Literal> input_literal =
       Literal::CreateR4FromArray4DWithLayout(
           input_array, LayoutUtil::MakeLayout({0, 3, 2, 1}));
-  ComputationDataHandle input;
+  XlaOp input;
   auto input_data = CreateParameterAndTransferLiteral(
       0, *input_literal, "parameter", &builder_, &input);
 
@@ -364,7 +406,7 @@ XLA_TEST_P(ReduceWindowTest, R4SecondMinorStride) {
   std::unique_ptr<Literal> input_literal =
       Literal::CreateR4FromArray4DWithLayout(
           input_array, LayoutUtil::MakeLayout({3, 2, 1, 0}));
-  ComputationDataHandle input;
+  XlaOp input;
   auto input_data = CreateParameterAndTransferLiteral(
       0, *input_literal, "parameter", &builder_, &input);
 
@@ -386,7 +428,7 @@ XLA_TEST_P(ReduceWindowTest, R4SecondMinorUnitStride) {
   std::unique_ptr<Literal> input_literal =
       Literal::CreateR4FromArray4DWithLayout(
           input_array, LayoutUtil::MakeLayout({3, 2, 1, 0}));
-  ComputationDataHandle input;
+  XlaOp input;
   auto input_data = CreateParameterAndTransferLiteral(
       0, *input_literal, "parameter", &builder_, &input);
 
@@ -408,7 +450,7 @@ XLA_TEST_P(ReduceWindowTest, R4SecondMinorWin) {
   std::unique_ptr<Literal> input_literal =
       Literal::CreateR4FromArray4DWithLayout(
           input_array, LayoutUtil::MakeLayout({3, 2, 1, 0}));
-  ComputationDataHandle input;
+  XlaOp input;
   auto input_data = CreateParameterAndTransferLiteral(
       0, *input_literal, "parameter", &builder_, &input);
 
@@ -509,7 +551,7 @@ TEST_P(ReduceWindowTest, R2ReduceWindowInceptionFromBroadcast) {
 
 TEST_P(ReduceWindowTest, R2ReduceWindowNonOverlappingFromBroadcast) {
   Array2D<float> input_array(6, 4, 1.0f);
-  ComputationDataHandle input = builder_.Broadcast(
+  XlaOp input = builder_.Broadcast(
       CreateConstantFromLiteral(Literal::One(F32), &builder_), {6, 4});
 
   Padding padding = Padding::kSame;
@@ -568,7 +610,7 @@ class R4ReduceWindowTest : public ReduceWindowTestBase,
   R4ReduceWindowTest() { set_use_bfloat16(::testing::get<1>(GetParam())); }
 
   void DoIt() {
-    ComputationBuilder b(client_, TestName());
+    XlaBuilder b(TestName());
     const auto& param = ::testing::get<0>(GetParam());
 
     const float kInitValue = 0.0f;
@@ -579,7 +621,7 @@ class R4ReduceWindowTest : public ReduceWindowTestBase,
     std::unique_ptr<Literal> input_literal =
         Literal::CreateR4FromArray4DWithLayout(
             input, LayoutUtil::MakeLayout(param.layout));
-    ComputationDataHandle parameter;
+    XlaOp parameter;
     auto input_arg = CreateParameterAndTransferLiteral(0, *input_literal, "p0",
                                                        &b, &parameter);
 
@@ -920,7 +962,7 @@ class R3ReduceWindowTest : public ReduceWindowTestBase,
 };
 
 TEST_P(R3ReduceWindowTest, Add) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   const auto& param = ::testing::get<0>(GetParam());
   CHECK(param.reducer == kAdd);
 
@@ -931,7 +973,7 @@ TEST_P(R3ReduceWindowTest, Add) {
       Literal::CreateR3FromArray3DWithLayout(
           input, LayoutUtil::MakeLayout(param.layout));
 
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input_arg = CreateParameterAndTransferLiteral(0, *input_literal, "p0",
                                                      &b, &parameter);
   auto init_value =
@@ -1058,7 +1100,7 @@ class R2ReduceWindowTest : public ReduceWindowTestBase,
   R2ReduceWindowTest() { set_use_bfloat16(::testing::get<1>(GetParam())); }
 
   void DoIt() {
-    ComputationBuilder b(client_, TestName());
+    XlaBuilder b(TestName());
     const auto& param = ::testing::get<0>(GetParam());
     CHECK(param.reducer == kAdd);
 
@@ -1068,7 +1110,7 @@ class R2ReduceWindowTest : public ReduceWindowTestBase,
         Literal::CreateR2FromArray2DWithLayout(
             input, LayoutUtil::MakeLayout(param.layout));
 
-    ComputationDataHandle parameter;
+    XlaOp parameter;
     auto input_arg = CreateParameterAndTransferLiteral(0, *input_literal, "p0",
                                                        &b, &parameter);
     std::vector<std::pair<int64, int64>> padding(2);
@@ -1256,7 +1298,7 @@ class R1ReduceWindowTest : public ReduceWindowTestBase,
 };
 
 TEST_P(R1ReduceWindowTest, DoIt) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   const auto& param = ::testing::get<0>(GetParam());
   CHECK(param.reducer == kAdd || param.reducer == kMax);
 
@@ -1265,7 +1307,7 @@ TEST_P(R1ReduceWindowTest, DoIt) {
   std::iota(std::begin(input_vector), std::end(input_vector), 0);
   std::unique_ptr<Literal> input_literal =
       Literal::CreateR1(tensorflow::gtl::ArraySlice<float>(input_vector));
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input_arg = CreateParameterAndTransferLiteral(0, *input_literal, "p0",
                                                      &b, &parameter);
 
diff --git a/tensorflow/compiler/xla/tests/reshape_test.cc b/tensorflow/compiler/xla/tests/reshape_test.cc
index 02272d60171c70896f44b0d6b96f176ea52e686f..d7462d581b8596dc43b81b0162b3f5020cebb546 100644
--- a/tensorflow/compiler/xla/tests/reshape_test.cc
+++ b/tensorflow/compiler/xla/tests/reshape_test.cc
@@ -20,11 +20,10 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/array4d.h"
-#include "tensorflow/compiler/xla/client/computation.h"
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/global_data.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/reference_util.h"
@@ -53,11 +52,11 @@ class ReshapeTest : public ::testing::WithParamInterface<bool>,
 
 // Collapses 2-dimensional pseudo-scalar (single-element array) to 1 dimension.
 XLA_TEST_P(ReshapeTest, CollapseTrivial1x1) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   Array2D<float> input_array(1, 1);
   input_array.Fill(1.0f);
   auto input_literal = Literal::CreateR2FromArray2D(input_array);
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *input_literal, "parameter",
                                                  &builder, &parameter);
   builder.Collapse(/*operand=*/parameter, /*dimensions=*/{0, 1});
@@ -68,9 +67,9 @@ XLA_TEST_P(ReshapeTest, CollapseTrivial1x1) {
 }
 
 XLA_TEST_P(ReshapeTest, CollapseTrivialR1EmptyDims) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto input_literal = Literal::CreateR1<float>({1.0f});
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *input_literal, "parameter",
                                                  &builder, &parameter);
   builder.Collapse(/*operand=*/parameter, /*dimensions=*/{});
@@ -81,9 +80,9 @@ XLA_TEST_P(ReshapeTest, CollapseTrivialR1EmptyDims) {
 }
 
 XLA_TEST_P(ReshapeTest, CollapseTrivialR1OnlyDim) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto input_literal = Literal::CreateR1<float>({1.0f});
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *input_literal, "parameter",
                                                  &builder, &parameter);
   builder.Collapse(/*operand=*/parameter, /*dimensions=*/{0});
@@ -95,11 +94,11 @@ XLA_TEST_P(ReshapeTest, CollapseTrivialR1OnlyDim) {
 
 // Collapses 2-dimensional pseudo-scalar (single-element array) to scalar.
 XLA_TEST_P(ReshapeTest, SingleElementArrayToScalar) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   Array2D<float> input_array(1, 1);
   input_array.Fill(1.0f);
   auto input_literal = Literal::CreateR2FromArray2D(input_array);
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *input_literal, "parameter",
                                                  &builder, &parameter);
   auto reshape = builder.Reshape(/*operand=*/parameter, /*dimensions=*/{0, 1},
@@ -112,15 +111,14 @@ XLA_TEST_P(ReshapeTest, SingleElementArrayToScalar) {
 }
 
 XLA_TEST_P(ReshapeTest, ScalarToSingleElementArray) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::unique_ptr<Literal> param0_literal = Literal::CreateR0<float>(1.0f);
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *param0_literal, "param0",
                                                  &builder, &parameter);
   auto a = builder.Neg(parameter);
-  auto reshape =
-      builder.Reshape(/*operand=*/a, /*dimensions=*/{}, /*new_sizes=*/{1});
+  builder.Reshape(/*operand=*/a, /*dimensions=*/{}, /*new_sizes=*/{1});
 
   auto expected_literal = Literal::CreateR1<float>({-1.0f});
   ComputeAndCompareLiteral(&builder, *expected_literal, {input.get()},
@@ -131,10 +129,10 @@ XLA_TEST_P(ReshapeTest, ScalarToSingleElementArray) {
 // does not handle zero-sized shapes correctly. Failed last on 2017-11-30
 // with an incorrect result rank.
 XLA_TEST_P(ReshapeTest, DISABLED_ON_GPU(Trivial0x3)) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   Array2D<float> input_array(0, 3);
   auto input_literal = Literal::CreateR2FromArray2D(input_array);
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
                                                  &builder, &parameter);
   builder.Collapse(/*operand=*/parameter, /*dimensions=*/{0, 1});
@@ -147,11 +145,11 @@ XLA_TEST_P(ReshapeTest, DISABLED_ON_GPU(Trivial0x3)) {
 // does not handle zero-sized shapes correctly. Failed last on 2017-05-15
 // with an incorrect result rank.
 XLA_TEST_P(ReshapeTest, DISABLED_ON_GPU(Trivial0x3WithParameter)) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::unique_ptr<Literal> param0_literal =
       Literal::CreateR2FromArray2D<float>(Array2D<float>(0, 3));
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *param0_literal, "param0",
                                                  &builder, &parameter);
   builder.Collapse(/*operand=*/parameter, /*dimensions=*/{0, 1});
@@ -164,10 +162,10 @@ XLA_TEST_P(ReshapeTest, DISABLED_ON_GPU(Trivial0x3WithParameter)) {
 // does not handle zero-sized shapes correctly. Failed last on 2017-11-30
 // with an incorrect result rank.
 XLA_TEST_P(ReshapeTest, DISABLED_ON_GPU(Trivial3x0)) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   Array2D<float> input_array(3, 0);
   auto input_literal = Literal::CreateR2FromArray2D(input_array);
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
                                                  &builder, &parameter);
   builder.Collapse(/*operand=*/parameter, /*dimensions=*/{0, 1});
@@ -178,9 +176,9 @@ XLA_TEST_P(ReshapeTest, DISABLED_ON_GPU(Trivial3x0)) {
 
 // Collapses a 2-dimensional row vector to 1 dimension.
 XLA_TEST_P(ReshapeTest, Trivial1x3) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto input_literal = Literal::CreateR2<float>({{1.0f, 2.0f, 3.0f}});
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
                                                  &builder, &parameter);
   builder.Collapse(/*operand=*/parameter, /*dimensions=*/{0, 1});
@@ -191,9 +189,9 @@ XLA_TEST_P(ReshapeTest, Trivial1x3) {
 
 // Collapses a 2-dimensional column vector to 1 dimension.
 XLA_TEST_P(ReshapeTest, Trivial3x1) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto input_literal = Literal::CreateR2<float>({{1.0f}, {2.0f}, {3.0f}});
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
                                                  &builder, &parameter);
   builder.Collapse(/*operand=*/parameter, /*dimensions=*/{0, 1});
@@ -344,9 +342,9 @@ XLA_TEST_P(ReshapeTest, DISABLED_ON_GPU(ReshapeSplitNoShuffleZeroElements)) {
 // does not handle zero-sized shapes correctly. Failed last on 2017-11-30
 // with an incorrect result rank.
 XLA_TEST_P(ReshapeTest, DISABLED_ON_GPU(ReshapeR4ToR2ZeroElements)) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto input_literal = Literal::CreateFromArray(Array4D<float>(2, 3, 4, 0));
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
                                                  &builder, &parameter);
   builder.Reshape(/*operand=*/parameter, /*dimensions=*/{0, 1, 2, 3},
@@ -359,10 +357,10 @@ XLA_TEST_P(ReshapeTest, DISABLED_ON_GPU(ReshapeR4ToR2ZeroElements)) {
 // Reshapes a 2-dimensional array with dimensions that are not just a
 // rearrangement of the originals (split), but no reordering (no shuffle).
 XLA_TEST_P(ReshapeTest, ReshapeSplitNoShuffle) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a4x3 = MakeLinspaceArray2D(1.0f, 12.0f, 4, 3);
   auto input_literal = Literal::CreateFromArray(*a4x3);
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
                                                  &builder, &parameter);
   builder.Reshape(/*operand=*/parameter, /*dimensions=*/{0, 1},
@@ -379,9 +377,9 @@ XLA_TEST_P(ReshapeTest, ReshapeSplitNoShuffle) {
 // with an incorrect result rank.
 //
 XLA_TEST_P(ReshapeTest, DISABLED_ON_GPU(ReshapeSplitAndShuffleZeroElements)) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto input_literal = Literal::CreateFromArray(Array2D<float>(0, 6));
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
                                                  &builder, &parameter);
   builder.Reshape(/*operand=*/parameter, /*dimensions=*/{1, 0},
@@ -394,10 +392,10 @@ XLA_TEST_P(ReshapeTest, DISABLED_ON_GPU(ReshapeSplitAndShuffleZeroElements)) {
 // Reshapes a 2-dimensional array with dimensions that are not just a
 // rearrangement of the originals (split), and reorder the input (shuffle).
 XLA_TEST_P(ReshapeTest, ReshapeSplitAndShuffle) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a4x3 = MakeLinspaceArray2D(1.0f, 12.0f, 4, 3);
   auto input_literal = Literal::CreateFromArray(*a4x3);
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
                                                  &builder, &parameter);
   builder.Reshape(/*operand=*/parameter, /*dimensions=*/{1, 0},
@@ -421,9 +419,9 @@ static Array3D<float> ArrayForDocR3Tests() {
 }
 
 XLA_TEST_P(ReshapeTest, DocR3_R1_Collapse_012) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto input_literal = Literal::CreateFromArray(ArrayForDocR3Tests());
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
                                                  &builder, &parameter);
   builder.Reshape(/*operand=*/parameter, /*dimensions=*/{0, 1, 2},
@@ -436,9 +434,9 @@ XLA_TEST_P(ReshapeTest, DocR3_R1_Collapse_012) {
 }
 
 XLA_TEST_P(ReshapeTest, DocR3_R2_Collapse_012_Refine_83) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto input_literal = Literal::CreateFromArray(ArrayForDocR3Tests());
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
                                                  &builder, &parameter);
   builder.Reshape(/*operand=*/parameter, /*dimensions=*/{0, 1, 2},
@@ -456,9 +454,9 @@ XLA_TEST_P(ReshapeTest, DocR3_R2_Collapse_012_Refine_83) {
 }
 
 XLA_TEST_P(ReshapeTest, DocR3_R1_Collapse_120) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto input_literal = Literal::CreateFromArray(ArrayForDocR3Tests());
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
                                                  &builder, &parameter);
   builder.Reshape(/*operand=*/parameter, /*dimensions=*/{1, 2, 0},
@@ -471,9 +469,9 @@ XLA_TEST_P(ReshapeTest, DocR3_R1_Collapse_120) {
 }
 
 XLA_TEST_P(ReshapeTest, DocR3_R2_Collapse_120_Refine_83) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto input_literal = Literal::CreateFromArray(ArrayForDocR3Tests());
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
                                                  &builder, &parameter);
   builder.Reshape(/*operand=*/parameter, /*dimensions=*/{1, 2, 0},
@@ -491,9 +489,9 @@ XLA_TEST_P(ReshapeTest, DocR3_R2_Collapse_120_Refine_83) {
 }
 
 XLA_TEST_P(ReshapeTest, DocR3_R3_Collapse_120_Refine_262) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto input_literal = Literal::CreateFromArray(ArrayForDocR3Tests());
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
                                                  &builder, &parameter);
   builder.Reshape(/*operand=*/parameter, /*dimensions=*/{1, 2, 0},
@@ -521,12 +519,12 @@ XLA_TEST_P(ReshapeTest, DocR3_R3_Collapse_120_Refine_262) {
 //
 // 1 2 3 4 5 6 1 2 3 4 5 6
 XLA_TEST_P(ReshapeTest, FullyConnectedCollapse) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   Array4D<float> t2x2x2x3(2, 2, 2, 3);
   auto filler2x3 = MakeLinspaceArray2D(1.0f, 6.0f, 2, 3);
   t2x2x2x3.FillWithYX(*filler2x3);
   auto input_literal = Literal::CreateFromArray(t2x2x2x3);
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
                                                  &builder, &parameter);
   builder.Collapse(/*operand=*/parameter, /*dimensions=*/{1, 2, 3});
@@ -540,7 +538,7 @@ XLA_TEST_P(ReshapeTest, FullyConnectedCollapse) {
 
 // As above, but uses reshape directly.
 XLA_TEST_P(ReshapeTest, FullyConnectedCollapseDesugared) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   Array4D<float> t(2, 1, 2, 2);
   t(0, 0, 0, 0) = 0;
   t(0, 0, 0, 1) = 1;
@@ -551,7 +549,7 @@ XLA_TEST_P(ReshapeTest, FullyConnectedCollapseDesugared) {
   t(1, 0, 1, 0) = 6;
   t(1, 0, 1, 1) = 7;
   auto input_literal = Literal::CreateFromArray(t);
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
                                                  &builder, &parameter);
   builder.Reshape(/*operand=*/parameter, /*dimensions=*/{0, 1, 2, 3},
@@ -566,7 +564,7 @@ XLA_TEST_P(ReshapeTest, FullyConnectedCollapseDesugared) {
 // Reshape various ranks to a scalar.
 XLA_TEST_P(ReshapeTest, ToScalar) {
   for (int rank = 0; rank < 8; ++rank) {
-    ComputationBuilder b(client_, TestName());
+    XlaBuilder b(TestName());
     std::vector<int64> ones(rank, 1);  // this is {1, ..., 1}.
     std::vector<int64> dimensions(rank);
     std::iota(dimensions.begin(), dimensions.end(), 0);
@@ -574,7 +572,7 @@ XLA_TEST_P(ReshapeTest, ToScalar) {
     std::vector<int64> zeros(rank, 0);  // this is {0, ..., 0}.
     input_literal.Set<float>(zeros, 83.0f);
 
-    ComputationDataHandle parameter;
+    XlaOp parameter;
     auto input = CreateParameterAndTransferLiteral(0, input_literal, "input",
                                                    &b, &parameter);
     b.Reshape(parameter, dimensions, {});
@@ -586,9 +584,9 @@ XLA_TEST_P(ReshapeTest, ToScalar) {
 }
 
 XLA_TEST_P(ReshapeTest, BadDimensions) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   auto input_literal = Literal::CreateR1<float>({1.0f});
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input", &b,
                                                  &parameter);
   b.Reshape(parameter, {}, {});
@@ -598,9 +596,9 @@ XLA_TEST_P(ReshapeTest, BadDimensions) {
 }
 
 XLA_TEST_P(ReshapeTest, BadNewSizes) {
-  ComputationBuilder b(client_, TestName());
+  XlaBuilder b(TestName());
   auto input_literal = Literal::CreateR1<float>({1.0f, 2.0f});
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input", &b,
                                                  &parameter);
   b.Reshape(parameter, {1}, {});
@@ -609,7 +607,7 @@ XLA_TEST_P(ReshapeTest, BadNewSizes) {
 }
 
 XLA_TEST_P(ReshapeTest, R4Dim0MinorLayoutToR2Dim0MajorLayout) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   // clang-format off
   auto input_literal = Literal::CreateR4FromArray4DWithLayout(Array4D<float>{
     {
@@ -635,7 +633,7 @@ XLA_TEST_P(ReshapeTest, R4Dim0MinorLayoutToR2Dim0MajorLayout) {
   },
        LayoutUtil::MakeLayout({0, 1, 2, 3}));
   // clang-format on
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
                                                  &builder, &parameter);
 
@@ -646,7 +644,7 @@ XLA_TEST_P(ReshapeTest, R4Dim0MinorLayoutToR2Dim0MajorLayout) {
       {222, 333, 444, 555, 666, 777, 888, 999},
   });
 
-  Computation computation = builder.Build().ConsumeValueOrDie();
+  XlaComputation computation = builder.Build().ConsumeValueOrDie();
   ExecutionOptions execution_options = execution_options_;
   *execution_options.mutable_shape_with_output_layout() =
       ShapeUtil::MakeShapeWithLayout(use_bfloat16() ? BF16 : F32, {2, 8},
@@ -664,13 +662,13 @@ XLA_TEST_P(ReshapeTest, R4Dim0MinorLayoutToR2Dim0MajorLayout) {
 }
 
 XLA_TEST_P(ReshapeTest, R2ToR4_3x8_To_3x2x1x4) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   std::unique_ptr<Literal> input_literal = Literal::CreateR2<float>({
       {0, 1, 2, 3, 4, 5, 6, 7},
       {100, 101, 102, 103, 104, 105, 106, 107},
       {200, 201, 202, 203, 204, 205, 206, 207},
   });
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
                                                  &builder, &parameter);
   builder.Reshape(parameter, /*dimensions=*/{0, 1}, /*new_sizes=*/{3, 2, 1, 4});
@@ -691,13 +689,13 @@ XLA_TEST_P(ReshapeTest, R2ToR4_3x8_To_3x2x1x4) {
 
 // Tests R2->R4 reshape with the reshape dimensions {1, 0}.
 XLA_TEST_P(ReshapeTest, R2ToR4_3x8_To_3x2x1x4_Dimensions_10) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   std::unique_ptr<Literal> input_literal = Literal::CreateR2<float>({
       {0, 1, 2, 3, 4, 5, 6, 7},
       {100, 101, 102, 103, 104, 105, 106, 107},
       {200, 201, 202, 203, 204, 205, 206, 207},
   });
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *input_literal, "input",
                                                  &builder, &parameter);
   builder.Reshape(parameter, /*dimensions=*/{1, 0}, /*new_sizes=*/{3, 2, 1, 4});
@@ -717,7 +715,7 @@ XLA_TEST_P(ReshapeTest, R2ToR4_3x8_To_3x2x1x4_Dimensions_10) {
 }
 
 XLA_TEST_P(ReshapeTest, R4ToR2_2x1x1x1_To_2x1) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   std::mt19937 rng;
   std::uniform_real_distribution<float> distribution;
   Array4D<float> input(2, 1, 1, 1);
@@ -727,7 +725,7 @@ XLA_TEST_P(ReshapeTest, R4ToR2_2x1x1x1_To_2x1) {
   std::unique_ptr<Literal> input_literal =
       Literal::CreateR4FromArray4DWithLayout(
           input, LayoutUtil::MakeLayout({3, 2, 1, 0}));
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input_data = CreateParameterAndTransferLiteral(
       0, *input_literal, "input", &builder, &parameter);
   builder.Reshape(parameter, /*dimensions=*/{0, 1, 2, 3}, /*new_sizes=*/{2, 1});
@@ -739,7 +737,7 @@ XLA_TEST_P(ReshapeTest, R4ToR2_2x1x1x1_To_2x1) {
 }
 
 XLA_TEST_P(ReshapeTest, R4ToR2_2x1x4x1_To_4x2) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   std::mt19937 rng;
   std::uniform_real_distribution<float> distribution;
   Array4D<float> input(2, 1, 4, 1);
@@ -749,7 +747,7 @@ XLA_TEST_P(ReshapeTest, R4ToR2_2x1x4x1_To_4x2) {
   std::unique_ptr<Literal> input_literal =
       Literal::CreateR4FromArray4DWithLayout(
           input, LayoutUtil::MakeLayout({3, 2, 1, 0}));
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input_data = CreateParameterAndTransferLiteral(
       0, *input_literal, "input", &builder, &parameter);
   builder.Reshape(parameter, /*dimensions=*/{0, 1, 2, 3}, /*new_sizes=*/{4, 2});
@@ -762,7 +760,7 @@ XLA_TEST_P(ReshapeTest, R4ToR2_2x1x4x1_To_4x2) {
 
 // Tests R4->R2 reshape with the reshape dimensions {0, 2, 1, 3}.
 XLA_TEST_P(ReshapeTest, R4ToR2_5x10x2x3_To_5x60_Dimensions_0213) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   std::mt19937 rng;
   std::uniform_real_distribution<float> distribution;
   Array4D<float> input(5, 10, 2, 3);
@@ -772,7 +770,7 @@ XLA_TEST_P(ReshapeTest, R4ToR2_5x10x2x3_To_5x60_Dimensions_0213) {
   std::unique_ptr<Literal> input_literal =
       Literal::CreateR4FromArray4DWithLayout(
           input, LayoutUtil::MakeLayout({3, 2, 1, 0}));
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input_data = CreateParameterAndTransferLiteral(
       0, *input_literal, "input", &builder, &parameter);
   builder.Reshape(parameter, /*dimensions=*/{0, 2, 1, 3},
@@ -789,7 +787,7 @@ XLA_TEST_P(ReshapeTest, R4ToR2_5x10x2x3_To_5x60_Dimensions_0213) {
 }
 
 XLA_TEST_P(ReshapeTest, NoopReshape) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   std::mt19937 rng;
   std::uniform_real_distribution<float> distribution;
   Array4D<float> input_array(2, 3, 5, 7);
@@ -799,12 +797,12 @@ XLA_TEST_P(ReshapeTest, NoopReshape) {
   std::unique_ptr<Literal> input_literal =
       Literal::CreateR4FromArray4DWithLayout(
           input_array, LayoutUtil::MakeLayout({1, 2, 3, 0}));
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input_data = CreateParameterAndTransferLiteral(
       0, *input_literal, "input", &builder, &parameter);
   builder.Reshape(parameter, /*dimensions=*/{3, 0, 1, 2},
                   /*new_sizes=*/{7, 2, 3, 5});
-  Computation computation = builder.Build().ConsumeValueOrDie();
+  XlaComputation computation = builder.Build().ConsumeValueOrDie();
 
   ExecutionOptions execution_options = execution_options_;
   *execution_options.mutable_shape_with_output_layout() =
@@ -827,12 +825,12 @@ XLA_TEST_P(ReshapeTest, NoopReshape) {
 }
 
 XLA_TEST_P(ReshapeTest, R4ToR4Reshape_Trivial) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto literal_1x2x3x4 = Literal::CreateR4<float>(
       {{{{1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10, 11, 12}},
         {{13, 14, 15, 16}, {17, 18, 19, 20}, {21, 22, 23, 24}}}});
 
-  ComputationDataHandle parameter;
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *literal_1x2x3x4, "input",
                                                  &builder, &parameter);
   builder.Reshape(parameter, /*dimensions=*/{0, 1, 2, 3},
@@ -846,8 +844,8 @@ XLA_TEST_P(ReshapeTest, R4ToR4Reshape) {
       {{{{1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10, 11, 12}},
         {{13, 14, 15, 16}, {17, 18, 19, 20}, {21, 22, 23, 24}}}});
 
-  ComputationBuilder builder(client_, TestName());
-  ComputationDataHandle parameter;
+  XlaBuilder builder(TestName());
+  XlaOp parameter;
   auto input = CreateParameterAndTransferLiteral(0, *literal_1x2x3x4, "input",
                                                  &builder, &parameter);
   builder.Reshape(parameter, /*dimensions=*/{1, 3, 2, 0},
@@ -880,8 +878,8 @@ XLA_TEST_P(ReshapeTest, R4TwoMinorTransposeSimple) {
   std::unique_ptr<Literal> input_literal =
       Literal::CreateR4FromArray4DWithLayout(
           input, LayoutUtil::MakeLayout({3, 2, 1, 0}));
-  ComputationBuilder builder(client_, TestName());
-  ComputationDataHandle parameter;
+  XlaBuilder builder(TestName());
+  XlaOp parameter;
   auto input_data = CreateParameterAndTransferLiteral(
       0, *input_literal, "input", &builder, &parameter);
   builder.Reshape(parameter, /*dimensions=*/{0, 1, 3, 2},
@@ -909,8 +907,8 @@ XLA_TEST_P(ReshapeTest, R4TwoMinorTransposeMajorFirstEffectiveR2) {
   std::unique_ptr<Literal> input_literal =
       Literal::CreateR4FromArray4DWithLayout(
           input, LayoutUtil::MakeLayout({3, 2, 1, 0}));
-  ComputationBuilder builder(client_, TestName());
-  ComputationDataHandle parameter;
+  XlaBuilder builder(TestName());
+  XlaOp parameter;
   auto input_data = CreateParameterAndTransferLiteral(
       0, *input_literal, "input", &builder, &parameter);
   builder.Reshape(parameter, /*dimensions=*/{0, 1, 3, 2},
@@ -938,8 +936,8 @@ XLA_TEST_P(ReshapeTest, R4TwoMinorTransposeMajorFirstMinorEffectiveR1) {
   std::unique_ptr<Literal> input_literal =
       Literal::CreateR4FromArray4DWithLayout(
           input, LayoutUtil::MakeLayout({3, 2, 1, 0}));
-  ComputationBuilder builder(client_, TestName());
-  ComputationDataHandle parameter;
+  XlaBuilder builder(TestName());
+  XlaOp parameter;
   auto input_data = CreateParameterAndTransferLiteral(
       0, *input_literal, "input", &builder, &parameter);
   builder.Reshape(parameter, /*dimensions=*/{0, 1, 3, 2},
@@ -968,8 +966,8 @@ XLA_TEST_P(ReshapeTest, R4TwoMinorTransposeMajorFirstMinorEffectiveR1InR2) {
   std::unique_ptr<Literal> input_literal =
       Literal::CreateR4FromArray4DWithLayout(
           input, LayoutUtil::MakeLayout({3, 2, 1, 0}));
-  ComputationBuilder builder(client_, TestName());
-  ComputationDataHandle parameter;
+  XlaBuilder builder(TestName());
+  XlaOp parameter;
   auto input_data = CreateParameterAndTransferLiteral(
       0, *input_literal, "input", &builder, &parameter);
   builder.Reshape(parameter, /*dimensions=*/{0, 1, 3, 2},
@@ -997,8 +995,8 @@ XLA_TEST_P(ReshapeTest, R4TwoMinorTransposeTrivialR2) {
   std::unique_ptr<Literal> input_literal =
       Literal::CreateR4FromArray4DWithLayout(
           input, LayoutUtil::MakeLayout({0, 1, 2, 3}));
-  ComputationBuilder builder(client_, TestName());
-  ComputationDataHandle parameter;
+  XlaBuilder builder(TestName());
+  XlaOp parameter;
   auto input_data = CreateParameterAndTransferLiteral(
       0, *input_literal, "input", &builder, &parameter);
   builder.Reshape(parameter, /*dimensions=*/{1, 0, 2, 3},
diff --git a/tensorflow/compiler/xla/tests/reverse_test.cc b/tensorflow/compiler/xla/tests/reverse_test.cc
index 8fc841f14087cdea02fe44cdaea521ff92122aec..6959c95502cb7af6b720592e7836c6789719a528 100644
--- a/tensorflow/compiler/xla/tests/reverse_test.cc
+++ b/tensorflow/compiler/xla/tests/reverse_test.cc
@@ -17,8 +17,8 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/array4d.h"
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
 #include "tensorflow/compiler/xla/tests/test_macros.h"
@@ -85,7 +85,7 @@ TEST_P(FloatReverseTest, Reverses) {
   auto r1_literal = Literal::CreateR1<float>(input_vector);
   auto input_literal = r1_literal->Reshape(spec.input_dims).ConsumeValueOrDie();
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = AddParam(*input_literal, &builder);
   builder.Rev(a, spec.reversal);
 
diff --git a/tensorflow/compiler/xla/tests/select_and_scatter_test.cc b/tensorflow/compiler/xla/tests/select_and_scatter_test.cc
index d268fdcacebcb162bf61bc7dd4b208f4db6c4a5f..7015e5a6a31f506d30c2629d7735482cf354455a 100644
--- a/tensorflow/compiler/xla/tests/select_and_scatter_test.cc
+++ b/tensorflow/compiler/xla/tests/select_and_scatter_test.cc
@@ -19,11 +19,11 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/compiler/xla/array2d.h"
-#include "tensorflow/compiler/xla/client/computation.h"
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/lib/arithmetic.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/client/padding.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/reference_util.h"
@@ -50,7 +50,7 @@ class SelectAndScatterTest
     : public ClientLibraryTestBase,
       public ::testing::WithParamInterface<SelectAndScatterTestParam> {
  public:
-  SelectAndScatterTest() : builder_(client_, TestName()) {
+  SelectAndScatterTest() : builder_(TestName()) {
     // Create S32 GE and ADD computations for select and scatter respectively.
     ge_s32_ = CreateScalarGeComputation(S32, &builder_);
     add_s32_ = CreateScalarAddComputation(S32, &builder_);
@@ -60,13 +60,13 @@ class SelectAndScatterTest
     min_f32_ = CreateScalarMinComputation(F32, &builder_);
   }
 
-  ComputationBuilder builder_;
-  Computation ge_s32_;
-  Computation add_s32_;
-  Computation ge_f32_;
-  Computation add_f32_;
-  Computation max_f32_;
-  Computation min_f32_;
+  XlaBuilder builder_;
+  XlaComputation ge_s32_;
+  XlaComputation add_s32_;
+  XlaComputation ge_f32_;
+  XlaComputation add_f32_;
+  XlaComputation max_f32_;
+  XlaComputation min_f32_;
 };
 
 XLA_TEST_P(SelectAndScatterTest, ParamTest) {
@@ -80,12 +80,11 @@ XLA_TEST_P(SelectAndScatterTest, ParamTest) {
   s.FillRandom(12.0f);
   auto source = builder_.ConstantFromArray(s);
 
-  auto select_and_scatter = builder_.SelectAndScatter(
-      operand, ge_f32_, GetParam().window_dimensions, GetParam().window_strides,
-      GetParam().padding_type, source, builder_.ConstantR0<float>(0.0f),
-      add_f32_);
+  builder_.SelectAndScatter(operand, ge_f32_, GetParam().window_dimensions,
+                            GetParam().window_strides, GetParam().padding_type,
+                            source, builder_.ConstantR0<float>(0.0f), add_f32_);
 
-  ComputeAndCompare(&builder_, select_and_scatter, {}, ErrorSpec(1e-5));
+  ComputeAndCompare(&builder_, {}, ErrorSpec(1e-5));
 }
 
 INSTANTIATE_TEST_CASE_P(
diff --git a/tensorflow/compiler/xla/tests/slice_test.cc b/tensorflow/compiler/xla/tests/slice_test.cc
index a14a365bd0529ba82a25cdfacfe3902a655c4876..52195db2aa74710b901dd7744a670764a034e96b 100644
--- a/tensorflow/compiler/xla/tests/slice_test.cc
+++ b/tensorflow/compiler/xla/tests/slice_test.cc
@@ -19,8 +19,8 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/compiler/xla/array2d.h"
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
 #include "tensorflow/compiler/xla/reference_util.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
@@ -41,7 +41,7 @@ TEST_F(SliceTest, Slice3x3x3_To_3x3x1_F32) {
   Array3D<float> values(3, 3, 3);
   values.FillIota(0);
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto original = builder.ConstantR3FromArray3D<float>(values);
   builder.Slice(original, {0, 0, 0}, {3, 3, 1}, {1, 1, 1});
 
@@ -54,7 +54,7 @@ TEST_F(SliceTest, Slice3x3x3_To_3x1x3_F32) {
   Array3D<float> values(3, 3, 3);
   values.FillIota(0);
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto original = builder.ConstantR3FromArray3D<float>(values);
   builder.Slice(original, {0, 0, 0}, {3, 1, 3}, {1, 1, 1});
 
@@ -67,7 +67,7 @@ TEST_F(SliceTest, Slice3x3x3_To_1x3x3_F32) {
   Array3D<float> values(3, 3, 3);
   values.FillIota(0);
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto original = builder.ConstantR3FromArray3D<float>(values);
   builder.Slice(original, {0, 0, 0}, {1, 3, 3}, {1, 1, 1});
 
@@ -77,7 +77,7 @@ TEST_F(SliceTest, Slice3x3x3_To_1x3x3_F32) {
 }
 
 XLA_TEST_F(SliceTest, Slice0x0to0x0F32) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto original = builder.ConstantR2FromArray2D<float>(Array2D<float>(0, 0));
   builder.Slice(original, {0, 0}, {0, 0}, {1, 1});
 
@@ -85,7 +85,7 @@ XLA_TEST_F(SliceTest, Slice0x0to0x0F32) {
 }
 
 XLA_TEST_F(SliceTest, Slice0x20to0x5F32) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto original = builder.ConstantR2FromArray2D<float>(Array2D<float>(0, 20));
   builder.Slice(original, {0, 15}, {0, 20}, {1, 1});
 
@@ -93,7 +93,7 @@ XLA_TEST_F(SliceTest, Slice0x20to0x5F32) {
 }
 
 XLA_TEST_F(SliceTest, Slice3x0to2x0F32) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto original = builder.ConstantR2FromArray2D<float>(Array2D<float>(3, 0));
   builder.Slice(original, {1, 0}, {3, 0}, {1, 1});
 
@@ -108,7 +108,7 @@ XLA_TEST_F(SliceTest, SliceQuadrantOf256x256) {
     }
   }
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto original = builder.ConstantR2FromArray2D<float>(values);
   builder.Slice(original, {128, 128}, {256, 256}, {1, 1});
 
@@ -126,7 +126,7 @@ TEST_F(SliceTest, Slice_1x4096_To_1x1024) {
   Array2D<float> values(1, 4096);
   std::iota(values.data(), values.data() + 4096, 0.0);
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto original = builder.ConstantR2FromArray2D<float>(values);
   builder.Slice(original, {0, 3072}, {1, 4096}, {1, 1});
 
@@ -147,7 +147,7 @@ TEST_F(SliceTest, Slice_16x4_To_16x2) {
       }
     }
   }
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto original = builder.ConstantR2FromArray2D<float>(values);
   builder.Slice(original, {0, 0}, {16, 2}, {1, 1});
   ComputeAndCompareR2<float>(&builder, expected, {}, ErrorSpec(0.000001));
@@ -159,7 +159,7 @@ TEST_F(SliceTest, SliceR4ThreeDimsMiddleMinor) {
   values.FillRandom(3.14f);
   auto expected = ReferenceUtil::Slice4D(
       values, {{1, 0, 8, 0}}, {{2, 2, 16, 128}}, /*strides=*/{{1, 1, 1, 1}});
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto original = builder.ConstantR4FromArray4D(values);
   builder.Slice(original, {1, 0, 8, 0}, {2, 2, 16, 128}, {1, 1, 1, 1});
   ComputeAndCompareR4(&builder, *expected, {}, ErrorSpec(0.000001));
@@ -172,7 +172,7 @@ XLA_TEST_F(SliceTest, StridedSliceR4WithOutputLayout) {
                                          /*strides=*/{{1, 1, 2, 1}});
   auto expected_literal = Literal::CreateR4FromArray4DWithLayout(
       *expected, LayoutUtil::MakeLayout({0, 1, 2, 3}));
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto original = builder.ConstantR4FromArray4D(values);
   builder.Slice(original, {0, 0, 0, 0}, {2, 4, 6, 8}, {1, 1, 2, 1});
   ComputeAndCompareLiteral(&builder, *expected_literal, {}, ErrorSpec(0.000001),
@@ -198,7 +198,7 @@ class SliceR1Test : public ClientLibraryTestBase,
     tensorflow::gtl::InlinedVector<NativeT, 1> input(spec.input_dim0);
     std::iota(input.begin(), input.end(), NativeT());
 
-    ComputationBuilder builder(client_, TestName());
+    XlaBuilder builder(TestName());
     auto original = builder.ConstantR1<NativeT>(input);
     builder.Slice(original, {spec.slice_start}, {spec.slice_limit},
                   {spec.slice_stride});
@@ -214,6 +214,9 @@ class SliceR1Test : public ClientLibraryTestBase,
   }
 };
 
+// A version of SliceR1Test used to label and disable 'large' tests
+class SliceR1LargeTest : public SliceR1Test {};
+
 string SliceR1TestDataToString(const ::testing::TestParamInfo<R1Spec>& data) {
   const R1Spec& spec = data.param;
   return ::tensorflow::strings::Printf("%lld_%lld_%lld_%lld", spec.input_dim0,
@@ -233,8 +236,21 @@ XLA_TEST_P(SliceR1Test, DoIt_U64) { Run<uint64>(GetParam()); }
 
 XLA_TEST_P(SliceR1Test, DoIt_S64) { Run<int64>(GetParam()); }
 
+XLA_TEST_P(SliceR1LargeTest, DoIt_F32) { Run<float>(GetParam()); }
+
+XLA_TEST_P(SliceR1LargeTest, DoIt_F64) { Run<double>(GetParam()); }
+
+XLA_TEST_P(SliceR1LargeTest, DoIt_U32) { Run<uint32>(GetParam()); }
+
+XLA_TEST_P(SliceR1LargeTest, DoIt_S32) { Run<int32>(GetParam()); }
+
+XLA_TEST_P(SliceR1LargeTest, DoIt_U64) { Run<uint64>(GetParam()); }
+
+XLA_TEST_P(SliceR1LargeTest, DoIt_S64) { Run<int64>(GetParam()); }
+
 XLA_TEST_P(SliceR1Test, DoIt_PRED) { Run<bool>(GetParam()); }
 
+
 // Tests for R1 slice ops.
 // The format for each testcase is {input size, start, limit, stride}.
 // clang-format off
@@ -242,12 +258,6 @@ INSTANTIATE_TEST_CASE_P(
     SliceR1TestInstantiation,
     SliceR1Test,
     ::testing::Values(
-// TODO(b/69425338): This uses too much memory on GPU.
-#ifndef XLA_TEST_BACKEND_GPU
-        R1Spec{16 * 1024 * 1024, 4 * 1024 * 1024, 12 * 1024 * 1024, 1},
-        R1Spec{16 * 1024 * 1024, 4 * 1024 * 1024 + 1, 12 * 1024 * 1024 - 1, 1},
-        R1Spec{16 * 1024 * 1024, 4 * 1024 * 1024 - 1, 12 * 1024 * 1024 + 1, 1},
-#endif
         R1Spec{10, 0, 0, 1},
         R1Spec{10, 7, 7, 1},
         R1Spec{10, 0, 5, 1},
@@ -283,6 +293,23 @@ INSTANTIATE_TEST_CASE_P(
     SliceR1TestDataToString
 );
 
+// TODO(b/69425338): This uses too much memory on GPU.
+#ifndef XLA_TEST_BACKEND_GPU
+INSTANTIATE_TEST_CASE_P(
+    SliceR1TestBigSlicesInstantiation,
+    SliceR1LargeTest,
+    ::testing::Values(
+          R1Spec{
+              16 * 1024 * 1024, 4 * 1024 * 1024, 12 * 1024 * 1024, 1},
+          R1Spec{
+              16 * 1024 * 1024, 4 * 1024 * 1024 + 1, 12 * 1024 * 1024 - 1, 1},
+          R1Spec{
+              16 * 1024 * 1024, 4 * 1024 * 1024 - 1, 12 * 1024 * 1024 + 1, 1}
+    ),
+    SliceR1TestDataToString
+);
+#endif
+
 INSTANTIATE_TEST_CASE_P(
     SliceStridedR1TestInstantiation,
     SliceR1Test,
@@ -339,7 +366,7 @@ XLA_TEST_P(SliceR2Test, DoIt) {
   Array2D<int32> input(spec.input_dim0, spec.input_dim1);
   input.FillUnique();
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto a = builder.ConstantR2FromArray2DWithLayout<int32>(
       input, LayoutUtil::MakeLayout(spec.layout));
   builder.Slice(a, spec.slice_starts, spec.slice_limits, spec.slice_strides);
@@ -429,7 +456,7 @@ class SliceR4Test : public ClientLibraryTestBase,
     values.FillRandom(3.14f);
     auto expected = ReferenceUtil::Slice4D(
         values, spec.slice_starts, spec.slice_limits, spec.slice_strides);
-    ComputationBuilder builder(client_, TestName());
+    XlaBuilder builder(TestName());
     auto literal = Literal::CreateR4FromArray4DWithLayout(
         values, LayoutUtil::MakeLayout(spec.input_layout));
     auto parameter = builder.Parameter(0, literal->shape(), "p0");
diff --git a/tensorflow/compiler/xla/tests/test_utils.cc b/tensorflow/compiler/xla/tests/test_utils.cc
index 0bc7df2a65b44a76f877b6513e6bf93b99fbc1a3..e30d115fae3655a57dade6d3f569b1d020442319 100644
--- a/tensorflow/compiler/xla/tests/test_utils.cc
+++ b/tensorflow/compiler/xla/tests/test_utils.cc
@@ -23,14 +23,14 @@ namespace xla {
 
 namespace {
 
-template <typename FloatT>
-void PopulateWithRandomFloatingPointData(Literal* literal,
-                                         std::minstd_rand0* engine) {
+template <typename FloatT, typename GeneratorT>
+void PopulateWithRandomFloatingPointDataImpl(Literal* literal,
+                                             std::minstd_rand0* engine) {
   CHECK_EQ(literal->shape().element_type(),
            primitive_util::NativeToPrimitiveType<FloatT>());
   // Create uniform numbers between 1 and 1.125 to avoid creating denormal
   // numbers.
-  std::uniform_real_distribution<FloatT> generator(1.0f, 1.125f);
+  std::uniform_real_distribution<GeneratorT> generator(1.0f, 1.125f);
   const bool should_index_bias = ShapeUtil::ElementsIn(literal->shape()) > 1000;
   TF_CHECK_OK(literal->Populate<FloatT>(
       [&](tensorflow::gtl::ArraySlice<int64> indices) {
@@ -52,10 +52,22 @@ void PopulateWithRandomFloatingPointData(Literal* literal,
         FloatT index_bias =
             static_cast<FloatT>(index_product % 113 - negative_bias) /
             static_cast<FloatT>(256.0f);
-        return (generator(*engine) - 1.0625) + index_bias;
+        return static_cast<FloatT>(generator(*engine) - 1.0625f) + index_bias;
       }));
 }
 
+template <typename FloatT>
+void PopulateWithRandomFloatingPointData(Literal* literal,
+                                         std::minstd_rand0* engine) {
+  PopulateWithRandomFloatingPointDataImpl<FloatT, FloatT>(literal, engine);
+}
+
+template <>
+void PopulateWithRandomFloatingPointData<half>(Literal* literal,
+                                               std::minstd_rand0* engine) {
+  PopulateWithRandomFloatingPointDataImpl<half, float>(literal, engine);
+}
+
 // The standard library does not have a case for bfloat16, unsurprisingly, so we
 // handle that one specially.
 template <>
@@ -100,6 +112,9 @@ StatusOr<std::unique_ptr<Literal>> MakeFakeLiteralInternal(
     case BF16:
       PopulateWithRandomFloatingPointData<bfloat16>(literal.get(), engine);
       break;
+    case F16:
+      PopulateWithRandomFloatingPointData<half>(literal.get(), engine);
+      break;
     case F32:
       PopulateWithRandomFloatingPointData<float>(literal.get(), engine);
       break;
@@ -145,27 +160,38 @@ StatusOr<std::unique_ptr<Literal>> MakeFakeLiteralInternal(
   return std::move(literal);
 }
 
-// Matches binary addition computations.
-bool LooksLikeSum(const HloComputation& computation) {
+enum class ConstantType { kUnknown, kZero, kOne };
+
+// Return the constant type required by this computation, if known.
+ConstantType GetInitValue(const HloComputation& computation) {
   const HloInstruction* const root = computation.root_instruction();
-  return root->opcode() == HloOpcode::kAdd &&
-         computation.num_parameters() == 2 &&
-         root->operand(0)->opcode() == HloOpcode::kParameter &&
-         root->operand(1)->opcode() == HloOpcode::kParameter &&
-         root->operand(0) != root->operand(1);
+  if (computation.num_parameters() != 2 || root->operand_count() != 2 ||
+      root->operand(0)->opcode() != HloOpcode::kParameter ||
+      root->operand(1)->opcode() != HloOpcode::kParameter ||
+      root->operand(0) == root->operand(1)) {
+    return ConstantType::kUnknown;
+  }
+
+  switch (root->opcode()) {
+    case HloOpcode::kAdd:
+      return ConstantType::kZero;
+    case HloOpcode::kMultiply:
+      return ConstantType::kOne;
+    default:
+      return ConstantType::kUnknown;
+  }
 }
 
-// Reduce, ReduceWindow, and SelectAndScatter ops may use binary addition,
-// which requires an init_value of 0 rather than a random value.
-bool NeedsZeroInitValue(const HloUse& use) {
+// Reduce, ReduceWindow, and SelectAndScatter ops may need a non-random
+// initialization value.
+bool NeedsInitValue(const HloUse& use) {
   const HloInstruction* const instruction = use.instruction;
   const HloOpcode opcode = instruction->opcode();
   const int64 op_num = use.operand_number;
   return (
       ((opcode == HloOpcode::kReduce || opcode == HloOpcode::kReduceWindow) &&
-       op_num == 1 && LooksLikeSum(*instruction->to_apply())) ||
-      (opcode == HloOpcode::kSelectAndScatter && op_num == 2 &&
-       LooksLikeSum(*instruction->scatter())));
+       op_num == 1) ||
+      (opcode == HloOpcode::kSelectAndScatter && op_num == 2));
 }
 
 // Generate random values that are constrained to the input_shape minus the
@@ -207,7 +233,7 @@ std::vector<HloInstruction*> FindConstrainedUses(
         auto fused_uses = FindConstrainedUses(dataflow, *to_analyze);
         constrained_uses.insert(constrained_uses.end(), fused_uses.begin(),
                                 fused_uses.end());
-      } else if (NeedsZeroInitValue(use)) {
+      } else if (NeedsInitValue(use)) {
         constrained_uses.push_back(instruction);
       } else if (opcode == HloOpcode::kConvert ||
                  opcode == HloOpcode::kReducePrecision) {
@@ -228,7 +254,8 @@ StatusOr<std::unique_ptr<Literal>> CreateLiteralForConstrainedUses(
     const tensorflow::gtl::ArraySlice<HloInstruction*> constrained_uses,
     const HloInstruction& param, std::minstd_rand0* engine) {
   HloInstruction* needs_index = nullptr;
-  HloInstruction* needs_zero = nullptr;
+  HloInstruction* needs_constant = nullptr;
+  ConstantType constant_type = ConstantType::kUnknown;
   for (HloInstruction* use : constrained_uses) {
     switch (use->opcode()) {
       case HloOpcode::kDynamicSlice:
@@ -243,8 +270,13 @@ StatusOr<std::unique_ptr<Literal>> CreateLiteralForConstrainedUses(
 
       case HloOpcode::kReduce:
       case HloOpcode::kReduceWindow:
+        needs_constant = use;
+        constant_type = GetInitValue(*use->to_apply());
+        break;
+
       case HloOpcode::kSelectAndScatter:
-        needs_zero = use;
+        needs_constant = use;
+        constant_type = GetInitValue(*use->scatter());
         break;
 
       default:
@@ -253,17 +285,26 @@ StatusOr<std::unique_ptr<Literal>> CreateLiteralForConstrainedUses(
             use->ToString().c_str());
     }
   }
-  if (needs_index != nullptr && needs_zero != nullptr) {
+  if (needs_index != nullptr && needs_constant != nullptr) {
     return Unimplemented(
         "Conflicting operand generation constraints.\nNeeds index: %s\nNeeds "
-        "zero: %s\n",
-        needs_index->ToString().c_str(), needs_zero->ToString().c_str());
+        "constant: %s\n",
+        needs_index->ToString().c_str(), needs_constant->ToString().c_str());
   }
   if (needs_index != nullptr) {
     return MakeRandomNonwrappingSliceIndex(needs_index->operand(0)->shape(),
                                            needs_index->shape(), engine);
-  } else if (needs_zero != nullptr) {
-    return Literal::CreateFromShape(param.shape());
+  } else if (needs_constant != nullptr) {
+    switch (constant_type) {
+      case ConstantType::kZero:
+        return Literal::Zero(param.shape().element_type()).CloneToUnique();
+      case ConstantType::kOne:
+        return Literal::One(param.shape().element_type()).CloneToUnique();
+      case ConstantType::kUnknown:
+        // We want the identity element for the computation, but we don't really
+        // know what it is - so any value we generate will be just as wrong.
+        return MakeFakeLiteralInternal(param.shape(), engine);
+    }
   } else {
     return MakeFakeLiteralInternal(param.shape(), engine);
   }
diff --git a/tensorflow/compiler/xla/tests/test_utils_test.cc b/tensorflow/compiler/xla/tests/test_utils_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e8efc6e2a83f42bf81fc1261ba508632cf3f85b3
--- /dev/null
+++ b/tensorflow/compiler/xla/tests/test_utils_test.cc
@@ -0,0 +1,57 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/tests/test_utils.h"
+
+#include "tensorflow/compiler/xla/client/computation_builder.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/tests/local_client_test_base.h"
+#include "tensorflow/compiler/xla/tests/test_macros.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+
+namespace xla {
+namespace {
+
+// A test fixture is used because we need a client for our computation builder.
+class TestUtilsTest : public LocalClientTestBase {};
+
+XLA_TEST_F(TestUtilsTest, UnusedParam) {
+  ComputationBuilder builder(local_client_, TestName());
+  // Make the reduction lambda.
+  Shape single_float = ShapeUtil::MakeShape(F32, {});
+  builder.Parameter(0, single_float, "unused");
+  builder.Parameter(1, single_float, "used");
+  auto computation_status = builder.Build();
+  TF_ASSERT_OK(computation_status.status());
+
+  // Make the reduction.
+  Shape pair_float = ShapeUtil::MakeShape(F32, {2});
+  builder.Reduce(builder.Parameter(0, pair_float, "operand"),
+                 builder.Parameter(1, single_float, "init"),
+                 computation_status.ValueOrDie(), {0});
+  computation_status = builder.Build();
+  TF_ASSERT_OK(computation_status.status());
+
+  auto executable_status = local_client_->Compile(
+      computation_status.ValueOrDie(), {&pair_float, &single_float},
+      ExecutableBuildOptions());
+  TF_ASSERT_OK(executable_status.status());
+  HloModule& module = const_cast<HloModule&>(
+      executable_status.ValueOrDie()->executable()->module());
+  TF_ASSERT_OK(MakeFakeArguments(&module).status());
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/tuple_test.cc b/tensorflow/compiler/xla/tests/tuple_test.cc
index fa60af4b6a7d4f249b28be14357b8cad9a42c783..098be6d7aabe88d0deef600716229ddbd0bcae2f 100644
--- a/tensorflow/compiler/xla/tests/tuple_test.cc
+++ b/tensorflow/compiler/xla/tests/tuple_test.cc
@@ -20,6 +20,8 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/computation.h"
 #include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/statusor.h"
@@ -41,7 +43,7 @@ class TupleTest : public ClientLibraryTestBase {
 
 // Tests a tuple-shaped constant.
 XLA_TEST_F(TupleTest, TupleConstant) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   const float constant_scalar = 7.3f;
   std::initializer_list<float> constant_vector = {1.1f, 2.0f, 3.3f};
@@ -54,13 +56,13 @@ XLA_TEST_F(TupleTest, TupleConstant) {
                           Literal::CreateR1<float>(constant_vector).get(),
                           Literal::CreateR2<float>(constant_matrix).get()});
 
-  auto result = builder.ConstantLiteral(*value);
+  builder.ConstantLiteral(*value);
   ComputeAndCompareTuple(&builder, *value, {}, error_spec_);
 }
 
 // Tests a tuple made of scalar constants.
 XLA_TEST_F(TupleTest, TupleScalarConstant) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   const float constant_scalar1 = 7.3f;
   const float constant_scalar2 = 1.2f;
@@ -68,13 +70,13 @@ XLA_TEST_F(TupleTest, TupleScalarConstant) {
       Literal::MakeTuple({Literal::CreateR0<float>(constant_scalar1).get(),
                           Literal::CreateR0<float>(constant_scalar2).get()});
 
-  auto result = builder.ConstantLiteral(*value);
+  builder.ConstantLiteral(*value);
   ComputeAndCompareTuple(&builder, *value, {}, error_spec_);
 }
 
 // Tests the creation of tuple data.
 XLA_TEST_F(TupleTest, TupleCreate) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   const float constant_scalar = 7.3f;
   std::initializer_list<float> constant_vector = {1.1f, 2.0f, 3.3f};
@@ -82,9 +84,9 @@ XLA_TEST_F(TupleTest, TupleCreate) {
       {1.1f, 2.2f, 3.5f},  // row 0
       {4.8f, 5.0f, 6.7f},  // row 1
   };
-  auto result = builder.Tuple({builder.ConstantR0<float>(constant_scalar),
-                               builder.ConstantR1<float>(constant_vector),
-                               builder.ConstantR2<float>(constant_matrix)});
+  builder.Tuple({builder.ConstantR0<float>(constant_scalar),
+                 builder.ConstantR1<float>(constant_vector),
+                 builder.ConstantR2<float>(constant_matrix)});
 
   auto expected =
       Literal::MakeTuple({Literal::CreateR0<float>(constant_scalar).get(),
@@ -95,9 +97,9 @@ XLA_TEST_F(TupleTest, TupleCreate) {
 
 // Tests the creation of tuple data.
 XLA_TEST_F(TupleTest, TupleCreateWithZeroElementEntry) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
-  auto result = builder.Tuple(
+  builder.Tuple(
       {builder.ConstantR0<float>(7.0), builder.ConstantR1<float>({})});
 
   auto expected = Literal::MakeTuple({Literal::CreateR0<float>(7.0).get(),
@@ -107,15 +109,15 @@ XLA_TEST_F(TupleTest, TupleCreateWithZeroElementEntry) {
 
 // Tests the creation of an empty tuple.
 XLA_TEST_F(TupleTest, EmptyTupleCreate) {
-  ComputationBuilder builder(client_, TestName());
-  auto result = builder.Tuple({});
+  XlaBuilder builder(TestName());
+  builder.Tuple({});
   auto expected = Literal::MakeTuple({});
   ComputeAndCompareTuple(&builder, *expected, {}, error_spec_);
 }
 
 // Trivial test for extracting a tuple element with GetTupleElement.
 XLA_TEST_F(TupleTest, GetTupleElement) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   std::initializer_list<float> constant_vector = {1.f, 2.f, 3.f};
   std::initializer_list<std::initializer_list<float>> constant_matrix = {
       {1.f, 2.f, 3.f},  // row 0
@@ -123,23 +125,23 @@ XLA_TEST_F(TupleTest, GetTupleElement) {
   };
   auto tuple_data = builder.Tuple({builder.ConstantR1<float>(constant_vector),
                                    builder.ConstantR2<float>(constant_matrix)});
-  auto matrix_element = builder.GetTupleElement(tuple_data, 1);
+  builder.GetTupleElement(tuple_data, 1);
   ComputeAndCompareR2<float>(&builder, Array2D<float>(constant_matrix), {},
                              error_spec_);
 }
 
 // Trivial test for extracting a tuple element with GetTupleElement.
 XLA_TEST_F(TupleTest, GetTupleElementWithZeroElements) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto tuple_data = builder.Tuple(
       {builder.ConstantR1<float>({}),
        builder.ConstantR2FromArray2D<float>(Array2D<float>(0, 101))});
-  auto matrix_element = builder.GetTupleElement(tuple_data, 1);
+  builder.GetTupleElement(tuple_data, 1);
   ComputeAndCompareR2<float>(&builder, Array2D<float>(0, 101), {}, error_spec_);
 }
 
 XLA_TEST_F(TupleTest, GetTupleElementOfNonTupleFailsGracefully) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto value = builder.ConstantR1<float>({4.5f});
   builder.GetTupleElement(value, 1);
   auto result_status = builder.Build();
@@ -152,7 +154,7 @@ XLA_TEST_F(TupleTest, GetTupleElementOfNonTupleFailsGracefully) {
 // Extracts both elements from a tuple with GetTupleElement and then adds them
 // together.
 XLA_TEST_F(TupleTest, AddTupleElements) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   std::initializer_list<float> constant_vector = {1.f, 2.f, 3.f};
   std::initializer_list<std::initializer_list<float>> constant_matrix = {
       {1.f, 2.f, 3.f},  // row 0
@@ -164,22 +166,22 @@ XLA_TEST_F(TupleTest, AddTupleElements) {
   auto matrix_element = builder.GetTupleElement(tuple_data, 1);
   auto vector_shape = builder.GetShape(vector_element).ConsumeValueOrDie();
   auto matrix_shape = builder.GetShape(matrix_element).ConsumeValueOrDie();
-  auto result = builder.Add(matrix_element, vector_element,
-                            /*broadcast_dimensions=*/{1});
+  builder.Add(matrix_element, vector_element,
+              /*broadcast_dimensions=*/{1});
 
   Array2D<float> expected({
       {2.f, 4.f, 6.f},  // row 0
       {5.f, 7.f, 9.f},  // row 1
   });
-  ASSERT_TRUE(ShapeUtil::ShapeIs(*vector_shape, F32, {3}));
-  ASSERT_TRUE(ShapeUtil::ShapeIs(*matrix_shape, F32, {/*y=*/2, /*x=*/3}));
+  ASSERT_TRUE(ShapeUtil::ShapeIs(vector_shape, F32, {3}));
+  ASSERT_TRUE(ShapeUtil::ShapeIs(matrix_shape, F32, {/*y=*/2, /*x=*/3}));
   ComputeAndCompareR2<float>(&builder, expected, {}, error_spec_);
 }
 
 // Extracts both elements from a tuple and then puts them into a new tuple in
 // the opposite order.
 XLA_TEST_F(TupleTest, TupleGTEToTuple) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   std::initializer_list<float> constant_vector = {1.f, 2.f, 3.f};
   std::initializer_list<std::initializer_list<float>> constant_matrix = {
       {1.f, 2.f, 3.f},  // row 0
@@ -187,8 +189,8 @@ XLA_TEST_F(TupleTest, TupleGTEToTuple) {
   };
   auto tuple_data = builder.Tuple({builder.ConstantR1<float>(constant_vector),
                                    builder.ConstantR2<float>(constant_matrix)});
-  auto new_tuple = builder.Tuple({builder.GetTupleElement(tuple_data, 1),
-                                  builder.GetTupleElement(tuple_data, 0)});
+  builder.Tuple({builder.GetTupleElement(tuple_data, 1),
+                 builder.GetTupleElement(tuple_data, 0)});
   auto expected =
       Literal::MakeTuple({Literal::CreateR2<float>(constant_matrix).get(),
                           Literal::CreateR1<float>(constant_vector).get()});
@@ -196,8 +198,8 @@ XLA_TEST_F(TupleTest, TupleGTEToTuple) {
 }
 
 XLA_TEST_F(TupleTest, SelectBetweenPredTuples) {
-  ComputationBuilder b(client_, TestName());
-  ComputationDataHandle v1, v2;
+  XlaBuilder b(TestName());
+  XlaOp v1, v2;
 
   for (bool direction : {false, true}) {
     std::unique_ptr<GlobalData> v1_data =
@@ -210,7 +212,7 @@ XLA_TEST_F(TupleTest, SelectBetweenPredTuples) {
     auto v2_gt = b.Gt(v2, v1);             // true
     auto v1_v2 = b.Tuple({v1_gt, v2_gt});  // {false, true}
     auto v2_v1 = b.Tuple({v2_gt, v1_gt});  // {true, false}
-    auto select = b.Select(direction ? v1_gt : v2_gt, v1_v2, v2_v1);
+    b.Select(direction ? v1_gt : v2_gt, v1_v2, v2_v1);
     auto expected =
         Literal::MakeTuple({Literal::CreateR0<bool>(direction).get(),
                             Literal::CreateR0<bool>(!direction).get()});
@@ -237,7 +239,7 @@ XLA_TEST_F(TupleTest, TupleGTEToTupleToGTEAdd) {
   //              \                (tuple10)--                     /
   //               \              /           \                   /
   //                -----(GTE 0)--             --(GTE 1)----------
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   std::initializer_list<float> constant_vector = {1.f, 2.f, 3.f};
   std::initializer_list<std::initializer_list<float>> constant_matrix = {
       {1.f, 2.f, 3.f},  // row 0
@@ -257,8 +259,8 @@ XLA_TEST_F(TupleTest, TupleGTEToTupleToGTEAdd) {
   auto addvectors = builder.Add(vector_from_01, vector_from_10);
   auto addmatrices = builder.Add(matrix_from_01, matrix_from_10);
 
-  auto result = builder.Add(addmatrices, addvectors,
-                            /*broadcast_dimensions=*/{1});
+  builder.Add(addmatrices, addvectors,
+              /*broadcast_dimensions=*/{1});
 
   Array2D<float> expected({
       {4.f, 8.f, 12.f},    // row 0
@@ -269,7 +271,7 @@ XLA_TEST_F(TupleTest, TupleGTEToTupleToGTEAdd) {
 
 XLA_TEST_F(TupleTest, DISABLED_ON_CPU_PARALLEL(SelectBetweenTuplesOnFalse)) {
   // Tests a selection between tuples with "false" path taken.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::initializer_list<float> vec1 = {1.f, 2.f, 3.f};
   std::initializer_list<float> vec2 = {2.f, 4.f, 6.f};
@@ -278,8 +280,7 @@ XLA_TEST_F(TupleTest, DISABLED_ON_CPU_PARALLEL(SelectBetweenTuplesOnFalse)) {
   auto tuple21 = builder.Tuple(
       {builder.ConstantR1<float>(vec2), builder.ConstantR1<float>(vec1)});
 
-  auto select =
-      builder.Select(builder.ConstantR0<bool>(false), tuple12, tuple21);
+  builder.Select(builder.ConstantR0<bool>(false), tuple12, tuple21);
   auto expected = Literal::MakeTuple({Literal::CreateR1<float>(vec2).get(),
                                       Literal::CreateR1<float>(vec1).get()});
   ComputeAndCompareTuple(&builder, *expected, {}, error_spec_);
@@ -314,7 +315,7 @@ XLA_TEST_F(TupleTest, TuplesInAMap) {
 
 XLA_TEST_F(TupleTest, DISABLED_ON_CPU_PARALLEL(SelectBetweenTuplesOnTrue)) {
   // Tests a selection between tuples with "true" path taken.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::initializer_list<float> vec1 = {1.f, 2.f, 3.f};
   std::initializer_list<float> vec2 = {2.f, 4.f, 6.f};
@@ -323,8 +324,7 @@ XLA_TEST_F(TupleTest, DISABLED_ON_CPU_PARALLEL(SelectBetweenTuplesOnTrue)) {
   auto tuple21 = builder.Tuple(
       {builder.ConstantR1<float>(vec2), builder.ConstantR1<float>(vec1)});
 
-  auto select =
-      builder.Select(builder.ConstantR0<bool>(true), tuple12, tuple21);
+  builder.Select(builder.ConstantR0<bool>(true), tuple12, tuple21);
   auto expected = Literal::MakeTuple({Literal::CreateR1<float>(vec1).get(),
                                       Literal::CreateR1<float>(vec2).get()});
   ComputeAndCompareTuple(&builder, *expected, {}, error_spec_);
@@ -333,7 +333,7 @@ XLA_TEST_F(TupleTest, DISABLED_ON_CPU_PARALLEL(SelectBetweenTuplesOnTrue)) {
 XLA_TEST_F(TupleTest, SelectBetweenTuplesElementResult) {
   // Tests a selection between tuples but the final result is an element of the
   // tuple, not the whole tuple.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::initializer_list<float> vec1 = {1.f, 2.f, 3.f};
   std::initializer_list<float> vec2 = {2.f, 4.f, 6.f};
@@ -344,7 +344,7 @@ XLA_TEST_F(TupleTest, SelectBetweenTuplesElementResult) {
 
   auto select =
       builder.Select(builder.ConstantR0<bool>(false), tuple12, tuple21);
-  auto element = builder.GetTupleElement(select, 0);
+  builder.GetTupleElement(select, 0);
 
   ComputeAndCompareR1<float>(&builder, vec2, {}, error_spec_);
 }
@@ -368,7 +368,7 @@ XLA_TEST_F(TupleTest, DISABLED_ON_CPU_PARALLEL(SelectBetweenTuplesCascaded)) {
   //                                /             --(GTE 1)--
   //                               /
   //                          (tuple 21)
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::initializer_list<float> vec1 = {1.f, 2.f, 3.f};
   std::initializer_list<float> vec2 = {2.f, 4.f, 6.f};
@@ -384,8 +384,8 @@ XLA_TEST_F(TupleTest, DISABLED_ON_CPU_PARALLEL(SelectBetweenTuplesCascaded)) {
       builder.Select(builder.GetTupleElement(pred_tuple, 0), tuple12, tuple21);
   auto select2 =
       builder.Select(builder.GetTupleElement(pred_tuple, 1), tuple21, select1);
-  auto result = builder.Add(builder.GetTupleElement(select2, 0),
-                            builder.GetTupleElement(select2, 1));
+  builder.Add(builder.GetTupleElement(select2, 0),
+              builder.GetTupleElement(select2, 1));
 
   ComputeAndCompareR1<float>(&builder, {3.f, 6.f, 9.f}, {}, error_spec_);
 }
@@ -394,7 +394,7 @@ XLA_TEST_F(TupleTest,
            DISABLED_ON_CPU_PARALLEL(SelectBetweenTuplesReuseConstants)) {
   // Similar to SelectBetweenTuples, but the constants are shared between the
   // input tuples.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   std::initializer_list<float> vec1 = {1.f, 2.f, 3.f};
   std::initializer_list<float> vec2 = {2.f, 4.f, 6.f};
@@ -403,19 +403,18 @@ XLA_TEST_F(TupleTest,
   auto tuple12 = builder.Tuple({c1, c2});
   auto tuple21 = builder.Tuple({c2, c1});
 
-  auto select =
-      builder.Select(builder.ConstantR0<bool>(false), tuple12, tuple21);
+  builder.Select(builder.ConstantR0<bool>(false), tuple12, tuple21);
+
   auto expected = Literal::MakeTuple({Literal::CreateR1<float>(vec2).get(),
                                       Literal::CreateR1<float>(vec1).get()});
   ComputeAndCompareTuple(&builder, *expected, {}, error_spec_);
 }
 
 XLA_TEST_F(TupleTest, NestedTuples) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto inner_tuple = builder.Tuple(
       {builder.ConstantR1<float>({1.0, 2.0}), builder.ConstantR0<float>(42.0)});
-  auto outer_tuple =
-      builder.Tuple({inner_tuple, builder.ConstantR1<float>({22.0, 44.0})});
+  builder.Tuple({inner_tuple, builder.ConstantR1<float>({22.0, 44.0})});
 
   auto expected_v1 = Literal::CreateR1<float>({1.0, 2.0});
   auto expected_s = Literal::CreateR0<float>(42.0);
@@ -429,7 +428,7 @@ XLA_TEST_F(TupleTest, NestedTuples) {
 }
 
 XLA_TEST_F(TupleTest, GetTupleElementOfNestedTuple) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
 
   Shape data_shape = ShapeUtil::MakeShape(F32, {3});
   Shape inner_tuple_shape = ShapeUtil::MakeTupleShape({data_shape, data_shape});
@@ -460,7 +459,7 @@ XLA_TEST_F(TupleTest, GetTupleElementOfNestedTuple) {
 }
 
 XLA_TEST_F(TupleTest, ComplexTuples) {
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   {
     Shape c64r0 = ShapeUtil::MakeShape(C64, {});
     Shape c64r1 = ShapeUtil::MakeShape(C64, {2});
diff --git a/tensorflow/compiler/xla/tests/while_test.cc b/tensorflow/compiler/xla/tests/while_test.cc
index 33d457c70bac84c2da10e3cf9302c2c952cf1bc2..89ce2ce797f979b8668fbdb172a4a3abc5922b9f 100644
--- a/tensorflow/compiler/xla/tests/while_test.cc
+++ b/tensorflow/compiler/xla/tests/while_test.cc
@@ -18,10 +18,10 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/compiler/xla/client/client_library.h"
-#include "tensorflow/compiler/xla/client/computation.h"
-#include "tensorflow/compiler/xla/client/computation_builder.h"
 #include "tensorflow/compiler/xla/client/lib/arithmetic.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/service/platform_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
@@ -54,29 +54,28 @@ TEST_F(WhileTest, WhileWithScalarS32Result) {
   auto result_shape = ShapeUtil::MakeShape(S32, {});
 
   // Create a computation for the condition: repeat for 5 iterations.
-  Computation condition;
+  XlaComputation condition;
   {
-    ComputationBuilder builder(client_, "condition");
+    XlaBuilder builder("condition");
     auto prev = builder.Parameter(0, result_shape, "prev");
     builder.Gt(builder.ConstantR0<int32>(5), prev);
     condition = builder.Build().ConsumeValueOrDie();
   }
 
   // Create a computation for the body: add 1 to the result variable.
-  Computation body;
+  XlaComputation body;
   {
-    ComputationBuilder builder(client_, "body");
+    XlaBuilder builder("body");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto input = builder.ConstantR0<int32>(1);
-    auto result = builder.Add(input, prev);
+    builder.Add(input, prev);
     body = builder.Build().ConsumeValueOrDie();
   }
 
   // Create a While node with computations for the condition and the body.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto init = builder.ConstantR0<int32>(0);
-  auto result = builder.While(condition, body, init);
-  auto shape = builder.GetShape(result).ConsumeValueOrDie();
+  builder.While(condition, body, init);
 
   ComputeAndCompareR0<int32>(&builder, 5, {});
 }
@@ -91,29 +90,28 @@ TEST_F(WhileTest, WhileWithScalarS64Result) {
   auto result_shape = ShapeUtil::MakeShape(S64, {});
 
   // Create a computation for the condition: repeat for 5 iterations.
-  Computation condition;
+  XlaComputation condition;
   {
-    ComputationBuilder builder(client_, "condition");
+    XlaBuilder builder("condition");
     auto prev = builder.Parameter(0, result_shape, "prev");
     builder.Gt(builder.ConstantR0<int64>(5), prev);
     condition = builder.Build().ConsumeValueOrDie();
   }
 
   // Create a computation for the body: add 1 to the result variable.
-  Computation body;
+  XlaComputation body;
   {
-    ComputationBuilder builder(client_, "body");
+    XlaBuilder builder("body");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto input = builder.ConstantR0<int64>(1);
-    auto result = builder.Add(input, prev);
+    builder.Add(input, prev);
     body = builder.Build().ConsumeValueOrDie();
   }
 
   // Create a While node with computations for the condition and the body.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto init = builder.ConstantR0<int64>(0);
-  auto result = builder.While(condition, body, init);
-  auto shape = builder.GetShape(result).ConsumeValueOrDie();
+  builder.While(condition, body, init);
 
   ComputeAndCompareR0<int64>(&builder, 5, {});
 }
@@ -123,31 +121,30 @@ TEST_F(WhileTest, WhileWithScalarResultNonConstInit) {
   auto orig_shape = ShapeUtil::MakeShape(S32, {2});
 
   // Create a computation for the condition: repeat for 5 iterations.
-  Computation condition;
+  XlaComputation condition;
   {
-    ComputationBuilder builder(client_, "condition");
+    XlaBuilder builder("condition");
     auto prev = builder.Parameter(0, result_shape, "prev");
     builder.Gt(builder.ConstantR0<int32>(5), prev);
     condition = builder.Build().ConsumeValueOrDie();
   }
 
   // Create a computation for the body: add 1 to the result variable.
-  Computation body;
+  XlaComputation body;
   {
-    ComputationBuilder builder(client_, "body");
+    XlaBuilder builder("body");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto input = builder.ConstantR0<int32>(1);
-    auto result = builder.Add(input, prev);
+    builder.Add(input, prev);
     body = builder.Build().ConsumeValueOrDie();
   }
 
   // Create a While node with computations for the condition and the body.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto init = builder.Reduce(builder.ConstantR1<int32>(2, 1),
                              builder.ConstantR0<int32>(0),
                              CreateScalarAddComputation(S32, &builder), {0});
-  auto result = builder.While(condition, body, init);
-  auto shape = builder.GetShape(result).ConsumeValueOrDie();
+  builder.While(condition, body, init);
 
   ComputeAndCompareR0<int32>(&builder, 5, {});
 }
@@ -156,28 +153,28 @@ TEST_F(WhileTest, WhileWithPredicateResult) {
   auto result_shape = ShapeUtil::MakeShape(PRED, {});
 
   // Create a computation for the condition: run until condition is true.
-  Computation condition;
+  XlaComputation condition;
   {
-    ComputationBuilder builder(client_, "condition");
+    XlaBuilder builder("condition");
     auto prev = builder.Parameter(0, result_shape, "prev");
     builder.Ne(builder.ConstantR0<bool>(true), prev);
     condition = builder.Build().ConsumeValueOrDie();
   }
 
   // Create a computation for the body: or condition with true.
-  Computation body;
+  XlaComputation body;
   {
-    ComputationBuilder builder(client_, "body");
+    XlaBuilder builder("body");
     auto prev = builder.Parameter(0, result_shape, "prev");
-    auto result = builder.Or(prev, builder.ConstantR0<bool>(true));
+    builder.Or(prev, builder.ConstantR0<bool>(true));
     body = builder.Build().ConsumeValueOrDie();
   }
 
   // Create a While node with computations for the condition and the body.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto init = builder.Ne(builder.ConstantR0<bool>(false),
                          builder.ConstantR0<bool>(true));
-  auto result = builder.While(condition, body, init);
+  builder.While(condition, body, init);
 
   ComputeAndCompareR0<bool>(&builder, true, {});
 }
@@ -194,9 +191,9 @@ TEST_F(WhileTest, DISABLED_WhileWithEmptyVectorResult) {
   Shape result_shape = ShapeUtil::MakeShape(F32, {0});
 
   // Create a computation for the reduction.
-  Computation add;
+  XlaComputation add;
   {
-    ComputationBuilder builder(client_, "add");
+    XlaBuilder builder("add");
     auto x = builder.Parameter(0, ShapeUtil::MakeShape(F32, {}), "x");
     auto y = builder.Parameter(1, ShapeUtil::MakeShape(F32, {}), "y");
     builder.Add(x, y);
@@ -205,33 +202,34 @@ TEST_F(WhileTest, DISABLED_WhileWithEmptyVectorResult) {
 
   // Create a computation for the condition.
   // Repeat until the sum of the result vector is less than 15.5f.
-  Computation condition;
+  XlaComputation condition;
   {
-    ComputationBuilder builder(client_, "condition");
+    XlaBuilder builder("condition");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto sum = builder.Reduce(prev, builder.ConstantR0<float>(0.0f), add,
                               /*dimensions_to_reduce=*/{0});
-    auto test = builder.Gt(builder.ConstantR0<float>(15.5f), sum);
+    builder.Gt(builder.ConstantR0<float>(15.5f), sum);
     condition = builder.Build().ConsumeValueOrDie();
   }
 
   // Create a computation for the body.
   // Add a constant vector of 1.f to the result vector.
-  Computation body;
+  XlaComputation body;
   {
-    ComputationBuilder builder(client_, "body");
+    XlaBuilder builder("body");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto input = builder.ConstantR1<float>({});
-    auto result = builder.Add(input, prev);
+    builder.Add(input, prev);
     body = builder.Build().ConsumeValueOrDie();
   }
 
   // Create a While node with computations for the condition and the body.
-  ComputationBuilder builder(client_, "while");
+  XlaBuilder builder("while");
   auto init = builder.ConstantR1<float>({});
   auto result = builder.While(condition, body, init);
-  VLOG(2) << "while = " << ShapeUtil::HumanString(
-                               *builder.GetShape(result).ConsumeValueOrDie());
+  VLOG(2) << "while = "
+          << ShapeUtil::HumanString(
+                 builder.GetShape(result).ConsumeValueOrDie());
 
   ComputeAndCompareR1<float>(&builder, {}, {}, ErrorSpec(0.0001));
 }
@@ -247,9 +245,9 @@ TEST_F(WhileTest, WhileWithVectorResult) {
   Shape result_shape = ShapeUtil::MakeShape(F32, {8});
 
   // Create a computation for the reduction.
-  Computation add;
+  XlaComputation add;
   {
-    ComputationBuilder builder(client_, "add");
+    XlaBuilder builder("add");
     auto x = builder.Parameter(0, ShapeUtil::MakeShape(F32, {}), "x");
     auto y = builder.Parameter(1, ShapeUtil::MakeShape(F32, {}), "y");
     builder.Add(x, y);
@@ -258,33 +256,34 @@ TEST_F(WhileTest, WhileWithVectorResult) {
 
   // Create a computation for the condition.
   // Repeat until the sum of the result vector is less than 5.5f.
-  Computation condition;
+  XlaComputation condition;
   {
-    ComputationBuilder builder(client_, "condition");
+    XlaBuilder builder("condition");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto sum = builder.Reduce(prev, builder.ConstantR0<float>(0.0f), add,
                               /*dimensions_to_reduce=*/{0});
-    auto test = builder.Gt(builder.ConstantR0<float>(15.5f), sum);
+    builder.Gt(builder.ConstantR0<float>(15.5f), sum);
     condition = builder.Build().ConsumeValueOrDie();
   }
 
   // Create a computation for the body.
   // Add a constant vector of 1.f to the result vector.
-  Computation body;
+  XlaComputation body;
   {
-    ComputationBuilder builder(client_, "body");
+    XlaBuilder builder("body");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto input = builder.ConstantR1<float>(8, 0.125f);
-    auto result = builder.Add(input, prev);
+    builder.Add(input, prev);
     body = builder.Build().ConsumeValueOrDie();
   }
 
   // Create a While node with computations for the condition and the body.
-  ComputationBuilder builder(client_, "while");
+  XlaBuilder builder("while");
   auto init = builder.ConstantR1<float>(8, 0.f);
   auto result = builder.While(condition, body, init);
-  VLOG(2) << "while = " << ShapeUtil::HumanString(
-                               *builder.GetShape(result).ConsumeValueOrDie());
+  VLOG(2) << "while = "
+          << ShapeUtil::HumanString(
+                 builder.GetShape(result).ConsumeValueOrDie());
 
   // Individual elements with increase by 1/8 each time through the loop, so
   // the sum will increase by 1.0.  It will first be >15.5 when the elements
@@ -306,9 +305,9 @@ TEST_F(WhileTest, WhileWithVectorResultIntoTuple) {
   Shape result_shape = ShapeUtil::MakeShape(F32, {8});
 
   // Create a computation for the reduction.
-  Computation add;
+  XlaComputation add;
   {
-    ComputationBuilder builder(client_, "add");
+    XlaBuilder builder("add");
     auto x = builder.Parameter(0, ShapeUtil::MakeShape(F32, {}), "x");
     auto y = builder.Parameter(1, ShapeUtil::MakeShape(F32, {}), "y");
     builder.Add(x, y);
@@ -317,34 +316,34 @@ TEST_F(WhileTest, WhileWithVectorResultIntoTuple) {
 
   // Create a computation for the condition.
   // Repeat until the sum of the result vector is less than 5.5f.
-  Computation condition;
+  XlaComputation condition;
   {
-    ComputationBuilder builder(client_, "condition");
+    XlaBuilder builder("condition");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto sum = builder.Reduce(prev, builder.ConstantR0<float>(0.0f), add,
                               /*dimensions_to_reduce=*/{0});
-    auto test = builder.Gt(builder.ConstantR0<float>(15.5f), sum);
+    builder.Gt(builder.ConstantR0<float>(15.5f), sum);
     condition = builder.Build().ConsumeValueOrDie();
   }
 
   // Create a computation for the body.
   // Add a constant vector of 1.f to the result vector.
-  Computation body;
+  XlaComputation body;
   {
-    ComputationBuilder builder(client_, "body");
+    XlaBuilder builder("body");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto input = builder.ConstantR1<float>(8, 0.125f);
-    auto result = builder.Add(input, prev);
+    builder.Add(input, prev);
     body = builder.Build().ConsumeValueOrDie();
   }
 
   // Create a While node with computations for the condition and the body.
-  ComputationBuilder builder(client_, "while");
+  XlaBuilder builder("while");
   auto init = builder.ConstantR1<float>(8, 0.f);
   auto result = builder.While(condition, body, init);
   VLOG(2) << "while = "
           << ShapeUtil::HumanString(
-                 *builder.GetShape(result).ConsumeValueOrDie());
+                 builder.GetShape(result).ConsumeValueOrDie());
   builder.Tuple({result});
 
   // Individual elements with increase by 1/8 each time through the loop, so
@@ -366,9 +365,9 @@ TEST_F(WhileTest, WhileWithPermutationAndTupleResult) {
   // Create a computation for the condition.
   // Repeat for N iterations.
   const int N = 2;
-  Computation condition;
+  XlaComputation condition;
   {
-    ComputationBuilder builder(client_, "condition");
+    XlaBuilder builder("condition");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto iteration = builder.GetTupleElement(prev, 0);
     builder.Gt(builder.ConstantR0<int32>(N), iteration);
@@ -377,28 +376,28 @@ TEST_F(WhileTest, WhileWithPermutationAndTupleResult) {
 
   // Create a computation for the body.
   // Add 1 to the iteration variable and permute the weights.
-  Computation body;
+  XlaComputation body;
   {
-    ComputationBuilder builder(client_, "body");
+    XlaBuilder builder("body");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto iteration = builder.GetTupleElement(prev, 0);
     auto w1 = builder.GetTupleElement(prev, 1);
     auto w2 = builder.GetTupleElement(prev, 2);
     auto w3 = builder.GetTupleElement(prev, 3);
-    auto result = builder.Tuple(
+    builder.Tuple(
         {builder.Add(iteration, builder.ConstantR0<int32>(1)), w3, w1, w2});
     body = builder.Build().ConsumeValueOrDie();
   }
 
   // Create a While node with computations for the condition and the body.
-  ComputationBuilder builder(client_, "while");
+  XlaBuilder builder("while");
   auto init = builder.Tuple(
       {builder.ConstantR0<int32>(0), builder.ConstantR1<float>(3, 1.f),
        builder.ConstantR1<float>(3, 2.f), builder.ConstantR1<float>(3, 3.f)});
   auto result = builder.While(condition, body, init);
   VLOG(2) << "result = "
           << ShapeUtil::HumanString(
-                 *builder.GetShape(result).ConsumeValueOrDie());
+                 builder.GetShape(result).ConsumeValueOrDie());
 
   auto expected_counter = Literal::CreateR0<int32>(N);
   auto expected_w1 = Literal::CreateR1<float>({1.0f, 1.0f, 1.0f});
@@ -419,9 +418,9 @@ TEST_F(WhileTest, WhileWithPermutationAndVectorResult) {
   // Create a computation for the condition.
   // Repeat for N iterations.
   const int N = 2;
-  Computation condition;
+  XlaComputation condition;
   {
-    ComputationBuilder builder(client_, "condition");
+    XlaBuilder builder("condition");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto iteration = builder.GetTupleElement(prev, 0);
     builder.Gt(builder.ConstantR0<int32>(N), iteration);
@@ -430,21 +429,21 @@ TEST_F(WhileTest, WhileWithPermutationAndVectorResult) {
 
   // Create a computation for the body.
   // Add 1 to the iteration variable permute the weights.
-  Computation body;
+  XlaComputation body;
   {
-    ComputationBuilder builder(client_, "body");
+    XlaBuilder builder("body");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto iteration = builder.GetTupleElement(prev, 0);
     auto w1 = builder.GetTupleElement(prev, 1);
     auto w2 = builder.GetTupleElement(prev, 2);
     auto w3 = builder.GetTupleElement(prev, 3);
-    auto result = builder.Tuple(
+    builder.Tuple(
         {builder.Add(iteration, builder.ConstantR0<int32>(1)), w3, w1, w2});
     body = builder.Build().ConsumeValueOrDie();
   }
 
   // Create a While node with computations for the condition and the body.
-  ComputationBuilder builder(client_, "while");
+  XlaBuilder builder("while");
   auto init = builder.Tuple(
       {builder.ConstantR0<int32>(0), builder.ConstantR1<float>(3, 1.f),
        builder.ConstantR1<float>(3, 2.f), builder.ConstantR1<float>(3, 3.f)});
@@ -455,7 +454,7 @@ TEST_F(WhileTest, WhileWithPermutationAndVectorResult) {
   auto result = builder.Add(add12, builder.GetTupleElement(xla_while, 3));
   VLOG(2) << "result = "
           << ShapeUtil::HumanString(
-                 *builder.GetShape(result).ConsumeValueOrDie());
+                 builder.GetShape(result).ConsumeValueOrDie());
   std::vector<float> expected = {6.f, 6.f, 6.f};
   ComputeAndCompareR1<float>(&builder, expected, {}, ErrorSpec(0.0001));
 }
@@ -474,9 +473,9 @@ TEST_F(WhileTest, WhileWithTupleResult) {
 
   // Create a computation for the condition.
   // Repeat for 5 iterations.
-  Computation condition;
+  XlaComputation condition;
   {
-    ComputationBuilder builder(client_, "condition");
+    XlaBuilder builder("condition");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto iteration = builder.GetTupleElement(prev, 0);
     builder.Gt(builder.ConstantR0<int32>(5), iteration);
@@ -486,26 +485,27 @@ TEST_F(WhileTest, WhileWithTupleResult) {
   // Create a computation for the body.
   // Add 1 to the iteration variable and add a constant vector of 1.0f to
   // the weight variable, both of which are tuple elements.
-  Computation body;
+  XlaComputation body;
   {
-    ComputationBuilder builder(client_, "body");
+    XlaBuilder builder("body");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto iteration = builder.GetTupleElement(prev, 0);
     auto weights = builder.GetTupleElement(prev, 1);
     auto input = builder.ConstantR1<float>(10, 1.f);
     auto new_weights = builder.Add(weights, input);
-    auto result = builder.Tuple(
+    builder.Tuple(
         {builder.Add(iteration, builder.ConstantR0<int32>(1)), new_weights});
     body = builder.Build().ConsumeValueOrDie();
   }
 
   // Create a While node with computations for the condition and the body.
-  ComputationBuilder builder(client_, "while");
+  XlaBuilder builder("while");
   auto init = builder.Tuple(
       {builder.ConstantR0<int32>(0), builder.ConstantR1<float>(10, 0.f)});
   auto result = builder.While(condition, body, init);
-  VLOG(2) << "while = " << ShapeUtil::HumanString(
-                               *builder.GetShape(result).ConsumeValueOrDie());
+  VLOG(2) << "while = "
+          << ShapeUtil::HumanString(
+                 builder.GetShape(result).ConsumeValueOrDie());
 
   auto expected_counter = Literal::CreateR0<int32>(5);
   auto expected_data = Literal::CreateR1<float>(
@@ -523,9 +523,9 @@ TEST_F(WhileTest, WhileWithPredicateTupleResult) {
 
   // Create a computation for the condition.
   // Repeat for 5 iterations.
-  Computation condition;
+  XlaComputation condition;
   {
-    ComputationBuilder builder(client_, "condition");
+    XlaBuilder builder("condition");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto iteration = builder.GetTupleElement(prev, 0);
     builder.Gt(builder.ConstantR0<int32>(5), iteration);
@@ -534,27 +534,27 @@ TEST_F(WhileTest, WhileWithPredicateTupleResult) {
 
   // Create a computation for the body.
   // Add 1 to the iteration variable and or the predicate with true
-  Computation body;
+  XlaComputation body;
   {
-    ComputationBuilder builder(client_, "body");
+    XlaBuilder builder("body");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto iteration = builder.GetTupleElement(prev, 0);
     auto pred = builder.GetTupleElement(prev, 1);
     auto new_pred = builder.Or(pred, builder.ConstantR0<bool>(true));
-    auto result = builder.Tuple(
+    builder.Tuple(
         {builder.Add(iteration, builder.ConstantR0<int32>(1)), new_pred});
     body = builder.Build().ConsumeValueOrDie();
   }
 
   // Create a While node with computations for the condition and the body.
-  ComputationBuilder builder(client_, "while");
+  XlaBuilder builder("while");
   auto init = builder.Tuple({builder.ConstantR0<int32>(0),
                              builder.Ne(builder.ConstantR0<bool>(false),
                                         builder.ConstantR0<bool>(true))});
   auto result = builder.While(condition, body, init);
   VLOG(2) << "while = "
           << ShapeUtil::HumanString(
-                 *builder.GetShape(result).ConsumeValueOrDie());
+                 builder.GetShape(result).ConsumeValueOrDie());
 
   auto expected_counter = Literal::CreateR0<int32>(5);
   auto expected_predicate = Literal::CreateR0<bool>(true);
@@ -570,9 +570,9 @@ TEST_F(WhileTest, WhileWithTupleConstantScalarResult) {
 
   // Create a computation for the condition.
   // Repeat for 5 iterations.
-  Computation condition;
+  XlaComputation condition;
   {
-    ComputationBuilder builder(client_, "condition");
+    XlaBuilder builder("condition");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto iteration = builder.GetTupleElement(prev, 0);
     builder.Gt(builder.ConstantR0<int32>(5), iteration);
@@ -582,25 +582,24 @@ TEST_F(WhileTest, WhileWithTupleConstantScalarResult) {
   // Create a computation for the body.
   // Add 1 to the iteration variable and set the other tuple element to a
   // constant.
-  Computation body;
+  XlaComputation body;
   {
-    ComputationBuilder builder(client_, "body");
+    XlaBuilder builder("body");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto iteration = builder.GetTupleElement(prev, 0);
-    auto result =
-        builder.Tuple({builder.Add(iteration, builder.ConstantR0<int32>(1)),
-                       builder.ConstantR0<int32>(7)});
+    builder.Tuple({builder.Add(iteration, builder.ConstantR0<int32>(1)),
+                   builder.ConstantR0<int32>(7)});
     body = builder.Build().ConsumeValueOrDie();
   }
 
   // Create a While node with computations for the condition and the body.
-  ComputationBuilder builder(client_, "while");
+  XlaBuilder builder("while");
   auto init = builder.Tuple(
       {builder.ConstantR0<int32>(0), builder.ConstantR0<int32>(7)});
   auto result = builder.While(condition, body, init);
   VLOG(2) << "while = "
           << ShapeUtil::HumanString(
-                 *builder.GetShape(result).ConsumeValueOrDie());
+                 builder.GetShape(result).ConsumeValueOrDie());
 
   auto expected_counter = Literal::CreateR0<int32>(5);
   auto expected_data = Literal::CreateR0<int32>(7);
@@ -631,20 +630,20 @@ TEST_F(WhileTest, TwoWhileWithTupleResult) {
 
   // Create a computation for the condition.
   // Repeat for 5 iterations.
-  Computation condition;
+  XlaComputation condition;
   const int c1 = 5;
   {
-    ComputationBuilder builder(client_, "condition");
+    XlaBuilder builder("condition");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto iteration = builder.GetTupleElement(prev, 0);
     builder.Lt(iteration, builder.ConstantR0<int32>(c1));
     TF_ASSERT_OK_AND_ASSIGN(condition, builder.Build());
   }
 
-  Computation condition2;
+  XlaComputation condition2;
   const int c2 = 7;
   {
-    ComputationBuilder builder(client_, "condition2");
+    XlaBuilder builder("condition2");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto iteration = builder.GetTupleElement(prev, 0);
     builder.Lt(iteration, builder.ConstantR0<int32>(c2));
@@ -654,34 +653,34 @@ TEST_F(WhileTest, TwoWhileWithTupleResult) {
   // Create a computation for the body.
   // Add 1 to the iteration variable and add a constant vector of 1.0f to
   // the weight variable, both of which are tuple elements.
-  Computation body;
+  XlaComputation body;
   {
-    ComputationBuilder builder(client_, "body");
+    XlaBuilder builder("body");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto iteration = builder.GetTupleElement(prev, 0);
     auto weights = builder.GetTupleElement(prev, 1);
     auto input = builder.ConstantR1<float>(10, 1.f);
     auto new_weights = builder.Add(weights, input);
-    auto result = builder.Tuple(
+    builder.Tuple(
         {builder.Add(iteration, builder.ConstantR0<int32>(1)), new_weights});
     TF_ASSERT_OK_AND_ASSIGN(body, builder.Build());
   }
 
-  Computation body2;
+  XlaComputation body2;
   {
-    ComputationBuilder builder(client_, "body");
+    XlaBuilder builder("body");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto iteration = builder.GetTupleElement(prev, 0);
     auto weights = builder.GetTupleElement(prev, 1);
     auto input = builder.ConstantR1<float>(10, 1.f);
     auto new_weights = builder.Add(weights, input);
-    auto result = builder.Tuple(
+    builder.Tuple(
         {builder.Add(iteration, builder.ConstantR0<int32>(1)), new_weights});
     TF_ASSERT_OK_AND_ASSIGN(body2, builder.Build());
   }
 
   // Create a While node with computations for the condition and the body.
-  ComputationBuilder builder(client_, "while");
+  XlaBuilder builder("while");
   auto init = builder.Tuple(
       {builder.ConstantR0<int32>(0), builder.ConstantR1<float>(10, 0.f)});
   auto while1 = builder.While(condition, body, init);
@@ -692,11 +691,11 @@ TEST_F(WhileTest, TwoWhileWithTupleResult) {
   auto while_result2 = builder.GetTupleElement(while2, 1);
   VLOG(2) << "while_result2 = "
           << ShapeUtil::HumanString(
-                 *builder.GetShape(while_result2).ConsumeValueOrDie());
+                 builder.GetShape(while_result2).ConsumeValueOrDie());
   auto result = builder.Add(while_result1, while_result2);
   VLOG(2) << "result = "
           << ShapeUtil::HumanString(
-                 *builder.GetShape(result).ConsumeValueOrDie());
+                 builder.GetShape(result).ConsumeValueOrDie());
   const float sum = c1 + c2;
   std::vector<float> expected(10, sum);
   ComputeAndCompareR1<float>(&builder, expected, {}, ErrorSpec(0.0001));
@@ -710,20 +709,20 @@ TEST_F(WhileTest, TwoWhileLoopsAndSharedBody) {
 
   // Create a computation for the condition.
   // Repeat for 5 iterations.
-  Computation condition;
+  XlaComputation condition;
   const int c1 = 5;
   {
-    ComputationBuilder builder(client_, "condition");
+    XlaBuilder builder("condition");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto iteration = builder.GetTupleElement(prev, 0);
     builder.Lt(iteration, builder.ConstantR0<int32>(c1));
     TF_ASSERT_OK_AND_ASSIGN(condition, builder.Build());
   }
 
-  Computation condition2;
+  XlaComputation condition2;
   const int c2 = 7;
   {
-    ComputationBuilder builder(client_, "condition2");
+    XlaBuilder builder("condition2");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto iteration = builder.GetTupleElement(prev, 0);
     builder.Lt(iteration, builder.ConstantR0<int32>(c2));
@@ -733,21 +732,21 @@ TEST_F(WhileTest, TwoWhileLoopsAndSharedBody) {
   // Create a computation for the body.
   // Add 1 to the iteration variable and add a constant vector of 1.0f to
   // the weight variable, both of which are tuple elements.
-  Computation body;
+  XlaComputation body;
   {
-    ComputationBuilder builder(client_, "body");
+    XlaBuilder builder("body");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto iteration = builder.GetTupleElement(prev, 0);
     auto weights = builder.GetTupleElement(prev, 1);
     auto input = builder.ConstantR1<float>(10, 1.f);
     auto new_weights = builder.Add(weights, input);
-    auto result = builder.Tuple(
+    builder.Tuple(
         {builder.Add(iteration, builder.ConstantR0<int32>(1)), new_weights});
     TF_ASSERT_OK_AND_ASSIGN(body, builder.Build());
   }
 
   // Create a While node with computations for the condition and the body.
-  ComputationBuilder builder(client_, "while");
+  XlaBuilder builder("while");
   auto init = builder.Tuple(
       {builder.ConstantR0<int32>(0), builder.ConstantR1<float>(10, 0.f)});
   auto while1 = builder.While(condition, body, init);
@@ -758,11 +757,11 @@ TEST_F(WhileTest, TwoWhileLoopsAndSharedBody) {
   auto while_result2 = builder.GetTupleElement(while2, 1);
   VLOG(2) << "while_result2 = "
           << ShapeUtil::HumanString(
-                 *builder.GetShape(while_result2).ConsumeValueOrDie());
+                 builder.GetShape(while_result2).ConsumeValueOrDie());
   auto result = builder.Add(while_result1, while_result2);
   VLOG(2) << "result = "
           << ShapeUtil::HumanString(
-                 *builder.GetShape(result).ConsumeValueOrDie());
+                 builder.GetShape(result).ConsumeValueOrDie());
   const float sum = c1 + c2;
   std::vector<float> expected(10, sum);
   ComputeAndCompareR1<float>(&builder, expected, {}, ErrorSpec(0.0001));
@@ -777,20 +776,20 @@ TEST_F(WhileTest, DISABLED_ON_GPU(WhileLoopsWithSharedBodyAndInit)) {
 
   // Create a computation for the condition.
   // Repeat for 5 iterations.
-  Computation condition;
+  XlaComputation condition;
   const int c1 = 5;
   {
-    ComputationBuilder builder(client_, "condition");
+    XlaBuilder builder("condition");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto iteration = builder.GetTupleElement(prev, 0);
     builder.Lt(iteration, builder.ConstantR0<int32>(c1));
     TF_ASSERT_OK_AND_ASSIGN(condition, builder.Build());
   }
 
-  Computation condition2;
+  XlaComputation condition2;
   const int c2 = 7;
   {
-    ComputationBuilder builder(client_, "condition2");
+    XlaBuilder builder("condition2");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto iteration = builder.GetTupleElement(prev, 0);
     builder.Lt(iteration, builder.ConstantR0<int32>(c2));
@@ -800,21 +799,21 @@ TEST_F(WhileTest, DISABLED_ON_GPU(WhileLoopsWithSharedBodyAndInit)) {
   // Create a computation for the body.
   // Add 1 to the iteration variable and add a constant vector of 1.0f to
   // the weight variable, both of which are tuple elements.
-  Computation body;
+  XlaComputation body;
   {
-    ComputationBuilder builder(client_, "body");
+    XlaBuilder builder("body");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto iteration = builder.GetTupleElement(prev, 0);
     auto weights = builder.GetTupleElement(prev, 1);
     auto input = builder.ConstantR1<float>(10, 1.f);
     auto new_weights = builder.Add(weights, input);
-    auto result = builder.Tuple(
+    builder.Tuple(
         {builder.Add(iteration, builder.ConstantR0<int32>(1)), new_weights});
     TF_ASSERT_OK_AND_ASSIGN(body, builder.Build());
   }
 
   // Create a While node with computations for the condition and the body.
-  ComputationBuilder builder(client_, "while");
+  XlaBuilder builder("while");
   auto init = builder.Tuple(
       {builder.ConstantR0<int32>(0), builder.ConstantR1<float>(10, 0.f)});
   auto while1 = builder.While(condition, body, init);
@@ -824,11 +823,11 @@ TEST_F(WhileTest, DISABLED_ON_GPU(WhileLoopsWithSharedBodyAndInit)) {
   auto while_result2 = builder.GetTupleElement(while2, 1);
   VLOG(2) << "while_result2 = "
           << ShapeUtil::HumanString(
-                 *builder.GetShape(while_result2).ConsumeValueOrDie());
+                 builder.GetShape(while_result2).ConsumeValueOrDie());
   auto result = builder.Add(while_result1, while_result2);
   VLOG(2) << "result = "
           << ShapeUtil::HumanString(
-                 *builder.GetShape(result).ConsumeValueOrDie());
+                 builder.GetShape(result).ConsumeValueOrDie());
   const float sum = c1 + c2;
   std::vector<float> expected(10, sum);
   ComputeAndCompareR1<float>(&builder, expected, {}, ErrorSpec(0.0001));
@@ -844,9 +843,9 @@ XLA_TEST_F(WhileTest, WhileWithDynamicUpdateSlice) {
 
   // Create a computation for the condition.
   // Repeat for 5 iterations.
-  Computation condition;
+  XlaComputation condition;
   {
-    ComputationBuilder builder(client_, "condition");
+    XlaBuilder builder("condition");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto iteration = builder.GetTupleElement(prev, 0);
     builder.Gt(builder.ConstantR0<int32>(5), iteration);
@@ -856,9 +855,9 @@ XLA_TEST_F(WhileTest, WhileWithDynamicUpdateSlice) {
   // Create a computation for the body.
   // Add 1 to the iteration variable and add a constant vector of 1.0f to
   // the weight variable, both of which are tuple elements.
-  Computation body;
+  XlaComputation body;
   {
-    ComputationBuilder builder(client_, "body");
+    XlaBuilder builder("body");
     auto prev = builder.Parameter(0, result_shape, "prev");
     // TupleElement 0
     auto iteration = builder.GetTupleElement(prev, 0);
@@ -873,18 +872,18 @@ XLA_TEST_F(WhileTest, WhileWithDynamicUpdateSlice) {
     // UpdateSlice.
     auto out1 = builder.DynamicUpdateSlice(input, update, starts);
 
-    auto result = builder.Tuple({out0, out1});
+    builder.Tuple({out0, out1});
     body = builder.Build().ConsumeValueOrDie();
   }
 
   // Create a While node with computations for the condition and the body.
-  ComputationBuilder builder(client_, "while");
+  XlaBuilder builder("while");
   auto init = builder.Tuple(
       {builder.ConstantR0<int32>(0), builder.ConstantR1<float>(10, 0.f)});
   auto result = builder.While(condition, body, init);
   VLOG(2) << "while = "
           << ShapeUtil::HumanString(
-                 *builder.GetShape(result).ConsumeValueOrDie());
+                 builder.GetShape(result).ConsumeValueOrDie());
 
   auto expected_counter = Literal::CreateR0<int32>(5);
   auto expected_data = Literal::CreateR1<float>(
@@ -915,18 +914,18 @@ TEST_F(WhileTest, DISABLED_ON_INTERPRETER(WhileWithPrngScalarResult)) {
 
   // Create a computation for the condition: repeat for count iterations.
   auto build_condition = [this, v6s32](int count) {
-    ComputationBuilder builder(client_, TestName());
+    XlaBuilder builder(TestName());
     auto prev = builder.Reshape(
         builder.Slice(builder.Parameter(0, v6s32, "prev"), {0}, {1}, {1}), {0},
-          {});
+        {});
     builder.Gt(builder.ConstantR0<int32>(count), prev);
     return builder.Build().ConsumeValueOrDie();
   };
 
   // Create a computation for the body: add 1 to the result variable.
-  Computation body;
+  XlaComputation body;
   {
-    ComputationBuilder builder(client_, "body");
+    XlaBuilder builder("body");
     auto prev = builder.Parameter(0, v6s32, "prev");
     auto inc = builder.ConcatInDim(
         {builder.ConstantR1<int32>({1}),
@@ -934,16 +933,15 @@ TEST_F(WhileTest, DISABLED_ON_INTERPRETER(WhileWithPrngScalarResult)) {
                             builder.ConstantR0<int32>(100),
                             ShapeUtil::MakeShape(S32, {5}))},
         0);
-    auto result = builder.Add(inc, prev);
+    builder.Add(inc, prev);
     body = builder.Build().ConsumeValueOrDie();
   }
 
   // Create a While node with computations for the condition and the body.
   auto while_loop = [this, &body, build_condition](int count) {
-    ComputationBuilder builder(client_, TestName());
+    XlaBuilder builder(TestName());
     auto init = builder.ConstantR1<int32>({0, 0, 0, 0, 0, 0});
-    auto result = builder.While(build_condition(count), body, init);
-    auto shape = builder.GetShape(result).ConsumeValueOrDie();
+    builder.While(build_condition(count), body, init);
     return builder.Build();
   };
 
@@ -1107,9 +1105,9 @@ XLA_TEST_F(WhileTest, NestedWhileWithScalarResult) {
   auto inner_result_shape = ShapeUtil::MakeTupleShape(
       {ShapeUtil::MakeShape(S32, {}), ShapeUtil::MakeShape(S32, {})});
 
-  Computation inner_condition;
+  XlaComputation inner_condition;
   {
-    ComputationBuilder builder(client_, "inner_condition");
+    XlaBuilder builder("inner_condition");
     auto params = builder.Parameter(0, inner_result_shape, "prev");
     auto i = builder.GetTupleElement(params, 0);
     builder.Lt(i, builder.ConstantR0<int32>(7));
@@ -1118,9 +1116,9 @@ XLA_TEST_F(WhileTest, NestedWhileWithScalarResult) {
 
   // Creates a computation for the outer loop condition:
   // repeat while result < 30.
-  Computation outer_condition;
+  XlaComputation outer_condition;
   {
-    ComputationBuilder builder(client_, "outer_condition");
+    XlaBuilder builder("outer_condition");
     auto prev = builder.Parameter(0, outer_result_shape, "prev");
     builder.Lt(prev, builder.ConstantR0<int32>(30));
     outer_condition = builder.Build().ConsumeValueOrDie();
@@ -1128,34 +1126,33 @@ XLA_TEST_F(WhileTest, NestedWhileWithScalarResult) {
 
   // Creates a computation for the inner loop body: add 1 to `i`, and add 2 to
   // `result`.
-  Computation inner_body;
+  XlaComputation inner_body;
   {
-    ComputationBuilder builder(client_, "inner_body");
+    XlaBuilder builder("inner_body");
     auto params = builder.Parameter(0, inner_result_shape, "prev");
     auto i = builder.GetTupleElement(params, 0);
     auto result = builder.GetTupleElement(params, 1);
     i = builder.Add(builder.ConstantR0<int32>(1), i);
     result = builder.Add(builder.ConstantR0<int32>(2), result);
-    auto output = builder.Tuple({i, result});
+    builder.Tuple({i, result});
     inner_body = builder.Build().ConsumeValueOrDie();
   }
 
   // Creates a computation for the outer loop: run the inner loop with i = 0.
-  Computation outer_body;
+  XlaComputation outer_body;
   {
-    ComputationBuilder builder(client_, "outer_body");
+    XlaBuilder builder("outer_body");
     auto prev = builder.Parameter(0, outer_result_shape, "prev");
     auto init = builder.Tuple({builder.ConstantR0<int32>(0), prev});
     auto result = builder.While(inner_condition, inner_body, init);
-    auto output = builder.GetTupleElement(result, 1);
+    builder.GetTupleElement(result, 1);
     outer_body = builder.Build().ConsumeValueOrDie();
   }
 
   // Create a While node with computations for the condition and the body.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto init = builder.ConstantR0<int32>(0);
-  auto result = builder.While(outer_condition, outer_body, init);
-  auto shape = builder.GetShape(result).ConsumeValueOrDie();
+  builder.While(outer_condition, outer_body, init);
 
   ComputeAndCompareR0<int32>(&builder, 42, {});
 }
@@ -1170,18 +1167,18 @@ TEST_F(WhileTest, DISABLED_ON_INTERPRETER(WhileWithCallInsideCondition)) {
   auto result_shape = ShapeUtil::MakeShape(S32, {});
 
   // Create a computation for the condition: repeat for 5 iterations.
-  Computation condition_callee;
+  XlaComputation condition_callee;
   {
-    ComputationBuilder builder(client_, "condition_callee");
+    XlaBuilder builder("condition_callee");
     auto prev = builder.Parameter(0, result_shape, "prev");
     builder.Tuple({builder.Gt(builder.ConstantR0<int32>(5), prev)});
 
     condition_callee = builder.Build().ConsumeValueOrDie();
   }
 
-  Computation condition;
+  XlaComputation condition;
   {
-    ComputationBuilder builder(client_, "condition");
+    XlaBuilder builder("condition");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto result = builder.Call(condition_callee, {prev});
     builder.GetTupleElement(result, 0);
@@ -1189,20 +1186,19 @@ TEST_F(WhileTest, DISABLED_ON_INTERPRETER(WhileWithCallInsideCondition)) {
   }
 
   // Create a computation for the body: add 1 to the result variable.
-  Computation body;
+  XlaComputation body;
   {
-    ComputationBuilder builder(client_, "body");
+    XlaBuilder builder("body");
     auto prev = builder.Parameter(0, result_shape, "prev");
     auto input = builder.ConstantR0<int32>(1);
-    auto result = builder.Add(input, prev);
+    builder.Add(input, prev);
     body = builder.Build().ConsumeValueOrDie();
   }
 
   // Create a While node with computations for the condition and the body.
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto init = builder.ConstantR0<int32>(0);
-  auto result = builder.While(condition, body, init);
-  auto shape = builder.GetShape(result).ConsumeValueOrDie();
+  builder.While(condition, body, init);
 
   ComputeAndCompareR0<int32>(&builder, 5, {});
 }
@@ -1214,28 +1210,28 @@ TEST_F(WhileTest, WhileWithLoopInvariantOperation) {
       {scalar_s32, matrix_shape, matrix_shape, matrix_shape});
 
   // Create a computation for the condition: repeat for 5 iterations.
-  Computation condition;
+  XlaComputation condition;
   {
-    ComputationBuilder builder(client_, "condition");
+    XlaBuilder builder("condition");
     auto state = builder.Parameter(0, while_shape, "state");
     builder.Gt(builder.ConstantR0<int32>(5), builder.GetTupleElement(state, 0));
     TF_ASSERT_OK_AND_ASSIGN(condition, builder.Build());
   }
 
-  Computation body;
+  XlaComputation body;
   {
-    ComputationBuilder builder(client_, "body");
+    XlaBuilder builder("body");
     auto state = builder.Parameter(0, while_shape, "state");
     auto indvar = builder.GetTupleElement(state, 0);
     auto input_0 = builder.GetTupleElement(state, 1);
     auto input_1 = builder.GetTupleElement(state, 2);
     auto output = builder.Tanh(builder.Dot(input_0, input_1));
     auto indvar_next = builder.Add(indvar, builder.ConstantR0<int32>(1));
-    auto tuple_result = builder.Tuple({indvar_next, input_0, input_1, output});
+    builder.Tuple({indvar_next, input_0, input_1, output});
     TF_ASSERT_OK_AND_ASSIGN(body, builder.Build());
   }
 
-  ComputationBuilder builder(client_, TestName());
+  XlaBuilder builder(TestName());
   auto matrix_input = builder.Parameter(0, matrix_shape, "matrix");
   auto init = builder.Tuple(
       {builder.ConstantR0<int32>(0), matrix_input, matrix_input, matrix_input});
@@ -1268,9 +1264,9 @@ void BM_WhileLoop(int num_iters) {
 
   // Create while condition computation with 'loop_limit'.
   const int32 loop_limit = 100;
-  Computation condition;
+  XlaComputation condition;
   {
-    ComputationBuilder builder(client, "condition");
+    XlaBuilder builder("condition");
     auto prev = builder.Parameter(0, loop_state_shape, "prev");
     auto iteration = builder.GetTupleElement(prev, 0);
     builder.Lt(iteration, builder.ConstantR0<int32>(loop_limit));
@@ -1278,9 +1274,9 @@ void BM_WhileLoop(int num_iters) {
   }
 
   // Create while body computation with unit loop increment.
-  Computation body;
+  XlaComputation body;
   {
-    ComputationBuilder builder(client, "body");
+    XlaBuilder builder("body");
     auto prev = builder.Parameter(0, loop_state_shape, "prev");
     // TupleElement 0
     auto iteration = builder.GetTupleElement(prev, 0);
@@ -1294,12 +1290,12 @@ void BM_WhileLoop(int num_iters) {
     auto starts = builder.ConstantR1<int32>({0, 0, 0});
     // UpdateSlice.
     auto out1 = builder.DynamicUpdateSlice(input, update, starts);
-    auto result = builder.Tuple({out0, out1});
+    builder.Tuple({out0, out1});
     body = builder.Build().ConsumeValueOrDie();
   }
 
   // Create a While instruction.
-  ComputationBuilder builder(client, "while");
+  XlaBuilder builder("while");
   auto zero = builder.ConstantR0<float>(0.0);
   auto input = builder.Broadcast(zero, {seq_len, 1024, 1024});
   auto init = builder.Tuple({builder.ConstantR0<int32>(0), input});
diff --git a/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc b/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc
index 24b9f37a8008b6f774634f2dbff9d3296ec0585b..ff3418a128eed82b730a6602d6e3faba4ad7be32 100644
--- a/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc
+++ b/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc
@@ -27,6 +27,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/tests/test_utils.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/gtl/flatmap.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/regexp.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/types.h"
@@ -294,7 +295,8 @@ XLA_TEST_F(HloProfileTest,
   auto while_body_profile_start =
       std::find_if(profile_output_lines.begin(), profile_output_lines.end(),
                    [](tensorflow::StringPiece s) {
-                     return s.starts_with("Execution profile for body");
+                     return tensorflow::str_util::StartsWith(
+                         s, "Execution profile for body");
                    });
 
   ASSERT_NE(while_body_profile_start, profile_output_lines.end());
diff --git a/tensorflow/compiler/xla/tests/xla_internal_test_main.cc b/tensorflow/compiler/xla/tests/xla_internal_test_main.cc
index 92b2b1ee778f8b0f8104e7d7ff27a5c11db59768..a9f2915b458b1816926de727b3da21982d06f6c0 100644
--- a/tensorflow/compiler/xla/tests/xla_internal_test_main.cc
+++ b/tensorflow/compiler/xla/tests/xla_internal_test_main.cc
@@ -12,9 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
+
 #include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
+#include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/test_benchmark.h"
 
 GTEST_API_ int main(int argc, char** argv) {
   std::vector<tensorflow::Flag> flag_list;
@@ -25,7 +29,38 @@ GTEST_API_ int main(int argc, char** argv) {
     return 2;
   }
 
+  // If the --benchmarks flag is passed in then only run the benchmarks, not the
+  // tests.
+  for (int i = 1; i < argc; i++) {
+    tensorflow::StringPiece arg(argv[i]);
+    if (arg == "--benchmarks" ||
+        tensorflow::str_util::StartsWith(arg, "--benchmarks=")) {
+      const char* pattern = nullptr;
+      if (tensorflow::str_util::StartsWith(arg, "--benchmarks=")) {
+        pattern = argv[i] + strlen("--benchmarks=");
+      } else {
+        // Handle flag of the form '--benchmarks foo' (no '=').
+        if (i + 1 >= argc ||
+            tensorflow::str_util::StartsWith(argv[i + 1], "--")) {
+          LOG(ERROR) << "--benchmarks flag requires an argument.";
+          return 2;
+        }
+        pattern = argv[i + 1];
+      }
+      // Unfortunately Google's internal benchmark infrastructure has a
+      // different API than Tensorflow's.
+#if defined(PLATFORM_GOOGLE)
+      base::SetFlag(&FLAGS_benchmarks, pattern);
+      RunSpecifiedBenchmarks();
+#else
+      tensorflow::testing::Benchmark::Run(pattern);
+#endif
+      return 0;
+    }
+  }
+
   testing::InitGoogleTest(&argc, argv);
+
   if (argc > 1) {
     LOG(ERROR) << "Unknown argument " << argv[1] << "\n" << usage;
     return 2;
diff --git a/tensorflow/compiler/xla/text_literal_reader.cc b/tensorflow/compiler/xla/text_literal_reader.cc
index 6fa4c48e11d1102367b21bc21d4734466495ef0e..44f874cd2ae8e6f65dc282b8675f195ec9c09415 100644
--- a/tensorflow/compiler/xla/text_literal_reader.cc
+++ b/tensorflow/compiler/xla/text_literal_reader.cc
@@ -38,7 +38,7 @@ namespace xla {
 
 StatusOr<std::unique_ptr<Literal>> TextLiteralReader::ReadPath(
     tensorflow::StringPiece path) {
-  CHECK(!path.ends_with(".gz"))
+  CHECK(!tensorflow::str_util::EndsWith(path, ".gz"))
       << "TextLiteralReader no longer supports reading .gz files";
   std::unique_ptr<tensorflow::RandomAccessFile> file;
   Status s =
@@ -115,7 +115,7 @@ StatusOr<std::unique_ptr<Literal>> TextLiteralReader::ReadAllLines() {
     tensorflow::StringPiece value_string = pieces[1];
     tensorflow::str_util::RemoveWhitespaceContext(&coordinates_string);
     tensorflow::str_util::RemoveWhitespaceContext(&value_string);
-    if (!coordinates_string.Consume("(")) {
+    if (!tensorflow::str_util::ConsumePrefix(&coordinates_string, "(")) {
       return InvalidArgument(
           "expected '(' at the beginning of coordinates: \"%s\"", line.c_str());
     }
diff --git a/tensorflow/compiler/xla/tools/BUILD b/tensorflow/compiler/xla/tools/BUILD
index 2e55f609d17bf42e410f97c51c7b9c6c0e85576d..0bc4045a5490319994b6cf24daf99fe856167507 100644
--- a/tensorflow/compiler/xla/tools/BUILD
+++ b/tensorflow/compiler/xla/tools/BUILD
@@ -223,17 +223,3 @@ tf_cc_binary(
         "//tensorflow/core:lib",
     ],
 )
-
-# -----------------------------------------------------------------------------
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/compiler/xla/tools/parser/BUILD b/tensorflow/compiler/xla/tools/parser/BUILD
index 97aacf6b39f83978e732060817cd93ede81ca782..0fa4b98d0a41a1e7c681bb2302da3b752315867b 100644
--- a/tensorflow/compiler/xla/tools/parser/BUILD
+++ b/tensorflow/compiler/xla/tools/parser/BUILD
@@ -70,17 +70,3 @@ tf_cc_test(
         "//tensorflow/core:test_main",
     ],
 )
-
-# -----------------------------------------------------------------------------
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/compiler/xla/tools/parser/hlo_parser.cc b/tensorflow/compiler/xla/tools/parser/hlo_parser.cc
index e60a5a4919f2207939821e787c3c59a08ff3ba4e..b2f122982adf750106f034e7e786367720ebafcf 100644
--- a/tensorflow/compiler/xla/tools/parser/hlo_parser.cc
+++ b/tensorflow/compiler/xla/tools/parser/hlo_parser.cc
@@ -724,6 +724,15 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder,
           shape, operands[0], *broadcast_dimensions));
       break;
     }
+    case HloOpcode::kBroadcastDimOne: {
+      if (!ParseOperands(&operands, /*expected_size=*/1) ||
+          !ParseAttributes(attrs)) {
+        return false;
+      }
+      instruction = builder->AddInstruction(
+          HloInstruction::CreateBroadcastDimOne(shape, operands[0]));
+      break;
+    }
     case HloOpcode::kConcatenate: {
       optional<std::vector<int64>> dimensions;
       attrs["dimensions"] = {/*required=*/true, AttrTy::kBracedInt64List,
diff --git a/tensorflow/compiler/xla/tools/parser/hlo_parser_test.cc b/tensorflow/compiler/xla/tools/parser/hlo_parser_test.cc
index 863081d654390440aa6506bab4576b3cc5c1cbd1..57684b58346166f7e3ef9576f6cd8f70ab9dc389 100644
--- a/tensorflow/compiler/xla/tools/parser/hlo_parser_test.cc
+++ b/tensorflow/compiler/xla/tools/parser/hlo_parser_test.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <string>
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace xla {
@@ -56,6 +57,18 @@ ENTRY %axpy.v5 (alpha: f32[], x: f32[2,4], y: f32[2,4]) -> f32[2,4] {
   ROOT %add = f32[2,4]{1,0} add(f32[2,4]{1,0} %multiply, f32[2,4]{1,0} %y)
 }
 
+)"
+},
+// broadcast size-one dimensions
+{
+"BroadcastDimOne",
+R"(HloModule broadcast_dim_one_module
+
+ENTRY %broadcast-dim-one () -> f32[2,2] {
+  %constant = f32[1,2]{1,0} constant(f32[1,2] { { 1.1, 2.2 } })
+  ROOT %broadcast-dim-one = f32[2,2]{1,0} broadcast-dim-one(f32[1,2]{1,0} %constant)
+}
+
 )"
 },
 // pred constant
@@ -894,7 +907,7 @@ class HloParserTest : public ::testing::Test,
                       public ::testing::WithParamInterface<TestData> {
  protected:
   static void ExpectHasSubstr(StringPiece s, StringPiece expected) {
-    EXPECT_TRUE(StringPiece(s).contains(expected))
+    EXPECT_TRUE(tensorflow::str_util::StrContains(s, expected))
         << "'" << s << "' does not contain '" << expected << "'";
   }
 
diff --git a/tensorflow/compiler/xla/util.cc b/tensorflow/compiler/xla/util.cc
index dc4f7a1cb436183f5acfa360fb092795258b6a75..e43498e381b8e63543e2ddda08ca7c0df91817e4 100644
--- a/tensorflow/compiler/xla/util.cc
+++ b/tensorflow/compiler/xla/util.cc
@@ -243,8 +243,8 @@ string HumanReadableNumOps(double flops, double nanoseconds,
       static_cast<int64>(nano_flops * 1e9));
   tensorflow::StringPiece sp(throughput);
   // Use the more common "G(FLOPS)", rather than "B(FLOPS)"
-  if (sp.ends_with("B") ||  // Ends in 'B', ignoring case
-      sp.ends_with("b")) {
+  if (tensorflow::str_util::EndsWith(sp, "B") ||  // Ends in 'B', ignoring case
+      tensorflow::str_util::EndsWith(sp, "b")) {
     *throughput.rbegin() = 'G';
   }
   throughput += tensorflow::strings::StrCat(op_prefix, "OP/s");
diff --git a/tensorflow/compiler/xla/xla.bzl b/tensorflow/compiler/xla/xla.bzl
index 6b136d333bbf079efd314833f46fe3b98743fbac..1439f1bcc5cec39203a7cb4b1f8604e7349382c6 100644
--- a/tensorflow/compiler/xla/xla.bzl
+++ b/tensorflow/compiler/xla/xla.bzl
@@ -6,7 +6,9 @@ load("//tensorflow/core:platform/default/build_config_root.bzl",
      "if_static")
 
 # xla_proto_library() is a convenience wrapper around cc_proto_library.
-def xla_proto_library(name, srcs=[], deps=[], visibility=None, testonly=0):
+def xla_proto_library(name, srcs=[], deps=[], visibility=None, testonly=0, **kwargs):
+  if kwargs.get('use_grpc_plugin'):
+    kwargs['use_grpc_namespace'] = True
   cc_proto_library(name=name,
                    srcs=srcs,
                    deps=deps,
@@ -16,6 +18,13 @@ def xla_proto_library(name, srcs=[], deps=[], visibility=None, testonly=0):
                    ),
                    protoc="@protobuf_archive//:protoc",
                    testonly=testonly,
-                   visibility=visibility,)
+                   visibility=visibility,
+                   **kwargs)
+
+def xla_py_grpc_library(**kwargs):
+  # Note: we don't currently define any special targets for Python GRPC in OSS.
+  _ignore = kwargs
+  pass
+
 
 ORC_JIT_MEMORY_MAPPER_TARGETS = []
diff --git a/tensorflow/compiler/xla/xla.proto b/tensorflow/compiler/xla/xla.proto
index edf1b07af82b5d43fe67c6efdabdb0a9b4b1edea..f619b8dc24038af64a27fc0565c74447ca9d09cf 100644
--- a/tensorflow/compiler/xla/xla.proto
+++ b/tensorflow/compiler/xla/xla.proto
@@ -189,6 +189,12 @@ message DebugOptions {
   // directory.
   string xla_dump_per_pass_hlo_proto_to = 96;
 
+  // Generate calls to MKL-DNN in the CPU backend.
+  bool xla_cpu_use_mkl_dnn = 97;
+
+  // Maximum kernel unroll factor for the GPU backend.
+  int32 xla_gpu_max_kernel_unroll_factor = 98;
+
   // Extra options to pass to the compilation backend; specific interpretation
   // of these values is left to the backend.
   map<string, string> xla_backend_extra_options = 500;
@@ -299,6 +305,11 @@ message ComputationStatsRequest {
   DebugOptions debug_options = 2;
 }
 
+message ComputationGraphStatsRequest {
+  HloModuleProto computation = 1;
+  DebugOptions debug_options = 2;
+}
+
 message ComputationStatsResponse {
   ComputationStats stats = 1;
 }
@@ -355,6 +366,10 @@ message ExecuteParallelRequest {
   repeated ExecuteRequest requests = 1;
 }
 
+message ExecuteGraphParallelRequest {
+  repeated ExecuteGraphRequest requests = 1;
+}
+
 message ExecuteResponse {
   GlobalDataHandle output = 1;
   ExecutionProfile profile = 2;
@@ -405,6 +420,11 @@ message ComputeConstantRequest {
   repeated LiteralProto parameters = 4;
 }
 
+message ComputeConstantGraphRequest {
+  HloModuleProto computation = 1;
+  Layout output_layout = 2;
+}
+
 message ComputeConstantResponse {
   // A LiteralProto is returned directly for this request, instead of a
   // ComputationDataHandle.
diff --git a/tensorflow/contrib/BUILD b/tensorflow/contrib/BUILD
index fb81b50fe8e29a2e4cb7d127fd4b2b6778da763c..9bef0d8b61ebe8ae65c991c7e414f8f6e58f10d5 100644
--- a/tensorflow/contrib/BUILD
+++ b/tensorflow/contrib/BUILD
@@ -8,6 +8,7 @@ package(default_visibility = ["//tensorflow:__subpackages__"])
 load("//third_party/mpi:mpi.bzl", "if_mpi")
 load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda")
 load("@local_config_tensorrt//:build_defs.bzl", "if_tensorrt")
+load("//tensorflow:tensorflow.bzl", "if_not_windows")
 
 py_library(
     name = "contrib_py",
@@ -33,13 +34,13 @@ py_library(
         "//tensorflow/contrib/crf:crf_py",
         "//tensorflow/contrib/cudnn_rnn:cudnn_rnn_py",
         "//tensorflow/contrib/data",
+        "//tensorflow/contrib/distribute:distribute",
         "//tensorflow/contrib/deprecated:deprecated_py",
         "//tensorflow/contrib/distributions:distributions_py",
         "//tensorflow/contrib/eager/python:tfe",
         "//tensorflow/contrib/estimator:estimator_py",
         "//tensorflow/contrib/factorization:factorization_py",
         "//tensorflow/contrib/feature_column:feature_column_py",
-        "//tensorflow/contrib/ffmpeg:ffmpeg_ops_py",
         "//tensorflow/contrib/framework:framework_py",
         "//tensorflow/contrib/fused_conv:fused_conv_py",
         "//tensorflow/contrib/gan",
@@ -62,7 +63,6 @@ py_library(
         "//tensorflow/contrib/linalg:linalg_py",
         "//tensorflow/contrib/linear_optimizer:sdca_estimator_py",
         "//tensorflow/contrib/linear_optimizer:sdca_ops_py",
-        "//tensorflow/contrib/lite/python:lite",
         "//tensorflow/contrib/lookup:lookup_py",
         "//tensorflow/contrib/losses:losses_py",
         "//tensorflow/contrib/losses:metric_learning_py",
@@ -74,12 +74,14 @@ py_library(
         "//tensorflow/contrib/nearest_neighbor:nearest_neighbor_py",
         "//tensorflow/contrib/nn:nn_py",
         "//tensorflow/contrib/opt:opt_py",
+        "//tensorflow/contrib/optimizer_v2:optimizer_v2_py",
         "//tensorflow/contrib/periodic_resample:init_py",
         "//tensorflow/contrib/predictor",
         "//tensorflow/contrib/quantization:quantization_py",
         "//tensorflow/contrib/quantize:quantize_graph",
         "//tensorflow/contrib/autograph",
         "//tensorflow/contrib/receptive_field:receptive_field_py",
+        "//tensorflow/contrib/recurrent:recurrent_py",
         "//tensorflow/contrib/reduce_slice_ops:reduce_slice_ops_py",
         "//tensorflow/contrib/remote_fused_graph/pylib:remote_fused_graph_ops_py",
         "//tensorflow/contrib/resampler:resampler_py",
@@ -115,7 +117,10 @@ py_library(
             "//tensorflow/contrib/kafka",
         ],
         "//conditions:default": [],
-    }),
+    }) + if_not_windows([
+        "//tensorflow/contrib/ffmpeg:ffmpeg_ops_py",
+        "//tensorflow/contrib/lite/python:lite",  # unix dependency, need to fix code
+    ]),
 )
 
 cc_library(
@@ -174,15 +179,3 @@ cc_library(
         "//conditions:default": [],
     }),
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/__init__.py b/tensorflow/contrib/__init__.py
index 4f6f539027b040de7554d09fe9118ff97aa006f8..aaddb06fa0c22d6162815dc2dbf24e6dc79c0df8 100644
--- a/tensorflow/contrib/__init__.py
+++ b/tensorflow/contrib/__init__.py
@@ -1,3 +1,4 @@
+# pylint: disable=g-import-not-at-top
 # Copyright 2015 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -18,6 +19,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import os
+
 # Add projects here, they will show up under tf.contrib.
 from tensorflow.contrib import batching
 from tensorflow.contrib import bayesflow
@@ -30,6 +33,7 @@ from tensorflow.contrib import crf
 from tensorflow.contrib import cudnn_rnn
 from tensorflow.contrib import data
 from tensorflow.contrib import deprecated
+from tensorflow.contrib import distribute
 from tensorflow.contrib import distributions
 from tensorflow.contrib import estimator
 from tensorflow.contrib import factorization
@@ -62,6 +66,7 @@ from tensorflow.contrib import periodic_resample
 from tensorflow.contrib import predictor
 from tensorflow.contrib import quantization
 from tensorflow.contrib import quantize
+from tensorflow.contrib import recurrent
 from tensorflow.contrib import reduce_slice_ops
 from tensorflow.contrib import resampler
 from tensorflow.contrib import rnn
@@ -83,7 +88,9 @@ from tensorflow.contrib import tpu
 from tensorflow.contrib import training
 from tensorflow.contrib import util
 from tensorflow.contrib.eager.python import tfe as eager
-from tensorflow.contrib.lite.python import lite
+if os.name != "nt":
+  from tensorflow.contrib.lite.python import lite
+from tensorflow.contrib.optimizer_v2 import optimizer_v2_symbols as optimizer_v2
 from tensorflow.contrib.receptive_field import receptive_field_api as receptive_field
 from tensorflow.contrib.remote_fused_graph import pylib as remote_fused_graph
 from tensorflow.contrib.specs import python as specs
@@ -92,6 +99,7 @@ from tensorflow.contrib.summary import summary
 from tensorflow.python.util.lazy_loader import LazyLoader
 ffmpeg = LazyLoader("ffmpeg", globals(),
                     "tensorflow.contrib.ffmpeg")
+del os
 del LazyLoader
 
 del absolute_import
diff --git a/tensorflow/contrib/all_reduce/BUILD b/tensorflow/contrib/all_reduce/BUILD
index 8dff93b4f825277dcf0a64aa3b96bd809d36e1e9..62d1b1cf079d04d50e4899cfd9ba1d405ee1efb9 100644
--- a/tensorflow/contrib/all_reduce/BUILD
+++ b/tensorflow/contrib/all_reduce/BUILD
@@ -45,16 +45,3 @@ tf_py_test(
         "//tensorflow/python:state_ops",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-            "g3doc/sitemap.md",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/all_reduce/python/all_reduce.py b/tensorflow/contrib/all_reduce/python/all_reduce.py
index 6658f0d9c13f6db17b25354cde2593d57f104f17..8add2aacff1d64f1617cd24167c4c6c6706044da 100644
--- a/tensorflow/contrib/all_reduce/python/all_reduce.py
+++ b/tensorflow/contrib/all_reduce/python/all_reduce.py
@@ -38,16 +38,15 @@ def _flatten_tensors(tensors):
     shape: the original shape of each element of input tensors
 
   Raises:
-    ValueError: tensors are empty or non-isomorphic.
+    ValueError: tensors are empty or non-isomorphic or have unknown shape.
   """
   if not tensors:
     raise ValueError("tensors cannot be empty")
   shape = tensors[0].shape
   for tensor in tensors:
     shape = shape.merge_with(tensor.shape)
-  if shape.ndims is None:
-    raise ValueError("At least one of the tensors in 'tensors' must have "
-                     "statically known rank.")
+  if not shape.is_fully_defined():
+    raise ValueError("Tensors must have statically known shape.")
   if len(shape) != 1:
     reshaped = []
     for t in tensors:
diff --git a/tensorflow/contrib/all_reduce/python/all_reduce_test.py b/tensorflow/contrib/all_reduce/python/all_reduce_test.py
index 47bab0a3670a90644972b2c961954a3036b8ecba..b3f5d92259df8475b205110dd3f0cee1cb5bde6f 100644
--- a/tensorflow/contrib/all_reduce/python/all_reduce_test.py
+++ b/tensorflow/contrib/all_reduce/python/all_reduce_test.py
@@ -36,6 +36,12 @@ from tensorflow.python.platform import tf_logging
 
 class AllReduceTest(test_util.TensorFlowTestCase):
 
+  def testFlattenTensorsShapesDefined(self):
+    x = array_ops.placeholder(types_pb2.DT_FLOAT, [None])
+    with self.assertRaisesRegexp(ValueError,
+                                 "must have statically known shape"):
+      ar._flatten_tensors([x, x])
+
   def testRingPermutations(self):
     # 0 devices
     pred_by_c_d, rank_by_c_d = ar._ring_permutations(1, 0, [])
diff --git a/tensorflow/contrib/android/BUILD b/tensorflow/contrib/android/BUILD
index 4bff3c27d22c4550747a651a59909bdef80e8285..60306ebdc6cddb04e8807bfd495fa92a56e55ecd 100644
--- a/tensorflow/contrib/android/BUILD
+++ b/tensorflow/contrib/android/BUILD
@@ -38,20 +38,6 @@ cc_library(
     alwayslink = 1,
 )
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-            "bin/**",
-            "gen/**",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 # JAR with Java bindings to TF.
 android_library(
     name = "android_tensorflow_inference_java",
diff --git a/tensorflow/contrib/android/asset_manager_filesystem.cc b/tensorflow/contrib/android/asset_manager_filesystem.cc
index 380a652435ad089f46f3ca80e4fd43097fd96e10..513d519eabbd54f46fde9ec0f004247c02277732 100644
--- a/tensorflow/contrib/android/asset_manager_filesystem.cc
+++ b/tensorflow/contrib/android/asset_manager_filesystem.cc
@@ -19,6 +19,7 @@ limitations under the License.
 
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/file_system_helper.h"
 
 namespace tensorflow {
 namespace {
@@ -228,9 +229,8 @@ string AssetManagerFileSystem::NormalizeDirectoryPath(const string& fname) {
 }
 
 string AssetManagerFileSystem::RemoveAssetPrefix(const string& name) {
-  string output(name);
-  StringPiece piece(output);
-  piece.Consume(prefix_);
+  StringPiece piece(name);
+  str_util::ConsumePrefix(&piece, prefix_);
   return piece.ToString();
 }
 
@@ -243,6 +243,11 @@ bool AssetManagerFileSystem::DirectoryExists(const std::string& fname) {
   return AAssetDir_getNextFileName(dir.get()) != NULL;
 }
 
+Status AssetManagerFileSystem::GetMatchingPaths(const string& pattern,
+                                                std::vector<string>* results) {
+  return internal::GetMatchingPaths(this, Env::Default(), pattern, results);
+}
+
 Status AssetManagerFileSystem::NewWritableFile(
     const string& fname, std::unique_ptr<WritableFile>* result) {
   return errors::Unimplemented("Asset storage is read only.");
diff --git a/tensorflow/contrib/android/asset_manager_filesystem.h b/tensorflow/contrib/android/asset_manager_filesystem.h
index 665304b5eef1f8a3633c8c522259e20d744b1808..a87ff42ae217c429ecf5d2458b88b3431551ad97 100644
--- a/tensorflow/contrib/android/asset_manager_filesystem.h
+++ b/tensorflow/contrib/android/asset_manager_filesystem.h
@@ -66,6 +66,9 @@ class AssetManagerFileSystem : public FileSystem {
   Status DeleteDir(const string& d) override;
   Status RenameFile(const string& s, const string& t) override;
 
+  Status GetMatchingPaths(const string& pattern,
+                          std::vector<string>* results) override;
+
  private:
   string RemoveAssetPrefix(const string& name);
 
diff --git a/tensorflow/contrib/autograph/converters/BUILD b/tensorflow/contrib/autograph/converters/BUILD
index 608bd82722fa45a7009bd597cfd74060b1239a3b..8f9bffa55e44e4942bb3845945b3d440c7957cc9 100644
--- a/tensorflow/contrib/autograph/converters/BUILD
+++ b/tensorflow/contrib/autograph/converters/BUILD
@@ -24,7 +24,6 @@ py_library(
         "continue_statements.py",
         "control_flow.py",
         "decorators.py",
-        "for_loops.py",
         "ifexp.py",
         "list_comprehension.py",
         "lists.py",
@@ -49,6 +48,7 @@ py_library(
     visibility = ["//tensorflow:__subpackages__"],
     deps = [
         ":converters",
+        "//tensorflow/contrib/autograph/operators",
         "//tensorflow/contrib/autograph/pyct",
         "//tensorflow/contrib/autograph/pyct/static_analysis",
         "//tensorflow/contrib/autograph/utils",
@@ -61,6 +61,7 @@ py_test(
     name = "asserts_test",
     srcs = ["asserts_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["no_windows"],
     deps = [
         ":test_lib",
         "//tensorflow/python:client_testlib",
@@ -81,6 +82,7 @@ py_test(
     name = "builtin_functions_test",
     srcs = ["builtin_functions_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["no_windows"],
     deps = [
         ":test_lib",
         "//tensorflow/python:client_testlib",
@@ -92,6 +94,7 @@ py_test(
     size = "large",
     srcs = ["call_trees_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["no_windows"],
     deps = [
         ":test_lib",
         "//tensorflow/contrib/autograph/impl",
@@ -129,16 +132,6 @@ py_test(
     ],
 )
 
-py_test(
-    name = "for_loops_test",
-    srcs = ["for_loops_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":test_lib",
-        "//tensorflow/python:client_testlib",
-    ],
-)
-
 py_test(
     name = "name_scopes_test",
     srcs = ["name_scopes_test.py"],
diff --git a/tensorflow/contrib/autograph/converters/break_statements.py b/tensorflow/contrib/autograph/converters/break_statements.py
index 721bc0ccd0a00d09d7b308df867ef3839bb08d43..62115d4005cb80af1bc2e916c2f3b78f0cc91044 100644
--- a/tensorflow/contrib/autograph/converters/break_statements.py
+++ b/tensorflow/contrib/autograph/converters/break_statements.py
@@ -27,11 +27,12 @@ from tensorflow.contrib.autograph.pyct.static_analysis.annos import NodeAnno
 
 
 class BreakCanonicalizationTransformer(transformer.Base):
-  """Canonicalizes continue statements into additional conditionals."""
+  """Canonicalizes break statements into additional conditionals."""
 
   def __init__(self, context):
     super(BreakCanonicalizationTransformer, self).__init__(context)
     # This is a stack structure, to correctly process nested loops.
+    # Each item is a list [break_used, break_variable_name]
     self.break_uses = []
 
   def _create_break_check(self):
@@ -99,9 +100,9 @@ class BreakCanonicalizationTransformer(transformer.Base):
     self.break_uses.append([False, break_var])
     node.body = self._manual_visit_list(node.body)
     if self.break_uses[-1][0]:
-      anno.setanno(node, 'extra_cond',
-                   gast.UnaryOp(gast.Not(),
-                                gast.Name(break_var, gast.Load(), None)))
+      extra_cond = templates.replace_as_expression(
+          'not var_name', var_name=break_var)
+      anno.setanno(node, 'extra_cond', extra_cond)
       final_nodes = [self._create_break_init(), node]
     else:
       final_nodes = node
diff --git a/tensorflow/contrib/autograph/converters/control_flow.py b/tensorflow/contrib/autograph/converters/control_flow.py
index 49d932026ffa9e79e7ddc640f7d3deaec0f4b8a6..55a28e8ac30397d317f1e6548909501d473eb4e8 100644
--- a/tensorflow/contrib/autograph/converters/control_flow.py
+++ b/tensorflow/contrib/autograph/converters/control_flow.py
@@ -22,6 +22,7 @@ import gast
 
 from tensorflow.contrib.autograph.pyct import anno
 from tensorflow.contrib.autograph.pyct import ast_util
+from tensorflow.contrib.autograph.pyct import parser
 from tensorflow.contrib.autograph.pyct import templates
 from tensorflow.contrib.autograph.pyct import transformer
 from tensorflow.contrib.autograph.pyct.static_analysis.annos import NodeAnno
@@ -49,11 +50,6 @@ class ControlFlowTransformer(transformer.Base):
   def __init__(self, context):
     super(ControlFlowTransformer, self).__init__(context)
 
-  # pylint:disable=invalid-name
-
-  def visit_For(self, node):
-    assert False, 'for statement should have been canonicalized at this point'
-
   def _create_cond_branch(self, body_name, aliased_orig_names,
                           aliased_new_names, body, returns):
     if aliased_orig_names:
@@ -170,6 +166,13 @@ class ControlFlowTransformer(transformer.Base):
     body_closure = body_scope.modified - body_scope.created
     all_referenced = body_scope.referenced
 
+    cond_scope = anno.getanno(node, NodeAnno.COND_SCOPE)
+    cond_closure = set()
+    for s in cond_scope.referenced:
+      for root in s.support_set:
+        if root not in body_scope.created:
+          cond_closure.add(root)
+
     state = list(body_closure)
     if not state:
       # TODO(mdan): Implement this properly.
@@ -204,7 +207,8 @@ class ControlFlowTransformer(transformer.Base):
       def body_name(state_ssf):
         body
         return state_ssf,
-      state_ast_tuple = autograph_utils.run_while(test_name, body_name, [state])
+      state_ast_tuple = __ops.while_loop(
+          test_name, body_name, (state,), (extra_deps,))
     """
     node = templates.replace(
         template,
@@ -216,11 +220,67 @@ class ControlFlowTransformer(transformer.Base):
         test=test,
         body_name=self.context.namer.new_symbol('loop_body',
                                                 body_scope.referenced),
-        body=node_body)
+        body=node_body,
+        extra_deps=tuple(s.ast() for s in cond_closure),
+    )
 
     return node
 
-  # pylint:enable=invalid-name
+  def visit_For(self, node):
+    self.generic_visit(node)
+
+    body_scope = anno.getanno(node, NodeAnno.BODY_SCOPE)
+    body_closure = body_scope.modified - body_scope.created
+    all_referenced = body_scope.referenced
+
+    state = list(body_closure)
+
+    state_ssf = [
+        self.context.namer.new_symbol(s.ssf(), all_referenced) for s in state
+    ]
+    ssf_map = {
+        name: ssf
+        for name, ssf in zip(state, state_ssf)
+        if str(name) != ssf
+    }
+
+    if len(state) == 1:
+      state = state[0]
+      state_ssf = state_ssf[0]
+      state_ast_tuple = state
+    else:
+      state_ast_tuple = gast.Tuple([n.ast() for n in state], None)
+
+    node_body = ast_util.rename_symbols(node.body, ssf_map)
+    if anno.hasanno(node, 'extra_cond'):
+      extra_cond = anno.getanno(node, 'extra_cond')
+      extra_cond = ast_util.rename_symbols(extra_cond, ssf_map)
+    else:
+      extra_cond = parser.parse_expression('True')
+
+    template = """
+      def extra_cond_name(state_ssf):
+        return extra_cond_expr
+      def body_name(iterate, state_ssf):
+        body
+        return state_ssf,
+      state_ast_tuple = __ops.for_loop(
+          iterated, extra_cond_name, body_name, (state,))
+    """
+    node = templates.replace(
+        template,
+        state=state,
+        state_ssf=state_ssf,
+        state_ast_tuple=state_ast_tuple,
+        iterated=node.iter,
+        iterate=node.target,
+        extra_cond_name=self.context.namer.new_symbol('extra_cond',
+                                                      all_referenced),
+        extra_cond_expr=extra_cond,
+        body_name=self.context.namer.new_symbol('loop_body', all_referenced),
+        body=node_body)
+
+    return node
 
 
 def transform(node, context):
diff --git a/tensorflow/contrib/autograph/converters/control_flow_test.py b/tensorflow/contrib/autograph/converters/control_flow_test.py
index 86fed51f27bee07f772633f3928ac5263bf57652..c5610b16b4e5de374f404307d3583660707d5e0b 100644
--- a/tensorflow/contrib/autograph/converters/control_flow_test.py
+++ b/tensorflow/contrib/autograph/converters/control_flow_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 from tensorflow.contrib.autograph.converters import control_flow
 from tensorflow.contrib.autograph.converters import converter_test_base
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.platform import test
 
@@ -94,6 +95,77 @@ class ControlFlowTest(converter_test_base.TestCase):
       with self.test_session() as sess:
         self.assertEqual(-1, sess.run(result.test_fn(constant_op.constant(1))))
 
+  def test_simple_for(self):
+
+    def test_fn(l):
+      s1 = 0
+      s2 = 0
+      for e in l:
+        s1 += e
+        s2 += e * e
+      return s1, s2
+
+    node = self.parse_and_analyze(test_fn, {})
+    node = control_flow.transform(node, self.ctx)
+
+    with self.compiled(node) as result:
+      with self.test_session() as sess:
+        l = [1, 2, 3]
+        self.assertEqual(
+            test_fn(l), sess.run(result.test_fn(constant_op.constant(l))))
+        l = []
+        self.assertEqual(
+            test_fn(l),
+            sess.run(
+                result.test_fn(
+                    constant_op.constant(l, shape=(0,), dtype=dtypes.int32))))
+
+  def test_for_single_var(self):
+
+    def test_fn(l):
+      s = 0
+      for e in l:
+        s += e
+      return s
+
+    node = self.parse_and_analyze(test_fn, {})
+    node = control_flow.transform(node, self.ctx)
+
+    with self.compiled(node) as result:
+      with self.test_session() as sess:
+        l = [1, 2, 3]
+        self.assertEqual(
+            test_fn(l), sess.run(result.test_fn(constant_op.constant(l))))
+        l = []
+        self.assertEqual(
+            test_fn(l),
+            sess.run(
+                result.test_fn(
+                    constant_op.constant(l, shape=(0,), dtype=dtypes.int32))))
+
+  def test_for_with_iterated_expression(self):
+
+    eval_count = [0]
+
+    def count_evals(x):
+      eval_count[0] += 1
+      return x
+
+    def test_fn(n):
+      s = 0
+      for e in count_evals(range(n)):
+        s += e
+      return s
+
+    node = self.parse_and_analyze(test_fn, {'count_evals': count_evals})
+    node = control_flow.transform(node, self.ctx)
+
+    with self.compiled(node) as result:
+      result.count_evals = count_evals
+      self.assertEqual(test_fn(5), result.test_fn(5))
+      # count_evals ran twice, once for test_fn and another for result.test_fn
+      self.assertEqual(eval_count[0], 2)
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/autograph/converters/converter_test_base.py b/tensorflow/contrib/autograph/converters/converter_test_base.py
index 3ea2cfd668270a69427c24cdf1bbf11d32d66ebe..6f75e9a529b60d08873b4b90046e1cabc274140e 100644
--- a/tensorflow/contrib/autograph/converters/converter_test_base.py
+++ b/tensorflow/contrib/autograph/converters/converter_test_base.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import contextlib
 import imp
 
+from tensorflow.contrib.autograph import operators
 from tensorflow.contrib.autograph import utils
 from tensorflow.contrib.autograph.pyct import compiler
 from tensorflow.contrib.autograph.pyct import context
@@ -77,6 +78,7 @@ class TestCase(test.TestCase):
       result.tf = self.make_fake_mod('fake_tf', *symbols)
       result.autograph_utils = utils
       result.autograph_api = self.make_fake_mod('fake_api', converted_call)
+      result.__dict__['__ops'] = operators
       yield result
     except Exception:  # pylint:disable=broad-except
       if source is None:
diff --git a/tensorflow/contrib/autograph/converters/for_loops.py b/tensorflow/contrib/autograph/converters/for_loops.py
deleted file mode 100644
index 4999c47bdc79ec0ea352472cfd3e97b94ebc7cce..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/autograph/converters/for_loops.py
+++ /dev/null
@@ -1,92 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Canonicalizes for loops into while loops.
-
-This canonicalizer uses the len function on its argument. That should be
-converted to a tf.shape separately.
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.contrib.autograph.pyct import anno
-from tensorflow.contrib.autograph.pyct import templates
-from tensorflow.contrib.autograph.pyct import transformer
-from tensorflow.contrib.autograph.pyct.static_analysis.annos import NodeAnno
-
-
-class ForLoopCanonicalizationTransformer(transformer.Base):
-  """Canonicalizes for loops (e.g. into while loops)."""
-
-  def __init__(self, context):
-    super(ForLoopCanonicalizationTransformer, self).__init__(context)
-
-  def visit_For(self, node):
-    self.generic_visit(node)
-    body_scope = anno.getanno(node, NodeAnno.BODY_SCOPE)
-    i_var = self.context.namer.new_symbol('i', body_scope.referenced)
-    smart_loop_iter_var = self.context.namer.new_symbol('smart_loop_iter',
-                                                        body_scope.referenced)
-    cont_var = self.context.namer.new_symbol('cont', body_scope.referenced)
-    # TODO(mdan): Use TensorListFromTensor(loop_iter) here.
-    if anno.hasanno(node, 'extra_cond'):
-      template = """
-        i = 0
-        smart_loop_iter = autograph_utils.dynamic_dataset(loop_iter)
-        cont, target = autograph_utils.dynamic_for_cond(i, smart_loop_iter)
-        while cont and extra_cond:
-          body
-          i += 1
-          cont, target = autograph_utils.dynamic_for_cond(i, smart_loop_iter)
-      """
-      return templates.replace(
-          template,
-          loop_iter=node.iter,
-          target=node.target,
-          body=node.body,
-          i=i_var,
-          smart_loop_iter=smart_loop_iter_var,
-          cont=cont_var,
-          extra_cond=anno.getanno(node, 'extra_cond'))
-    else:
-      template = """
-        i = 0
-        smart_loop_iter = autograph_utils.dynamic_dataset(loop_iter)
-        cont, target = autograph_utils.dynamic_for_cond(i, smart_loop_iter)
-        while cont:
-          body
-          i += 1
-          cont, target = autograph_utils.dynamic_for_cond(i, smart_loop_iter)
-      """
-      repl = templates.replace(
-          template,
-          loop_iter=node.iter,
-          target=node.target,
-          body=node.body,
-          i=i_var,
-          smart_loop_iter=smart_loop_iter_var,
-          cont=cont_var)
-      return repl
-
-  def visit_Continue(self, node):
-    assert False, 'continue statement should be desugared at this point'
-
-  def visit_Break(self, node):
-    assert False, 'break statement should be desugared at this point'
-
-
-def transform(node, context):
-  return ForLoopCanonicalizationTransformer(context).visit(node)
diff --git a/tensorflow/contrib/autograph/converters/for_loops_test.py b/tensorflow/contrib/autograph/converters/for_loops_test.py
deleted file mode 100644
index 943f52de55a3629fdb18e6188e42269a4cb06275..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/autograph/converters/for_loops_test.py
+++ /dev/null
@@ -1,70 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for for_loops module."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.contrib.autograph.converters import converter_test_base
-from tensorflow.contrib.autograph.converters import for_loops
-from tensorflow.python.platform import test
-
-
-class ControlFlowTest(converter_test_base.TestCase):
-
-  def test_basic_for(self):
-
-    def test_fn(l):
-      s = 0
-      for e in l:
-        s += e
-      return s
-
-    node = self.parse_and_analyze(test_fn, {})
-    node = for_loops.transform(node, self.ctx)
-
-    with self.compiled(node) as result:
-      l = [1, 2, 3]
-      self.assertEqual(test_fn(l), result.test_fn(l))
-      l = []
-      self.assertEqual(test_fn(l), result.test_fn(l))
-
-  def test_for_with_iterated_expression(self):
-
-    eval_count = [0]
-
-    def count_evals(x):
-      eval_count[0] += 1
-      return x
-
-    def test_fn(n):
-      s = 0
-      for e in count_evals(range(n)):
-        s += e
-      return s
-
-    node = self.parse_and_analyze(test_fn, {'count_evals': count_evals})
-    node = for_loops.transform(node, self.ctx)
-
-    with self.compiled(node) as result:
-      result.count_evals = count_evals
-      self.assertEqual(test_fn(5), result.test_fn(5))
-      # count_evals ran twice, once for test_fn and another for result.test_fn
-      self.assertEqual(eval_count[0], 2)
-
-
-if __name__ == '__main__':
-  test.main()
diff --git a/tensorflow/contrib/autograph/examples/notebooks/dev_summit_2018_demo.ipynb b/tensorflow/contrib/autograph/examples/notebooks/dev_summit_2018_demo.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..d62390494b78c415212ba91ac914cdfee324f971
--- /dev/null
+++ b/tensorflow/contrib/autograph/examples/notebooks/dev_summit_2018_demo.ipynb
@@ -0,0 +1,1919 @@
+{
+  "nbformat": 4,
+  "nbformat_minor": 0,
+  "metadata": {
+    "colab": {
+      "name": "Dev Summit 2018 - Autograph",
+      "version": "0.3.2",
+      "views": {},
+      "default_view": {},
+      "provenance": [
+        {
+          "file_id": "1wCZUh73zTNs1jzzYjqoxMIdaBWCdKJ2K",
+          "timestamp": 1522238054357
+        },
+        {
+          "file_id": "1_HpC-RrmIv4lNaqeoslUeWaX8zH5IXaJ",
+          "timestamp": 1521743157199
+        },
+        {
+          "file_id": "1mjO2fQ2F9hxpAzw2mnrrUkcgfb7xSGW-",
+          "timestamp": 1520522344607
+        }
+      ],
+      "collapsed_sections": []
+    },
+    "kernelspec": {
+      "name": "python2",
+      "display_name": "Python 2"
+    }
+  },
+  "cells": [
+    {
+      "metadata": {
+        "id": "g7nGs4mzVUHP",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "# Experimental: TF Autograph\n",
+        "**TensorFlow Dev Summit, 2018.**\n",
+        "\n",
+        "This interactive notebook demonstrates **autograph**, an experimental source-code transformation library to automatically convert TF.Eager and Python code to TensorFlow graphs.\n",
+        "\n",
+        "**Note: this is pre-alpha software!** The notebook works best with Python 2, for now.\n",
+        "\n",
+        "> ![alt text](https://lh3.googleusercontent.com/QOvy0clmg7siaVKzwmSPAjicWWNQ0OeyaB16plDjSJMf35WD3vLjF6mz4CGrhSHw60HnlZPJjkyDCBzw5XOI0oBGSewyYw=s688)\n",
+        "\n",
+        "### Table of Contents\n",
+        "1. _Write Eager code that is fast and scalable._\n",
+        "2. _Case study: complex control flow._\n",
+        "3. _Case study: training MNIST with Keras._\n",
+        "4. _Case study: building an RNN._"
+      ]
+    },
+    {
+      "metadata": {
+        "id": "uFcgBENZqkB2",
+        "colab_type": "code",
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          }
+        }
+      },
+      "cell_type": "code",
+      "source": [
+        "# Install TensorFlow; note that Colab notebooks run remotely, on virtual\n",
+        "# instances provided by Google.\n",
+        "!pip install -U -q tf-nightly"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "metadata": {
+        "id": "Pa2qpEmoVOGe",
+        "colab_type": "code",
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          }
+        }
+      },
+      "cell_type": "code",
+      "source": [
+        "import os\n",
+        "import time\n",
+        "\n",
+        "import tensorflow as tf\n",
+        "from tensorflow.contrib import autograph\n",
+        "\n",
+        "import matplotlib.pyplot as plt\n",
+        "import numpy as np\n",
+        "import six\n",
+        "\n",
+        "from google.colab import widgets"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "metadata": {
+        "id": "ZVKfj5ttVkqz",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "# 1. Write Eager code that is fast and scalable\n",
+        "\n",
+        "TF.Eager gives you more flexibility while coding, but at the cost of losing the benefits of TensorFlow graphs. For example, Eager does not currently support distributed training, exporting models, and a variety of memory and computation optimizations.\n",
+        "\n",
+        "Autograph gives you the best of both worlds: write your code in an Eager style, and we will automatically transform it into the equivalent TF graph code. The graph code can be executed eagerly (as a single op), included as part of a larger graph, or exported."
+      ]
+    },
+    {
+      "metadata": {
+        "id": "snaZRFdWd9ym",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "For example, autograph can convert a function like this:"
+      ]
+    },
+    {
+      "metadata": {
+        "id": "9__n8cSIeDnD",
+        "colab_type": "code",
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          }
+        }
+      },
+      "cell_type": "code",
+      "source": [
+        "def g(x):\n",
+        "  if x > 0:\n",
+        "    x = x * x\n",
+        "  else:\n",
+        "    x = 0\n",
+        "  return x"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "metadata": {
+        "id": "gq0eQcuReHET",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "... into a TF graph-building function:"
+      ]
+    },
+    {
+      "metadata": {
+        "id": "sELSn599ePUF",
+        "colab_type": "code",
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          },
+          "output_extras": [
+            {}
+          ],
+          "base_uri": "https://localhost:8080/",
+          "height": 413
+        },
+        "outputId": "bb0c7216-1ca3-4da1-d1fb-589902cdcd1a",
+        "executionInfo": {
+          "status": "ok",
+          "timestamp": 1522345737505,
+          "user_tz": 240,
+          "elapsed": 243,
+          "user": {
+            "displayName": "Dan Moldovan",
+            "photoUrl": "//lh5.googleusercontent.com/-Rneh8xjecyk/AAAAAAAAAAI/AAAAAAAACB4/c5vwsJpbktY/s50-c-k-no/photo.jpg",
+            "userId": "112023154726779574577"
+          }
+        }
+      },
+      "cell_type": "code",
+      "source": [
+        "print(autograph.to_code(g))"
+      ],
+      "execution_count": 0,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "text": [
+            "from __future__ import print_function\n",
+            "import tensorflow as tf\n",
+            "from tensorflow.contrib.autograph.impl import api as autograph_api\n",
+            "from tensorflow.contrib.autograph import utils as autograph_utils\n",
+            "\n",
+            "def tf__g(x):\n",
+            "  with tf.name_scope('g'):\n",
+            "\n",
+            "    def if_true():\n",
+            "      with tf.name_scope('if_true'):\n",
+            "        x_1, = x,\n",
+            "        x_1 = x_1 * x_1\n",
+            "        return x_1,\n",
+            "\n",
+            "    def if_false():\n",
+            "      with tf.name_scope('if_false'):\n",
+            "        x_1, = x,\n",
+            "        x_1 = 0\n",
+            "        return x_1,\n",
+            "    x = autograph_utils.run_cond(tf.greater(x, 0), if_true, if_false)\n",
+            "    return x\n",
+            "\n"
+          ],
+          "name": "stdout"
+        }
+      ]
+    },
+    {
+      "metadata": {
+        "id": "j74n-8hEe6dk",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "You can then use the converted function as you would any regular TF op -- you can pass `Tensor` arguments and it will return `Tensor`s:"
+      ]
+    },
+    {
+      "metadata": {
+        "id": "AkVaY0-dfEbH",
+        "colab_type": "code",
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          },
+          "output_extras": [
+            {}
+          ],
+          "base_uri": "https://localhost:8080/",
+          "height": 53
+        },
+        "outputId": "4ffe3757-c44d-424c-c2a8-7ddc973bfcce",
+        "executionInfo": {
+          "status": "ok",
+          "timestamp": 1522345737841,
+          "user_tz": 240,
+          "elapsed": 257,
+          "user": {
+            "displayName": "Dan Moldovan",
+            "photoUrl": "//lh5.googleusercontent.com/-Rneh8xjecyk/AAAAAAAAAAI/AAAAAAAACB4/c5vwsJpbktY/s50-c-k-no/photo.jpg",
+            "userId": "112023154726779574577"
+          }
+        }
+      },
+      "cell_type": "code",
+      "source": [
+        "tf_g = autograph.to_graph(g)\n",
+        "\n",
+        "with tf.Graph().as_default():  \n",
+        "\n",
+        "  g_ops = tf_g(tf.constant(9))\n",
+        "\n",
+        "  with tf.Session() as sess:\n",
+        "    tf_g_result = sess.run(g_ops)\n",
+        "\n",
+        "  print('g(9) = %s' % g(9))\n",
+        "  print('tf_g(9) = %s' % tf_g_result)"
+      ],
+      "execution_count": 0,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "text": [
+            "g(9) = 81\n",
+            "tf_g(9) = 81\n"
+          ],
+          "name": "stdout"
+        }
+      ]
+    },
+    {
+      "metadata": {
+        "id": "trrHQBM1VnD0",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "# 2. Case study: complex control flow\n",
+        "\n",
+        "Autograph can convert a large chunk of the Python language into graph-equivalent code, and we're adding new supported language features all the time. In this section, we'll give you a taste of some of the functionality in autograph.\n",
+        "Autograph will automatically convert most Python control flow statements into their correct graph equivalent.\n",
+        "  "
+      ]
+    },
+    {
+      "metadata": {
+        "id": "u0YG3DPgZxoW",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "We support common statements like `while`, `for`, `if`, `break`, `return` and more. You can even nest them as much as you like. Imagine trying to write the graph version of this code by hand:"
+      ]
+    },
+    {
+      "metadata": {
+        "id": "xJYDzOcrZ8pI",
+        "colab_type": "code",
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          },
+          "output_extras": [
+            {}
+          ],
+          "base_uri": "https://localhost:8080/",
+          "height": 35
+        },
+        "outputId": "6c244ee4-b141-4ad6-eefa-cfffa71f33c6",
+        "executionInfo": {
+          "status": "ok",
+          "timestamp": 1522345738402,
+          "user_tz": 240,
+          "elapsed": 483,
+          "user": {
+            "displayName": "Dan Moldovan",
+            "photoUrl": "//lh5.googleusercontent.com/-Rneh8xjecyk/AAAAAAAAAAI/AAAAAAAACB4/c5vwsJpbktY/s50-c-k-no/photo.jpg",
+            "userId": "112023154726779574577"
+          }
+        }
+      },
+      "cell_type": "code",
+      "source": [
+        "def sum_even(numbers):\n",
+        "  s = 0\n",
+        "  for n in numbers:\n",
+        "    if n % 2 > 0:\n",
+        "      continue\n",
+        "    s += n\n",
+        "  return s\n",
+        "\n",
+        "\n",
+        "tf_sum_even = autograph.to_graph(sum_even)\n",
+        "\n",
+        "with tf.Graph().as_default():  \n",
+        "  with tf.Session() as sess:\n",
+        "    result = sess.run(tf_sum_even(tf.constant([10, 12, 15, 20])))\n",
+        "\n",
+        "  print('Sum of even numbers: %s' % result)\n",
+        "  \n",
+        "# Uncomment the line below to print the generated graph code\n",
+        "# print(autograph.to_code(sum_even))"
+      ],
+      "execution_count": 0,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "text": [
+            "Sum of even numbers: 42\n"
+          ],
+          "name": "stdout"
+        }
+      ]
+    },
+    {
+      "metadata": {
+        "id": "_YXo4KOcbKrn",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "Try replacing the `continue` in the above code with `break` -- Autograph supports that as well!"
+      ]
+    },
+    {
+      "metadata": {
+        "id": "xHmC0rBIavW_",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "The Python code above is much more readable than the matching graph code. Autograph takes care of tediously converting every piece of Python code into the matching TensorFlow graph version for you, so that you can quickly write maintainable code, but still benefit from the optimizations and deployment benefits of graphs."
+      ]
+    },
+    {
+      "metadata": {
+        "id": "UEHWGpBXbS7g",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "Let's try some other useful Python constructs, like `print` and `assert`. We automatically convert Python `assert` statements into the equivalent `tf.Assert` code.  "
+      ]
+    },
+    {
+      "metadata": {
+        "id": "qUU57xlEbauI",
+        "colab_type": "code",
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          },
+          "output_extras": [
+            {}
+          ],
+          "base_uri": "https://localhost:8080/",
+          "height": 53
+        },
+        "outputId": "add3db4a-2077-4dd5-f7a7-a5b5a4529c26",
+        "executionInfo": {
+          "status": "ok",
+          "timestamp": 1522345738697,
+          "user_tz": 240,
+          "elapsed": 253,
+          "user": {
+            "displayName": "Dan Moldovan",
+            "photoUrl": "//lh5.googleusercontent.com/-Rneh8xjecyk/AAAAAAAAAAI/AAAAAAAACB4/c5vwsJpbktY/s50-c-k-no/photo.jpg",
+            "userId": "112023154726779574577"
+          }
+        }
+      },
+      "cell_type": "code",
+      "source": [
+        "def f(x):\n",
+        "  assert x != 0, 'Do not pass zero!'\n",
+        "  return x * x\n",
+        "\n",
+        "tf_f = autograph.to_graph(f)\n",
+        "with tf.Graph().as_default():  \n",
+        "  with tf.Session() as sess:\n",
+        "    try:\n",
+        "      print(sess.run(tf_f(tf.constant(0))))\n",
+        "    except tf.errors.InvalidArgumentError as e:\n",
+        "      print('Got error message: %s' % e.message)\n",
+        "      \n",
+        "# Uncomment the line below to print the generated graph code\n",
+        "# print(autograph.to_code(f))"
+      ],
+      "execution_count": 0,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "text": [
+            "Got error message: assertion failed: [Do not pass zero!]\n",
+            "\t [[Node: f/Assert/Assert = Assert[T=[DT_STRING], summarize=3, _device=\"/job:localhost/replica:0/task:0/device:CPU:0\"](f/NotEqual, f/Assert/Assert/data_0)]]\n"
+          ],
+          "name": "stdout"
+        }
+      ]
+    },
+    {
+      "metadata": {
+        "id": "w5hBZaVJbck4",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "You can also use `print` functions in-graph:"
+      ]
+    },
+    {
+      "metadata": {
+        "id": "6NdzRKLEboRv",
+        "colab_type": "code",
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          },
+          "output_extras": [
+            {}
+          ],
+          "base_uri": "https://localhost:8080/",
+          "height": 35
+        },
+        "outputId": "fb82dfc3-790f-4127-87f6-361805be9e9b",
+        "executionInfo": {
+          "status": "ok",
+          "timestamp": 1522345739013,
+          "user_tz": 240,
+          "elapsed": 247,
+          "user": {
+            "displayName": "Dan Moldovan",
+            "photoUrl": "//lh5.googleusercontent.com/-Rneh8xjecyk/AAAAAAAAAAI/AAAAAAAACB4/c5vwsJpbktY/s50-c-k-no/photo.jpg",
+            "userId": "112023154726779574577"
+          }
+        }
+      },
+      "cell_type": "code",
+      "source": [
+        "def print_sign(n):\n",
+        "  if n >= 0:\n",
+        "    print(n, 'is positive!')\n",
+        "  else:\n",
+        "    print(n, 'is negative!')\n",
+        "  return n\n",
+        "\n",
+        "\n",
+        "tf_print_sign = autograph.to_graph(print_sign)\n",
+        "with tf.Graph().as_default():\n",
+        "  with tf.Session() as sess:\n",
+        "    sess.run(tf_print_sign(tf.constant(1)))\n",
+        "    \n",
+        "# Uncomment the line below to print the generated graph code\n",
+        "# print(autograph.to_code(print_sign))"
+      ],
+      "execution_count": 0,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "text": [
+            "1 is positive!\n"
+          ],
+          "name": "stdout"
+        }
+      ]
+    },
+    {
+      "metadata": {
+        "id": "9u_Z3i3AivLA",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "We can convert lists to TensorArray, so appending to lists also works, with a few modifications:"
+      ]
+    },
+    {
+      "metadata": {
+        "id": "MjhCQJVuiTNR",
+        "colab_type": "code",
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          },
+          "output_extras": [
+            {}
+          ],
+          "base_uri": "https://localhost:8080/",
+          "height": 35
+        },
+        "outputId": "dc320b87-595b-4392-d29c-994486fd8a0a",
+        "executionInfo": {
+          "status": "ok",
+          "timestamp": 1522345744470,
+          "user_tz": 240,
+          "elapsed": 5391,
+          "user": {
+            "displayName": "Dan Moldovan",
+            "photoUrl": "//lh5.googleusercontent.com/-Rneh8xjecyk/AAAAAAAAAAI/AAAAAAAACB4/c5vwsJpbktY/s50-c-k-no/photo.jpg",
+            "userId": "112023154726779574577"
+          }
+        }
+      },
+      "cell_type": "code",
+      "source": [
+        "def f(n):\n",
+        "  numbers = []\n",
+        "  # We ask you to tell us about the element dtype.\n",
+        "  autograph.utils.set_element_type(numbers, tf.int32)\n",
+        "  for i in range(n):\n",
+        "    numbers.append(i)\n",
+        "  return numbers.stack() # Stack the list so that it can be used as a Tensor\n",
+        "\n",
+        "\n",
+        "tf_f = autograph.to_graph(f)\n",
+        "with tf.Graph().as_default():\n",
+        "  with tf.Session() as sess:\n",
+        "    print(sess.run(tf_f(tf.constant(5))))\n",
+        "    \n",
+        "# Uncomment the line below to print the generated graph code\n",
+        "# print(autograph.to_code(f))"
+      ],
+      "execution_count": 0,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "text": [
+            "[0 1 2 3 4]\n"
+          ],
+          "name": "stdout"
+        }
+      ]
+    },
+    {
+      "metadata": {
+        "id": "UdG8ZFrkTAF2",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "And all of these functionalities, and more, can be composed into more complicated code:\n"
+      ]
+    },
+    {
+      "metadata": {
+        "id": "DVs6wt8NKaGQ",
+        "colab_type": "code",
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          },
+          "output_extras": [
+            {}
+          ],
+          "base_uri": "https://localhost:8080/",
+          "height": 53
+        },
+        "cellView": "code",
+        "outputId": "0a4b8d08-8f65-4bbc-85ba-dc4c60563519",
+        "executionInfo": {
+          "status": "ok",
+          "timestamp": 1522345745186,
+          "user_tz": 240,
+          "elapsed": 658,
+          "user": {
+            "displayName": "Dan Moldovan",
+            "photoUrl": "//lh5.googleusercontent.com/-Rneh8xjecyk/AAAAAAAAAAI/AAAAAAAACB4/c5vwsJpbktY/s50-c-k-no/photo.jpg",
+            "userId": "112023154726779574577"
+          }
+        }
+      },
+      "cell_type": "code",
+      "source": [
+        "def print_primes(n):\n",
+        "  \"\"\"Returns all the prime numbers less than n.\"\"\"\n",
+        "  assert n > 0\n",
+        "  \n",
+        "  primes = []\n",
+        "  autograph.utils.set_element_type(primes, tf.int32)\n",
+        "  for i in range(2, n):\n",
+        "    is_prime = True\n",
+        "    for k in range(2, i):\n",
+        "      if i % k == 0:\n",
+        "        is_prime = False\n",
+        "        break\n",
+        "    if not is_prime:\n",
+        "      continue\n",
+        "    primes.append(i)\n",
+        "  all_primes = primes.stack()\n",
+        "\n",
+        "  print('The prime numbers less than', n, 'are:')\n",
+        "  print(all_primes)\n",
+        "  return tf.no_op()\n",
+        "\n",
+        "    \n",
+        "tf_print_primes = autograph.to_graph(print_primes)\n",
+        "with tf.Graph().as_default():  \n",
+        "  with tf.Session() as sess:\n",
+        "    n = tf.constant(50)\n",
+        "    sess.run(tf_print_primes(n))\n",
+        "    \n",
+        "# Uncomment the line below to print the generated graph code\n",
+        "# print(autograph.to_code(print_primes))"
+      ],
+      "execution_count": 0,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "text": [
+            "The prime numbers less than 50 are:\n",
+            "[ 2  3  5  7 11 13 17 19 23 29 31 37 41 43 47]\n"
+          ],
+          "name": "stdout"
+        }
+      ]
+    },
+    {
+      "metadata": {
+        "id": "JQ8kQT99VqDk",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "# 3. Case study: training MNIST with Keras\n",
+        "\n",
+        "As we've seen, writing control flow in Autograph is easy. So running a training loop in graph should be easy as well!\n",
+        "\n",
+        "Here, we show an example of such a training loop for a simple Keras model that trains on MNIST."
+      ]
+    },
+    {
+      "metadata": {
+        "id": "0CrtGWgwuLJr",
+        "colab_type": "code",
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          }
+        }
+      },
+      "cell_type": "code",
+      "source": [
+        "import gzip\n",
+        "import shutil\n",
+        "\n",
+        "from six.moves import urllib\n",
+        "\n",
+        "\n",
+        "def download(directory, filename):\n",
+        "  filepath = os.path.join(directory, filename)\n",
+        "  if tf.gfile.Exists(filepath):\n",
+        "    return filepath\n",
+        "  if not tf.gfile.Exists(directory):\n",
+        "    tf.gfile.MakeDirs(directory)\n",
+        "  url = 'https://storage.googleapis.com/cvdf-datasets/mnist/' + filename + '.gz'\n",
+        "  zipped_filepath = filepath + '.gz'\n",
+        "  print('Downloading %s to %s' % (url, zipped_filepath))\n",
+        "  urllib.request.urlretrieve(url, zipped_filepath)\n",
+        "  with gzip.open(zipped_filepath, 'rb') as f_in, open(filepath, 'wb') as f_out:\n",
+        "    shutil.copyfileobj(f_in, f_out)\n",
+        "  os.remove(zipped_filepath)\n",
+        "  return filepath\n",
+        "\n",
+        "\n",
+        "def dataset(directory, images_file, labels_file):\n",
+        "  images_file = download(directory, images_file)\n",
+        "  labels_file = download(directory, labels_file)\n",
+        "\n",
+        "  def decode_image(image):\n",
+        "    # Normalize from [0, 255] to [0.0, 1.0]\n",
+        "    image = tf.decode_raw(image, tf.uint8)\n",
+        "    image = tf.cast(image, tf.float32)\n",
+        "    image = tf.reshape(image, [784])\n",
+        "    return image / 255.0\n",
+        "\n",
+        "  def decode_label(label):\n",
+        "    label = tf.decode_raw(label, tf.uint8)\n",
+        "    label = tf.reshape(label, [])\n",
+        "    return tf.to_int32(label)\n",
+        "\n",
+        "  images = tf.data.FixedLengthRecordDataset(\n",
+        "      images_file, 28 * 28, header_bytes=16).map(decode_image)\n",
+        "  labels = tf.data.FixedLengthRecordDataset(\n",
+        "      labels_file, 1, header_bytes=8).map(decode_label)\n",
+        "  return tf.data.Dataset.zip((images, labels))\n",
+        "\n",
+        "\n",
+        "def mnist_train(directory):\n",
+        "  return dataset(directory, 'train-images-idx3-ubyte',\n",
+        "                 'train-labels-idx1-ubyte')\n",
+        "\n",
+        "def mnist_test(directory):\n",
+        "  return dataset(directory, 't10k-images-idx3-ubyte', 't10k-labels-idx1-ubyte')"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "metadata": {
+        "id": "2zu1U9Nqir6L",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "First, we'll define a small three-layer neural network using the Keras API"
+      ]
+    },
+    {
+      "metadata": {
+        "id": "x_MU13boiok2",
+        "colab_type": "code",
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          }
+        }
+      },
+      "cell_type": "code",
+      "source": [
+        "def mlp_model(input_shape):\n",
+        "  model = tf.keras.Sequential([\n",
+        "      tf.keras.layers.Dense(100, activation='relu', input_shape=input_shape),\n",
+        "      tf.keras.layers.Dense(100, activation='relu'),\n",
+        "      tf.keras.layers.Dense(10, activation='softmax')])\n",
+        "  model.build()\n",
+        "  return model"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "metadata": {
+        "id": "Wuqg3H8mi0Xj",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "Let's connect the model definition (here abbreviated as `m`) to a loss function, so that we can train our model."
+      ]
+    },
+    {
+      "metadata": {
+        "id": "W51sfbONiz_5",
+        "colab_type": "code",
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          }
+        }
+      },
+      "cell_type": "code",
+      "source": [
+        "def predict(m, x, y):\n",
+        "  y_p = m(x)\n",
+        "  losses = tf.keras.losses.categorical_crossentropy(y, y_p)\n",
+        "  l = tf.reduce_mean(losses)\n",
+        "  accuracies = tf.keras.metrics.categorical_accuracy(y, y_p)\n",
+        "  accuracy = tf.reduce_mean(accuracies)\n",
+        "  return l, accuracy"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "metadata": {
+        "id": "035tNWQki9tr",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "Now the final piece of the problem specification (before loading data, and clicking everything together) is backpropagating the loss through the model, and optimizing the weights using the gradient."
+      ]
+    },
+    {
+      "metadata": {
+        "id": "CsAD0ajbi9iZ",
+        "colab_type": "code",
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          }
+        }
+      },
+      "cell_type": "code",
+      "source": [
+        "def fit(m, x, y, opt):\n",
+        "  l, accuracy = predict(m, x, y)\n",
+        "  opt.minimize(l)\n",
+        "  return l, accuracy"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "metadata": {
+        "id": "PcVRIacKjSwb",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "These are some utility functions to download data and generate batches for training"
+      ]
+    },
+    {
+      "metadata": {
+        "id": "RVw57HdTjPzi",
+        "colab_type": "code",
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          }
+        }
+      },
+      "cell_type": "code",
+      "source": [
+        "def setup_mnist_data(is_training, hp, batch_size):\n",
+        "  if is_training:\n",
+        "    ds = mnist_train('/tmp/autograph_mnist_data')\n",
+        "    ds = ds.shuffle(batch_size * 10)\n",
+        "  else:\n",
+        "    ds = mnist_test('/tmp/autograph_mnist_data')\n",
+        "  ds = ds.repeat()\n",
+        "  ds = ds.batch(batch_size)\n",
+        "  return ds\n",
+        "\n",
+        "def get_next_batch(ds):\n",
+        "  itr = ds.make_one_shot_iterator()\n",
+        "  image, label = itr.get_next()\n",
+        "  x = tf.to_float(tf.reshape(image, (-1, 28 * 28)))\n",
+        "  y = tf.one_hot(tf.squeeze(label), 10)\n",
+        "  return x, y"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "metadata": {
+        "id": "2zEJH5XNjgFz",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "This function specifies the main training loop. We instantiate the model (using the code above), instantiate an optimizer (here we'll use SGD with momentum, nothing too fancy), and we'll instantiate some lists to keep track of training and test loss and accuracy over time.\n",
+        "\n",
+        "In the loop inside this function, we'll grab a batch of data, apply an update to the weights of our model to improve its performance, and then record its current training loss and accuracy. Every so often, we'll log some information about training as well."
+      ]
+    },
+    {
+      "metadata": {
+        "id": "UUI0566FjZPx",
+        "colab_type": "code",
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          }
+        }
+      },
+      "cell_type": "code",
+      "source": [
+        "def train(train_ds, test_ds, hp):\n",
+        "  m = mlp_model((28 * 28,))\n",
+        "  opt = tf.train.MomentumOptimizer(hp.learning_rate, 0.9)\n",
+        "  train_losses = []\n",
+        "  train_losses = autograph.utils.set_element_type(train_losses, tf.float32)\n",
+        "  test_losses = []\n",
+        "  test_losses = autograph.utils.set_element_type(test_losses, tf.float32)\n",
+        "  train_accuracies = []\n",
+        "  train_accuracies = autograph.utils.set_element_type(train_accuracies,\n",
+        "                                                      tf.float32)\n",
+        "  test_accuracies = []\n",
+        "  test_accuracies = autograph.utils.set_element_type(test_accuracies,\n",
+        "                                                     tf.float32)\n",
+        "  i = tf.constant(0)\n",
+        "  while i < hp.max_steps:\n",
+        "    train_x, train_y = get_next_batch(train_ds)\n",
+        "    test_x, test_y = get_next_batch(test_ds)\n",
+        "    step_train_loss, step_train_accuracy = fit(m, train_x, train_y, opt)\n",
+        "    step_test_loss, step_test_accuracy = predict(m, test_x, test_y)\n",
+        "    if i % (hp.max_steps // 10) == 0:\n",
+        "      print('Step', i, 'train loss:', step_train_loss, 'test loss:',\n",
+        "            step_test_loss, 'train accuracy:', step_train_accuracy,\n",
+        "            'test accuracy:', step_test_accuracy)\n",
+        "    train_losses.append(step_train_loss)\n",
+        "    test_losses.append(step_test_loss)\n",
+        "    train_accuracies.append(step_train_accuracy)\n",
+        "    test_accuracies.append(step_test_accuracy)\n",
+        "    i += 1\n",
+        "  return (train_losses.stack(), test_losses.stack(),  train_accuracies.stack(),\n",
+        "          test_accuracies.stack())"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "metadata": {
+        "id": "cYiUQ1ppkHzk",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "Everything is ready to go, let's train the model and plot its performance!"
+      ]
+    },
+    {
+      "metadata": {
+        "id": "K1m8TwOKjdNd",
+        "colab_type": "code",
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          },
+          "output_extras": [
+            {},
+            {},
+            {}
+          ],
+          "base_uri": "https://localhost:8080/",
+          "height": 988
+        },
+        "outputId": "f9d3eef3-5bea-45c1-ddf9-4edee73e4436",
+        "executionInfo": {
+          "status": "ok",
+          "timestamp": 1522345800262,
+          "user_tz": 240,
+          "elapsed": 52391,
+          "user": {
+            "displayName": "Dan Moldovan",
+            "photoUrl": "//lh5.googleusercontent.com/-Rneh8xjecyk/AAAAAAAAAAI/AAAAAAAACB4/c5vwsJpbktY/s50-c-k-no/photo.jpg",
+            "userId": "112023154726779574577"
+          }
+        }
+      },
+      "cell_type": "code",
+      "source": [
+        "with tf.Graph().as_default():\n",
+        "  hp = tf.contrib.training.HParams(\n",
+        "      learning_rate=0.05,\n",
+        "      max_steps=500,\n",
+        "  )\n",
+        "  train_ds = setup_mnist_data(True, hp, 50)\n",
+        "  test_ds = setup_mnist_data(False, hp, 1000)\n",
+        "  tf_train = autograph.to_graph(train)\n",
+        "  (train_losses, test_losses, train_accuracies,\n",
+        "   test_accuracies) = tf_train(train_ds, test_ds, hp)\n",
+        "\n",
+        "  with tf.Session() as sess:\n",
+        "    sess.run(tf.global_variables_initializer())\n",
+        "    (train_losses, test_losses, train_accuracies,\n",
+        "     test_accuracies) = sess.run([train_losses, test_losses, train_accuracies,\n",
+        "                                  test_accuracies])\n",
+        "    plt.title('MNIST train/test losses')\n",
+        "    plt.plot(train_losses, label='train loss')\n",
+        "    plt.plot(test_losses, label='test loss')\n",
+        "    plt.legend()\n",
+        "    plt.xlabel('Training step')\n",
+        "    plt.ylabel('Loss')\n",
+        "    plt.show()\n",
+        "    plt.title('MNIST train/test accuracies')\n",
+        "    plt.plot(train_accuracies, label='train accuracy')\n",
+        "    plt.plot(test_accuracies, label='test accuracy')\n",
+        "    plt.legend(loc='lower right')\n",
+        "    plt.xlabel('Training step')\n",
+        "    plt.ylabel('Accuracy')\n",
+        "    plt.show()"
+      ],
+      "execution_count": 0,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "text": [
+            "Downloading https://storage.googleapis.com/cvdf-datasets/mnist/train-images-idx3-ubyte.gz to /tmp/autograph_mnist_data/train-images-idx3-ubyte.gz\n",
+            "Downloading https://storage.googleapis.com/cvdf-datasets/mnist/train-labels-idx1-ubyte.gz to /tmp/autograph_mnist_data/train-labels-idx1-ubyte.gz\n",
+            "Downloading https://storage.googleapis.com/cvdf-datasets/mnist/t10k-images-idx3-ubyte.gz to /tmp/autograph_mnist_data/t10k-images-idx3-ubyte.gz\n",
+            "Downloading https://storage.googleapis.com/cvdf-datasets/mnist/t10k-labels-idx1-ubyte.gz to /tmp/autograph_mnist_data/t10k-labels-idx1-ubyte.gz\n",
+            "Step 0 train loss: 2.244329 test loss: 2.2499208 train accuracy: 0.12 test accuracy: 0.161\n",
+            "Step 50 train loss: 0.64771986 test loss: 0.56013924 train accuracy: 0.82 test accuracy: 0.836\n",
+            "Step 100 train loss: 0.49011207 test loss: 0.42143965 train accuracy: 0.84 test accuracy: 0.879\n",
+            "Step 150 train loss: 0.3768609 test loss: 0.39319593 train accuracy: 0.88 test accuracy: 0.883\n",
+            "Step 200 train loss: 0.36007702 test loss: 0.37089333 train accuracy: 0.9 test accuracy: 0.881\n",
+            "Step 250 train loss: 0.182115 test loss: 0.28543878 train accuracy: 0.94 test accuracy: 0.915\n",
+            "Step 300 train loss: 0.2119576 test loss: 0.22305593 train accuracy: 0.92 test accuracy: 0.93\n",
+            "Step 350 train loss: 0.12932214 test loss: 0.29057172 train accuracy: 0.96 test accuracy: 0.906\n",
+            "Step 400 train loss: 0.22937602 test loss: 0.2200287 train accuracy: 0.92 test accuracy: 0.925\n",
+            "Step 450 train loss: 0.23444137 test loss: 0.19857481 train accuracy: 0.94 test accuracy: 0.94\n"
+          ],
+          "name": "stdout"
+        },
+        {
+          "output_type": "display_data",
+          "data": {
+            "image/png": "iVBORw0KGgoAAAANSUhEUgAAAe8AAAFnCAYAAACPasF4AAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4yLCBo\ndHRwOi8vbWF0cGxvdGxpYi5vcmcvNQv5yAAAIABJREFUeJzs3XmAFNW9Pvynlt5mYdhmQMHggnGN\nS9zCD0ElKug1edUY9ZoQTYze3GuiRk1uYjRqRHNj4n5NrhKjiUYlbihGQFRUFDSoKIvgICAO6+xL\n711V5/2jlq7qZaZnpnumZ3g+/zjTXV1dXSP91PecU+dIQggBIiIiGjLkwT4AIiIi6h2GNxER0RDD\n8CYiIhpiGN5ERERDDMObiIhoiGF4ExERDTEMb6JeOOigg3DllVdmPf6rX/0KBx10kGe766+/3rPN\ne++9h9mzZwMAtm3bhkMPPdR57osvvsCPfvQjzJw5EzNnzsTZZ5+NV199FQBw0003YdasWZg1axYO\nO+wwnHLKKc7v4XDY8x7JZBLz58/v9edavXo1Lr300oK2XbBgAebMmdPn97J19/rZs2fjhRde6PO+\niYY7hjdRL3366aee0Ewmk1izZk3WditXrsQnn3xS0D6vu+46TJs2DYsXL8bixYtxyy234LrrrsPO\nnTtxyy23YNGiRVi0aBHGjRuH3//+987vVVVVnv188sknfQrUI444Ag8//HBB2y5fvhxTpkzp83vZ\n+vt6oj0Zw5uol0444QQsWbLE+f3tt9/GV77ylaztrrnmGtx+++0F7bO+vh5HHnmk8/uRRx6JxYsX\nY/z48QUfV3NzM3784x/jo48+wkUXXQTAbAF48MEHMXPmTOi6jlWrVuHcc8/FrFmzcOaZZ2L58uUA\nzFaB0047DQBw//334ze/+Q2uuOIKfP3rX8d5552HxsZG533ee+89HHzwwVnv9cEHH+Bb3/oWTjvt\nNJx//vloaGgAAOzevRsXX3wxzjzzTJx66qm4++67cx5rPu+99x7OOecczJo1C9/+9redC6Vc++3u\ncSEE/vd//xczZ87EKaecgjlz5kDXdQDAwoULcdZZZ+GMM87AN77xDbz33nsFn3eiwcDwJuqlM844\nAy+99JLz+z//+U/MmjUr53ZCCCxatKjHfU6fPh1XXnkl/va3v2HTpk0AgHHjxkGSpIKPa+zYsbjm\nmmtw1FFH4YknnnAeF0Jg8eLFUBQFv/71r3HppZdi0aJFuPzyy3HTTTfl3NeiRYtw/fXX49VXX8WY\nMWPw7LPPAgA2bdqE2tpaTJgwwfNe4XAY//mf/4lrrrkGS5Yswfe+9z1cddVVAIBHH30Uxx13HF5+\n+WUsWLAADQ0NMAwj57FmikQiuOqqq3DDDTdg0aJF+OEPf4jrrrsOhmHk3G9jY2Pex1944QUsWrQI\nzzzzDJYsWYKGhgY8+eSTAIBbbrkFDz74IBYuXIibbroJr7/+esHnnWgwMLyJeun444/Hxo0b0dLS\nglgshlWrVmHKlCk5t73++uvxhz/8AYlEott9/v73v8d3vvMdLFiwAGeddRZmzJjhBEt/nXzyyc7P\n8+fPxxlnnAEAOOaYY5zqONOxxx6LCRMmQJIkHHLIIdi5cycAYMWKFTk/6wcffIBx48Zh6tSpAICz\nzjoLX3zxBXbs2IExY8bg7bffxvvvvw+/34+77roLdXV1BR376tWrMX78eBxzzDEAgJkzZ6KtrQ3b\nt2/Pu998jy9duhTf+ta3UF1dDVVV8e1vfxuvvPIKAGDMmDF46qmnsH37dhx77LH45S9/WdjJJRok\n6mAfANFQoygKTj/9dCxcuBCjR4/GiSeeCFXN/U/psMMOw3HHHYdHHnkERx99dN59BgIBXHrppbj0\n0kvR2dmJRYsW4fbbb8fEiRMxbdq0fh3vyJEjnZ8XLFiAv/3tb4hEIjAMA/mWNqiurnZ+VhTFaV5+\n5513cMkll2Rt39nZiYaGBk8LhN/vR2trKy655BIYhoFbbrkFjY2N+M53voOf/OQnBR17a2srRowY\nkXVsLS0tefeb7/Guri48/PDDmDdvHgBA13WMHj0aAPCnP/0Jf/rTn3Duuedir732wvXXX4/jjz++\noGMkGgwMb6I+OPPMM3H33Xdj1KhRPfbZ/vSnP8W5556LiRMn5ny+tbUV69evd6rWESNG4Pzzz8ey\nZctQX1/f7/C27d69GzfccAOefvppHHLIIfj8888xc+bMgl+vaRrWrFmT8yKkrq4O+++/P5577rmc\nr7388stx+eWXY8uWLbjsssucSronY8aMQXt7u/O7EAIdHR0YM2YMVFXNud+pU6fmfLyurg4zZszA\nd7/73az3+dKXvoTf/va3MAwD8+fPx7XXXotly5YVeGaIBh6bzYn64Oijj0ZjYyM2btzYY4VWV1eH\n73znO7j//vtzPh+Px3HllVd6wmLr1q34+OOPceyxx/bquFRVRTgczllRt7a2oqKiAvvvvz80TXMq\n0EgkUtC+V69ejYMOOgh+vz/rvY488kg0NTXh448/BgA0NDTgZz/7GYQQ+PWvf4133nkHgBmSY8eO\nhSRJ3R6r7YgjjkBzczNWrVoFwBxfMH78eEycODHvfvM9/vWvfx0vvPACYrEYAOCpp57C888/j9bW\nVnz/+99HOByGLMs48sgjezXWgGgwsPIm6gNJknDaaachFotBlnu+Bv7BD36Ap59+Oudze++9N/70\npz/hvvvuw5w5cyCEQFVVFX75y196RqAX4phjjsEf/vAHTJs2DW+++abnuYMPPhjTp0/HzJkzMWbM\nGPziF7/Ahx9+iNmzZ+O///u/e9y3fYtYvve67777cOuttyISicDn8+Gqq66CJEm48MIL8etf/xq3\n3norhBCYMWMGpkyZgh07dnheryhK1ntWVFTgnnvuwa233opoNIrRo0fjrrvu6na/I0eOzPk4AGzc\nuBHnnHMOADPYb7vtNowePRrTpk3Dt771LSiKAp/Ph9tuu61X551ooElcz5uIiGhoYbM5ERHREMPw\nJiIiGmIY3kREREMMw5uIiGiIYXgTERENMUPmVrGmpq6i7m/UqAq0tUWLus89Ec9j//Ec9h/PYXHw\nPPZfsc9hbW11zsf32MpbVbPvKaXe43nsP57D/uM5LA6ex/4bqHO4x4Y3ERHRUMXwJiIiGmIY3kRE\nREMMw5uIiGiIYXgTERENMQxvIiKiIYbhTURENMQwvImIaNh6443XCt723nvvxI4d23vc7sMP38cN\nN/y8P4fVbwxvIiIalnbu3IFXX11c8PZXXXUt9t57QgmPqHiGzPSoREREvXHXXb/D+vXr8Mgjc2EY\nBnbs2I6dO3fgnnv+iN/+9jdoampELBbDD35wOaZOnYYf//hyXHPNz7F06WuIRML44out2L59G668\n8lpMmTI153u89toSzJv3dyiKgoMOOgS33XYL6us34M47fwefzwe/349bbvktdu7cnvVYdXXuqU8L\nsceGd0c4gfc3NOLYg+sG+1CIiIa9f7z+GVZuaCzqPo87uA7nz5ic9/l///fZeO65f+D7378MDz/8\nIDQthT/+8c9oa2vF8cd/DWeccRa2b9+GG2/8BaZOneZ5bWPjbvzhD/fh3XeX44UXns0Z3tFoFA89\n9AAeeeQJVFRU4Oc//yneffddvPzyyzjnnPMwa9a/4YMPVqK1tQUvv7wg6zGGdx9ceecbaO2M46ZL\njsOk8X0/gURENDQccshhAIDq6hFYv34dXnzxOUiSjM7OjqxtjzjiKABAXV0dwuFwzv01NHyBiRO/\nhIqKCgDA0Ucfg/Xr1+PEE0/CH/7wP2ho+AJf//ppmDRp35yP9cceGd5b23YiPOFNSMnD0dwRZ3gT\nEZXY+TMmd1slDwSfzwcAWLJkETo7O/HAA39GZ2cnfvjD2VnbKkp6gREhRM79SZL3OU1LQZJCOPbY\n4/HnP/8Ny5cvw5w5N+PHP74652Nf/eqxff4se2R4f7ztCyjVbTBG70RLZ3ywD4eIiEpAlmXoup71\neHt7O/baa2/Isow333wdqVSqT/vfZ59J2LbtC0SjEVRUVGLVqg9x1VU/xrPPzsOUKSfi9NPPgBAC\n9fUbsGXLpqzHGN69dPykA7G4CZArO9DSwfAmIhqOJk3aD59+ugH33XcnKiurnMdPPnkGfvGLa/DJ\nJ2vxb//2TdTV1eGRR+b2ev+hUAhXXHEVrr32J5AkGUcccRSOPfZY7NzZghtv/AWqqqrg8/lw/fU3\nob7+06zH+kMS+doDykxTU1dR93fjit+ipTOCQyLn4yfnHlHUfe9Jamuri/632dPwHPYfz2Fx8Dz2\nX7HPYW1t7m7dPfY+7y+P2Q+SL4mmcOtgHwoREVGv7LHhPbFmPACgLdk2yEdCRETUO3tseI8JjQIA\nxBFGPKkN8tEQEREVbs8N74rRAADJH+egNSIiGlL22PAeW2FW3pI/xtvFiIhoSNljw3uME96svImI\naGjZY8M75AvCLwcg+eNoZuVNRDQs9WZJUNtHH32ItjbvnUjlsAyo2x4b3gAwMlDDypuIaJjq7ZKg\ntn/+88Ws8C43e+QMa7a6ijFojDWiqSt7UnoiIhra3EuCXnDBRbj99lvQ1dUFXddx9dU/w+TJB+Lx\nxx/Fm28uhSzLmDp1Gg455FAsW/YGtmzZjDlz7sD48eOz9pu5DOjVV1/nLANaWRkCIJdkGVC3PTy8\nxwItQKfWPtiHQkQ0rD332UtY1bimqPs8uu4rOHfyWXmfdy8J+uijf8YJJ/w/fOMbZ2PLls24994/\n4J57/oinnnoc8+cvgqIomD//WRx33NcwefKXcc01P88Z3LmWAf3ww/fx1ltLcc4552H27AuxaNHr\nJVkG1G2PDu/a0FgAQAysvImIhrM1a1ajvb0Nixe/DABIJMzu0pNP/jquvvq/cNpps3D66bN63E+u\nZUDr6zc4S362tOzClCknlWQZULc9OrzrKszwTildMISALEmDfERERMPTuZPP6rZKLjWfT8VPf/oz\nHH64dy2L6677JbZu/Ryvv74EP/nJf+Chh/7a7X5yLQMaCAScJT/XrFlZsmVA3fboAWt25Y1gFNE4\nZ1kjIhpO3EuCHnro4XjrrTcAAFu2bMZTTz2OcDiMRx6Zi0mT9sX3v38ZqqtrEI1G8i4lCniXAQWA\nVas+xEEHHYpnn52Hzs4OfPOb38QFF1yE+voNzmOnn36G81ix7NGV96hgDSQhQw5EEYmnUBXyDfYh\nERFRkbiXBP3hD3+E2267Gf/1Xz+EYRi4+urrUFVVhfb2Nlx22fcQClXg8MOPwIgRNTjqqK/ihhv+\nG7/97Z3Yf/8DPPvMtQzokUcehVgsihtv/AVGjaoBIJdkGVC3PXZJUHvZtp++fgvicQM/O+pa7L/3\niKK+x56ASwj2H89h//EcFgfPY/9xSdABEpCCkNQUIvHUYB8KERFRQfb48A4qIUiqhs4oJ2ohIqKh\nYY8P7wo1BABoj4YH+UiIiIgKs8eHd6XPvFevIxEZ5CMhIiIqzB4f3iMClQCAjjjDm4iIhoY9PrxH\nVZgj+Xa1tw3ykRARERVmjw/v0RXm7WE7OjrQHk4M8tEQERH1bI8P70qfOWBNUpNYvallkI+GiIio\nZwxvn9nnDSWFpvbY4B4MERFRAUo6Peodd9yBDz74AJqm4T/+4z9w+umnO88tX74cd911FxRFwfTp\n03HFFVeU8lDysm8Vk9QUWjvZbE5EROWvZOH97rvvYuPGjZg3bx7a2tpwzjnneMJ7zpw5ePjhhzFu\n3Dh897vfxcyZMzF58uRSHU5eITVo/qBoaOviRC1ERFT+Shbexx13HI44wlx6bcSIEYjFYtB1HYqi\noKGhATU1Ndhrr70AACeddBJWrFgxKOHtV/wAAJ9foK2NlTcREZW/koW3oijOYuXPPPMMpk+fDkVR\nAABNTU0YPXq0s+3o0aPR0NDQ7f5GjaqAqipFPcba2mqM1M3K2+8XaI8kMXZsFSSu690r+SbOp8Lx\nHPYfz2Fx8Dz230Ccw5IvCfrqq6/imWeewV/+8pd+7aetLVqkIzLZK78IISBLMiTFQCKpY+u2NlQG\nuTRoobgKUf/xHPYfz2Fx8Dz237BYVWzZsmX4v//7P8ydOxfV1ekDqKurQ3Nzs/P77t27UVdXV8pD\nyUuSJPhlP2TVXHi9jYPWiIiozJUsvLu6unDHHXfgwQcfxMiRIz3PTZw4EeFwGNu2bYOmaVi6dCmm\nTp1aqkPpkV/xAbIZ3h2R5KAdBxERUSFK1mz+8ssvo62tDVdffbXz2AknnICDDjoIp512Gm6++WZc\ne+21AIAzzzwT++23X6kOpUd+xY9kyhxpHo5xXW8iIipvJQvvCy64ABdccEHe54877jjMmzevVG/f\nKwHFjw6YS4IyvImIqNzt8TOsAYBf9kMXZmhHGN5ERFTmGN4w+7wNGIBksPImIqKyx/BGeqIWyDrC\ncYY3ERGVN4Y3zD5vAGZ4s/ImIqIyx/CG2ecNAKrPYJ83ERGVPYY3rPu8AYRCEitvIiIqewxvpPu8\nQyEgHNMG+WiIiIi6x/BGus87GABiCQ26YQzyEREREeXH8Ea68g5YS3tH46y+iYiofDG8Afhls89b\nVc2KO57UB/NwiIiIusXwRrryllUBwGw6JyIiKlcMbwABJQAAzrKgrLyJiKicMbwBhFQzvCXVrLjj\nSVbeRERUvhjeAIKKNVJNNu/xjiVYeRMRUflieAMIWpW3IZkVd4yVNxERlTGGN4CQGgIAGJJZecdZ\neRMRURljeAMIWgPWdCQBsM+biIjKG8MbgCqrUCQFmhXe7PMmIqJyxvAGIEkSgmoAKWGFNytvIiIq\nYwxvS1AJImkkAABxTtJCRERljOFtCaoBJHQrvDlJCxERlTGGtyWkBpHQk1BkNpsTEVF5Y3hbgkoQ\nAgKBoOCtYkREVNYY3hZ7opZgCIiyz5uIiMoYw9sSVM0pUitCQCSWGuSjISIiyo/hbQlZ85sHQwJJ\nzUAixaZzIiIqTwxvi115B4IGAFbfRERUvhjeFrvP2+c3wzvM8CYiojLF8LbYzeYqw5uIiMocw9ti\nV96yz+zrZngTEVG5YnhbglblLavmbWIMbyIiKlcMb4tdeUNheBMRUXljeFtC1mhzQzJDm+FNRETl\niuFtCTK8iYhoiGB4W+w+b3tNb85vTkRE5YrhbfHJKmRJdtb0TunGIB8RERFRbgxviyRJCClBZ01v\nneFNRERliuHtElQDiGlxKLLEypuIiMoWw9slqAYR1xJQFRmaJgb7cIiIiHJieLsErWZzRQE0Vt5E\nRFSmGN4uITUAAQHVLxjeRERUthjeLj7Fb/5XNRjeRERUthjeLn7ZBwCQVYGUzj5vIiIqTwxvF5+s\nAgAU1YCm9b/ybutK4MEX16G5I9bvfREREdkY3i4+xay8FaU4fd5PvFqP9z7Zjb8u3NDvfREREdkY\n3i4+u9ncZ0ArQrN5PKl7/ktERFQMDG8Xu89bUQwYQsAw2O9NRETlh+HtYjebS4rZZM5Z1oiIqBwx\nvF2c0eayGdq8XYyIiMoRw9vF7vO2K+9i9HsTEREVG8PbxWk2tyvvItwuRkREVGwlDe/6+nqceuqp\nePzxx7OemzFjBi666CLMnj0bs2fPxu7du0t5KAWxK2/I5ujwfjebC1buRERUfGqpdhyNRnHrrbdi\nypQpebeZO3cuKisrS3UIvebPCG8OWCMionJUssrb7/dj7ty5qKurK9VbFF1Ws3mxwlsqzm6IiIiA\nElbeqqpCVbvf/U033YTt27fjmGOOwbXXXgtJGtyUs6dHFZLdbM5mbyIiKj8lC++eXHnllZg2bRpq\nampwxRVXYPHixZg1a1be7UeNqoCqKkU9htraas/vcf9IAIBqLi6Gqqpg1ja94fObp9enKv3aT7kb\nzp9toPAc9h/PYXHwPPbfQJzDQQvvs88+2/l5+vTpqK+v7za829qiRX3/2tpqNDV1eR4Lx1IAgJSW\nBAA0t4TRVBPo83ukkpq1Pz3rvYaLXOeReofnsP94DouD57H/in0O810IDMqtYl1dXbj00kuRTJoh\nuXLlShx44IGDcSge9mhzQ+KANSIiKl8lq7zXrl2L3/3ud9i+fTtUVcXixYsxY8YMTJw4Eaeddhqm\nT5+OCy64AIFAAIceemi3VfdA8St2n7dZMevs8yYiojJUsvA+/PDD8dhjj+V9/uKLL8bFF19cqrfv\nE6fyBitvIiIqX5xhzUWRFEiQYMCsvDnDGhERlSOGt4skSfApPqfy7uk+7x3hXXjsk38grsUH4vCI\niIgADOJo83Lll33QhTVKvIc+7/s+eghdyTDGVdTi9H1PGYjDIyIiYuWdKagEkDQSAAC9m8p7W2MY\nXckwACBpJAfk2IiIiACGd5bairGIGREEv/oqtic3593ulfcbnJ8lzn9KREQDiOGdYXyFORe7pGpY\nrb2af0N3i/ogT+tKRER7FoZ3hnGV6YVUVPjzbifAe8CJiGhwMLwzjK+oTf8iCquoZTabExHRAGJ4\nZxhfOc75OYEINEPLvaGn8GZ4ExHRwGF4Z6j2V+EHX/4h9I4xgCTQGm/r8TXs8iYiooHE8M5h/5pJ\nMLpGAQCaYq05t/GMV8tTebNXnIiISoHhnYOqSBApc7BaLJV7KVLhSmbeKkZERAOJ4Z2DqsiAYU4+\nl8g7AYsnvYmIiAYMwzsHVZEhDAUAkNBzh3chzeZERESlwPDOQVUkQDfDO5knvHuD4U5ERMXE8M5B\nkiQo1pot21o6cm8kvNsTERENFIZ3Hgp8AICV9TuwsyWS9TxHkhMR0WBheOdhhzdkHZ2R7pvO2SxO\nREQDieGdhyqlwzsX4bpXzBD5lw4lIiIqNoZ3HnZ4S0qe6VFd3EFORERUagzvPHyKz5yIRdaR1Lqv\nrA2w8iYiooHD8M5DlRXAUCApOpKp7KZzd7HNZnMiIhpIDO88fKp1r7esI5nqofJmszkREQ0ghnce\n5ixrKiRFQ0LLUXm7f2blTUREA4jhnYeqyAVX3nqe8GZBTkREpcDwzkOWJXN+c1lHIpljxLn7VjEO\nWCMiogHE8M7DMIQ5YE0WSGiprOe9zeY9lNicw4WIiIqI4Z2HYQhAN+c3j2mJ7rdlnzcREQ0ghnce\nuiGcZUFjqXj2Bp5bxdi5TUREA6eg8F67di2WLl0KALj77rtx8cUX4/333y/pgQ023RAQyQAAIKKH\nu92Wfd5ERDSQCgrvOXPmYL/99sP777+PNWvW4MYbb8R9991X6mMbVIYhIBIhAEBMdGU9z1vFiIho\nsBQU3oFAAPvuuy9ee+01nH/++Zg8eTJkeXi3uJuVdzfh7VmYhM3mREQ0cApK4FgshoULF+LVV1/F\niSeeiPb2dnR2dpb62AaVIQREMggASCLXet7pwK7f1pZzxDkXLCEiolIoKLyvueYaLFiwAD/96U9R\nVVWFxx57DJdcckmJD21w6a5m86Scq8873VS+uy2CpvZY9hZ2djPDiYioiNRCNvra176Gww8/HFVV\nVWhubsaUKVPw1a9+tdTHNqgMwwAMFUJToSvRrOfdlTckkQ5q9zZW5c0KnIiIiqmgyvvWW2/FwoUL\n0d7ejgsvvBCPP/44br755hIf2uD60rhqAIBIhKCrkawAzry3O3ezub1taY6RiIj2TAWF9yeffIJv\nf/vbWLhwIc455xzcc8892Lp1a6mPbVBdcsbB+N7Mg+DTqwFZR0cyo49fSieyxMqbiIgGUEHhbYfP\nG2+8gRkzZgAAkslk6Y6qDFQGfTj56AkIiBEAgMZok+d54b63WxI5VyGxA53ZTURExVRQeO+33344\n88wzEYlEcMghh2D+/Pmoqakp9bGVhZAwP+fOsDe8vROziJwBzcqbiIhKoaABa3PmzEF9fT0OOOAA\nAMDkyZNxxx13lPTAykW1MgotALZ37fY8nll557rXO+lrARSJfd5ERFRUBYV3PB7H66+/jnvvvReS\nJOGoo47C5MmTS31sZWGkbzSAXM3mmaPNvQm9qf1ztO31OvwVYyBaTy71YRIR0R6koGbzG2+8EeFw\nGBdeeCHOP/98NDc344Ybbij1sZWFmmAVhACi1uIkH3zaiBfe3gJkNJvruje869s2AQCUmhb2eRMR\nUVEVVHk3Nzfjrrvucn4/5ZRTMHv27JIdVDmpCKpAVIJm6ACAB55fCwA4cLLrukcS6EqGcdt7D+Oc\nyf+GQ8cchNZ4KwBApHzs8yYioqIqeHrUWCw9g1g0GkUi0f0a18NFZVAFhAxN1z2Pp9y/S8DqjlXY\nEdmFBz5+GADQEm8DAIhkiH3eRERUVAVV3hdccAHOOOMMHH744QCAdevW4aqrrirpgZWLiqAPEBI0\n4Q3vpK65fhMQGQndaoe3prLyJiKioioovM877zxMnToV69atgyRJuPHGG/HYY4+V+tjKgll5S9AN\n74xqKS0d3lLGgDUhhFN5QzYY3kREVFQFhTcA7LXXXthrr72c31evXl2SAyo3duWti+6azYUnoBN6\n0pk+VZJ1DlgjIqKi6vOi3HtKNVkZVCGEbC5U4pIy3GEunAFtABDX4+mnFH2POVdERDQw+hzekiQV\n8zjKVkVQBSBBhze8tYzKO2GkB/DFtHR4S7LOAWtERFRU3Tabn3TSSTlDWgiBtra2kh1UOamw+ryF\n8PZdp3QNAdd2yTzhzT5vIiIqtm7D+4knnhio4yhbiixDEjIMpKC5J2KR3TOsGXkrb7DPm4iIiqzb\n8J4wYcJAHUdZkyUJAgZSWrqpXJJdt4pJQMod3qmoazsDBpjeRERUPH3u8y5EfX09Tj31VDz++ONZ\nzy1fvhznnXceLrjgAjzwwAOlPIx+kyADEEhprn5v1R3eAkmRDu+2RIfn9ULSQEREVCwlC+9oNIpb\nb70VU6ZMyfn8nDlzcP/99+PJJ5/EO++8g88++6xUh9JviiRDSAaSrvD2VN4QSBnp9c2d8DbM0ysk\n721mRERE/VGy8Pb7/Zg7dy7q6uqynmtoaEBNTQ322msvyLKMk046CStWrCjVofSbLCkABOJJVwgr\n3klaUsIV3vF28wctCAAQYOVNRETFU7LwVlUVwWAw53NNTU0YPXq08/vo0aPR1NSUc9tyoMgyJFmg\nI5JuGpdUb+Wtwd1sboV3yhwf+2w3AAAgAElEQVSPzsqbiIiKqeAZ1gbbqFEVUFWlqPusra0uaDtV\nMU+TUFzXOlblLTQVkj/puQu8PWk1m1vhDVkv+L2GouH82QYKz2H/8RwWB89j/w3EORyU8K6rq0Nz\nc7Pz++7du3M2r7u1tUW7fb63amur0dTUVdC2spABCdi2M31vu2SHt+5zqvB9qvZGQ3gHuhJh87lU\nABIAQ9IKfq+hpjfnkXLjOew/nsPi4Hnsv2Kfw3wXAiUdbZ7PxIkTEQ6HsW3bNmiahqVLl2Lq1KmD\ncSgFUWTzNHVE0/3akDXz/m3NvP6pwlhMm+AdnCecZnP2eRMRUfGUrPJeu3Ytfve732H79u1QVRWL\nFy/GjBkzMHHiRJx22mm4+eabce211wIAzjzzTOy3336lOpR+UxUF0IHOqGvaU1UDdBWQzHu4fQhA\nldOn0y/7ENet5nb2eRMRURGVLLwPP/zwbpcNPe644zBv3rxSvX1RqbIV3rH0oDQoGoSuOn3fighA\nkdN98j7Fh6iumE0bDG8iIiqiQWk2H2pUK5S7oq7R5opZeUtOePuhSunwViU1fZ+3zGZzIiIqHoZ3\nAXyKGcrhuN3nLZzK2x6spghvs7kqqxCaFeYyK28iIioehncB/NatYl0xK7xlA5IkzD5v2A/5PM3m\nqqxA6NbvisaVxYiIqGgY3gXwWfeX68KqoJ3bxNzh7Tebyi2qpMIwrN9lnUuTEBFR0TC8C1Dh91k/\nmRHszGvuCm9J+Jy+cQBmFW5V3pJVeXdGk7j/2dXY1hgekOMmIqLhieFdAL/PCm/JmkdNsSpwwzXj\nm65mNJur6cpcMdf0/ufyrVi1sRn3Pbt6AI6aiIiGK4Z3ARTJOk2SgCSlK293szkMNavZ3A53STYr\nb3s98GSKA9iIiKjvGN4FUOxbwCSByqAPvoDVg+2qvCXd22yuuprNoegw2OlNRERFMmQWJhlMslV5\nS5JAZciHsGrAACB0BYmNR0EZ2QRZqvbcKqZIKgAZQpedypuIiKgYWHkXIN1sbuDEr4yHL2D1fRsq\njLbxSG35CoRhB7b9GsXZxu7zdkjSwBw4ERENSwzvAshWEP/7qZNx5tcmweczk9i5jxuArouMZnPV\n2UbKvM+bVTgREfUDw7sAduU9fmwIkiRB8dmVtyu8hchoNndV3jL7vImIqHgY3gWQrSVBDWGGtqxa\no8Vdo811XXjmNreb0IWuAIoGwzDSO2SzORER9QPDuwB2Fa1b4S1Z93m7m80NQzgD2wCkg1xXIUlA\nyuDiJEREVBwcbV4AO5S/6NyGz9o3A0rKfMJwVd6GAclVUctOs7n537iWXguceuetj3dgQm0lDti7\nZrAPhYioLDC8C2D3eS/e+joAQLZWGfMMWDPSk7CYr/Fuc+fqe3AELhqQ4x1OYgkNjy7cAAD4yy9m\nDPLREBGVBzabF0B29WUDgJCyJ2nRDYHHXql3frfDW1LNKj2hJyBghntnJIkHnlsDg6POe6TpRs8b\nERHtYRjeBVAk72kSMMy7vQxvn/fGhnbXa8zntN2T0tsgXZl/UN+Enc2REh0xERENZwzvAmSGNwAr\nuNN93PGk7unztkebG51jobWMN3+Gd05znfeP9YhniIgoG8O7ALKsZD+oe4cLRGIpz+8KXK+xKnQD\n3hHnDO+esWeBiCgbw7sAco7KWxgZ/eAwB1c5r5HdK45Z94lL3srbYHj3iOdoePvX+t247I6l2N0a\nHexDIRpSGN4FyN9s7hV2Vd+K69TaQS/YbN5rDO/h7c8vfQLdEFi2eudgHwrRkMLwLkDmaHMATjXt\n5g7j5vZk+glhbtssbYLkT1cYKY6k7hFH5A9v/PMS9Q3DuwC5Ku9RVaFuX7P4vW3pX6yg362uQ/Co\nt5yHUymGd08Y3kRE2RjeBcjV5z1+VBWqQj4AQCjQw1w3OZrYAVbehWCzORFRNoZ3AZQczeaqrDrL\nfFZX+Lp9vcjRxA4AyZSe83FKY3jvGbhWD1HvMLwLIOf4ZnEv/1kdyg7vQyeNxrUXHoWvHTouu/KW\nNQACb3e8jHe2vwcAWPDOFsxd8ElRj3s4YHYTEWVjeBcgksq+jcW9/Gd1hT/r+ZHVfhy272jzOeE9\nzZI/DskfxxfJT/HEp88CAJ5ftgUr1u0q8pEPnLWbW7BibfGPn5U3EVE2hncBJo3YBwDw5VGTncfM\nZnPz5xGV6fAWmlmFj6kYCQCQ5exmcykQg+RPrzJmrxMOwGmKz2f+ss34+LPmPnyK0rrrHx9j7kvF\nbznggDUiomwM7wJU+6vwwIw7cOa+pzqPqa5Z1/yq7Axei6+ZisTGo3DUhP0BABUBNavZXArEIAVi\nzu+NkXQYdxdWndEkXnznc9z7zOr+faAS6unio7fKObxffGeLs+IZEdFAYnj3guIKbFVWPfNu71NX\nBQCoCYzAdbPOwKTx1QCAiqAv655wSUlB8qfD+4GP/+KsEa7p3YR3JJn3uXKR1Io7gr6cm83nL9uC\ntz7eMdiHMaSV8bUZUVljePeCu59blVQ4y2ZIwMGTRgEAamtCzs8AUBlSs/q8IQlP5d2aaIW692YA\ngN7N7WPhaCrvc+Wi2CPoyzm8iYgGC8O7F7Iqbye7JZxxwpfwzan74rJvHOp5TWXQlzUPOmTDCe+v\n7zPd3F9tAyBrWZV3Uk9i4ZZX0Z7oQGe0/CvvRLHD23U6/ufvH6KxPZZ/40HCC4y+4y1iRH3D8O4F\n9/3e7j5vSQJURcbZ0/ZH7UjvzGuVOZrNIRmQ/HEoIoBzDzwLB4a+AknVIPnj0DIq73/Uv4CXtryC\nFzctQke4/MM7WeRZ49x93vUN7Zj32sai7r8YONlO37HZnKhvGN69oHbT551PZUjN7vOWDEi+JFQj\nCAAwdOt5SUBzVXHtiQ6s2LnS+b0jo897xY6VWLBpUS8/RWkVu/IWGVVtOS7mknnBRURUagzvXvBU\n3pKCQtK7MuiDENnN5lBSkI0AAEDT0o+7+7zvWzU3/RJJdgashQLm/h7f8DQWbX0dulE+M7UVu887\nM6zLsYk6VeRBekREPWF490L2aHMzSLrrtzNvFcucpCUBSQIk3QzvlDUOTZIM6Fafd2ckieZYi/Oa\nqBZzKu+qjBndolr59AMnSthsnuv3cqAxvPuNfd9EvcPw7gXPaHO5h8VILLIsZd/n7TMnaJE0c3IX\n3S5WJQOaYQbBtX98C7rQceDIAwAAsVQMHZEEAMCvKp77qbuS4d5/mCJyH0vxR5tn/l4e4a27Dox9\n3n0nCup8IqJMDO9eUFyBrcqq606xHsoG4X1e8pshbM/GJgzreUkgkdTxxqrt0GWzyq5UK+BX/Ihq\nMcQTZjDqhkBcT8/QFklF+vyZisHdtF30Pu/MyrtMwlvT0sfR3b35ROXglZUNWL+1bbAPg4qI4d0L\n7nW9PQPWemzyywhvnxnMwqq8Dd16Xjbw7Fub8bfFn0JSzI5wvxxEhRpCTIs5FZ4hBLqS6cAO55h7\nfSC5w9tdeacMDXd/+Ces2LEy18sKkt1s3uddFZW72uaANSpniZSOp17biN8/uWqwD4WKiOHdC1kD\n1iw9ZXfdqFDOx42kWXk74S0Z2N5kNoFLqtkRHrDCO6rFnYFRhiEQTrnDe5Arb1d4ufu8t3Y24LP2\nLXh8w9N933eZjjZ3BzYHrFE5K5fWKiouhncvSK5RNYprkpae3Pz94zC+60QkPj3G83gqYTbD233e\nkiTgXApY06X65QBCaghxLY6UZm5oCOFpKh/sZnMtT+WtGVquzXsl84unXAasuQepsc+7H6w/Z+bY\nBiLqHsO7j3yyAvf0qN0J+lVMCnwZRkcthKv/OxaVYRgCuqvytkmqGXw+KYAKXxACAklh9pUbRmaz\n+WBX3q4+by0d3rmWUu2tzLDOvO97sLgvWDjavP/K5aJsOOK5HZ4Y3n2UOT1qTxTFOtVGeluR8iMS\nT0HX0n3e6RdYlbcUQIVaYT1vPmYIIJxKjzAPJ7NDMvMfbCKl4911u5zqvZjczebJZPrnLtcxLnrv\niz7tO/N7Rx/gLyJNN7BkZQOice+88u7AZp93/7Fpt3TKpauJiovh3UfmwiSmQu5R9dnh7VqkROgq\nWjsTcPLUGm0OpPu8VRFASA1ab2pW45l93pnN5l/s7sIPf7cUb3603XnsuTc346EFn2D+si0FfT63\ndVta8doH2/I+7xlt7ro4CLtuYVv4Xu/fF8jRbD7AX0TPv7UZT762EX9fUu953N1Uzmbz/mN4l065\ntFZRcTG8+6jQ+7xtimIlvHuFMV3BLY+uRDhid3obTsVsh7cCPypUc8CbZFXjhiE8TdI7Irs8s6wt\nX7sLAPDU6585jzU0dgEANu3o7NVxA8Cd8z7C35fU521+y9fn7b7/3JD7tiJa1gxrA/w9tHFbBwCg\ntTPheZwD1orD/nOyabd0mN3DE8O7j1RZ6dWiCnblLQz7vxKc028FuiS5m83NKlsRfgTUgPVYesBa\nXDPv8z669itoT3Rgbct656V2FSP7EqhvMwPcp5qj4/vTbJ6vOvKMNk+6wtvVIiAUb/gV/J5Z93kP\nbFC2dZnHPbI64Hnc22wuEI2nsO7z1gE9tuGEAVM6bNUYnhjefaRIhU2PalNVO6itjQ1X5S7Sk7TY\n7CpbNgLpJnopfatYzArv0yedAgB4a9sK57VOv/A+a3DvqofwcdNa+K33T/ajSszXt5tvkhZ35d3X\n8M5s8hvoUcntYfO4R1T4PY+nXIP0NM3AnfM+xp1PfYT6hvYBPb6hzv6nM9AXZW5CCCz9cBt2tQ7u\nfAmlwlaN4Ynh3UeqrOLkoycAAA760qgCtvc2m8vCHd7Wn8E1YE3yJyAMCbLwwWc10UtyepKWmBaD\nLBQ89sIuHDhyf2xo24hdkd3m7uzAC5lBMn/Ty/D5zPdI9WPu8XwDX9yjzd39v+5BdYbct+VMM9/S\nEAKabmTNvFYq9mfOfD8tY5KWLTvN7oimMlxvvJw5zeaD2POweWcnHnulHr+a++7gHUQJsfIenhje\nfeSTFXzntC/jjh9NwWH7ju5xe6fytprNZeSqvN3hHYNIhqDrrv512Wo2N4CYFofQfdi8vRMnjDfv\nH/+0bZP5vN1vnjJHqTdGmyGpZgWZ7EezuZ5jGlAhhGeeb/e0oRHXKHhD6Wt4Zw9Yu/z3b2DO3z7o\n0/56w90FkDkoTcszYI0LbPRNb6vD9zc04sEX1xWlqozEzC6q4VqgsvIenhjefaTKKmRJwtiRuWdP\ny9o+Y7S5e7S63Q9uN5srqg7Jn4RIhKDpBnyKtYqY5K6844BuTtEaUioBAM+/vRHN7bF0FaOkB4nF\nVXOFMvfgqriW7hMvRGbl3RZvx/ee+ylWtb0PqGY4u0MtYbgCW+1bs3n2gDXzd7vSLaXWrvT88cmM\nFgv3eXT/LGekdyyhIZbo/2Q1A03TDWzd1TVg79fb6vCP89fivU92Y3cBTd2vf7itV5/FMATunPeR\n526NoYyV9/DE8O4j91SphVCt0eb2JC2q5FrW0x6wZjWLT5xo7lskQkhpRlazOWCGt6GZj8swt4+m\n4nhx+efpK20lHRqblXcANenp835iwzO4d9VD+KhpbUGfQc+oPjd1fI6ElsCyliUIHvkG4Is74W0I\nA5qhQYHVV9zHyjuzz3sgFwGx108Huq+8NU/l7Q3vK+5+C1fc/VaJjrB0HlrwCW55dOWA9eH3tTp0\n5k/Io7E9hsdfqcctj+afXz+ztaSxPYZ1W1rx10Wf9umYyg2ze3gqaXjffvvtuOCCC3DhhRdi9erV\nnudmzJiBiy66CLNnz8bs2bOxe/fuUh5K0Vz+le/hrP1metb2LoRdeUtWda1KKn71PWu61IxmcyVo\n9puKRAU03Ug3m9vN6rIBXegwUnZ4p5/fvKPTudIWcgp1FWMBAElEoY773FMl2iPUN1rN7T3pbrIH\nSTGgjGx0giypm8EXRJV1Avp2q1jSSHpaEIq95Gh3uqLp982cRU3zDFhzDTTsY7N5S6wVN7xzOza0\nbuzbDors/Q2NAFBQZVsMfa0OMy8oMxXy/0vmn2y4dX2w8h6eShbe//rXv7B161bMmzcPt912G267\n7basbebOnYvHHnsMjz32GMaNG1eqQymqI2sPxxn7fb3Xr3Oaza3wliDjgL1roMiS0w/ujDb3m1+Y\nduWdsjPErrytMBO6VZFb64VLio4dzRGzipEMQNYxOjAKFx30LfP5jACt9JnN7YVOr6plfAnYI95t\nysgmZxR2QrdmiDMqrffuW+W9UnseoWNegz20qbezRdW3fYZH1j2BVB/mWe+KuirvjLECqTxzm/e1\ngnz1izfRlmjH3DWPFbS9EAJPLKnHui2lvT2tKuTreaMi6Gu+9HSPfUE5LHX765DHPu/hqWThvWLF\nCpx66qkAgAMOOAAdHR0Ih8M9vGr4UuzR5s7tZVbftyJD2KPN7T5t1aq8k0GkdAML3ramFrUGrNnL\nhcIKbwjF83wkrjkBH1KDOGj0gZ7nbVU+c0BbOJk7vNvi7Xh03ZOQrIuJzCrHvtf8lJFnw4hXQK5u\ncyrUlNXfrYgghC47y6D2VhhmOMk1zX16/b2rHsL7uz/CxwV2Dbh1uirvzJDwDNJzN6FrfWz+tVpy\nNFHYRcb2pghe/WAb7pz3UZ/erzvukfWZF2yl4q4OOyNJrN3ckndbz/H11I3ShzJ6uGXdnlh5G0Lg\nd3//EP9c8flgH0rJ9G6asF5obm7GYYcd5vw+evRoNDU1oaqqynnspptuwvbt23HMMcfg2muvzeov\ndBs1qgKq2rum6p7U1lYXdX/dGdlsNT/azeaKitraavhUGYmUt887FJKBlFlZ+/wqWtpTwCjXJC5W\neAvdrIpGjrA+hxXOmiGchU1GVY/AXnXWrWzW/u3PXREIAl1AzIjmPBfLPnkbK3evQuAIGfH3T0f1\niJBnu+QXZrhVh6ogkgHIwSg0w0BtbTVi7eaAMkX2QST9gJrM+R7vbVuFUcEafHns/p7Ho/EUKoLp\nqk8Zux1GR61nm978/Xyh3v+9U64vPUOSPK/3B9LHJrv6XYMVfmc7dytBT+8dCJj/FDVDK+g4O+Lp\nC7Fi/3/c1pluUQm5Pk8pqT7FeZ9fPLQEja1R3Hftydhv75qsbSOx9EVVVXXQeV2u44y7rrnyfY6a\n1phnm5he+N9tKGgKpy+cC/k8w+Ezh6NJfNrQjk8b2nHJN78y4O8/IP9mSv4Olsz7ZK+88kpMmzYN\nNTU1uOKKK7B48WLMmjUr7+vb2orb91ZbW42mpoEbTdvVZX1BWAEqdKCpqQuyLOWYpMX6YhYyOrvi\nUCUFCddrneZva8BaW6s5ktsO/65Iup9Y0hR0tVnPWzO0NTZ2QpIkdMbMintXuAlf7GyCX/Z5+vI7\nwzFnv1IwjJaWCJpC5nsmkjpefOdTqOOARFQ4rQApI4mmpi7s6jAHOukpCdD8kIKRrPNtCAN3vvMQ\nAOCBGXc4j2/Y2oY7nlyF807e32yokAClphkpyfBML9ubv19LR5dn+22NYUgSMKG2yrPdZ9s7cO/T\nH+Pq849EY0u6RSIWT3le3+EKuIireb2tPepsF0+mq+jujvWtxmVY9Nkbzu/23ydTMqXj3U9247iD\n69DWnv73UKz/j1es24VdLVEctl/61sfWtuiA/DuJu85vo9XPvnFLC6p82Y2Dja576Ztawmiq9uf9\n99zSkm7ty/c5OjLOZVNzz68ZSlpb0/8f9/R5Bvp7sVQiroWEBvrzFPsc5rsQKFmzeV1dHZqb002d\njY2NqK1NV05nn302xowZA1VVMX36dNTX1+fazbDh3ELk6vMGkNHnbQ1YU+1FjmVougG/Yo3Ylg0E\nfIrTbG73ecPwNpvHk5qzTYUagk/2eZ5/7q3NeGfNTqfPOqEncd1bv8b1Cx/CZ9s7nGN2z58uBSOe\npuKuWNJpAZAMn3MsQtagG4bTbA5dgdB8kBQdCc3bdJ7Qc98+ttIaLLVw5RanA1JSNchVfR/5HE15\nJ0/59V/+hRsf/lfWds8s/QyRuIZnlm5yms1HVPg8zea6YeCL3el/nO4+b/e98O6R/d01Xc5bu8Dz\ne0TLfaG6YPnneHThBjz56saSNO3OXfAJFiz/HM0d6XM1UPO25+qXzddk7668e1qOtZAxEpnvM9xW\n4ervx+mIJPHBp03FOZgBMtz+hrmULLynTp2KxYsXAwDWrVuHuro6p8m8q6sLl156KZJJ88t85cqV\nOPDAA0t1KGUhff+vNe847D5vKV1NWuEtK1Z1bshIaQYCavo+74BPTt8CZjWb6xrM6t0K51hCd6rz\nkBqCIiuQhAzJev6fK7bi4X+uzxpwFg5twd3/SPehulcFU0Y1oiWRHhxlGMK5QJCFz6m8JUWDpgkk\nrNHmwlAgUubFR1vcezUaTaXf372wiv1FLqvmY0I3L07kEd5+UPMiQcsK5lw6k7mvhA1hYHc0/cVk\nT6aj6Qa6oklUhXwI+BVPiK1YuxtrXQPFtDyD19yz2el5phAzRPbjLbHcg9B2tZih/vmu0t7j3tKR\n/ruUMrzdrXG5Lm7yjST3hHcPo80LGayV+d7D7YvffQ76MjPh//z9Qzzw/JohNfVvrgmlhpuShfdX\nv/pVHHbYYbjwwgsxZ84c3HTTTXjuueewZMkSVFdXY/r06c5tZKNHj+62yXw4kK0Ba3a/tV15T6yt\nAmA1nctWVW6FNwwFKV0gqKbv8w74FSek7cldNN2AJBSn2Tye0JyAt5cTlaB41wuHQEJPoNJeKxyA\n0FTPVbq78lZrt+PvDQ8imdJR39BuTlpih7er8oaiIaUbSOr2iHgZ0Mzwrm/x3pIW1dKh61772/4y\ntS8OjC6zGVcOeQc8aprAw2sfw8+W3YRIKuoJccMQePHtLc5kOJ3J3IG3cMur+M27v3fudbfvCtB0\nga5oCtUVPvhUb3hv3tHh2UfKM2DNtba5PUJd0pHQcg9Ei2vZrQ/5Rv/b/w/phijpl1OLq0uglMud\nunMkZ3jnCdGwK7x7Or5CgjgrvHvY519eXo///r/lPe63XLg/X19Gntu3Cw6lqX/zXSwPJyXt877u\nuus8vx988MHOzxdffDEuvvjiUr59WXEqbzugrdHml5xxMPbfewdeiSsw7AFp9n+FDE0zYOj23Oe6\n2Wwup8MdsKoj4Qp1pG/NspcTlYXqHW2uaBAQ2K9mknO/t4hXpudgR+4Q+cvL6/Gv9Y046/9NgqRo\nELoC3YCn8tZ1A0nDWr5UV2AkzGOYt+kZHD/hSAStVdJirvDuSHRiZMAcnGR/v8jWoDsRr4DQFUhB\n7/GkdANrms1j//mymwEAPzjsIhwz7ij8a/1uzH97C0LHCEABOhPp4HdXa+/sMCfvWLV7DY6qPdwJ\n70RKRziWwoSxlYgndU94B/2utdxlAy0j/gUlPgJSMIIW3Q/AHHyXTBmAZCB45FuYv6kT3z3sW1nn\nM7P1AzAHreVi37FgGKLHirM/8lXeiZSOrmgSY2sKm1WwJ+4gyZWx+YI3Ek+fn54uYgoZaZ35Pj0F\n/turd1rbGVDk8p/nyhPehkAP89rkNZRaJIbSsfZV+f+fN0yMqQlaP3mbzasr/Pi3KftClc1wkkc2\nIpwKQ7Kq8ZRuIJGy/keUzD5vJ9ytyjulGea93q5wloLm1fLY0Bjzd6E4zeZAetWyoBLElV/5sfmg\nrHtmrAqnIhgd9C668q/1Zn/0xoYOc1CcrkI3BISRWXlbzea6DL3xSxBJM7Cjrv5cd3gv3rrUqdad\nudntixFdhYhXWp/JfZtQdoDtipjHZ37BC2cZ1Q5X5e2e6tReS317s9msbs+E12atJmZW3rInxGLW\nQLTbLjsBoTFtSFR/Dv8Bq+GbsAmrxItOU3hS0yH545D8CWzsyD0NbVw3gzKoBHDKPicCQM570qOp\nmPP31Q0BrciVhbs5tdm1drm7JeGOJ1bh539a4Zl5rj/0HirCfBcoUdd0sz1V3oWFd+ZtgIV98Wd2\nKQgh8PFnzWU3Ha773PYn1IbSLWdsNqeiGVUdwG2XneBUywq8k18okgJJ1RD48ofYEdllzaomIaUZ\nSKYEhCFDkg34fa6QFq5lPo2McA5EASFhTMhscpZyVN4AsGZjJ3738GdQjRAgG051J4RAOBVBlTWR\nS6akpluVt2p++en2RDEaNF044W1oMiBk6O3mYEU7oAHvILKPm9bipS3mGIms6V11FUas0hz17k+/\nRtMMyJL3f2FP8Lmmh+1IdDrv51621J4AJ2m9zl533V6UpLrSD58qw3AtwBK3ngv61Zwz7dmfMakZ\nkHxmOLfEWz2f3WZX3idNnIqJVXtnfwaYs9XNee9ObAq+CkBYK6sV98vJHUSeytsVjvZ88u5m6/5w\nh0GuUMn3GfU83RQ5t+1L5a27jyv//jOPb/naXbj3mdX466INPb7nQHJ/hP5c8w2lanYoXWj0FcN7\nAO01phIHpk6D3joOU+r+n+e5zBHGftkHVTFHmyeSulllywZURXaazYWr2VwYCiRfylkARA5GIeuh\n9LzoIqMyt5qk7XlzzIsD3ak8E3oSmqGhUq1AcrN5n6Q9hzoAJDTdDEddhaYJT5/3X15ej664GZS6\nZq+mZr7WDnXAW3kDwEeNa8xNnT5vb+UNAFIo3XSe0DSnYjyq1jzGlGEHp56ezAaAgMD7uz92nks/\nYU9ba430z2hTHFFhhjeQDji7sgr6FShq9rdh0khCNwxsbwxD8iec998VzZ4C2J7oJqQGnb9VKiPk\nP2hcjY5kJ8LyLsjVbVafd3Er77hrBTXPimnWZ3avsFasL3F3tZ85h735Prk/o2dq2j40my98dytW\nb2rJu02+VfIyZVbe9iDGTdtLv2hOb3gGBvbjNoVi/z9XSkPpQqOvGN4D7IpZU3H1cT/AtMO+1O12\nqqzCp5qVdyJlhndNtYq6kaH0wDO72Vw3zIFhAEJfXQqoCUj+BJRUFYQQ+OPzaxCNCUiygNPsbM8X\n7p5iVU734dn93RVqJTsCVIYAACAASURBVPTmCdA7R8OADsBuEk5Bks3Qjic1T5/3Z9s6sGqTGVS6\nZlXyVmVu94UDQDSjv7cl3obGaHO6/9OpvBWIlNns7p5mNZyMQkDgyNrD8c39Z5rnwqpaw7GU83q9\nrQ5CAG82LEdjW8QTRPY99kKyBt9l3F49wmo2B9Jf1vGEBglAwK84I+LdknoKTy/dhKde/wzwpZug\nd4R3ZW1rV95BNQjVuqVPM7zhvXLXh87PytjtVp934V9OWzq+wJ/XPo5wMpJ3tHE8zxzg9mduaDSv\n8tQJG/HytpcKfu/u9NRsnm+kuztce2w2z9hvNK7h6Tc24Z6nP05v002fd+b+3ecvc8rcZmtA11in\ni6w86D20cBS8nyE09Vyxu5XKEcN7gPl9Cg6eNKrb2eQAwCer8Cmy1WyuQ5FVCDkJQ04BkvWl4fR5\n604VDgDKSPP+eilZiVhCw/ufNmXdCy75zdCwQ1FYfeZ25b0zYgbNCL81QYDzeiu8EXFeH4lrnsob\nSFfYuqZ4Xp9wVd6fN5mVynXH/Bhn7GtOpdueaE9/Qcqu+9l17/EDQJc1rWuVr8IJPrvyjsRS6dHq\nsSroLXthV2wXfvXss3jh7S3pE23tLwnzizfzy7raVXl/vH0zVu/YjFhSRzCgQJYkyEqu8E5imTWo\nya68ge7DO6QE0pV3RrN5S7wN1b4qyEKFXNmRNWBtV2Q33tuZf33zf9TPx6rG1bjx5b/i9sdzbxfP\n009rn4+GRnNMgG/CJqxu/xDhWApPLKnvVxO6O0dyZUrmMqw276IwvWs2D8ey++u7azbPvIBwH1Pm\nc01Wd8OoEYFuj2mgGT3cklfwfoZQNbsn9HkP2Axr1DuqYt5fHEtqSGoGAlAQTnXhXflvgGwu4iKE\nq9lcl5wFFeRqs0lQSoUQtkfmWkHv+9IGpD4/DFLAXrnMHDls6GZzvH070spdq8ztIxMAtDmVM2Qd\nMFTEpU4oMEeoR42Uq/K2mrqtCwwtZVW2OZrN12zdBXUsMMJf5dzSFtPi6S8J2T52NT0JnSss7TnZ\nK32V8Cne4AvHNE+fubbjAKhjd0Ie0YpVG9OTB9n3w8eMMO54/37ElNEA9nKe19ROrK94BsrY/fBk\nwyLz/RJnOyPOJVflbUSrIFeEkdCTqAgoiCU0p88bAHZEssM77qq884V3OBnBmNAoSMlKdIR2Q0fK\nEzi3vncnAKAmMAL71UxCwJ7Uxz4uawBdomIbNm34ctYxAN5mczc7HKMJLf33ADD3pbVYs6kNmiHw\nvZkH5XxtT/L1ecuSBEOIvCuCefq8ezlgzT1ffa73BrxVW+b+3bPmuS/0hBDOQL5yC7nM0eZ9NZQC\nkc3mNGh8soqAT0VXxPyySfc3Cydw7C/7ZMqAcC2bKVeYVZIwZIStLys7PNW6bZBrmiFb4W3Ezfu8\nDatvWlENGMLA6uZPEMIIvPCKNWGIYc+/bo149pnNqCJe4am81boGyKN2Oc3Qeko2mxFzhLd7xLs7\nvJ2Kxj2TnD0Lnavytu9Dr/RVOLPI2f3F4VjK2b/QVAgtPdGNh7WNhhS2djag0f+x5+k2YxeSUgT+\n/dMLm8STOkKBdDcBACTWHwe9dbzzGYP281blXaFU5q68dbvPO2Teiw/vrWIpQ0Ncj6PaV4VqqRaS\nBBiBzpxNyvd/NBd3rLwPQgi8/uE2ayY2gdZ4m3ksqubchZApka/Z3AooTRee127aaf5/YfSjedId\nJO4+b7v1J6nluaDoplk7U+aXeFeOkfLdNptrmeGt53zOfftavhYDIQSefXOTZxbDgeAZbd6Ppu+B\nWqSmGPaE+7wZ3mXi9Emn4OBRBzr3OvtkFUG/4vzDE3L6S8eerGREyAy8aELzNM/KlWZ4G7qcbtbU\nXaOiJQEpEDOraWsCFfteclk2oBk6UkYKWjQEZ35Su9ncqnxl64vciFeiPZxIr3AGwDfhMwgrZFMp\nCaOqA1AlMzyT1rSpumFO8iKEOUNb0BXedsUl5HSfd2azPwBENKvyViucCxk7+CKxFKCmK3e4lk1N\nnwcjPSFOPkp2c3IslUDQnx5dDwBC8zvvsaO1AxV2ePviEJoPtYFx6Eh2eia+AbwD1pZ+YIb79tb0\ngCd7lrsqfyWqYN72JwKdeQcP7Yo2YmP7Jjz+Sj2WvN+AjmSXZzIcuTJ7MFVcS2Bly/KsVeeAdEBp\nugHZdZ99NGVdlAT7vmSonmcglWKHd54Q9FbehQ9YE0KgM9q7ZvPsyts1sM89IY/r4iffRcfnu7rw\nzxVbcftj+bs4SsGd17kGBvbEPb/AUFGs1oZyxvAuE//fAWfgJ0dfZt0iBqiy2WxuS0npL2A7qGsq\nrfCOa5B82TN1CUN2+vjcfeKSrEMKRK0mc8nZFgBk1XACUNcl1768fd72hCkiXoH2cDIdrjCb0u3K\nW+gKVEXGiKDZPG/fLhWOpszPofmh6cKpvONaHAnNACCQ8rUDwgxGu9nefTucHUqVvgrzVjtIzoC4\ncCzlDG4TKX+Oyl3Af9BKz2fPJQVr/vdPj4HeYYYnanYh4LcvqlyD6qxz8MTrG8zKXElBCkZhRKsw\n2mfeKpdZfcdc4b16o1khN7anJ5SxZ56r9lVBMsygFJLebRW0rvlT5+ftHebAwUlV5gBJKZQ9Teyz\nG1/EB13L4Nvn06zn7PBKaYZ5+6HFvmjpzz3NIk+zuT1oMpmnP7uvfd66Yc6alykz4LuvvHM3m7tb\nLqJGFz5sXJ31Pok8XROl1t8Ba3Z4r9ncUvKpeYuluwuw7vx10QZnEp5yx/AuM4p137JPVhH0eatl\nN2HIGGmFdyyhQW8dl7Uvs/K2vmxEOoilQBSSqkEkXTNluSpbu8/VHinuft4OTykQMydesSZnCfpV\nnDzCmkFMNiAk3ZqaVIJPlVFTYb5XJGmGVWs4BikQgxGvQDJleJrNkykdck0z9EA7KhP7mHO4Z1T+\nABC1Ku8qfyUkSYJPVhFJxLHgnS1WeNvN5n4AMoQhpcPfl4AywgxLvWW8s09ZT48U3m+vaucWPpEM\nQsTMufn9B6xG20irerIH1bmqe8jmjGxyVbvZzN01GmN85t9nc8fnnr+RHd6bGiLOHPbuPm+7X7/K\nX+lZtz39hZT9ZRxOpPvZd0fN/v0vjzBnN5QrO7NGnO+KmhPb5Ap2O7xSuuFtclfSa8c/99YmvPbB\ntqzX9iTfaPN05e0Nu2ff3ISXln+OsNTsdH/0ps9b13NX3lpGuOVbqx0AYnmazd2tBLvHvYSH1z6O\n19au97z2b1/8H/wHZy+GUyhDCM/FQ8Gv62cVav89GhrD+M2j72PTjg7c98xqRON9v3ArplxjI/py\nwZJM6Xjzox34y8vre964DDC8y4w9baoqKZ7KO4sho6bKbPKOJjSkPj8MX459A0YsPamK0CWn8naW\nEQUgWc3u9qxn9v4As9nZvlXJXXlnjVZXXCPMAVQEVewd2MfaRoMOzemHVhUZFX4zFCNWsOzobIYk\nCYh4BZKajpCSEd6VZr9gZXQ/81hzNJt3psyFEsYEzYrYp/iwszWM55dtMf/B+lyVN2BeaNjH75rn\nXa4I43jpAnNbpB8/cOJIp5lbaH7rIsDU5WswH5fTg+Ls1gF17834rKUBcrV5cWCER2Kczzw3G9q8\nM63t6mqF0GX88dkNCPnM/bv7vO1b9qp9Va7WAyNdfarZYbR5d5vzc7u1GEyNOhpGIgg51JX1ZaZI\n9oWZAXWfDVDqvjB/l1zN5prhad2RrM8djafw0vKt+PuSeuyM7Ma8T5/POV97Lkae+7ztqYTXbmnF\nR5+lBxf+c8VWvLBuGT6vfhnq3uY8+T32eXtmFzN6rLx1XXQ72txTeWu5K2+7p+mJ1z51LpQMYaAj\n1QZlRO5FZwpx3zOr8V93vdVta4emG57lMM337t993plTwP7+yVX46LNmvPnR9l7vq9iefXMTfnTn\nm9jZ4p06ubtBh24bt7Wjrcv8/zVfyLd0xEs6HXFfMbzLjF15GxCe8E6PJbcYCmoqrfCOpwChoEau\n9TRf64bkDFjzfPFat4l5mrqtn9uqV6fvv3Y1J2eFp6x7Xl8RUOFTFXMOckWH4QlvCRX/P3vfGW9H\nVa/9TN/19H5OzknvIR0SEjpEulIFiShYLyI2BEQR9PpD5aJX5d5XQbHAtYAIypULWABpIXRIg5De\nc0pO3XXKej+sMmv2npOQkJAE5vlAOHvKXrNm9jzr356/wfTMWax0xyDt5EUKSdiOhxjTYM+5eRSY\nJjgdgxEYq+w2H3D7YGkmKkxqERuqESB33eR9z/k51OD4GZztI2F5FehIjxDu/mRMxylzRvgxascA\nsf34rgGLndIG8VhnODZGNZaFPvlpkZvgZSphKnG0pVqwrm+DiGl7xENPoQskT5vT8LwDuXXqoBTz\nlhdQPO6rsAXKmMqRmBc/EwCwTYqZ9xeY7CuJg2QroJhF9GT7AxKnQo42MQijeQN0Rt5xU4fteMg5\nebylPwa10idSYXnnfCL58Su348mtS/D0tucwHAghIg8j2DDD30f+/Cf3Bd3PWgO18DU2Fsfx8Pra\nnmFL1uRzOR4JlXYtzXrfXZ33cAlroXFuhYhEtqCG/b4RAReWkRvHlOJbv34Bn//RUwGyGS488Xah\nlogfcC/Du9Uudnd4aMlGAMCK9cFFUVAlL/yaO3uz+O7/vIyb734RQDjJ9/Tn8dWfPov/vPe1sm0H\nGxF5H2Lgcp8e8QJu85OSl+CyKR8RWeeEqEjFDWiqItxXhq4G4rfE8RPWvLxvkYsabznWy+uwY9vx\n5zUPBT4D4Mufqi4AAqjB2vKERevS4eo0EU3xyds0NCTMIHl35ujLl1reXiDmXXRcEVsnpCRhTvXd\nxUNuH+pitejpz+P7v30Zjh20qDXTptYwczcTT2rqwv51drbD3dXC+qYbgOohldBx2xePRW1lDBk7\nA0MxAaIGLG9DYeSt+AI1gfkC/GQ3x4DjemhPt8EhDr551xMAgK5cD4jiwcumEDM1DGR4jbwjXLfC\n8jZTUtzft7x5XH989RjUaC1iO8cga4X6qwfXw8vSmv2/L1+BL972NJ5bSePv/YX+wHGinaylwXE9\nPLVlCfr1jZClCbjl3ZcpAIoL64h/iYXGa10rUIqubA8Gi0P4zSNv4KofP4Wt3ZlhNbdLX7YD2SJW\n71oLrWETVOba91gI47W1PfjRH1/Dd38d7o4W51IdvLTzFQxk/UUst4qD3+1hlf009DYa/y+zvAsS\nebthbnP/M0X1hFUnt9flHqF9xe6M561d9HmRPQHDLYzeLrRS5aJDEKW6GYFF2zBW86ad9J70MC3/\nsORH3tt+1cbesm0HGxF5H2KQyduSyLs53YA5jTNgKiwm66mIWzoqkqZI7DE0FfUVPkm7riLI29ky\nDsUNkwFIwiEy2Uj//0bvWwDoAsHfTv/fHLNMxHJlyzwRM6DrKrW8VQdEcaEpPB6uIcXc5isHX8fD\n6/+B3iJzKRcSKNgutnXmoCoqNnX3omh7Qq5UfAdhMWtOiEYBLhxk+k1c87MleHNzHwaGXMhtTxW9\n6LvMAboA4W5zVodOHJal7hIYGvMU6P6POGNnYaksN0Amb7Yw8VAU4QNSQt6KXqTKdVDgekR0U4Pq\nghCC7Sx5jeTSKBRdZLJ+jTx/6QvL20j6fd+lmDe3vFNGCiopDy2IVquOKch7xY4NAIA7HlwJQgj6\nCiWlSxqXf6WWt0tCrErNRUXSxMBQEUosCzVGX3KqomJd/4aApekRD//x4m345Yrf4cnXaDLQ+m0D\nw8a8S8l73dYB/PjV22GOXClkfVHiiXpdcq/L4C9xo/0N3Lf+TxhM+fFM/rIujXFvJstgtKwXf3O8\n2rUcAwU/L6DUba5YGcSP/Jv/5ap/H4ekKgO5MmRf8HZ6cpcuSDj25DbfuGMA/3X/soDrfTjyPpRy\nuEuH+HZi3p0lLU7DKjhMQyv77FBBRN6HGHi3MZd4Abd5OkGJQybvmKkFpBh1XUVrbYX4m3gaeofY\nKp9ocDtHiAYn/Bz+viGPQpjbHIA1eSn9n5KYNz1GBzQXRHVE85WYqSNp+eP86/q/+brmtonnV+7E\nt3/zIlxbw2CBveTYGF23xDvAPuelal2d0o+LaDSBTKUdxVy1ECBcriInn58vWlzPg6XSfXVDJu8M\nYiodu6gVB+ApLC9AsUXSXqAcDzSWbjHCdlxPiKcomgvXI9gyRInMy6XYi1AR96efuXeHbE7eKSGB\nC9XzrT5meadNSu6EoKScboiGDYgKkqXPhpyYtq2/F45EzqpnsnI6D6ahwnY9IfIiQzc8NFbHA+1n\nTxt5MmY3TAfgl8ABtAFNxsliTd86keCn6wrk05JhyAYA1m4rr4tWNV8NcHcQOvkshGEnt4ltnHxl\nK01uHSvvs7p3LX6+7C68YD8ItXon9Kb1cFwPHgsDFG0XetPGkkF6tIwS/n0EEBDuKRsv8fZIzm8n\nbC1n4e+N5X3lfzyOl1d34bkV5Tr85eMoP9ef1/zfbtX+9hV7Gnep5e0GLO/wY3mf8mSMl5mW73co\nl5lF5H2IgVvepJS848wFzcmbqIgZQfI2NEVYhAAATxUPKIXix39RalmHrDBD3OoyFATd5lTpTYei\nuVAU1gwFQNzUUBEP9oC2XZ6lreI1Fssjju5b1oxcbVt+80iWM3P9J9W0fz2uCkUliM/5BxQzD6J4\nAVc37bxGaDy9pDOb6xKYjFw1g373uv4NsD0HMS3BxufPnY0C8k4BLmxhvYfNkaHQc7oegcoFDVUX\nRdtFX44nDkpa2GyBMsAWXZt6ekA8BU7RD4koiivkTLmLO2kkqTEqhwYA5NwMVI/OPfdCFIlv+e0c\npLHCZHYU8svno4KwzHvdEfK8A5LL18vTcyUTCpK8xpuPQU+IOZRlcLn17xFPJPFpqor/emCZf97d\nSHgOZotlOR+q7ore67sDf4nzygoS8/MBuFXtegSKlYU17Um83hOMsfMXf3+BHpdVemGNewVG+5so\n2g4efHo9rvrxU1i5cRfUip7AsUrA8vYTqoazvPNOAf/+3K349crf7/aa9qTbrVZ24U3mPQPefsxb\nnvdk3F+YD7eYKP246Nr4+6YncNeqe3Y7vr2F63n42h1LcO9j4W11AZQ6YgJW9HCaCNt76LuxtoL+\n/sLc6283Uc0jBKs29r6jxi97i4i8DzHwhDW3JObNLW9D5a5XD5apo7bSJ0VdV8vIuxQyAQXIhoQ8\nCnsgd/m7EjFK3rL1yWO0MVNDMhaU7LSJLb6DJxEpngHD8jCiIeWXAknhQSK3PWX/5vNAR2MaJ8xs\nDYxXYdnqPMnMMjV/PjSnrDOb43rCbc47hf34lTsAUPUzOugY7E1UCtT2CljeQ12w3mBN4FwyTGbN\nP7NsOx54YpMYe9HxROa9fJ9UaIDioj9TxKadg+jNDQKOibVbB0RCG1RPlCxx8t64Nc+6z0neBcVD\nkRRgEDZ+fq3En1SejY5CEqZdA5PF8hXNgc403Tlx0fmk280YEd4WbnnHtLjwLshKev1539LnBNfd\nnwsmj9VtwD82/QuATzAXnzyOnstxxaLW3jyOnch5W+QtkvGYkp6iEtF5T7a89Za1UONZPLL1kcDx\nolQupMd63inikefpPX3hra1Q48GMZyhyzFsi72Es78c2P4nOXDde3PkqVvS8OSxp7l6m1IM14SX8\nz9q7sXGAVkS83WzzjTv9+/R2Er5K8wGyUmfEMG/NviKTc9DVl8fGnYPIOTlsGiwvS1R3Y3kPN/4u\nFs/mW12XAEYe8SMfwWObnsTTr2/H7//xVuixpXjxjU6ahf9WePjmQCAi70MMgZh3wG1OicVQfOvN\nKnWba6qwfADfsm6o9gl+WPIOURIjw7jNOUbUVfqHqwomtlcFysc42cRMXciJcnDxEz6GuKVjYmsD\nHGLjcxeNRnUFHWdBTiJ2Zbc3V3BTkU4YaKyOB65HJOUxy7syYYpriM96TKjQnbuQkoHjEphsMdLX\n9Dh6cr2iZGt2zZHivM6OUXD7a2ATGy/upPrvXBa11G0OACZbbK3fPuhnzGsOirYrVMrkudVVHVA9\nPL9qJ2761QtQDBq394h0P1TXLxdi5L1lexH5okv34d4JVmGQGeT3UQFxNbjwJ7UvT4nZKRiIW5oY\nLzQbpk7H1S/FefkCSTdcn7wNej5L8cm74PrWZU/Od3trjLxL1dOMjlV4YM1DcDwXhAAT26tw1GRa\nG19winCJC7evDs72MaKigTeMCUPfUAGPv7JVkJDcjc4cuRJqRXfA8pZDQNLFis5hgYQzBqphH1zA\naH3t0gLDldzmEnlb4eQtJ/r9v9fuxLLulaH7DVceV3RtaHV+WODxzc8AKPdqEELws9d/jf9dG1yo\n9PTnWRc8EiDm4cgvb7t4+LmNWL2ZlmxmbT+GvCvfF3rMvoCX5xUdF//96p34/gs/KRM7KvXWBGr3\nh1ns8NACfw4c1xNVDH9a81f86onnsKnTv++7s8K3sERBfr/fDUTkfYhBVcOzzbmVoTGZUUV1ETM0\n1ErkPba1UsiE0pPQ40c2paXPJHeYbJmXan5Lx7O9yzaPb6kVC4yBrI3KlIXjjmgX23lTkpipiZcc\nh6uxlxnLJm+pTeDoFkqSd6+6V4xHdpvLMWthgbsa0gkDDdWJwPWUlsNVJM3A9ei1NN5cnaIuccfz\nhCeBqDbuXf0AvcaqMWhPjQheuEv3W9e3EaYSA8nR+f33y+eXzRG3vAH4Cxtmeedt+sKvTib8/TUD\niurhjU19gOJC0VwQx4TrefA8iJg4J29uUaowaRmT7DYXjVmkBZur0wx5Bp4QZ+cNxC1dkLeiOeLe\n9hcly7tAnzdV84TbnBOXqcZD3ea9WXo8IUy6Vy+KEkYK/9nryXK3ugKTkXPe4wsxQ1wDNCcQ81ZT\nvbjnzT8LBb8f3PMq7n70TSxdxWK3hpSAVbMT1sQXBQm6HvGrGmRIuQWDdjl5F11byMNyD4ilxv3K\nDtUT3gWZvLmGQSl6MoMwSAIN8ToAwxPgcKpyj21+MqDBz/NKZPL9xV9XYe2urVjWvRKPbHxMfH7/\nk2tx9zPPIT7zceitawLqdjIxquke4bnY3p3BH59Yi+/9lraslWV4d2a7Qse4L8ixDP9C0cP6Aerp\nKG3y44sJ2VjbtwHLi0/AGE1DII7r4anXt+FP/1obOIaHRGQJYPkdEZu6BErCf/YzuxGl6WFW/HCS\nvgcCEXkfYjihbSEA4JSO42GZ5daAcFUzy7u1jr4oJnVUY1RzhbAeAQh3LHe5A74rm26XasI7R8Dp\nbIPT2eZvl9zQJJeCN1SJRNHvuGVqJs4/kVoZU0ZS17HIqAZQLPjkHbf0gIAMjAIjW7pPU20Ccxpn\nYFrdJKzr34AhbTs7h3TxcsyaK615GtIJk1qBsopcSUb9rPH1qEun/HMxxboYUzVzXSL01wFfxtXQ\nDJhG8GfCSSTjZGEp/uKptT6FUpiaLITj66sXbQ95FhNorvGTDGO6IVnOvshMNu/QlzBbwMiWNyGA\n6rG+6l65d0K27ImniVp2wE+kKuboPbKYWA50h1U7EAzZQ1DsONA5GvYW1pVMc0SiD0+aMxCDxa5X\nJm/umvcGqJiOVrsNg3JrTql0atsQJVtVVYVlXfAYKTAvCl3EObAMSU9/1HI8ufVZPLLhn3A9V5RM\n9bA2nUqImI3sNlfCyrcUV7yMB/dgeXMPSEz1PUCK6olqD368O1gFNZ4RLm0ZWTuHQk7DB0efAcDv\nA5ArOPjt31eL/Rw3PKntjV1BFy8PXcge7KGcjd89/1TZsX99diNyBiVEo3VtoFc5J38lNgRr0guw\nJtM6/lK1uqyUUf+/ax8WvyEZL+54Bc9L/elLUXSL5fr/kuXN8asVv8OPX75dhMe4Vfzwhn/ihy//\nP2zxVkKv2waoDhzPw6/+7w08tGRjwAshpH+55e2RMg+kKpP3btrfdrPnbLgGPwcCEXkfYphcOwE/\nOf67mNVwBCyj/PYYnGBUDzFTQ1XKwq1XHI0vf5hm+fK4LQBBvgH3ouwelC1vosHeMBVeVs5WD24v\nrJyPKUnfhWxqBj588nh89zPzMGMctRZiElnlmfEbs6jbvLBsIYobJ4rtinR+njRycvvx9OsUjyXE\nlMfdrSnPCsubeNTytgzNT3aDH1fk1xC3NMwd7y88eDcxUzOggCa1aFKHXMI8Dbyvugw5NGCqscC2\ns0afiqq+2eLvQHvOgHyqi6JXBCEKWup8z0jcNH0vCCccx0Qmb9MsbE+lMe+Cr3QH1wAhCLjNdU3y\nTngl91+aJy7/6hQMxE0NMS3OzmvT5iu6DZe4UPKViO+a5hOo6kiWN53LJ1/y+8bLMW+e8ObsGAli\nGzDa30Rfwbcqq6sly3Dlb2BNfRquPgRNVaEqCoqk3PImqitCSfLcPrrxMVz1xNegN9FSL9cjwoPh\nZYOLq0DCWgi5Q/WEBSqTN/GYfKtr+6ED9jwljARkHf3BnA3Xc7FhYBNMLyUWMLe8eBs2D/ou7oJt\nU8liR0c2xyxCRn6PLN0kyc8SbM9tx5cfvwlPrH8hMNwEy81wulqhQRM6/4E4t+KhR6WJX2rp619a\n5PXYdBHVP1TwNQXYgpiXBZaSmWx5bx7ahqU7yrPOf7Xy9/jNyj+Iv4u2G9B8v/Wl/8Y1T92EXfle\n/HrFH7Az0yme9VIZ1NV9a6HX00UQJ+A1fesD+6iJwYDbnH+XJ2nYc0+G63riPnLIev6lynUyIvKO\nAADQVPYjUspdedzyVlRPuNJrKmJCwjBgeTOrNpDMEaKqxlGZNANxW0teCDA0VfrkbmoGFEVBY7Xv\n9pXJm7+EYqbGXKBKILNazlavTtPjWpK+znhpkhx3hauJId8t7lLL2zTUACnpsWLgHKauBRc2jNhM\nzYSmqXC8oOXNyVtX9fLYqpQ3YKlWYNOpI09ERW6cv13zr5d7PbTa7XijawOKjg14KtobZcvbpN4F\nkDLL2/OISNrjpMFnvQAAIABJREFULwlFt0EcAx4jb3gaFIVlC3P3eWkSIRfaAc1GB2huQNzS/fun\nOYiZmp87UIhTS5yoILaBIjJSzLsI4mpYsqwbdz9MXZM524/r8nixl6mE09kORSEYcCh5X/Ghqaiu\nCU6vmhhCwaRuV8NQy8ibXoODdFJ+PoOWqNEuNVlhiwsu7AJQFz63vF3J8k5qKcQd2kRGUT3YbJ4H\n7SHEtBgaN58HZ/toAIBDbL8Gmn1Hykj4vyvFQ6Ho4q3e9cg5eaSdVnj9dWIMXUO+KtiWHuZKdw0M\nZei4RJMdRhp6y1rE5vwdD+/6LYrI4c8rngxc85Cdpde1fip01RALKNntrbe+BcegnhAufyyseOn3\ns9T5E17dvAFf+q9nxCLHigWJqbQ3Oq9l93rpb3ht34bAdtlb4HgObNfG13/+HP7th/8Sn29l5ZM3\nPPtdvLDzZfxr6xLkmOVdCHNJMw8cH2NNrDqwWUkMBkrF1vZuxuceuwbLu94Qn8ltb6EHr4mrJAI0\ncY4QgnvefACvdvnhCdvx0McSE4frQX8gEJH3IYzm2gROmNWKL15whPgskE0eAqOEcDVVCcgbkuEs\nbwC1lbEAoafiQWICgNYaP0lNjudyWJLb3M821/06TEk0hYu4AEBVih4XsFRLM+Cl97OaYGVWsuWt\ny+QddJsbuio0vGWYqgFNU6h2t7SY4Mk3pmqUkbec9CeTsxibNN+xEMtbjWXxt/7f04Q1TxPudtNQ\nYeq+Z0V0RXNMZPIOtSCY5e1fqA04BmzH893mAGJxlLnNP3bqBEHufFvOy9JnytMRs3RhvampPjqn\njLy9giU8EKQYR8YblFzGRX9O2Hf15XwrLONkaEzZMcR+3EqLWRo8zd83zix/T6X3z9BUOGD3UnwH\n/d5kXAqTlHTVcwerpG1sIWeb8DIVYpz8he95HqAX4RXi+FjHlTAddqzqsg531PJOm0nYDoSHpugV\nxQKAex8qrKT4XXHPx7Iu6vKOF5vhDVXD3joGAPDcm742+NZeupghjo6BQWZpMsubX6Wa3hUoAyxk\ng7+/jJ1hc6RAgyFCF778bT/05vVAMQEvkxZiQaVqfRxLNvtlc+NHVOH8U/zcDzW1C/IP8ub/eQld\ng9TFbO8YgYQex/qSJjy25xPj7ct+gy8/eQN2ubQpztbuTKjVammmkKQNI0au9Oc4dCxyxjtA3d6y\nbsCDq/8BAPjjW38Wn/FjHdcLvEMAQElI5J230Vvow5Nbl+Dny+4Sn+8azIuZiCzvCACoxfzRRRNw\nxBh/tT4hdQTcwSoU3pwdekwpucdMLaiQJJM3CZKZkDdlkBOpONpqq6T9yxcSlaZvRfKXWMyULT//\n/LpE3tzy1lTNJ9mSxUVx3RHwWMIUb0nKY96moQlXOOCX9nDi0jU1IBwiX4OuKtjUOYTfPbpOfM7L\no/RQ8vYXKLy0SoYWIG95MVOSw6C6UIkuLNiKhCnlNDjCCiCOQd3mhJM3LwVzoai0tj5XcFDgbnMA\nlim7zTUoACxDCyTNAUCB5JDQaC5CwtIRN+j86rU70KmsgWKypKd8TMwDKcThEgdEp5nJil70xXDY\ngi3LuscRQtDv7PLbz7Lvz7t0e6ezCTuTVNr0kxMvw1ktF9Ahs/71pqHCUYKVA2JRKB5PAhhFjEx3\n4OjmuXRqpC588iKosHIezGKNyDsAaLKiYhQB2/QXSACgUMvbIx6G7AzSZgq27Sc2Op4jkTf9jsp4\nWlLCo9v6cixbv0jnmeTpwAekBc6OPhZGcA08vIS6yGWyo99B5X7zrx1L50hx8M+XttA+5ZkiuocG\nxBzpii5i5tzw1iq7oShAYeN4EMeEogDZYkHEkkvj/tttP8FLU5WA0Iw1+XmYE18AJ/A1W/rx6rrt\nbJ4NjKrsQHd+FwaKfqWCHMte2fMmrftnv+MbfrEUP/uLb81yFN2i0DRwPSJCFv7AWNWJ68sJK0RF\n7oVF9JqsbMBtvnEbnfNdhV6YE16A3voWc6F71I3O3iFpvQLENpnbnC0M8g5eWeu3C/WIhy2dQ3h+\nVac/3ihhLcJwiGtxFFfNg9dfH7q9lLwtUyuxvOWENXr7501uxPc/O5+2/pMs7/GtJf5MMMuCn1sr\nt7wbEv5Cg1tIPGv5e5+Zh7PmjQ0dK7e8AYiM5dIMYJJP4SOTzg1+oWR5u92tZePxX8R+9q0MQzWk\nemH/+3gs2ND0snri0fX+NRoh5C3PtxJI6C8JA6gudMUQZXQVSRNtKRqX16q6AuSbZZa3r89OAuSe\nLTjCbQ4Ahkl8kvc0aBpLAJOS5gACm+TAeSNmakjr/uIrhz5R1uQWLDEPXoH1ZlcGac9yzRPhEL5Y\n4q1fu3O7YKMgLF5ueXsKJYo3B/3yqE1bbdz10Dq2ncdXM7Br3gKgCNLjC4BYjL2U9SIUBUjoSVw0\n4TwoTgyKbqOphu4vXP/FGEBUmAodP7f+O6uepIsgT4Preb4YDot5d2W74REP9fE62K4nFp02sSWl\nO2Z5mwkpt4Fu4yEEl1Vf8DnK2QUUbRe/fGgVVm1hjXpcXWyX8wb4dRLHFGI7ikoT2VZv7sMdf10B\nWymI+VWhS25ztsBgdegkmxZz2DkwhKLtQW9eC60mqKrW7/o1y6qqBKRhAdAOaZK1nnPZ78s10Jyg\nZX49OT80kA35/cmu+tfWlau6FdyicJtD8VhIyQfPc+GLqEwxA89mioKuCkVzgqV10m9Qq+wRTXgc\nhzDLm97Hj7Z/Bl6mIuClsl0Pv3/CL9+7c/n/4DtLbsMDT/niMZHlHWFYqHu4Y2aJNWwZWlD3N1Aq\nRh/kptoE6qvi0FQ1QO6TO2rF/08fU4svXzjdj8ejNL5OURvzCZ94Kl08MJd5Q3UCR030CVa4iAGk\nErIrmrfwLL/YMXV+TJx386pImNA1Bc7WsdQqIeUxfuIRVFoVpaeDqRmi5EgJqXU3VCMgvfjZD07B\n5R+YIf7Ww8hb2t+TpEcntNUFd9RtqIqOuGR5z2+eCxAFesNmP6Pe1ZHJ2zR2yaw6a+ozfptXx0Au\nHyTvbGIjI2h6nzVNoeTLXtqJlEvdpooHz6afxS0dST2Jwlv0+lzFFpa3V4gL0RauVpbxBkUSk8hl\nYN+/dscu/Owvy/HEm5ScSYaFW3jZGrdwTN+789yrg4J8XOY2R7IXUF2MVuaCFBOB73hsgJYUcosx\nriawoycLt6hDt1xRiREgb0BkxOftIjziIR9jFmM+IfIKAFoOt2pjL/64lNbzt6aaqQyqwlu32uVu\n81hSkD/XyOdVBbativtJPy/gsZe34ull27F5Fy2Rq02m/aQ/j7vNFYgcCFsqeWT3dzBrY8122mKX\ne0Dyeep2J4SAe43VWAbEU0AKcXGN37l7KdZvH4AxgvUzcHTkXliEpNMIBzZ4GZ+mKgErmkP+zdhM\nuY84hig5zEreroxdImID3+1N57A8abDgFkTCGkqSyVJGUhCrIyzvrK+q6OqA7gT7juvhSWe268F1\nCfNuqMgXJE8Zu8ai7QbG8GrXcmjpXvEbEfu8S4jI+z2GcLe5VPIVEvMOWJYSudek/Bfr/KlNmDra\nJ3MAAUEYDpnc4WkBlzk9p2+5VyV88RiZ8MQChBGVSGarS6IuLnkDPA26ptDEKkUBoIAUEtAhuarZ\nS8ojBIs6TkBV/wwa72OQyZk37pARqJsHVZKrkhYBCb085i27zT2pfj5VojKnKABcFQ3VCUwbXYu5\nExtQHauC5VZBiQ8FMuppqZhfh6omhkQmLHENDOVtFGzfbd6XXI7p09l99VToqsK6zrH5GPMMVNZb\nmj8TBduFoijwhmhoxEYOipmHQhTAlmLezPLut/tgxIPkzc/Vn83h+VWd+PsKSt6lljePLTpMMCb/\n+kLs7CmIuHavthHfff5HUHU6B5br51qIlynJQzFzIt5tKQlk8g6Ia8BVCjBYtUYpefNcjbybD1iD\nzrYxrByPC9FQ8n19K81gbk01U8ubu80JJe+EpQuXdtKK+d4PI0jeXPdAdPDzCsiKen3672lzx4rf\nnS2XWqksROKYoGI7qng+sgUnQJwAkMl6ICBwPMePeceyIIUEAFXyDri49wnfclR0ByAqFM8MzLWm\nKqHlcmD3UYkPQq3sogtqT4NC6Dhkb1fGLre8lVjWJz+jnFjzbkGUivE5cnc14gOpy2CqlvjMcT04\nnoO8mxeqisTVoaiOmGN6fSXfwd3ujkcXAJoNuAZts+zySgJ/n7LjpTkChkmqO0CIyPsww5566IZa\n3oGYd3mdNycbz/MCCWvyQqBUfrB0eygUglhJrbpsrVt6+PG8QQh/iR49tQkfP20irr5oBkzNpCtu\nNv50wixrSiCTN38RVqUs6KqO6sKkQHtUUzVEOdCYmjZ888jrMK5q9LDXaGhqoJZ9XHMdPnDkCNx0\n2Vz/slUFhVVzoearsKDZF27hgh4yKhMJ6JqKL104HfOnUq+CiQR9YRh+0h0tFSOB8h7enAWOgf6h\nYHY9APSxzm10kaMGLG8A0OtYwhT7bOG0ZurZYQRAyTEPnSQAKNA1BQumNWFEFQ3Z7Mr3wUqweHKJ\n5a0YBVimImWrJ6Brip/YptlQFF+qlQghGVVoxW8Z2ibmwHN8qV23zw8ZKUZRWN4WErSch32HbrJY\nrsVkMJnHIGXRf9/Y0oVfPkL7NDudbSDFOAtNcPJmI0pQi7Ml2URj3sxtTsnbRdzSka70UBmjrV1L\nyZ+7r4u8RxD7DRb1XRhwugHFhcZEgyqsBGrScRAiuc0VOW4vJe0x0ti4Y1C4r0lJ7kHRsyl560W6\nwGChB+Fh01ykkiE04AYXWSqzvHUvjuKGSbC3jaLbmSWqN9MFDsnR3vSKS8chk3e2pH4bAPSGLYjN\noNnmhuWTYFKpggIFBafot2FlY/EKcShOnC7CJLc5j6kHLG/NoUTMUGrdK6oHKJS4HZewcj0ahvIX\nOPQ7MnlbkLfT3QJnRzubA7o9bmmR5R1heBT3QN56iaVoGcGENeLJ2xXpv1wmskSqkyGsLWCY5Q0A\nY6voD5vYFsa0BF3VMtEamoFvf+JI3HrF0SXnpeTI5V1jpoZjp7eIuHiVVcmuRRMNW2QYEnlfcfZ0\nfP7caRjTSo/RNZW9YPh1aSJO1VSTQGOqBhWmbJkH57M0/p00E/jwiePQ3ugfo6kKvMFaJDedgOqY\n/7nc7IGjpabc2rcU+oJVOem4GnIFF7miCy3mZ1VziyWQxyBZ+l051vCFuc0NPRgWEYlVnobPnD0F\nNRUxen+IBuJqKHg5KLotXsS6ruITZ0zGNefT+9WT3wU9zsnbEucCAK1yF5qmv+W77l0dLbVJsVDQ\n67Yj1rLJF3MJkZYFAM9gCnBF+twkYjq83iZUDbIKDL0o9NKTajWyeceP+zJLTjGZNeZpuOXf5iPN\nyHv11h68vtFPsgJo8ppX4vZWrBwsNYZ7/rYJBP4C1CU0YU2zCsg4Q+iobGGeJhWEKFA1Rt6eDVM1\nYLOsZrHAqejBC7gPWsNmaJXsGow4aiuo0EvOLhey4fFuIkkFr9vZi9gUKpzCFy5y3NzziEgMEwtX\nISTjoChJ2Y6qYhnlJeENVSXoLw4grqThdnb4izUtaBUX3jgyMA5ZMjUjZYIHcmMAQPEEeRc3TMIM\n71xYmomiWxAxb1GD7eooFF0YqinKHh2X+Cp2IrFRh6J5yBSkeWTkrQ42wB1gZWWqC9vxqKdDs0Fc\ng3lwuOVNv38wa4v5cLvaUB1jybvsGY9behTzjjA89mR5lwovWKaGie30ITthVmvoS5KngHgEQQlR\nibiUEPIu7fTEccX0T6Bi2wkgmUqcefTIYcdqqDra6lOoqYiVfS6j1Hr33dYkKNTBj5dUz2pSScwc\n71tqhqYG6n0BX7iBZ30nDD9cUGZ5l2Sex8JKxbgbnhDELR1nzO/AvMmNWDituWxfSyuPmXPyVrhl\nzd2sRRfElPpCM3KXQx2Vdf6LWGQrexp0VaVubzlhx2KuVtfXntfZfSaOgSFniMqzshc5d5vH9TgS\nehy7cr2iJI+/zD+4YIw4f6eyRri947qJz35oqug0BwBoXYmCW4ACJZDMJ5frOBq1evM5Rt5snKpD\nCXj2tCTMup3wCnGkvUYqYcleuq/iL4DqUPJm46urjPueE83P6OcvfM8jovWqxvu6aw5yWQVLWJtM\ngy1aHTh0MZ2gNdod6XamSgfAU6FoLJud2DA1U2RNBxfQ/iKNz21N2gI8Ddtz23HHsrsgMvqlccLT\nxMKoM+tnO8ulcIBP3rwlqli4Sm5z3oa3Wm3CN46/KuCh4ffC1bLwiIdxDS04YVYrxrXUse22P5eA\nOG57Fx1vgLyZZXzVjE/jqhmfDswBvRd8gRLDUMaFpZk0YU3EvNn8cfJWTJFQ5rgeduWpp8kX86H/\nDhX8MSi6DS+TRmbVLH8Bwo7f5qyHogDeUGXAbS5yC3K2mA/iGJg7voWek2kimHpkeUfYDfbU67fU\nhZyMGWitT+G2Lx6DxaeMLy9XAkRrPyrmIFnGEonK33v6yJNRH68NxH5lWJqJq886ATd8bI7I+JXB\nm6+UeglKt/NyH8sILjgqmeWt6DbSyZBac0XOXA9+h6YpActbBpf7TOp+LH5PlndY9yQeo+eqcecd\nNwafPnsKmmuTqDQqA/uGldvFWekWfzEeN82vr1WkF7/O483SgqxdnVZ2PuL6lrdcIy7I3/W158e3\nV2H+lEY0pqtEaRBPaNOlhUtNrBo9+V7U1zOyZy/CU+YEdeCJ4oB4KmaOa0RTTQLfuuyowPa8W4Cl\nmaiuKF8EAUBRoyV7OUbeosENK9dzk53wFAfurkbs7M0hm7eF29RGHlrtdroAkcSBYixPQdHcMnf0\n8nW70DfAXMUa70jmBBZIwuOkuFTVLk5Jo6OizV9oen5M2iU2DCk8M5yXAQDqE3WoqYgJ1/1rXcuR\nJf2wJlBJUd/y1oU7l3sv7O0j4Q0wi5aR8zPbn4dLiBAb4QtX/syYY19FhtDx1+ktSFspGLoKj7e5\nZZamrdHjm5J1+OiiCWirpgaBYmVhjnsZWsUudk56n555lWaqb+rpFdfG3ea5IR1JI/heUHQbnsEs\nZ9tAf6YIS7NQCIl5wzGQLzq+qJLqoug6uH/NX+k1MhU7fo0i1q7QOm6xuJcaBdmOh60OFW5xu1tD\nLe+hrB2o8EjH6Hvig8eOwHc/PQ+WoUUx7wjDY97kJswaX4+vLZ4Vup1nVNdZ9Zg2uhZnL6Qu7GSM\nJWaFkTezvUvbBcqiJvKmM0Yvwk3zrw0mp5WgtjKGUc3h5M7JebiYOd8u9MdLkt7SJn0BKbqDdLyc\nvGXLu1RIxtBUv+SoBJwYZMtbLyFXTmAfnXQhxlaNwsiKkqYlAM6Y34HT5rXjU2dNLtv2hWlXBWr0\nwzL2E5rk1lc0jGzyCb+m+xhfI54n+kj3tEFvxw+O/ffgCT1VinlL99RgbnfPz3jXVBWfOmsK6lL+\nvXOKLAFLWrjUxqphezZ67R7qvuS92y3//GkjhYq0CsXTce6xNI9A04KLy135PliaiY+dOhFnzO8o\nmwvCwgCdPVRHnN8jTmI7MszqtC1s685Qy1tWA+WhBdvCSbOobn/CYG1Nx7wu+otzwn9pdZcIJ2ga\nK8nTnMACSSgPcnI26QJjRLoVlsmS+jwNHmhfe9uz0dMnJToNoxxYvXURLM2kSoeyVCk2+vMhW95C\n598RcyD2Y8f/c9OT6NPXQIlTD0ap5a1oHvQxNJuee5Fk8halWCo9vi5OibE6Sc9jtKyHVs3ugfQc\ncm8N11bo7suhM0sJ/Sf3vFn221fTPSC1G0CKFrxsBdZvH0BPn4OcUxAiLVzbgdgx5G0XGgwxxgIZ\nQme2G9PrpooWvdzyzhTZ74Qt1OIaj/v7mgeO6yGDXpCiBZJL0wx1fs/ZImkoZ0uuewOVTGggFgcq\nUxZMQ0XRdvdoYO0vhJs+EQ5ZWKaGK88tt644UkYS35p/HSrMVHhMOqS1J3/Zlbb+k6340pZ77wS+\n5R1O/pogb/riLiXvuJThHeY214gpvqd0gUHJV8XE+Cx0NFQFtnELf3duc+5Wntc8B/Oa54SOP27p\nuOD4saHbUrFYwPIPu0dJPQk4/vbqtH+9llcJe/0UWJOfB1GZFSBZhZahIaZbSOoJP8bo6dBUBTFL\ng9vTgqJuw+zw5SHh6mVd32TLyCkyYpeItyZO44V9hX5U6JXg7RsURUFh9UxY41+B7dlIWAZqrKQI\njWglnouck0M6UYdpo2sxbXQtHlqyEYWVR6FuyhoMEhazJ4Bjq+hoTQjPByfvXqaRbqoxbO/JImbq\nouc44MtbLpzSjounUNnadExanNWzpL1Aq1z6Ha45CKj1rCpAmmON11mzeD57oSeNhL/wJCo8uEhY\nOlzVLbG2gwsYTt5xRp7phBH4neaJn+XtDXBi8suYeLlVIJ9B+v+COgjVGoKXjwsPguyB4z9z/rsy\nNBW9/QRWo+/9Kap0DPUsVl2TKM/VCHw/I/8iyeP1tT348f8+g9gR6+EO1ACuIQiZQ6vugqIAxS3j\nAU9HvujCLChQzaLoC6ym+kEIdWsXii7i4HF5FzZT4RvoKxeEGsxnAZjQ0rS6okKvQhcQ0DywHQ8O\n8ZUCs3lb/K54eCJrboNVxWrfPRWVcbqIzjssVBUbgDbiDeSKp5XNzYFAZHm/B1EXrxk2mexblx9V\n9hmnZbIbgi61yt8JRFx+mFOqSnncXkZCcmvL7U55rJlnm4dZtfzlf3TNSTh7zKmh35PYjdv8ncLU\ntYALN8z7kNb9a7JUU7jhAZoMN29C0NqX3eomW4BUxZi1ThSAKNBUBcmYgfqqONydI6ES2UrSAhYz\nAD+jHxAvYpng5Xr+ilgSs8bXi0XloglzUUmakXcLyDm5gJiPripCHpQjVhL394aqMdM8xf/A1QEo\naK5Jipp8txict7pUBTp7cxjMFuF2t+KktuMB+PKWDekKUXWRNMt/G7LkbYJtH4qvg9FOFzky2QkJ\nYJ6Mp9iIaVbwuXV05L0cYpVDVMSmpLJDBifvGHvuUnED8o8jD2r1FlbPFORbEaf7ajXbpQ575RoO\nAPWsKUYRhCWr3fjxuThnQfnikqvrmYYq7jl3mxdUujyrZ5Z3Q0U5eZeqNxJPRc7JY9naHiEA43bS\nZ3XZup7gHHDPgOwV83QoCi+1I1CT/SC5FFRCyZ9b3takpbAVapWv3iDVkrt8AcF6rTdSQZZRFvOI\nSZZ10XHhoCg8BnLuBPds8GeBjRhxg3fQo+SdSa6F0bwBm3qD7UoPFCLyfp9hREMKN867BjcvuEF8\nxt08Lvt3XP+5uOGoqwPH7U/y5pa1S8KTO9QSy7s05t2UpOpN9bHaQO05fzlzyzuMGPnLP6xjG/+e\nZIjlffHJ4zB+RFVACW5fQL9fETHN/kJ5b2ceFgAASw+St6oquPC4oDu+QrIkeQ9s/pKloQefMK5f\nPBvzpjQibUrk7Oplcyxn3E8f2YxPnDEJx83wBXZ4xj9A5+vKc6dhFksMvPCEsRjZQL8/5+QDSXma\npsLZOg7FdVP9awxJ2otp/vg4cTbX+Za36ypI6v51N1VVwiME67cPQFVUnNi+AIBfTiff0zE1I/zs\neAauugbQOefQG1g3L4mYDFWjiyJmeXuqLeLoHPa2MSDwUGxaRj/gmvNmubdJMQsgRKEd5cDIW/N/\nG0VlqGwMHfXU82GOXAU11ReYJ/n7AKDIiJfY1LXb0ZTG+NagZgPgaxYQ4ru9eYJWERkYqiEWdfXp\n8pBYaSIeHAMFL49ETIfCeoDzkM+ytUHyVpmSn+w14Za8Yuap7oHmwstUwjI1arm7vuVcTG1mx0jN\nhQIxawIt3Y8RqTbUWLWB8Sqai6ydp78VtmjpzxQDx1tSCaCzg4Z3+D1/bPNTeHzz037mPQn3KO5v\nROT9PkRDog6VVvnKmbvGLSWFpmRDYFtIXtY+QxXkHX5SlcfaWcy71FoZXdmBzx7xcXxp9hXB47ik\nNCPv0pp3gFs1VIq0FMJtHmJ5nzJnBK67ZFawZn4foCgKrr5oBmbWURWz0sQdAEiYMVHrbGkmkjFd\nkLKmBUkLAJKmFONn19DMFjgcfOlVmbLw6bOmoDImddjytLJEx3qplOeoiW1YMK1ZzB2AwPOTCLsG\naQ5ly5vfS/klHUbeimv6nhNO3jVJcbzreUhLY2itoZ6ATN5BIqajwkoHqiHkMVZYaehvLkL+9YXi\nM/k+xIzdh5scj8BQLKiJQSjxAXhKOXl7fY1I6Wm4Zl/g+NLKCQFXF2JKybgRVC5TWaxXImdu9QF+\nXH+4RLiiTscwsq4ON1xKQz1hiZIJk96zwWxRsjqZ2xw5VJgp8ZzIz5x8DYCfsEk8DXllANvct0SN\nNo9Db9hRrtYG0JJD/qyLOTviaRH+IPkkYqaGfNHBUM6fI0/3NQ9Kx2N0rGJzRFATr0KMe5mE5e0K\nsR5O/rmCFPPWHDom1QOxTdibJtFxSc/tfW89KMJYils+twcCEXm/j/GlC6ejrT6JY46gJQ+cvGWC\nGsvqoxtq4uUn2EfwOHRYpjbgW+Ya++2kQmq5p9VNLluAcLe5RuiPKszyPml2G7568UyMaPDJ6+On\nTUR7QwpjWqk1EbS8939ayOSRNbjsiPNxycTzsajjxLLtluRaNzUqQiMatygKNFULvDiC5E3nroy8\nSxwnSSNoeZeiPu6Tt0zEHBVSA5rQBUiAvP2xcs+HHDoI08h3XCI8LPwl3taQFDK6iZiBasn676j3\n3fiJmA5VUZHQ/WssXfAYugqST6Hw5mwU101FXaU/3ngIecvEWSi6mKgfDUVzoTdtgIci4iELkLRR\nCaIEO7uVhif880slmpoaIG9P9ZOkOJRA1QDvXS/FsaUua45OiW/e+A7RwS6szDNlMNlbqVaeyt8S\n2MghLXljShd79BqY5rzJa8jpta8k//TVANl5t3aXS6USxwCIhoZqdi8kmeOWDno9Zx45HjGTajP0\nZf2ySRKKQnq/AAAbb0lEQVQbCJxfHo+i29CbN7BrTIjx8Xtijl6OdblV9CBXFzkAckKbodN7Egif\nlNxzy6I/suaaYEXJgUJE3u9jTBtdi29/4ihhhXLXuKym9qULp+Nri2dhTMv+eyC55T0cefPt6YSO\nb19+JCrfpqv6iDHUHdZWzVyKIdZFzNQxqaM68PI5dnoLbrr8SBh6iOUdco79AV3VcXTLkaFjNAzV\n713NSJeTN19Y8aoCUzUQtyTVOmF5S33R6ZkCf6UMyfIOJW/frZowysm7UnqRlxJj6THyNXLrkkus\nAggo1vE6esfxfCEPRkS1FTGcMa8DC6Y14XPnTEW15Sccjm32a/m5cE9ausbSaxC1+P31cLvbxLMD\nhJO3vMAp2C7aY+MB0HI7opAyyxsAKqQmL9yKa2sIL1NEqbWmlv825PvUW/Sbhvglf/52N6RxkRyO\naU+34YNjTgsQZMqS480avEwaWsUuaPVbQBQvcDwAjKroQEqpFp2+xjbV4rR57SK0YW8eL/bV0n1C\nOnU48FBGQzUTKUr6IaW8QRu3jKqvQ8LSkc07yG0ZAQyy6xS96/05mDLC9x7yTPWkkfS9H9K+y7LP\n0jE4BkawBQ4kt7mha9QL4egY1ZzGly+cXrbodPUcTM1EOvHOQmtvFxF5RxDglrfspo5bOsa1VQ13\nyD5B3UPMW1jm8IZ/2YXg46dNxFXnHYGFU6hs4R7lW4eBoRnCZbuv53gnMHVNvMy5vCTPOOfZ2jUx\npg6lKIhJ8WruNi9VsCq1vNvSkmBMyAtVJtQwy1te1ISRe3wYy1tkrEtWolySyMvRbMcT18jjoYqi\nIBEz8IkzJqO5NonqmL+gbK3zn9EjxtJrr5LIPVGywCgNxUwf689XmEuYuJqwVfNFF0nTAnE1EVMP\nI++URN7cyjv32NE4//gxZfsSV99ziZFENos6ThL/z/UQZOudZCqRe/HkQGy/QiJfRVGwqOMEJGzf\nQ5OWyRsKiutoAqLesLnseAC4es7nME+7UCwARjdX4YLjx4pyPrenFflXj5PGT3uNDwfujeGWt73N\nn6ch0Bh52kxhVEsFXI+gp9dD88CxwXOwRe8lp4zHF844DpW5CQAANUkt85SRRJznHYQtJFzd98rx\nksGqbrg1a2jioWvgklMmYOro2jLvQ09uV6gH5kAhIu8IAsfPpAlJs8aHtxvdXxhTORJAmHVIMb1u\nCh1P24K9Om/M1DFjXB3SZhKWZgaSrvYWPEZ6INzme4IpWd5claqmgvc7py8MTmxFtwhLiqNazHug\nqzqumvFptPfRspVSWhhd6ddUf2NxeQWCjHgIecsItbyHiXnLXp20Qq1dXv8L+Ja37bpoSNDnkBBg\nzsRgDgYAVPMFDILiOVzJri3V4o+x1PKWyPu4GS1oqfWvIWUl8K3510IfkhY4ro6PnEItyUVzR9De\n6LYpuqrFJaW9tnrqrm9IBpvoANSDcvq8Dnz9yC/jxBHHBM4vo3YoqONAXE2QCQBMrh+Lr5TkfJSF\nPzxdlNQBCP09aNLzLSc+AsBXPngs4GqC+NIhxxMoYlw8h+Wy0yYKWWRSjAnLXHZpA0DlllNwztgz\n/HOxPIiKBPME9jYF8hIA6k2Z2O7f95baCmiuf2/5d+SLDlRFRYfH+ruzkreUZHmHClY5BkYIqWMF\nbj99RovJbfQjVy+rfvHnItwDc6AQ1XlHEDh9XgcWTmt+227qfcWF4z+ICdVjMatxeuj2SbXj8b2F\n3wyWK+0FNFXDNXM+H3AN7y0Sehx9hf6DYnkbuubXm7JabRHzZqRTKxGX/DIxpSz6CTVjYbkZAD1l\n7D0i5WeOj24O96x8ceZnsHFwS5m7tBRhZYmyNR6WkAYA7eZErCg8g+aUb/0J8nY8zG2cgXV9GxDL\njMTZx00qO77GCo77psvmYiBbFHM1qmoEsCV8DHweZ42vx8dOnRgcu6WjLl6LOSNH47lupn3u6Zg/\npQknzaZCL6+s7mJSpdQzInsqvrZ4Nrr6cuhVN4nvT5oWvvrJo0TYoCXVhIUtR+GxzU/R87s6IHHC\nCGUatrxYg/icf9APSohGVYKJi6RE2lh87vj3Jox8PdZ61elpQoKFX266bC5Wb+7D5JE1UFdUwovv\nYseXPwfHTm/GP5dyOWBK3o01CXz90jlYuWEX/vPe10DsGHXtl1xDtVGHk9tnYkzlKNz94t+xsZMu\ndqaMqsH0TbUY0ZjCX5esByGK8C6kzBTGj/AXXifPbsPmt6rQyYVYuDgMlzw2LZCiJRZZKTOJmDK8\n5U1cHUdPbcIf/klbpBbfnIPYnL/BNQbE9pgxvOt/uGf9QCCyvCMIKIpywIkboC7Go5pn79aqTUuZ\nrfuCpmQjUua+kT9A3c5pI1VWc/5uwNJV1roRqGSJYaUxb9EUAcGyt9KSLz6DpIS9Dc3AhOqxGF9V\n7sLlGFc9Bie3Hzfsdo6w8IdcBx6WkAYAU5Nz8NkjPo6zR/v19kdNpkQ+sb0auqrjkknn47w5c0Q+\nggzZbQ4A7Y1pTB3lx65HVUqysiXPkio66ZW7qrkVP76+zf/Q1f0sZQCmqQUy5mXhoLilo70xHcgb\nOG56G1rqgs9jXPYGuDpOnOV/XzpBVdZ4YlmYlRhIOvR8adLW+iTOOYY1B5LqpsPCHzWDM+H2NqCt\nOE/McXtjGiczmVut6Lv+wyz3uso4KhL02r2S52DyyBpceMJYv1lKyTV091PCHVXZjqnG8aLne1NN\nAl+4YDqV2iWqyI8wVB2WZiIVN/CJMybhyxdOR3tjGpMa/C6AHz91Mlrrk2KR1TdUCCRHUstbCx0P\nAMAxkIobuP6js3Hy7DYACohtwVN5A52g5X3JxPOFpxAID58cKESWd4QIIfjY5ItQcIvvaAGxr9B1\nFW5nG2yjgCvPOx8AUJP21a+AYDKWXH5klpK3SJ0t/56rZn66/MO9wHnjzsIDax7CxJpxZdsaE37o\nZTjytkwd0+qCNevnHDMacyc2BKoBhgOPaZdm1nPwpD7e31lGW30SmzuHUFc1/MtWvi7iagGXv6Vr\nAZd02Eu7OdmI5mQjtmd2oqmy3LshW84nzhyB8SP8fToa0wAUmEocBZIVVutpR7ULyeOEEYcChS7M\nJCI6YWYrTpzVhhNnt+Hz/1WEmuqDSozQZ/mCo+bi0aWNWPyhCaFzoBerwIVd08N4ssZVjcZLna/5\n1QESqtIWyFZekkUt81HNVP50guT+lhdn3PuSZImH3kAN1FgWtudn4C+QmvzMbZyJf215BgBNPj12\nuh8uaapJYHl3HGqKJr8ljSRivN+BY+L85o/h3rcegJryLWuAVtmMba3EkZMace+W5diapS4U4hgB\nsaKjW47ElNqJeK2b9q1/N2PeEXlHiBACUzOHVak70DCYhKuzdRzqE9R6a29M4ZxjRokOaTweXB+v\nDVjbZoj4DDCsmN07wokjjgnGbSXIRCFaNZYgbKyqqgTaq+4Opmbg5gXfGHZxAAC5l06iL9sPBD//\n6AcmoK0hxayr0rHTfyulxjtfuXB2YB/TUAPkHQ/pLqcoCq6Z83m81rUC0+unlm3XVA03zbsWf1n3\nMOY1B88/aSQlNiPXgEJsA5QEJRdNU8X9VhUVcT2GrJMrkTv1O7DpJIbCiqNRlQ4nlTEtlbjinOHl\nluOZDgwZW2FU9pZpP3AsnnQBptVNxuyQMFgypkvtR6llfvyMFpxz7KhABYvrlmfX88WS21/ni+WE\nYGTFCDQmGtCSKs+h+dAxo5B5eRJeGKKqZykjAVPKjxhd3Q4vlxbkXZr1P7atEvW91YK8DcUs03pI\nmynoqg7HcyLLO0KE9zOSMQOXfmBCwPpUFAVnLRgl/q6NV+PauVehxqoWtdMAy1QPwbvUKyGARR0n\n4G8bH0d7upwggXIvwb6gcpjOdhy3f+kUhDlPYqaO044qb4RSio9Nvggv7HwFExpaA5/HTC0QTx7u\npW1qJuY2zRz2/PWJWnxy6uKyz6tSFlrrkujcUgN97Aa/R3XJjeTiIh111Th/8Ww8+vwmHD2Fkpii\nKHBcD4CCUU27n6fhoCkGiqtnY+KY6mFzH3Z3jRPaq1D7Vgp96BYKZJapBcIbANA7SGPSFSG9Cnij\nkXFVo8u2AfQ6bzjqK6GeBUPXcMmcU/DCE4/T79aswH6WqQXaKB83pTyMJHdPTJrhXRJrrCp05roD\nuQ8HGhF5R4hwCIJn/u8OYaSol3Tt8t9T7z57nz36VBzVNCvUnQr4mfEHEqX913eHay6eiQeeWheY\n+yObZuHIpvIOfg3VCZw4bQyeHqB61/EDYHGdd/wY/PTPWRQ3TII3SC3x0kXYpJrxWLVrNU4ffRLG\n1ldibFu4FT1hxL6Ve/L5U/YxPUpTVZw//Rj8YvkGuF10XsMWmJUpujiZPKqmbBscE1+c/CWMqKsu\n38awu/CWoer4ztHXI+fky/bTVQWktxVesg+jnWNw6YfKEyPlBWKlVU7eAK3+6Mx1v6sJaxF5R4jw\nHkLpy4n/fRAMbyiKMixxA8O7+A8WJnZU42sds/e8I8MHpx8JrO2GSzyMZuWP+xMzxtZhbGsVVm30\nPQSliYeXT/kICm4xkMAYhpHDtOfdEy47bRLufvRNXHRSeV7D28XMhmn45lFfxdeefx1AeF+Bs44e\nicqkhWOOaA58fvnpk/Dy6i6MaWh6R9LE1bEqhFG/rqtQMrUoLF+A6qnhYYFKSU2wKhHufeClm2Hh\nkwOFiLwjRHg/4GCw9x5Qmhl/uCFhxHHxxPMO6Hc01iSwamOv+LvU8k4YiVBteY5vXDoHb27uxbi2\nfVNIbKlL4tpLyj0Pe4vGZD14NnxYuMTQNZEhLmPhEc1YWELo+xO6poqSjPgwuvNT6yahVZuADVuK\n+MAJ4eWtnLwjt3mECBH2Cj/43AK4XnnSz26SzQ869kfM+72OxupgedecCeHW4XAY3VKB0S37ZnUf\nKBxKizZFAbhBP5znPWkkcN2xl9Me4lY4ZY6pot6RltSBW2iUIiLvCBHeA6geJpt4yqgavPRmF2aN\nrwvdfjBxqLnND0U0VvtW9e1XH79XMfxDFeYwCmXvJj555iS8sbGPlX3tObSkKsqwxA0A46vH4gfH\n/ntkeUeIEGH/4NjpLRjVVPG26qbfbZjvASI60JgyqhqTOqpx9NSm9wRxA76lezBx9NRmHD2VWsn7\nK6fz3SRuICLvCBHe01AVBR1N+67xfiDwqbMmY/32gVDVtAhBGLqGr148fKnZ4YTT53Xg2eXbUfUu\nqDjuDfzQ0qEYXBoeB3Qpd/PNN+PDH/4wLrroIrz++uuBbc8++yzOP/98fPjDH8Z///d/H8hhRIgQ\n4RDC/ClN+MjJ4/e8Y4T3FM4/fgx+eOXCQBOZQwEXn0wz6WVltsMBB8zyfv7557Fx40bcc889WLt2\nLa6//nrcc889Yvt3vvMd3HnnnWhsbMTixYvxgQ98AGPHjj1Qw4kQIUKECBHKILvQDyccsCXQkiVL\ncPLJJwMAxowZg/7+fgwNDQEANm/ejMrKSjQ3N0NVVRx33HFYsmTJgRpKhAgRIkSI8J7CAbO8u7u7\nMWWK322lpqYGXV1dSKVS6OrqQk1NTWDb5s2bd3u+6uoE9P0cI6uvP7RigYcronl854jm8J0jmsP9\ng2ge3znejTl81xLWSjV59xa9vdn9NBKK+vo0uroG9+s534+I5vGdI5rDd45oDvcPonl859jfczjc\nQuCAuc0bGhrQ3d0t/u7s7ER9fX3otp07d6KhYe/EByJEiBAhQoT3Kw4YeS9YsACPPvooAGDFihVo\naGhAKkVrTdva2jA0NIQtW7bAcRw8/vjjWLBgwYEaSoQIESJEiPCewgFzm8+aNQtTpkzBRRddBEVR\ncOONN+L+++9HOp3GKaecgptuuglf+cpXAACnn346Ro0atYczRogQIUKECBEAQCHvNBj9LmF/x2Gi\n2M7+QTSP7xzRHL5zRHO4fxDN4zvHYR/zjhAhQoQIESIcGETkHSFChAgRIhxmiMg7QoQIESJEOMwQ\nkXeECBEiRIhwmCEi7wgRIkSIEOEww2GTbR4hQoQIESJEoIgs7wgRIkSIEOEwQ0TeESJEiBAhwmGG\niLwjRIgQIUKEwwwReUeIECFChAiHGSLyjhAhQoQIEQ4zROQdIUKECBEiHGY4YF3FDmXcfPPNeO21\n16AoCq6//nocccQRB3tIhzRWr16NK664Ah//+MexePFibN++Hddccw1c10V9fT3+4z/+A6Zp4sEH\nH8RvfvMbqKqKCy+8EBdccMHBHvohg1tuuQUvvfQSHMfBZz7zGUybNi2aw71ALpfDddddh56eHhQK\nBVxxxRWYOHFiNIf7iHw+jzPPPBNXXHEF5s+fH83jXmDp0qX4whe+gHHjxgEAxo8fj09+8pPv/hyS\n9xmWLl1KPv3pTxNCCFmzZg258MILD/KIDm1kMhmyePFi8o1vfIPcfffdhBBCrrvuOvJ///d/hBBC\nfvCDH5Df/va3JJPJkEWLFpGBgQGSy+XIGWecQXp7ew/m0A8ZLFmyhHzyk58khBCya9cuctxxx0Vz\nuJd46KGHyB133EEIIWTLli1k0aJF0Ry+A/zwhz8k5557LvnTn/4UzeNe4rnnniOf//znA58djDl8\n37nNlyxZgpNPPhkAMGbMGPT392NoaOggj+rQhWma+PnPf46Ghgbx2dKlS3HSSScBAE444QQsWbIE\nr732GqZNm4Z0Oo1YLIZZs2bh5ZdfPljDPqQwd+5c/PjHPwYAVFRUIJfLRXO4lzj99NPxqU99CgCw\nfft2NDY2RnO4j1i7di3WrFmD448/HkD0e94fOBhz+L4j7+7ublRXV4u/a2pq0NXVdRBHdGhD13XE\nYrHAZ7lcDqZpAgBqa2vR1dWF7u5u1NTUiH2iefWhaRoSiQQA4L777sOxxx4bzeE+4qKLLsLVV1+N\n66+/PprDfcT3v/99XHfddeLvaB73HmvWrMFnP/tZXHzxxXjmmWcOyhy+L2PeMkikDvuOMNz8RfNa\njn/84x+477778Mtf/hKLFi0Sn0dz+Pbxhz/8AatWrcJXv/rVwPxEc/j28Oc//xkzZszAiBEjQrdH\n87hnjBw5EldeeSVOO+00bN68GZdeeilc1xXb3605fN+Rd0NDA7q7u8XfnZ2dqK+vP4gjOvyQSCSQ\nz+cRi8Wwc+dONDQ0hM7rjBkzDuIoDy089dRT+NnPfoZf/OIXSKfT0RzuJZYvX47a2lo0Nzdj0qRJ\ncF0XyWQymsO9xBNPPIHNmzfjiSeewI4dO2CaZvQs7iUaGxtx+umnAwDa29tRV1eHZcuWvetz+L5z\nmy9YsACPPvooAGDFihVoaGhAKpU6yKM6vHD00UeLOfzb3/6GY445BtOnT8eyZcswMDCATCaDl19+\nGXPmzDnIIz00MDg4iFtuuQW33347qqqqAERzuLd48cUX8ctf/hIADX1ls9loDvcBP/rRj/CnP/0J\n9957Ly644AJcccUV0TzuJR588EHceeedAICuri709PTg3HPPfdfn8H3ZVezWW2/Fiy++CEVRcOON\nN2LixIkHe0iHLJYvX47vf//72Lp1K3RdR2NjI2699VZcd911KBQKaGlpwXe/+10YhoFHHnkEd955\nJxRFweLFi3H22Wcf7OEfErjnnntw2223YdSoUeKz733ve/jGN74RzeHbRD6fx9e//nVs374d+Xwe\nV155JaZOnYprr702msN9xG233YbW1lYsXLgwmse9wNDQEK6++moMDAzAtm1ceeWVmDRp0rs+h+9L\n8o4QIUKECBEOZ7zv3OYRIkSIECHC4Y6IvCNEiBAhQoTDDBF5R4gQIUKECIcZIvKOECFChAgRDjNE\n5B0hQoQIESIcZnjfibREiHC44ZZbbsGyZctQKBSwcuVKzJw5EwBw3nnn4UMf+tDbOscdd9yB8ePH\nCz3rMHz0ox/Fr3/9a2iatj+GHcDOnTuxbt06zJ8/f7+fO0KE9yOiUrEIEQ4TbNmyBR/5yEfw5JNP\nHuyh7DUefPBBrF27Fl/60pcO9lAiRHhPILK8I0Q4jHHbbbdhy5Yt2LZtG6699lrk83nceuutME0T\n+XweN954I6ZMmYLrrrsOs2fPxvz58/Fv//ZvWLhwIV5//XVkMhncfvvtaGxsxIQJE7BixQr89Kc/\nRV9fH3bs2IGNGzfiqKOOwg033IBCoYBrr70WW7duRVNTEzRNw4IFCwI9ijOZDL7yla9gYGAAjuPg\nhBNOwJlnnokf/ehHIISgqqoKl1xyCb797W9j48aNyGQyOPPMM3H55Zfj/vvvx9///ncoioKdO3di\n9OjRuPnmm2EYxkGc4QgRDk1EMe8IEQ5zbNmyBXfddRemTp2Kvr4+3HTTTbjrrrtw6aWX4vbbby/b\nf+3atTj33HPx29/+FpMmTcLDDz9cts/KlSvxk5/8BPfddx/uv/9+9Pf348EHH4TjOPjjH/+Ib37z\nm3jmmWfKjnv22WfhOA5+97vf4Q9/+AMSiQRaW1txzjnn4Oyzz8Zll12Gu+66Cw0NDbj77rvxxz/+\nEQ899BDeeOMNAMCyZctw66234r777sO2bdsOSy9DhAjvBiLLO0KEwxzTp0+HoigAgLq6Otxyyy0o\nFAoYHBxEZWVl2f7V1dUYN24cAKClpQV9fX1l+8yePRuapkHTNFRXV6O/vx+rVq3CkUceCQCor6/H\n7Nmzy46bNWsWfvKTn+ALX/gCjjvuOFxwwQVQ1aCNsHTpUuzYsQMvvPACAKBYLGLTpk3ieN4+debM\nmVi7dq3okxwhQgQfEXlHiHCYQ3YrX3PNNfjWt76F+fPn4/HHHxfNPGSUJqSFpb2E7eN5XoCIS0kZ\noL2M//KXv+CVV17BP//5T5x33nl44IEHAvuYponPfe5zOPXUUwOf33///fA8b7fjihAhAkXkNo8Q\n4T2E7u5ujBs3Dq7r4pFHHkGxWNxv5x49ejReeeUVAEBPTw9eeun/t3eHOAoDYRTHHyGYJlwAMAjg\nAFROSC0STCWCIJCYBhwOwxEqegIkuqLBbRN0LQaBxkBZsdkaDJutmeb/05PJ517eZCbz9bYmSRLF\ncazhcKggCOQ4jm63m2q1mh6Ph6SfVv97VJ/nuXa7XdH+z+ez7ve7Xq+X0jTVYDAobX6gSmjeQIUs\nFgvNZjO1Wi3N53MFQaAoikrZezqdKo5j+b6vTqcj13XfGnq329V6vVYYhqrX6zLGqN1uy3VdrVYr\nNRoNLZdLZVkm3/f1fD7leV7xVWq/39dms9HlclGv15MxppTZgarhqRiAj1yvV6VpqvF4rDzPNZlM\ntN1ui3fn/3U4HHQ6nbTf70vZD6gymjeAjzSbTR2Px+J/4tFoVFpwA/gbmjcAAJbhwhoAAJYhvAEA\nsAzhDQCAZQhvAAAsQ3gDAGAZwhsAAMt8AxJ5C+54P8QOAAAAAElFTkSuQmCC\n",
+            "text/plain": [
+              "<matplotlib.figure.Figure at 0x7f72fab5e290>"
+            ]
+          },
+          "metadata": {
+            "tags": []
+          }
+        },
+        {
+          "output_type": "display_data",
+          "data": {
+            "image/png": "iVBORw0KGgoAAAANSUhEUgAAAe8AAAFnCAYAAACPasF4AAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4yLCBo\ndHRwOi8vbWF0cGxvdGxpYi5vcmcvNQv5yAAAIABJREFUeJzsvXe8XVWZ///e5dTba3pCQiAJCSWE\nIJGmoSSgjsg4gmCb4Tf+dCwURUdEQXGs41gYFQvDiIyIiKIIJIAgEBJCgJBKertpt59z76m7fv9Y\nu55zboiQBCL783rllXt2WXvttfden6et55Fs27aJECFChAgRIhw1kF/vDkSIECFChAgR/jZE5B0h\nQoQIESIcZYjIO0KECBEiRDjKEJF3hAgRIkSIcJQhIu8IESJEiBDhKENE3hEiRIgQIcJRhoi8I7yp\nMW3aND796U9Xbf/iF7/ItGnTQsfdcMMNoWOWL1/OBz/4QQB2797NCSec4O3btWsXH/vYx1iwYAEL\nFizgkksu4bHHHgPgpptuYuHChSxcuJCZM2fy9re/3fudy+VC19A0jfvvv/9vvq/Vq1dz1VVXHdSx\nDzzwAF/72tde9bVcvNbz3wi46667+P73v/96dyNChFeE+np3IEKE1xsbN24kl8tRX18PCBJas2ZN\n1XErVqxg/fr1IZIeCZ/97Gd597vfzW233QbAqlWr+PCHP8zDDz/MV77yFe+4+fPn8+1vf5vTTjut\nZjvr16/n/vvv55JLLvmb7umkk07i9ttvP6hjly5dyvnnn/+qr+XitZ7/RsAHPvCB17sLESIcFCLN\nO8KbHm95y1t49NFHvd9LlizhxBNPrDruuuuu4+tf//pBtblp0yZOPvlk7/fJJ5/M4sWLGT169EH3\nq6+vj09+8pO89NJLXHHFFYCwAPz0pz9lwYIFmKbJypUrufTSS1m4cCEXX3wxS5cuBYRV4IILLgDg\n1ltv5atf/Sqf+MQnOO+883jve99LT0+Pd53ly5czffr0qmu98MIL/OM//iMXXHAB73vf++jq6gKg\nu7ubD3/4w1x88cWcf/75fO9736vZ18p7ueqqq1i4cCHz58/njjvu8PatXbuWSy+9lAULFvCBD3zA\nu85I26dNm8b+/fu9893fy5cv5/LLL+fqq6/mM5/5DAD33nsvF110ERdeeCFXXnkle/bsAcC2bb7x\njW8wf/58FixYwC9+8QtvrL74xS8CsH///pD15MknnwTAMAy++MUvsmDBAi644AI++clPVllMIkQ4\n3IjIO8KbHhdddBF//vOfvd8PPvggCxcurHmcbdssWrToFds855xz+PSnP82dd97J1q1bARg1ahSS\nJB10v9rb27nuuus45ZRT+PWvf+1tt22bxYsXoygKX/7yl7nqqqtYtGgRH/3oR7nppptqtrVo0SJu\nuOEGHnvsMdra2rjvvvsA2Lp1Kx0dHYwbNy50rVwux8c//nGuu+46Hn30UT70oQ9x9dVXA/C///u/\nzJ07l4ceeogHHniArq4uLMuq2VcXP/nJTxg/fjyLFi3il7/8Jd/97nfZt28fIISiq6++msWLF3P+\n+edzyy23HHD7gbB+/Xouv/xyvvvd79Lf389Xv/pV7rjjDh555BEmTpzIj3/8YwD+9Kc/sXr1ahYv\nXsx9993HXXfdxerVq0Ntff7zn2f69OksXryYn/3sZ3zuc59jcHCQJUuWsHv3bhYtWsQjjzzC1KlT\nWbly5Sv2LUKEQ4mIvCO86XH66aezefNm+vv7KRaLrFy5knnz5tU89oYbbuA///M/KZfLB2zzO9/5\nDldeeSUPPPAA73znO5k/fz533333Ienv2972Nu/v+++/n4suugiAOXPmeNppJU477TTGjRuHJEnM\nmDHDI85ly5bVvNcXXniBUaNGceaZZwLwzne+k127drF3717a2tpYsmQJzz//PPF4nP/6r/+is7Pz\ngH2+8cYb+dKXvgTAhAkT6OjoYPfu3Wzfvp3BwUHOPfdcQJitb7311hG3vxKSyaR3P21tbbzwwgue\nteO0007zxuepp55iwYIFxGIx6uvreeihh0LWlkKhwPLly/nIRz4CwKRJk5gzZw5PPvkkra2tbN26\nlUcffZRiscg111zD2Wef/Yp9ixDhUCLyeUd400NRFC688EIefvhhWltbOeuss1DV2p/GzJkzmTt3\nLnfccQezZ88esc1EIsFVV13FVVddxdDQEIsWLeLrX/8648ePf80TfXNzs/f3Aw88wJ133kk+n8ey\nLEYqVdDQ0OD9rSgKpmkC8Mwzz3gEFcTQ0BBdXV0hC0Q8HmdgYICPfOQjWJbFV77yFXp6erjyyiv5\n1Kc+dcA+r1mzxtO2ZVmmt7cXy7IYHBwM9U1VVVRVHXH7K6Gpqcn72zRNfvjDH/L4449jmib5fJ7J\nkycDMDg4SGNjo3dsOp0OtTM8PIxt21x++eXetkKhwBlnnMFJJ53EjTfeyK9+9Ss+//nPM3/+fG66\n6aZQexEiHG5E5B0hAnDxxRfzve99j5aWlpo+2yCuvfZaLr30UsaPH19z/8DAAC+//LKntTY2NvK+\n972Pp59+mk2bNh0yLa27u5sbb7yRe++9lxkzZrBjxw4WLFhw0OcbhsGaNWtqCiGdnZ1MmTKF3//+\n9zXP/ehHP8pHP/pRtm/fzr/+678yZ86cA17r+uuv58Mf/jDvf//7kSTJG4OWlhYymQyWZSHLMrqu\n093dPeL28ePHI8uyJ3xks9kRr/nQQw/x+OOPc9ddd9Ha2spvf/tbHnjgAe+6g4OD3rF9fX0kk0nv\nd1tbG4qicN9991FXV1fVtrs6IJPJcMMNN3D77bdz7bXXHnAMIkQ4lIjM5hEiALNnz6anp4fNmzdz\n+umnH/DYzs5OrrzyyhHNuKVSiU9/+tM8/fTT3radO3eyatWqEaPKR4KqquRyuZoa9cDAAOl0milT\npmAYBvfccw8A+Xz+oNpevXo106ZNIx6PV13r5JNPpre3l1WrVgHQ1dXF9ddfj23bfPnLX+aZZ54B\nYOLEibS3tyNJ0gH72t/fz6xZs5AkiT/84Q8Ui0UKhQLHHHMMo0eP5pFHHgHgd7/7HV/+8pdH3A7Q\n0dHBhg0bALjvvvuQ5drTWH9/P+PGjaO1tZXBwUEefvhhb2zmz5/Pgw8+iKZpFAoFrrjiCjZt2hQa\n93PPPZff/OY3ABSLRb7whS+wb98+7rvvPn70ox8BwgoyZcqUgxrvCBEOJSLyjhABkCSJCy64gLe+\n9a0jkkEQ//Iv/4Ku6zX3jR07lp/85CdeVPiFF17Itddeyxe+8IVQBPrBYM6cOfT09HD22Wd72qaL\n6dOnc84557BgwQIuu+wy5s+fzymnnOKtPX8lLF26NOTvDl4rFovxwx/+kFtuuYWLLrqIT3ziEyxc\nuBBJkrj88sv53ve+50W4z549m3nz5h2wr1dffTWf+MQneNe73kWhUOCyyy7jS1/6El1dXfzgBz/g\ntttu48ILL+TPf/4zN998M5Ik1dwOwvJx88038+53v5tUKuUt8avEO9/5TjKZDBdccAGf+cxnuOaa\na9i/fz/f/OY3ufjiiznrrLO48MILec973sN73/teTj311ND5N998MytWrGDhwoW85z3vYcKECYwZ\nM4bzzjuPdevWceGFF3LRRRexZcsW/vmf//mgxjxChEMFKarnHSFChAgRIhxdiDTvCBEiRIgQ4ShD\nRN4RIkSIECHCUYaIvCNEiBAhQoSjDBF5R4gQIUKECEcZIvKOECFChAgRjjIcNUlaenuHD2l7LS1p\nBgcLh7TNNyOicXztiMbwtSMaw0ODaBxfOw71GHZ0NNTc/qbVvFVVeb278HeBaBxfO6IxfO2IxvDQ\nIBrH144jNYZvWvKOECFChAgRjlZE5B0hQoQIESIcZYjIO0KECBEiRDjKEJF3hAgRIkSIcJQhIu8I\nESJEiBDhKENE3hEiRIgQIcJRhoi8I0SIECFChKMMEXlHiBAhQoQIRxkOK3lv2rSJ888/n7vuuqtq\n39KlS3nve9/LZZddxo9+9KPD2Y0IESJEiBDh7wqHjbwLhQK33HIL8+bNq7n/a1/7Grfeeit33303\nzzzzDFu2bDlcXYkQIUKECBH+rnDYyDsej/Pzn/+czs7Oqn1dXV00NTUxZswYZFnm3HPPZdmyZYer\nKxEivGmhGxZL1+6jWDZe76542NuXZ822/te7G0cNXtjYy879wyxduw/Lsl/v7rxq9GWKrN8x8Hp3\nA4D9AwVWbekDoKyZPPdyN7Y98tjmSzovbOw54DFHGoetMImqqqhq7eZ7e3tpbW31fre2ttLV1XXA\n9lpa0oc8Z+xICd8j/G2IxvG143CN4d2PbOTXizdw3twc11x+6mG5xt+Kf/nm4wDc/+13oSiHTn/4\ne3wP9/Tm+NEf1ni/48k4F8075rBe83CNo/vcf3XzQpobEoflGn9rX+79+jv4+d0vsmzNPmRV4aK3\nTq55/I9/8SzPv9zNdVecytvnTHjF9o/Eu3jUVBU71JVuOjoaDnmlsjcjonF87TicY7hhu9BwN+wY\neMM9p737syTjh2YK+nt9D7dWaKobt/dz2tS2w3a9IzGOXXsz6K3pw3qNg0V3zzArN/YAsGnnAKcd\n117zuA3Oc3hh/X5mTWw+YJuHegzfUFXFOjs76evr8353d3fXNK9HiBDhtcE180lIr3NPqqEZ1uvd\nhTc8SroZ+m2aR/+YvZFcOJZtY5jiG1EPYAVqrheWgsHh8hHp18HgdSHv8ePHk8vl2L17N4Zh8MQT\nT3DmmWe+Hl2JEOHvGq6LTnrjcTdGRN6viHIFeRtHsc/bRb6kv95d8GBaticQqcrIH0mLY+bP5N44\n5H3YzOZr167lW9/6Fnv27EFVVRYvXsz8+fMZP348F1xwATfffDOf+cxnALj44ouZPLm2ryFChAiv\nHW9E8tYj8n5FaHp4jEzz6CfvQun11byDQWeWZeP+UuSRddn6VAyAzAE072x5iKZE4yHp48HgsJH3\nrFmz+NWvfjXi/rlz53LPPfccrstHiPCGwf6BAo3pOOmk+Nx6MkXSCdWbEGqhe6BAQzpGOukf0z1Y\noLk+QSJWHbiZzZUxLZvWxmRou+Wazd+A7H0kzOYDQyUUWaKp/rUHSFm2TVd3jgmj6pEliZ7BAk11\nCRLx8PMoayZ9QyXGtde9pusVSjq7e3OhbYPDJbJ5jaa6uLetN1MkGVdoSMcrm6BYNtiyJ8u49rqq\ndwOEANWXLTKmrbqvA0Ml4jGF/mzJu+dK2LZNV0+Ose11ntnZtm329OUZ116H5IxTXeBdz5cM9vbl\n6WxJeedYts2W3VniMZljRjfSkynSmI6FYiJ2dQ8zpq2OmFqbZGudUwslzbdmmJZV9bdl2WzenSGV\nUEknVOJxxfuOhgo6fZkidakYqYR/naV7V/B/G+7lIye8n4s7zjng9Q8VjpqAtQgRjkaUNZMbfvYs\njXVxvv+pswD499uWIQG3//v8mufohsXNd6xg9nHtfPQfZgLQny1x48+X8455k7jk7ClV51z7388A\n8D8VbbpKhvw6cLdpmWzL7mBq85SawsOR0Lw/++OlQPW4vBosfm4X9z6xlcvPO45Tj2vn33/6LBOm\nZ7A7N3LdnH+jOdEEwDf/70V2dg/z7Y/No7059aqvd/MdK+jLlkLbNuzKcO2tS0L3c8Mf7sYup/nF\nxy6vauOexzezZNdKmuoVvnvlZVX7n1i5h3v+spmvXHU64zvqve2mZXljB3DlBcdz3pzxVeev2TbA\n9+9dxZknjuaqd5wAwOMv7uH/Ht3E+88/jtOmdfLvP32W9iZfcFi5uZdfLd7I208dxwcvnAbAqi19\n3HqfiKq//v2z+c7dK5k+sZnPXSFWSGzcNci3fr2SudM7+fgls6r6kc2V+ffbljF1XBM3fHBOjdH0\nEdT8g0vvXFJ/YVMvP7l/rbddSuZomLUSuXkaVqaTz922jHHtddzy/73FO+avu5cAsKJ7JRefeGTI\nO0qPGiHCYYRmiAlhKK8BYDj+tQMZP4tlg7Juhvxre/pymJb9N/vcfBPhkWfvezf/ie+v/CkrulfW\n3K8bZs3tbyT8cevDfGHJLWimxsrNIsh21ZY+9vYXIFair/FZ+kuDdA3v8c7Z2S0ijQcOMrhJt2qb\nkSuJuxZ2De0hPmkDieNfrLm/qzdH4riXKI15ofY1MkVsoKsnrOFXmrbXjrAuf9veLADPrNnvbXtx\nUy8AK17uYSivIaWHyE+7H7mpx2lLRG4/8aI/Zv2Be3VzAGzYlfHb3LsRKV5gxYaemv3oHiwCsGVP\ntuZ+0zIxLfG+BX3uZoC8NSe+oC9bDJ0rN/Wjy3kxxoo4d09fPnRM2RDPOqkcuSVwEXlHiHAYURlf\ndDDapghSssnGtzGsiUm1NyMmt6DPc1XvWnoLB0524pL366F5P71HJF7al+/2tlkBf+PR4PN+ZOcT\nDGnD9BbD45zJlVEa/WVcRaOaaJUDBEC52DW8m2v+egNL9jxb+wDJIjZlNXJzd2iz+1yf6FpywPZ7\nC/6qHsuuHm83IK43EyasSvIeyVRdy6Li3rdhWpiWTWzsVtHGpA0j9jN4vf4KoaW/OMCSwu+Jz3hu\nxPMrz6nEf734E7723HeBcLR7Tisi1WWIH/8Cw8ZwVV8AJFXz/pbragsHZVMck1CqXReHCxF5R4hw\nGFG5tEc/iKU+Zd1Ead9LpvU5/mfdrwF/cnU1hf35Hn625k7+47n/8jQGoCoDl6d315hkNw5sYU3f\n+oO+l1eLxri/TjVI2G/0pWLuhAygW+EI6d5MESnuE0ZBD5MfgHIQEtOjO/8KwIPbH625X2ndj9q+\nl8TxYeuFu7xpW3YnALZR7QEtaQZFxSfvWn0cibzzAQKT6rIMpNbXJH+5xj2qTuCXYdqifcVpq0Yf\na12vp6Ivq3qFCVtOjEzQwf6XtDD5dg3vYcfQLnoKfWim7l9L0fn+y98mOfNZlOZetiUfreoLgBQr\nB/7WqIWyeeSj0CPyjvC64I2UZvBwwqwgU10/OPKWG4RWtze3D/AnJ3epUG9RTMq6pYcmm0rh4EBW\n8x++9DNuW/2/r9ifkfBiz2q+8dz3KRrVpBCc6IPEFyTv16p5W7bFrSt/7hFg9X4bpW0vcsv+mvtf\nCbuGdnt/l4zw5NybKSLFAuRtVCeRMi2bIW2Y32/+MzktX7U/eI2JDeOq+g4gN4nnbNvhB1jWTWzb\nJlN2TMuKUUWufZkScr2vKeZr9NGNZu/LhImx4JiWpbosyZnL2Bd/gd3De6vOryWfuEuuTMuirJtI\nqmjLNqsDNF3BsxAwZe9zTNKphAgEfMkhb9saWRjakFvtPefKe1m+z3cZ5PScZzavJGJNyZLT86G+\nACEhDTV8zu83/5lfrL0LzXnHa1lgDhci8n6TwPUvWrZdMYGaNY97pW2vBat61/LJJz7PtuyOmvst\n2+Kl3rUU9dIBr23VEAAOVV9/uf433LT0m6/6fLcfQfI2TCtEriMJMJpmIsUFIbYmW4Cg2VycP1jy\n/YHByaaSED2zeeC3bdvkdJ9MKv3ouiGIwbKtmtqWi9vX3sXu3F5W9673zgHxXILm/JAGG+hfppzh\nzvX30F3oxbQsBofL2LZd9QxFX6rHqrfYz4bBzdy/9aGq4yzbxjBMYsesIzaxtrm2ss18SQ+ZTHcN\n++RdOSn3ZkpIcX/cCjUEGNO0+c7z/81fup7imb3Lvevphskftz7MZ578Mn0lIaTJkh+xXiwb7O4f\nRG7sQ2l0yLscDnzTdJMhLYdhi/5KEhR1v49lzWT7/iGkhE/Y+RoCxLDdR+yYtfQMD4W254o6iZlL\nSc70a04MlAZDxzyzdzm7zZeRm3tInb6IPbl92LbtRZAXk7tZMfCMr3lXQB2zleuXfImCXgwJoK5F\nJp2IYds2e/Ou8CXh2pJ0w2JgqESuqDNYyrIz/gyJ414C/JgDwzK4e+PveaFntX9fWt5/xnJ1v7Kl\noWqzeby25m1ZNn/peoqVgfaPJHlH0eZvAnQPFvjCT5/l4jMm8fLOQbbvG+J//n0+Dy7bwX1PbuPm\nf57LxFEN/PWlPdy5aCPXv382MyYJ0vjNXzbzyIouvvWxeXS8hsjZIO7fIibbv3Y9w5SmY6r2L937\nHHdv/D2N+jF0r5zOj649J7QsA2BTV4Zv/t+LfPySWcydLrLzPfp8F3c/tpkbPjgHPdVNS6KZ0XWv\nLnPfc/tFAJBhGajy3/aZvLxzkO/cvZIPLZzGceP9VIr5khEycRumTUyt1iZKuomUEGSwfVeZNW39\nXhCNaxbvCfgyQ5p3gBwf2LaY4eQQ0IYkSZR1k49/90nOOGEU557lB9Zc96On+MAFM5h/6niG8hrX\n3LqEc04eiz3xRV7u38R/nHUjsQOMgaZb/P//+SRnnTiGf3nHDD73k6VkpN0kRCBxyKToE7PFnwZv\nB6A50cSG5Z1s2JUhEVcoayafuewUZk5u5cWe1WzdrPDw091V0dvd+XDw0n/d8xLb9g3xb+85ke/+\n5iWuuGgikmKCbGFaJorsE+SqLX384Herue59JzNrShvb9w1xyy+fB+CbH5tHZ3OKVV27vON/8dBq\nxqnT/WsnXkJp9f3QtUzSmfKgR3j3P72de34Dn79iNt/69UpSpz8ROvalbfvZ2TlMc32cz922DOnY\n5SSm+wKQVKHxLVu3n98/v5LkTH9btpynLp6mpBlc/+Ol5EsGiVk+mQxr1Zp3X/ol1NR+8oqBbvhR\n0kPlAnJdmNAHy2F/76833AdAfLLQqH+y5EFSPafQ0SKeUXncc7yUAzlZ+x5iEzZj2LBpcAu5cnXf\nDNPi9kWrKDrmckm2QBZC4pduX06PE6TWcEwXeJ+5ze0PvsykUQ3stzZXxRIM6znyJdFfqYZQ8Z3f\nPUeb7FtB5MZ+5PQw2BJIdugeilr1+bWEuMOFSPN+E2CjE7X50LM72b5PfJCWbXPfk9sAPzr0waXC\nf7Z07T7v3EdWiIIxm7p8Te+1wtXmRlp77EbuZhFmuv6hamn2ryvFMfc+4ZeSve+vIjBm+cbd/PdL\nv+CW5f/5mvsa1BoPFktWi34/tGxnyOddKOkhzbsye5aLkmYguf492eLhZ3d6y1hcTb7HMZvHlXhI\nU3DJ0bAMFu34CwPNKwDxvIediPdn13fTlfMjfVEML3p2l6O1PLVqN893v0TeKJAp1Q7ScbEvI/Yv\nWbMPy7YZGCqHTI1lI0jezrNP+pO1bulelHDZuc+la/ezYWAzt6+9i8cHBUm8vCus+e2vIO91OwYp\nlk1+/dgG1NHbWbRGmFslSZivg1i0XBDzn5buAMS6eq/d/gLL973Attxmb5tml0NLBOzOTeJ/UwgE\ntczmPaVe729TEtaRxc+NUIBJ0dm0O0NXTw7dsFCawgFykmqA5L87f1ixKqQVA2RLeedehCY7arRN\nLOW/vzm9uo+u8UFp2093xo84HypWa+lBzTtoNZJi4t4yWYOd3TnvGVYimbb454unM2/mKN4+2yfI\nn6/9FXvG/L7KvVHWTZZt3hHaJsU0hgs6PYNFL9+BlvYtJA2NYoy27xtClqvzIeS0PANDzvtYg7yF\nZu5bshLTxfcjmXHn+v67nCtWZ4orRWbzCIcSlZGiUrzAQCHrmbfcCdUNsKn000LtwJRXCzenkSzV\nfv08U63j56t15ZST8CS47MM1t9nqa5N+g6biVxOIogdyJQfHslAyQj5vbQTyHtZySJK7QNsMCS8e\neRd8Yqg1BtlymKzcyF8Xe3P+RCkpJprmulWcyzb4wlqmXE3ewTEKEpebgSpE3gEBSDMsUDXkQKT2\nYDHnBWC5SMQVT4iT04JUKpPT7A1EsQ+X/D70JlcTm7iR4lh/nfJAKSx8uglz+lIr+dnqX4aEqoHC\nEHe+fA92zH+PJMW3mtiYge2Oz7aW5q0NBI4TRDE40lI/xaA3U/RiG1yhIISA1hcb5QsBVkEEBA47\nhNubKSI39DM0cTGmFCDvcjUhm5Lfn64B35ozXK6+n6DmXWt5m6aJ96w0AnnbisbZJ43lX981k7NO\nGlO1X2kJC2NlKUd8imOSdueCWJk9TuKaGcc0ITf1hPz6l54nhILebBHd9L8LN2ZgWM95Yyyp1fcg\nqToZR8gNCktSsQXbkpDrhjyXVqYQHs/GeEOkeUc4tIhVJNxPnvIUNy3/ukfq7oTvEnStmsG1siu9\nWnjBOCO8fqZDDF6QTsW1C3qRVervUNp3UyxXTxSGUjs46GAR/AC1V6F5uzm7K8k7XzIOSvMeChCv\npBj0Z/0J1jQtbNv2JlLN1MgXq33Kg+UwWelGONZhMKhNK4bXF89H3uATT7YGebtL2AAKZoA43Ykx\noKGEzeYWyROXED/Gj3LPFKsrMCVisqfpuYFKlQUt9hd88t7W7U/87uQaxEDRHw/TMulveB65foBy\n0xZW9a1DM/yJfqhUYwJWDIYdTasQ8C3bloRqJ8g774zhPV+bQS2gPTvrg0dapy8pOn2ZkhfbYBuB\n4C6XuFS/j5Ll77cKIrmKaxbvzRRDhKZaooJXvkLz3jS4FSvhH7c34z/zXMDEbmbbkGyZTDDOooal\nwRVkhgvV38yY5Hh0S/e+p2DSFu/8eFhrjU3Y4AluFB33k6qxs1tsax6TJTFNuLdkW4yHkhTj25sp\nUTQDz0lLOPeV9yPTlWrNWVI133IQ0MylvTPBjCHFyyROehqAgYL/DZzVeiGtyRaKRumIBeNG5P1m\nQ0CajKsVmrdyZDXvkczmXiCRM2lVLrfaNbybItmQ9haEJudqbg9ix9CuUDRxELkAMb0as7k7gcdU\nudpsHiDQkTSUIT3Qf8UMBVaZlk1eL2AENJ+hgJbktl+pLZtWOFguUwoICLJBWQ8njwlOpBkt7PuE\nsHBQNH1hyU0sEgzyqQxYq4zyHa6hESZiCrudSHsMYbKsDCQKmnF39PmWCKxqrTWoea8f2EivuoHE\nCc+BLO47GImdr0HekmJ4VoVcYLy1DXNRiFN0iNHVzmPHrmJDYZV/vqPlDeVqvE+2BIpBT7ZAr5sg\nJBCZHdNF/Elw3EzE3+ZQC9aQKBE6rPmad5D8k5YgviB59xT6+MHKn3r3D9CdC0SmOwKKOdSKtuUU\nVCsVGsOagVmOcDGU10LzjLZsY2voAAAgAElEQVRtFi0J0Qc3ULJWamC5gryDEfbGUKM3Bjv2i/dR\nSfnHj5VEPIIuFVBkib5MMWzCdsZzqJwj4zyDWj5vggKSs9/oHYdeSHrjLzlj1p0V42V0T6TdmEZK\nTWLaZkjjP5yIyPvvEAOlQTQzaEoNkETg5Yx55C32u2ZzzSpXSemHMsmHa3IdSZu3bLe/Yn/l8ifX\nZOx+XJU+tqLtE9NI0dI/X/Mrfvly7dz6w4Go3FdjNvfIW5FCVox8yXCehY0yagd7ctVLbwAKhk/e\nUkVErGnZVcQ8FCC/2uQttO6gmX446ANWzCrNO6g51zKbBzX3vOFf39e8S9imgm0qlMxqn3cQtZaa\nxWOyPz6KDtgVS+L00Du6dzAgyFnV01qwv7XKoxYD1oOc5vfX1WpRDE+wcTXvuvxUrFwrshX3rDWu\nEKS2+W4J28bT8mwIERtA0m5Ckm36snl6B4uOUO2/N0nTIe9gwJfzHWtbT8Z2hJu8JvrQmy2FiClh\nC7N60KKU06sF3IGCP0Yll7wHRoMZQzHTDGnDXpayWm4C1zIwVNA9rdUcGIXZN576mMidviWzXRxb\n49sXAl8wsMA/xsoJ8pcSBU/zjiXEOOp7jmVS8ngAslqWtqYkvZli2IK2XUT2DRQDgmhgjLRts5x7\nEGOcSqj+flMV30dIKLTodSL0bSNGb6ZIWhWBevkaY3M4EEWbH4XY1T3ML/68nk9eeiKdLeGi9nm9\nwJeWfoPx9WP5wunX8H+PbOIvL/oaZnACiFVq3g5Db2m+l889bdO89VLv2B/9YS3nzxnPFRccf1B9\nfOCZ7by8c5Dr3z/b+1DveXwz2ZxGLqWBAiDxg3tXsdkpSPCpfzyJyWMasQhr3pWlI7tdf6/zcXUP\nFkKBa3nTn4SeXtPFI8v3ceOHTvMi1otGkUw5S4MzET350h4ef3EPLQ0Jpk9soXNyteb98PKdrHi5\nhxs/dNorWiFcYUOpMJs//uJu9vUXkFI54pM2cHfXBo4ZfQ0dHdNC5xfMvC9WKz7hqopMMbGXb6y4\nN3R8vkLzfu7lbh7ZvAncVNWySV+2xH/+5iVQyySmr6Bo+WSlxk1PAHIzuAXJ+4k1W2nMdHHh3An8\n/qmtDA6VGTfL13SDxPenZ3aI8+MlbC2JpOrsGxxC003iMSUsSOL4CZ3+j++oZ3dvDqV1L08Ul1G2\nXQ1JRBkPFzRu+eXzDAyVuPyiseLWJBnLtugeGgScSHTJH3NbjyPFNJ7esI1ZiX5mTWmrGVQUFCCW\nb9hLYgZItoK2+VSSJz8VIkPN0kgACScVpmzF0S2DsqlVuUJsSwJLDWt5FRpfPpNAaRVBcXv6ZEa1\npukPHJM0WxlmK/Gpqyi91Iytpfz2jJj4B+zoHeCWX66gN1MiMcrC/WpiOLWoy4M8u24/v35sM23j\nstDqdlJEUvfkMlzzvb/S2ZRkz2CWeDNgim9GNtPY2GTKQ7SlWnjmZT8S37+voNbqru0W53em2wG4\nc/09nNwxq3a8i2yKNtzgMMdaUVpzJraewDZU1NE76Fk3FmjwrmFmOmlKNEFJBLt2NI9l3fYBVm7d\nBzEorT4Lu1SHjEJ/UVhrmuvj5J0xHNf/DrYMlGHKWi/4rqkuTlmvuIf++fR0OMl0VIP+vAZpMUZ9\nmRId44UrIK8XSODniT9ciDTvoxD//fs17O7Nc/+S7VX7smUhDe52tJYgcYP/UUE1eXuk5Ex++/rD\n2vdjL4i2Hti6iJuWfjNkuq3EH57ezoZdmdBktvi5Lp5d3+1V7zEsg1Vb+ymUDTI5jXXbhfZkOaTq\nkXeFGd9dJuVOYNv2DbFuh29CzZm+VnnnY2vZ11/w2ga8NchlS5DDLxdtpKsnx+qt/fz2iS0hs7mr\nzdz7xFZ27B9mYFhM/I/tepIvLf1GTbO6YYj+xlQ51Hd3PIPEuLXGWveiJTRZ24gJE51kkUooJOMK\nQx3LveMkXZBVIUBGmmFy2x/XMRQ0dct+pLrascf3IzqIxy3vOXmacUwTpk5bwlSKvLBR+JT/vHQn\nz6zd7yWPUWWVklXh/5QNpJiOrSWxTRXd0ti0W5hcS3p4vNpTbRhogM2oVnE/8amrKdhhbV+KldnX\nX2D7viGyeY2N+4VmO8FJbtJfEM9/zrQOkulAycfhZmxLwlIK/OB3Ivgp6Au1ikIjdCO1g+M1Sj8Z\nu5wS5tsa5OvmsVY1oRWu69/gCUFuxrPyunnYhhr2V1eQt2vilhQD07LpaEqScuSQs8fOo8mY6B3b\n0C76Kak6tiWDrXiad1e+i+37hsgVdU8rbUk0M8Y6EXO4ha58F89u3k6uqLN70DeB1xtCECrbBbbu\nzrJsXbfXx6ljBOm675rrLtm63/fne/0P3CMO8br7zh53BhMbxmNjM1QerhKgZEMoIak5j3uBeZKq\nYVsSdrEejDj67uORZBu5LksqoVK2xHc0a2In582ayrFNk9kwuJlp08XzH8yL91wkh5EYk5jAgN6L\nlMgzujXtPceGRJpPvWc2tiV5yk1bU9IXnB3yTlvtTFBO8PqWLfrfaaGkM7vzJE5sn0FnXTtHAhF5\nH4UYcgJC6pPVfiP7gCUvCJnN3UxIru9VqTRlSbVNzot2Pk5faaAqorkWatbudYSDsiHuo9NZu+ul\nAK2INh9Z8xb34i6BclG2AxOxc7/BW3Ozk2mmVm1Wl02yAZPycCk8ybhc/IctDzJQGqyZdcowLZAN\nCsmuqr7Hj3+exPTnvd+1AuLKtiBDu5T2+pSIKSiyhGwEAn1KQrovB8hINyykZA65MbBGOKC9h9Jo\n6qItJRYgb9MCbKRYmeZEI7aeQIqXqtJn7sntJ6HEmdQwAc0uhd6Vjsni2tZwC5gKyCb5ongPKrN8\n1cXS4n1QjFCZSxcJSbwbUkwjmw8kRTHFxDyrbTqqpNDPDuIxiX+7ZBbHT/K1HqtUJywA8ZJnBXH9\ntbGhiRyfOA2A/ny1sCNZCiCBWaE5O/uTagJFlogPC3J9dt/zvrAqmyT0Nuxio/C31iB/M9sqtErX\nv+1sb29OolllpjQdw+XT30NCrqO8+RQAzjhFVC5D1T2N2y6lMTPtKI0DyM3i25Ad8rzm1I+RUJKY\nvULI2Ws6Firn2zH6RzPJnOe0GXgXnb5ceubxpBMqaOJdcZMDlZx3Ttt6EqUX345VrAuR97hRTh4B\nh/iS8RjHNYtqeHkjXxWVPaV9lD+8DQNCaFUMMGOeddF2+iCpGsm44rXxrxefQjKhcsGkc0UDDb1M\n7KzHkp3+OO/8xLiwcClt+xjVmvb6m5QTzD6uQ5i9nW1j2tJV1gNVkZkxfpTTB92L1bDNGLppM731\nOD520j8TV0Yu9XsoEZH3UQg3pWFDuka6wVcIsAp+YO5yD9eXqCgyECAb+cDZygyrOjBDMzVRLEEO\ntx3KmuWQd8kh77FO3WOXICqXihkBn7duGV6gkrf8JlS9yaZsB5f4iD4G/es9gexfwSUvUqJA6rRH\nWbTjL962XLmCvKtyh9fI8mZaxI9byZ66p9haeDm0T2nuC/2u5VPXKGDbYDlZtSTFEOStSEiaX3fZ\ndLRGzfKfeX+5j8TMZUiq4S83CvrNAwFKdllMikrM94drugmqjiTbNMQbsEsppHiJTD6Q7U6y6Cn2\nMrZuNC1JQSZBa4LWuB3bkjB6JmBbigjGGhSknXeimG1TYczwOYK8Ee9lMmVWvXMtsrOkKFYmGxDS\nipYg77H1Yzix/QSM2BAtHWUkSfKIxb1HW0tCrOwJGG4msobisXTUif5nQwF8jvbs+DhtUw2Rr/ve\nJeQE8ZiCVaynM93OjuwuQd6ShSTbWIZ/vhCgrND5Vq4Fu9jgkYvSIN7rlqYYNjZJ1dHsFckjLtcq\nIyl6IChNwth/DBAonOFcI6UmUWQJa0jYyPP0e+MNYPaOp0FtAFsKPUM1bnrnx2Iylkve5QyWbVGS\nRTu2HgdkQdJObAKAGjP8sUMQn/usl+xZ7uVkd3FSu59tJnHcS6RmPYuk6khmjNaGROBaQEwnHlMo\nODEPKUX0rTMlNN5sOYuqytiyLtwWtqC5MbFjkWwFdew28vWbkRQD25KIqWIcG2PNSIkCclOvqG8e\n8Hm7z8H1a8tNfdhjnRUThloVVHskEJH3UYxaUeFBM26tJQth8hbHFsoOwcmSZ+6CEaIxAyjVIJ4l\ne5fzu81/Iu6UKHQTHlQm+we8oLr6VIzm+rgXqeznwq4OWOst9PmEqRiA7ZVelOoyJE56GjsogDj3\nIAX81K7mDWHylOurE9G4ZnMXlZp0Lf+pYfpJNoaMAye3qWV216WiiLB2J+eA5m1LgQxteYe8bf8e\n9pZ3ICkmetfxGN2TgLDmHXz+linGRFZNz/pSMjSSJz0FQL1aj1VOIUli+ZUXSZ7MY9kWY+pGe+lb\ng9HphprHLtWBkRBaqwQ9Q4Jsi6azpGr/MSQK47wJXU6UeEL/XxLTnwtZB1plx7edyntCK0DRsa40\nJxqZ2ijiMFIteeceAuRddDRvyRcw3ICidCzFqAZB3iUr8Jwd8rZ0550xYk6kcfC9EwlyknGFsm7S\nnmwjbxTIlYre+YYhuwMi/ne/rQpSsDVBCLGJGyFWoqlRXNclJUWWQBcEVrByoh+q7hEj+OlTvWVy\nAdO+KsvYWgoZGSvmm91BmHx1A5JyGjk9jNK2l8TMZ5AdQSKlJokpMmbJ1byz/HHrwxjNwuftWg2E\ni8dGaXfW5scCPnkH7rNetm8Fd738W4KYN2Yu7516iX8/ySFQdFRJCEiiLdcXrpGIyRSMIkkl4WXO\na3LqqWfKQyKHhaO5e5kiTJVYYTSSbLFOe1ospzNVYoo4/8KxFyFJoI7ewdi2tDf/ueMcU2XqnMC7\n2Litfl/N2EEVHDrUiMj7KEN/IUPylCeQW/bXXCccJCOj1gtVg7xdYrVtO+QTr6V5u/5qqC7WAHjL\nJJTGAZANr22fvG1PA3LN5om4Qkdziv6hEoZp+ZHyjoYeLIPZEyBeSRZtuZp3/NhVyE7mLjcgxtMw\nAm0EyTtoqQhOhi4KevgeNcMK+fprJWUIErxsB9s8sLAFIg7AUHJYpTS25ZyrGCTiCrIsYztadHnT\nqZ42plt+H12t08o3CpO1c76LEHk7yT0kxcS0bAzTIqsNeoFCti152rmUKLK/wmffnGyixZkw3XSu\nSBaWpHt+WDdCtzebc8bL0byNGLph0eFoS7HxIpuZXJ8N9bfDnoptg9wUWAoGDNvid0uiBVsT10qk\nxHlFowSWjLZtltBuXXOrI2C4qTjr4knGNAvhQx29E6VjlzceAGXNyXtQSiPJlne+uz+pCGIp6xYt\nSeH3HigP+uTtnO8SnEsGleZYs38Mck6YY+Vkgfp6cZ6vecvYegJsWJ9ZizJqJ5IEiuW7GWwthW0H\nnoOsE1fiKLLiLAGVSMuNSIkCx09o8oPLzBjDeY3ZTWcgqQbxY1cj1w1jJ7NOH5IidqMk+rJjaCeP\n7XrSfxCGK4CIMY5PWRsao+A35Uac10JKTXJK58zQNkm2ScpJLzmPa2lQO/YwNGoJBb1ISvXT5SbV\nBEklSaacJaZIQrM2VRJxcb5hWpT2jQ9dwzZjnvvw2JaJIj4hVqalIeG9h+51Fdm3HoTa0OO159rD\njIi8jzI8vnOZSBRw3Es10xAGyeBHf1hbtT84eWu2+LusmTy8fKfIchUMOlGq28+X/PaD5kkXwQpS\nUsqv4OOlHJRsQbrgVeJJxAR52zbc9sd1dGcdE6ZD8oWSwc/+tI7t+4ZYvlVIvHaAmAbcDGSBicI1\nobkfYFk3uOuRDdz57OPszPjpX3/1WKAkZg0ff7DYA8AdD7/MMxt2+PsDWt7zG3q4/+ltXoY1gBUv\nB7JG1bBkDBULWJbN/zz0Mt/9zUrW7u0CyRZBOs49SorQvFVZwpZ1UnIdVqbTm1SCfmQ3iMc2Yz75\nB4Uwl0D2noDpaeZim6abFAMa6JT6qb5Glyiw28ls5b5DxbzM48sdM6yreTt+U9fE6fZxy0AXS9fu\n8zVcI0ZXT47lS2JYxXqkej/gUJIgZTfz2TmfADOOlWsRVhGnbbmhnyFpPzNaj6cp0YBWEtey4jl+\nt/lPDJYzKHoDZt94QAqR992PbWbzfiG8NSTqGN/a4l03Ptl5FxzXgusxsYvChy4lnVgKR4BKqkkS\nMZmybnoWiKyWDWhsYdLxfMoBUhjVmgYkFIe8pXiJtMNHSVX0W5Yc06/zPONOXeykEqg1YMvCwuCQ\ntyXrYc0dyA6oSDGd3WN+i5zyg62GCjpnj52HbVbTQVJJEFcV8jlQibNreE9ovyuY6Hum+hslv5JY\ncL16LeKb0Xo8lxx7MZIk0ZRoqNqfjqX9zHqm6iXsKSf3M1jOkI6F6y00JxrZm9/PQMOLYi4zVS+W\n4p7Ht1AeaGFy9p3e8dZwi5dpsi6pYhtxJFVHVWTf8uhp3lLoHqxcE+WNp4KexDBtNnVl+PkD6w95\nIaeREJH3UQYvwtGSR9C8fXJdvbWvan9wOYcZINpHV3RhmFY4KrZm1R0/GKyW5h3URCXVr9Lk19AN\nrNX1yFtm1mThk3txU6+fwcohnadW7eXZ9d3c8svneXHnDsDRLBHLSTzLgeFrIp3pDq8PAC9s7OXJ\nrmdZXlgUShm5bmcgM1egb66/2fXLu9jTm+dXj/vJN4JLjH58/1r+9MyOUCYwSxJ/nzdnfDga10Hf\ncJ6+TJElq/exbscgK7YJ4cQq1vtai2z4ZnPZICY596m7+Zb9PnqWF1MNkH+15l3YO9Zr31svr1ue\nyVnfNY2J6SkhzXt3r/PsHRLq2qvRtdsJ7nK1UvcenWdh9ApNRx23lb+8sMeL5G9K1lPWTdZvz2L2\nj64al3pzFJObJqEbFtZwC5IEckoIdW5ynrdPOFvcS05MY7uNjSLeApDxScMn7yKPPt/FcKmIbcP0\n8e00pfzJ2DYVLjhtgvfeubGK9YrQqpV0nqb6uDdeKVVohZpmehaIgVLGF5Zcn7kTeCincsSOXYXa\nIVZttNbVMXl0g9NH8b6NHS15edBd8vVQIfx1NjaGfttaCjlRIjZlFSYaKZf8HfK2HdO7jS0sHDbU\nxVNccf5xjG2vJ2ZWL29SZMVZlSJR6vOjqMubT0HvOs57zu3pZibEnWWkqk5RFXPP5I4OLxVqXQ3N\n+x+mLOSCSW8T/ayxfGx0U6OnOYPvv3aRVivJ2zGdpzYiyRa2odJQkRBmzqQp3t9mpsPLQJlMqCTk\nJHJcFwKPa4Gq4bcHMHomYGU7aUzHMEyLp1ftZdm6/fRnj0x+84i8jzJ4ZGHEvIQQQYQCoCo0ye99\n8kzGjvJfZMM2mDymkUmjGiiUDQzDqliPWi0cZEv+MqNaPu9g6khJMTzy9uoDy7XIW+GMmaOZM80h\nXOe6biajYPUeOT0szLmOyTc4oU0d3eH9PaqCvLsHi6GsX36DZu2/HXPgLv1lnt+/MnxOILCndi7j\ngHnc6d+0Cc186rJpVUdqZjkkhA3oTgnIYr0n8UuqCNBRFAkUHQXXzxj0AYoJzrUE2IbqTToN9YHP\nXNHF0idLEe3bEpYsyLikGb7mbsQo66YnxMjJgle8xBUWTC0WIka3L+75AP/90X9gUuMElPpBhu0e\n9sbEWH78nbO5/v2zAbDyTVXjIluOS8AwPdLxVg441291TNXZrF1V79pSfWuEbz1wBQwDhRjzZo5B\nkiTU3aeKZUKKiT12LRPGiOuVyqKm9LX/cBYA8+c1M3/2uEAwWIJETMEGGmMueQ/6AW+u5u1o7kpr\nN2rbPuQ6IYTc8E9v83IPWI5PefrxKe+7cjVvL6eMHo7GnzVugvf3tz8+jwmdghzV9n0YUtkjb1fz\ndp+Vi3Qsxa1Xn8Ox45qIqTIzx4r2bEOl05jBW8ecDvhLSs0+EX9QrzZgDY5G6fdzPnz742+lNS2+\nydiY7QzYuzm+ZSpffN+5/MvFM4BqzbshVs/ExrAZ28qFBZJxLS2hnPZSxZyUrqHNB2FrqVBFwpnH\ntPD2U8czPjlZXG+oDdW5P1mSOG5MBzYW31/zQz/4L0DeDTFfwDH7x5KMKzTVJzBMS9R4l6Ct6dBU\nX3wlROR9lMElC3dyrUTIh1rxosdUORSJbdg6MVUmnVTRdKeggHJgzXsoQN7lGpp3KFtWgLx9zdvv\nk0fejmTtfaTudR3ydgPzpGQOuW4IK9vmE1egv2ogAdKYOmfpiUMm/dlSyKzuJVEIEHZQcLED2ZTu\nWH936B6DwVmVZnWlfTdKu798zG1TUYTJOwjbktCscMrUYVMEuNmlOo+0pFiZZFxBloVA4+ZxxlKw\nLRkppnnpJstWwIXg3INhB56pE8ErGEEiIdWhSeKZarqFZrmFMWLifdATyCjIyYDP2yFRvaSCGcM2\nFc9c6xKrazaPqTJtyRaQID9KVGhS+o9lcvNEjxRcK0pojB0TsW5YHmkpDYNI8aInILgTaX8mXPEL\nwFQC5K255F1EqsuKwCzbJ8J0cRJG9zEAPLN/GT3SJm98VUVmVLodWZJZ2buGPnmrFxOQiiW9dzet\nOMVB9Kz/jjvjbzlL+jwyQBBZc6LJS0lslcWzzpQy3mqKSh+xvP2tnNJxovd7Zsdx3t8xVWFWy6zQ\n8UmPvMU4G/smc0LsLG99eqXw3ZRwnoNkc4w1jytnvFcc5xatGWrj7JaFvHvM+wHoqCCphrgjPIze\nSVxOcOnUd4b2B8n7golv4wunX0MlyhtOp7R2nve7M91BIjYyTbkCigvTDs95drE+RN5u8NtFoy6l\n+OLbwYyhBoJZ61RxDz3FXuRkwUmyI85RFZn6eB0fPuFyJmbeAbZMQzqGqsjohk1vtkRrQ7KqENTh\nQkTebzDolsEL3atGXPJVMv3JtbbPO1A4voJ8Y6rirSEGQLZEBKVTYSlb0MKm3Rqad1+gwENNzTto\nNlcM8k4ke9Eh76CJ17CdJTfOB5WIK0h1Wc8n7loO3OVZSpvwVZt943yTslJtogY/o5N7P2U9LJgk\nqHP6GPQHB9ZDl/2JqXISDd5DZWrP+JS1xKesCbTpkLcsoVNhTrNUdFsLZR3TLFdzjnvFFKR4mURM\n8QOeLH+JkK3HQfXJW7c1Z3mM4pnNTcIJQkLEJTWiSQWQRIpUL3LdUB3BSyItNSKlh4jPWYSUzHkC\nUbkozKlWoQEplUNp24OccM+Pe/fdFHdIIZHHzHQwpnwasiT7BXMMv7b4jBZhnVBNJ5LesDxBQB29\nk+QpTwrLhy15/s7eTMl7Z+aOOhWAMaXT/HE2Y9iGitLc65fRDFilEjEFK+dr/xa+5qwqMnElzsJj\nzmNYy/F88REUZy11OuYHU6WoR5UUitKQJxDalkIqoYAR9zK9uVAlX5sDMHSFhBJnsJxlq5NCdErT\nJIKQyg28a8qF3u+JjX5lrrgqc+boed56cPDJ0hUQsFSmpWbzDqeNyY1+8hcARXIEViksCfnr6yVa\n9anETfE8O5rDxOmSN8C5o89hQsPY0H41UBP+H45d6AsLQVgqdqGJy6dcwTWzP8bcUbOrqskFMXfU\n7NDv9x1/ifftg1jn71aQA0g6wlZSjXvvnRog2/p4WJMXFj4xfm5g2+mjT0XVxfuSTsaIKRKGKQJn\nK8fkcCIi7zcYFu94nP9Z93/cv/Xhmvu9IDFbqql5B3OaV5KvLNuUrTC5xlWZdDIG2BhNO/xoVaiK\nNpcb+1jc/cfqvgQQIrMKn7c6ertXHxfAQiz18j5OtRSuUSyHydsNGDKHW3yTskNoqiJj4pN3a7KV\nhJIIpYN1NSZzqIVOyzFhBwQc19xp5ZrQd87wtlea+4Jthgs01Fiap7h542VPsDGHm/nQ8R/ANhVM\nWw/lHNesssiFbckhzTsekz1BwF0/DIARR1I16p01/yaaFyTkBqwFyRtVJyb5ZFkvi0lIbhhk7eAa\nj7xtM0beqaLVoDoR5bKN2tnlkVCx4JiF841IEsSPXYM6QQRTeZYRSQpN0ma2jQ4nKU88oFG17lnI\nl97yWT4y/QOUN84hVRJJRXTDCsUyiPHQUOwEsiRjWlaoZOqxzZP477d/i3GcGDonKIyBsxzPQTyu\nYA2OQt80h45Um3+Q5QsYFx9zPh+Y8T5/V66JZEz1a0obNu2pdqz4MI0Nzn2ZKumEeBZuJjcXHzxB\ntOUSgmHatCRb6C8OsGlwK82JJi8ILvhadaTamdQwgYXHnIcs++MXU2WSCVUkxnFwaufJ4hoBzTKm\nysyfcDafnfMJPjDjn0J9mtYqgs7M3rApOzPsv++9mZIXhNpWURmsMemblMc3d3IgjFQO2MVJ7TM4\nrmUKkiQRj/vvu7Z9JlY5yWzlHXz5LZ9leutxofPG1o/m+jmf8n7bxbqQ5u0+r2CKYzVQddHVvF2Y\nw63e30GN2nUDphNqiPzbm4+MyRwi8j6ieOz5Lrp6wqkphwoaDyz1g5y2O8kLdgyJZSuPrOjinsc3\ne8uhvCQNstCUlq3bz12PbOS3T2xhqKCFfd4V5Js3CuGkIgHNW27qJT55HWpnIA96KEDGJhYo4wjV\nAWuWbYfK5EmKEYo2V8eJ7E5mthUz60ySkuV9nHvl1aH2RE1rvw61p7kYcZ/YHD92XVL1lr5JG9/G\njq4yacXPmAR45KdvO9ExHVdq3k7U9daTwIxTeukcZDPBkBZ+ZkENasPgZm596o/c+9ctxBM1stsF\nNG+3kIax5zjmjj0RLAXN0vnLCr82s47uCCaSuE9bglhZWCVc4UMPrO/V40iK5UUoIxu+VcJdR+ya\n6yUTSbZIyP6k26AKv3HsmHX8pe8BhmNOX4yY9+wanWPASTiiashWjGLJoqUhUdNnbet+bEWQvO1S\n2iPvYKnaJI2MruskEVOxsh1YTlSxHtC8XcjJAoolnv/9T2/HtGxithCwOlLtSJJUpa25goxVrMPM\ntnGccoa3TxwrYWY7mBYkA0vxtFZJkpg35jRa4+K9NbonElMV7zovbupF1uqRFJN0Y9k737VqeX57\nQNtyMjNahb/YNWnbNqT0jnwAACAASURBVExrOZaSWaZgFJnaPLlm8Q5FVvjc3E/xrikLKrZLwrxs\nJLC1BAoqJ7YLAVRRwiQPMLlpkhfU6eLE9hOIbTsXfdf00PZgVbvebJGCM1e11CdCxzUHyLsj3Uot\n/MeZX+Rrb72h5r4g4oHnlwz8bfZOoLzqbYxPTmZUXW0BIRiBbpdTNc3mSlCgCZJ3haBuF/0I+CDJ\nu0pJXSoW2t4RkfffH/b05vj1Y5u56X+eC23/5cMb+MNT2/ijk6fc9dmokkJfpshv/rKZxc+JZTa6\nqfumV4e871y0kcdf3MOi5bt4fkNPyOddGdzh1om2yk6QkWx6Pu9ahelD/uBE0VtD7aLSbL5++wAl\no+RPtorOcN5J0lLWQRITsbb5VH8Nsmx6H+cwvdiWRGnVOZhZ5+OXA+StaiKBhy37EbxOn+rTMXRL\nx9bjFLJJfvC71aTUdCi5hr+EJ4ahy1X36Js7nWIMRh2y1iisCZIFWMSnP4fS0uMtWQHYYDzDw8/u\n8pa+ubBtKeTzdjNC2UYMWZbEGnDZ4PHnffK2EMk3Jo6qByTQ48LnHTCbG5r/2bpaaSJtihSwiu6T\ntvMcOjqcNe+uoBPQLprjzc44OlYBxe+jm9K0PuaTr6Tqjt88TqFk0NaYrE3eAW3ZM5sjfPluLedY\nYFJWHRLz/LPOulndsJDNMEkAyGaSkmbw4DIh7F7UcQVXTv8nprUI7TFoKhX37rgjSmm0jXOZnvTN\n6u77Z9swJu2n6cSWQxM7wPunXIm2YwZm/1hiqkzKuc79T29n5y4np3Z6nTjdUkgnVc49ZayXZAXC\na59PmSpMvP9w5jGcMdrv07njzwx0vur2PbQ42cckSfJIpLT2TK4c93FPuw1mF4yrI5ugAS47ay7Y\nMmec4I/DhXP9wLjeTNEjrvGdgqynTxTvUFOAvNuStcm7OdHkrYmvhfEd4t0MCl+1zOYH8oMDTJFP\nQ983GZBFeteKtoKat+dWIOxDNzPtmAP+OAQ17wWnC5fD204ZGyJvNxvckUBUVewIoViuvfZv/4CY\nLF3Tn0veiqzSE8gnPVzQ6Q/UL0a2KGtmyHReLBuUpaDmHSbkYUeDtMtpSJRANompCnXJcO7lifHj\n2aVtCpO/G6S07xhGG7Pon/DnqoC1fFlDUkyxbjemISkG/UMlLMumZJSRZJvpbZP5+GfO56tPbKef\nHpAt74PSEZnF7HLaXx8qW77ZPKb564fLKSRkSAhLREdTim5LCwWaqXZSRKzLplgj6yWmUCmXbVER\nyCHsKWMbSU2qZ1sOT7CoS6pCY0oBqoYkWSL5DAifbkX0uhtjEJfjDL94BvETlgc0b9kb/1s+IqKX\nZTuGpYhc4u4MLSkGthHnuHHNfPby2Vz/2FKkZE5MHE77Wjkwm7sJJGI6LQ0xioqF5Wb0slQScoJU\nyuCmj8zllj8+AAjTopsfqjXRChUp6t1o9JyjeadUn4ileAlUDb2QwrJt0kmVn3ziHfz48TgbSi+i\nNGRoUBspBsgqpHlrqZqatxfxK0tIkh+kqBsWsRqEI5kJT7iYPKaBK+efSl+fbyFprwim0ndNI3Hc\nS+h7BbknAqbYoJY3OqTNSSGTKMC4pk7MHuGLjqkyHQHTsV1hGscU39aHFkyjdetuFu0Sgkaw1vak\n0Q386NpzhGUFeNv4M2lPtYX93QcoV/Ctj83zAh49Td2Ih4g0SE6V91OJd501hVMmt4a01ffNn8q7\nz5rMt3+9kr39ec+d0tqQ4NZrziYVF8cGY0Nqrek+GHz5I3PRdCtErkGzubftAH5wgOPUuazrEgpR\nkLxdn/dImncwT4W2KRA3QVggPPeUsZw+YxTppMpTq/wA1WT8yFFqpHkfIVg1UpUCVaYxw3KLhMih\nYhCFkkFf0c/JLSt+SktX8ivr1oE1b6fghuf/U0zH5+1XParPT+WczvnO/mCqVD/pQn3MCc6p8Hkv\nyQo/va0lhd9WFVWSBoZLXtnIpkQ9qiLTXu8kvohp3sdZtovexGa7NZklV/O2QdWwveAmmTq5Ednx\ng7c3J0WQn+l/1JYernYkMi4JE2nZ4V3h47dJxhUM1zfsCACphIrlraUuh0zwthFnsiYKIXjJLZzx\nPnPs6SiWWKftEroiS2TKQ0hIjKpvce6gVhIVE0yVeFymPhUThUEUC1s2sJVgoJjTDyeoTZcLtLW4\n5nKfHBpiDWS1Idqakshp8fyntvqaVGstDckQZnt3kp7deiqzW+eIcUgNI8m2l9WsLqkSjym0SZPR\nd8yEgQl8aMpVBNXFUGCSLdf0eYeIXJFFwiBElbRa0buSmfDM+lPGNFV9R5WBQ9bgaN7ffg22YyUI\nam5BIvdWKQT6EkRdYAKPq3LIxxn0N4MTsJZUkSSJ9nTAOmGE1x2nEiqyJCFJEv90/Lt5+4Szqu53\nJKiKHCLaWvcUJKr4K5C3JElV7cnOto7mJLphsddZMphOxqhLxjyiDa7jrmXyPxioilxlNamteR+Y\nvINCyiuZzYPHnth+AnVqmium/2NVm3WBQlCSJHn9DL67ifiRo9SIvI8QauUZr7Xd07wlhd6MT475\nUrXm7aIuJV4iTTdDRSrCmrfN5sw28Zdjcg6bzZ3lN6UpXuIKKWg293Ihq6QSKkk1GdK8dctgW2GD\nc7AFZswj/L5Myat85Urnk5pEQJJclyURU9AtQ0RKuyZ3JxmDu9YbVRdm4YD/s0FpEfV3FZ2OppQg\n74DmrZXcQLhAZivHZFly5CK1bT9Kx26x3MPSPHJXFYlEXMEoOwJATAv5usHGGhjDhORkp9604Res\nUBNiQgsUtFBkiaw2RH28zsvFbLqme1dIkiyRWMJUfFOuQ85lCuiKIF+9GCAMxyeXNftobnL8pwGz\nbGO8kbxeYG+xy8vHPGOUr9U1JeqIyeIeZdstpOFkbnPMo+lEgg/NfC9WOemZ122nIlk66QpbNnax\nAXXvKbSkwlHESSXBuPix6LunoioyTfV+JLoLNaAdKrLkFXoQmncN8jYSnvm2crKH2r7HukCyjrBZ\n1m+/MR7O8hXsl/gd9h8Hr2NraYrPBXzRAZ93Q9zXhG0zTN6viFfBg0HNVKkIbHu1cO91pxO3U1dJ\nskqck9pnVvnjXytqEXWlUHWg/alEtQl+pIC1hng93z7nZs4c+5aqNmu9ZxAm/1cSKg4lIvI+QhiB\nu6vglsNUZYW+rK9590gb+O2m+/0DA8TqlgYtaWaIUIOat9zcy7J9ItLbKjnLpGJlYorsmM2dNddy\niua0Y/JSqoO9MEVO6sZ4AwOlQTQnA1le9zOvGb0TsE0VJSau35sp8v/Yu/P4qMqzf/yfs81MJpls\nkAAJ+yabICgo4i5Qt69WWxUXcKlaRVu1daFUpbUPuFT9Wbva1trqQ12hllddeLpp1YLWlcUVtAjI\nkkD2zHaW3x9nmXMmM5mQZCYZ5vP+h8xkZnLmJMx1rvu+7uuOWevL7eA9uXqMeVyl+/CrDx7Gqzv+\nbZ6npJ7Y/knrzKBmNUZxFy/ZhVRCoB0DyvxQDc0zbG533rKDrjkkbZ6rcLsrWJTvRWNwM3a173Iy\nd0kU4VckaBFX8PZUrsdR3xSBz96yUo45vxO/5IMkCOb6Z8mcKxdFc7cjuwMUAGiqfYFiPq+4OLGk\nx+nnbL3fiNaGmGiNnESCieYeVrOaFr0egZB1fK7gXR4wA+nKj58xHx8NYGBxYs4x4JfNoXMAJdoQ\nCLGg01TEzmz9PslsEqO5A5V5UWF/gNt75IiC0GGeWBAEnFJ9DtQvx6KqPODMwbqzMzkp8/YOm4uY\nV3YhYlumIbLpaMS3j4PSNNK5uEgOIgBQXtJx7tGdOaWbUxUEAQtGXYzohzM7HFcyRRZR2mFnP8HZ\nIcuI+Z2LG89FQYoe+r3NfUHiHjbvjeAdjWnmErqkQCUIAr459RKcMvLkbv+MVFLOb2e4oHFfdLmH\nsv0phs2TL9DSCaYY4QAS9RoAg/dByT1s/t6n9dANsxfuftd2llu/bEJMtTJcw0BdY9jJAPeXJ/aA\n1ttLrAIq8zXNDy8De/WtqI/sT/xQV/C1h5dlQYbeWG0uMSpuhqJ4h819QgClwSLo4SDEUKPzGu7M\ne2d9G0q1oYjpcTyx/jWs/2C3s7etumeY9fqJIri6pjBiVqFdsbWOcnRlrbmOdsBufNGyHau2/MU8\nUCt428PmghL3NOZwF0KVyFaLVCWC0lJ7eU7iP09Ts13oZm1VKCUqsdvaBGCH+SErVdShrug980nW\n/2NZMiuW7QsdqWq7N/OW4tjXHIEMa3jWmuMHzEzTzLytD3YljrgRQVxXUe4aQraXfCnDzKYgpSF7\nIwvZmUqwq5TbjVZExCYYutnD2mn5GPfDiPuwH19ik/oP8xQ0JqqI7S077SmX2JbEOmDAXPs/sMgM\n3pE2H9o3HO08xh42d9bhC4lhUfu47OBk/32bc9YdPwztAJuuGtcdJCVR8BSs+WQRgwODoe0fAqO9\nFOquMdBVn7MbXjDFvvbuzMo5Blfm7Q48/qQ51YmV46C3mFXlyRciboospXyvV0y5GPH35gGaL2Xm\nndziMxv8nmJAd/DufnAZ6JqKSHXBlC2ZsuxMz3FPz9gXAuky784Up/g7AwBZTrwWg/dByJ15P7Rq\nA155dyfuXvmOp9HK8sfeRn2zOTcc0+PY1xxFZWkAJQEFQsRd9GP9J7IztiIFUtUObCsyd/s5acDp\nAOBtB2oF4UvGLwIMEXpbGUR/GIZoNfiQzbaZPsmHoF+GVl8LQdQhVe72PB+agoaWKN57y/xD/vN7\nr+PXaz7Axm3m4+zgamgydMEMmvubo4gLVvC2Mm9REJ0Mz3OeUvTrdg9ZuzPvErnEeZ8lQSvwuTJv\nLe7q2Caaeyy7s57wl0M9VePun2suvZGgt1ZgQukkSKFGSFWJZXSxz6bCMIC4nZl7Mm+/uYey9f7E\nYDPaNHsLy0TWO7bYWspTuQcQVZSUJC5A7A+BgUHz8a/sfx5hcb815SGgusIOgmaTFA0xRIw2xHeM\nxfTBiTXqdvAGzPXtRpv5evaccNAvozpoBqq2Fsks7LOCS0u7+Tu3i3wG+BJroO3gbVfXjq01f85h\nYwc6owLuoFhZGoAAYGhVx9854B16lCUhkXlrZuadnDFqmp5YrpNuODPpQzlV4RLQ8QPXPUfaWYGX\nnbFVliay/AGl5haVMsy/02CK4J343XXN6CHm//3Dxg3M8MhERul+T6LYu5k3kH4IORvsn+WTRQyz\nKtyryjpvhuK+6PLMSSuJkbVU3+/KcSSTPXPeuQverDbPESczKWmAPPQTfLjTu7ymTdgLsbzOmeON\najFEYqq5jlY30KaJEACE6meiWbaWFllV1MGA7GzacP74ryKyx9zooXaIjMtPnAXdMPBKXQPW7/3M\nyXy11lKIZXUIi/sQ9I81s1PV3NtWFAV8e958/PKjTzF1qoh3/55ocFLiC6IZifWPdr/o3U1WW0+7\nGMfOOiUVMVWHjihEeCtSJ9YOxsdNiZaR5vPNDz17pACAOd/tt+daXQ1GrOB95IwifN76mfVzXWug\nnUYuGgTr/BiajOKAbA25CmZBmL9jsxnJNSw4o2w2Pmr+wNmJKbLhWHO/agDtLQJQYgV915y3JEWc\nJVRicTNaVfPnuzPvb596PJa/vBX75E8hKDFUlgWxwzpGe8574UmH4uebXI1rrO5XQwYU47yTxiIU\n9OGdLwdgXcM/UOoL4ZRDvo5h1SFcEDYb5OwT/us89fCRI3HWCWbryTsunYn9zebWh9VNZlCwh8JP\nnjEUf39nBzTdQHFAdoYd506agj98bG7KcsnJhyEkDMCU0WbWfszUIRhUUYTRNWaf7GWXzkSFK6hV\nlRfh9kuPwODK1FXInjlvSUQsrsEwDKfaPDljVDU9MSef5kP1vmuPxusbduGZl825fk+Rmiu4JQc0\nd5CXU2Tw9187B63huJN1L7t0JprazIs+ewcrO4ja2Zq7u9hti7xVzJnMnjIYA8sCGF2ToiNZkvsW\nH43G1ljSnHfXC9Y6M6DU3BfdMHIbvAM+GcsunYnykB+KJODLfe2oTXMRaHNfdCkpRlnSFay53X/t\nHLz18V488Tdzu9p0GXqqi4NcYPDOEbswTSzdB6m0Ac2tewAkrh63la6Fuyg3psUQi5vLqEQB2Cuo\nKJaLIDQMhVi1y1xcJOowAAQUGYJsZn2TB0zA3z7ZZ3bv8oedtZhavVWQ5jOvnu250jZhPwRBgCDH\nYaiJhgMTBtdC/FhE3JpntTPvimAJmvfHzbXWmuQUpe1vbzH/muxqcSdwxs0GNIpVze5aQlJRFAK8\nsdvJrGPbJiIweb35GnIMorVlpN6ayFxDivke3t3/Nt7d/7Z5pyvzdobQ5Rj8483vG9EihII+TB0z\nEOs273aCfak+BOMGV2P9f8zzJEuCk50FjUrobaUQi5s9xwgATc0wg7ccd4oI/ZIPoiA4PbvF4ia0\nqOZzy1xz3n6fhOpQGfaFzfqD8jLRXLalJ4bNq0PeavD4DrOJSHFAdrLYE8dNw4mY5nmcX5FQWQpU\nxkc79w0vH4RqK3sqtiqFAWDW4MPx7KsfQ9s/BJNHVngyQ3e2NW5AotBt+qihngsxURBwyPBEtfWI\nwR23dxw5OH3w6ThsbjhD58mZt98nQdONRJerNMOZpUGf50PeHdB8nmHljnP0smQeQ6oP9oqQ31lf\nDQChoA+hoLeRjP1+3BcCK+bcDkkUUaIcWMFa8rntTFmJH2VJ8/2pmrR0hyyJqAz5sa85mnYIOVvc\nf0/2KE9n5DRLwVIWrKW4QAPM3/PIFH/Hydw1BRw2Pwg5w+ZWT+WInmKHKxd7yZdfkcwPJ1GDIvoQ\njWuQkpYYybIASUlsU1jXGIERC6BdS6x7tVtzhvzmB669WUMM7TAMwwrePkSsHbxkUUaFvxx14X0Q\ny/dCHmAPi7u2WlQVZ/mUvduYMyft7GGsojUchVhsZuYhV+FOyrWg9rB7WzmiH1vLk5QoxFAj9EgR\nEHd1B/OluPp27fhlX0BIFXsgKHFIrYOh7jSDn7OUyPp9+EQ/Lp9yEcT95m5DdsEaYFZdq9aOSoYu\neLL7BmsBgFi6z5mXtzd+QDwAI+aHEGzGjlZzyL0maSlSib1LkRxDaYld7Z0YNncXOk0UToTeYI6q\ndDXzce+6lG7tbUD2I/blKECXMLC8KG27R3exXbHcvXW86aQqWItZ65cVyRu8Az4JqmZkHDYHktY4\np8mQUs2P2wVv7u1dD4Q9kuD+PZX5Qx365OdCb2XeQOJiLpeZd3d4Mu8U1eBdybzNx2U+X+6Lg1R/\nS9nC4J0jTsGatYGCs/uTLWlLQ7tHud9ndmkSJA2KYG4Dam9q4ARvSXSGtQNyAPWNYQhqAO1qO1Td\nvD+shiEKIoKKGbTsIdKI0WbuYiQYgKZ4CuiqigagOdYC//h3nPvawq41yZriFGm1WNXmRofMW0Wj\n8hnE4hZUxMd4AkiqYOLeYcoZQg81QJDjHdbRFrn28lWsYUnPPLp1DGKRtca8bZIzn+tklFa2LMIq\nHrP+I8qS4BS6tEfi0PbVmIFb9cFd6mq0l0JvLYNUXg+p2mxp65f8zsWa3h6C6I9g475NKJKLMCxU\n63kPIcVV+e/XneO2P2R8kmvNtpgYdTiQzOeiCeeiSC7C5AET0j4mZm0vW1bs82Qi7vXSgiDg4gnn\n4uyxp3d7HW86SoqlYnbzEZ8ieoJOQJGg6ZmHzYH0WZV7Pa6U4jF2Zt3Y0vlFdjr2h36uM9RUpG4U\nZ6VjX8wV+/v+fXXGezHoyox9nS8VS9aF2J2x8U22MHhnQTiqoqXduyuY0yXMyvTiRlJ3LtVbgOEE\nb8Xa9UtUAV1GLK5Bttbl2vPjih28NRkCBNQ1heEXzMBoN2Zpj4dRJAcgSaK5VCfuh2EIaFWb8ceP\nVgEAtP2DPB9WA4OuTRosEVenOMOqKPcrIiL2Bh3OnLcVOBUVMdnMumvh3bIwOXifNOxYs2DKZjVZ\nEcvMPa6T23C650EXTVqAG2dcA3XXqMTx6e75bxHlYiLrtT+c7WAfMMzXtocYJUl0/qO3RVRA9SH+\n3ymIb0/sya3IImCIiG01h6ztna38kh/2SgB7HXZYi2B8+egOGzKU+q3aASXmbCBiaHLK5TElUuLi\npegAMp+ja2bivuN+2GlbSltxkeL5MEquDp9dMxNzhx/f5Z/dZUnLxlQtfebt90nQNHPY3C4sTCdd\n5uS+v7Pg3dDazeCdIvPuK+5h855edOVL5q2kec+pMu/OCtbc2/Wm09MLou5i8M6C6x96Ddc/9Jrn\nPrt6Fk7w9gZ3Q9CgR4LmGlZRcTbZ8CsSSopkCJKOPfti5iYMgt061GroIgnOhhRtERXhqIZia3/h\npqg51xpWwwhamar5QWj2zd7eth0fNXyKQfIIaPW1GDwgEVA9OyxZhg8yX3dAqd8pShs62Oc0QnEy\nb6dtpwpVtIbsFe/8kXsI8VuHXdlh/99xQ8xWlfb/PfcmAYD3P+DQkhqMLR8FGCnmvGF1RLOqdodV\nlzgZROyzqYhvH4ehhrk8ys4AZVFwisbs4VmtvhbaPnP4fNSQUqdHtxEtcvrFA4Bf9jkdLY32xEhA\nqsy3LJAI3s7fhC55AtL/G30Kjhx8OAJS9pbq2PPcQyqDnkrsQTnaaMHdrEiWBOiG4azEUBTJO2yu\nmHPebRHVHJXqJCBJXVjDW2o1jXFn92NqzIu5IQO6N8ztVyQU+eU++2B3S3Vx0l2DrL+T0mJfhkf2\nrXS/d/v3ka63eTK7F7zYyd9YV9eJ97b+ffmUp+xCG8MwnA+WRPA2/9XQMXhDC0DdNQbV46LYGfkC\nsLbLnDV5IF54C04wcipXreCtSCIgxWFEfE5L1XJ/KRoANESbMArmnLddLFUe8mPP/nZz/tgXRZEc\nwHdnX463yxow3bUcZXz5mMTxqTK+MuJknHTUZLz7aT3iqo6nt5hrzysrJWyPxc1kU1NwzNQhEMsF\n/CeyCZKswfBFYBhCh0zbfXt8xRgIgoAbzp0Gnyxi9/52zJo4CDe/9ifnfert3jluWRKwaOL52NL4\neYcLjbISH5paDRiGGfyrS8px+lEjURr0YfaUwSgOKLj27EPx8z9thLprDIQae7g8kXnbRU32nuTj\nhpbhzGNGYV9TBDPGV+FnqzbA3GFcgN40EKK1I5tn2Nx1wTFj0FQkq7CaqMiDvsCGfebPOe+YyZ6i\no1NGmu1qX3rjC+e+AaW9u2/wLRdMx0dfNGDK6AGIxTV8/YQxB1Qo1VPupZR2sLH/litCfs8oix3I\nW9oT+5ink/yhe+uF0xGNe7OpMTVluPTUCc4GGwBw8hFD4fdJmDHeu/NWV10wd5wzrN/XejN4H35I\nFRbOH4+jJg/utdfMhuRs+vuLDkdTa+Iz1/130dn5GTE4hEtPnYBDhqcfteqrCzQG7yxStcSmCppm\nz3mbHxya4A3eEPREYxIkdtzy+yRnq0l7DbOdeTt7RUsCdCEOQwtiZ521UUdxJT4PA/sjDeZuZLrq\nZN5V5QEzeFsXEjXFQ1CsFOG4ad4sa3jpUAwOVmN3+15EP5yFY4+ag1DQh+Om1WD9B4lK7WDQgICw\ntbm9gLOPHY09cRn/ec8cNheUKBD3IVDi/aAt9lQrm+996hgzCE8YYfX/1v3QxXarUKxjRe+RVYfj\nyCGHdzj35SV+8z+rIQCCgepQGfyKhLlHJPp6H35IFYJ+Ge1R1cmU7Yst93CsnXkfOnoAJo9MVH+7\nq5zj28cDhoihlRVQRNmpcTDCJdAjRThhzHTPHL1znEWJC5KdrealwNETRnV4HODNEFJ1EOuJytIA\njp4yBIBZiX3aUSMyPKN3uTNve5jX3rSnqjzgyYrt77dFVFRXdF44l5xVpbsYOW5ajee2KAgd7jsQ\n44ZmnqLIld4M3pIo4sQZQzM/sI8lz0PbIympZJpKyPR30JWitmzo+zGdg5j7Cl/Tra+tYXNnj2Xz\nljlfavfztpc7WTtuOXt0W8HSZ+/HbDdOkTSn4GxHnVn1XVtqZtANkUa0W/PRRdY+t1X2jkuy+bo1\nJemvom8+4jpEP5wJI1zq3bQ+oCSGyJUwRH8EmpUZK7LobK0nyCoEXxRGzO/Zlxfo2s5DJa3mHLPe\n2DED6uxDKRS06wLMoFDiSz386QzJG4bntuyZ844797l55v00H+LbJmGYPsN6Qet+Q0R0w3E4b/xZ\nKX9+kc/nFA8CZuFdukpud/FVLqtac8Gdedvnede+xI5x7mFz9+890/RBbwaufCX1g6H7XMvlUHa6\nTaeyrfB+qzlkL7sCEsPmguBu2WmxN9+wMm87wxZE1QreiblQAAiIAc9rGFYWb2gydlrBe0SlOV+8\nP9Jo7kcNIGgFVLvNYeyzQzEiNAynjpyb9j0E5IDTKtL9HyIYkJ3g/Z+ItZtYuGPwhq/dXI8eD3To\nhdyV4F0ePgSRjXMQ+++UDt/r7EMped/idEt07Kvu5P9/dntUIJF5JweCVEU79lW49+VStwwFzGVP\n0Q+OcgJ4mb/jDlm2rhTP5Bv7nRquM2af50TmXeQ59+7fe6bCqT76XO1XCvECJpdD2U5ilmMcNu9F\numF45lI8mbfmLVhzb7cJwargtoKz0SHztrqLWXPefsneDMMM3ppgXQioMnZY2/UNG1AJn6hgQ/1m\nJxjYa4ZDRebws948ELfMPK/L7y8580bS7kh24xdFFlFkWM1gfFZns5i/Q1WwLMo4YegcDAqmn1eU\nZbFDoZqts0KT5Cvv9MHb/Dd52FwUEsHbnitLfs1Uy4DsD8p0u8h1PE4RRqzIXG5WuRdiJzsu2Mv4\netJoo78RBAGGYSRl3lbw3tcOvyIhFFSSNjFxZ96dz3l39fdwMGPwzi7nsz3HDp5PgT62e387rrjn\nn/j7267+1/HEsqrkgjVB6ph5G9awuW7vNuWLwO9zZ97mtZZTdWy9hu7KvJvaYigt9iHgk+GTzCD9\nft0mAMCospEA9OeBHQAAIABJREFUgPJQ9ypFk5tcOHtuW/RwCLIkmPv/Wpm37rOat8T9Kfv+njv+\nLBw39Oi0P7OzZRzJnbHcyoq9c8IBOXWBlz13XGQdWyITTPS/brcadSRn+ikzbyl1Jp+JfcEW19MX\nOdmvOXxQ560h84m9JMtd4S675rQHlgc6jES4f++ZMu+DbXqhOwrxHOTygqWvLqaZefeStz7aCwBY\n+ddPnPvcm444QyuiO/M2AAiJPautYFgSrwX8m6AM+wQ++WRnztvOvINyEIh3zLxrKspR7h/gVMi2\nurbpBIDRZWYR0qSRlTh99ghMH9e1Stprzz4UX9a3ej4EKkJ+nHroNLypbcWYkrF4+8P9MNpKofgT\nFfHmkjd7NzJft1oHdjZ3lep7t1wwHe9vrcfXjh8DUQT+ZT9WTP2zrz17Cl5Y/wVOn20VaLlesqqi\nCKOGhPD5LnP0IPkDIdV8qzNsbkVaWRJxwdxxad8DAFx51hSsa9iBrZFdnuHjZGcdMwqqpuOsY1IX\ntOWj75w/Df/3n+04+fBEEdSx02rQ0h6Hbhg4ekqiHuPCueMQ8MnY+mWip26m4D24MohTjhyOKaMq\nO33cwcyvSDhzzshO29MebIr8svmeh6R/zxfPH9/lTUk6M2N8FU6cXotjpw3p8WsdCAbvA7S18b+o\nC9fjqCHezQXswCYEWiGW1UPbMwLRlJm3GagF0XA2FnGG0q3g7YsOQoV/KBqKdwBSvMOcd5EcgBAX\nnNakLaq5DehXpo/F7JpEj+sLJ3wNL3z+NzRGm5znAeaQ8NeOTywDy+TwQ6pw+CEdA/3Xjp6Mq6uO\nwsdb67B+rbkft/sqNCgH0BSzh/SVbgZvMem22XMaSD1sPmFEhVOpfv5J4/Avc5dMKGLq4dXqiiAu\nPTWx/lpAYthbFARcd85UfPfnr6c8lmCKLlPJAf74aTU4cXpth8e5nXncGEzYfiZ+uWEfFhxydtrH\nBQMyFn7lkLTfz0dDBhTjklO869/H1pbh21/vuKzOXimwbXeLc1+mYXNBEHDeiWN74Ujz21ePHZ35\nQQeZTO/5pF6qmpclsU/+XzJ4H6AH3vkFAGDW4Bmebln2XHdgqtmcJdJa4Q3emrdgDYCZfetyIhu3\nhs1jcQ2KHgREQBOirszbqjZXJBTFi9BqBe9P2z+CKIiYPND7ITin5kjMqTkSb+95DwNTNFzpLe4P\nUPeSnqASRFMssZtXd7bLSw6YPlmCqplDy501TrAdWzsbr+5ch9HWlEEmiepz89+yksQUQ3Kmn7pg\nzTts3tWGVhWBciyddWPXHlzg3Bdt7o0/iAoJ//K7SdVVZ04ZAJKnWARRSxo2TypYgznsbcQDEKwm\nJPYcciSmQdB9ZvAWI4iq1lIxe523LCIoF6FNaoXgb8OeyJeYWDnes4mF2+GDDuvRe83Ep4hmP2rd\n8GTe7mpyo7uZd9J8kt8nOXPQXWn1eN74s/DVMachIHdvXbS3mYP3WFIOm9tz3vbwd+FNN2ad5ClY\n40cYFSYWrHWTmlRY1CGQGELSsLm9zjuRedubeiSGzc3gFo1rEKyGJHEjirC1TttemuWTRQSVIkCO\nQRpgNvaYOWh6z99UNwmC4HyIuueQPOuVXZttHIjkbPdAd0USBfGAAnfyum+3mKp5bqfaijIx523/\nfEbv3uYtWOvfG2QQZQuDdzfFda3zBwg6/vi3T7Hps30AXJm36FoTaFecJw2bb9vdgi92mvPcUSOM\ntri53tVu0qLIEkqUIATRgFS1A7IgY2rV5J6/qR6wP0QVxTtsbjNUxbOTU1clD5tne79cZ847xfda\n2uOe26kL1rpXbU5dx8ybiMG725Izb7ufucMKyA88/T6AdMPmKgZVFGFghbWHtWvplb0dZtQIO3tx\n25m3IotOYBT9EQwtHppoitJHjpo0CANKAzh8fLVzX1BJtAOVoXSrjaA7eI8cHMKF88b37EAzSZEo\nf+/iGZgwvByzJ3v34lZkEbMmVnsKopKHzZl4976JIyowqDKISSMrUFHau21iifIFL1u7SdW9WVgs\nufuVNY9tf3inK1i7Y9FMPP3uK3izHYAuosgvIRzVnK0129VE8LaboiiyiGI90XQk5Ov7db9nHjMK\nZyYtYXIPm/vl7q0td+/zfPslR2R9swdnnbfr1zRuaDluuXBGx8cKAq4+y+z89vQ/twBwVZsbicdQ\n7xo3tBx3XXVUXx8GUZ9i5t1NquEdNjdbV7rms63MuzJkZsTJvc0Bs1GLTxFRVGT9GgzRqdy2s+zW\nWBva4+3wiT5nWN0niyj3JdYvhtIUqvU1d8FadyrNgUTBmiSaLUaz3Xwh0XGte+Pe9pJBnfVqRJRF\nWc28V6xYgffffx+CIGDp0qWYOjWxdnPlypVYs2YNRFHElClT8P3vfz+bh9LrkofN46ruZNsAnK8H\nWMN6qea8BVmFJIoIBqzgrZutIOubIs6weVu8De1qGAEpALs1hSKLKJMSwbs0zaYbfc09593duWq7\nOMkO2tnecEBI7pd6gOSkJi3MvIkoG7KWeb/55pvYtm0bnnrqKSxfvhzLly93vtfa2opHHnkEK1eu\nxBNPPIGtW7fivffey9ahZEVyG8u4qnn7lVvBu9Rqv6m72qMaqnnNJPniePHzv6MdjQDMOe9ia39i\nSfdBgIDWeBva42FnO0/ALFgr8yeCd1mgf3ZOch9z8qYkXWVn3nZGm+3t9xLD5t2M3slLBhm7iSgL\nsvZJuG7dOsyda+5WNWbMGDQ1NaG11exzrSgKFEVBe3s7VFVFOBxGWVn6/Vb7Ql1jGI+t/djZDjJZ\nqszbvVOY0/LUCgLujUnsrFoYsAN/+XwtXtn5uvVY0Wk6IUkiipUgGqNNiGgRTxaryKI3ePeDOe9U\nZDExsNPtzFtK7K8N5K5Pc7eLxa0n9tU2gURUGLI2bF5fX4/JkxPLlyorK1FXV4eSkhL4/X5ce+21\nmDt3Lvx+P04//XSMGtV5v+aKiiBkuXeXCVVVpZ8rXrHyHWzZ3oiyUABXnNVxO8rikOJ5viCJiXXb\ngJN5y4qEqqoQJFkCYEAQAD3uAwLtHY+nrBhl1hy5IokYXl6DD+o+BQBUliSC9eDqEAJFiYA9fNAg\nVA3su3nvdOfRCNYC7wB6OIjSEn+n5zudygpzSkCWRef5AZ+E8cMruvV6mVx46kT86JE3cN68Q7r1\n+iWhAKqqQrj6nKn45aoNOHXO6C69TjbeS6HhOewdPI89l4tzmLNqc/cwZGtrKx5++GG89NJLKCkp\nwSWXXIKPPvoIEyZMSPv8hoaOwa4nqqpCqKtrSfv9fY1h69/2lI/b19CCOiVxf2tbzDNsPn54CB/s\nBMLhOOrqWtAeiSWK1TQZhi4msnPLVacfin+tM3+uKAoY5B+ED2AGb9lINKNoaQ5DiyZ+dVq72Ol7\nyabOzqMAH04oPh8vvl0HjDO6dYzhtqj1WnCe/7MbjoMgICvveVRVMX57y4kQRaFbr9/cHEZdXQtm\njhuIw7v4Opn+FikznsPewfPYc719DtNdCGRt2Ly6uhr19fXO7b1796KqytzcYuvWrRg2bBgqKyvh\n8/lwxBFHYNOmTdk6lG6xLzbSjdJ2GDbX9KRtPs3MW7NeJ2q0QvBHrBcXALXjdZMsys7wuiyJqA3V\nON9zL7tSJNFTCFWi9M9hcwCo8g0GNB/8Svf+1Ox13u4qc9GqPM+W3hqaL8StGIkoN7IWvOfMmYO1\na9cCADZv3ozq6mqUlJhBpra2Flu3bkUkYgazTZs2YeTIkdk6lG4xUqzTdY8edChYi2uA7B42N7Nq\nOxjvqFqDwNRXrRcSYcQ7NpdQRBmqtaRMEgUMK0kEb/dabrt/+IiQuctSd/t254IdfANK9wZ5ZDm3\nc91ERPkga8PmM2bMwOTJk7FgwQIIgoBly5Zh9erVCIVCmDdvHr7xjW9g0aJFkCQJ06dPxxFHHJH5\nRXPIvdRH1VX88aNVmO3aBlQ1OmbeYlFiqMQQzO87Veae1xZgtJVBLDYff+GEr+HThs9RVTQQmlYH\nwCxYqykZjONqZ0MWZcypmYUnsN45JgD47uGLu70eOVfsgjNfN1qjAole6WKWq8x7C+vUiCgXsjrn\nfdNNN3luu+e0FyxYgAULFmTzx/eIu8nGxvoP8cbut/HG7red76tJvc1jqg6xsinxfGgQBQGaYeCL\nvc3eFzdE6K3lQPUOAImtO4FEm1VZFCAKIs7vZH9nScxun+/eYGfe3a02l6zny3mSeff3iykiOjjk\nRzrTBxKZd+pGG8lz3jEtBiHYAr3NrArXoEIUBei6gR/8YZ33yboIvS310rhZE83+2ccfVpPy+/mm\nImQO6Q8o7V7v9UTm3b+D9xGHmPUcIwf3zzX3RHRwYW/zDARBgF/s2Jc7ntzbXG6EIBjQWiogBJuh\nG6qzx7Wn8xoAASLu/8ZXsOaLCMaVj/Z878hJgzC2tgyVKTZc+NkNxyY6teWJMbVluPvq2RhY1r3g\nbQ+79/fg/c2zJuO8ligGlhVlfjARUQ8xeKfhDJsLqbt6JQ+bq4K5lE2PBiHpkpN5R2Kad/03zD2m\ny0sCWDTp/JQ/e0CaQJevexdXl3c/oLl7m/dnkigycBNRznDYPI3EUjEBmqF3+H6HLUFFa9vOmB/Q\nRWiGBkkUsLehvUPmDZ2nvavsXuH9PfMmIsolRpE03FXDWlKWDXirzQ3DgC5Za7jjPhi6BNWIQxIF\nGAY6ZN727mCUmZ1550vBGhFRLjCKpJEp845riYC8e387oJidwIy4HzBEqIaayBalpODP4N1lSp7M\neRMR5RKjSBruOW/N6Dzz/vmfNkFQYgCAUn8I0BKZNwAIHQrWGIi6yqdIkCUBRT6WZxAR2Ri803A3\nadFTDZu75rwjMRWCEoUiKvjhJbNRO6AUcT0OwT67ycPmev9fn91fyJKI755/GM4/aWxfHwoRUb/B\ndCYDM/NOVbCWCOiabkDyx1DmC6G02I/yYDF2RXRIkpW+JxesaflZNd5XDhle0deHQETUrzDzTkN3\nNWlJOeftWuet6ToMKWoOmQPwS+YabdGa6xaS57w1XjMREVH3MXin47RHFVLPebuGzXUhBggGQtbu\nXgEreAv2RiVi0rC51rHpCxERUVcxeKdhrxRLV7AW01QnOzdEs9K8WDG37fRbu3wJaTNvDpsTEVH3\nMXhnIKYpWPt8dyN+9Zy5B7kmmpXmQTt4S1ZmbQftDpk3h82JiKj7GLy7INWcN0QNb31sbt9pWMG7\nWDaDtzNszsybiIiygME7A90wUg6bu7umGZJZvNZh2FxUARgQ/OGkF2XmTURE3cfgnYFupMm8reBt\nGAYMKXnY3NoRTFQhDdoGsbjZ05iFTVqIiKgnGLwzMAwj5Zy3ORRuQDcMCLKdeZu7StnD5pBUSKX7\nAQCXTlqQk+MlIqKDH4N3BuaweYrMGwAkFf/e+SaU2q0AgGDSnLchqBCKWmDEFVQFB+bkeImI6ODH\n4J2BYaRYKmavAZdUPPnpaufu5DlvTYpADIRhREKQRc5zExFR72DwzkDXOxasybDntL33FyctFYvI\n9eY3wqEO+38TERF1F4N3BobRcT9vUU/MaeuRIud+RTSXgNnD5mFxHwBAiIYwpHgQJMOH+M4xEFiv\nRkREPcDgnYFhGNCT5rwF3cysBUmFEU0Eb8GKyvawuV1ULmh++CQfZukLoe4cl/2DJiKigxqDdwYp\nC9bsJiuSCgjmBPjo8Hzn285SMYtgPV4wmHITEVHPMXin8NH+TyH4zMYqqQrWjLgVjK3gbRgCSvUa\n5/uKKENxFagJujeYExER9QSDd5KWWCt++t5v4J/2CgAz8/7vnibPY1S79kxSIQgGYAiQRG9WHfKF\nnK9FnbuIERFR72HwTtIWbwcAp6hM1XTsaWjzPMauX7MzbxgCxKQzGfKVOF+LOnuZExFR72HwThLT\nY57bcVV35rVtumadNsnsXW4Gb++pLHUFbwHmELoB7+sQERF1B4N3koga8dyOxXVAMAvWoh8dgaJw\nLbR6c37bnXlLSeu/Qkpi2Dz5e0RERD3B4J2kPSl4x1XNybz15gEI7T0aajQAABCUqBO8haQzWepP\nBG8hKXgn3yYiIjoQ7NmZJBz3bt8Zs4bNDQMABLRH44Dqg6EqEAJt1lruFAVrimvOW2SwJiKi3sPM\nO0lY6zhsLgg6YJinqj1ilprr4WIIgTAEUYNhCB0CtGepGDNtIiLqRQzeSbyZt4G4pjtD4wAQjpql\n5kakGIJgQPBFAUPskHlLouR8bX+L5WpERNQbGLyTeDJvwUAsrgGCDlmUMLqmFLo5fg4jXJx4nCFA\nTMquJw+YABgCYtsmdPgeERFRTzB4JwnHXcFbVJ05b1EQ4VcS2bQeDSYel2LYPOQrwYTGi6DtGclh\ncyIi6lUM3knCqmvYXNSdanMR3uAN3fV1ig5rAKwit8SwORERUW9g8E4Sdi0VEyTVWectQoLf5w7Y\n7lPXMfMG4AyxC4zeRETUixi8k3gzbw2abkBwhs1dp8u9Q1iKOW8gEbyd2M2KNSIi6gUM3kncTVoE\n0W5ibgZvn2vY3NATpy7VUjEAmDKyEgBw2NiBnvs5BU5ERD3BJi1JYpqrt7lkB28doiBBkdJn3qnm\nvOfOHIZDhldgWHVJh+8RERF1F4N3kqh7Y5KkzFv2BG9vIE+VeYuCgBGDQx3uJyIi6gkGbxfDMBDX\n4s7txLC5DkmQkoK3O1innvMmIiLKBs55u8R11bttp5TIvCVBhCy5h8q9WXiqYfNkrFcjIqLewODt\nYs93S4JZmCaIGiCqEATAJ/o9mbe7YC3dsHk6zNGJiKgnGLxdolbwDohW9zRRg+Azq89LlNABF6wl\nG1xpvu7omrLeOWAiIipInPN2iVvFakVSEG1aCyCp5sYjAEqVEGQxdcGakWadd7KTZtSiOCBj+riB\nGR9LRESUDoO3i5N5C2aGLEgaBMXMvEt9pZCTsm33110ZNpclEXMOHdJ7B0xERAWJw+YuMavS3O8M\nm6vOsHmZrzT9UrE07VGJiIiyIWPw3rp1ay6Oo1+IWcPmPqMIgJV5W8Pm5f4yyHLP5ryJiIh6Q8bg\n/e1vfxsXXHABVq1ahXA4nOnhec0eNldgBm9zztvMvCuKyrwFa8jc25yIiCgbMs55P//88/jkk0/w\n4osvYuHChZg4cSLOPfdcTJ06NRfHl1N2gxZBV2BoIgRJBXwRGLqIkFKMqBRJ/URD5LA5ERHlTJfm\nvMePH4/rr78eS5YswdatW7F48WJcdNFF+O9//5vlw8stO/OGIQG6DLG4GWKgHdq+wVCUpA5rbhw2\nJyKiHMqYee/cuRN/+tOf8Je//AVjx47F1VdfjWOPPRYbN27EzTffjGeeeSYXx5kT9py3oUkwNAmC\nYt6v7hwHRfL2Nvd0WwOYeRMRUc5kDN4LFy7E17/+dfzhD3/AoEGDnPunTp2aceh8xYoVeP/99yEI\nApYuXep5/K5du/Cd73wH8XgckyZNwp133tmDt9E77A5rhi4CmnlqDEOAEQtAlrztURXZvc5b5Jw3\nERHlTMZh8zVr1mDkyJFO4H7iiSfQ1tYGALj99tvTPu/NN9/Etm3b8NRTT2H58uVYvny55/t33303\nLr/8cjz77LOQJAlffvllT95Hr7CXihmqBEO39u6OKwAESJLgqTZP7rbGYXMiIsqVjMH7e9/7Hurr\n653bkUgEt9xyS8YXXrduHebOnQsAGDNmDJqamtDa2goA0HUdb7/9Nk466SQAwLJly1BTU9OtN9Cb\n7DlvXXNl3roMSTSryd0BO3nZGIfNiYgoVzIG78bGRixatMi5fdlll6G5uTnjC9fX16OiosK5XVlZ\nibq6OgDA/v37UVxcjLvuugsXXHAB7r///u4ce6+z57x1VYSzFEyTnEDtnvNOzrwZvImIKFcyznnH\n43Fs3boVY8aMAQBs2rQJ8Xg8w7M6MgzD8/WePXuwaNEi1NbW4qqrrsLLL7+ME044Ie3zKyqCkGXp\ngH9uZ6qqQp7bwqfmMUqiD7D28jZ0CQFFQlVVCEUlifddFFDgXMIYAgYOKO7weoWiUN93b+I57Dme\nw97B89hzuTiHGYP39773PSxevBgtLS3QNA2VlZW49957M75wdXW1Z7h97969qKqqAgBUVFSgpqYG\nw4cPBwDMnj0bn376aafBu6GhPePPPBBVVSHU1bV47mtpN39GuN2A4Lf28tYlSKKAuroWxOJa4sGu\nixEYApoa2+EvwOQ71XmkA8Nz2HM8h72D57HnevscprsQyDhsPm3aNKxduxbPP/881q5dixdffLFL\nmfecOXOwdu1aAMDmzZtRXV2NkpISAIAsyxg2bJizTnzz5s0YNWpUV99L1tjV5mocTuYNXXIqy93z\n3JJnqRg7rBERUe5kzLxbW1vx5z//GQ0NDQDMYfRVq1bhtdde6/R5M2bMwOTJk7FgwQIIgoBly5Zh\n9erVCIVCmDdvHpYuXYolS5bAMAyMHz/eKV7rS1E9BlmUoaqAPedtqDJ8VtB2B2jJ9bVhCDBARESU\nGxmD9w033ICamhq89tpr+MpXvoLXX38dP/jBD7r04jfddJPn9oQJE5yvR4wYgSeeeOLAjjbL4loc\nPlFBXNUhfjEdyvCPEd5+CJSqjgMU7gI1QTAYvImIKGcyDptHo1HceeedqK2txa233orHHnsML774\nYi6OLeeiWgw+yYeYqkNRy1C291hA9UNJUSgnJbdKNRi+iYgoNzIG73g8jvb2dui6joaGBpSXl2P7\n9u25OLaci2kx+CQz81ZkCbpuBmR3NzWbuymLJAuoCAVydpxERFTYMg6bn3XWWXj66adx7rnn4rTT\nTkNlZSVGjBiRi2PLuZgeQ7lYigZVQzCgQNV0AHDmvN3cwfvsY0alDPBERETZkDF42wVngLmka9++\nfZg4cWLWDyzXDMNATIvDJ/kQ13Qosoj2iAogc+bNGW8iIsqljOmiu7vaoEGDMGnSJCeYH0xUXYUB\nwwzeqg6fLELVzcxbSbEVqOgJ3kRERLmTMfOeOHEifvKTn2D69OlQFMW5f/bs2Vk9sFyLWq1RFVGB\nqhlQZBGaZs15KykK1kT3rmIM30RElDsZg/eHH34IAHjrrbec+wRBOOiCt92gRbY28VZkyZnzTpV5\ne4fN9RwcIRERkSlj8H788cdzcRx9zt4ONBG8RSd4y3IiUFeVB1DXGPEOmzPzJiKiHMoYvC+88MKU\nc9wrV67MygH1leTM2yeLUK1hc9k1RL78yqMQi2t4+p9bnftYsEZERLnUpQ5rtng8jvXr1yMYDGb1\noPqCvZe3CHN+293HXHb1MZcl0dkaVI8UQQyEUSQX5fBIiYio0GUM3rNmzfLcnjNnDq688sqsHVBf\nienmsLmExLC5rUM3Nfs5Hx+B4NCdOPb4g2v+n4iI+reMwTu5m9quXbvw+eefZ+2A+krMybzNU+Ju\nzCKLqZbGGTCixZD3TIFPUlJ8n4iIKDsyBu9LLrnE+VoQBJSUlOC6667L6kH1BSd4GzIA3ZN5y510\nTzv4VrwTEVF/lzF4/+Mf/4Cu6xCtoq14PO5Z732wiOnu4B3zbEYipxk2JyIi6gsZo9LatWuxePFi\n5/ZFF12El156KasH1RfsgjUY5ilxr+2WUgybc3UYERH1lYzB+9FHH8WPf/xj5/bvfvc7PProo1k9\nqL4Qt9Z5Q7fmvBV3wVr6wfGDsVUsERH1bxmDt2EYCIVCzu2SkpKDMmBFtCgAQLCCtzvzdq/ztjHx\nJiKivpJxznvKlCm44YYbMGvWLBiGgVdffRVTpkzJxbHllB287cxbUdzrvDnnTURE/UfG4H3bbbdh\nzZo12LBhAwRBwJlnnolTTjklF8eWU1HVCt6anXm7C9YOvpEGIiLKXxmDdzgchqIouP322wEATzzx\nBMLhMIqLi7N+cLlkZ96GZgZt91KxUNDX4fFVZQEAQG3VwXUeiIio/8s4Hnzrrbeivr7euR2JRHDL\nLbdk9aD6gp1566oZvH2yiOVXHolLTjkEIwaHOjz+lCOH44KTx+HKMybl9DiJiIgyBu/GxkYsWrTI\nuX3ZZZehubk5qwfVFyJaBIqoQNPM24osYsiAYhx/WG3KxyuyhHkzh6XMyomIiLIpY/COx+PYujWx\ng9bGjRsRj8ezelB9IaJFEZD8iKvWHt6ddFUjIiLqSxnnvL/3ve9h8eLFaGlpga7rqKiowL333puL\nY8upqBpFQPYjxuBNRET9XMYINW3aNKxduxarVq3CkiVLUF1djWuuuSYXx5ZTyZm3z9UelYiIqD/J\nmHm/9957WL16NV544QXouo4f/ehHmD9/fi6OLWd0Q0dUi8Ev+xHXmHkTEVH/ljZC/eY3v8Fpp52G\nG2+8EZWVlVi1ahWGDx+O008//aDbmMTuax6Q/IjHzYo1Bm8iIuqv0mbeDz74IMaOHYs77rgDRx11\nFICDt4931FrjHZADaGPmTURE/Vza4P3yyy/jT3/6E5YtWwZd13H22WcflFXmABCx1nj7JbNgTRBS\n7yRGRETUH6RNL6uqqnDVVVdh7dq1WLFiBb744gvs3LkTV199NV555ZVcHmPWOZm3VbDmk6WDdpSB\niIjyX5fGhmfOnIm7774br776Kk444QT8/Oc/z/Zx5VRYjQCAWbCm6hwyJyKifu2AolRJSQkWLFiA\np59+OlvH0ye8mbfG4E1ERP0aoxSA9ngYAFAkB9DcHkdx4OCqpiciooMLgzeAdtUM3qLuRzSmoao8\n0MdHRERElB6DN4D2eDsAIBoxT0dVeVFfHg4REVGnGLwBtFmZd6SNwZuIiPo/Bm8kMu+WVvM2gzcR\nEfVnDN5IzHk3NZnd1TjnTURE/RmDN4C2eDsUUUFLmxm8K0L+Pj4iIiKi9Bi8YQ6bFytBRGPmpiQ+\nhduBEhFR/8XgDXPYPCgXIRLX4FNEiGyNSkRE/VjBB2/d0BFWIwgqRYjFNfiZdRMRUT9X8ME7rEZg\nwECxHEQ6NoGoAAAYmElEQVSUwZuIiPIAg7ddad6so6E5Cr+PwZuIiPq3gg/eMc3co3zL9jYYADNv\nIiLq9wo+eMd1M3gbunkqGLyJiKi/Y/DWVfMLBm8iIsoTDN5W5g3dDNo+peBPCRER9XMFH6ni1pw3\nDPNUBFiwRkRE/RyDtzPnbWfeDN5ERNS/MXhzzpuIiPIMg7cz583gTURE+YHBW/MOmzN4ExFRf5fV\n4L1ixQqcf/75WLBgATZs2JDyMffffz8WLlyYzcPolDNsbhWsiSI3JSEiov4ta8H7zTffxLZt2/DU\nU09h+fLlWL58eYfHbNmyBf/5z3+ydQhdkrxUTNP0PjwaIiKizLIWvNetW4e5c+cCAMaMGYOmpia0\ntrZ6HnP33XfjxhtvzNYhdEksqcOapht9eThEREQZZS1419fXo6KiwrldWVmJuro65/bq1asxa9Ys\n1NbWZusQukR1qs3NzLu4SOnDoyEiIspMztUPMoxERtvY2IjVq1fj0UcfxZ49e7r0/IqKIGS5d4vJ\nqqpCkD63bugi5h85Al89aTwkznsfkKqqUF8fQt7jOew5nsPewfPYc7k4h1kL3tXV1aivr3du7927\nF1VVVQCA9evXY//+/bjooosQi8XwxRdfYMWKFVi6dGna12toaO/V46uqCqGurgXN7ebrGrqEuTNq\nsH9fa4Znkpt9Hqn7eA57juewd/A89lxvn8N0FwJZGzafM2cO1q5dCwDYvHkzqqurUVJSAgA45ZRT\n8MILL+Dpp5/Gz372M0yePLnTwJ1NqqvaXBILfuUcERHlgaxl3jNmzMDkyZOxYMECCIKAZcuWYfXq\n1QiFQpg3b162fuwBi7matMgSh8uJiKj/y+qc90033eS5PWHChA6PGTp0KB5//PFsHkannI1JdAmy\nxMybiIj6v4KPVqquWg1aBBaqERFRXij44B3T4xAMs4qdmTcREeWDgo9WcSt4CwJboxIRUX5g8NZU\nCKw0JyKiPFLwESuuxwFDYqU5ERHlDQZvPW4tEyv4U0FERHmioCOWYRiIaXFAl1lpTkREeaOgg7dq\naDBgwGCDFiIiyiMFHbxjWsz8QpcgcdiciIjyREFHLDt4G5rEYXMiIsobhR28rb7mhsaCNSIiyh8F\nHbHszFtn5k1ERHmkwIM3M28iIso/BR2xYnoi82a1ORER5YvCDt4sWCMiojxU4MHb3stb5FIxIiLK\nGwUdsRLrvGXOeRMRUd4o6IjlLBXTRQ6bExFR3ijs4O3qsMaCNSIiyhcM3gCgSdzPm4iI8kZBR6zE\nsLkERSnoU0FERHmkoCOWe9hcYcEaERHliYKOWFFnqZgEHzNvIiLKEwUdseJWhzWDmTcREeWRgo5Y\nMVfmrchS3x4MERFRFxV08I46c94iFLmgTwUREeWRgo5Yqq5CggRAgI/Bm4iI8kRBRyzVUCEKMgAw\n8yYiorxR0BErrschwpzrZvAmIqJ8UdARK66pruDNgjUiIsoPBR28VUOFYJingJk3ERHli4KOWKqu\nQrAybxasERFRvijoiBXXVQgG57yJiCi/FGzEMgzDzLw5bE5ERHmmYCOWqqvmF8y8iYgozxRsxIpr\nVvDW7cyb1eZERJQfCjd423t5W8PmLFgjIqJ8UbARy868DZ1z3kRElF8KNmLFrMwbmghBACRR6NsD\nIiIi6qKCDd6qlXnrugBFFiEIDN5ERJQfCjZ423t5G5oIRSrY00BERHmoYKOWXbCmaQJ8CivNiYgo\nfxRu8LaHzTWBmTcREeWVgo1acd0O3iIUpWBPAxER5aGCjVpxa85bUwXIYsGeBiIiykMFG7Xcw+ay\nzEpzIiLKH4UbvK2CNV0TITHzJiKiPFKwUcvpbW6IkCVm3kRElD8KN3jbvc11ETKrzYmIKI8UbNSy\nm7TAENkalYiI8krBBm9nP29dhMTMm4iI8kjBRq2Ys6uYBJmZNxER5ZGCDd5x97A5C9aIiCiPyNl8\n8RUrVuD999+HIAhYunQppk6d6nxv/fr1eOCBByCKIkaNGoXly5dDzOGSrYgaNb/QJBasERFRXsla\n1HrzzTexbds2PPXUU1i+fDmWL1/u+f4dd9yBhx56CE8++STa2trw6quvZutQUgqrEQCAocssWCMi\norySteC9bt06zJ07FwAwZswYNDU1obW11fn+6tWrMXjwYABAZWUlGhoasnUoKUXiZvCGJjPzJiKi\nvJK1qFVfX4+KigrndmVlJerq6pzbJSUlAIC9e/fi9ddfx/HHH5+tQ0kprEYhQLCqzZl5ExFR/sjq\nnLebYRgd7tu3bx+uvvpqLFu2zBPoU6moCEKWe2/f7XA8Ap/kRzsElJYEUFUV6rXXLjQ8dz3Hc9hz\nPIe9g+ex53JxDrMWvKurq1FfX+/c3rt3L6qqqpzbra2tuPLKK3HDDTfgmGOOyfh6DQ3tvXp8YTUC\nBQoAIBqNo66upVdfv1BUVYV47nqI57DneA57B89jz/X2OUx3IZC1YfM5c+Zg7dq1AIDNmzejurra\nGSoHgLvvvhuXXHIJjjvuuGwdQqci8QgU0QcALFgjIqK8krXMe8aMGZg8eTIWLFgAQRCwbNkyrF69\nGqFQCMcccwyee+45bNu2Dc8++ywA4IwzzsD555+frcPpIKxGUSGXAgAL1oiIKK9kdc77pptu8tye\nMGGC8/WmTZuy+aM7FddVqLoKQTffPoM3EVHfevnlv+OEE07u0mN/8pP7ce65C1BTU5vlo+q/CjJq\nRa0GLbv2xgBw2JyIqC/t2vUl/va3tV1+/PXXf7egAzeQw2rz/iSimcHb0MzqdS4VIyLqOw88cA8+\n/HAzHn30N9B1HV9+uRO7dn2JBx/8Be66607U1e1FOBzG5ZdfhTlzjsV1112F73znFvzzn39HW1sr\nvvhiG3bu3IFvf/u7mD17jvO6qqpi+fIfdHj+J598hPvvvweiKGDKlGm49trrU95n/5zRo8di1aqn\n0NjYiOnTD8eTT/4v2tvbcd11N+Ldd9/Gyy//HbquY/bsObj11u+ipaUFd955G9ra2lBSUoI77vgf\nXH75Rfj9759AMBjEhg3v4cknV2LFih93+5wVZPCOWsEb9rB5DtuyEhH1Z0//Ywv+89HeXn3NmROq\ncd5JY9N+/4ILFmL16qdx2WVX4pFHHoaqxvGLX/wWDQ37MWvWUTj11DOwc+cO3H77EsyZc6znuXv3\n7sF99z2E9ev/jT//eZUneLe0NKd8/oMP3oebb16KsWPH4Uc/ugO7d+9KeV86W7duwRNPrIbP58O7\n776NX/zitxBFEeeddxauvfabeOKJxzFr1myce+4CPPXUSrzzzls47rgT8dpr/8L8+afgtddewbx5\nX+nROS3I4G33NTc08+0z8yYi6j8mTpwMAAiFSvHhh5uxZs1qCIKI5uamDo+dOvUwAObyZHcXz86e\n/8UX2zB27DgAwO2335n2vnTGjh0Hn89crRQIBHDddVdBkiQ0NjaisbERn3zyEa644hoAwPnnXwQA\nqKmpxW9/+0vMn38K3n33bXzjG1cf+IlxKczgrSU2JQFYsEZEZDvvpLGdZsm5oChmD46//vUlNDc3\n4+c//y2am5txxRULOzxWkhLNu5KbgaV7fqpNsFLdJwiJxE5V1Q7Ht3v3Ljz11Er87ncrEQwGsXDh\nedZrSTAM3fNaY8eOw759+/Dhh5sxatQY+P3+zk9CBgUZtSL2piR25s2CNSKiPiOKIjRN63B/Y2Mj\nhgypgSiKeOWVfyAejx/Q66Z7/siRo7B5s7ni6a677sR///t5yvuKi4uxb5/ZbGzjxvdTvn5FRQWC\nwSA+/vgj7N69G/F4HBMnTsLbb/8HAPDcc6vw4ot/AQCcdNI8PPDAPZg375QDeh+pFGTwthlx88qH\nmTcRUd8ZMWIUPv74Izz00P2e+0844ST8+9+v4vrrr0FRURGqq6vx6KO/6fLrpnv+9dffhJ/97P/D\nNdd8A6FQKUaOHJXyvjPPPAf3338vbr75egwcWNXh9ceNG4+ioiCuueZy/P3v/4ezzjoHP/zhD3Hu\nuRdg06YNuO66q/Dvf7+G448/EQBw8snzsHfvXhx++MyenTAAgpGq6Xg/1Jvt5uJaHNf89hnojdWA\nIeKWC6ZjwojOe6tTamyn2HM8hz3Hc9g7eB57rrNz+Pzza7B79y584xvfPKDXS6Ug57wVSYHeMNi5\nzcybiIiy6Z57/gdffrkTd911X6+8XkEG72SsNiciomy69dbbevX1CjLl1HXvTAEL1oiIKJ8UZPCO\nxr1VjRw2JyKifFKQUSvWIXgz8yYiovxRkME7OfOW2B6ViIjySEFGrWjc2/mGmTcRUd96+eW/H/Bz\n3nvvHTQ07M/C0fR/hRm8Y0mZN+e8iYj6zIFuCWp7/vk1BRu8C3KpWMdhc2beRER9xb0l6PnnX4gV\nK36IlpYWaJqGG264GWPHjsP//u/v8cor/4Qoipgz51hMnDgJr776Mj7//DP8z//ci8GDzd4dfbEN\n6OWXX+VsAxqLReD3F2VlG1A3Bm+w2pyIyLZ6y1/w7t6Nvfqa06sPxTljz0j7ffeWoL///W9x5JFH\n4//9v6/i888/w09+ch8efPAXePLJ/8Vzz70ESZLw3HOrMHPmURg7djy+851bnMAN9M02oOeff6Gz\nDejixVfiZz/7VVa2AXVj8AabtBAR9RcbN25AY2MD1q59AQAQjZobSZ1wwsm44YbFmDfvFMyfn35j\nj77YBrS5uTkn24C6FWTwrgz54ZNF6IYBVTMgCgzeREQAcM7YMzrNkrNNUWTceOPNmDJlquf+m276\nHrZt+y/+8Y+/4lvf+iZ+/es/pHz+wbwNqOfYe+2V8sghwyvw1IrT8fBNJ+DXN5/Q14dDRFTQ3FuC\nTpo0Bf/618sAgM8//wxPPvm/aG1txaOP/gYjRozEZZddiVCoDO3tbSm3Ej2YtwH1nLNefbU8Iksi\nBEHgfDcRUR9zbwn69a+fj507t2Px4itwzz3/g8MOm4GSkhI0NjbgyisX4dvfvhqTJ09BaWkZDjts\nBm677VZ89tlW57X6YhvQ+++/x9kGdOHChVnbBtStILcEBbj1XW/heew5nsOe4znsHTyPPZd8Druz\nDWjy66VSkHPeRERE2dbb24C6MXgTERFlQW9vA+rGCV8iIqI8w+BNRESUZxi8iYiI8gyDNxERUZ5h\n8CYiIsozDN5ERER5hsGbiIgozzB4ExER5Zm8aY9KREREJmbeREREeYbBm4iIKM8weBMREeUZBm8i\nIqI8w+BNRESUZxi8iYiI8kxB7ue9YsUKvP/++xAEAUuXLsXUqVP7+pD6tU8++QSLFy/GpZdeiosv\nvhi7du3CLbfcAk3TUFVVhR//+Mfw+XxYs2YN/vCHP0AURZx33nk499xz+/rQ+417770Xb7/9NlRV\nxTe/+U0ceuihPIcHIBwOY8mSJdi3bx+i0SgWL16MCRMm8Bx2UyQSwRlnnIHFixdj9uzZPI8H4I03\n3sD111+PcePGAQDGjx+PK664Ivfn0Cgwb7zxhnHVVVcZhmEYW7ZsMc4777w+PqL+ra2tzbj44ouN\n2267zXj88ccNwzCMJUuWGC+88IJhGIZx//33GytXrjTa2tqM+fPnG83NzUY4HDZOP/10o6GhoS8P\nvd9Yt26dccUVVxiGYRj79+83jj/+eJ7DA/T8888bv/71rw3DMIwdO3YY8+fP5znsgQceeMA455xz\njFWrVvE8HqD169cb3/rWtzz39cU5LLhh83Xr1mHu3LkAgDFjxqCpqQmtra19fFT9l8/nw29+8xtU\nV1c7973xxhs4+eSTAQAnnngi1q1bh/fffx+HHnooQqEQAoEAZsyYgXfeeaevDrtfmTlzJn7yk58A\nAEpLSxEOh3kOD9Bpp52GK6+8EgCwa9cuDBo0iOewm7Zu3YotW7bghBNOAMD/z72hL85hwQXv+vp6\nVFRUOLcrKytRV1fXh0fUv8myjEAg4LkvHA7D5/MBAAYMGIC6ujrU19ejsrLSeQzPa4IkSQgGgwCA\nZ599FscddxzPYTctWLAAN910E5YuXcpz2E333HMPlixZ4tzmeTxwW7ZswdVXX40LLrgAr7/+ep+c\nw4Kc83Yz2B22R9KdP57Xjv72t7/h2Wefxe9+9zvMnz/fuZ/nsOuefPJJfPjhh7j55ps954fnsGue\ne+45HHbYYRg2bFjK7/M8ZjZy5Ehcd911OPXUU7F9+3YsWrQImqY538/VOSy44F1dXY36+nrn9t69\ne1FVVdWHR5R/gsEgIpEIAoEA9uzZg+rq6pTn9bDDDuvDo+xfXn31VfzqV7/Cb3/7W4RCIZ7DA7Rp\n0yYMGDAAQ4YMwcSJE6FpGoqLi3kOD9DLL7+M7du34+WXX8bu3bvh8/n4t3iABg0ahNNOOw0AMHz4\ncAwcOBAbN27M+TksuGHzOXPmYO3atQCAzZs3o7q6GiUlJX18VPnl6KOPds7h//3f/+HYY4/FtGnT\nsHHjRjQ3N6OtrQ3vvPMOjjjiiD4+0v6hpaUF9957Lx5++GGUl5cD4Dk8UG+99RZ+97vfATCnvtrb\n23kOu+HBBx/EqlWr8PTTT+Pcc8/F4sWLeR4P0Jo1a/DII48AAOrq6rBv3z6cc845OT+HBbmr2H33\n3Ye33noLgiBg2bJlmDBhQl8fUr+1adMm3HPPPdi5cydkWcagQYNw3333YcmSJYhGo6ipqcFdd90F\nRVHw0ksv4ZFHHoEgCLj44otx5pln9vXh9wtPPfUUfvrTn2LUqFHOfXfffTduu+02nsMuikQi+P73\nv49du3YhEonguuuuw5QpU3DrrbfyHHbTT3/6U9TW1uKYY47heTwAra2tuOmmm9Dc3Ix4PI7rrrsO\nEydOzPk5LMjgTURElM8KbticiIgo3zF4ExER5RkGbyIiojzD4E1ERJRnGLyJiIjyTME1aSHKN/fe\ney82btyIaDSKDz74ANOnTwcAfO1rX8NXv/rVLr3Gr3/9a4wfP97pZ53KwoUL8fvf/x6SJPXGYXvs\n2bMHn332GWbPnt3rr01UiLhUjChP7NixAxdeeCH+9a9/9fWhHLA1a9Zg69atuPHGG/v6UIgOCsy8\nifLYT3/6U+zYsQNffvklbr31VkQiEdx3333w+XyIRCJYtmwZJk+ejCVLluDwww/H7Nmzcc011+CY\nY47Bhg0b0NbWhocffhiDBg3CIYccgs2bN+OXv/wlGhsbsXv3bmzbtg1HHnkkbr/9dkSjUdx6663Y\nuXMnBg8eDEmSMGfOHM8exW1tbfjud7+L5uZmqKqKE088EWeccQYefPBBGIaB8vJyXHTRRbjzzjux\nbds2tLW14YwzzsDll1+O1atX469//SsEQcCePXswevRorFixAoqi9OEZJuqfOOdNlOd27NiBxx57\nDFOmTEFjYyN+8IMf4LHHHsOiRYvw8MMPd3j81q1bcc4552DlypWYOHEiXnzxxQ6P+eCDD/DQQw/h\n2WefxerVq9HU1IQ1a9ZAVVU888wzuOOOO/D66693eN6///1vqKqKP/7xj3jyyScRDAZRW1uLs88+\nG2eeeSYuu+wyPPbYY6iursbjjz+OZ555Bs8//zw++ugjAMDGjRv///bu2CW1MIzj+NcONQQRQi3W\nYnBsjDoSBFKNOVaEo0M4REO4HGyrKQin5ob+gDBaoiVyECEipakhWkKkQKFoiERPd5DOzYxLlysX\njvw+4+F5X97tx/PyHh7S6TSHh4eUy2VP3jKI/A/qvEU8bmJiAp/PB8DQ0BC7u7u8vb3x8vLC4OBg\nW73f78c0TQACgQBPT09tNZZlYRgGhmHg9/t5fn7m5uaG6elpAIaHh7Esq23d1NQUe3t7bGxsMDc3\nx8rKCj09rT3CxcUFDw8PXF5eAlCr1bi/v3fXf4xPnZyc5O7uzp2TLCK/KbxFPO7ztbJt22xvbzMz\nM8P5+bk7zOOzrw/Svnv28l2N4zgtQfw1lKE5y/j4+JhiscjZ2RnLy8scHR211PT19bG+vs7CwkLL\n90wmg+M4fzyXiDTp2lyki1QqFUzTpNFocHp6Sq1W69jeY2NjFItFAKrVKldXV201uVyObDaLZVnY\ntk1/fz/VahWfz0e9XgeaXf3HVb3jOOzs7Ljd//X1Na+vr7y/v1MoFBgfH+/Y+UW6iTpvkS6SSCSI\nx+MEAgFWV1exbZuDg4OO7L20tEQ2myUWizE6Oko4HG7r0IPBIKlUiv39fQzDIBKJMDIyQjgcJplM\n0tvby9raGre3t8RiMRqNBvPz8+6o1FAoxObmJqVSCdM0iUQiHTm7SLfRr2Ii8iOPj48UCgWi0SiO\n47C4uMjW1pb73/m/ymQy5PN50ul0R/YT6WbqvEXkRwYGBjg5OXHnE8/OznYsuEXk76jzFhER8Rg9\nWBMREfEYhbeIiIjHKLxFREQ8RuEtIiLiMQpvERERj1F4i4iIeMwvRph4T/csGFUAAAAASUVORK5C\nYII=\n",
+            "text/plain": [
+              "<matplotlib.figure.Figure at 0x7f72f867ef90>"
+            ]
+          },
+          "metadata": {
+            "tags": []
+          }
+        }
+      ]
+    },
+    {
+      "metadata": {
+        "id": "HNqUFL4deCsL",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "# 4. Case study: building an RNN\n"
+      ]
+    },
+    {
+      "metadata": {
+        "id": "YkC1k4HEQ7rw",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "In this exercise we build and train a model similar to the RNNColorbot model that was used in the main Eager notebook. The model is adapted for converting and training in graph mode."
+      ]
+    },
+    {
+      "metadata": {
+        "id": "7nkPDl5CTCNb",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "To get started, we load the colorbot dataset. The code is identical to that used in the other exercise and its details are unimportant."
+      ]
+    },
+    {
+      "metadata": {
+        "id": "A0uREmVXCQEw",
+        "colab_type": "code",
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          }
+        }
+      },
+      "cell_type": "code",
+      "source": [
+        "def parse(line):\n",
+        "  \"\"\"Parses a line from the colors dataset.\n",
+        "  \n",
+        "  Args:\n",
+        "    line: A comma-separated string containing four items:\n",
+        "        color_name, red, green, and blue, representing the name and\n",
+        "        respectively the RGB value of the color, as an integer\n",
+        "        between 0 and 255.\n",
+        "\n",
+        "  Returns:\n",
+        "    A tuple of three tensors (rgb, chars, length), of shapes: (batch_size, 3),\n",
+        "    (batch_size, max_sequence_length, 256) and respectively (batch_size).\n",
+        "  \"\"\"\n",
+        "  items = tf.string_split([line], \",\").values\n",
+        "  rgb = tf.string_to_number(items[1:], out_type=tf.float32) / 255.0\n",
+        "  color_name = items[0]\n",
+        "  chars = tf.one_hot(tf.decode_raw(color_name, tf.uint8), depth=256)\n",
+        "  length = tf.cast(tf.shape(chars)[0], dtype=tf.int64)\n",
+        "  return rgb, chars, length\n",
+        "\n",
+        "\n",
+        "def maybe_download(filename, work_directory, source_url):\n",
+        "  \"\"\"Downloads the data from source url.\"\"\"\n",
+        "  if not tf.gfile.Exists(work_directory):\n",
+        "    tf.gfile.MakeDirs(work_directory)\n",
+        "  filepath = os.path.join(work_directory, filename)\n",
+        "  if not tf.gfile.Exists(filepath):\n",
+        "    temp_file_name, _ = six.moves.urllib.request.urlretrieve(source_url)\n",
+        "    tf.gfile.Copy(temp_file_name, filepath)\n",
+        "    with tf.gfile.GFile(filepath) as f:\n",
+        "      size = f.size()\n",
+        "    print('Successfully downloaded', filename, size, 'bytes.')\n",
+        "  return filepath\n",
+        "\n",
+        "\n",
+        "def load_dataset(data_dir, url, batch_size, training=True):\n",
+        "  \"\"\"Loads the colors data at path into a tf.PaddedDataset.\"\"\"\n",
+        "  path = maybe_download(os.path.basename(url), data_dir, url)\n",
+        "  dataset = tf.data.TextLineDataset(path)\n",
+        "  dataset = dataset.skip(1)\n",
+        "  dataset = dataset.map(parse)\n",
+        "  dataset = dataset.cache()\n",
+        "  dataset = dataset.repeat()\n",
+        "  if training:\n",
+        "    dataset = dataset.shuffle(buffer_size=3000)\n",
+        "  dataset = dataset.padded_batch(batch_size, padded_shapes=([None], [None, None], []))\n",
+        "  return dataset\n",
+        "\n",
+        "\n",
+        "train_url = \"https://raw.githubusercontent.com/random-forests/tensorflow-workshop/master/extras/colorbot/data/train.csv\"\n",
+        "test_url = \"https://raw.githubusercontent.com/random-forests/tensorflow-workshop/master/extras/colorbot/data/test.csv\"\n",
+        "data_dir = \"tmp/rnn/data\""
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "metadata": {
+        "id": "waZ89t3DTUla",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "Next, we set up the RNNColobot model, which is very similar to the one we used in the main exercise.\n",
+        "\n",
+        "Autograph doesn't fully support classes yet (but it will soon!), so we'll write the model using simple functions."
+      ]
+    },
+    {
+      "metadata": {
+        "id": "9v8AJouiC44V",
+        "colab_type": "code",
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          }
+        }
+      },
+      "cell_type": "code",
+      "source": [
+        "def model_components():\n",
+        "  lower_cell = tf.contrib.rnn.LSTMBlockCell(256)\n",
+        "  lower_cell.build(tf.TensorShape((None, 256)))\n",
+        "  upper_cell = tf.contrib.rnn.LSTMBlockCell(128)\n",
+        "  upper_cell.build(tf.TensorShape((None, 256)))\n",
+        "  relu_layer = tf.layers.Dense(3, activation=tf.nn.relu)\n",
+        "  relu_layer.build(tf.TensorShape((None, 128)))\n",
+        "  return lower_cell, upper_cell, relu_layer\n",
+        "\n",
+        "\n",
+        "def rnn_layer(chars, cell, batch_size, training):\n",
+        "  \"\"\"A simple RNN layer.\n",
+        "  \n",
+        "  Args:\n",
+        "    chars: A Tensor of shape (max_sequence_length, batch_size, input_size)\n",
+        "    cell: An object of type tf.contrib.rnn.LSTMBlockCell\n",
+        "    batch_size: Int, the batch size to use\n",
+        "    training: Boolean, whether the layer is used for training\n",
+        "\n",
+        "  Returns:\n",
+        "    A Tensor of shape (max_sequence_length, batch_size, output_size).\n",
+        "  \"\"\"\n",
+        "  hidden_outputs = []\n",
+        "  autograph.utils.set_element_type(hidden_outputs, tf.float32)\n",
+        "  state, output = cell.zero_state(batch_size, tf.float32)\n",
+        "  n = tf.shape(chars)[0]\n",
+        "  i = 0\n",
+        "  while i < n:\n",
+        "    ch = chars[i]\n",
+        "    cell_output, (state, output) = cell.call(ch, (state, output))\n",
+        "    hidden_outputs.append(cell_output)\n",
+        "    i += 1\n",
+        "  hidden_outputs = hidden_outputs.stack()\n",
+        "  if training:\n",
+        "    hidden_outputs = tf.nn.dropout(hidden_outputs, 0.5)\n",
+        "  return hidden_outputs\n",
+        "\n",
+        "\n",
+        "def model(inputs, lower_cell, upper_cell, relu_layer, batch_size, training):\n",
+        "  \"\"\"RNNColorbot model.\n",
+        "  \n",
+        "  The model consists of two RNN layers (made by lower_cell and upper_cell),\n",
+        "  followed by a fully connected layer with ReLU activation.\n",
+        "  \n",
+        "  Args:\n",
+        "    inputs: A tuple (chars, length)\n",
+        "    lower_cell: An object of type tf.contrib.rnn.LSTMBlockCell\n",
+        "    upper_cell: An object of type tf.contrib.rnn.LSTMBlockCell\n",
+        "    relu_layer: An object of type tf.layers.Dense\n",
+        "    batch_size: Int, the batch size to use\n",
+        "    training: Boolean, whether the layer is used for training\n",
+        "    \n",
+        "  Returns:\n",
+        "    A Tensor of shape (batch_size, 3) - the model predictions.\n",
+        "  \"\"\"\n",
+        "  (chars, length) = inputs\n",
+        "  chars_time_major = tf.transpose(chars, [1, 0, 2])\n",
+        "  chars_time_major.set_shape((None, batch_size, 256))\n",
+        "\n",
+        "  hidden_outputs = rnn_layer(chars_time_major, lower_cell, batch_size, training)\n",
+        "  final_outputs = rnn_layer(hidden_outputs, upper_cell, batch_size, training)\n",
+        "\n",
+        "  # Grab just the end-of-sequence from each output.\n",
+        "  indices = tf.stack([length - 1, range(batch_size)], axis=1)\n",
+        "  sequence_ends = tf.gather_nd(final_outputs, indices)\n",
+        "  return relu_layer(sequence_ends)\n",
+        "\n",
+        "def loss_fn(labels, predictions):\n",
+        "  return tf.reduce_mean((predictions - labels) ** 2)"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "metadata": {
+        "id": "JjK4gXFvFsf4",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "The train and test functions are also similar to the ones used in the Eager notebook. Since the network requires a fixed batch size, we'll train in a single shot, rather than by epoch."
+      ]
+    },
+    {
+      "metadata": {
+        "id": "ZWQMExk0S6X6",
+        "colab_type": "code",
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          }
+        }
+      },
+      "cell_type": "code",
+      "source": [
+        "def train(optimizer, train_data, lower_cell, upper_cell, relu_layer, batch_size, num_steps):\n",
+        "  iterator = train_data.make_one_shot_iterator()\n",
+        "  step = 0\n",
+        "  while step < num_steps:\n",
+        "    labels, chars, sequence_length = iterator.get_next()\n",
+        "    predictions = model((chars, sequence_length), lower_cell, upper_cell, relu_layer, batch_size, training=True)\n",
+        "    loss = loss_fn(labels, predictions)\n",
+        "    optimizer.minimize(loss)\n",
+        "    if step % (num_steps // 10) == 0:\n",
+        "      print('Step', step, 'train loss', loss)\n",
+        "    step += 1\n",
+        "  return step\n",
+        "\n",
+        "\n",
+        "def test(eval_data, lower_cell, upper_cell, relu_layer, batch_size, num_steps):\n",
+        "  total_loss = 0.0\n",
+        "  iterator = eval_data.make_one_shot_iterator()\n",
+        "  step = 0\n",
+        "  while step < num_steps:\n",
+        "    labels, chars, sequence_length = iterator.get_next()\n",
+        "    predictions = model((chars, sequence_length), lower_cell, upper_cell, relu_layer, batch_size, training=False)\n",
+        "    total_loss += loss_fn(labels, predictions)\n",
+        "    step += 1\n",
+        "  print('Test loss', total_loss)\n",
+        "  return total_loss\n",
+        "\n",
+        "\n",
+        "def train_model(train_data, eval_data, batch_size, lower_cell, upper_cell, relu_layer, train_steps):\n",
+        "  optimizer = tf.train.AdamOptimizer(learning_rate=0.01)\n",
+        "\n",
+        "  train(optimizer, train_data, lower_cell, upper_cell, relu_layer, batch_size, num_steps=tf.constant(train_steps))\n",
+        "  test(eval_data, lower_cell, upper_cell, relu_layer, 50, num_steps=tf.constant(2))\n",
+        "\n",
+        "  print('Colorbot is ready to generate colors!\\n\\n')\n",
+        "  \n",
+        "  # In graph mode, every op needs to be a dependent of another op.\n",
+        "  # Here, we create a no_op that will drive the execution of all other code in\n",
+        "  # this function. Autograph will add the necessary control dependencies.\n",
+        "  return tf.no_op()"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "metadata": {
+        "id": "iopcs5hXG2od",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "Finally, we add code to run inference on a single input, which we'll read from the input.\n",
+        "\n",
+        "Note the `do_not_convert` annotation that lets us disable conversion for certain functions and run them as a `py_func` instead, so you can still call them from compiled code."
+      ]
+    },
+    {
+      "metadata": {
+        "id": "DyU0wnnAFEYj",
+        "colab_type": "code",
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          }
+        }
+      },
+      "cell_type": "code",
+      "source": [
+        "@autograph.do_not_convert(run_as=autograph.RunMode.PY_FUNC)\n",
+        "def draw_prediction(color_name, pred):\n",
+        "  pred = pred * 255\n",
+        "  pred = pred.astype(np.uint8)\n",
+        "  plt.axis('off')\n",
+        "  plt.imshow(pred)\n",
+        "  plt.title(color_name)\n",
+        "  plt.show()\n",
+        "\n",
+        "\n",
+        "def inference(color_name, lower_cell, upper_cell, relu_layer):\n",
+        "  _, chars, sequence_length = parse(color_name)\n",
+        "  chars = tf.expand_dims(chars, 0)\n",
+        "  sequence_length = tf.expand_dims(sequence_length, 0)\n",
+        "  pred = model((chars, sequence_length), lower_cell, upper_cell, relu_layer, 1, training=False)\n",
+        "  pred = tf.minimum(pred, 1.0)\n",
+        "  pred = tf.expand_dims(pred, 0)\n",
+        "  draw_prediction(color_name, pred)\n",
+        "  # Create an op that will drive the entire function.\n",
+        "  return tf.no_op()"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "metadata": {
+        "id": "Nt0Kv5OCHip0",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "Finally, we put everything together.\n",
+        "\n",
+        "Note that the entire training and testing code is all compiled into a single op (`tf_train_model`) that you only execute once! We also still use a `sess.run` loop for the inference part, because that requires keyboard input."
+      ]
+    },
+    {
+      "metadata": {
+        "id": "-GmWa0GtYWdh",
+        "colab_type": "code",
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          },
+          "output_extras": [
+            {},
+            {},
+            {},
+            {},
+            {},
+            {},
+            {},
+            {},
+            {},
+            {},
+            {},
+            {},
+            {},
+            {},
+            {},
+            {},
+            {},
+            {},
+            {},
+            {},
+            {},
+            {},
+            {}
+          ],
+          "base_uri": "https://localhost:8080/",
+          "height": 668
+        },
+        "outputId": "61f4af1d-c81e-44db-9079-1a7b8ed8ce58",
+        "executionInfo": {
+          "status": "ok",
+          "timestamp": 1522345877153,
+          "user_tz": 240,
+          "elapsed": 75500,
+          "user": {
+            "displayName": "Dan Moldovan",
+            "photoUrl": "//lh5.googleusercontent.com/-Rneh8xjecyk/AAAAAAAAAAI/AAAAAAAACB4/c5vwsJpbktY/s50-c-k-no/photo.jpg",
+            "userId": "112023154726779574577"
+          }
+        }
+      },
+      "cell_type": "code",
+      "source": [
+        "def run_input_loop(sess, inference_ops, color_name_placeholder):\n",
+        "  \"\"\"Helper function that reads from input and calls the inference ops in a loop.\"\"\"\n",
+        "\n",
+        "  tb = widgets.TabBar([\"RNN Colorbot\"])\n",
+        "  while True:\n",
+        "    with tb.output_to(0):\n",
+        "      try:\n",
+        "        color_name = six.moves.input(\"Give me a color name (or press 'enter' to exit): \")\n",
+        "      except (EOFError, KeyboardInterrupt):\n",
+        "        break\n",
+        "    if not color_name:\n",
+        "      break\n",
+        "    with tb.output_to(0):\n",
+        "      tb.clear_tab()\n",
+        "      sess.run(inference_ops, {color_name_placeholder: color_name})\n",
+        "      plt.show()\n",
+        "\n",
+        "with tf.Graph().as_default():\n",
+        "  # Read the data.\n",
+        "  batch_size = 64\n",
+        "  train_data = load_dataset(data_dir, train_url, batch_size)\n",
+        "  eval_data = load_dataset(data_dir, test_url, 50, training=False)\n",
+        "  \n",
+        "  # Create the model components.\n",
+        "  lower_cell, upper_cell, relu_layer = model_components()\n",
+        "  # Create the helper placeholder for inference.\n",
+        "  color_name_placeholder = tf.placeholder(tf.string, shape=())\n",
+        "  \n",
+        "  # Compile the train / test code.\n",
+        "  tf_train_model = autograph.to_graph(train_model)\n",
+        "  train_model_ops = tf_train_model(\n",
+        "      train_data, eval_data, batch_size, lower_cell, upper_cell, relu_layer, train_steps=100)\n",
+        "  \n",
+        "  # Compile the inference code.\n",
+        "  tf_inference = autograph.to_graph(inference)\n",
+        "  inference_ops = tf_inference(color_name_placeholder, lower_cell, upper_cell, relu_layer)\n",
+        "  \n",
+        "  with tf.Session() as sess:\n",
+        "    sess.run(tf.global_variables_initializer())\n",
+        "    \n",
+        "    # Run training and testing.\n",
+        "    sess.run(train_model_ops)\n",
+        "     \n",
+        "    # Run the inference loop.\n",
+        "    run_input_loop(sess, inference_ops, color_name_placeholder)"
+      ],
+      "execution_count": 0,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "text": [
+            "('Successfully downloaded', 'train.csv', 28010L, 'bytes.')\n",
+            "('Successfully downloaded', 'test.csv', 2414L, 'bytes.')\n",
+            "Step 0 train loss 0.37890616\n",
+            "Step 10 train loss 0.18515904\n",
+            "Step 20 train loss 0.0892782\n",
+            "Step 30 train loss 0.07883155\n",
+            "Step 40 train loss 0.08585831\n",
+            "Step 50 train loss 0.09302989\n",
+            "Step 60 train loss 0.089012615\n",
+            "Step 70 train loss 0.07275697\n",
+            "Step 80 train loss 0.06644974\n",
+            "Step 90 train loss 0.0854013\n",
+            "Test loss 0.13216865Colorbot is ready to generate colors!\n",
+            "\n",
+            "\n",
+            "\n"
+          ],
+          "name": "stdout"
+        },
+        {
+          "output_type": "display_data",
+          "data": {
+            "text/plain": [
+              "<IPython.core.display.HTML object>"
+            ],
+            "text/html": [
+              "<link rel=stylesheet type=text/css href='/nbextensions/google.colab/tabbar.css'></link>"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "outputarea_id1"
+            ]
+          }
+        },
+        {
+          "output_type": "display_data",
+          "data": {
+            "text/plain": [
+              "<IPython.core.display.HTML object>"
+            ],
+            "text/html": [
+              "<script src='/nbextensions/google.colab/tabbar_main.min.js'></script>"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "outputarea_id1"
+            ]
+          }
+        },
+        {
+          "output_type": "display_data",
+          "data": {
+            "text/plain": [
+              "<IPython.core.display.HTML object>"
+            ],
+            "text/html": [
+              "<div id=\"id1\"></div>"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "outputarea_id1"
+            ]
+          }
+        },
+        {
+          "output_type": "display_data",
+          "data": {
+            "application/javascript": [
+              "window[\"b102d936-3379-11e8-ac70-0242ac110002\"] = colab_lib.createTabBar({\"contentBorder\": [\"0px\"], \"borderColor\": [\"#a7a7a7\"], \"tabNames\": [\"RNN Colorbot\"], \"initialSelection\": 0, \"location\": \"top\", \"contentHeight\": [\"initial\"], \"elementId\": \"id1\"});\n",
+              "//# sourceURL=js_e223a56194"
+            ],
+            "text/plain": [
+              "<IPython.core.display.Javascript object>"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "outputarea_id1"
+            ]
+          }
+        },
+        {
+          "output_type": "display_data",
+          "data": {
+            "application/javascript": [
+              "window[\"b103532a-3379-11e8-ac70-0242ac110002\"] = window[\"id1\"].setSelectedTabIndex(0);\n",
+              "//# sourceURL=js_b8c6a821fb"
+            ],
+            "text/plain": [
+              "<IPython.core.display.Javascript object>"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "outputarea_id1"
+            ]
+          }
+        },
+        {
+          "output_type": "display_data",
+          "data": {
+            "application/javascript": [
+              "window[\"b105b28c-3379-11e8-ac70-0242ac110002\"] = google.colab.output.getActiveOutputArea();\n",
+              "//# sourceURL=js_44805e254b"
+            ],
+            "text/plain": [
+              "<IPython.core.display.Javascript object>"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          }
+        },
+        {
+          "output_type": "display_data",
+          "data": {
+            "application/javascript": [
+              "window[\"b106197a-3379-11e8-ac70-0242ac110002\"] = document.querySelector(\"#id1_content_0\");\n",
+              "//# sourceURL=js_a63d3c6c47"
+            ],
+            "text/plain": [
+              "<IPython.core.display.Javascript object>"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          }
+        },
+        {
+          "output_type": "display_data",
+          "data": {
+            "application/javascript": [
+              "window[\"b1069f44-3379-11e8-ac70-0242ac110002\"] = google.colab.output.setActiveOutputArea(window[\"b106197a-3379-11e8-ac70-0242ac110002\"]);\n",
+              "//# sourceURL=js_7e203b8bce"
+            ],
+            "text/plain": [
+              "<IPython.core.display.Javascript object>"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          }
+        },
+        {
+          "output_type": "display_data",
+          "data": {
+            "application/javascript": [
+              "window[\"b1070f38-3379-11e8-ac70-0242ac110002\"] = window[\"id1\"].setSelectedTabIndex(0);\n",
+              "//# sourceURL=js_d53293d4a7"
+            ],
+            "text/plain": [
+              "<IPython.core.display.Javascript object>"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          }
+        },
+        {
+          "output_type": "display_data",
+          "data": {
+            "application/javascript": [
+              "window[\"c6d90d5c-3379-11e8-ac70-0242ac110002\"] = google.colab.output.setActiveOutputArea(window[\"b105b28c-3379-11e8-ac70-0242ac110002\"]);\n",
+              "//# sourceURL=js_3000dc2c05"
+            ],
+            "text/plain": [
+              "<IPython.core.display.Javascript object>"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          }
+        },
+        {
+          "output_type": "display_data",
+          "data": {
+            "application/javascript": [
+              "window[\"c6da872c-3379-11e8-ac70-0242ac110002\"] = google.colab.output.getActiveOutputArea();\n",
+              "//# sourceURL=js_4136f669a3"
+            ],
+            "text/plain": [
+              "<IPython.core.display.Javascript object>"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          }
+        },
+        {
+          "output_type": "display_data",
+          "data": {
+            "application/javascript": [
+              "window[\"c6dac868-3379-11e8-ac70-0242ac110002\"] = document.querySelector(\"#id1_content_0\");\n",
+              "//# sourceURL=js_2f70dd9aee"
+            ],
+            "text/plain": [
+              "<IPython.core.display.Javascript object>"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          }
+        },
+        {
+          "output_type": "display_data",
+          "data": {
+            "application/javascript": [
+              "window[\"c6db07d8-3379-11e8-ac70-0242ac110002\"] = google.colab.output.setActiveOutputArea(window[\"c6dac868-3379-11e8-ac70-0242ac110002\"]);\n",
+              "//# sourceURL=js_7226726048"
+            ],
+            "text/plain": [
+              "<IPython.core.display.Javascript object>"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          }
+        },
+        {
+          "output_type": "display_data",
+          "data": {
+            "application/javascript": [
+              "window[\"c6dcc6fe-3379-11e8-ac70-0242ac110002\"] = window[\"id1\"].setSelectedTabIndex(0);\n",
+              "//# sourceURL=js_72e7709865"
+            ],
+            "text/plain": [
+              "<IPython.core.display.Javascript object>"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          }
+        },
+        {
+          "output_type": "display_data",
+          "data": {
+            "image/png": "iVBORw0KGgoAAAANSUhEUgAAAVQAAAFZCAYAAADHDNdrAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4yLCBo\ndHRwOi8vbWF0cGxvdGxpYi5vcmcvNQv5yAAAB9JJREFUeJzt3E1Lle0ax+HTF4jeEAyMBhE0DawI\nwsCH0AIlaGBWNJBo0CDoA0TQhmDXuKAGDioiCA2KlEAlnl05FD9Co8BeaGCQoBDa2jPZsXt4Bvu/\n0+o4Rmvd1zW4rsmP84bFamo0Go0C4H/WvNYHAPhVCCpAiKAChAgqQIigAoQIKkCIoAKECCpAiKDy\nUxgeHq5Dhw7V4OBgPXz4sHp7e+vWrVt15cqVOnnyZN2/f78ajUbdvn27+vr6qqenp65du1YrKytV\nVfXhw4e6cOFC9fX1VV9fX01PT1dV1dzcXHV3d9eDBw/q+PHj9ccff9TExMRaXpWfWOtaHwD+zuvX\nr+vOnTs1MTFRbW1tdf78+dW16enpGh8fr/b29hobG6upqal6/Phxbdy4sS5evFgjIyM1NDRUly5d\nqv3799fw8HC9efOmTp8+XVNTU1VV9enTp2pubq5nz57V5ORk3bhxo44dO7ZW1+UnZkJl3Zudna2D\nBw9WR0dHbdiwoQYHB1fX9u7dW+3t7VVV9fLlyxocHKytW7dWa2trnTp1qp4/f16Li4s1MzNT586d\nq6qqXbt21YEDB1an1OXl5Tpx4kRVVe3Zs6fevXv3Yy/IL8OEyrr3+fPnamtrW/2+ffv21c//+Xxh\nYaHu3r1bjx49qqqqlZWVam9vr4WFhWo0GnXmzJnVvYuLi9XV1VVVVS0tLbVp06aqqmpubq6vX7/+\nX+/Dr0tQWfe2bNlSi4uLq98/fvz43X0dHR3V29tbQ0ND3zxfXl6ulpaWevLkSW3evPmbtbm5ufyB\n+W155Wfd6+zsrJmZmZqfn68vX77U2NjYd/cdOXKkxsfHa2lpqaqqRkdH6+nTp9Xa2lqHDx+u0dHR\nqqpaWlqqy5cv1/v373/YHfg9CCrrXmdnZw0MDNTAwECdPXu2enp6vrvv6NGj1dPTUwMDA9Xf318v\nXryo7u7uqqq6evVqzc7OVn9/fw0MDNTOnTtrx44dP/Ia/Aaa/B8qP4NGo1FNTU1VVfXq1au6efPm\nX06qsFZMqKx78/Pz1dXVVW/fvq1Go1GTk5O1b9++tT4W/BcTKj+FkZGRunfvXjU1NdXu3bvr+vXr\ntW3btrU+FnxDUAFCvPIDhAgqQMi6+WH/kX8eXesjAPytf/3jz79cM6EChAgqQIigAoQIKkCIoAKE\nCCpAiKAChAgqQIigAoQIKkCIoAKECCpAiKAChAgqQIigAoQIKkCIoAKECCpAiKAChAgqQIigAoQI\nKkCIoAKECCpAiKAChAgqQIigAoQIKkCIoAKECCpAiKAChAgqQIigAoQIKkCIoAKECCpAiKAChAgq\nQIigAoQIKkCIoAKECCpAiKAChAgqQIigAoQIKkCIoAKECCpAiKAChAgqQIigAoQIKkCIoAKECCpA\niKAChAgqQIigAoQIKkCIoAKECCpAiKAChAgqQIigAoQIKkCIoAKECCpAiKAChAgqQIigAoQIKkCI\noAKECCpAiKAChAgqQIigAoQIKkCIoAKECCpAiKAChAgqQIigAoQIKkCIoAKECCpAiKAChAgqQIig\nAoQIKkCIoAKECCpAiKAChAgqQIigAoQIKkCIoAKECCpAiKAChAgqQIigAoQIKkCIoAKECCpAiKAC\nhAgqQIigAoQIKkCIoAKECCpAiKAChAgqQIigAoQIKkCIoAKECCpAiKAChAgqQIigAoQIKkCIoAKE\nCCpAiKAChAgqQIigAoQIKkCIoAKECCpAiKAChAgqQIigAoQIKkCIoAKECCpAiKAChAgqQIigAoQI\nKkCIoAKECCpAiKAChAgqQIigAoQIKkCIoAKECCpAiKAChAgqQIigAoQIKkCIoAKECCpAiKAChAgq\nQIigAoQIKkCIoAKECCpAiKAChAgqQIigAoQIKkCIoAKECCpAiKAChAgqQIigAoQIKkCIoAKECCpA\niKAChAgqQIigAoQIKkCIoAKECCpAiKAChAgqQIigAoQIKkCIoAKECCpAiKAChAgqQIigAoQIKkCI\noAKECCpAiKAChAgqQIigAoQIKkCIoAKECCpAiKAChAgqQIigAoQIKkCIoAKECCpAiKAChAgqQIig\nAoQIKkCIoAKECCpAiKAChAgqQIigAoQIKkCIoAKECCpAiKAChAgqQIigAoQIKkCIoAKECCpAiKAC\nhAgqQIigAoQIKkCIoAKECCpAiKAChAgqQIigAoQIKkCIoAKECCpAiKAChAgqQIigAoQIKkCIoAKE\nCCpAiKAChAgqQIigAoQIKkCIoAKECCpAiKAChAgqQIigAoQIKkCIoAKECCpAiKAChAgqQIigAoQI\nKkCIoAKECCpAiKAChAgqQIigAoQIKkCIoAKECCpAiKAChAgqQIigAoQIKkCIoAKECCpAiKAChAgq\nQIigAoQIKkCIoAKECCpAiKAChAgqQIigAoQIKkCIoAKECCpAiKAChAgqQIigAoQIKkCIoAKECCpA\niKAChAgqQIigAoQIKkCIoAKECCpAiKAChAgqQIigAoQIKkCIoAKECCpAiKAChAgqQIigAoQIKkBI\nU6PRaKz1IQB+BSZUgBBBBQgRVIAQQQUIEVSAEEEFCBFUgBBBBQgRVIAQQQUIEVSAEEEFCBFUgBBB\nBQgRVIAQQQUIEVSAEEEFCBFUgBBBBQgRVIAQQQUIEVSAkH8D1Aj8lNhhe7QAAAAASUVORK5CYII=\n",
+            "text/plain": [
+              "<matplotlib.figure.Figure at 0x7f72f402e850>"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1",
+              "user_output"
+            ]
+          }
+        },
+        {
+          "output_type": "display_data",
+          "data": {
+            "application/javascript": [
+              "window[\"c70592aa-3379-11e8-ac70-0242ac110002\"] = google.colab.output.setActiveOutputArea(window[\"c6da872c-3379-11e8-ac70-0242ac110002\"]);\n",
+              "//# sourceURL=js_25c3aaf79a"
+            ],
+            "text/plain": [
+              "<IPython.core.display.Javascript object>"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          }
+        },
+        {
+          "output_type": "display_data",
+          "data": {
+            "application/javascript": [
+              "window[\"c70842c0-3379-11e8-ac70-0242ac110002\"] = google.colab.output.getActiveOutputArea();\n",
+              "//# sourceURL=js_984c56b816"
+            ],
+            "text/plain": [
+              "<IPython.core.display.Javascript object>"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          }
+        },
+        {
+          "output_type": "display_data",
+          "data": {
+            "application/javascript": [
+              "window[\"c708dec4-3379-11e8-ac70-0242ac110002\"] = document.querySelector(\"#id1_content_0\");\n",
+              "//# sourceURL=js_e0451a1217"
+            ],
+            "text/plain": [
+              "<IPython.core.display.Javascript object>"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          }
+        },
+        {
+          "output_type": "display_data",
+          "data": {
+            "application/javascript": [
+              "window[\"c7092726-3379-11e8-ac70-0242ac110002\"] = google.colab.output.setActiveOutputArea(window[\"c708dec4-3379-11e8-ac70-0242ac110002\"]);\n",
+              "//# sourceURL=js_7aa23d7385"
+            ],
+            "text/plain": [
+              "<IPython.core.display.Javascript object>"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          }
+        },
+        {
+          "output_type": "display_data",
+          "data": {
+            "application/javascript": [
+              "window[\"c7099044-3379-11e8-ac70-0242ac110002\"] = window[\"id1\"].setSelectedTabIndex(0);\n",
+              "//# sourceURL=js_5722756ddb"
+            ],
+            "text/plain": [
+              "<IPython.core.display.Javascript object>"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          }
+        },
+        {
+          "output_type": "stream",
+          "text": [
+            "Give me a color name (or press 'enter' to exit): \n"
+          ],
+          "name": "stdout"
+        },
+        {
+          "output_type": "display_data",
+          "data": {
+            "application/javascript": [
+              "window[\"c7baac12-3379-11e8-ac70-0242ac110002\"] = google.colab.output.setActiveOutputArea(window[\"c70842c0-3379-11e8-ac70-0242ac110002\"]);\n",
+              "//# sourceURL=js_cdd622e58f"
+            ],
+            "text/plain": [
+              "<IPython.core.display.Javascript object>"
+            ]
+          },
+          "metadata": {
+            "tags": [
+              "id1_content_0",
+              "outputarea_id1"
+            ]
+          }
+        }
+      ]
+    },
+    {
+      "metadata": {
+        "id": "AHJ2c47U-A5W",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "# Where do we go next?\n",
+        "\n",
+        "Autograph is available in tensorflow.contrib, but it's still in its early stages. We're excited about the possibilities it brings — write your machine learning code in the flexible Eager style, but still enjoy all the benefits that come with running in graph mode. A beta version will be available soon -- stay tuned!"
+      ]
+    }
+  ]
+}
diff --git a/tensorflow/contrib/autograph/impl/BUILD b/tensorflow/contrib/autograph/impl/BUILD
index e468176da1724d8a7ce62647dc3c4b656c71affb..54424e26472b8466b8fe68ea848b5463c10224c9 100644
--- a/tensorflow/contrib/autograph/impl/BUILD
+++ b/tensorflow/contrib/autograph/impl/BUILD
@@ -26,6 +26,7 @@ py_library(
     visibility = ["//tensorflow:__subpackages__"],
     deps = [
         "//tensorflow/contrib/autograph/converters",
+        "//tensorflow/contrib/autograph/operators",
         "//tensorflow/contrib/autograph/pyct",
         "//tensorflow/contrib/autograph/pyct/static_analysis",
         "//tensorflow/contrib/autograph/utils",
@@ -38,6 +39,7 @@ py_test(
     name = "api_test",
     srcs = ["api_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["no_windows"],
     deps = [
         ":impl",
         "//tensorflow/contrib/autograph/utils",
@@ -50,6 +52,7 @@ py_test(
     name = "conversion_test",
     srcs = ["conversion_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["no_windows"],
     deps = [
         ":impl",
         "//tensorflow/python:client_testlib",
diff --git a/tensorflow/contrib/autograph/impl/api.py b/tensorflow/contrib/autograph/impl/api.py
index 1c4fcaa62228232e8dddf9b6c0e845e13fa3ae8b..dce994e50df60d8bd419f62207d77035beac9f5a 100644
--- a/tensorflow/contrib/autograph/impl/api.py
+++ b/tensorflow/contrib/autograph/impl/api.py
@@ -247,7 +247,10 @@ def to_graph(e,
   # The compiled code should see everything the entry function saw.
   # TODO(mdan): This might not work well if the call tree spans modules?
   if tf_inspect.isfunction(e):
-    compiled_node.__dict__.update(inspect_utils.getnamespace(e))
+    for key, val in inspect_utils.getnamespace(e).items():
+      # Avoid overwriting entities that have been transformed.
+      if key not in compiled_node.__dict__:
+        compiled_node.__dict__[key] = val
   compiled_fn = getattr(compiled_node, name)
 
   if verbose:
diff --git a/tensorflow/contrib/autograph/impl/api_test.py b/tensorflow/contrib/autograph/impl/api_test.py
index ee2d301d7562ef5ba6bc7ca6d013b99dec78d4c3..f9db07778a33498f699923a9e0a193c843bfefd8 100644
--- a/tensorflow/contrib/autograph/impl/api_test.py
+++ b/tensorflow/contrib/autograph/impl/api_test.py
@@ -37,8 +37,12 @@ class ApiTest(test.TestCase):
   def setUp(self):
     config.COMPILED_IMPORT_STATEMENTS = (
         'from __future__ import print_function',
-        'from tensorflow.contrib.autograph import utils as '
-        'autograph_utils', 'tf = autograph_utils.fake_tf()')
+        'from tensorflow.contrib.autograph import utils'
+        ' as autograph_utils',
+        'from tensorflow.contrib.autograph import operators'
+        ' as __ops',
+        'tf = autograph_utils.fake_tf()',
+    )
 
   def test_decorator_recurses(self):
 
@@ -197,8 +201,7 @@ class ApiTest(test.TestCase):
 
     compiled_code = api.to_code(test_fn)
 
-    # Just check for some key words and that it is parseable Python code.
-    self.assertRegexpMatches(compiled_code, 'autograph_utils\\.run_while')
+    # Just check that it is parseable Python code.
     self.assertIsNotNone(parser.parse_str(compiled_code))
 
 
diff --git a/tensorflow/contrib/autograph/impl/config.py b/tensorflow/contrib/autograph/impl/config.py
index 543c1486e657f4e7b16e5723cc294c09ebbcec00..26326465e265f5b40c3badedc0ea2813248ef60f 100644
--- a/tensorflow/contrib/autograph/impl/config.py
+++ b/tensorflow/contrib/autograph/impl/config.py
@@ -41,10 +41,15 @@ DEFAULT_UNCOMPILED_MODULES = set((
 
 NO_SIDE_EFFECT_CONSTRUCTORS = set(('tensorflow',))
 
-# TODO(mdan): Also allow controlling the generated names (for testability).
+# TODO(mdan): Also allow controlling the generated names.
+# TODO(mdan); Consolidate all internal imports into a single __ag module.
 COMPILED_IMPORT_STATEMENTS = (
-    'from __future__ import print_function', 'import tensorflow as tf',
-    'from tensorflow.contrib.autograph.impl import api as '
-    'autograph_api',
-    'from tensorflow.contrib.autograph import utils as '
-    'autograph_utils')
+    'from __future__ import print_function',
+    'import tensorflow as tf',
+    'from tensorflow.contrib.autograph.impl import api'
+    ' as autograph_api',
+    'from tensorflow.contrib.autograph import utils'
+    ' as autograph_utils',
+    'from tensorflow.contrib.autograph import operators'
+    ' as __ops',
+)
diff --git a/tensorflow/contrib/autograph/impl/conversion.py b/tensorflow/contrib/autograph/impl/conversion.py
index 62a49cd92d835fb942f48354041cb0ab03d02c97..3bacc9430098d9cebf1726074524731899cdd965 100644
--- a/tensorflow/contrib/autograph/impl/conversion.py
+++ b/tensorflow/contrib/autograph/impl/conversion.py
@@ -28,7 +28,6 @@ from tensorflow.contrib.autograph.converters import call_trees
 from tensorflow.contrib.autograph.converters import continue_statements
 from tensorflow.contrib.autograph.converters import control_flow
 from tensorflow.contrib.autograph.converters import decorators
-from tensorflow.contrib.autograph.converters import for_loops
 from tensorflow.contrib.autograph.converters import ifexp
 from tensorflow.contrib.autograph.converters import lists
 from tensorflow.contrib.autograph.converters import logical_expressions
@@ -324,8 +323,6 @@ def node_to_graph(node, ctx, nocompile_decorators):
 
   node = _static_analysis_pass(node, ctx)
   node = lists.transform(node, ctx)
-  node = for_loops.transform(node, ctx)
-  # for_loops may insert new global references.
   node = builtin_functions.transform(node, ctx)
 
   node = _static_analysis_pass(node, ctx)
diff --git a/tensorflow/contrib/autograph/operators/BUILD b/tensorflow/contrib/autograph/operators/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..4c624685751f7de551f6a9e891a96a2d141e9f3e
--- /dev/null
+++ b/tensorflow/contrib/autograph/operators/BUILD
@@ -0,0 +1,40 @@
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+load("//tensorflow:tensorflow.bzl", "py_test")
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
+
+py_library(
+    name = "operators",
+    srcs = [
+        "__init__.py",
+        "control_flow.py",
+    ],
+    srcs_version = "PY2AND3",
+    visibility = ["//tensorflow:__subpackages__"],
+    deps = [
+        "//tensorflow/contrib/autograph/utils",
+    ],
+)
+
+py_test(
+    name = "control_flow_test",
+    srcs = ["control_flow_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":operators",
+        "//tensorflow/python:client_testlib",
+    ],
+)
diff --git a/tensorflow/contrib/autograph/operators/__init__.py b/tensorflow/contrib/autograph/operators/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..04b4734551d3227a1c611d668f006a157c2c2dd3
--- /dev/null
+++ b/tensorflow/contrib/autograph/operators/__init__.py
@@ -0,0 +1,29 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""This module implements operators that we overload.
+
+Note that "operator" is used loosely here, and includes control structures like
+conditionals and loops, implemented in functional form, using for example
+closures for the body.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# TODO(mdan): Add a container for implementation-specific toggles (throughout).
+
+from tensorflow.contrib.autograph.operators.control_flow import for_loop
+from tensorflow.contrib.autograph.operators.control_flow import while_loop
diff --git a/tensorflow/contrib/autograph/operators/control_flow.py b/tensorflow/contrib/autograph/operators/control_flow.py
new file mode 100644
index 0000000000000000000000000000000000000000..81ae64f110924cb9f8f089ced2f44bb8e3aa5135
--- /dev/null
+++ b/tensorflow/contrib/autograph/operators/control_flow.py
@@ -0,0 +1,184 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Control flow statements: loops, conditionals, etc."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.autograph.utils import builtins
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import gen_math_ops
+
+
+def for_loop(iterated, extra_cond, loop_body, init_state):
+  """Functional form of a for statement.
+
+  The loop operates on a so-called state, which includes all symbols that are
+  variant across loop iterations, excluding the iterate. In what follows we
+  refer to state as either a tuple of entities that represent an actual state,
+  or a list of arguments of the corresponding types.
+
+  Args:
+    iterated: The entity being iterated over.
+    extra_cond: Callable with the state as arguments, and boolean return type.
+        An additionnal loop condition.
+    loop_body: Callable with the iterate and the state as arguments, and
+        state as return type. The actual loop body.
+    init_state: Tuple containing the initial state.
+
+  Returns:
+    Tuple containing the final state.
+  """
+  if tensor_util.is_tensor(iterated):
+    return _known_len_for_loop(iterated, extra_cond, loop_body, init_state)
+  elif isinstance(iterated, dataset_ops.Dataset):
+    return _dataset_for_loop(iterated, extra_cond, loop_body, init_state)
+  else:
+    return _py_for_loop(iterated, extra_cond, loop_body, init_state)
+
+
+def _py_for_loop(iterated, extra_cond, loop_body, init_state):
+  """Overload of for_loop that executes a Python for loop."""
+  state = init_state
+  for iterate in iterated:
+    if not extra_cond(*state):
+      break
+    state = loop_body(iterate, *state)
+
+  # TODO(mdan): Remove this special case.
+  if len(state) == 1:
+    return state[0]
+  return state
+
+
+def _known_len_for_loop(iterated, extra_cond, loop_body, init_state):
+  """Overload of for_loop that iterates over objects that define a length."""
+  n = builtins.dynamic_len(iterated)
+
+  def while_body(iterate_index, *state):
+    iterate = iterated[iterate_index]
+    new_state = loop_body(iterate, *state)
+    return (iterate_index + 1,) + new_state
+
+  def while_cond(iterate_index, *state):
+    return gen_math_ops.logical_and(iterate_index < n, extra_cond(*state))
+
+  results = while_loop(
+      while_cond,
+      while_body,
+      init_state=(0,) + init_state,
+      extra_deps=(iterated,),
+      opts=dict(maximum_iterations=n))
+  # Dropping the iteration index because it's not syntactically visible.
+  results = results[1:]
+
+  # TODO(mdan): Remove this special case.
+  if len(results) == 1:
+    return results[0]
+  return results
+
+
+def _dataset_for_loop(ds, extra_cond, loop_body, init_state):
+  """Overload of for_loop that iterates over TF Datasets."""
+  # Because Datsets only expose get_next, in the style of Python iterators,
+  # we are forced to unpack the loop as:
+  #
+  # epoch_number, iterate = ds.get_next()
+  # while epoch_number < 2:
+  #   <body>
+  #   epoch_number, iterate = ds.get_next()
+  epoch_numbers = dataset_ops.Dataset.range(2)
+  def tag_with(ds, tag):
+    return dataset_ops.Dataset.zip(
+        (dataset_ops.Dataset.from_tensors(tag).repeat(), ds))
+  ds_with_epoch = epoch_numbers.flat_map(lambda i: tag_with(ds, i))
+
+  iterator = ds_with_epoch.make_initializable_iterator()
+  with ops.control_dependencies((iterator.initializer,)):
+    epoch_number, iterate = iterator.get_next()
+
+    def while_body(epoch_number, iterate, *state):
+      new_state = loop_body(iterate, *state)
+      epoch_number, iterate = iterator.get_next()
+      return (epoch_number, iterate) + new_state
+
+    def while_cond(epoch_number, iterate, *state):
+      del iterate
+      return gen_math_ops.logical_and(epoch_number < 1, extra_cond(*state))
+
+    results = while_loop(
+        while_cond,
+        while_body,
+        init_state=(epoch_number, iterate) + init_state,
+        extra_deps=())
+  # Dropping the epoch number and iterate because they are not not syntactically
+  # visible.
+  results = results[2:]
+
+  # TODO(mdan): Remove this special case.
+  if len(results) == 1:
+    return results[0]
+  return results
+
+
+def while_loop(loop_cond, loop_body, init_state, extra_deps, opts=None):
+  """Functional form of a while statement.
+
+  The loop operates on a so-called state, which includes all symbols that are
+  variant across loop iterations. In what follows we refer to state as either
+  a tuple of entities that represent an actual state, or a list of arguments
+  of the corresponding types.
+
+  Args:
+    loop_cond: Callable with the state as arguments, and boolean return type.
+        The loop condition.
+    loop_body: Callable with the state as arguments, and state as return type.
+        The actual loop body.
+    init_state: Tuple containing the initial state.
+    extra_deps: Tuple containing additional entities on which the loop may
+        depend, such as loop invariants referenced by loop_cond. Used
+        exclusively for dispatch control.
+    opts: Optional dict of extra loop parameters.
+
+  Returns:
+    Tuple containing the final state.
+  """
+  # TODO(mdan): Consider adding a generic mechanism for dynamic dispatch.
+  # That could be somethins as simple as a collection of dispatch rules, with
+  # some prioritization.
+  if any(tensor_util.is_tensor(v) for v in init_state + extra_deps):
+    return _tf_while_loop(loop_cond, loop_body, init_state, opts)
+  else:
+    return _py_while_loop(loop_cond, loop_body, init_state, opts)
+
+
+def _tf_while_loop(loop_cond, loop_body, init_state, opts):
+  """Overload of while_loop that stages a TF while_loop."""
+  if opts is None:
+    opts = {}
+  return control_flow_ops.while_loop(loop_cond, loop_body, init_state, **opts)
+
+
+def _py_while_loop(loop_cond, loop_body, init_state, opts):
+  """Overload of while_loop that executes a Python while loop."""
+  del opts
+  state = init_state
+  while loop_cond(*state):
+    state = loop_body(*state)
+  return state
diff --git a/tensorflow/contrib/autograph/operators/control_flow_test.py b/tensorflow/contrib/autograph/operators/control_flow_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..9112b1627fccc0e34216c6710c782e510be29d8b
--- /dev/null
+++ b/tensorflow/contrib/autograph/operators/control_flow_test.py
@@ -0,0 +1,82 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for control_flow module."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.autograph import operators
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import test
+
+
+class ForLoopTest(test.TestCase):
+
+  def test_tensor(self):
+    s = operators.for_loop(
+        constant_op.constant([1, 2, 3, 4]),
+        extra_cond=lambda s: True,
+        loop_body=lambda i, s: (s + i,),
+        init_state=(0,))
+    with self.test_session() as sess:
+      self.assertEqual((10,), sess.run(s))
+
+  def test_python(self):
+    s = operators.for_loop(
+        range(5),
+        extra_cond=lambda s: True,
+        loop_body=lambda i, s: (s + i,),
+        init_state=(0,))
+    self.assertEqual(10, s)
+
+  def test_dataset(self):
+    to_int32 = lambda i: math_ops.cast(i, dtypes.int32)
+    s = operators.for_loop(
+        dataset_ops.Dataset.range(5).map(to_int32),
+        extra_cond=lambda s: True,
+        loop_body=lambda i, s: (s + i,),
+        init_state=(0,))
+    with self.test_session() as sess:
+      self.assertEqual((10,), sess.run(s))
+
+
+class WhileLoopTest(test.TestCase):
+
+  def test_tensor(self):
+    n = constant_op.constant(5)
+    results = operators.while_loop(
+        loop_cond=lambda i, s: i < n,
+        loop_body=lambda i, s: (i + 1, s + i,),
+        init_state=(0, 0),
+        extra_deps=(n,))
+    with self.test_session() as sess:
+      self.assertEqual((5, 10), sess.run(results))
+
+  def test_python(self):
+    n = 5
+    results = operators.while_loop(
+        loop_cond=lambda i, s: i < n,
+        loop_body=lambda i, s: (i + 1, s + i),
+        init_state=(0, 0),
+        extra_deps=(n,))
+    self.assertEqual((5, 10), results)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/autograph/pyct/BUILD b/tensorflow/contrib/autograph/pyct/BUILD
index edec5f7712d08247437c9e95d743e59dafffcd7b..c483ff68c4b7c6d9a3315f569b62b8f253079f00 100644
--- a/tensorflow/contrib/autograph/pyct/BUILD
+++ b/tensorflow/contrib/autograph/pyct/BUILD
@@ -66,6 +66,7 @@ py_test(
     name = "compiler_test",
     srcs = ["compiler_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["no_windows"],
     deps = [
         ":pyct",
         "//tensorflow/python:client_testlib",
diff --git a/tensorflow/contrib/autograph/pyct/ast_util.py b/tensorflow/contrib/autograph/pyct/ast_util.py
index 4f76a695228f7d84b80b2e4b03801e15e94b8f11..4a70bab4402a940dec6a8b183daf7406a7e34131 100644
--- a/tensorflow/contrib/autograph/pyct/ast_util.py
+++ b/tensorflow/contrib/autograph/pyct/ast_util.py
@@ -28,7 +28,7 @@ from tensorflow.contrib.autograph.pyct import anno
 class CleanCopier(gast.NodeVisitor):
   """Copy AST nodes.
 
-  The copied nodes will ignore almost all fields that prefixed by '__'.
+  The copied nodes will ignore almost all fields that are prefixed by '__'.
   Exceptions make some annotations.
   """
 
diff --git a/tensorflow/contrib/autograph/pyct/inspect_utils.py b/tensorflow/contrib/autograph/pyct/inspect_utils.py
index d19c6ed75e0f0651781d6e1ed80f7be11fb8a5a4..386a6d21ec9ecfe9c5c70ddcd1027dacf6860cea 100644
--- a/tensorflow/contrib/autograph/pyct/inspect_utils.py
+++ b/tensorflow/contrib/autograph/pyct/inspect_utils.py
@@ -50,6 +50,18 @@ def getnamespace(f):
   return namespace
 
 
+def getdefiningclass(m, owner_class):
+  """Resolves the class (e.g. one of the superclasses) that defined a method."""
+  m = six.get_unbound_function(m)
+  last_defining = owner_class
+  for superclass in tf_inspect.getmro(owner_class):
+    if hasattr(superclass, m.__name__):
+      superclass_m = getattr(superclass, m.__name__)
+      if six.get_unbound_function(superclass_m) == m:
+        last_defining = superclass
+  return last_defining
+
+
 def getmethodclass(m):
   """Resolves a function's owner, e.g. a method's class.
 
@@ -74,6 +86,12 @@ def getmethodclass(m):
     ValueError: if the class could not be resolved for any unexpected reason.
   """
 
+  # Callable objects: return their own class.
+  if (not hasattr(m, '__name__') and hasattr(m, '__class__') and
+      hasattr(m, '__call__')):
+    if isinstance(m.__class__, six.class_types):
+      return m.__class__
+
   # Instance method and class methods: should be bound to a non-null "self".
   # If self is a class, then it's a class method.
   if hasattr(m, '__self__'):
diff --git a/tensorflow/contrib/autograph/pyct/inspect_utils_test.py b/tensorflow/contrib/autograph/pyct/inspect_utils_test.py
index ddca6f963b8abadd621c544a79935c69326bf65e..58f827b79a943a276c2f330f9cfd26e8bcb36119 100644
--- a/tensorflow/contrib/autograph/pyct/inspect_utils_test.py
+++ b/tensorflow/contrib/autograph/pyct/inspect_utils_test.py
@@ -225,6 +225,39 @@ class InspectUtilsTest(test.TestCase):
         inspect_utils.getmethodclass(test_obj.wrap_decorated_member),
         LocalClass)
 
+  def test_getmethodclass_callables(self):
+    class TestCallable(object):
+
+      def __call__(self):
+        pass
+
+    c = TestCallable()
+    self.assertEqual(inspect_utils.getmethodclass(c), TestCallable)
+
+  def test_getdefiningclass(self):
+    class Superclass(object):
+
+      def foo(self):
+        pass
+
+      def bar(self):
+        pass
+
+    class Subclass(Superclass):
+
+      def foo(self):
+        pass
+
+      def baz(self):
+        pass
+
+    self.assertTrue(
+        inspect_utils.getdefiningclass(Subclass.foo, Subclass) is Subclass)
+    self.assertTrue(
+        inspect_utils.getdefiningclass(Subclass.bar, Subclass) is Superclass)
+    self.assertTrue(
+        inspect_utils.getdefiningclass(Subclass.baz, Subclass) is Subclass)
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/autograph/pyct/qual_names.py b/tensorflow/contrib/autograph/pyct/qual_names.py
index 4d5764a974aac542ddf4a54a9acd36f1afcb0464..583cf7ecd7bce31c55de58361ab5295abb5d6707 100644
--- a/tensorflow/contrib/autograph/pyct/qual_names.py
+++ b/tensorflow/contrib/autograph/pyct/qual_names.py
@@ -112,6 +112,29 @@ class QN(object):
       raise ValueError('Cannot get parent of simple name "%s".' % self.qn[0])
     return self._parent
 
+  @property
+  def support_set(self):
+    """Returns the set of simple symbols that this QN relies on.
+
+    This would be the smallest set of symbols necessary for the QN to
+    statically resolve (assuming properties and index ranges are verified
+    at runtime).
+
+    Examples:
+      'a.b' has only one support symbol, 'a'
+      'a[i]' has two roots, 'a' and 'i'
+    """
+    # TODO(mdan): This might be the set of Name nodes in the AST. Track those?
+    roots = set()
+    if self.has_attr():
+      roots.update(self.parent.support_set)
+    elif self.has_subscript():
+      roots.update(self.parent.support_set)
+      roots.update(self.qn[1].support_set)
+    else:
+      roots.add(self)
+    return roots
+
   def __hash__(self):
     return hash(self.qn + (self._has_attr, self._has_subscript))
 
diff --git a/tensorflow/contrib/autograph/pyct/qual_names_test.py b/tensorflow/contrib/autograph/pyct/qual_names_test.py
index 103bd25aa380e9f61ecea9c5298f34df5157d629..264afd508cdb847315c486806b531dc1483ef622 100644
--- a/tensorflow/contrib/autograph/pyct/qual_names_test.py
+++ b/tensorflow/contrib/autograph/pyct/qual_names_test.py
@@ -154,6 +154,21 @@ class QNTest(test.TestCase):
     a_sub_three = QN(a, subscript=QN(qual_names.NumberLiteral(3)))
     self.assertEqual(a_sub_three.ast().slice.value.n, 3)
 
+  def test_support_set(self):
+    a = QN('a')
+    b = QN('b')
+    c = QN('c')
+    a_sub_b = QN(a, subscript=b)
+    a_dot_b = QN(a, attr='b')
+    a_dot_b_dot_c = QN(a_dot_b, attr='c')
+    a_dot_b_sub_c = QN(a_dot_b, subscript=c)
+
+    self.assertSetEqual(a.support_set, set((a,)))
+    self.assertSetEqual(a_sub_b.support_set, set((a, b)))
+    self.assertSetEqual(a_dot_b.support_set, set((a,)))
+    self.assertSetEqual(a_dot_b_dot_c.support_set, set((a,)))
+    self.assertSetEqual(a_dot_b_sub_c.support_set, set((a, c)))
+
 
 class QNResolverTest(test.TestCase):
 
diff --git a/tensorflow/contrib/autograph/pyct/static_analysis/BUILD b/tensorflow/contrib/autograph/pyct/static_analysis/BUILD
index d192bc7aabf6ea36d616ff6f2cef60fddd5973b4..83f3bafc4217649db6499566d548c1657428ad0b 100644
--- a/tensorflow/contrib/autograph/pyct/static_analysis/BUILD
+++ b/tensorflow/contrib/autograph/pyct/static_analysis/BUILD
@@ -34,6 +34,7 @@ py_test(
     name = "activity_test",
     srcs = ["activity_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["no_windows"],
     deps = [
         ":static_analysis",
         "//tensorflow/contrib/autograph/pyct",
@@ -46,6 +47,7 @@ py_test(
     name = "live_values_test",
     srcs = ["live_values_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["no_windows"],
     deps = [
         ":static_analysis",
         "//tensorflow/contrib/autograph/pyct",
diff --git a/tensorflow/contrib/autograph/pyct/static_analysis/activity.py b/tensorflow/contrib/autograph/pyct/static_analysis/activity.py
index da6a2f6f0500ebba41b85d06dcc912aae9d68f97..6dd53091fa3a4d874e9133520a2299b8a93ba231 100644
--- a/tensorflow/contrib/autograph/pyct/static_analysis/activity.py
+++ b/tensorflow/contrib/autograph/pyct/static_analysis/activity.py
@@ -265,10 +265,10 @@ class ActivityAnalizer(transformer.Base):
       qn = QN(node.name)
       self.scope.mark_write(qn)
     current_scope = self.scope
-    fndef_scope = Scope(current_scope, isolated=True)
-    self.scope = fndef_scope
+    body_scope = Scope(current_scope, isolated=True)
+    self.scope = body_scope
     self.generic_visit(node)
-    anno.setanno(node, NodeAnno.BODY_SCOPE, fndef_scope)
+    anno.setanno(node, NodeAnno.BODY_SCOPE, body_scope)
     self.scope = current_scope
     return node
 
@@ -282,7 +282,13 @@ class ActivityAnalizer(transformer.Base):
     return node
 
   def visit_If(self, node):
+    current_scope = self.scope
+    cond_scope = Scope(current_scope, isolated=False)
+    self.scope = cond_scope
     self.visit(node.test)
+    anno.setanno(node, NodeAnno.COND_SCOPE, cond_scope)
+    self.scope = current_scope
+
     node = self._process_parallel_blocks(node,
                                          ((node.body, NodeAnno.BODY_SCOPE),
                                           (node.orelse, NodeAnno.ORELSE_SCOPE)))
@@ -297,7 +303,13 @@ class ActivityAnalizer(transformer.Base):
     return node
 
   def visit_While(self, node):
+    current_scope = self.scope
+    cond_scope = Scope(current_scope, isolated=False)
+    self.scope = cond_scope
     self.visit(node.test)
+    anno.setanno(node, NodeAnno.COND_SCOPE, cond_scope)
+    self.scope = current_scope
+
     node = self._process_parallel_blocks(node,
                                          ((node.body, NodeAnno.BODY_SCOPE),
                                           (node.orelse, NodeAnno.ORELSE_SCOPE)))
diff --git a/tensorflow/contrib/autograph/pyct/static_analysis/activity_test.py b/tensorflow/contrib/autograph/pyct/static_analysis/activity_test.py
index 37c28872bb9fc4f0c6f95eec8145101b7a6c83de..1e6c686b01445a86499d4f5254ea7e139e450843 100644
--- a/tensorflow/contrib/autograph/pyct/static_analysis/activity_test.py
+++ b/tensorflow/contrib/autograph/pyct/static_analysis/activity_test.py
@@ -204,6 +204,8 @@ class ActivityAnalizerTest(test.TestCase):
     self.assertScopeIsRmc(
         anno.getanno(while_node, NodeAnno.BODY_SCOPE).parent, ('a', 'b', 'c'),
         ('b', 'c'), ('a', 'b', 'c'))
+    self.assertScopeIsRmc(
+        anno.getanno(while_node, NodeAnno.COND_SCOPE), ('b',), (), ())
 
   def test_for(self):
 
diff --git a/tensorflow/contrib/autograph/pyct/static_analysis/annos.py b/tensorflow/contrib/autograph/pyct/static_analysis/annos.py
index 5254b83ca7c775867fc2ad5ef0a0ad93ac483ba0..d6d9f7e1a6028d1ce8eee6c3e250a260c3bf827f 100644
--- a/tensorflow/contrib/autograph/pyct/static_analysis/annos.py
+++ b/tensorflow/contrib/autograph/pyct/static_analysis/annos.py
@@ -43,6 +43,7 @@ class NodeAnno(NoValue):
   # Scopes
   # Scopes are represented by objects of type activity.Scope.
   ARGS_SCOPE = 'The scope for the argument list of a function call.'
+  COND_SCOPE = 'The scope for the test node of a conditional statement.'
   BODY_SCOPE = (
       'The scope for the main body of a statement (True branch for if '
       'statements, main body for loops).')
diff --git a/tensorflow/contrib/autograph/utils/BUILD b/tensorflow/contrib/autograph/utils/BUILD
index b53fbb5c18f27aa4681347d965dc7322c849ec91..d3a1b9468892531cbc51bc13de66ef595f1a95f8 100644
--- a/tensorflow/contrib/autograph/utils/BUILD
+++ b/tensorflow/contrib/autograph/utils/BUILD
@@ -44,6 +44,7 @@ py_test(
     name = "builtins_test",
     srcs = ["builtins_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["no_windows"],
     deps = [
         ":utils",
         "//tensorflow/python:client_testlib",
@@ -84,6 +85,7 @@ py_test(
     name = "py_func_test",
     srcs = ["py_func_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["no_windows"],
     deps = [
         ":utils",
         "//tensorflow/python:client_testlib",
diff --git a/tensorflow/contrib/autograph/utils/__init__.py b/tensorflow/contrib/autograph/utils/__init__.py
index 22898b17e98bb004b4d2aa529b58cc99fc64dbb2..817d4126d106487e1fea3e442712a69bbfccd7f3 100644
--- a/tensorflow/contrib/autograph/utils/__init__.py
+++ b/tensorflow/contrib/autograph/utils/__init__.py
@@ -19,8 +19,6 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.contrib.autograph.utils.builtins import dynamic_builtin
-from tensorflow.contrib.autograph.utils.builtins import dynamic_dataset
-from tensorflow.contrib.autograph.utils.builtins import dynamic_for_cond
 from tensorflow.contrib.autograph.utils.builtins import dynamic_print
 from tensorflow.contrib.autograph.utils.builtins import dynamic_range
 from tensorflow.contrib.autograph.utils.context_managers import control_dependency_on_returns
@@ -28,7 +26,6 @@ from tensorflow.contrib.autograph.utils.misc import alias_tensors
 from tensorflow.contrib.autograph.utils.multiple_dispatch import dynamic_is
 from tensorflow.contrib.autograph.utils.multiple_dispatch import dynamic_is_not
 from tensorflow.contrib.autograph.utils.multiple_dispatch import run_cond
-from tensorflow.contrib.autograph.utils.multiple_dispatch import run_while
 from tensorflow.contrib.autograph.utils.py_func import wrap_py_func
 from tensorflow.contrib.autograph.utils.tensor_list import dynamic_list_append
 from tensorflow.contrib.autograph.utils.testing import fake_tf
diff --git a/tensorflow/contrib/autograph/utils/builtins.py b/tensorflow/contrib/autograph/utils/builtins.py
index 4ab32ee47de5c0b3b6ab18c731da7626887b67a5..7fbb7c09d81ff3d3916d48d3c48e377b90a25907 100644
--- a/tensorflow/contrib/autograph/utils/builtins.py
+++ b/tensorflow/contrib/autograph/utils/builtins.py
@@ -18,14 +18,14 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import sys
+
 import six
 
 from tensorflow.contrib.autograph.utils import py_func
 from tensorflow.contrib.autograph.utils import type_check
-from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import logging_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.util import tf_inspect
@@ -97,70 +97,10 @@ def dynamic_print(*values):
 
   if all(map(is_tf_print_compatible, values)):
     return logging_ops.Print(1, values)
-  return py_func.wrap_py_func(print, None, values, use_dummy_return=True)
-
-
-def dynamic_dataset(iterated):
-  """Implementartion of smart tf.data.Dataset epoch wrapping.
-
-  The function checks if the input is a tf.data.Dataset and if so then wraps it
-  so that for each element it returns it also returns the current epoch the
-  dataset iteration is in, for two epochs.  If the input is not a
-  tf.data.Dataset then it just returns the input.
-
-  Args:
-    iterated: The iterable or tf.data.Dataset that is being iterated over.
-  Returns:
-    Either just the untouched input, or in the case of input being a
-    tf.data.Dataset then it returns a wrapped  tf.data.Dataset where for each
-    element it returns it also returns the current epoch the dataset iteration
-    is in.
-  """
-  if not isinstance(iterated, dataset_ops.Dataset):
-    return iterated
 
-  def epoch_dataset_number_helper(i):
-    return dataset_ops.Dataset.zip(
-        (dataset_ops.Dataset.from_tensors(i).repeat(), iterated))
+  def flushed_print(*vals):
+    print(*vals)
+    sys.stdout.flush()
 
-  epoch_numbers = dataset_ops.Dataset.range(2)
-  return epoch_numbers.flat_map(epoch_dataset_number_helper)
-
-
-def dynamic_for_cond(iteration, iterated):
-  """Implementartion of smart while-loop condition using dynamic dispatch.
-
-  The function checks if it is iterating over a tf.data.Dataset or not, and in
-  the case it is not then it simply returns if we are still in range of the
-  iterated and the next element.  If it is iterating over a dataset then it only
-  iterates for a single epoch.
-
-  Args:
-    iteration: The current iteration of the loop.
-    iterated: The iterable or tf.data.Dataset that is being iterated over.
-  Returns:
-    A tuple of a bool that indicates whether the loop should continue, and the
-    next element in iterated.
-  """
-  # TODO(znado): Clean up.
-  # TODO(znado): This won't work for unpacked iterates. Fix.
-  if isinstance(iterated, dataset_ops.Dataset):
-    curr_epoch, next_elem = iterated.make_one_shot_iterator().get_next()
-    return math_ops.less(curr_epoch, 1), next_elem
-  elif tensor_util.is_tensor(iterated):
-    if iterated.shape.ndims > 1:
-      elem_shape = array_ops.shape(iterated)[1:]
-    else:
-      elem_shape = ()
-    if iterated.shape.ndims == 0 or iterated.shape[0] == 0:
-      return False, array_ops.zeros(elem_shape, iterated.dtype)
-    return control_flow_ops.cond(
-        math_ops.less(iteration, dynamic_len(iterated)),
-        lambda: (True, iterated[iteration]),
-        lambda: (False, array_ops.zeros(elem_shape, iterated.dtype)))
-  elif hasattr(iterated, '__len__'):
-    if iteration < len(iterated):
-      return True, iterated[iteration]
-    return False, None
-  else:
-    raise NotImplementedError('Python iterators not yet supported.')
+  return py_func.wrap_py_func(
+      flushed_print, None, values, use_dummy_return=True)
diff --git a/tensorflow/contrib/autograph/utils/multiple_dispatch.py b/tensorflow/contrib/autograph/utils/multiple_dispatch.py
index 47049255f31113a0c7b2f5a1269593afdbbc9b19..70eef5676f61bcd978ea53260f0b86a817f2bd7c 100644
--- a/tensorflow/contrib/autograph/utils/multiple_dispatch.py
+++ b/tensorflow/contrib/autograph/utils/multiple_dispatch.py
@@ -18,8 +18,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import six
-
 from tensorflow.contrib.autograph.utils.type_check import is_tensor
 from tensorflow.python.ops import control_flow_ops
 
@@ -66,42 +64,3 @@ def py_cond(condition, true_fn, false_fn):
   if len(results) == 1:
     return results[0]
   return results
-
-
-def run_while(cond_fn, body_fn, init_args):
-  """Type-dependent functional while loop.
-
-  Args:
-    cond_fn: A Python callable implementing the stop conditions of the loop.
-    body_fn: A Python callable implementing the body of the loop.
-    init_args: The initial values of the arguments that will be passed to both
-      cond_fn and body_fn.
-
-  Returns:
-    result: A list of values with the same shape and type as init_args. If any
-    of the init_args, or any variables closed-over in cond_fn are Tensors,
-    tf.while_loop will be used, otherwise a Python while loop will be ran.
-
-  Raises:
-    ValueError: if init_args is not a tuple or list with one or more elements.
-  """
-  if not isinstance(init_args, (tuple, list)) or not init_args:
-    raise ValueError(
-        'init_args must be a non-empty list or tuple, found %s' % init_args)
-
-  # TODO(alexbw): statically determine all active variables in cond_fn,
-  # and pass them directly
-  closure_vars = tuple(
-      [c.cell_contents for c in six.get_function_closure(cond_fn) or []])
-  possibly_tensors = tuple(init_args) + closure_vars
-  if is_tensor(*possibly_tensors):
-    return control_flow_ops.while_loop(cond_fn, body_fn, init_args)
-  else:
-    return py_while_loop(cond_fn, body_fn, init_args)
-
-
-def py_while_loop(cond_fn, body_fn, init_args):
-  state = init_args
-  while cond_fn(*state):
-    state = body_fn(*state)
-  return state
diff --git a/tensorflow/contrib/autograph/utils/multiple_dispatch_test.py b/tensorflow/contrib/autograph/utils/multiple_dispatch_test.py
index e6a41bb4166e8cfc8c703685f56eb90a1b5f63b4..f72f8e94a0df815f7d517e2b81ffc86c5c545f07 100644
--- a/tensorflow/contrib/autograph/utils/multiple_dispatch_test.py
+++ b/tensorflow/contrib/autograph/utils/multiple_dispatch_test.py
@@ -70,29 +70,6 @@ class MultipleDispatchTest(test.TestCase):
       out = multiple_dispatch.run_cond(constant(False), true_fn, false_fn)
       self.assertEqual(sess.run(out), 3)
 
-  def test_run_while_python(self):
-    cond_fn = lambda x, t, s: x > t
-    body_fn = lambda x, t, s: (x * s, t, s)
-
-    x, _, _ = multiple_dispatch.run_while(cond_fn, body_fn, [3.0, 1.0, 0.5])
-    self.assertEqual(x, 0.75)
-
-    x, _, _ = multiple_dispatch.run_while(cond_fn, body_fn, [3.0, 4.0, 0.5])
-    self.assertEqual(x, 3.0)
-
-  def test_run_while_tf(self):
-    cond_fn = lambda x, t, s: x > t
-    body_fn = lambda x, t, s: (x * s, t, s)
-
-    with Session() as sess:
-      x, _, _ = multiple_dispatch.run_while(cond_fn, body_fn,
-                                            [constant(3.0), 1.0, 0.5])
-      self.assertEqual(sess.run(x), 0.75)
-
-      x, _, _ = multiple_dispatch.run_while(cond_fn, body_fn,
-                                            [constant(3.0), 4.0, 0.5])
-      self.assertEqual(sess.run(x), 3.0)
-
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/batching/BUILD b/tensorflow/contrib/batching/BUILD
index ee67909133fc26ba98355db05a4b90d3dfa6b97b..d65c990c87cbc316472237d183c03765416501e7 100644
--- a/tensorflow/contrib/batching/BUILD
+++ b/tensorflow/contrib/batching/BUILD
@@ -112,14 +112,3 @@ py_test(
         "//tensorflow/python:script_ops",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-)
diff --git a/tensorflow/contrib/batching/test_util/BUILD b/tensorflow/contrib/batching/test_util/BUILD
index 6db627faad1df4a4b73082e74e7754829ff2b514..7cb2d8079bd18660f72eab92654629434ce4d6a5 100644
--- a/tensorflow/contrib/batching/test_util/BUILD
+++ b/tensorflow/contrib/batching/test_util/BUILD
@@ -8,17 +8,6 @@ licenses(["notice"])  # Apache 2.0
 
 exports_files(["LICENSE"])
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-)
-
 cc_library(
     name = "fake_clock_env",
     testonly = 1,
diff --git a/tensorflow/contrib/batching/util/BUILD b/tensorflow/contrib/batching/util/BUILD
index 2a84a7712a8fa66e89db41ff4e7ebe4f620029ca..8f81b6702f2807d7da7e72190ce2d86b28e52113 100644
--- a/tensorflow/contrib/batching/util/BUILD
+++ b/tensorflow/contrib/batching/util/BUILD
@@ -8,18 +8,6 @@ licenses(["notice"])  # Apache 2.0
 
 load("//tensorflow:tensorflow.bzl", "tf_cc_test")
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-            "**/google_*",
-        ],
-    ),
-)
-
 cc_library(
     name = "periodic_function_dynamic",
     hdrs = ["periodic_function.h"],
diff --git a/tensorflow/contrib/bayesflow/BUILD b/tensorflow/contrib/bayesflow/BUILD
index a55029b314e67571519d96607ff1fe36070c50ef..5a2d7f6a3c0ba233299a5790fa80488786712f3c 100644
--- a/tensorflow/contrib/bayesflow/BUILD
+++ b/tensorflow/contrib/bayesflow/BUILD
@@ -57,15 +57,3 @@ cuda_py_test(
         "//tensorflow/python:random_seed",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/bayesflow/python/ops/monte_carlo_impl.py b/tensorflow/contrib/bayesflow/python/ops/monte_carlo_impl.py
index 985177e897f443989e466d1a498c461a30aeb5cb..d193a8459d00b83580509c8de25d5f7801b195fe 100644
--- a/tensorflow/contrib/bayesflow/python/ops/monte_carlo_impl.py
+++ b/tensorflow/contrib/bayesflow/python/ops/monte_carlo_impl.py
@@ -44,14 +44,14 @@ def expectation_importance_sampler(f,
                                    n=None,
                                    seed=None,
                                    name='expectation_importance_sampler'):
-  r"""Monte Carlo estimate of `E_p[f(Z)] = E_q[f(Z) p(Z) / q(Z)]`.
+  r"""Monte Carlo estimate of `\\(E_p[f(Z)] = E_q[f(Z) p(Z) / q(Z)]\\)`.
 
-  With `p(z) := exp{log_p(z)}`, this `Op` returns
+  With `\\(p(z) := exp^{log_p(z)}\\)`, this `Op` returns
 
   ```
-  n^{-1} sum_{i=1}^n [ f(z_i) p(z_i) / q(z_i) ],  z_i ~ q,
-  \approx E_q[ f(Z) p(Z) / q(Z) ]
-  =       E_p[f(Z)]
+  \\(n^{-1} sum_{i=1}^n [ f(z_i) p(z_i) / q(z_i) ],  z_i ~ q,\\)
+  \\(\approx E_q[ f(Z) p(Z) / q(Z) ]\\)
+  \\(=       E_p[f(Z)]\\)
   ```
 
   This integral is done in log-space with max-subtraction to better handle the
@@ -95,9 +95,9 @@ def expectation_importance_sampler(f,
       log_values = log_f_z + log_p_z - q_log_prob_z
       return _logspace_mean(log_values)
 
-    # With f_plus(z) = max(0, f(z)), f_minus(z) = max(0, -f(z)),
-    # E_p[f(Z)] = E_p[f_plus(Z)] - E_p[f_minus(Z)]
-    #           = E_p[f_plus(Z) + 1] - E_p[f_minus(Z) + 1]
+    # With \\(f_{plus}(z) = max(0, f(z)), f_{minus}(z) = max(0, -f(z))\\),
+    # \\(E_p[f(Z)] = E_p[f_{plus}(Z)] - E_p[f_{minus}(Z)]\\)
+    # \\(          = E_p[f_{plus}(Z) + 1] - E_p[f_{minus}(Z) + 1]\\)
     # Without incurring bias, 1 is added to each to prevent zeros in logspace.
     # The logarithm is approximately linear around 1 + epsilon, so this is good
     # for small values of 'z' as well.
@@ -121,13 +121,13 @@ def expectation_importance_sampler_logspace(
     name='expectation_importance_sampler_logspace'):
   r"""Importance sampling with a positive function, in log-space.
 
-  With `p(z) := exp{log_p(z)}`, and `f(z) = exp{log_f(z)}`, this `Op`
-  returns
+  With `\\(p(z) := exp^{log_p(z)}\\)`, and `\\(f(z) = exp{log_f(z)}\\)`,
+  this `Op` returns
 
   ```
-  Log[ n^{-1} sum_{i=1}^n [ f(z_i) p(z_i) / q(z_i) ] ],  z_i ~ q,
-  \approx Log[ E_q[ f(Z) p(Z) / q(Z) ] ]
-  =       Log[E_p[f(Z)]]
+  \\(Log[ n^{-1} sum_{i=1}^n [ f(z_i) p(z_i) / q(z_i) ] ],  z_i ~ q,\\)
+  \\(\approx Log[ E_q[ f(Z) p(Z) / q(Z) ] ]\\)
+  \\(=       Log[E_p[f(Z)]]\\)
   ```
 
   This integral is done in log-space with max-subtraction to better handle the
@@ -196,12 +196,12 @@ def _logspace_mean(log_values):
 
 def expectation(f, samples, log_prob=None, use_reparametrization=True,
                 axis=0, keep_dims=False, name=None):
-  """Computes the Monte-Carlo approximation of `E_p[f(X)]`.
+  """Computes the Monte-Carlo approximation of `\\(E_p[f(X)]\\)`.
 
   This function computes the Monte-Carlo approximation of an expectation, i.e.,
 
   ```none
-  E_p[f(X)] approx= m**-1 sum_i^m f(x_j),  x_j ~iid p(X)
+  \\(E_p[f(X)] \approx= m^{-1} sum_i^m f(x_j),  x_j\  ~iid\ p(X)\\)
   ```
 
   where:
@@ -216,8 +216,8 @@ def expectation(f, samples, log_prob=None, use_reparametrization=True,
   parameterless distribution (e.g.,
   `Normal(Y; m, s) <=> Y = sX + m, X ~ Normal(0,1)`), we can swap gradient and
   expectation, i.e.,
-  `grad[ Avg{ s_i : i=1...n } ] = Avg{ grad[s_i] : i=1...n }` where
-  `S_n = Avg{s_i}` and `s_i = f(x_i), x_i ~ p`.
+  `grad[ Avg{ \\(s_i : i=1...n\\) } ] = Avg{ grad[\\(s_i\\)] : i=1...n }` where
+  `S_n = Avg{\\(s_i\\)}` and `\\(s_i = f(x_i), x_i ~ p\\)`.
 
   However, if p is not reparameterized, TensorFlow's gradient will be incorrect
   since the chain-rule stops at samples of non-reparameterized distributions.
@@ -296,7 +296,8 @@ def expectation(f, samples, log_prob=None, use_reparametrization=True,
   Args:
     f: Python callable which can return `f(samples)`.
     samples: `Tensor` of samples used to form the Monte-Carlo approximation of
-      `E_p[f(X)]`.  A batch of samples should be indexed by `axis` dimensions.
+      `\\(E_p[f(X)]\\)`.  A batch of samples should be indexed by `axis`
+      dimensions.
     log_prob: Python callable which can return `log_prob(samples)`. Must
       correspond to the natural-logarithm of the pdf/pmf of each sample. Only
       required/used if `use_reparametrization=False`.
@@ -316,7 +317,7 @@ def expectation(f, samples, log_prob=None, use_reparametrization=True,
 
   Returns:
     approx_expectation: `Tensor` corresponding to the Monte-Carlo approximation
-      of `E_p[f(X)]`.
+      of `\\(E_p[f(X)]\\)`.
 
   Raises:
     ValueError: if `f` is not a Python `callable`.
diff --git a/tensorflow/contrib/boosted_trees/BUILD b/tensorflow/contrib/boosted_trees/BUILD
index 6fdcd0f996ee011842a5add79f06264a28a2145c..8eac1243ef63dd09c5c5dad4bcd9bd7a15f58900 100644
--- a/tensorflow/contrib/boosted_trees/BUILD
+++ b/tensorflow/contrib/boosted_trees/BUILD
@@ -14,15 +14,6 @@ load("//tensorflow:tensorflow.bzl", "tf_gen_op_wrapper_py")
 load("//tensorflow:tensorflow.bzl", "tf_kernel_library")
 load("//tensorflow:tensorflow.bzl", "tf_custom_op_py_library")
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = ["**/OWNERS"],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 package_group(name = "friends")
 
 cc_library(
@@ -128,7 +119,7 @@ py_library(
 
 py_test(
     name = "gbdt_batch_test",
-    size = "small",
+    size = "medium",
     srcs = ["python/training/functions/gbdt_batch_test.py"],
     srcs_version = "PY2AND3",
     tags = [
diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/BUILD b/tensorflow/contrib/boosted_trees/estimator_batch/BUILD
index dcd235f876c87b4d7d85c0f1d0fc2e855ced99ea..17e20c4b315bab8852c90788567a2f2f92119f40 100644
--- a/tensorflow/contrib/boosted_trees/estimator_batch/BUILD
+++ b/tensorflow/contrib/boosted_trees/estimator_batch/BUILD
@@ -10,15 +10,6 @@ package(
 
 load("//tensorflow:tensorflow.bzl", "py_test")
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        include = ["**/*"],
-        exclude = ["**/OWNERS"],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 py_library(
     name = "init_py",
     srcs = ["__init__.py"],
diff --git a/tensorflow/contrib/boosted_trees/lib/BUILD b/tensorflow/contrib/boosted_trees/lib/BUILD
index 131bd48562a55a08981ac73277e93024db0d85d3..3028c2281705bd7e34b212332160d25386559d4e 100644
--- a/tensorflow/contrib/boosted_trees/lib/BUILD
+++ b/tensorflow/contrib/boosted_trees/lib/BUILD
@@ -15,17 +15,6 @@ load("//tensorflow:tensorflow.bzl", "py_test")
 load("//tensorflow:tensorflow.bzl", "tf_cc_test")
 load("//tensorflow:tensorflow.bzl", "tf_cc_binary")
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 # Utils
 
 cc_library(
diff --git a/tensorflow/contrib/boosted_trees/lib/utils/batch_features.h b/tensorflow/contrib/boosted_trees/lib/utils/batch_features.h
index da5e7448519cb7f4092f7bbbe1b526271008ec22..a3b1b013e3a40116f74d6ed2df78d87ed3a11ac7 100644
--- a/tensorflow/contrib/boosted_trees/lib/utils/batch_features.h
+++ b/tensorflow/contrib/boosted_trees/lib/utils/batch_features.h
@@ -48,9 +48,9 @@ class BatchFeatures {
   Status GetFeatureColumnSizes(int64* const num_dense_float_features,
                                int64* const num_sparse_float_features,
                                int64* const num_sparse_int_features) const {
-    QCHECK_NE(num_dense_float_features, nullptr);
-    QCHECK_NE(num_sparse_float_features, nullptr);
-    QCHECK_NE(num_sparse_int_features, nullptr);
+    QCHECK_NE(num_dense_float_features, static_cast<int64*>(nullptr));
+    QCHECK_NE(num_sparse_float_features, static_cast<int64*>(nullptr));
+    QCHECK_NE(num_sparse_int_features, static_cast<int64*>(nullptr));
     *num_dense_float_features = dense_float_feature_columns_.size();
     *num_sparse_float_features = sparse_float_feature_columns_.size();
     *num_sparse_int_features = sparse_int_feature_columns_.size();
diff --git a/tensorflow/contrib/boosted_trees/proto/BUILD b/tensorflow/contrib/boosted_trees/proto/BUILD
index 9a61e163eb5ff51dc75de4e40e0f43b090d03c0c..b07f0a4314246eea63764bb6d5e166dd720644fb 100644
--- a/tensorflow/contrib/boosted_trees/proto/BUILD
+++ b/tensorflow/contrib/boosted_trees/proto/BUILD
@@ -4,17 +4,6 @@ exports_files(["LICENSE"])
 
 load("//tensorflow/core:platform/default/build_config.bzl", "tf_proto_library")
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 tf_proto_library(
     name = "learner_proto",
     srcs = [
diff --git a/tensorflow/contrib/boosted_trees/resources/BUILD b/tensorflow/contrib/boosted_trees/resources/BUILD
index 9fc101612f1e2a6bf6c5d86ea8c7199936dbb069..c0651868453d40d57e842862855f89e6845c507f 100644
--- a/tensorflow/contrib/boosted_trees/resources/BUILD
+++ b/tensorflow/contrib/boosted_trees/resources/BUILD
@@ -9,17 +9,6 @@ package(
     ],
 )
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 cc_library(
     name = "stamped_resource",
     hdrs = ["stamped_resource.h"],
diff --git a/tensorflow/contrib/cloud/BUILD b/tensorflow/contrib/cloud/BUILD
index fe8bd072afd43a64fa62a65bd8900b5a98dbe761..f3a75e8688ece19a6e6fd53ee9faf7f4144d76cf 100644
--- a/tensorflow/contrib/cloud/BUILD
+++ b/tensorflow/contrib/cloud/BUILD
@@ -14,18 +14,6 @@ load(
     "tf_py_test",
 )
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 tf_gen_op_libs(
     op_lib_names = ["bigquery_reader_ops"],
     deps = [
diff --git a/tensorflow/contrib/cloud/kernels/BUILD b/tensorflow/contrib/cloud/kernels/BUILD
index 56f930a9a8d32c5c3a025163ef56c9562f17d864..ff46f0daa80a70badedf73e15bfaf4dca85fdd89 100644
--- a/tensorflow/contrib/cloud/kernels/BUILD
+++ b/tensorflow/contrib/cloud/kernels/BUILD
@@ -20,20 +20,6 @@ load(
     "tf_proto_library",
 )
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        include = [
-            "**/*",
-        ],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 tf_kernel_library(
     name = "bigquery_reader_ops",
     srcs = ["bigquery_reader_ops.cc"],
@@ -73,6 +59,7 @@ tf_cc_test(
     ],
     deps = [
         ":bigquery_table_accessor",
+        "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:test",
diff --git a/tensorflow/contrib/cloud/kernels/bigquery_table_accessor_test.cc b/tensorflow/contrib/cloud/kernels/bigquery_table_accessor_test.cc
index e9b79a066def566096d6c3f3745974423e3371d1..7416eb19d3324fad84876cde5353bc25bac8f648 100644
--- a/tensorflow/contrib/cloud/kernels/bigquery_table_accessor_test.cc
+++ b/tensorflow/contrib/cloud/kernels/bigquery_table_accessor_test.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include "tensorflow/core/example/feature.pb.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/gtl/stl_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/cloud/http_request_fake.h"
 #include "tensorflow/core/platform/test.h"
 
@@ -28,8 +29,8 @@ constexpr char kTestProject[] = "test-project";
 constexpr char kTestDataset[] = "test-dataset";
 constexpr char kTestTable[] = "test-table";
 
-bool HasSubstr(const string& base, const string& substr) {
-  bool ok = StringPiece(base).contains(substr);
+bool HasSubstr(StringPiece base, StringPiece substr) {
+  bool ok = str_util::StrContains(base, substr);
   EXPECT_TRUE(ok) << base << ", expected substring " << substr;
   return ok;
 }
diff --git a/tensorflow/contrib/cluster_resolver/BUILD b/tensorflow/contrib/cluster_resolver/BUILD
index 1a124eca364424b651de86bfaac6f33ad131804b..c239e6f8f960910cee14e1df7c4678c643496f54 100644
--- a/tensorflow/contrib/cluster_resolver/BUILD
+++ b/tensorflow/contrib/cluster_resolver/BUILD
@@ -10,19 +10,6 @@ package(
 
 licenses(["notice"])  # Apache 2.0
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        include = [
-            "**/*",
-        ],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-)
-
 py_library(
     name = "cluster_resolver_pip",
     srcs = [
diff --git a/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py b/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py
index 300b19733e2b4d1b912f966e94ae0286ed9c694d..5a2771229d9ffe2b5b389d1077fe02a230e9a4c0 100644
--- a/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py
+++ b/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py
@@ -61,11 +61,13 @@ class TPUClusterResolver(ClusterResolver):
       return False
     return True
 
-  def _inGke(self):
+  @staticmethod
+  def _inGke():
     """When running in GKE, the environment variable will be set."""
     return _GKE_ENV_VARIABLE in os.environ
 
-  def _gkeMaster(self):
+  @staticmethod
+  def _gkeMaster():
     return os.environ[_GKE_ENV_VARIABLE].split(',')[0]
 
   def __init__(self,
@@ -73,7 +75,7 @@ class TPUClusterResolver(ClusterResolver):
                zone=None,
                project=None,
                job_name='worker',
-               coordinator_name='coordinator',
+               coordinator_name=None,
                coordinator_address=None,
                credentials='default',
                service=None):
@@ -119,8 +121,9 @@ class TPUClusterResolver(ClusterResolver):
             'Using multiple TPUs in a single session is not yet implemented')
       tpu = tpu[0]
 
+    in_gke = self._inGke()
     # When using GKE with Cloud TPUs, the env variable will be set.
-    if tpu is None and self._inGke():
+    if tpu is None and in_gke:
       tpu = self._gkeMaster()
 
     self._tpu = compat.as_bytes(tpu)  # self._tpu is always bytes
@@ -158,7 +161,8 @@ class TPUClusterResolver(ClusterResolver):
       self._service = service
 
     self._coordinator_name = coordinator_name
-    if coordinator_name and not coordinator_address and should_resolve:
+    if coordinator_name and not coordinator_address and (should_resolve or
+                                                         in_gke):
       self._start_local_server()
     else:
       self._coordinator_address = coordinator_address
@@ -204,31 +208,50 @@ class TPUClusterResolver(ClusterResolver):
     Raises:
       RuntimeError: If the provided TPU is not healthy.
     """
-    if not self._shouldResolve():
-      return server_lib.ClusterSpec({})
-
-    full_name = 'projects/%s/locations/%s/nodes/%s' % (
-        self._project, self._zone, compat.as_text(self._tpu))
-    request = self._service.projects().locations().nodes().get(name=full_name)
-    response = request.execute()
-
-    if 'health' in response and response['health'] != 'HEALTHY':
-      raise RuntimeError('TPU "%s" is unhealthy: "%s"' % (self._tpu,
-                                                          response['health']))
-
-    if 'networkEndpoints' in response:
-      worker_list = [
-          '%s:%s' % (endpoint['ipAddress'], endpoint['port'])
-          for endpoint in response['networkEndpoints']
-      ]
+    ############################################################################
+    # There are 5 potential cases this code must handle:
+    #  1. [Normal case.] We should resolve the TPU name to a set of tasks, and
+    #      a. Create a ClusterSpec that includes the coordinator job
+    #      b. Create a ClusterSpec without the coordinator job.
+    #  2. [GKE / No API Access.] We should not resolve the TPU name to a set of
+    #     tasks and
+    #      a. Create a ClusterSpec with the coordinator
+    #      b. Create a ClusterSpec without the coordinator
+    #  3. [Other (legacy non-gRPC).] We should return an empty ClusterSpec.
+    ############################################################################
+
+    if self._shouldResolve():
+      # Case 1.
+      full_name = 'projects/%s/locations/%s/nodes/%s' % (
+          self._project, self._zone, compat.as_text(self._tpu))
+      request = self._service.projects().locations().nodes().get(name=full_name)
+      response = request.execute()
+
+      if 'health' in response and response['health'] != 'HEALTHY':
+        raise RuntimeError('TPU "%s" is unhealthy: "%s"' % (self._tpu,
+                                                            response['health']))
+
+      if 'networkEndpoints' in response:
+        worker_list = [
+            '%s:%s' % (endpoint['ipAddress'], endpoint['port'])
+            for endpoint in response['networkEndpoints']
+        ]
+      else:
+        # Fall back to the deprecated response format
+        instance_url = '%s:%s' % (response['ipAddress'], response['port'])
+        worker_list = [instance_url]
+
+      cluster_spec = {self._job_name: worker_list}
     else:
-      # Fall back to the deprecated response format
-      instance_url = '%s:%s' % (response['ipAddress'], response['port'])
-      worker_list = [instance_url]
-
-    cluster_spec = {self._job_name: worker_list}
+      if not self._tpu.startswith(compat.as_bytes('grpc://')):
+        # Case 3.
+        return server_lib.ClusterSpec({})
+      # Case 2.
+      cluster_spec = {self._job_name: [self._tpu[len(
+          compat.as_bytes('grpc://')):]]}
 
     if self._coordinator_address:
+      # {1, 2}.a
       cluster_spec[self._coordinator_name] = [self._coordinator_address]
 
     return server_lib.ClusterSpec(cluster_spec)
diff --git a/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver_test.py b/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver_test.py
index 48c3f6bb4f2d1643982e03d9ed68db14c10c184a..dff7a03b6847fb6e159dc2fa9832fceb3dfe2d54 100644
--- a/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver_test.py
+++ b/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver_test.py
@@ -117,7 +117,8 @@ class TPUClusterResolverTest(test.TestCase):
         zone=None,
         tpu=['test-tpu-1'],
         credentials=None,
-        service=self.mock_service_client(tpu_map=tpu_map))
+        service=self.mock_service_client(tpu_map=tpu_map),
+        coordinator_name='coordinator')
 
     actual_cluster_spec = tpu_cluster_resolver.cluster_spec()
     expected_proto = """
@@ -170,6 +171,7 @@ class TPUClusterResolverTest(test.TestCase):
         project='test-project',
         zone='us-central1-c',
         tpu=['test-tpu-1'],
+        coordinator_name='coordinator',
         coordinator_address='10.128.1.5:10203',
         credentials=None,
         service=self.mock_service_client(tpu_map=tpu_map))
@@ -196,6 +198,7 @@ class TPUClusterResolverTest(test.TestCase):
         project='test-project',
         zone='us-central1-c',
         tpu='test-tpu-1',
+        coordinator_name='coordinator',
         coordinator_address='10.128.1.5:10203',
         credentials=None,
         service=self.mock_service_client(tpu_map=tpu_map))
@@ -239,7 +242,8 @@ class TPUClusterResolverTest(test.TestCase):
     tpu_cluster_resolver = TPUClusterResolver(
         tpu='test-tpu-1',
         credentials=None,
-        service=self.mock_service_client(tpu_map=tpu_map))
+        service=self.mock_service_client(tpu_map=tpu_map),
+        coordinator_name='coordinator')
 
     actual_cluster_spec = tpu_cluster_resolver.cluster_spec()
     expected_proto = """
@@ -358,14 +362,10 @@ class TPUClusterResolverTest(test.TestCase):
   def testGkeEnvironment(self):
     os.environ['KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS'] = 'grpc://10.120.27.5:8470'
     self.assertTrue('KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS' in os.environ)
-    tpu_cluster_resolver = TPUClusterResolver()
-    self.assertTrue(tpu_cluster_resolver._inGke())
-    self.assertEqual(
-        compat.as_bytes('grpc://10.120.27.5:8470'),
-        compat.as_bytes(tpu_cluster_resolver._gkeMaster()))
+    self.assertTrue(TPUClusterResolver._inGke())
     self.assertEqual(
         compat.as_bytes('grpc://10.120.27.5:8470'),
-        compat.as_bytes(tpu_cluster_resolver.get_master()))
+        compat.as_bytes(TPUClusterResolver._gkeMaster()))
     del os.environ['KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS']
 
 
diff --git a/tensorflow/contrib/cmake/external/grpc.cmake b/tensorflow/contrib/cmake/external/grpc.cmake
index cc218e8ab8ce211a85aa3ece318558dd24049c83..35c2a294ecfa51516dcd3922b8a99b1b365de112 100644
--- a/tensorflow/contrib/cmake/external/grpc.cmake
+++ b/tensorflow/contrib/cmake/external/grpc.cmake
@@ -17,7 +17,7 @@ include (ExternalProject)
 set(GRPC_INCLUDE_DIRS ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/include)
 set(GRPC_URL https://github.com/grpc/grpc.git)
 set(GRPC_BUILD ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc)
-set(GRPC_TAG 575bda39755b98d1f7099406bb57a6e3b2074874)
+set(GRPC_TAG 09386db3939cae1ac12e5f09b735adfa8958c68e)
 
 if(WIN32)
   if(${CMAKE_GENERATOR} MATCHES "Visual Studio.*")
@@ -35,6 +35,7 @@ else()
   set(grpc_STATIC_LIBRARIES
       ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/libgrpc++_unsecure.a
       ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/libgrpc_unsecure.a
+      ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/libaddress_sorting.a
       ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/third_party/cares/cares/lib/libcares.a
       ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/libgpr.a)
 endif()
diff --git a/tensorflow/contrib/cmake/python_modules.txt b/tensorflow/contrib/cmake/python_modules.txt
index 112b690511cea1ad5f306af718a8e32995033cf6..de84af866ba1cc382004be3f78cba48dc7a71759 100644
--- a/tensorflow/contrib/cmake/python_modules.txt
+++ b/tensorflow/contrib/cmake/python_modules.txt
@@ -79,6 +79,7 @@ tensorflow/python/keras/_impl/keras/preprocessing
 tensorflow/python/keras/_impl/keras/utils
 tensorflow/python/keras/_impl/keras/wrappers
 tensorflow/python/kernel_tests
+tensorflow/python/kernel_tests/boosted_trees
 tensorflow/python/kernel_tests/distributions
 tensorflow/python/kernel_tests/linalg
 tensorflow/python/kernel_tests/random
@@ -103,6 +104,8 @@ tensorflow/python/user_ops
 tensorflow/python/util
 tensorflow/python/util/protobuf
 tensorflow/tools
+tensorflow/tools/api
+tensorflow/tools/api/generator
 tensorflow/tools/graph_transforms
 tensorflow/contrib
 tensorflow/contrib/all_reduce
@@ -159,6 +162,9 @@ tensorflow/contrib/data/python/ops
 tensorflow/contrib/decision_trees
 tensorflow/contrib/decision_trees/proto
 tensorflow/contrib/deprecated
+tensorflow/contrib/distribute
+tensorflow/contrib/distribute/python
+tensorflow/contrib/distribute/python/examples
 tensorflow/contrib/distributions
 tensorflow/contrib/distributions/python
 tensorflow/contrib/distributions/python/ops
@@ -331,6 +337,7 @@ tensorflow/contrib/nccl/kernels
 tensorflow/contrib/nccl/ops
 tensorflow/contrib/nccl/python
 tensorflow/contrib/nccl/python/ops
+tensorflow/contrib/nearest_neighbor
 tensorflow/contrib/nearest_neighbor/kernels
 tensorflow/contrib/nearest_neighbor/ops
 tensorflow/contrib/nearest_neighbor/python
@@ -341,6 +348,7 @@ tensorflow/contrib/nn/python/ops
 tensorflow/contrib/opt
 tensorflow/contrib/opt/python
 tensorflow/contrib/opt/python/training
+tensorflow/contrib/optimizer_v2
 tensorflow/contrib/pi_examples
 tensorflow/contrib/pi_examples/camera
 tensorflow/contrib/pi_examples/label_image
@@ -349,6 +357,9 @@ tensorflow/contrib/periodic_resample
 tensorflow/contrib/periodic_resample/python
 tensorflow/contrib/periodic_resample/python/ops
 tensorflow/contrib/predictor
+tensorflow/contrib/proto
+tensorflow/contrib/proto/python
+tensorflow/contrib/proto/python/ops
 tensorflow/contrib/quantization
 tensorflow/contrib/quantization/python
 tensorflow/contrib/quantize
@@ -357,6 +368,10 @@ tensorflow/contrib/receptive_field
 tensorflow/contrib/receptive_field/python
 tensorflow/contrib/receptive_field/python/util
 tensorflow/contrib/receptive_field/python/util/examples
+tensorflow/contrib/recurrent
+tensorflow/contrib/recurrent/python
+tensorflow/contrib/recurrent/python/ops
+tensorflow/contrib/recurrent/python/kernel_tests
 tensorflow/contrib/reduce_slice_ops
 tensorflow/contrib/reduce_slice_ops/kernels
 tensorflow/contrib/reduce_slice_ops/ops
@@ -377,6 +392,9 @@ tensorflow/contrib/rnn/ops
 tensorflow/contrib/rnn/python
 tensorflow/contrib/rnn/python/kernel_tests
 tensorflow/contrib/rnn/python/ops
+tensorflow/contrib/rpc
+tensorflow/contrib/rpc/python
+tensorflow/contrib/rpc/python/ops
 tensorflow/contrib/saved_model
 tensorflow/contrib/saved_model/python
 tensorflow/contrib/saved_model/python/saved_model
diff --git a/tensorflow/contrib/cmake/python_protos.txt b/tensorflow/contrib/cmake/python_protos.txt
index c03c0c80fe62a4f95d0fcf240ee25725a19d86f0..0c80d529af5230ed6d36b265e12ee4b749a14ec4 100644
--- a/tensorflow/contrib/cmake/python_protos.txt
+++ b/tensorflow/contrib/cmake/python_protos.txt
@@ -1,4 +1,5 @@
 tensorflow/core
+tensorflow/core/kernels/boosted_trees
 tensorflow/core/profiler
 tensorflow/python
 tensorflow/contrib/boosted_trees/proto
diff --git a/tensorflow/contrib/cmake/tf_core_framework.cmake b/tensorflow/contrib/cmake/tf_core_framework.cmake
index a1c320347fe60f87806736befc677541a93e7e93..bcfb4f08196e235864536b48ed0d73975b086629 100644
--- a/tensorflow/contrib/cmake/tf_core_framework.cmake
+++ b/tensorflow/contrib/cmake/tf_core_framework.cmake
@@ -276,7 +276,7 @@ add_custom_command(OUTPUT __force_rebuild COMMAND ${CMAKE_COMMAND} -E echo)
 add_custom_command(OUTPUT
     ${VERSION_INFO_CC}
     COMMAND ${PYTHON_EXECUTABLE} ${tensorflow_source_dir}/tensorflow/tools/git/gen_git_source.py
-    --raw_generate ${VERSION_INFO_CC}
+    ARGS --raw_generate ${VERSION_INFO_CC} --source_dir ${tensorflow_source_dir}
     DEPENDS __force_rebuild)
 set(tf_version_srcs ${tensorflow_source_dir}/tensorflow/core/util/version_info.cc)
 
diff --git a/tensorflow/contrib/cmake/tf_core_ops.cmake b/tensorflow/contrib/cmake/tf_core_ops.cmake
index d6712aa2b48795bb6faf5153c9a8774a7d8bf3c1..e558691de4b74988031f7b2204aad92e8c7af68b 100644
--- a/tensorflow/contrib/cmake/tf_core_ops.cmake
+++ b/tensorflow/contrib/cmake/tf_core_ops.cmake
@@ -15,8 +15,9 @@
 set(tf_op_lib_names
     "audio_ops"
     "array_ops"
-		"batch_ops"
+    "batch_ops"
     "bitwise_ops"
+    "boosted_trees_ops"
     "candidate_sampling_ops"
     "checkpoint_ops"
     "control_flow_ops"
@@ -24,11 +25,13 @@ set(tf_op_lib_names
     "cudnn_rnn_ops"
     "data_flow_ops"
     "dataset_ops"
+    "decode_proto_ops"
+    "encode_proto_ops"
     "functional_ops"
     "image_ops"
     "io_ops"
     "linalg_ops"
-		"list_ops"
+    "list_ops"
     "lookup_ops"
     "logging_ops"
     "manip_ops"
@@ -39,6 +42,7 @@ set(tf_op_lib_names
     "random_ops"
     "remote_fused_graph_ops"
     "resource_variable_ops"
+    "rpc_ops"
     "script_ops"
     "sdca_ops"
     "set_ops"
@@ -48,7 +52,7 @@ set(tf_op_lib_names
     "state_ops"
     "stateless_random_ops"
     "string_ops"
-		"summary_ops"
+    "summary_ops"
     "training_ops"
 )
 
diff --git a/tensorflow/contrib/cmake/tf_python.cmake b/tensorflow/contrib/cmake/tf_python.cmake
index 31e715b654c8baa53e25f54b4854c94e80c88049..ded15b4b66b614482954bedb5ef8726bf7881f9c 100755
--- a/tensorflow/contrib/cmake/tf_python.cmake
+++ b/tensorflow/contrib/cmake/tf_python.cmake
@@ -319,6 +319,7 @@ GENERATE_PYTHON_OP_LIB("audio_ops")
 GENERATE_PYTHON_OP_LIB("array_ops")
 GENERATE_PYTHON_OP_LIB("batch_ops")
 GENERATE_PYTHON_OP_LIB("bitwise_ops")
+GENERATE_PYTHON_OP_LIB("boosted_trees_ops")
 GENERATE_PYTHON_OP_LIB("math_ops")
 GENERATE_PYTHON_OP_LIB("functional_ops")
 GENERATE_PYTHON_OP_LIB("candidate_sampling_ops")
@@ -329,6 +330,8 @@ GENERATE_PYTHON_OP_LIB("ctc_ops")
 GENERATE_PYTHON_OP_LIB("cudnn_rnn_ops")
 GENERATE_PYTHON_OP_LIB("data_flow_ops")
 GENERATE_PYTHON_OP_LIB("dataset_ops")
+GENERATE_PYTHON_OP_LIB("decode_proto_ops")
+GENERATE_PYTHON_OP_LIB("encode_proto_ops")
 GENERATE_PYTHON_OP_LIB("image_ops")
 GENERATE_PYTHON_OP_LIB("io_ops")
 GENERATE_PYTHON_OP_LIB("linalg_ops")
@@ -342,6 +345,7 @@ GENERATE_PYTHON_OP_LIB("random_ops")
 GENERATE_PYTHON_OP_LIB("remote_fused_graph_ops"
   DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/remote_fused_graph/pylib/python/ops/gen_remote_fused_graph_ops.py)
 GENERATE_PYTHON_OP_LIB("resource_variable_ops")
+GENERATE_PYTHON_OP_LIB("rpc_ops")
 GENERATE_PYTHON_OP_LIB("script_ops")
 GENERATE_PYTHON_OP_LIB("sdca_ops")
 GENERATE_PYTHON_OP_LIB("set_ops")
@@ -473,6 +477,8 @@ set (pywrap_tensorflow_internal_src
     "${tensorflow_source_dir}/tensorflow/python/lib/core/ndarray_tensor_bridge.cc"
     "${tensorflow_source_dir}/tensorflow/python/lib/core/py_func.h"
     "${tensorflow_source_dir}/tensorflow/python/lib/core/py_func.cc"
+    "${tensorflow_source_dir}/tensorflow/python/lib/core/py_exception_registry.h"
+    "${tensorflow_source_dir}/tensorflow/python/lib/core/py_exception_registry.cc"
     "${tensorflow_source_dir}/tensorflow/python/lib/core/py_seq_tensor.h"
     "${tensorflow_source_dir}/tensorflow/python/lib/core/py_seq_tensor.cc"
     "${tensorflow_source_dir}/tensorflow/python/lib/core/py_util.h"
@@ -683,6 +689,77 @@ AddUserOps(TARGET _beam_search_ops
     DEPENDS pywrap_tensorflow_internal tf_python_ops
     DISTCOPY ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/seq2seq/python/ops/)
 
+if(WIN32)
+  if(${CMAKE_GENERATOR} MATCHES "Visual Studio.*")
+    add_custom_command(TARGET pywrap_tensorflow_internal POST_BUILD
+      COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/$(Configuration)/pywrap_tensorflow_internal.dll
+                                       ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/_pywrap_tensorflow_internal.pyd
+      COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/$(Configuration)/pywrap_tensorflow_internal.lib
+                                       ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/)
+  else()
+    add_custom_command(TARGET pywrap_tensorflow_internal POST_BUILD
+      COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/pywrap_tensorflow_internal.dll
+                                       ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/_pywrap_tensorflow_internal.pyd
+      COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/pywrap_tensorflow_internal.lib
+                                       ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/)
+  endif()
+else()
+  add_custom_command(TARGET pywrap_tensorflow_internal POST_BUILD
+    COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/libpywrap_tensorflow_internal.so
+                                     ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/_pywrap_tensorflow_internal.so)
+endif()
+
+
+########################################################
+# Generate API __init__.py files.
+########################################################
+
+# Parse tensorflow/tools/api/generator/BUILD to get list of generated files.
+FILE(READ ${tensorflow_source_dir}/tensorflow/tools/api/generator/BUILD api_generator_BUILD_text)
+STRING(REGEX MATCH "# BEGIN GENERATED FILES.*# END GENERATED FILES" api_init_files_text ${api_generator_BUILD_text})
+string(REPLACE "# BEGIN GENERATED FILES" "" api_init_files_text ${api_init_files_text})
+string(REPLACE "# END GENERATED FILES" "" api_init_files_text ${api_init_files_text})
+string(REPLACE "," ";" api_init_files_list ${api_init_files_text})
+
+set(api_init_files "")
+foreach(api_init_file ${api_init_files_list})
+    string(STRIP "${api_init_file}" api_init_file)
+    if(api_init_file)
+        string(REPLACE "\"" "" api_init_file "${api_init_file}")  # Remove quotes
+        list(APPEND api_init_files "${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/tools/api/generator/${api_init_file}")
+    endif()
+endforeach(api_init_file)
+set(api_init_list_file "${tensorflow_source_dir}/api_init_files_list.txt")
+file(WRITE "${api_init_list_file}" "${api_init_files}")
+
+# Run create_python_api.py to generate __init__.py files.
+add_custom_command(
+      OUTPUT ${api_init_files}
+      DEPENDS tf_python_ops tf_python_copy_scripts_to_destination pywrap_tensorflow_internal tf_python_touchup_modules tf_extension_ops
+
+      # tensorflow/__init__.py depends on files generated in this step. So, remove it while
+      # this step is running since the files aren't there yet.
+      COMMAND ${CMAKE_COMMAND} -E rename ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/__init__.py
+                                         ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/final.__init__.py
+      COMMAND ${CMAKE_COMMAND} -E touch ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/__init__.py
+
+      # Run create_python_api.py to generate API init files.
+      COMMAND ${CMAKE_COMMAND} -E env PYTHONPATH=${CMAKE_CURRENT_BINARY_DIR}/tf_python ${PYTHON_EXECUTABLE}
+              "${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/tools/api/generator/create_python_api.py" "${api_init_list_file}"
+
+      # Re-add tensorflow/__init__.py back.
+      COMMAND ${CMAKE_COMMAND} -E remove -f ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/__init__.py
+      COMMAND ${CMAKE_COMMAND} -E rename ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/final.__init__.py
+                                         ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/__init__.py
+
+      COMMENT "Generating __init__.py files for Python API."
+      WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/tf_python"
+)
+
+add_custom_target(tf_python_api SOURCES ${api_init_files})
+add_dependencies(tf_python_api tf_python_ops)
+
+
 ############################################################
 # Build a PIP package containing the TensorFlow runtime.
 ############################################################
@@ -692,6 +769,7 @@ add_dependencies(tf_python_build_pip_package
     tf_python_copy_scripts_to_destination
     tf_python_touchup_modules
     tf_python_ops
+    tf_python_api
     tf_extension_ops)
 
 # Fix-up Python files that were not included by the add_python_module() macros.
@@ -704,25 +782,6 @@ add_custom_command(TARGET tf_python_copy_scripts_to_destination PRE_BUILD
   COMMAND ${CMAKE_COMMAND} -E copy ${tensorflow_source_dir}/tensorflow/contrib/testing/python/framework/util_test.py
                                    ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/testing/python/framework/)
 
-if(WIN32)
-  if(${CMAKE_GENERATOR} MATCHES "Visual Studio.*")
-    add_custom_command(TARGET tf_python_build_pip_package POST_BUILD
-      COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/$(Configuration)/pywrap_tensorflow_internal.dll
-                                       ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/_pywrap_tensorflow_internal.pyd
-      COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/$(Configuration)/pywrap_tensorflow_internal.lib
-                                       ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/)
-  else()
-    add_custom_command(TARGET tf_python_build_pip_package POST_BUILD
-      COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/pywrap_tensorflow_internal.dll
-                                       ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/_pywrap_tensorflow_internal.pyd
-      COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/pywrap_tensorflow_internal.lib
-                                       ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/)
-  endif()
-else()
-  add_custom_command(TARGET tf_python_build_pip_package POST_BUILD
-    COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/libpywrap_tensorflow_internal.so
-                                     ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/_pywrap_tensorflow_internal.so)
-endif()
 add_custom_command(TARGET tf_python_build_pip_package POST_BUILD
   COMMAND ${CMAKE_COMMAND} -E copy ${tensorflow_source_dir}/tensorflow/tools/pip_package/README
                                    ${CMAKE_CURRENT_BINARY_DIR}/tf_python/)
diff --git a/tensorflow/contrib/cmake/tf_tests.cmake b/tensorflow/contrib/cmake/tf_tests.cmake
index b86a8f1ec236d820c2c8bbfec059d8eaed851c59..92f2ab6dea8e7da5dd8481639eda24e31c06848f 100644
--- a/tensorflow/contrib/cmake/tf_tests.cmake
+++ b/tensorflow/contrib/cmake/tf_tests.cmake
@@ -284,6 +284,7 @@ if (tensorflow_BUILD_PYTHON_TESTS)
       "${tensorflow_source_dir}/tensorflow/python/data/kernel_tests/iterator_ops_cluster_test.py"
       "${tensorflow_source_dir}/tensorflow/contrib/data/python/kernel_tests/interleave_dataset_op_test.py"  # Deadlocks
       "${tensorflow_source_dir}/tensorflow/contrib/data/python/kernel_tests/sloppy_transformation_dataset_op_test.py"  # b/65430561
+      "${tensorflow_source_dir}/tensorflow/contrib/data/python/kernel_tests/prefetching_ops_test.py"  # Segfaults on Windows.
       # tensor_forest tests (also note that we exclude the hybrid tests for now)
       "${tensorflow_source_dir}/tensorflow/contrib/tensor_forest/python/kernel_tests/count_extremely_random_stats_op_test.py"  # Results in wrong order.
       "${tensorflow_source_dir}/tensorflow/contrib/tensor_forest/python/kernel_tests/sample_inputs_op_test.py"  # Results in wrong order.
diff --git a/tensorflow/contrib/coder/BUILD b/tensorflow/contrib/coder/BUILD
index ec3d550b70d2aaa23b989c44f3d86fa87cffb335..ce12e38248785987e51befa47d04143e235554fe 100644
--- a/tensorflow/contrib/coder/BUILD
+++ b/tensorflow/contrib/coder/BUILD
@@ -154,14 +154,3 @@ tf_py_test(
     ],
     main = "python/ops/coder_ops_test.py",
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-)
diff --git a/tensorflow/contrib/compiler/BUILD b/tensorflow/contrib/compiler/BUILD
index 388d8e6ed6d9cb9400b0bfbe8e3f50b80149ea1a..bcee0b04c8430588c2dcbc199504bede0436f8f1 100644
--- a/tensorflow/contrib/compiler/BUILD
+++ b/tensorflow/contrib/compiler/BUILD
@@ -46,15 +46,3 @@ cuda_py_test(
     ],
     xla_enabled = True,
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/copy_graph/BUILD b/tensorflow/contrib/copy_graph/BUILD
index 8ec706df74e2c91345c4bf7a506fdb424a996773..fa44c4d54e1ee871feb425115525b1cf8b732214 100644
--- a/tensorflow/contrib/copy_graph/BUILD
+++ b/tensorflow/contrib/copy_graph/BUILD
@@ -41,15 +41,3 @@ py_test(
         "//third_party/py/numpy",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/copy_graph/python/util/copy_elements.py b/tensorflow/contrib/copy_graph/python/util/copy_elements.py
index b806799202bff4f2f6dbf717fbeea74a04b8cd6e..102bc460fdadb0ad5dc9a2960b8655c55357108e 100644
--- a/tensorflow/contrib/copy_graph/python/util/copy_elements.py
+++ b/tensorflow/contrib/copy_graph/python/util/copy_elements.py
@@ -201,7 +201,7 @@ def copy_op_to_graph(org_instance, to_graph, variables, scope=''):
     #An instance of tensorflow.core.framework.node_def_pb2.NodeDef, it
     #stores String-based info such as name, device and type of the op.
     #Unique to every Operation instance.
-    new_node_def = deepcopy(op._node_def)
+    new_node_def = deepcopy(op.node_def)
     #Change the name
     new_node_def.name = new_name
 
@@ -211,7 +211,7 @@ def copy_op_to_graph(org_instance, to_graph, variables, scope=''):
 
     #Make a copy of the op_def too.
     #Its unique to every _type_ of Operation.
-    op_def = deepcopy(op._op_def)
+    op_def = deepcopy(op.op_def)
 
     #Initialize a new Operation instance
     new_op = ops.Operation(new_node_def, to_graph, new_inputs, output_types,
diff --git a/tensorflow/contrib/crf/BUILD b/tensorflow/contrib/crf/BUILD
index 7aad4abdb908d0284b85137bff842bd0f38d09c6..5c1a17df4f95f3c4d05b286de0e3d7b009a76bd7 100644
--- a/tensorflow/contrib/crf/BUILD
+++ b/tensorflow/contrib/crf/BUILD
@@ -40,15 +40,3 @@ cuda_py_tests(
         "//tensorflow/python:platform_test",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/cudnn_rnn/BUILD b/tensorflow/contrib/cudnn_rnn/BUILD
index fa86ad38c975a95171883adba152e32cd3905082..d68015ae1565b778b1ba0744f515d09007175e93 100644
--- a/tensorflow/contrib/cudnn_rnn/BUILD
+++ b/tensorflow/contrib/cudnn_rnn/BUILD
@@ -25,6 +25,7 @@ tf_custom_op_py_library(
     srcs_version = "PY2AND3",
     visibility = ["//visibility:public"],
     deps = [
+        "//tensorflow/contrib/eager/python:checkpointable_utils",
         "//tensorflow/contrib/util:util_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:control_flow_ops",
@@ -123,15 +124,3 @@ cuda_py_test(
         "requires_cudnn5",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_test.py b/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_test.py
index 9897c31a98e0b335c18a84825fc518ed1fc310a2..9cc6ca09ad6c58a6cdc5909ec755ccdd49424872 100644
--- a/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_test.py
+++ b/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_test.py
@@ -19,6 +19,7 @@ from __future__ import print_function
 
 import argparse
 import collections
+import functools
 import itertools
 import os
 import sys
@@ -28,13 +29,14 @@ import numpy as np
 
 from tensorflow.contrib.cudnn_rnn.python.layers import cudnn_rnn
 from tensorflow.contrib.cudnn_rnn.python.ops import cudnn_rnn_ops
+from tensorflow.contrib.eager.python import checkpointable_utils
 from tensorflow.contrib.rnn.python.ops import rnn as contrib_rnn_lib
 from tensorflow.python.eager import backprop
 from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
-from tensorflow.python.framework.test_util import TensorFlowTestCase
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gen_nn_ops
@@ -265,7 +267,7 @@ def _CreateCudnnCompatibleCanonicalRNN(rnn, inputs, is_bidi=False, scope=None):
     return outputs, (output_state_fw, output_state_bw)
 
 
-class CudnnRNNTestBasic(TensorFlowTestCase):
+class CudnnRNNTestBasic(test_util.TensorFlowTestCase):
 
   @unittest.skipUnless(test.is_built_with_cuda(),
                        "Test only applicable when running on GPUs")
@@ -467,7 +469,7 @@ class CudnnRNNTestBasic(TensorFlowTestCase):
 
 # TODO(jamesqin): Transform to parameterized test after it is included in the
 # TF open source codebase.
-class CudnnRNNTestSaveRestore(TensorFlowTestCase):
+class CudnnRNNTestSaveRestore(test_util.TensorFlowTestCase):
 
   def _CompareWeights(self, lhs, rhs):
     self.assertEqual(len(lhs), len(rhs))
@@ -701,9 +703,146 @@ class CudnnRNNTestSaveRestore(TensorFlowTestCase):
     self._TestSaveRestoreHelper(CUDNN_RNN_RELU)
 
 
+class CudnnRNNTestSaveRestoreCheckpointable(test_util.TensorFlowTestCase):
+
+  def _VerifyCheckpoint(
+      self, checkpoint_path, compatible_cell_fn, cudnn_cell_fn,
+      num_layers, input_size, expected_variable_values, num_applications=3):
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    with ops.device("gpu:0"):
+      cudnn_layer = cudnn_cell_fn()
+      cudnn_checkpoint = checkpointable_utils.Checkpoint(cell=cudnn_layer)
+      status = cudnn_checkpoint.restore(checkpoint_path)
+      inputs = 3. * array_ops.ones([num_applications, num_layers, input_size],
+                                   dtype=dtypes.float32)
+      cudnn_output, _ = cudnn_layer(inputs)
+      status.assert_consumed().run_restore_ops()
+    second_save_path = cudnn_checkpoint.save(checkpoint_prefix)
+    restore_layer = compatible_cell_fn()
+    restore_layer_checkpoint = checkpointable_utils.Checkpoint(
+        cell=restore_layer)
+    status = restore_layer_checkpoint.restore(second_save_path)
+    current_state = restore_layer.zero_state(1, dtypes.float32)
+    for _ in range(num_applications):
+      restore_layer_output, current_state = restore_layer(
+          inputs=3. * array_ops.ones([1, input_size]),
+          state=current_state)
+    status.assert_consumed().run_restore_ops()
+    self.assertTrue(restore_layer.variables)
+    for variable, expected_value in zip(
+        restore_layer.variables, expected_variable_values):
+      self.assertAllClose(expected_value, self.evaluate(variable))
+    self.assertAllClose(self.evaluate(restore_layer_output),
+                        self.evaluate(cudnn_output)[-1, -1:, ...])
+
+  def _CheckpointableSingleCellUnidirectionalTestTemplate(
+      self, single_cell_fn, cudnn_cell_fn):
+    # Single-layer cuDNN cells with object-based checkpointing should be
+    # checkpoint compatible with either single CudnnCompatible cells or
+    # MultiRnnCells with one cell.
+    input_size = 3
+    save_cell_layer = single_cell_fn()
+    save_cell_layer(
+        inputs=array_ops.ones([1, input_size]),
+        state=save_cell_layer.zero_state(1, dtypes.float32))
+    self.assertTrue(save_cell_layer.variables)
+    expected_values = []
+    np.random.seed(10)
+    for variable in save_cell_layer.variables:
+      value = np.random.normal(size=variable.shape)
+      expected_values.append(value)
+      self.evaluate(variable.assign(value))
+    save_checkpoint = checkpointable_utils.Checkpoint(cell=save_cell_layer)
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    first_save_path = save_checkpoint.save(checkpoint_prefix)
+    self._VerifyCheckpoint(
+        checkpoint_path=first_save_path,
+        compatible_cell_fn=
+        lambda: rnn_cell_impl.MultiRNNCell([single_cell_fn()]),
+        cudnn_cell_fn=cudnn_cell_fn,
+        num_layers=1,
+        expected_variable_values=expected_values,
+        input_size=input_size)
+
+  @unittest.skipUnless(test.is_built_with_cuda(),
+                       "Test only applicable when running on GPUs")
+  @test_util.run_in_graph_and_eager_modes()
+  def testLSTMCheckpointableSingleLayer(self):
+    num_units = 2
+    direction = CUDNN_RNN_UNIDIRECTION
+    self._CheckpointableSingleCellUnidirectionalTestTemplate(
+        single_cell_fn=functools.partial(
+            cudnn_rnn_ops.CudnnCompatibleLSTMCell, num_units=num_units),
+        cudnn_cell_fn=functools.partial(
+            cudnn_rnn.CudnnLSTM, num_layers=1, num_units=num_units,
+            direction=direction, name="awesome_lstm"))
+
+  @unittest.skipUnless(test.is_built_with_cuda(),
+                       "Test only applicable when running on GPUs")
+  @test_util.run_in_graph_and_eager_modes()
+  def testGRUCheckpointableSingleLayer(self):
+    num_units = 2
+    direction = CUDNN_RNN_UNIDIRECTION
+    with self.assertRaises(NotImplementedError):
+      # TODO(allenl): Implement object-based saving for GRUs and other cells.
+      self._CheckpointableSingleCellUnidirectionalTestTemplate(
+          single_cell_fn=functools.partial(
+              cudnn_rnn_ops.CudnnCompatibleGRUCell, num_units=num_units),
+          cudnn_cell_fn=functools.partial(
+              cudnn_rnn.CudnnGRU, num_layers=1, num_units=num_units,
+              direction=direction, name="awesome_gru"))
+
+  def _CheckpointableMultiLayerTestTemplate(
+      self, single_cell_fn, cudnn_cell_fn, num_layers):
+
+    def _MultiCellFn():
+      return rnn_cell_impl.MultiRNNCell(
+          [single_cell_fn() for _ in range(num_layers)])
+    input_size = 3
+    save_graph = ops.Graph()
+    with save_graph.as_default(), self.test_session(graph=save_graph):
+      save_layer = _MultiCellFn()
+      save_layer(inputs=array_ops.ones([1, input_size]),
+                 state=save_layer.zero_state(1, dtypes.float32))
+      self.assertTrue(save_layer.variables)
+      expected_values = []
+      np.random.seed(10)
+      for variable in save_layer.variables:
+        value = np.random.normal(size=variable.shape)
+        expected_values.append(value)
+        self.evaluate(variable.assign(value))
+      save_checkpoint = checkpointable_utils.Checkpoint(cell=save_layer)
+      checkpoint_directory = self.get_temp_dir()
+      checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+      first_save_path = save_checkpoint.save(checkpoint_prefix)
+    self._VerifyCheckpoint(
+        checkpoint_path=first_save_path,
+        compatible_cell_fn=_MultiCellFn, cudnn_cell_fn=cudnn_cell_fn,
+        num_layers=num_layers,
+        expected_variable_values=expected_values,
+        input_size=input_size)
+
+  @unittest.skipUnless(test.is_built_with_cuda(),
+                       "Test only applicable when running on GPUs")
+  @test_util.run_in_graph_and_eager_modes()
+  def testCudnnCompatibleLSTMCheckpointablMultiLayer(self):
+    num_units = 2
+    num_layers = 3
+    direction = CUDNN_RNN_UNIDIRECTION
+    self._CheckpointableMultiLayerTestTemplate(
+        single_cell_fn=functools.partial(
+            cudnn_rnn_ops.CudnnCompatibleLSTMCell, num_units=num_units),
+        cudnn_cell_fn=functools.partial(
+            cudnn_rnn.CudnnLSTM, num_layers=num_layers, num_units=num_units,
+            direction=direction, name="awesome_lstm"),
+        num_layers=num_layers)
+
+
 # TODO(jamesqin): Transform to parameterized test after it is included in the
 # TF open source codebase.
-class CudnnRNNTestCompatibleRNNCells(TensorFlowTestCase):
+class CudnnRNNTestCompatibleRNNCells(test_util.TensorFlowTestCase):
 
   @unittest.skipUnless(test.is_built_with_cuda(),
                        "Test only applicable when running on GPUs")
@@ -884,7 +1023,7 @@ class CudnnRNNTestCompatibleRNNCells(TensorFlowTestCase):
                               rtol=2e-5)
 
 
-class CudnnRNNTestParamsSize(TensorFlowTestCase):
+class CudnnRNNTestParamsSize(test_util.TensorFlowTestCase):
 
   def _TestOpaqueParamsSize(self, rnn_mode, num_layers, num_units, input_size,
                             dtype, direction):
@@ -931,7 +1070,7 @@ class CudnnRNNTestParamsSize(TensorFlowTestCase):
                                    dtype, direction)
 
 
-class CudnnRNNTestTraining(TensorFlowTestCase):
+class CudnnRNNTestTraining(test_util.TensorFlowTestCase):
 
   def _ComputeNumericGrad(self, sess, y, x, delta=1e-4, step=1):
     """Compute the numeric gradient of y wrt to x.
diff --git a/tensorflow/contrib/cudnn_rnn/python/layers/cudnn_rnn.py b/tensorflow/contrib/cudnn_rnn/python/layers/cudnn_rnn.py
index 36fba917a8f56c26fd5b4c3468d1d980a8ba2ba5..00d9544602ae4472cd04f04f544396b9814302fe 100644
--- a/tensorflow/contrib/cudnn_rnn/python/layers/cudnn_rnn.py
+++ b/tensorflow/contrib/cudnn_rnn/python/layers/cudnn_rnn.py
@@ -142,6 +142,9 @@ class _CudnnRNN(base_layer.Layer):
   """
   # pylint:enable=line-too-long
 
+  # TODO(allenl): Document object-based saving and checkpoint compatibility once
+  # it's implemented for more cuDNN Layers.
+
   # The following are constants defined by subclasses.
   # Type of RNN cell.
   _rnn_mode = None
@@ -363,6 +366,11 @@ class _CudnnRNN(base_layer.Layer):
       self._create_saveable()
     self.built = True
 
+  def _gather_saveables_for_checkpoint(self):
+    raise NotImplementedError(
+        "This cell does not yet support object-based saving. File a feature "
+        "request if this limitation bothers you.")
+
   def call(self, inputs, initial_state=None, training=True):
     """Runs the forward step for the RNN model.
 
@@ -499,6 +507,8 @@ class _CudnnRNN(base_layer.Layer):
         direction=self.direction,
         scope=vs.get_variable_scope(),
         name="%s_saveable" % self.trainable_variables[0].name.split(":")[0])
+    self._saveable._add_checkpointable_dependencies(  # pylint: disable=protected-access
+        checkpointable=self, dtype=self._plain_dtype)
     ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, self._saveable)
 
 
@@ -521,6 +531,16 @@ class CudnnLSTM(_CudnnRNN):
     return ([self.num_layers * self.num_dirs, batch_size, self.num_units],
             [self.num_layers * self.num_dirs, batch_size, self.num_units])
 
+  @property
+  def _gather_saveables_for_checkpoint(self):
+    if self._direction == CUDNN_RNN_UNIDIRECTION:
+      # Skip one inheritance level to avoid NotImplementedError.
+      return super(_CudnnRNN, self)._gather_saveables_for_checkpoint
+    else:
+      raise NotImplementedError(
+          "Object-based saving does not currently support bidirectional LSTM "
+          "cells. File a feature request if this limitation bothers you.")
+
 
 class _CudnnRNNNoInputC(_CudnnRNN):
   """Abstract simple CudnnRNN layer without input_c."""
diff --git a/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py b/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py
index 622241a1774545529a4cdcb974333b53c8f56caa..c28c3a18e40d85c054e3dd8603fc997c775ccc5a 100644
--- a/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py
+++ b/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py
@@ -17,12 +17,13 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.contrib.eager.python import checkpointable_utils
 from tensorflow.contrib.rnn.python.ops import lstm_ops
 from tensorflow.python.framework import common_shapes
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
-from tensorflow.python.layers import base as base_layer
+from tensorflow.python.keras._impl.keras.engine import base_layer
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_cudnn_rnn_ops
 from tensorflow.python.ops import init_ops
@@ -31,6 +32,7 @@ from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import rnn_cell_impl
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope as vs
+from tensorflow.python.training import checkpointable as checkpointable_lib
 from tensorflow.python.training import saver
 
 CUDNN_RNN_UNIDIRECTION = "unidirectional"
@@ -86,19 +88,23 @@ class CudnnCompatibleGRUCell(rnn_cell_impl.GRUCell):
 
   Cudnn compatible GRU (from Cudnn library user guide):
   ```python
-  r_t = sigma(x_t * W_r + h_t-1 * R_h + b_Wr + b_Rr)  # reset gate
-  u_t = sigma(x_t * W_u + h_t-1 * R_u + b_Wu + b_Ru)  # update gate
-  h'_t = tanh(x_t * W_h + r_t .* (h_t-1 * R_h + b_Rh) + b_Wh)  # new memory gate
-  h_t = (1 - u_t) .* h'_t + u_t .* h_t-1
+  # reset gate
+  $$r_t = \sigma(x_t * W_r + h_t-1 * R_h + b_{Wr} + b_{Rr})$$
+  # update gate
+  $$u_t = \sigma(x_t * W_u + h_t-1 * R_u + b_{Wu} + b_{Ru})$$
+  # new memory gate
+  $$h'_t = tanh(x_t * W_h + r_t .* (h_t-1 * R_h + b_{Rh}) + b_{Wh})$$
+  $$h_t = (1 - u_t) .* h'_t + u_t .* h_t-1$$
   ```
 
   Other GRU (see @{tf.nn.rnn_cell.GRUCell} and @{tf.contrib.rnn.GRUBlockCell}):
   ```python
-  h'_t = tanh(x_t * W_h + (r_t .* h_t-1) * R_h + b_Wh)  # new memory gate
+  # new memory gate
+  \\(h'_t = tanh(x_t * W_h + (r_t .* h_t-1) * R_h + b_{Wh})\\)
   ```
   which is not equivalent to Cudnn GRU: in addition to the extra bias term b_Rh,
   ```python
-  r .* (h * R) != (r .* h) * R
+  \\(r .* (h * R) != (r .* h) * R\\)
   ```
   """
 
@@ -262,13 +268,16 @@ class CudnnOpaqueParamsSaveable(saver.BaseSaverBuilder.SaveableObject):
     # instead of having the master pull all slices and then save them.
     slice_spec = ""
     params = weights + biases
-    param_names = weight_names + bias_names
+    self._weight_names = weight_names
+    self._bias_names = bias_names
+    self._param_names = weight_names + bias_names
+    prefixed_param_names = weight_names + bias_names
     if self._scope:
-      param_names = ["%s/%s" % (self._scope, pn) for pn in param_names]
-
+      prefixed_param_names = [
+          "%s/%s" % (self._scope, pn) for pn in prefixed_param_names]
     specs = [
         saver.BaseSaverBuilder.SaveSpec(param, slice_spec, param_name)
-        for param, param_name in zip(params, param_names)
+        for param, param_name in zip(params, prefixed_param_names)
     ]
     super(CudnnOpaqueParamsSaveable, self).__init__(
         array_ops.identity(self._variables), specs, name)
@@ -281,6 +290,45 @@ class CudnnOpaqueParamsSaveable(saver.BaseSaverBuilder.SaveableObject):
     return state_ops.assign(
         self._variables, opaque_params, validate_shape=False)
 
+  def _checkpointable_save(self, save_buffer):
+    weights, biases = self._OpaqueParamsToCanonical()
+    with ops.device("gpu:0"):
+      (weights, _), (biases, _) = self._TransformCanonical(
+          weights, biases)
+    for name, tensor in zip(self._param_names, weights + biases):
+      save_buffer[name] = array_ops.identity(tensor)
+
+  def _checkpointable_restore(self, restore_buffer):
+    tensors = [array_ops.identity(restore_buffer[name])
+               for name in self._param_names]
+    return self.restore(
+        restored_tensors=tensors,
+        restored_shapes=None  # Unused
+    )
+
+  def _add_checkpointable_dependencies(self, checkpointable, dtype):
+    """Add canonical weight dependencies to `checkpointable`.
+
+    When saving or restoring, converts to or from the opaque buffer
+    format. Weights are saved and loaded in the configuration expected by
+    cuDNN-compatible cells.
+
+    Args:
+      checkpointable: An object inheriting from `CheckpointableBase` to add
+        dependencies too (typically the cuDNN `Layer`).
+      dtype: The dtype for the canonical parameter Tensors.
+    """
+    split_dependencies = checkpointable_utils.split_dependency(
+        component_names=self._param_names,
+        component_dtypes=(dtype,) * len(self._param_names),
+        fill_save_buffer_fn=self._checkpointable_save,
+        consume_restore_buffer_fn=self._checkpointable_restore)
+    self._checkpointable_track_params(checkpointable, split_dependencies)
+
+  def _checkpointable_track_params(self, checkpointable, params):
+    """Tracks parameters in a canonical configuration."""
+    return  # NotImplementedError raised by the Layer.
+
   def _TFCanonicalNamePrefix(self, layer, is_fwd=True):
     if self._direction == CUDNN_RNN_UNIDIRECTION:
       return "rnn/multi_rnn_cell/cell_%d/%s" % (layer, self._rnn_cell_name)
@@ -476,10 +524,7 @@ class CudnnLSTMSaveable(CudnnOpaqueParamsSaveable):
   _rnn_mode = CUDNN_LSTM
   _num_params_per_layer = CUDNN_LSTM_PARAMS_PER_LAYER
 
-  # pylint:disable=protected-access
-  _rnn_cell_name = base_layer._to_snake_case(CudnnCompatibleLSTMCell.__name__)
-
-  # pylint:enable=protected-access
+  _rnn_cell_name = base_layer.to_snake_case(CudnnCompatibleLSTMCell.__name__)
 
   def _cudnn_to_tf_gate_params(self, *cu_gate_order):
     i_g, f_g, c_g, o_g = cu_gate_order
@@ -570,6 +615,29 @@ class CudnnLSTMSaveable(CudnnOpaqueParamsSaveable):
     tf_biases.append(b)
     tf_bias_names.append(prefix + "/bias")
 
+  def _checkpointable_track_params(self, checkpointable, params):
+    """Track parameters for compatibility with CudnnCompatibleLSTMCell."""
+    biases = []
+    weights = []
+    for name in self._weight_names:
+      weights.append(params[name])
+    for name in self._bias_names:
+      biases.append(params[name])
+    assert len(params) == len(weights) + len(biases)
+    if len(weights) == 1 and len(biases) == 1:
+      # For single-layer cells, allow substituting a cell with no MultiRNNCell
+      # wrapping.
+      kernel, = weights  # pylint: disable=unbalanced-tuple-unpacking
+      bias, = biases  # pylint: disable=unbalanced-tuple-unpacking
+      checkpointable._track_checkpointable(kernel, name="kernel")  # pylint: disable=protected-access
+      checkpointable._track_checkpointable(bias, name="bias")  # pylint: disable=protected-access
+    assert len(biases) == len(weights)
+    for cell_index, (bias, kernel) in enumerate(zip(biases, weights)):
+      cell = checkpointable_lib.Checkpointable()
+      checkpointable._track_checkpointable(cell, name="cell-%d" % cell_index)  # pylint: disable=protected-access
+      cell.bias = bias
+      cell.kernel = kernel
+
 
 class CudnnGRUSaveable(CudnnOpaqueParamsSaveable):
   """SaveableObject implementation handling Cudnn GRU opaque params."""
@@ -577,10 +645,7 @@ class CudnnGRUSaveable(CudnnOpaqueParamsSaveable):
   _rnn_mode = CUDNN_GRU
   _num_params_per_layer = CUDNN_GRU_PARAMS_PER_LAYER
 
-  # pylint:disable=protected-access
-  _rnn_cell_name = base_layer._to_snake_case(CudnnCompatibleGRUCell.__name__)
-
-  # pylint:enable=protected-access
+  _rnn_cell_name = base_layer.to_snake_case(CudnnCompatibleGRUCell.__name__)
 
   def _cudnn_to_tf_weights(self, *cu_weights):
     r"""Stitching cudnn canonical weights to generate tf canonical weights."""
@@ -659,11 +724,7 @@ class CudnnGRUSaveable(CudnnOpaqueParamsSaveable):
 class CudnnRNNSimpleSaveable(CudnnLSTMSaveable):
   """SaveableObject implementation handling Cudnn RNN Tanh opaque params."""
 
-  # pylint:disable=protected-access
-  _rnn_cell_name = base_layer._to_snake_case(
-      rnn_cell_impl.BasicRNNCell.__name__)
-
-  # pylint:enable=protected-access
+  _rnn_cell_name = base_layer.to_snake_case(rnn_cell_impl.BasicRNNCell.__name__)
 
   def _cudnn_to_tf_weights(self, *cu_weights):
     r"""Stitching cudnn canonical weights to generate tf canonical weights."""
diff --git a/tensorflow/contrib/data/BUILD b/tensorflow/contrib/data/BUILD
index 9e25a77d9fd3fecdf82fdc69de97671c8ca6bb2b..7bb0dc1c0f695f4d1c7739fa11764ded4ff9410a 100644
--- a/tensorflow/contrib/data/BUILD
+++ b/tensorflow/contrib/data/BUILD
@@ -8,6 +8,7 @@ load(
     "//tensorflow:tensorflow.bzl",
     "tf_custom_op_library",
     "tf_gen_op_libs",
+    "if_not_windows",
 )
 load(
     "//tensorflow/core:platform/default/build_config_root.bzl",
@@ -31,12 +32,17 @@ py_library(
     ],
 )
 
+cc_library(
+    name = "lib_proto_parsing_for_dataset_ops",
+    deps = if_not_windows(["//tensorflow/core:lib_proto_parsing"]),
+)
+
 tf_custom_op_library(
     name = "_dataset_ops.so",
     srcs = ["ops/dataset_ops.cc"],
     deps = ["//tensorflow/contrib/data/kernels:dataset_kernels"] +
            if_static(
-               extra_deps = ["//tensorflow/core:lib_proto_parsing"],
+               extra_deps = [":lib_proto_parsing_for_dataset_ops"],
                otherwise = [],
            ),
 )
@@ -44,17 +50,3 @@ tf_custom_op_library(
 tf_gen_op_libs(
     op_lib_names = ["dataset_ops"],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        include = [
-            "**/*",
-        ],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/data/__init__.py b/tensorflow/contrib/data/__init__.py
index 766721d8d2c2cc22a290d07f064471cb67c07d90..125260b4c1f6b63c8f83f28d1829afe2d9d3ea97 100644
--- a/tensorflow/contrib/data/__init__.py
+++ b/tensorflow/contrib/data/__init__.py
@@ -25,6 +25,7 @@ See the @{$datasets$Importing Data} Programmer's Guide for an overview.
 @@Counter
 @@SqlDataset
 
+@@assert_element_shape
 @@batch_and_drop_remainder
 @@bucket_by_sequence_length
 @@dense_to_sparse_batch
@@ -32,6 +33,7 @@ See the @{$datasets$Importing Data} Programmer's Guide for an overview.
 @@group_by_window
 @@ignore_errors
 @@make_batched_features_dataset
+@@make_csv_dataset
 @@make_saveable_from_iterator
 @@map_and_batch
 @@padded_batch_and_drop_remainder
@@ -54,6 +56,7 @@ from __future__ import print_function
 
 # pylint: disable=unused-import
 
+from tensorflow.contrib.data.python.ops.batching import assert_element_shape
 from tensorflow.contrib.data.python.ops.batching import batch_and_drop_remainder
 from tensorflow.contrib.data.python.ops.batching import dense_to_sparse_batch
 from tensorflow.contrib.data.python.ops.batching import map_and_batch
@@ -70,6 +73,7 @@ from tensorflow.contrib.data.python.ops.interleave_ops import sloppy_interleave
 from tensorflow.contrib.data.python.ops.iterator_ops import make_saveable_from_iterator
 from tensorflow.contrib.data.python.ops.prefetching_ops import prefetch_to_device
 from tensorflow.contrib.data.python.ops.readers import make_batched_features_dataset
+from tensorflow.contrib.data.python.ops.readers import make_csv_dataset
 from tensorflow.contrib.data.python.ops.readers import read_batch_features
 from tensorflow.contrib.data.python.ops.readers import SqlDataset
 from tensorflow.contrib.data.python.ops.resampling import rejection_resample
@@ -82,3 +86,6 @@ from tensorflow.python.ops.parsing_ops import parse_single_example_v2 as parse_s
 
 from tensorflow.python.util.all_util import remove_undocumented
 remove_undocumented(__name__)
+
+# A constant that can be used to enable auto-tuning.
+AUTOTUNE = -1
diff --git a/tensorflow/contrib/data/kernels/BUILD b/tensorflow/contrib/data/kernels/BUILD
index c87da7dfaa5943f7918c370f63362673844c7f0e..83ada6fb67dcbff595a38ce9e8609bdd1219b075 100644
--- a/tensorflow/contrib/data/kernels/BUILD
+++ b/tensorflow/contrib/data/kernels/BUILD
@@ -61,14 +61,3 @@ cc_library(
         "@protobuf_archive//:protobuf_headers",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-)
diff --git a/tensorflow/contrib/data/kernels/prefetching_kernels.cc b/tensorflow/contrib/data/kernels/prefetching_kernels.cc
index 79d1fc3494d7fd223c52b3086686f732d3875767..a2bfce03620a1482f5b21cbf23c66833bc5cd480 100644
--- a/tensorflow/contrib/data/kernels/prefetching_kernels.cc
+++ b/tensorflow/contrib/data/kernels/prefetching_kernels.cc
@@ -40,8 +40,7 @@ class FunctionBufferingResource : public ResourceBase {
                             const NameAttrList& func, int64 buffer_size,
                             const string& source_device,
                             const string& target_device,
-                            const std::vector<Tensor>& func_args,
-                            int64 thread_pool_size)
+                            const std::vector<Tensor>& func_args)
       : lib_(lib),
         pflr_(std::move(pflr)),
         func_(func),
@@ -52,22 +51,10 @@ class FunctionBufferingResource : public ResourceBase {
         handle_(kInvalidHandle),
         is_buffering_(false),
         end_of_sequence_(false),
-        cancelled_(false) {
-    if (thread_pool_size > 0) {
-      thread_pool_ = new thread::ThreadPool(Env::Default(), ThreadOptions(),
-                                            "buffer_resource", thread_pool_size,
-                                            false /* low_latency_hint */);
-      runner_ = [this](std::function<void()> c) {
-        thread_pool_->Schedule(std::move(c));
-      };
-    }
-  }
+        cancelled_(false) {}
 
   ~FunctionBufferingResource() override {
     Cancel();
-    if (thread_pool_ != nullptr) {
-      delete thread_pool_;
-    }
   }
 
   string DebugString() override {
@@ -179,17 +166,12 @@ class FunctionBufferingResource : public ResourceBase {
       for (int i = 0; i < cancellation_callbacks.size(); ++i) {
         cancellation_callbacks[i](cancellation_buffer_elements[i]);
       }
-      // We only wait on cond_var_ in the destructor, so there would atmost be
-      // one waiter to notify.
-      cond_var_.notify_one();
+      cond_var_.notify_all();
       return;
     }
     FunctionLibraryRuntime::Options opts;
     // Copied from CapturedFunction::generate_step_id();
     opts.step_id = -std::abs(static_cast<int64>(random::New64()));
-    if (runner_ != nullptr) {
-      opts.runner = &runner_;
-    }
     opts.source_device = source_device_;
     AllocatorAttributes arg_alloc_attr;
     arg_alloc_attr.set_on_host(true);
@@ -224,6 +206,13 @@ class FunctionBufferingResource : public ResourceBase {
                   if (buffer_.size() < buffer_size_ && !end_of_sequence_) {
                     restart_buffering = true;
                   } else {
+                    // When the buffer is full, we don't want to call
+                    // FillBuffer() unless we're in cancellation phase in which
+                    // case FillBuffer() will do the final cleanup post
+                    // cancellation.
+                    if (cancelled_) {
+                      restart_buffering = true;
+                    }
                     is_buffering_ = false;
                   }
                 }
@@ -244,11 +233,9 @@ class FunctionBufferingResource : public ResourceBase {
   const string source_device_;
   const string target_device_;
   const std::vector<Tensor> func_args_;
-  thread::ThreadPool* thread_pool_ = nullptr;
   FunctionLibraryRuntime::Handle handle_ GUARDED_BY(mu_);
   std::deque<BufferElement> buffer_ GUARDED_BY(mu_);
   std::deque<FunctionBufferCallback> requests_ GUARDED_BY(mu_);
-  std::function<void(std::function<void()>)> runner_ = nullptr;
   bool is_buffering_ GUARDED_BY(mu_);
   bool end_of_sequence_ GUARDED_BY(mu_);
   bool cancelled_ GUARDED_BY(mu_);
@@ -263,7 +250,6 @@ class FunctionBufferResourceHandleOp : public OpKernel {
     OP_REQUIRES_OK(ctx, ctx->GetAttr("buffer_size", &buffer_size_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("container", &container_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("shared_name", &name_));
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("thread_pool_size", &thread_pool_size_));
   }
 
   ~FunctionBufferResourceHandleOp() override {
@@ -311,9 +297,10 @@ class FunctionBufferResourceHandleOp : public OpKernel {
                this](FunctionBufferingResource** ptr) {
                 *ptr = new FunctionBufferingResource(
                     clone_lib, std::move(pflr), func_, buffer_size_,
-                    source_device, target_device, func_args, thread_pool_size_);
+                    source_device, target_device, func_args);
                 return Status::OK();
               }));
+      core::ScopedUnref s(buffer);
       OP_REQUIRES_OK(ctx, buffer->Instantiate());
       initialized_ = true;
     }
@@ -332,7 +319,6 @@ class FunctionBufferResourceHandleOp : public OpKernel {
   int64 buffer_size_;
   string container_;
   string name_;
-  int64 thread_pool_size_;
 };
 
 REGISTER_KERNEL_BUILDER(Name("FunctionBufferingResource")
@@ -373,25 +359,27 @@ class FunctionBufferingResourceGetNextOp : public AsyncOpKernel {
     OP_REQUIRES_OK_ASYNC(
         ctx, LookupResource<FunctionBufferingResource>(ctx, handle, &buffer),
         done);
-    core::ScopedUnref s(buffer);
 
     if (buffer->Finished()) {
+      buffer->Unref();
       ctx->SetStatus(errors::OutOfRange("end_of_sequence"));
       done();
       return;
     }
 
     FunctionBufferCallback callback =
-        [ctx, done](const BufferElement& buffer_element) {
+        [ctx, buffer, done](const BufferElement& buffer_element) {
           Status s = buffer_element.status;
           if (!s.ok()) {
             ctx->SetStatus(s);
+            buffer->Unref();
             done();
             return;
           }
           for (size_t i = 0; i < buffer_element.value.size(); ++i) {
             ctx->set_output(i, buffer_element.value[i]);
           }
+          buffer->Unref();
           done();
         };
     buffer->MaybeGet(std::move(callback));
diff --git a/tensorflow/contrib/data/ops/dataset_ops.cc b/tensorflow/contrib/data/ops/dataset_ops.cc
index bd96448d64e94c04da6d6b1d6506342631d5b3fb..cf0a8bbccb5813c799e7e6db91d73e2ecf4107f8 100644
--- a/tensorflow/contrib/data/ops/dataset_ops.cc
+++ b/tensorflow/contrib/data/ops/dataset_ops.cc
@@ -53,7 +53,6 @@ REGISTER_OP("FunctionBufferingResource")
     .Attr("container: string")
     .Attr("f: func")
     .Attr("buffer_size: int")
-    .Attr("thread_pool_size: int")
     .SetShapeFn(shape_inference::UnknownShape)
     .Doc(R"doc(
 Creates a resource that fills up a buffer by making function calls.
@@ -63,7 +62,6 @@ target_device: Target device to execute the function on.
 resource: Handle to the resource created.
 f: Function to be executed.
 buffer_size: Size of the buffer.
-thread_pool_size: Size of the threadpool doing the prefetching.
 container: If non-empty, this resource is placed in the given container.
   Otherwise, a default container is used.
 shared_name: If non-empty, this resource will be shared under the given name
diff --git a/tensorflow/contrib/data/python/kernel_tests/BUILD b/tensorflow/contrib/data/python/kernel_tests/BUILD
index 0b3bf63f79430a7b0fb0a1b72f0b287f1370eb60..7270d533c69002ad6b318645f1ef07ebb45a85c3 100644
--- a/tensorflow/contrib/data/python/kernel_tests/BUILD
+++ b/tensorflow/contrib/data/python/kernel_tests/BUILD
@@ -22,6 +22,7 @@ py_test(
         "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
         "//tensorflow/python:math_ops",
+        "//tensorflow/python:script_ops",
         "//tensorflow/python:sparse_tensor",
         "//tensorflow/python:string_ops",
         "//tensorflow/python:tensor_shape",
@@ -477,11 +478,6 @@ py_test(
     size = "small",
     srcs = ["prefetching_ops_test.py"],
     srcs_version = "PY2AND3",
-    tags = [
-        "manual",
-        "no_oss",
-        "notap",
-    ],
     deps = [
         "//tensorflow/contrib/data/python/ops:prefetching_ops",
         "//tensorflow/core:protos_all_py",
@@ -513,17 +509,3 @@ tf_py_test(
         "//third_party/py/numpy",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        include = [
-            "**/*",
-        ],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py
index 75482f67da11401305b7b342cd5c971da71a4f3c..413d8737978b695ac443c92036d6641e5c73f28c 100644
--- a/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py
@@ -28,8 +28,10 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import script_ops
 from tensorflow.python.ops import string_ops
 from tensorflow.python.platform import test
 
@@ -579,5 +581,73 @@ class PaddedBatchDatasetSerializationTest(
                         lambda: build_dataset(seq_lens2), 8)
 
 
+class RestructuredDatasetTest(test.TestCase):
+
+  def test_assert_element_shape(self):
+
+    def create_unknown_shape_dataset(x):
+      return script_ops.py_func(lambda _: (np.ones(2, dtype=np.float32),
+                                           np.zeros((3, 4), dtype=np.int32)),
+                                [x],
+                                [dtypes.float32, dtypes.int32])
+
+    dataset = dataset_ops.Dataset.range(5).map(create_unknown_shape_dataset)
+    unknown_shapes = (tensor_shape.TensorShape(None),
+                      tensor_shape.TensorShape(None))
+    self.assertEqual(unknown_shapes, dataset.output_shapes)
+
+    expected_shapes = (tensor_shape.TensorShape(2),
+                       tensor_shape.TensorShape((3, 4)))
+    result = dataset.apply(batching.assert_element_shape(expected_shapes))
+    self.assertEqual(expected_shapes, result.output_shapes)
+
+    iterator = result.make_initializable_iterator()
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+    with self.test_session() as sess:
+      sess.run(init_op)
+      for _ in range(5):
+        sess.run(get_next)
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  def test_assert_wrong_element_shape(self):
+
+    def create_dataset(_):
+      return (array_ops.ones(2, dtype=dtypes.float32),
+              array_ops.zeros((3, 4), dtype=dtypes.int32))
+
+    dataset = dataset_ops.Dataset.range(3).map(create_dataset)
+    wrong_shapes = (tensor_shape.TensorShape(2),
+                    tensor_shape.TensorShape((3, 10)))
+    with self.assertRaises(ValueError):
+      dataset.apply(batching.assert_element_shape(wrong_shapes))
+
+  def test_assert_wrong_element_shape_on_unknown_shape_dataset(self):
+
+    def create_unknown_shape_dataset(x):
+      return script_ops.py_func(lambda _: (np.ones(2, dtype=np.float32),
+                                           np.zeros((3, 4), dtype=np.int32)),
+                                [x],
+                                [dtypes.float32, dtypes.int32])
+
+    dataset = dataset_ops.Dataset.range(3).map(create_unknown_shape_dataset)
+    unknown_shapes = (tensor_shape.TensorShape(None),
+                      tensor_shape.TensorShape(None))
+    self.assertEqual(unknown_shapes, dataset.output_shapes)
+
+    wrong_shapes = (tensor_shape.TensorShape(2),
+                    tensor_shape.TensorShape((3, 10)))
+    iterator = (
+        dataset.apply(batching.assert_element_shape(wrong_shapes))
+        .make_initializable_iterator())
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+    with self.test_session() as sess:
+      sess.run(init_op)
+      with self.assertRaises(errors.InvalidArgumentError):
+        sess.run(get_next)
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/data/python/kernel_tests/bucketing_test.py b/tensorflow/contrib/data/python/kernel_tests/bucketing_test.py
index d0131896a1a5986cfc5ed37785a0d0090ae6600c..6002cc73c8b41c2f20beaf0158af813807e58c90 100644
--- a/tensorflow/contrib/data/python/kernel_tests/bucketing_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/bucketing_test.py
@@ -104,6 +104,21 @@ class GroupByWindowTest(test.TestCase):
       self.assertAllEqual([0, 0, 0], sess.run(get_next))
       self.assertAllEqual([1], sess.run(get_next))
 
+  def testEmpty(self):
+    iterator = (
+        dataset_ops.Dataset.range(4).apply(
+            grouping.group_by_window(lambda _: 0, lambda _, xs: xs, 0))
+        .make_initializable_iterator())
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.test_session() as sess:
+      sess.run(init_op)
+      with self.assertRaisesRegexp(
+          errors.InvalidArgumentError,
+          "Window size must be greater than zero, but got 0."):
+        print(sess.run(get_next))
+
   def testReduceFuncError(self):
     components = np.random.randint(100, size=(200,)).astype(np.int64)
 
diff --git a/tensorflow/contrib/data/python/kernel_tests/prefetching_ops_test.py b/tensorflow/contrib/data/python/kernel_tests/prefetching_ops_test.py
index a14736ac09c9174d1536677ad05db76dc8887913..b08132cd72254326d965907a1fdafb8a820926a1 100644
--- a/tensorflow/contrib/data/python/kernel_tests/prefetching_ops_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/prefetching_ops_test.py
@@ -28,12 +28,13 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import function
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.platform import test
 
 
-class StagingAreaOpsTest(test.TestCase):
+class PrefetchingKernelsOpsTest(test.TestCase):
 
   def setUp(self):
     self._event = threading.Event()
@@ -70,7 +71,6 @@ class StagingAreaOpsTest(test.TestCase):
           target_device=target,
           string_arg=ds_iterator_handle,
           buffer_size=3,
-          thread_pool_size=2,
           shared_name=buffer_name)
 
     with ops.device(device1):
@@ -201,6 +201,9 @@ class StagingAreaOpsTest(test.TestCase):
 
       sess.run(destroy_op)
 
+
+class PrefetchToDeviceTest(test.TestCase):
+
   def testPrefetchToDevice(self):
     host_dataset = dataset_ops.Dataset.range(10)
     device_dataset = host_dataset.apply(
@@ -232,6 +235,74 @@ class StagingAreaOpsTest(test.TestCase):
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(next_element)
 
+  def testPrefetchDictToDevice(self):
+    host_dataset = dataset_ops.Dataset.range(10).map(lambda x: {"a": x})
+    device_dataset = host_dataset.apply(
+        prefetching_ops.prefetch_to_device("/cpu:1"))
+
+    # NOTE(mrry): This device block creates the "host" dataset and iterator on
+    # /cpu:0, and ensures that the prefetching is across devices. In typical use
+    # this would not be necessary, because the GPU device would not support any
+    # of the dataset-related ops.
+    with ops.device("/cpu:0"):
+      iterator = device_dataset.make_one_shot_iterator()
+
+    self.assertEqual(host_dataset.output_types, device_dataset.output_types)
+    self.assertEqual(host_dataset.output_types, iterator.output_types)
+    self.assertEqual(host_dataset.output_shapes, device_dataset.output_shapes)
+    self.assertEqual(host_dataset.output_shapes, iterator.output_shapes)
+    self.assertEqual(host_dataset.output_classes, device_dataset.output_classes)
+    self.assertEqual(host_dataset.output_classes, iterator.output_classes)
+
+    next_element = iterator.get_next()
+    self.assertEqual(dtypes.int64, next_element["a"].dtype)
+    self.assertEqual([], next_element["a"].shape)
+
+    worker_config = config_pb2.ConfigProto()
+    worker_config.device_count["CPU"] = 2
+    with self.test_session(config=worker_config) as sess:
+      for i in range(10):
+        self.assertEqual({"a": i}, sess.run(next_element))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(next_element)
+
+  def testPrefetchSparseTensorsToDevice(self):
+    def make_tensor(i):
+      return sparse_tensor.SparseTensorValue(
+          indices=[[0, 0]], values=(i*[1]), dense_shape=[2, 2])
+    host_dataset = dataset_ops.Dataset.range(10).map(make_tensor)
+
+    device_dataset = host_dataset.apply(
+        prefetching_ops.prefetch_to_device("/cpu:1"))
+
+    # NOTE(mrry): This device block creates the "host" dataset and iterator on
+    # /cpu:0, and ensures that the prefetching is across devices. In typical use
+    # this would not be necessary, because the GPU device would not support any
+    # of the dataset-related ops.
+    with ops.device("/cpu:0"):
+      iterator = device_dataset.make_one_shot_iterator()
+
+    self.assertEqual(host_dataset.output_types, device_dataset.output_types)
+    self.assertEqual(host_dataset.output_types, iterator.output_types)
+    self.assertEqual(host_dataset.output_shapes, device_dataset.output_shapes)
+    self.assertEqual(host_dataset.output_shapes, iterator.output_shapes)
+    self.assertEqual(host_dataset.output_classes, device_dataset.output_classes)
+    self.assertEqual(host_dataset.output_classes, iterator.output_classes)
+
+    next_element = iterator.get_next()
+    self.assertEqual(dtypes.int64, next_element.dtype)
+
+    worker_config = config_pb2.ConfigProto()
+    worker_config.device_count["CPU"] = 2
+    with self.test_session(config=worker_config) as sess:
+      for i in range(10):
+        actual = sess.run(next_element)
+        self.assertAllEqual([i], actual.values)
+        self.assertAllEqual([[0, 0]], actual.indices)
+        self.assertAllEqual([2, 2], actual.dense_shape)
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(next_element)
+
   def testPrefetchToDeviceGpu(self):
     if not test_util.is_gpu_available():
       self.skipTest("No GPU available")
@@ -249,5 +320,62 @@ class StagingAreaOpsTest(test.TestCase):
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(next_element)
 
+  def testPrefetchToDeviceWithReInit(self):
+    host_dataset = dataset_ops.Dataset.range(10)
+    device_dataset = host_dataset.apply(
+        prefetching_ops.prefetch_to_device("/cpu:1"))
+
+    # NOTE(mrry): This device block creates the "host" dataset and iterator on
+    # /cpu:0, and ensures that the prefetching is across devices. In typical use
+    # this would not be necessary, because the GPU device would not support any
+    # of the dataset-related ops.
+    with ops.device("/cpu:0"):
+      iterator = device_dataset.make_initializable_iterator()
+
+    self.assertEqual(host_dataset.output_types, device_dataset.output_types)
+    self.assertEqual(host_dataset.output_types, iterator.output_types)
+    self.assertEqual(host_dataset.output_shapes, device_dataset.output_shapes)
+    self.assertEqual(host_dataset.output_shapes, iterator.output_shapes)
+    self.assertEqual(host_dataset.output_classes, device_dataset.output_classes)
+    self.assertEqual(host_dataset.output_classes, iterator.output_classes)
+
+    next_element = iterator.get_next()
+    self.assertEqual(dtypes.int64, next_element.dtype)
+    self.assertEqual([], next_element.shape)
+
+    worker_config = config_pb2.ConfigProto()
+    worker_config.device_count["CPU"] = 2
+    with self.test_session(config=worker_config) as sess:
+      sess.run(iterator.initializer)
+      for i in range(5):
+        self.assertEqual(i, sess.run(next_element))
+      sess.run(iterator.initializer)
+      for i in range(10):
+        self.assertEqual(i, sess.run(next_element))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(next_element)
+
+  def testPrefetchToDeviceGpuWithReInit(self):
+    if not test_util.is_gpu_available():
+      self.skipTest("No GPU available")
+
+    host_dataset = dataset_ops.Dataset.range(10)
+    device_dataset = host_dataset.apply(
+        prefetching_ops.prefetch_to_device("/gpu:0"))
+
+    iterator = device_dataset.make_initializable_iterator()
+    next_element = iterator.get_next()
+
+    with self.test_session() as sess:
+      sess.run(iterator.initializer)
+      for i in range(5):
+        self.assertEqual(i, sess.run(next_element))
+      sess.run(iterator.initializer)
+      for i in range(10):
+        self.assertEqual(i, sess.run(next_element))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(next_element)
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/data/python/kernel_tests/reader_dataset_ops_test.py b/tensorflow/contrib/data/python/kernel_tests/reader_dataset_ops_test.py
index 6ee1b572f121a9a40dfd638f7a858d5f1176ea3c..1075302bae96ca2e0111efbacdf5e919ea76897d 100644
--- a/tensorflow/contrib/data/python/kernel_tests/reader_dataset_ops_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/reader_dataset_ops_test.py
@@ -271,7 +271,8 @@ class ReadBatchFeaturesTest(test.TestCase):
                            reader_num_threads=1,
                            parser_num_threads=1,
                            shuffle=False,
-                           shuffle_seed=None):
+                           shuffle_seed=None,
+                           drop_final_batch=False):
     self.filenames = filenames
     self.num_epochs = num_epochs
     self.batch_size = batch_size
@@ -289,22 +290,25 @@ class ReadBatchFeaturesTest(test.TestCase):
         shuffle=shuffle,
         shuffle_seed=shuffle_seed,
         reader_num_threads=reader_num_threads,
-        parser_num_threads=parser_num_threads).make_one_shot_iterator(
+        parser_num_threads=parser_num_threads,
+        drop_final_batch=drop_final_batch).make_one_shot_iterator(
         ).get_next()
 
   def _record(self, f, r):
-    example = example_pb2.Example(features=feature_pb2.Features(
-        feature={
-            "file":
-                feature_pb2.Feature(int64_list=feature_pb2.Int64List(
-                    value=[f])),
-            "record":
-                feature_pb2.Feature(int64_list=feature_pb2.Int64List(
-                    value=[r])),
-            "keywords":
-                feature_pb2.Feature(bytes_list=feature_pb2.BytesList(
-                    value=self._get_keywords(f, r)))
-        }))
+    example = example_pb2.Example(
+        features=feature_pb2.Features(
+            feature={
+                "file":
+                    feature_pb2.Feature(
+                        int64_list=feature_pb2.Int64List(value=[f])),
+                "record":
+                    feature_pb2.Feature(
+                        int64_list=feature_pb2.Int64List(value=[r])),
+                "keywords":
+                    feature_pb2.Feature(
+                        bytes_list=feature_pb2.BytesList(
+                            value=self._get_keywords(f, r)))
+            }))
     return example.SerializeToString()
 
   def _get_keywords(self, f, r):
@@ -372,8 +376,8 @@ class ReadBatchFeaturesTest(test.TestCase):
         record_batch.append(r)
         keywords = self._get_keywords(f, r)
         keywords_batch_values.extend(keywords)
-        keywords_batch_indices.extend([[batch_index, i]
-                                       for i in range(len(keywords))])
+        keywords_batch_indices.extend(
+            [[batch_index, i] for i in range(len(keywords))])
         batch_index += 1
         keywords_batch_max_len = max(keywords_batch_max_len, len(keywords))
         if len(file_batch) == batch_size:
@@ -473,9 +477,10 @@ class ReadBatchFeaturesTest(test.TestCase):
         "file": parsing_ops.FixedLenFeature([], dtypes.int64),
         "record": parsing_ops.FixedLenFeature([], dtypes.int64),
     }
-    dataset = (core_readers.TFRecordDataset(self.test_filenames)
-               .map(lambda x: parsing_ops.parse_single_example(x, features))
-               .repeat(10).batch(2))
+    dataset = (
+        core_readers.TFRecordDataset(self.test_filenames)
+        .map(lambda x: parsing_ops.parse_single_example(x, features))
+        .repeat(10).batch(2))
     iterator = dataset.make_initializable_iterator()
     init_op = iterator.initializer
     next_element = iterator.get_next()
@@ -559,6 +564,20 @@ class ReadBatchFeaturesTest(test.TestCase):
               with self.assertRaises(errors.OutOfRangeError):
                 self._next_actual_batch(sess)
 
+  def testDropFinalBatch(self):
+    for batch_size in [1, 2]:
+      for num_epochs in [1, 10]:
+        with ops.Graph().as_default():
+          # Basic test: read from file 0.
+          self.outputs = self._read_batch_features(
+              filenames=self.test_filenames[0],
+              num_epochs=num_epochs,
+              batch_size=batch_size,
+              drop_final_batch=True)
+          for _, tensor in self.outputs.items():
+            if isinstance(tensor, ops.Tensor):  # Guard against SparseTensor.
+              self.assertEqual(tensor.shape[0], batch_size)
+
 
 class MakeCsvDatasetTest(test.TestCase):
 
@@ -591,20 +610,25 @@ class MakeCsvDatasetTest(test.TestCase):
         "record %d" % recordno if recordno % 2 == 1 else "",
     ]
 
-  def _csv_record(self, fileno, recordno):
-    return ",".join(str(v) for v in self._csv_values(fileno, recordno))
+  def _write_file(self, filename, rows):
+    for i in range(len(rows)):
+      if isinstance(rows[i], list):
+        rows[i] = ",".join(str(v) if v is not None else "" for v in rows[i])
+    fn = os.path.join(self.get_temp_dir(), filename)
+    f = open(fn, "w")
+    f.write("\n".join(rows))
+    f.close()
+    return fn
 
   def _create_file(self, fileno, header=True, comment=True):
-    fn = os.path.join(self.get_temp_dir(), "csv_file%d.csv" % fileno)
-    f = open(fn, "w")
+    rows = []
     if header:
-      f.write(",".join(self.COLUMNS) + "\n")
+      rows.append(self.COLUMNS)
     for recno in range(self._num_records):
-      f.write(self._csv_record(fileno, recno) + "\n")
+      rows.append(self._csv_values(fileno, recno))
       if comment:
-        f.write("# Some comment goes here. Should be ignored!\n")
-    f.close()
-    return fn
+        rows.append("# Some comment goes here. Ignore me.")
+    return self._write_file("csv_file%d.csv" % fileno, rows)
 
   def _create_files(self):
     filenames = []
@@ -618,6 +642,7 @@ class MakeCsvDatasetTest(test.TestCase):
       defaults,
       column_names=COLUMNS,
       label_name=LABEL,
+      select_cols=None,
       batch_size=1,
       num_epochs=1,
       shuffle=False,
@@ -640,6 +665,7 @@ class MakeCsvDatasetTest(test.TestCase):
         comment=comment,
         na_value=na_value,
         default_float_type=default_float_type,
+        select_columns=select_cols,
     )
 
   def _next_actual_batch(self, file_indices, batch_size, num_epochs, defaults):
@@ -696,7 +722,7 @@ class MakeCsvDatasetTest(test.TestCase):
     with self.assertRaises(errors.OutOfRangeError):
       sess.run(get_next)
 
-  def test_make_csv_dataset(self):
+  def testMakeCSVDataset(self):
     defaults = self.DEFAULTS
 
     with ops.Graph().as_default() as g:
@@ -723,7 +749,7 @@ class MakeCsvDatasetTest(test.TestCase):
         self._verify_records(
             sess, dataset, range(self._num_files), batch_size=2, num_epochs=10)
 
-  def test_make_csv_dataset_with_bad_columns(self):
+  def testMakeCSVDataset_withBadColumns(self):
     """Tests that exception is raised when input is malformed.
     """
     dupe_columns = self.COLUMNS[:-1] + self.COLUMNS[:1]
@@ -739,7 +765,7 @@ class MakeCsvDatasetTest(test.TestCase):
       self._make_csv_dataset(
           self._test_filenames, defaults, label_name="not_a_real_label")
 
-  def test_make_csv_dataset_with_no_label(self):
+  def testMakeCSVDataset_withNoLabel(self):
     """Tests that CSV datasets can be created when no label is specified.
     """
     defaults = self.DEFAULTS
@@ -760,7 +786,7 @@ class MakeCsvDatasetTest(test.TestCase):
             num_epochs=10,
             label_name=None)
 
-  def test_make_csv_dataset_with_no_comments(self):
+  def testMakeCSVDataset_withNoComments(self):
     """Tests that datasets can be created from CSV files with no header line.
     """
     defaults = self.DEFAULTS
@@ -783,7 +809,7 @@ class MakeCsvDatasetTest(test.TestCase):
             num_epochs=10,
         )
 
-  def test_make_csv_dataset_with_no_header(self):
+  def testMakeCSVDataset_withNoHeader(self):
     """Tests that datasets can be created from CSV files with no header line.
     """
     defaults = self.DEFAULTS
@@ -806,7 +832,7 @@ class MakeCsvDatasetTest(test.TestCase):
             num_epochs=10,
         )
 
-  def test_make_csv_dataset_with_types(self):
+  def testMakeCSVDataset_withTypes(self):
     """Tests that defaults can be a dtype instead of a Tensor for required vals.
     """
     defaults = [d for d in self.COLUMN_TYPES[:-1]]
@@ -816,7 +842,7 @@ class MakeCsvDatasetTest(test.TestCase):
         dataset = self._make_csv_dataset(self._test_filenames, defaults)
         self._verify_records(sess, dataset, range(self._num_files))
 
-  def test_make_csv_dataset_with_no_col_names(self):
+  def testMakeCSVDataset_withNoColNames(self):
     """Tests that datasets can be created when column names are not specified.
 
     In that case, we should infer the column names from the header lines.
@@ -835,7 +861,17 @@ class MakeCsvDatasetTest(test.TestCase):
         self._verify_records(
             sess, dataset, range(self._num_files), batch_size=2, num_epochs=10)
 
-  def test_make_csv_dataset_type_inference(self):
+  def testMakeCSVDataset_withTypeInferenceMismatch(self):
+    # Test that error is thrown when num fields doesn't match columns
+    with self.assertRaises(ValueError):
+      self._make_csv_dataset(
+          self._test_filenames,
+          column_names=self.COLUMNS + ["extra_name"],
+          defaults=None,
+          batch_size=2,
+          num_epochs=10)
+
+  def testMakeCSVDataset_withTypeInference(self):
     """Tests that datasets can be created when no defaults are specified.
 
     In that case, we should infer the types from the first N records.
@@ -859,19 +895,16 @@ class MakeCsvDatasetTest(test.TestCase):
         dtypes.int32, dtypes.int64, dtypes.float32, dtypes.float32,
         dtypes.string, dtypes.string
     ]
-    rows = [[0, 0, 0, "NAN", "", "a"], [1, 2**31 + 1, 2**64, 123, "NAN", ""],
+    col_names = ["col%d" % i for i in range(len(expected_dtypes))]
+    rows = [[None, None, None, "NAN", "",
+             "a"], [1, 2**31 + 1, 2**64, 123, "NAN", ""],
             ['"123"', 2, 2**64, 123.4, "NAN", '"cd,efg"']]
     expected = [[0, 0, 0, 0, "", "a"], [1, 2**31 + 1, 2**64, 123, "", ""],
                 [123, 2, 2**64, 123.4, "", "cd,efg"]]
     for row in expected:
       row[-1] = row[-1].encode("utf-8")  # py3 expects byte strings
       row[-2] = row[-2].encode("utf-8")  # py3 expects byte strings
-    col_names = ["col%d" % i for i in range(len(expected_dtypes))]
-    with open(fn, "w") as f:
-      f.write(",".join(col_names))
-      f.write("\n")
-      for row in rows:
-        f.write(",".join([str(v) if v else "" for v in row]) + "\n")
+    self._write_file("file.csv", [col_names] + rows)
 
     with ops.Graph().as_default() as g:
       with self.test_session(graph=g) as sess:
@@ -879,8 +912,6 @@ class MakeCsvDatasetTest(test.TestCase):
             fn,
             defaults=None,
             column_names=None,
-            batch_size=1,
-            num_epochs=1,
             label_name=None,
             na_value="NAN",
             default_float_type=dtypes.float32,
@@ -903,8 +934,6 @@ class MakeCsvDatasetTest(test.TestCase):
             fn,
             defaults=None,
             column_names=None,
-            batch_size=1,
-            num_epochs=1,
             label_name=None,
             na_value="NAN",
             default_float_type=dtypes.float64,
@@ -912,11 +941,99 @@ class MakeCsvDatasetTest(test.TestCase):
         features = dataset.make_one_shot_iterator().get_next()
         # Check that types match
         for i in range(len(expected_dtypes)):
-          assert features["col%d" % i].dtype == expected_dtypes[i]
+          self.assertAllEqual(features["col%d" % i].dtype, expected_dtypes[i])
         for i in range(len(rows)):
-          assert sess.run(features) == dict(zip(col_names, expected[i]))
+          self.assertAllEqual(
+              sess.run(features), dict(zip(col_names, expected[i])))
+
+  def testMakeCSVDataset_withSelectColsError(self):
+    data = [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]]
+    col_names = ["col%d" % i for i in range(5)]
+    fn = self._write_file("file.csv", [col_names] + data)
+    with self.assertRaises(ValueError):
+      # Mismatch in number of defaults and number of columns selected,
+      # should raise an error
+      self._make_csv_dataset(
+          fn,
+          defaults=[[0]] * 5,
+          column_names=col_names,
+          label_name=None,
+          select_cols=[1, 3])
+    with self.assertRaises(ValueError):
+      # Invalid column name should raise an error
+      self._make_csv_dataset(
+          fn,
+          defaults=[[0]],
+          column_names=col_names,
+          label_name=None,
+          select_cols=["invalid_col_name"])
+
+  def testMakeCSVDataset_withSelectCols(self):
+    data = [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]]
+    col_names = ["col%d" % i for i in range(5)]
+    fn = self._write_file("file.csv", [col_names] + data)
+    # If select_cols is specified, should only yield a subset of columns
+    with ops.Graph().as_default() as g:
+      with self.test_session(graph=g) as sess:
+        dataset = self._make_csv_dataset(
+            fn,
+            defaults=[[0], [0]],
+            column_names=col_names,
+            label_name=None,
+            select_cols=[1, 3])
+        expected = [[1, 3], [6, 8]]
+        features = dataset.make_one_shot_iterator().get_next()
+        for i in range(len(data)):
+          self.assertAllEqual(
+              sess.run(features),
+              dict(zip([col_names[1], col_names[3]], expected[i])))
+    # Can still do default inference with select_cols
+    with ops.Graph().as_default() as g:
+      with self.test_session(graph=g) as sess:
+        dataset = self._make_csv_dataset(
+            fn,
+            defaults=None,
+            column_names=col_names,
+            label_name=None,
+            select_cols=[1, 3])
+        expected = [[1, 3], [6, 8]]
+        features = dataset.make_one_shot_iterator().get_next()
+        for i in range(len(data)):
+          self.assertAllEqual(
+              sess.run(features),
+              dict(zip([col_names[1], col_names[3]], expected[i])))
+    # Can still do column name inference
+    with ops.Graph().as_default() as g:
+      with self.test_session(graph=g) as sess:
+        dataset = self._make_csv_dataset(
+            fn,
+            defaults=None,
+            column_names=None,
+            label_name=None,
+            select_cols=[1, 3])
+        expected = [[1, 3], [6, 8]]
+        features = dataset.make_one_shot_iterator().get_next()
+        for i in range(len(data)):
+          self.assertAllEqual(
+              sess.run(features),
+              dict(zip([col_names[1], col_names[3]], expected[i])))
+    # Can specify column names instead of indices
+    with ops.Graph().as_default() as g:
+      with self.test_session(graph=g) as sess:
+        dataset = self._make_csv_dataset(
+            fn,
+            defaults=None,
+            column_names=None,
+            label_name=None,
+            select_cols=[col_names[1], col_names[3]])
+        expected = [[1, 3], [6, 8]]
+        features = dataset.make_one_shot_iterator().get_next()
+        for i in range(len(data)):
+          self.assertAllEqual(
+              sess.run(features),
+              dict(zip([col_names[1], col_names[3]], expected[i])))
 
-  def test_make_csv_dataset_with_shuffle(self):
+  def testMakeCSVDataset_withShuffle(self):
     total_records = self._num_files * self._num_records
     defaults = self.DEFAULTS
     for batch_size in [1, 2]:
diff --git a/tensorflow/contrib/data/python/kernel_tests/sequence_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/sequence_dataset_op_test.py
index 36ddf3004237ed042f21d691d83eafbaa20621e6..b13ad9ba4e533e1bcef5161d983c8e6578d549b2 100644
--- a/tensorflow/contrib/data/python/kernel_tests/sequence_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/sequence_dataset_op_test.py
@@ -47,6 +47,11 @@ class SequenceDatasetSerializationTest(
     # Skip nothing
     self.run_core_tests(lambda: self._build_skip_dataset(0), None, 10)
 
+  def testInvalidSkip(self):
+    with self.assertRaisesRegexp(
+        ValueError, 'Shape must be rank 0 but is rank 1'):
+      self.run_core_tests(lambda: self._build_skip_dataset([1, 2]), None, 0)
+
   def _build_take_dataset(self, count):
     components = (np.arange(10),)
     return dataset_ops.Dataset.from_tensor_slices(components).take(count)
@@ -69,6 +74,11 @@ class SequenceDatasetSerializationTest(
     # Take nothing
     self.run_core_tests(lambda: self._build_take_dataset(0), None, 0)
 
+  def testInvalidTake(self):
+    with self.assertRaisesRegexp(
+        ValueError, 'Shape must be rank 0 but is rank 1'):
+      self.run_core_tests(lambda: self._build_take_dataset([1, 2]), None, 0)
+
   def _build_repeat_dataset(self, count, take_count=3):
     components = (np.arange(10),)
     return dataset_ops.Dataset.from_tensor_slices(components).take(
@@ -100,6 +110,12 @@ class SequenceDatasetSerializationTest(
     # Test repeat empty dataset
     self.run_core_tests(lambda: self._build_repeat_dataset(-1, 0), None, 0)
 
+  def testInvalidRepeat(self):
+    with self.assertRaisesRegexp(
+        ValueError, 'Shape must be rank 0 but is rank 1'):
+      self.run_core_tests(lambda: self._build_repeat_dataset([1, 2], 0),
+                          None, 0)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/data/python/ops/BUILD b/tensorflow/contrib/data/python/ops/BUILD
index 647620eb849268abd679d0f4ff9149ab46c30e9a..a1a5c9ed05ff226086885e4e204875d3ca933590 100644
--- a/tensorflow/contrib/data/python/ops/BUILD
+++ b/tensorflow/contrib/data/python/ops/BUILD
@@ -119,6 +119,7 @@ py_library(
     deps = [
         ":contrib_op_loader",
         ":gen_dataset_ops",
+        "//tensorflow/contrib/framework:framework_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:dataset_ops_gen",
@@ -183,15 +184,3 @@ py_library(
         "//tensorflow/python/data/util:sparse",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/data/python/ops/batching.py b/tensorflow/contrib/data/python/ops/batching.py
index a212adf6cf580267f9f1e6959bef95f04a4ad782..1eba010b562a60ec9469f808fd657ca330a8f5d9 100644
--- a/tensorflow/contrib/data/python/ops/batching.py
+++ b/tensorflow/contrib/data/python/ops/batching.py
@@ -17,6 +17,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.contrib.framework import with_shape
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.util import nest
 from tensorflow.python.data.util import sparse
@@ -345,6 +346,45 @@ class _RestructuredDataset(dataset_ops.Dataset):
     return self._output_shapes
 
 
+def assert_element_shape(expected_shapes):
+  """Assert the shape of this `Dataset`.
+
+  ```python
+  shapes = [tf.TensorShape([16, 256]), tf.TensorShape(None)]
+  result = dataset.apply(tf.contrib.data.assert_element_shape(shapes))
+  print(result.output_shapes)  # ==> "((16, 256), <unknown>)"
+  ```
+
+  If dataset shapes and expected_shape, are fully defined, assert they match.
+  Otherwise, add assert op that will validate the shapes when tensors are
+  evaluated, and set shapes on tensors, respectively.
+
+  Args:
+    expected_shapes: A nested structure of `tf.TensorShape` objects.
+
+  Returns:
+    A `Dataset` transformation function, which can be passed to
+    @{tf.data.Dataset.apply}
+  """
+
+  def _check_shape(*elements):
+    flatten_tensors = nest.flatten(elements)
+    flatten_shapes = nest.flatten(expected_shapes)
+    checked_tensors = [with_shape(shape, tensor)
+                       for shape, tensor in zip(flatten_shapes,
+                                                flatten_tensors)]
+    return nest.pack_sequence_as(elements, checked_tensors)
+
+  def _apply_fn(dataset):
+    return _RestructuredDataset(
+        dataset.map(_check_shape),
+        dataset.output_types,
+        output_shapes=expected_shapes,
+        output_classes=dataset.output_classes)
+
+  return _apply_fn
+
+
 class _MapAndBatchDataset(dataset_ops.MapDataset):
   """A `Dataset` that maps a function over a batch of elements."""
 
diff --git a/tensorflow/contrib/data/python/ops/prefetching_ops.py b/tensorflow/contrib/data/python/ops/prefetching_ops.py
index 1438b5426f7a5df7eb6dcc6769d049538ff59267..89c04dc89a2240b047bc678911d19b77be41385a 100644
--- a/tensorflow/contrib/data/python/ops/prefetching_ops.py
+++ b/tensorflow/contrib/data/python/ops/prefetching_ops.py
@@ -25,9 +25,11 @@ from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import iterator_ops
 from tensorflow.python.data.util import nest
 from tensorflow.python.data.util import sparse
+from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import function
 from tensorflow.python.framework import ops
+from tensorflow.python.ops import gen_dataset_ops as core_gen_dataset_ops
 
 
 # TODO(rohanj): Add a python class that constructs resource in the __init__
@@ -36,7 +38,6 @@ def function_buffering_resource(string_arg,
                                 target_device,
                                 f,
                                 buffer_size,
-                                thread_pool_size=0,
                                 container="",
                                 shared_name=None,
                                 name=None):
@@ -48,7 +49,6 @@ def function_buffering_resource(string_arg,
       shared_name=shared_name,
       f=f,
       buffer_size=buffer_size,
-      thread_pool_size=thread_pool_size,
       container=container,
       name=name)
 
@@ -69,29 +69,65 @@ def function_buffering_resource_reset(function_buffer_resource, name=None):
 
 # pylint: disable=protected-access
 class _PrefetchToDeviceIterator(object):
-  """A replacement for @{tf.data.Iterator} that prefetches to another device."""
+  """A replacement for @{tf.data.Iterator} that prefetches to another device.
 
-  def __init__(self, input_dataset, device, buffer_size):
+  Args:
+    input_dataset: The input dataset
+    one_shot: If true, we make a one shot iterator that's already initialized.
+    device: A fully specified device string where we want to prefetch to
+    buffer_size: Size of the prefetching buffer.
+    shared_name: (Optional.) If non-empty, the returned iterator will be
+        shared under the given name across multiple sessions that share the
+        same devices (e.g. when using a remote server).
+
+  Returns:
+    An Iterator type object.
+  """
+
+  def __init__(self,
+               input_dataset,
+               one_shot,
+               device,
+               buffer_size,
+               shared_name=None):
     self._input_dataset = input_dataset
     self._get_next_call_count = 0
-    input_iterator = input_dataset.make_one_shot_iterator()
-    input_iterator_handle = input_iterator.string_handle()
+    self._one_shot = one_shot
+    if shared_name is None:
+      shared_name = ""
+
+    if self._one_shot:
+      self._input_iterator = input_dataset.make_one_shot_iterator()
+    else:
+      self._input_iterator = iterator_ops.Iterator.from_structure(
+          self._input_dataset.output_types, self._input_dataset.output_shapes,
+          shared_name, self._input_dataset.output_classes)
+    input_iterator_handle = self._input_iterator.string_handle()
 
     @function.Defun(dtypes.string)
     def _prefetch_fn(handle):
+      """Prefetches one element from `input_iterator`."""
       remote_iterator = iterator_ops.Iterator.from_string_handle(
-          handle, input_iterator.output_types, input_iterator.output_shapes,
-          input_iterator.output_classes)
-      return remote_iterator.get_next()
+          handle, self._input_iterator.output_types,
+          self._input_iterator.output_shapes,
+          self._input_iterator.output_classes)
+      ret = remote_iterator.get_next()
+      return nest.flatten(sparse.serialize_sparse_tensors(ret))
 
     with ops.device(device):
       self._buffering_resource = function_buffering_resource(
           f=_prefetch_fn,
           target_device=gen_dataset_ops.iterator_get_device(
-              input_iterator._iterator_resource),
+              self._input_iterator._iterator_resource),
           string_arg=input_iterator_handle,
           buffer_size=buffer_size,
-          thread_pool_size=0)
+          shared_name=shared_name)
+
+    if not self._one_shot:
+      reset_op = function_buffering_resource_reset(self._buffering_resource)
+      with ops.control_dependencies([reset_op]):
+        self._initializer = self._input_iterator.make_initializer(
+            self._input_dataset)
 
   def get_next(self, name=None):
     """See @{tf.data.Iterator.get_next}."""
@@ -115,6 +151,12 @@ class _PrefetchToDeviceIterator(object):
 
     return ret
 
+  @property
+  def initializer(self):
+    if self._one_shot:
+      raise NotImplementedError("Can't initialize a one_shot_iterator")
+    return self._initializer
+
   @property
   def output_classes(self):
     return self._input_dataset.output_classes
@@ -126,6 +168,68 @@ class _PrefetchToDeviceIterator(object):
   @property
   def output_types(self):
     return self._input_dataset.output_types
+
+
+class _PrefetchToDeviceEagerIterator(iterator_ops.EagerIterator):
+  """A replacement for @{tf.data.Iterator} that prefetches to another device.
+
+  Args:
+    input_dataset: The input dataset
+    one_shot: If true, we make a one shot iterator that's already initialized.
+    device: A fully specified device string where we want to prefetch to
+    buffer_size: Size of the prefetching buffer.
+    shared_name: (Optional.) If non-empty, the returned iterator will be
+        shared under the given name across multiple sessions that share the
+        same devices (e.g. when using a remote server).
+
+  Returns:
+    An Iterator type object.
+  """
+
+  def __init__(self,
+               input_dataset,
+               device,
+               buffer_size):
+    with ops.device("/device:CPU:0"):
+      super(_PrefetchToDeviceEagerIterator, self).__init__(input_dataset)
+      input_iterator_handle = core_gen_dataset_ops.iterator_to_string_handle(
+          self._resource)
+
+    self._device = device
+
+    @function.Defun(dtypes.string)
+    def _prefetch_fn(handle):
+      """Prefetches one element from `input_iterator`."""
+      remote_iterator = iterator_ops.Iterator.from_string_handle(
+          handle, self.output_types, self.output_shapes, self.output_classes)
+      ret = remote_iterator.get_next()
+      return nest.flatten(sparse.serialize_sparse_tensors(ret))
+
+    _prefetch_fn.add_to_graph(None)
+
+    with ops.device(device):
+      self._buffering_resource = function_buffering_resource(
+          f=_prefetch_fn,
+          target_device=gen_dataset_ops.iterator_get_device(self._resource),
+          string_arg=input_iterator_handle,
+          buffer_size=buffer_size,
+          shared_name=iterator_ops._generate_shared_name(
+              "function_buffer_resource"))
+
+  def _next_internal(self):
+    """Returns a nested structure of `tf.Tensor`s containing the next element.
+    """
+    # This runs in sync mode as iterators use an error status to communicate
+    # that there is no more data to iterate over.
+    # TODO(b/77291417): Fix
+    with context.execution_mode(context.SYNC):
+      with ops.device(self._device):
+        ret = gen_dataset_ops.function_buffering_resource_get_next(
+            function_buffer_resource=self._buffering_resource,
+            output_types=self._flat_output_types)
+      return sparse.deserialize_sparse_tensors(
+          nest.pack_sequence_as(self._output_types, ret), self._output_types,
+          self._output_shapes, self._output_classes)
 # pylint: enable=protected-access
 
 
@@ -137,14 +241,45 @@ class _PrefetchToDeviceDataset(dataset_ops.Dataset):
     self._device = device
     self._buffer_size = buffer_size if buffer_size is not None else 1
 
+  # The static analysis cannot tell that the eager iterator's superclass has
+  # a `next()` method.
+  # pylint: disable=non-iterator-returned
+  def __iter__(self):
+    """Creates an `Iterator` for enumerating the elements of this dataset.
+
+    The returned iterator implements the Python iterator protocol and therefore
+    can only be used in eager mode.
+
+    Returns:
+      An `Iterator` over the elements of this dataset.
+
+    Raises:
+      RuntimeError: If eager execution is enabled.
+    """
+    if context.executing_eagerly():
+      return _PrefetchToDeviceEagerIterator(self._input_dataset, self._device,
+                                            self._buffer_size)
+    else:
+      raise RuntimeError("dataset.__iter__() is only supported when eager "
+                         "execution is enabled.")
+  # pylint: enable=non-iterator-returned
+
   def make_one_shot_iterator(self):
-    return _PrefetchToDeviceIterator(self._input_dataset, self._device,
-                                     self._buffer_size)
+    if context.executing_eagerly():
+      return _PrefetchToDeviceEagerIterator(self._input_dataset, self._device,
+                                            self._buffer_size)
+    else:
+      return _PrefetchToDeviceIterator(self._input_dataset, one_shot=True,
+                                       device=self._device,
+                                       buffer_size=self._buffer_size)
 
   def make_initializable_iterator(self, shared_name=None):
-    raise NotImplementedError("`prefetch_to_device()` is not currently "
-                              "compatible with initializable iterators. Use "
-                              "`make_one_shot_iterator()` instead.")
+    return _PrefetchToDeviceIterator(
+        self._input_dataset,
+        one_shot=False,
+        device=self._device,
+        buffer_size=self._buffer_size,
+        shared_name=shared_name)
 
   def _as_variant_tensor(self):
     # TODO(mrry): Raise this error earlier (e.g. when one of the Dataset
diff --git a/tensorflow/contrib/data/python/ops/readers.py b/tensorflow/contrib/data/python/ops/readers.py
index 95edca6cdd2e22ca5c2ed4b10ebe6462f9446811..4ec8ae1c79d1eb99c56b31c6a0709a84c38f5f90 100644
--- a/tensorflow/contrib/data/python/ops/readers.py
+++ b/tensorflow/contrib/data/python/ops/readers.py
@@ -18,9 +18,11 @@ from __future__ import division
 from __future__ import print_function
 
 import csv
+from math import ceil
 
 import numpy as np
 
+from tensorflow.contrib.data.python.ops import batching
 from tensorflow.contrib.data.python.ops import interleave_ops
 from tensorflow.contrib.data.python.ops import shuffle_ops
 from tensorflow.python.data.ops import dataset_ops
@@ -122,18 +124,21 @@ def _next_csv_row(filenames, num_cols, field_delim, use_quote_delim, header,
 
 def _infer_column_defaults(filenames, num_cols, field_delim, use_quote_delim,
                            na_value, header, comment, float_dtype,
-                           rows_for_inference):
+                           num_rows_for_inference, select_columns):
   """Infers column types from the first N valid CSV records of files."""
-  inferred_types = [None] * num_cols
+  if select_columns is None:
+    select_columns = range(num_cols)
+  inferred_types = [None] * len(select_columns)
 
-  for rows_read, csv_row in enumerate(
+  for i, csv_row in enumerate(
       _next_csv_row(filenames, num_cols, field_delim, use_quote_delim, header,
                     comment)):
-    if rows_for_inference is not None and rows_read >= rows_for_inference:
+    if num_rows_for_inference is not None and i >= num_rows_for_inference:
       break
-    for i, str_val in enumerate(csv_row):
-      inferred_types[i] = _infer_type(str_val, na_value, inferred_types[i],
-                                      float_dtype)
+
+    for j, col_index in enumerate(select_columns):
+      inferred_types[j] = _infer_type(csv_row[col_index], na_value,
+                                      inferred_types[j], float_dtype)
 
   # Replace None's with a default type
   inferred_types = [t or dtypes.string for t in inferred_types]
@@ -160,12 +165,37 @@ def _infer_column_names(filenames, field_delim, use_quote_delim):
   return column_names
 
 
+def _get_sorted_col_indices(select_columns, column_names):
+  """Transforms select_columns argument into sorted column indices."""
+  names_to_indices = {n: i for i, n in enumerate(column_names)}
+  num_cols = len(column_names)
+  for i, v in enumerate(select_columns):
+    if isinstance(v, int):
+      if v < 0 or v >= num_cols:
+        raise ValueError(
+            "Column index %d specified in select_columns out of valid range." %
+            v)
+      continue
+    if v not in names_to_indices:
+      raise ValueError(
+          "Value '%s' specified in select_columns not a valid column index or "
+          "name." % v)
+    select_columns[i] = names_to_indices[v]
+
+  # Sort and ensure there are no duplicates
+  result = sorted(set(select_columns))
+  if len(result) != len(select_columns):
+    raise ValueError("select_columns contains duplicate columns")
+  return result
+
+
 def make_csv_dataset(
     file_pattern,
     batch_size,
     column_names=None,
     column_defaults=None,
     label_name=None,
+    select_columns=None,
     field_delim=",",
     use_quote_delim=True,
     na_value="",
@@ -176,6 +206,9 @@ def make_csv_dataset(
     shuffle_buffer_size=10000,
     shuffle_seed=None,
     prefetch_buffer_size=1,
+    num_parallel_reads=1,
+    num_parallel_parser_calls=2,
+    sloppy=False,
     default_float_type=dtypes.float32,
     num_rows_for_inference=100,
 ):
@@ -196,20 +229,32 @@ def make_csv_dataset(
       provided, infers the column names from the first row of the records.
       These names will be the keys of the features dict of each dataset element.
     column_defaults: A optional list of default values for the CSV fields. One
-      item per column of the input record. Each item in the list is either a
-      valid CSV dtype (float32, float64, int32, int64, or string), or a
+      item per selected column of the input record. Each item in the list is
+      either a valid CSV dtype (float32, float64, int32, int64, or string), or a
       `Tensor` with one of the aforementioned types. The tensor can either be
       a scalar default value (if the column is optional), or an empty tensor (if
       the column is required). If a dtype is provided instead of a tensor, the
       column is also treated as required. If this list is not provided, tries
       to infer types based on reading the first num_rows_for_inference rows of
       files specified, and assumes all columns are optional, defaulting to `0`
-      for numeric values and `""` for string values.
+      for numeric values and `""` for string values. If both this and
+      `select_columns` are specified, these must have the same lengths, and
+      `column_defaults` is assumed to be sorted in order of increasing column
+      index.
     label_name: A optional string corresponding to the label column. If
       provided, the data for this column is returned as a separate `Tensor` from
       the features dictionary, so that the dataset complies with the format
       expected by a `tf.Estimator.train` or `tf.Estimator.evaluate` input
       function.
+    select_columns: An optional list of integer indices or string column
+      names, that specifies a subset of columns of CSV data to select. If
+      column names are provided, these must correspond to names provided in
+      `column_names` or inferred from the file header lines. When this argument
+      is specified, only a subset of CSV columns will be parsed and returned,
+      corresponding to the columns specified. Using this results in faster
+      parsing and lower memory usage. If both this and `column_defaults` are
+      specified, these must have the same lengths, and `column_defaults` is
+      assumed to be sorted in order of increasing column index.
     field_delim: An optional `string`. Defaults to `","`. Char delimiter to
       separate fields in a record.
     use_quote_delim: An optional bool. Defaults to `True`. If false, treats
@@ -231,6 +276,15 @@ def make_csv_dataset(
     prefetch_buffer_size: An int specifying the number of feature batches to
       prefetch for performance improvement. Recommended value is the number of
       batches consumed per training step.
+    num_parallel_reads: Number of threads used to read CSV records from files.
+      If >1, the results will be interleaved.
+    num_parallel_parser_calls: Number of parallel invocations of the CSV parsing
+      function on CSV records.
+    sloppy: If `True`, reading performance will be improved at
+      the cost of non-deterministic ordering. If `False`, the order of elements
+      produced is deterministic prior to shuffling (elements are still
+      randomized if `shuffle=True`. Note that if the seed is set, then order
+      of elements after shuffling is deterministic). Defaults to `False`.
     default_float_type: Either `tf.float32` or `tf.float64`. If defaults are
       not provided, float-like strings are interpreted to be this type.
     num_rows_for_inference: Number of rows of a file to use for type inference
@@ -247,11 +301,16 @@ def make_csv_dataset(
   Raises:
     ValueError: If any of the arguments is malformed.
   """
-  filenames = _get_file_names(file_pattern, shuffle)
+  # Create dataset of all matching filenames
+  filenames = _get_file_names(file_pattern, False)
+  dataset = dataset_ops.Dataset.from_tensor_slices(filenames)
+  if shuffle:
+    dataset = dataset.shuffle(len(filenames), shuffle_seed)
+
+  # Clean arguments; figure out column names and defaults
   if comment is not None and len(comment) != 1:
     raise ValueError("`comment` arg must be a single-character string or None")
 
-  # Clean arguments; figure out column names and defaults
   if column_names is None:
     if not header:
       raise ValueError("Cannot infer column names without a header line.")
@@ -260,6 +319,9 @@ def make_csv_dataset(
   if len(column_names) != len(set(column_names)):
     raise ValueError("Cannot have duplicate column names.")
 
+  if select_columns is not None:
+    select_columns = _get_sorted_col_indices(select_columns, column_names)
+
   if column_defaults is not None:
     column_defaults = [
         constant_op.constant([], dtype=x) if x in _ACCEPTABLE_CSV_TYPES else x
@@ -270,9 +332,18 @@ def make_csv_dataset(
     # construction time
     column_defaults = _infer_column_defaults(
         filenames, len(column_names), field_delim, use_quote_delim, na_value,
-        header, comment, default_float_type, num_rows_for_inference)
+        header, comment, default_float_type, num_rows_for_inference,
+        select_columns)
+
+  if select_columns is not None and len(column_defaults) != len(select_columns):
+    raise ValueError(
+        "If specified, column_defaults and select_columns must have same "
+        "length."
+    )
+  if select_columns is not None and len(column_names) > len(select_columns):
+    # Pick the relevant subset of column names
+    column_names = [column_names[i] for i in select_columns]
 
-  dataset = dataset_ops.Dataset.from_tensor_slices(filenames)
   if label_name is not None and label_name not in column_names:
     raise ValueError("`label_name` provided must be one of the columns.")
 
@@ -304,6 +375,7 @@ def make_csv_dataset(
         field_delim=field_delim,
         use_quote_delim=use_quote_delim,
         na_value=na_value,
+        select_cols=select_columns,
     )
     features = dict(zip(column_names, columns))
     if label_name is not None:
@@ -311,16 +383,31 @@ def make_csv_dataset(
       return features, label
     return features
 
-  # TODO(rachelim): interleave records from files for better shuffling
-  dataset = dataset.flat_map(filename_to_dataset)
-  # TODO(rachelim): use fused shuffle_and_repeat for perf
-  if shuffle:
+  # Read files sequentially or in parallel
+  dataset = dataset.apply(
+      interleave_ops.parallel_interleave(
+          filename_to_dataset, cycle_length=num_parallel_reads, sloppy=sloppy))
+
+  if num_epochs != 1 and shuffle:
+    # Use shuffle_and_repeat for perf
+    dataset = dataset.apply(
+        shuffle_ops.shuffle_and_repeat(shuffle_buffer_size, num_epochs,
+                                       shuffle_seed))
+  elif shuffle:
     dataset = dataset.shuffle(shuffle_buffer_size, shuffle_seed)
-  if num_epochs != 1:
+  elif num_epochs != 1:
     dataset = dataset.repeat(num_epochs)
 
-  dataset = dataset.batch(batch_size)
-  dataset = dataset.map(decode_csv)
+  # Use map_and_batch for perf
+  # TODO(b/76425672): use num_parallel_calls for better performance tuning when
+  # that is added
+  dataset = dataset.apply(
+      batching.map_and_batch(
+          map_func=decode_csv,
+          batch_size=batch_size,
+          num_parallel_batches=int(
+              ceil(num_parallel_parser_calls / batch_size))))
+
   dataset = dataset.prefetch(prefetch_buffer_size)
   return dataset
 
@@ -337,7 +424,8 @@ def make_batched_features_dataset(file_pattern,
                                   prefetch_buffer_size=1,
                                   reader_num_threads=1,
                                   parser_num_threads=2,
-                                  sloppy_ordering=False):
+                                  sloppy_ordering=False,
+                                  drop_final_batch=False):
   """Returns a `Dataset` of feature dictionaries from `Example` protos.
 
   Example:
@@ -410,18 +498,19 @@ def make_batched_features_dataset(file_pattern,
       produced is deterministic prior to shuffling (elements are still
       randomized if `shuffle=True`. Note that if the seed is set, then order
       of elements after shuffling is deterministic). Defaults to `False`.
+    drop_final_batch: If `True`, and the batch size does not evenly divide the
+      input dataset size, the final smaller batch will be dropped. Defaults to
+      `False`.
 
   Returns:
     A dataset of `dict` elements. Each `dict` maps feature keys to
     `Tensor` or `SparseTensor` objects.
   """
   # Create dataset of all matching filenames
+  filenames = _get_file_names(file_pattern, False)
+  dataset = dataset_ops.Dataset.from_tensor_slices(filenames)
   if shuffle:
-    dataset = dataset_ops.Dataset.list_files(file_pattern, shuffle=True)
-  else:
-    # TODO(b/73959787): Use Dataset.list_files() once ordering is deterministic.
-    filenames = _get_file_names(file_pattern, shuffle)
-    dataset = dataset_ops.Dataset.from_tensor_slices(filenames)
+    dataset = dataset.shuffle(len(filenames), shuffle_seed)
 
   # Read `Example` records from files as tensor objects.
   if reader_args is None:
@@ -450,7 +539,10 @@ def make_batched_features_dataset(file_pattern,
   elif shuffle:
     dataset = dataset.shuffle(shuffle_buffer_size, shuffle_seed)
 
-  dataset = dataset.batch(batch_size)
+  if drop_final_batch:
+    dataset = dataset.apply(batching.batch_and_drop_remainder(batch_size))
+  else:
+    dataset = dataset.batch(batch_size)
 
   # Parse `Example` tensors to a dictionary of `Feature` tensors.
   dataset = dataset.map(
diff --git a/tensorflow/contrib/decision_trees/proto/BUILD b/tensorflow/contrib/decision_trees/proto/BUILD
index ae3847b8b62452b1afbe472fcb6369181ec60b73..3b50a48336d77ebd9327fa24e5612a95d5d0c372 100644
--- a/tensorflow/contrib/decision_trees/proto/BUILD
+++ b/tensorflow/contrib/decision_trees/proto/BUILD
@@ -13,14 +13,6 @@ load(
     "tf_pyclif_proto_library",
 )
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 tf_proto_library(
     name = "generic_tree_model",
     srcs = ["generic_tree_model.proto"],
diff --git a/tensorflow/contrib/deprecated/BUILD b/tensorflow/contrib/deprecated/BUILD
index 3dfbbf55273848afb8ad74ad444f0d85b45610bd..401527f1e74f7725d02a3b92a2c661d8ffc11e21 100644
--- a/tensorflow/contrib/deprecated/BUILD
+++ b/tensorflow/contrib/deprecated/BUILD
@@ -30,15 +30,3 @@ py_test(
         "//tensorflow/python:logging_ops",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/distribute/BUILD b/tensorflow/contrib/distribute/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..74b2cd90a187159fd2da8ce236c14e813cc43c49
--- /dev/null
+++ b/tensorflow/contrib/distribute/BUILD
@@ -0,0 +1,36 @@
+# Implementation of a prototype TF distributed computation library.
+
+package(
+    default_visibility = ["//visibility:public"],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
+
+py_library(
+    name = "distribute",
+    srcs = ["__init__.py"],
+    visibility = ["//tensorflow:internal"],
+    deps = [
+        "//tensorflow/contrib/distribute/python:cross_tower_ops",
+        "//tensorflow/contrib/distribute/python:mirrored_strategy",
+        "//tensorflow/contrib/distribute/python:monitor",
+        "//tensorflow/contrib/distribute/python:one_device_strategy",
+        "//tensorflow/contrib/distribute/python:step_fn",
+        "//tensorflow/python:training",
+        "//tensorflow/python:util",
+    ],
+)
diff --git a/tensorflow/contrib/distribute/README.md b/tensorflow/contrib/distribute/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..14de1e8f491634051b6f7c2c5f2bf64f1d73e9a3
--- /dev/null
+++ b/tensorflow/contrib/distribute/README.md
@@ -0,0 +1,140 @@
+# Distribution Strategy
+
+> *NOTE*: This is a experimental feature. The API and performance
+> characteristics are subject to change.
+
+## Overview
+
+[`DistributionStrategy`](https://www.tensorflow.org/versions/master/api_docs/python/tf/contrib/distribute/DistributionStrategy)
+API is an easy way to distribute your training
+across multiple devices/machines. Our goal is to allow users to use existing
+models and training code with minimal changes to enable distributed training.
+Moreover, we've design the API in such a way that it works with both eager and
+graph execution.
+
+Currently we support one type of strategy, called
+[`MirroredStrategy`](https://www.tensorflow.org/versions/master/api_docs/python/tf/contrib/distribute/MirroredStrategy).
+It does in-graph replication with synchronous training
+on many GPUs on one machine. Essentially, we create copies of all variables in
+the model's layers on each device. We then use all-reduce to combine gradients
+across the devices before applying them to the variables to keep them in sync.
+In the future, we intend to support other kinds of training configurations such
+as multi-node, synchronous,
+[asynchronous](https://www.tensorflow.org/deploy/distributed#putting_it_all_together_example_trainer_program),
+parameter servers and model parallelism.
+
+## Example
+
+Let's demonstrate how to use this API with a simple example. We will use the
+[`Estimator`](https://www.tensorflow.org/api_docs/python/tf/estimator/Estimator)
+approach, and show you how to scale your model to run on multiple GPUs on one
+machine using `MirroredStrategy`.
+
+Let's consider a very simple model function which tries to learn a simple
+function.
+
+```python
+def model_fn(features, labels, mode):
+  layer = tf.layers.Dense(1)
+  logits = layer(features)
+
+  if mode == tf.estimator.ModeKeys.PREDICT:
+    predictions = {"logits": logits}
+    return tf.estimator.EstimatorSpec(mode, predictions=predictions)
+
+  loss = tf.losses.mean_squared_error(
+      labels=labels, predictions=tf.reshape(logits, []))
+
+  if mode == tf.estimator.ModeKeys.EVAL:
+    return tf.estimator.EstimatorSpec(mode, loss=loss)
+
+  if mode == tf.estimator.ModeKeys.TRAIN:
+    train_op = tf.train.GradientDescentOptimizer(0.2).minimize(loss_fn())
+    return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)
+```
+
+Let's also define a simple input function to feed data for training this model.
+Note that we require using
+[`tf.data.Dataset`](https://www.tensorflow.org/api_docs/python/tf/data/Dataset)
+with `DistributionStrategy`.
+
+
+```python
+def input_fn():
+  features = tf.data.Dataset.from_tensors([[1.]]).repeat(100)
+  labels = tf.data.Dataset.from_tensors(1.).repeat(100)
+  return dataset_ops.Dataset.zip((features, labels))
+```
+
+Now that we have a model function and input function defined, we can define the
+estimator. To use `MirroredStrategy`, all we need to do is:
+
+* Create an instance of the `MirroredStrategy` class.
+* Pass it to the
+[`RunConfig`](https://www.tensorflow.org/api_docs/python/tf/estimator/RunConfig)
+parameter of `Estimator`.
+
+
+```python
+distribution = tf.contrib.distribute.MirroredStrategy()
+config = tf.estimator.RunConfig(train_distribute=distribution)
+classifier = tf.estimator.Estimator(model_fn=model_fn, config=config)
+classifier.train(input_fn=input_fn)
+```
+
+That's it! This change will now configure estimator to run on all GPUs on your
+machine, with the `MirroredStrategy` approach. It will take care of distributing
+the input dataset, replicating layers and variables on each device, and
+combining and applying gradients.
+
+The model and input functions do not have to change because we have changed the
+underlying components of TensorFlow (such as
+optimizer, batch norm and summaries) to become distribution-aware.
+That means those components know how to
+combine their state across devices. Further, saving and checkpointing works
+seamlessly, so you can save with one or no distribution strategy and resume with
+another.
+
+Above, we showed the easiest way to use [`MirroredStrategy`](https://www.tensorflow.org/versions/master/api_docs/python/tf/contrib/distribute/MirroredStrategy#__init__).
+There are few things you can customize in practice:
+
+* You can specify a list of specific GPUs (using param `devices`) or the number
+of GPUs (using param `num_gpus`), in case you don't want auto detection.
+* You can specify various parameters for all reduce with the `cross_tower_ops`
+param, such as the all reduce algorithm to use, and gradient repacking.
+
+## Performance Tips
+
+We've tried to make it such that you get the best performance for your existing
+model. We also recommend you follow the tips from
+[Input Pipeline Performance Guide](https://www.tensorflow.org/performance/datasets_performance).
+Specifically, we found using [`map_and_batch`](https://www.tensorflow.org/performance/datasets_performance#map_and_batch)
+and [`dataset.prefetch`](https://www.tensorflow.org/performance/datasets_performance#pipelining)
+in the input function gives a solid boost in performance. When using
+`dataset.prefetch`, use `buffer_size=None` to let it detect optimal buffer size.
+
+## Caveats
+This feature is in early stages and there are a lot of improvements forthcoming:
+
+* Metrics are not yet supported during distributed training.
+* Summaries are only computed in the first tower in `MirroredStrategy`.
+* Evaluation is not yet distributed.
+* Eager support is in the works; performance can be more challenging with eager
+execution.
+* As mentioned earlier, multi-node and other distributed strategies will be
+introduced in the future.
+* If you are [`batching`](https://www.tensorflow.org/api_docs/python/tf/data/Dataset#batch)
+your input data, we will place one batch on each GPU in each step. So your
+effective batch size will be `num_gpus * batch_size`. Therefore, consider
+adjusting your learning rate or batch size according to the number of GPUs.
+We are working on addressing this limitation by splitting each batch across GPUs
+instead.
+* PartitionedVariables are not supported yet.
+
+## What's next?
+
+Please give distribution strategies a try. This feature is in early stages and
+is evolving, so we welcome your feedback via
+[issues on GitHub](https://github.com/tensorflow/tensorflow/issues/new).
+
+
diff --git a/tensorflow/contrib/distribute/__init__.py b/tensorflow/contrib/distribute/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..76711baf3a11c8978fbb5770ec173ff74a153158
--- /dev/null
+++ b/tensorflow/contrib/distribute/__init__.py
@@ -0,0 +1,52 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Prototype of a distributed computation library for TF."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# pylint: disable=unused-import,wildcard-import
+from tensorflow.contrib.distribute.python.cross_tower_ops import *
+from tensorflow.contrib.distribute.python.mirrored_strategy import MirroredStrategy
+from tensorflow.contrib.distribute.python.monitor import Monitor
+from tensorflow.contrib.distribute.python.one_device_strategy import OneDeviceStrategy
+from tensorflow.contrib.distribute.python.step_fn import *
+from tensorflow.python.training.distribute import *
+
+from tensorflow.python.util.all_util import remove_undocumented
+
+
+_allowed_symbols = [
+    'AllReduceCrossTowerOps',
+    'CrossTowerOps',
+    'DistributionStrategy',
+    'MirroredStrategy',
+    'Monitor',
+    'OneDeviceStrategy',
+    'ReductionToOneDeviceCrossTowerOps',
+    'Step',
+    'StandardInputStep',
+    'StandardSingleLossStep',
+    'TowerContext',
+    'get_cross_tower_context',
+    'get_distribution_strategy',
+    'get_loss_reduction',
+    'get_tower_context',
+    'has_distribution_strategy',
+    'require_tower_context',
+]
+
+remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/distribute/python/BUILD b/tensorflow/contrib/distribute/python/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..78b2b0054aa95701ad192b4fb9a0727ce287de4b
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/BUILD
@@ -0,0 +1,444 @@
+# Implementation of a prototype TF distributed computation library.
+
+package(
+    default_visibility = [
+        "//tensorflow:internal",
+    ],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+load("//tensorflow:tensorflow.bzl", "py_test")
+load("//tensorflow:tensorflow.bzl", "cuda_py_test")
+
+# TODO(priyag): Figure out testonly issues that are preventing us from
+# including our tests in pip for now.
+
+py_library(
+    name = "values",
+    srcs = ["values.py"],
+    visibility = ["//tensorflow:internal"],
+    deps = [
+        ":prefetching_ops_v2",
+        "//tensorflow/contrib/data/python/ops:transformation_ops",
+        "//tensorflow/contrib/eager/python:datasets",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:checkpointable",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:training",
+        "//tensorflow/python:util",
+        "//tensorflow/python/eager:context",
+        "@six_archive//:six",
+    ],
+)
+
+cuda_py_test(
+    name = "values_test",
+    srcs = ["values_test.py"],
+    additional_deps = [
+        ":mirrored_strategy",
+        ":values",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:training",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python/eager:context",
+        "//tensorflow/python/eager:test",
+        "//tensorflow/python/estimator:model_fn",
+    ],
+)
+
+py_library(
+    name = "mirrored_strategy",
+    srcs = ["mirrored_strategy.py"],
+    visibility = ["//tensorflow:internal"],
+    deps = [
+        ":cross_tower_ops",
+        ":shared_variable_creator",
+        ":values",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:device",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:pywrap_tensorflow",
+        "//tensorflow/python:training",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python/eager:context",
+        "//tensorflow/python/eager:tape",
+        "@six_archive//:six",
+    ],
+)
+
+py_library(
+    name = "one_device_strategy",
+    srcs = ["one_device_strategy.py"],
+    visibility = ["//tensorflow:internal"],
+    deps = [
+        ":values",
+        "//tensorflow/contrib/eager/python:datasets",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:training",
+        "//tensorflow/python/eager:context",
+        "@six_archive//:six",
+    ],
+)
+
+py_library(
+    name = "strategy_test_lib",
+    testonly = 1,
+    srcs = ["strategy_test_lib.py"],
+    srcs_version = "PY2AND3",
+    tags = [
+        "no_pip",
+    ],
+    deps = [
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:layers",
+        "//tensorflow/python:training",
+        "//tensorflow/python:variables",
+        "//tensorflow/python/eager:backprop",
+        "//tensorflow/python/eager:context",
+        "//tensorflow/python/eager:test",
+    ],
+)
+
+py_library(
+    name = "combinations",
+    testonly = 1,
+    srcs = ["combinations.py"],
+    srcs_version = "PY2AND3",
+    tags = [
+        "no_pip",
+    ],
+    deps = [
+        ":mirrored_strategy",
+        ":one_device_strategy",
+        "//tensorflow/contrib/optimizer_v2:training",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:training",
+        "//tensorflow/python:util",
+        "//tensorflow/python/eager:context",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
+py_test(
+    name = "combinations_test",
+    srcs = ["combinations_test.py"],
+    tags = [
+        "no_pip",
+    ],
+    deps = [
+        ":combinations",
+        "//tensorflow/python/eager:test",
+    ],
+)
+
+py_test(
+    name = "mirrored_strategy_test",
+    srcs = ["mirrored_strategy_test.py"],
+    srcs_version = "PY2AND3",
+    tags = [
+        "no_pip",
+    ],
+    deps = [
+        ":mirrored_strategy",
+        ":strategy_test_lib",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:training",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python/eager:context",
+        "//tensorflow/python/eager:test",
+    ],
+)
+
+py_test(
+    name = "one_device_strategy_test",
+    srcs = ["one_device_strategy_test.py"],
+    srcs_version = "PY2AND3",
+    tags = [
+        "no_pip",
+    ],
+    deps = [
+        ":one_device_strategy",
+        ":strategy_test_lib",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python/eager:test",
+    ],
+)
+
+cuda_py_test(
+    name = "mirrored_strategy_multigpu_test",
+    srcs = ["mirrored_strategy_multigpu_test.py"],
+    additional_deps = [
+        ":mirrored_strategy",
+        ":values",
+        ":strategy_test_lib",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:layers",
+        "//tensorflow/python:training",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python/eager:context",
+        "//tensorflow/python/eager:test",
+    ],
+    tags = [
+        "guitar",
+        "no_pip",
+        "multi_and_single_gpu",
+        # Do not perform the extra analysis on this test, because it is already
+        # performed for the `:mirrored_strategy_test` target.
+        "no_oss",
+        "noasan",
+        "notap",
+        "notsan",
+    ],
+)
+
+py_library(
+    name = "step_fn",
+    srcs = ["step_fn.py"],
+    visibility = ["//tensorflow:internal"],
+    deps = [
+        "//tensorflow/python:training",
+        "//tensorflow/python/eager:backprop",
+    ],
+)
+
+cuda_py_test(
+    name = "minimize_loss_test",
+    srcs = ["minimize_loss_test.py"],
+    additional_deps = [
+        ":combinations",
+        ":single_loss_example",
+        "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:variables",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/eager:context",
+        "//tensorflow/python/eager:test",
+        "//tensorflow/python/ops/losses",
+    ],
+    tags = [
+        "multi_and_single_gpu",
+        "no_pip",
+    ],
+)
+
+cuda_py_test(
+    name = "optimizer_v2_test",
+    srcs = ["optimizer_v2_test.py"],
+    additional_deps = [
+        ":combinations",
+        ":single_loss_example",
+        "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:variables",
+        "//tensorflow/python/eager:context",
+        "//tensorflow/python/eager:test",
+    ],
+    tags = [
+        "multi_and_single_gpu",
+        "no_pip",
+    ],
+)
+
+cuda_py_test(
+    name = "estimator_integration_test",
+    srcs = ["estimator_integration_test.py"],
+    additional_deps = [
+        ":combinations",
+        "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/contrib/optimizer_v2:training",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/eager:test",
+        "//tensorflow/python/estimator:dnn_linear_combined",
+        "//tensorflow/python/estimator:export_export",
+        "//tensorflow/python/estimator:numpy_io",
+        "//tensorflow/python/estimator:prediction_keys",
+        "//tensorflow/python/estimator:run_config",
+        "//tensorflow/python/feature_column",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:summary",
+    ],
+    tags = [
+        "multi_and_single_gpu",
+        "no_pip",
+    ],
+)
+
+py_library(
+    name = "single_loss_example",
+    srcs = ["single_loss_example.py"],
+    deps = [
+        ":step_fn",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:layers",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python/data/ops:dataset_ops",
+    ],
+)
+
+cuda_py_test(
+    name = "step_fn_test",
+    srcs = ["step_fn_test.py"],
+    additional_deps = [
+        ":single_loss_example",
+        ":combinations",
+        "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:variables",
+        "//tensorflow/python/eager:context",
+        "//tensorflow/python/eager:test",
+    ],
+    tags = [
+        "multi_and_single_gpu",
+        "no_pip",
+    ],
+)
+
+py_library(
+    name = "monitor",
+    srcs = ["monitor.py"],
+    visibility = ["//tensorflow:internal"],
+    deps = [
+        "//tensorflow/python:variables",
+        "//tensorflow/python/eager:context",
+    ],
+)
+
+cuda_py_test(
+    name = "monitor_test",
+    srcs = ["monitor_test.py"],
+    additional_deps = [
+        ":combinations",
+        ":monitor",
+        ":one_device_strategy",
+        ":single_loss_example",
+        "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python/eager:context",
+        "//tensorflow/python/eager:test",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:training",
+    ],
+    tags = [
+        "multi_and_single_gpu",
+        "no_pip",
+    ],
+)
+
+py_library(
+    name = "shared_variable_creator",
+    srcs = ["shared_variable_creator.py"],
+    visibility = ["//tensorflow:internal"],
+)
+
+py_test(
+    name = "shared_variable_creator_test",
+    srcs = ["shared_variable_creator_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":shared_variable_creator",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python/eager:test",
+    ],
+)
+
+py_library(
+    name = "cross_tower_utils",
+    srcs = ["cross_tower_utils.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/contrib/nccl:nccl_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+    ],
+)
+
+py_library(
+    name = "cross_tower_ops",
+    srcs = ["cross_tower_ops.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":cross_tower_utils",
+        ":values",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:device_lib",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:training",
+        "//tensorflow/python/eager:context",
+        "@six_archive//:six",
+    ],
+)
+
+py_test(
+    name = "cross_tower_ops_test",
+    srcs = ["cross_tower_ops_test.py"],
+    srcs_version = "PY2AND3",
+    tags = [
+        "no_pip",
+    ],
+    deps = [
+        ":combinations",
+        ":cross_tower_ops",
+        ":values",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python/eager:context",
+        "//tensorflow/python/eager:test",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
+py_library(
+    name = "prefetching_ops_v2",
+    srcs = ["prefetching_ops_v2.py"],
+    deps = [
+        "//tensorflow/contrib/data/python/ops:contrib_op_loader",
+        "//tensorflow/contrib/data/python/ops:prefetching_ops",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/util:nest",
+        "//tensorflow/python/data/util:sparse",
+    ],
+)
+
+cuda_py_test(
+    name = "prefetching_ops_v2_test",
+    srcs = ["prefetching_ops_v2_test.py"],
+    additional_deps = [
+        ":prefetching_ops_v2",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/ops:iterator_ops",
+    ],
+)
diff --git a/tensorflow/contrib/distribute/python/combinations.py b/tensorflow/contrib/distribute/python/combinations.py
new file mode 100644
index 0000000000000000000000000000000000000000..02b1e7ef9fcd4767c59898bd343e712e285e67d5
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/combinations.py
@@ -0,0 +1,297 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Facilities for creating multiple test combinations.
+
+Here is an example of testing various optimizers in Eager and Graph mode:
+
+class AdditionExample(test.TestCase, parameterized.TestCase):
+  @combinations.generate(
+     combinations.combine(mode=["graph", "eager"],
+                          optimizer=[AdamOptimizer(),
+                                     GradientDescentOptimizer()]))
+  def testOptimizer(self, optimizer):
+    ... f(optimizer)...
+
+This will run `testOptimizer` 4 times with the specified optimizers: 2 in
+Eager and 2 in Graph mode.
+The test will be provided with arguments that match the arguments of combine
+by name.  It is necessary to request all arguments, except for `mode`, which is
+optional.
+
+`combine()` function is available for creating a cross product of various
+options.  `times()` function exists for creating a product of N `combine()`-ed
+results.  See below.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from collections import OrderedDict
+import sys
+from absl.testing import parameterized
+
+from tensorflow.contrib.distribute.python import mirrored_strategy
+from tensorflow.contrib.distribute.python import one_device_strategy
+from tensorflow.contrib.optimizer_v2 import adam as adam_v2
+from tensorflow.contrib.optimizer_v2 import gradient_descent as gradient_descent_v2
+from tensorflow.python.eager import context
+from tensorflow.python.framework import ops
+from tensorflow.python.training import adam
+from tensorflow.python.training import gradient_descent
+from tensorflow.python.util import tf_inspect
+
+
+GPU_TEST = "test_gpu" in sys.argv[0]
+
+
+def generate(combinations):
+  """A decorator for generating test cases of a test method or a test class.
+
+  Args:
+    combinations: a list of dictionaries created using combine() and times().
+
+  Restrictions:
+   -- there should always be a "mode" argument.  Accepted values are "eager"
+      and "graph".
+   -- arguments of the test method must match by name to get the corresponding
+      value of the combination.  Tests must accept all arguments (except "mode",
+      which is optional).
+   -- distribution argument is special.  It is meant for passing instances of
+      DistributionStrategy.  Each instance is to be passed as `(<int>,
+      <DistributionStrategy>)` tuple, where <int> is the number of required
+      GPUs.  If the required number of GPUs for the DistributionStrategy isn't
+      available then the test case is going to be skipped.
+
+  Returns:
+    a decorator that will cause the test method to be run under the specified
+    conditions.
+
+  Raises:
+    ValueError - if "mode" argument wasn't either "eager" or "graph.
+  """
+
+  def decorator(test_function):
+    """The decorator to be returned."""
+
+    # Generate good test names that can be used with --test_filter.
+    for combination in combinations:
+      # We use OrderedDicts in `combine()` and `times()` to ensure stable
+      # order of keys in each dictionary.
+      assert isinstance(combination, OrderedDict)
+      name = "".join([
+          "_{}_{}".format(
+              "".join(filter(str.isalnum, key)),
+              "".join(filter(str.isalnum, str(value))))
+          for key, value in combination.items()
+      ])
+      combination.update({"testcase_name": "_test{}".format(name)})
+
+    @parameterized.named_parameters(*combinations)
+    def decorated(self, **kwargs):
+      """A wrapped test method that sets up `test_function`."""
+      assert "mode" in kwargs
+      mode = kwargs["mode"]
+
+      if "distribution" in kwargs:
+        distribution = kwargs["distribution"]
+        kwargs["distribution"] = distribution.strategy
+        if not distribution.required_gpus:
+          if GPU_TEST:
+            self.skipTest("Test that doesn't require GPUs.")
+        elif context.num_gpus() < distribution.required_gpus:
+          self.skipTest(
+              "{} GPUs are not available for this test. {} GPUs are available".
+              format(distribution.required_gpus, context.num_gpus()))
+
+      requested_arguments = tf_inspect.getfullargspec(test_function).args
+      missing_arguments = set(list(kwargs.keys()) + ["self"]).difference(
+          set(requested_arguments + ["mode"]))
+      if missing_arguments:
+        raise ValueError("The test is missing arguments {} .".format(
+            missing_arguments))
+
+      kwargs_to_pass = {}
+      for arg in requested_arguments:
+        if arg == "self":
+          kwargs_to_pass[arg] = self
+        else:
+          kwargs_to_pass[arg] = kwargs[arg]
+
+      if mode == "eager":
+        with context.eager_mode(), ops.Graph().as_default():
+          test_function(**kwargs_to_pass)
+      elif mode == "graph":
+        with context.graph_mode(), ops.Graph().as_default():
+          test_function(**kwargs_to_pass)
+      else:
+        raise ValueError(
+            "'mode' has to be either 'eager' or 'graph' and not {}".format(
+                mode))
+
+    return decorated
+  return decorator
+
+
+def combine(**kwargs):
+  """Generate combinations based on its keyword arguments.
+
+  Two sets of returned combinations can be concatenated using +.  Their product
+  can be computed using `times()`.
+
+  Args:
+    **kwargs: keyword arguments of form `option=[possibilities, ...]`.
+
+  Returns:
+    a list of dictionaries for each combination. Keys in the dictionaries are
+    the keyword argument names.  Each key has one value - one of the
+    corresponding keyword argument values.
+  """
+  if not kwargs:
+    return [OrderedDict()]
+
+  sort_by_key = lambda k: k[0][0]
+  kwargs = OrderedDict(sorted(kwargs.items(), key=sort_by_key))
+  first = list(kwargs.items())[0]
+
+  rest = dict(list(kwargs.items())[1:])
+  rest_combined = combine(**rest)
+
+  key = first[0]
+  values = first[1]
+
+  return [
+      OrderedDict(sorted(list(combined.items()) + [(key, v)], key=sort_by_key))
+      for v in values
+      for combined in rest_combined
+  ]
+
+
+def times(*combined):
+  """Generate a product of N sets of combinations.
+
+  times(combine(a=[1,2]), combine(b=[3,4])) == combine(a=[1,2], b=[3,4])
+
+  Args:
+    *combined: N lists of dictionaries that specify combinations.
+
+  Returns:
+    a list of dictionaries for each combination.
+
+  Raises:
+    ValueError: if some of the inputs have overlapping keys.
+  """
+  assert combined
+
+  if len(combined) == 1:
+    return combined[0]
+
+  first = combined[0]
+  rest_combined = times(*combined[1:])
+
+  combined_results = []
+  for a in first:
+    for b in rest_combined:
+      if set(a.keys()).intersection(set(b.keys())):
+        raise ValueError("Keys need to not overlap: {} vs {}".format(
+            a.keys(), b.keys()))
+
+      combined_results.append(OrderedDict(list(a.items()) + list(b.items())))
+  return combined_results
+
+
+class NamedObject(object):
+  """A class that translates an object into a good test name."""
+
+  def __init__(self, name, obj):
+    self._name = name
+    self._obj = obj
+
+  def __getattr__(self, name):
+    return getattr(self._obj, name)
+
+  def __call__(self, *args, **kwargs):
+    return self._obj(*args, **kwargs)
+
+  def __repr__(self):
+    return self._name
+
+
+class NamedDistribution(object):
+  """Translates DistributionStrategy and its data into a good name."""
+
+  def __init__(self, name, distribution, required_gpus):
+    self._distribution = distribution
+    self._name = name
+    self._required_gpus = required_gpus
+
+  def __repr__(self):
+    return self._name
+
+  @property
+  def strategy(self):
+    return self._distribution
+
+  @property
+  def required_gpus(self):
+    return self._required_gpus
+
+
+one_device_strategy = NamedDistribution(
+    "OneDeviceCPU", one_device_strategy.OneDeviceStrategy("/cpu:0"),
+    None)
+mirrored_strategy_with_gpu_and_cpu = NamedDistribution(
+    "MirroredCPUAndGPU",
+    mirrored_strategy.MirroredStrategy(["/gpu:0", "/cpu:0"]), 1)
+mirrored_strategy_without_prefetch = NamedDistribution(
+    "MirroredCPUAndGPUNoPrefetch",
+    mirrored_strategy.MirroredStrategy(
+        ["/gpu:0", "/cpu:0"], prefetch_on_device=False), 1)
+mirrored_strategy_with_two_gpus = NamedDistribution(
+    "Mirrored2GPUs",
+    mirrored_strategy.MirroredStrategy(["/gpu:0", "/gpu:1"]), 2)
+
+adam_optimizer_v1_fn = NamedObject(
+    "AdamV1", lambda: adam.AdamOptimizer(0.2, epsilon=1))
+gradient_descent_optimizer_v1_fn = NamedObject(
+    "GradientDescentV1", lambda: gradient_descent.GradientDescentOptimizer(0.2))
+
+adam_optimizer_v2_fn = NamedObject(
+    "AdamV2", lambda: adam_v2.AdamOptimizer(0.2, epsilon=1))
+gradient_descent_optimizer_v2_fn = NamedObject(
+    "GradientDescentV2",
+    lambda: gradient_descent_v2.GradientDescentOptimizer(0.2))
+
+graph_and_eager_modes = ["graph", "eager"]
+
+
+def distributions_and_v1_optimizers():
+  """A common set of combination with DistributionStrategies and Optimizers."""
+  return combine(
+      distribution=[
+          one_device_strategy, mirrored_strategy_with_gpu_and_cpu,
+          mirrored_strategy_with_two_gpus
+      ],
+      optimizer_fn=[adam_optimizer_v1_fn, gradient_descent_optimizer_v1_fn])
+
+
+def distributions_and_v2_optimizers():
+  """DistributionStrategies and V2 Optimizers."""
+  return combine(
+      distribution=[
+          one_device_strategy, mirrored_strategy_with_gpu_and_cpu,
+          mirrored_strategy_with_two_gpus
+      ],
+      optimizer_fn=[adam_optimizer_v2_fn, gradient_descent_optimizer_v2_fn])
diff --git a/tensorflow/contrib/distribute/python/combinations_test.py b/tensorflow/contrib/distribute/python/combinations_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..219b24160f3902fcfa5363cc39a8fc5b30d00308
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/combinations_test.py
@@ -0,0 +1,115 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for some testing utils from strategy_test_lib."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from collections import OrderedDict
+
+from tensorflow.contrib.distribute.python import combinations
+from tensorflow.python.eager import test
+
+
+class TestingCombinationsTest(test.TestCase):
+
+  def test_combine(self):
+    self.assertEqual([{
+        "a": 1,
+        "b": 2
+    }, {
+        "a": 1,
+        "b": 3
+    }, {
+        "a": 2,
+        "b": 2
+    }, {
+        "a": 2,
+        "b": 3
+    }], combinations.combine(a=[1, 2], b=[2, 3]))
+
+  def test_add(self):
+    self.assertEqual(
+        [{
+            "a": 1
+        }, {
+            "a": 2
+        }, {
+            "b": 2
+        }, {
+            "b": 3
+        }],
+        combinations.combine(a=[1, 2]) +
+        combinations.combine(b=[2, 3]))
+
+  def test_times(self):
+    c1 = combinations.combine(mode=["graph"], loss=["callable", "tensor"])
+    c2 = combinations.combine(mode=["eager"], loss=["callable"])
+    c3 = combinations.combine(distribution=["d1", "d2"])
+    c4 = combinations.times(c3, c1 + c2)
+    self.assertEqual([
+        OrderedDict([("distribution", "d1"), ("loss", "callable"),
+                     ("mode", "graph")]),
+        OrderedDict([("distribution", "d1"), ("loss", "tensor"),
+                     ("mode", "graph")]),
+        OrderedDict([("distribution", "d1"), ("loss", "callable"),
+                     ("mode", "eager")]),
+        OrderedDict([("distribution", "d2"), ("loss", "callable"),
+                     ("mode", "graph")]),
+        OrderedDict([("distribution", "d2"), ("loss", "tensor"),
+                     ("mode", "graph")]),
+        OrderedDict([("distribution", "d2"), ("loss", "callable"),
+                     ("mode", "eager")])
+    ], c4)
+
+  def test_times_variable_arguments(self):
+    c1 = combinations.combine(mode=["graph", "eager"])
+    c2 = combinations.combine(optimizer=["adam", "gd"])
+    c3 = combinations.combine(distribution=["d1", "d2"])
+    c4 = combinations.times(c3, c1, c2)
+    self.assertEqual([
+        OrderedDict([("distribution", "d1"), ("mode", "graph"),
+                     ("optimizer", "adam")]),
+        OrderedDict([("distribution", "d1"), ("mode", "graph"),
+                     ("optimizer", "gd")]),
+        OrderedDict([("distribution", "d1"), ("mode", "eager"),
+                     ("optimizer", "adam")]),
+        OrderedDict([("distribution", "d1"), ("mode", "eager"),
+                     ("optimizer", "gd")]),
+        OrderedDict([("distribution", "d2"), ("mode", "graph"),
+                     ("optimizer", "adam")]),
+        OrderedDict([("distribution", "d2"), ("mode", "graph"),
+                     ("optimizer", "gd")]),
+        OrderedDict([("distribution", "d2"), ("mode", "eager"),
+                     ("optimizer", "adam")]),
+        OrderedDict([("distribution", "d2"), ("mode", "eager"),
+                     ("optimizer", "gd")])
+    ], c4)
+    self.assertEqual(
+        combinations.combine(
+            mode=["graph", "eager"],
+            optimizer=["adam", "gd"],
+            distribution=["d1", "d2"]), c4)
+
+  def test_overlapping_keys(self):
+    c1 = combinations.combine(mode=["graph"], loss=["callable", "tensor"])
+    c2 = combinations.combine(mode=["eager"], loss=["callable"])
+    with self.assertRaisesRegexp(ValueError, ".*Keys.+overlap.+"):
+      _ = combinations.times(c1, c2)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/distribute/python/cross_tower_ops.py b/tensorflow/contrib/distribute/python/cross_tower_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..bbe5e877d59518056db3fea251cdae0ed854d0e4
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/cross_tower_ops.py
@@ -0,0 +1,585 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Classes for different algorithms of reduction and broadcasting."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import six
+
+from tensorflow.contrib.distribute.python import cross_tower_utils
+from tensorflow.contrib.distribute.python import values as value_lib
+from tensorflow.python.client import device_lib
+from tensorflow.python.eager import context
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.training import device_util
+
+
+def _validate_destinations(destinations):
+  if not isinstance(destinations,
+                    (value_lib.DistributedValues, six.string_types, list)):
+    raise ValueError("destinations must be one of a `DistributedValues` object,"
+                     " a device string, a list of device strings or None")
+
+  if not destinations:
+    raise ValueError("destinations can not be empty")
+
+
+def _validate_value_destination_pairs(value_destination_pairs):
+  # pylint: disable=g-missing-docstring
+  if not value_destination_pairs: return False
+  if not isinstance(value_destination_pairs, (list, tuple)): return False
+  if not all([isinstance(pair, tuple) for pair in value_destination_pairs]):
+    return False
+  if not all([isinstance(v[0], value_lib.PerDevice)
+              for v in value_destination_pairs]):
+    return False
+  return True
+
+
+def _get_devices_from(destinations):
+  if isinstance(destinations, value_lib.DistributedValues):
+    return list(destinations.devices)
+  elif isinstance(destinations, six.string_types):
+    return [device_util.canonicalize(destinations)]
+  else:
+    return [
+        device_util.canonicalize(destination) for destination in destinations
+    ]
+
+
+def _devices_match(left, right):
+  return set(_get_devices_from(left)) == set(_get_devices_from(right))
+
+
+def _all_devices_match(value_destination_pairs):
+  if not all([d is None or _devices_match(v, d)
+              for v, d in value_destination_pairs]):
+    return False
+  if not all([_devices_match(v, value_destination_pairs[0][0])
+              for v, _ in value_destination_pairs[1:]]):
+    return False
+  return True
+
+
+def _simple_broadcast(tensor, destinations):
+  index = {}
+  devices = _get_devices_from(destinations)
+  for d in devices:
+    with ops.device(d):
+      index[d] = array_ops.identity(tensor)
+  return value_lib.Mirrored(index)
+
+
+def _simple_reduce(per_device_value, reduce_to_device, accumulation_fn,
+                   method_string):
+  # pylint: disable=g-missing-docstring
+  all_values = []
+  count = 0
+  for v in per_device_value._index.values():  # pylint: disable=protected-access
+    if isinstance(v, value_lib.MapOutput):
+      v_list = v.get()
+      if not v_list:
+        continue
+      count += len(v_list)
+      # Sum within each device before aggregating across devices.
+      v = math_ops.add_n(v_list)
+    else:
+      count += 1
+    all_values.append(v)
+  if not all_values:
+    raise ValueError("`per_device_value` must be non-empty")
+
+  with ops.device(reduce_to_device):
+    with context.context().device_policy(context.DEVICE_PLACEMENT_SILENT):
+      if method_string == "sum":
+        reduced = accumulation_fn(all_values)
+      elif method_string == "mean":
+        reduced = accumulation_fn(all_values) / count
+      else:
+        raise ValueError("`method_string` must be 'sum' or 'mean'")
+  return reduced
+
+
+class CrossTowerOps(object):
+  """Base class for cross-tower reduction and broadcasting algorithms."""
+
+  def __init__(self):
+    pass
+
+  def reduce(self, method_string, per_device_value, destinations=None):
+    """Reduce `per_device_value` to `destinations`.
+
+    It runs the reduction operation defined by `method_string` and put the
+    result on `destinations`.
+
+    Args:
+      method_string: either 'sum' or 'mean' specifying the reduction method.
+      per_device_value: a PerDevice object.
+      destinations: the reduction destinations.
+
+    Returns:
+      a Mirrored object.
+
+    Raises:
+      ValueError: if per_device_value is not a PerDevice object.
+    """
+    if not isinstance(per_device_value, value_lib.PerDevice):
+      raise ValueError("`per_device_value` must be a `PerDevice` object.")
+    if destinations is not None:
+      _validate_destinations(destinations)
+    return self._reduce(method_string, per_device_value, destinations)
+
+  def batch_reduce(self, method_string, value_destination_pairs):
+    """Reduce PerDevice objects in a batch.
+
+    Reduce each first element in `value_destination_pairs` to each second
+    element which indicates the destinations.
+
+    Args:
+      method_string: either 'sum' or 'mean' specifying the reduction method.
+      value_destination_pairs: a list or a tuple of tuples of PerDevice objects
+        and destinations. If a destination is None, then the destinations
+        are set to match the devices of the input PerDevice object.
+
+    Returns:
+      a list of Mirrored objects.
+
+    Raises:
+      ValueError: if `value_destination_pairs` is not a list or a tuple of
+        tuples of PerDevice objects and destinations
+    """
+    if not _validate_value_destination_pairs(value_destination_pairs):
+      raise ValueError("`value_destination_pairs` must be a list or a tuple of "
+                       "tuples of PerDevice objects and destinations")
+    for _, d in value_destination_pairs:
+      if d is not None:
+        _validate_destinations(d)
+
+    return self._batch_reduce(method_string, value_destination_pairs)
+
+  def broadcast(self, tensor, destinations):
+    """Broadcast the `tensor` to destinations.
+
+    Args:
+      tensor: the tensor to broadcast.
+      destinations: the broadcast destinations.
+
+    Returns:
+      a Mirrored object.
+    """
+    _validate_destinations(destinations)
+    return self._broadcast(tensor, destinations)
+
+  def _reduce(self, method_string, per_device_value, destinations):
+    raise NotImplementedError(
+        "_reduce method must be implemented in descendants.")
+
+  def _batch_reduce(self, method_string, value_destination_pairs):
+    raise NotImplementedError(
+        "_batch_reduce method must be implemented in descendants.")
+
+  def _broadcast(self, tensor, destinations):
+    return _simple_broadcast(tensor, destinations)
+
+
+class ReductionToOneDeviceCrossTowerOps(CrossTowerOps):
+  """Always do reduction to one device first and then do broadcasting.
+
+    Batch reduction is done by reduction on each element one by one.
+  """
+
+  def __init__(self, reduce_to_device=None, accumulation_fn=math_ops.add_n):
+    """Constructor.
+
+    Args:
+      reduce_to_device: the intermediate device to reduce to. If None, reduce
+        to the first device in `destinations` of the reduce() method.
+      accumulation_fn: a function that does accumulation.
+    """
+    self.reduce_to_device = reduce_to_device
+    self.accumulation_fn = accumulation_fn
+    super(ReductionToOneDeviceCrossTowerOps, self).__init__()
+
+  def _reduce(self, method_string, per_device_value, destinations):
+    devices = _get_devices_from(destinations or per_device_value)
+    reduce_to_device = self.reduce_to_device or devices[0]
+    reduced = _simple_reduce(per_device_value, reduce_to_device,
+                             self.accumulation_fn, method_string)
+    return self.broadcast(reduced, devices)
+
+  def _batch_reduce(self, method_string, value_destination_pairs):
+    return [self._reduce(method_string, t, destinations=v)
+            for t, v in value_destination_pairs]
+
+
+def _group_value_by_device(per_device_values):
+  """Group values into sublists by their devices.
+
+  This grouping is needed to call the all-reduce library.
+
+  Args:
+    per_device_values: a list of PerDevice obejcts.
+
+  Returns:
+    a list of lists, each sublist has components for its corresponding device of
+      PerDevice objects, paired with a None.
+  """
+  destinations = per_device_values[0].devices
+  grouped = [[] for _ in range(len(destinations))]
+  for per_device_value in per_device_values:
+    # pylint: disable=protected-access
+    for i, v in enumerate(per_device_value._index.values()):
+      assert per_device_value.devices == destinations
+      grouped[i].append((v, None))
+  return grouped
+
+
+def _ungroup_and_make_mirrored(grouped_reduced, destinations, method_string):
+  """Ungroup results from all-reduce and make Mirrored objects.
+
+  Each all-reduce result will be divided by the number of destinations before
+  Mirrored objects are created if method_string is "mean".
+
+  Args:
+    grouped_reduced: a list of lists, each sublist has components for each
+      device, paired with a None. It is the result from
+      cross_tower_utils.aggregate_gradients_using*.
+    destinations: a list of device strings for returned Mirrored objects.
+    method_string: "mean" or "sum".
+
+  Returns:
+    a list of Mirrored objects.
+  """
+  index = [{} for _ in range(len(grouped_reduced[0]))]
+  for d, per_device_reduced in enumerate(grouped_reduced):
+    for i, (v, _) in enumerate(per_device_reduced):
+      if method_string == "mean":
+        index[i][destinations[d]] = v / len(destinations)
+      else:
+        index[i][destinations[d]] = v
+  return [value_lib.Mirrored(v) for v in index]
+
+
+class ConcatAndSplitPacker(object):
+  """Concatenate and split tensors for reduction."""
+
+  def __init__(self, num_packs=1):
+    """Initialize the ConcatAndSplitPacker object.
+
+    Args:
+      num_packs: specifies the number of split packs that will be
+        formed.
+
+    Raises:
+      ValueError: if num_packs is not greater than 0.
+    """
+    if num_packs <= 0:
+      raise ValueError("num_packs must be greater than zero.")
+    self.num_packs = num_packs
+
+  def pack(self, grouped_grads_and_vars):
+    """Pack tensors."""
+    self.grouped_grads_and_vars = grouped_grads_and_vars
+    self.all_tower_shapes = []
+    self.all_tower_sizes = []
+
+    device_grad_packs = []
+    for tower_grads_and_vars in grouped_grads_and_vars:
+      with ops.colocate_with(tower_grads_and_vars[0][0]):
+        # Flatten all the grads.
+        flat_grads = [
+            array_ops.reshape(g, [-1]) for g, _ in tower_grads_and_vars
+        ]
+        # Remember the original shape of all the grads.
+        tower_shapes = [array_ops.shape(g) for g, _ in tower_grads_and_vars]
+        # Remember the original sizes of all the grads.
+        tower_sizes = [array_ops.size(g) for g, _ in tower_grads_and_vars]
+        # Concat all the flat grads into a big flat tensor.
+        concat_grads = array_ops.concat(flat_grads, 0)
+
+        # Split the big tensor into num_splits packs. In cases where the
+        # total size is not divisible num_splits, the last pack gets
+        # more elements.
+        # TODO(zhengxq): it is also possible to optimize away all the concat
+        # as well.
+        num_splits = self.num_packs
+        total_grad_size = array_ops.size(concat_grads)
+        split_size = total_grad_size // num_splits
+        split_size_last = total_grad_size - split_size * (num_splits - 1)
+        split_sizes = [split_size] * (num_splits - 1) + [split_size_last]
+        grad_packs = array_ops.split(concat_grads, split_sizes)
+
+        # Ready to aggregate the repacked gradients, with fake variables.
+        # TODO(zhengxq): It is hacky to have to use fake variables.
+        # We should remove the need for variables in
+        # aggregate_gradients_using*.
+        device_grad_packs.append(zip(grad_packs, [None] * num_splits))
+        self.all_tower_shapes.append(tower_shapes)
+        self.all_tower_sizes.append(tower_sizes)
+
+    return device_grad_packs
+
+  def unpack(self, summed_device_grad_packs):
+    """Reverse the pack."""
+    aggregated_device_grads = []
+    for (summed_tower_grad_packs,
+         tower_grads_and_vars, tower_shapes, tower_sizes) in zip(
+             summed_device_grad_packs, self.grouped_grads_and_vars,
+             self.all_tower_shapes, self.all_tower_sizes):
+      # pylint: enable=line-too-long
+      # Reverse the packing operations in the previous steps. Form the
+      # summed gradients back into their original shapes.
+      with ops.colocate_with(summed_tower_grad_packs[0][0]):
+        # Form a list of the summed grad packs.
+        device_grad_packs = [g for g, _ in summed_tower_grad_packs]
+
+        # Concat them back into a big flat tensor.
+        device_grads_concat = array_ops.concat(device_grad_packs, 0)
+
+        # Split the tensors back into their original sizes.
+        grads_with_sizes = array_ops.split(device_grads_concat, tower_sizes)
+
+        # Reshape the tensors back into their original shapes.
+        grads_with_shapes = [
+            array_ops.reshape(grad, shape)
+            for shape, grad in zip(tower_shapes, grads_with_sizes)
+        ]
+
+        # Form the list with the original list of variables.
+        summed_tower_grads = [
+            (g, v) for g, (_, v) in zip(grads_with_shapes, tower_grads_and_vars)
+        ]
+        aggregated_device_grads.append(summed_tower_grads)
+    return aggregated_device_grads
+
+
+class AggregateSmallTensorPacker(object):
+  """Concatenate small gradient tensors together for reduction."""
+
+  def __init__(self,
+               agg_small_grads_max_bytes=1048576,
+               agg_small_grads_max_group=16):
+    """Initialize the AggregateSmallTensorPacker object.
+
+    Args:
+      agg_small_grads_max_bytes: largest tensor eligible for aggregation,
+        in number of bytes.
+      agg_small_grads_max_group: largest permitted aggregation of small
+        tensors.
+
+    Raises:
+      ValueError: if `agg_small_grads_max_bytes` or `agg_small_grads_max_group`
+        is not greater than 0.
+    """
+    if agg_small_grads_max_bytes <= 0 or agg_small_grads_max_group <= 0:
+      raise ValueError("agg_small_grads_max_bytes and agg_small_grads_max_group"
+                       " should both be greater than zero.")
+    self.agg_small_grads_max_bytes = agg_small_grads_max_bytes
+    self.agg_small_grads_max_group = agg_small_grads_max_group
+
+  def pack(self, grouped_grads_and_vars):
+    """Aggregate small tensors."""
+    if (self.agg_small_grads_max_bytes > 0 and
+        self.agg_small_grads_max_group > 0):
+      tower_grads, self.packing = cross_tower_utils.pack_small_tensors(
+          grouped_grads_and_vars,
+          max_bytes=self.agg_small_grads_max_bytes,
+          max_group=self.agg_small_grads_max_group)
+    return tower_grads
+
+  def unpack(self, summed_device_grad_packs):
+    """Reverse the aggregation process."""
+    return cross_tower_utils.unpack_small_tensors(summed_device_grad_packs,
+                                                  self.packing)
+
+
+class AllReduceCrossTowerOps(CrossTowerOps):
+  """Reduction using all reduce."""
+
+  def __init__(self,
+               all_reduce_alg="nccl",
+               num_packs=1,
+               agg_small_grads_max_bytes=0,
+               agg_small_grads_max_group=10):
+    """All-reduce implementation of CrossTowerOps.
+
+    Before performing all-reduce, tensors will be repacked or aggregated for
+    more efficient cross-device transportation:
+      1) If `num_packs` is non-zero, pack values into
+        `num_packs` splits.
+      2) Otherwise, if `agg_small_grads_max_bytes` > 0 and
+        `agg_small_grads_max_group` > 0, aggregate values smaller than
+        `agg_small_grads_max_bytes` into groups with at most
+        `agg_small_grads_max_group` values.
+      3) Otherwise, no repacking or grouping will happen.
+
+    Args:
+      all_reduce_alg: the all-reduce algorithm to use, currently only "nccl" or
+        "hierarchical_copy" are supported.
+      num_packs: see above.
+      agg_small_grads_max_bytes: see above.
+      agg_small_grads_max_group: see above.
+        tensors.
+    """
+    self.all_reduce_alg = all_reduce_alg
+    self.num_packs = num_packs
+    self.agg_small_grads_max_bytes = agg_small_grads_max_bytes
+    self.agg_small_grads_max_group = agg_small_grads_max_group
+    super(AllReduceCrossTowerOps, self).__init__()
+
+  def _reduce(self, method_string, per_device_value, destinations):
+    if ((destinations is None or _devices_match(per_device_value, destinations))
+        and not context.executing_eagerly()):
+      return self._batch_all_reduce(method_string, [per_device_value])[0]
+    else:
+      devices = _get_devices_from(destinations or per_device_value)
+      reduce_to_device = devices[0]
+      reduced = _simple_reduce(per_device_value, reduce_to_device,
+                               math_ops.add_n, method_string)
+      return self.broadcast(reduced, devices)
+
+  def _batch_reduce(self, method_string, value_destination_pairs):
+    if (_all_devices_match(value_destination_pairs) and
+        not context.executing_eagerly()):
+      return self._batch_all_reduce(method_string,
+                                    [v[0] for v in value_destination_pairs])
+    else:
+      if not context.executing_eagerly():
+        logging.warning("Efficient batch_reduce is not supported if "
+                        "destinations are different.")
+      return [
+          self._reduce(method_string, t, destinations=v)
+          for t, v in value_destination_pairs
+      ]
+
+  def _batch_all_reduce(self, method_string, per_device_values):
+    """All reduce algorithm in a batch."""
+    destinations = per_device_values[0].devices
+    grouped = _group_value_by_device(per_device_values)
+    if self.num_packs > 0:
+      logging.info(
+          "batch_all_reduce invoked for batches size = %d with "
+          "algorithm = %s and num_packs = %d", len(per_device_values),
+          self.all_reduce_alg, self.num_packs)
+      tensor_packer = ConcatAndSplitPacker(self.num_packs)
+      device_grad_packs = tensor_packer.pack(grouped)
+    elif (self.agg_small_grads_max_bytes > 0 and
+          self.agg_small_grads_max_group > 0):
+      logging.info(
+          "batch_all_reduce invoked for batches size = %d with "
+          "algorithm = %s, agg_small_grads_max_bytes = %d and "
+          "agg_small_grads_max_group = %d", len(per_device_values),
+          self.all_reduce_alg, self.agg_small_grads_max_bytes,
+          self.agg_small_grads_max_group)
+      tensor_packer = AggregateSmallTensorPacker(100, 10)
+      device_grad_packs = tensor_packer.pack(grouped)
+    else:
+      logging.info(
+          "batch_all_reduce invoked for batches size = %d with algorithm = %s",
+          len(per_device_values), self.all_reduce_alg)
+      tensor_packer = None
+      device_grad_packs = grouped
+
+    # The actual aggregation of the repacked gradients. Note that they are
+    # sharded among different aggregation trees. So it is important to strike
+    # the balance on num_splits.
+    if self.all_reduce_alg == "nccl":
+      reduced = cross_tower_utils.aggregate_gradients_using_nccl(
+          device_grad_packs)
+    else:
+      # TODO(yuefengz): check that gpu ids in `destinations` are in ascending
+      # order.
+      reduced = (
+          cross_tower_utils.aggregate_gradients_using_hierarchical_copy(
+              destinations, device_grad_packs))
+
+    if tensor_packer:
+      reduced = tensor_packer.unpack(reduced)
+
+    return _ungroup_and_make_mirrored(reduced, per_device_values[0].devices,
+                                      method_string)
+
+
+_dgx1_links = [[1, 2, 3, 4], [0, 2, 3, 5], [0, 1, 3, 6], [0, 1, 2, 7],
+               [0, 5, 6, 7], [1, 4, 6, 7], [2, 4, 5, 7], [3, 4, 5, 6]]
+
+
+def _has_dgx1_like_links(gpu_links):
+  if not gpu_links:
+    return False
+  # TODO(yuefengz): figure out the right topology for hierarchial copy if
+  # number of gpus are less than 8.
+  if len(gpu_links) < 8:
+    return False
+  for i, (gpu_link, dgx1_link) in enumerate(zip(gpu_links, _dgx1_links)):
+    if (set(gpu_link) != set(dgx1_link) and
+        set(gpu_link) != set(dgx1_link + [i])):
+      return False
+  return True
+
+
+def _choose_all_reduce_algorithm(device_links):
+  if _has_dgx1_like_links(device_links):
+    logging.info("Configured hierarchical_copy with num_packs=%d",
+                 len(device_links))
+    return AllReduceCrossTowerOps(
+        "hierarchical_copy", num_packs=len(device_links))
+  else:
+    logging.info("Configured nccl all-reduce.")
+    return AllReduceCrossTowerOps("nccl", num_packs=1)
+
+
+def choose_the_best(devices, session_config=None):
+  """Find the best subclass of CrossTowerOps given a tensorflow session.
+
+  Args:
+    devices: a list of devices passed for distribute strategy.
+    session_config: a tensorflow session config or None. If None, it will make
+      deciesion based on all local devices.
+
+  Returns:
+    a subclass of CrossTowerOps.
+  """
+  requested_devices = set([device_util.canonicalize(d) for d in devices])
+  machine_devices = device_lib.list_local_devices(session_config=session_config)
+  using_devices = []
+  for d in machine_devices:
+    if device_util.canonicalize(d.name) in requested_devices:
+      using_devices.append(d)
+    else:
+      logging.info(
+          "Device is available but not used by distribute strategy: %s", d.name)
+
+  if len(using_devices) != len(requested_devices):
+    logging.warning("Not all devices in distribute strategy are visible by "
+                    "TensorFlow sessions.")
+    return ReductionToOneDeviceCrossTowerOps()
+
+  if any([d.device_type.lower() != "gpu" for d in using_devices]):
+    logging.warning("Not all devices in DistributionStrategy are visible to "
+                    "TensorFlow session.")
+    return ReductionToOneDeviceCrossTowerOps()
+
+  device_links = [[] for _ in range(len(using_devices))]
+  for i, device in enumerate(using_devices):
+    for link in device.locality.links.link:
+      device_links[i].append(link.device_id)
+
+  return _choose_all_reduce_algorithm(device_links)
diff --git a/tensorflow/contrib/distribute/python/cross_tower_ops_test.py b/tensorflow/contrib/distribute/python/cross_tower_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..7c7b0870887465ec2fe40007695d099277db38bf
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/cross_tower_ops_test.py
@@ -0,0 +1,221 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for CrossTowerOps."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import itertools
+
+from absl.testing import parameterized
+
+from tensorflow.contrib.distribute.python import combinations
+from tensorflow.contrib.distribute.python import cross_tower_ops as cross_tower_ops_lib
+from tensorflow.contrib.distribute.python import values as value_lib
+from tensorflow.python.eager import context
+from tensorflow.python.eager import test
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+
+
+def _make_per_device(values, devices):
+  devices = cross_tower_ops_lib._get_devices_from(devices)
+  assert len(values) == len(devices)
+  index = {}
+  for d, v in zip(devices, values):
+    with ops.device(d):
+      placed_v = array_ops.identity(v)
+    index[d] = placed_v
+  return value_lib.PerDevice(index)
+
+
+# pylint: disable=g-doc-args,g-doc-return-or-yield
+def _fake_mirrored(value, devices):
+  """Create a faked Mirrored object for testing.
+
+  All components of the returned Mirrored have the same objects, which is not
+  true in reality.
+  """
+  devices = cross_tower_ops_lib._get_devices_from(devices)
+  return value_lib.Mirrored(
+      {d: v for d, v in zip(devices, [value] * len(devices))})
+
+
+_cpu_device = "/device:CPU:0"
+
+
+class CrossTowerOpsTest(test.TestCase, parameterized.TestCase):
+
+  def _assert_value_equal(self, left, right):
+    if isinstance(left, list):
+      for l, r in zip(left, right):
+        self._assert_value_equal(l, r)
+    else:
+      self.assertEqual(type(left), type(right))
+      self.assertEqual(left.devices, right.devices)
+      if context.executing_eagerly():
+        self.assertEqual([v.numpy() for v in left._index.values()],
+                         list(right._index.values()))
+      else:
+        with self.test_session() as sess:
+          self.assertEqual(
+              sess.run(list(left._index.values())), list(right._index.values()))
+
+  # TODO(yuefengz): decouple the num_gpus check from distribution in
+  # combinations module so that we can pass in devices instead of a distribution
+  # strategy.
+  reduction_to_one_combinations = combinations.combine(
+      cross_tower_ops=[
+          combinations.NamedObject(
+              "DefaultReductionToOneDeviceCrossTowerOps",
+              cross_tower_ops_lib.ReductionToOneDeviceCrossTowerOps()),
+          combinations.NamedObject(
+              "ReductionToCPUDeviceCrossTowerOps",
+              cross_tower_ops_lib.ReductionToOneDeviceCrossTowerOps(
+                  reduce_to_device=_cpu_device)),
+          combinations.NamedObject(
+              "AccumulateNCrossTowerOp",
+              cross_tower_ops_lib.ReductionToOneDeviceCrossTowerOps(
+                  accumulation_fn=math_ops.accumulate_n)),
+      ],
+      distribution=[
+          combinations.one_device_strategy,
+          combinations.mirrored_strategy_with_gpu_and_cpu,
+          combinations.mirrored_strategy_with_two_gpus
+      ],
+      mode=["graph", "eager"])
+  allreduce_combinations = combinations.combine(
+      cross_tower_ops=[
+          combinations.NamedObject(
+              "AllReduce",
+              cross_tower_ops_lib.AllReduceCrossTowerOps("nccl", 1, 0, 0)),
+          combinations.NamedObject(
+              "HierarchicalCopy",
+              cross_tower_ops_lib.AllReduceCrossTowerOps(
+                  "hierarchical_copy", 8, 0, 0)),
+          combinations.NamedObject(
+              "AllReduceNoGradientRepacking",
+              cross_tower_ops_lib.AllReduceCrossTowerOps("nccl", 0, 0, 0)),
+          combinations.NamedObject(
+              "HierarchicalCopyAggregateSmallTensors",
+              cross_tower_ops_lib.AllReduceCrossTowerOps(
+                  "hierarchical_copy", 0, 100, 10))
+      ],
+      distribution=[combinations.mirrored_strategy_with_two_gpus],
+      mode=["graph", "eager"])
+
+  @combinations.generate(reduction_to_one_combinations + allreduce_combinations)
+  def testReductionAndBroadcast(self, cross_tower_ops, distribution):
+    devices = distribution.worker_devices
+
+    values = [constant_op.constant(float(d)) for d in range(len(devices))]
+    per_device = _make_per_device(values, devices)
+    mean = (len(devices) - 1.) / 2.
+
+    values_2 = [constant_op.constant(d + 1.0) for d in range(len(devices))]
+    per_device_2 = _make_per_device(values_2, devices)
+    mean_2 = mean + 1.
+
+    destination_mirrored = _fake_mirrored(1., devices)
+    destination_different = _fake_mirrored(1., _cpu_device)
+    destination_str = _cpu_device
+    destination_list = devices
+
+    all_destinations = [
+        None, destination_mirrored, destination_different, destination_str,
+        destination_list
+    ]
+
+    # test reduce()
+    for destinations in all_destinations:
+      self._assert_value_equal(
+          cross_tower_ops.reduce("mean", per_device, destinations=destinations),
+          _fake_mirrored(mean, destinations or per_device))
+      self._assert_value_equal(
+          cross_tower_ops.reduce(
+              "mean", per_device_2, destinations=destinations),
+          _fake_mirrored(mean_2, destinations or per_device))
+      self._assert_value_equal(
+          cross_tower_ops.reduce("sum", per_device, destinations=destinations),
+          _fake_mirrored(mean * len(devices), destinations or per_device))
+      self._assert_value_equal(
+          cross_tower_ops.reduce(
+              "sum", per_device_2, destinations=destinations),
+          _fake_mirrored(mean_2 * len(devices), destinations or per_device))
+
+    # test batch_reduce()
+    for d1, d2 in itertools.product(all_destinations, all_destinations):
+      self._assert_value_equal(
+          cross_tower_ops.batch_reduce(
+              "mean", [(per_device, d1), (per_device_2, d2)]),
+          [_fake_mirrored(mean, d1 or per_device),
+           _fake_mirrored(mean_2, d2 or per_device_2)])
+      self._assert_value_equal(
+          cross_tower_ops.batch_reduce(
+              "sum", [(per_device, d1), (per_device_2, d2)]),
+          [_fake_mirrored(mean * len(devices), d1 or per_device),
+           _fake_mirrored(mean_2 * len(devices), d2 or per_device_2)])
+
+    # test broadcast()
+    for destinations in all_destinations:
+      if destinations is None:
+        continue
+      else:
+        self._assert_value_equal(
+            cross_tower_ops.broadcast(constant_op.constant(1.), destinations),
+            _fake_mirrored(1., destinations))
+
+  def testChooseAlgorithm(self):
+    device_links = [[1, 2, 3, 4], [0, 2, 3, 5], [0, 1, 3, 6], [0, 1, 2, 7],
+                    [0, 5, 6, 7], [1, 4, 6, 7], [2, 4, 5, 7], [3, 4, 5, 6]]
+    result = cross_tower_ops_lib._choose_all_reduce_algorithm(device_links)
+    self.assertTrue(
+        isinstance(result, cross_tower_ops_lib.AllReduceCrossTowerOps))
+    self.assertEqual(result.all_reduce_alg, "hierarchical_copy")
+    self.assertEqual(result.num_packs, 8)
+
+    # if there are only 4 devices
+    device_links = [[1, 2, 3, 4], [0, 2, 3, 5], [0, 1, 3, 6], [0, 1, 2, 7]]
+    result = cross_tower_ops_lib._choose_all_reduce_algorithm(device_links)
+    self.assertTrue(
+        isinstance(result, cross_tower_ops_lib.AllReduceCrossTowerOps))
+    self.assertEqual(result.all_reduce_alg, "nccl")
+    self.assertEqual(result.num_packs, 1)
+
+    # if devices links contain each device itself
+    device_links = [[0, 1, 2, 3, 4], [0, 1, 2, 3, 5], [0, 1, 2, 3, 6],
+                    [0, 1, 2, 3, 7], [0, 4, 5, 6, 7], [1, 4, 5, 6, 7],
+                    [2, 4, 5, 6, 7], [3, 4, 5, 6, 7]]
+    result = cross_tower_ops_lib._choose_all_reduce_algorithm(device_links)
+    self.assertTrue(
+        isinstance(result, cross_tower_ops_lib.AllReduceCrossTowerOps))
+    self.assertEqual(result.all_reduce_alg, "hierarchical_copy")
+    self.assertEqual(result.num_packs, 8)
+
+    # if not dgx1-like links
+    device_links = [[0, 2, 3, 5], [0, 1, 3, 6], [0, 1, 2, 7], [0, 5, 6, 7],
+                    [1, 4, 6, 7], [2, 4, 5, 7], [3, 4, 5, 6], [1, 2, 3, 4]]
+    result = cross_tower_ops_lib._choose_all_reduce_algorithm(device_links)
+    self.assertTrue(
+        isinstance(result, cross_tower_ops_lib.AllReduceCrossTowerOps))
+    self.assertEqual(result.all_reduce_alg, "nccl")
+    self.assertEqual(result.num_packs, 1)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/distribute/python/cross_tower_utils.py b/tensorflow/contrib/distribute/python/cross_tower_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..fc04e2195f6d305e0f7c642f24c355286f1a8cfa
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/cross_tower_utils.py
@@ -0,0 +1,339 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utilities for cross_tower_ops."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections as pycoll
+
+from tensorflow.contrib import nccl
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+
+
+def aggregate_gradients_using_nccl(tower_grads):
+  """Aggregate gradients using nccl allreduce."""
+  agg_all_g_and_v = []
+  for single_g_and_v in zip(*tower_grads):
+    single_grads = [g for g, _ in single_g_and_v]
+    agg_grads = nccl.all_sum(single_grads)
+    agg_all_g_and_v.append(
+        [(g, v) for g, (_, v) in zip(agg_grads, single_g_and_v)])
+
+  agg_all_g_and_v = list(zip(*agg_all_g_and_v))
+
+  return agg_all_g_and_v
+
+
+def aggregate_gradients_using_hierarchical_copy(avail_devices, tower_grads):
+  """Aggregate gradients using hierarchical copies.
+
+  Args:
+    avail_devices: available GPU devices.
+    tower_grads: List of lists of (gradient, variable) tuples. The outer list
+      is over towers. The inner list is over individual gradients.
+
+  Returns:
+    The list of (aggregated_gradient, variable), where the gradient has been
+      summed across all towers and the variable is chosen from the first tower.
+  """
+  # This only works for DGX-1 type of machine topology
+  # Device peer to peer matrix
+  # DMA: 0 1 2 3 4 5 6 7
+  # 0:   Y Y Y Y Y N N N
+  # 1:   Y Y Y Y N Y N N
+  # 2:   Y Y Y Y N N Y N
+  # 3:   Y Y Y Y N N N Y
+  # 4:   Y N N N Y Y Y Y
+  # 5:   N Y N N Y Y Y Y
+  # 6:   N N Y N Y Y Y Y
+  # 7:   N N N Y Y Y Y Y
+  agg_grads = []
+  num_devices = len(avail_devices)
+  # In the special case of DGX-1 machine topology, the two groups have equal
+  # size.
+  group_size = num_devices // 2
+  for i, single_grads in enumerate(zip(*tower_grads)):
+    group_0_main_device = i % num_devices
+    group_1_main_device = (group_0_main_device + group_size) % num_devices
+    if group_0_main_device < group_size:
+      group_0_begin = 0
+      group_1_begin = group_size
+    else:
+      group_0_begin = group_size
+      group_1_begin = 0
+
+    # Aggregate the first group.
+    group_0_device_grads = single_grads[group_0_begin:
+                                        group_0_begin + group_size]
+    with ops.device(avail_devices[group_0_main_device]):
+      group_0_agg_grads, _ = aggregate_single_gradient_using_copy(
+          group_0_device_grads, False, False)
+
+    # Aggregate the second group.
+    group_1_device_grads = single_grads[group_1_begin:
+                                        group_1_begin + group_size]
+    with ops.device(avail_devices[group_1_main_device]):
+      group_1_agg_grads, _ = aggregate_single_gradient_using_copy(
+          group_1_device_grads, False, False)
+
+    # Aggregate between the groups.
+    with ops.device(avail_devices[group_0_main_device]):
+      (agg_total_grads, _), _ = aggregate_single_gradient_using_copy(
+          [group_0_agg_grads, group_1_agg_grads], False, False)
+
+    # Broadcast the result back into the root of each group.
+    with ops.device(avail_devices[group_0_main_device]):
+      group_0_agg_grads_bcast = array_ops.identity(agg_total_grads)
+    with ops.device(avail_devices[group_1_main_device]):
+      group_1_agg_grads_bcast = array_ops.identity(agg_total_grads)
+
+    agg_grads_bcast = []
+    for j in range(len(single_grads)):
+      with ops.device(avail_devices[j]):
+        # Broadcast the result back to each member in the group from the root.
+        if (group_0_main_device < group_size) == (j < group_size):
+          src_device_grad = group_0_agg_grads_bcast
+        else:
+          src_device_grad = group_1_agg_grads_bcast
+        agg_grads_bcast.append(array_ops.identity(src_device_grad))
+
+    agg_grads.append(
+        [(g, v) for g, (_, v) in zip(agg_grads_bcast, single_grads)])
+
+  agg_grads = list(zip(*agg_grads))
+
+  return agg_grads
+
+
+def aggregate_single_gradient_using_copy(grad_and_vars, use_mean,
+                                         check_inf_nan):
+  """Calculate the average gradient for a shared variable across all towers.
+
+  Note that this function provides a synchronization point across all towers.
+
+  Args:
+    grad_and_vars: A list or tuple of (gradient, variable) tuples. Each
+      (gradient, variable) pair within the outer list represents the gradient
+      of the variable calculated for a single tower, and the number of pairs
+      equals the number of towers.
+    use_mean: if True, mean is taken, else sum of gradients is taken.
+    check_inf_nan: check grads for nans and infs.
+
+  Returns:
+    The tuple ([(average_gradient, variable),], has_nan_or_inf) where the
+      gradient has been averaged across all towers. The variable is chosen from
+      the first tower. The has_nan_or_inf indicates the grads has nan or inf.
+  """
+  grads = [g for g, _ in grad_and_vars]
+  grad = math_ops.add_n(grads)
+
+  if use_mean and len(grads) > 1:
+    grad = array_ops.multiply(grad, 1.0 / len(grads))
+
+  v = grad_and_vars[0][1]
+  if check_inf_nan:
+    has_nan_or_inf = array_ops.logical_not(
+        array_ops.reduce_all(array_ops.is_finite(grads)))
+    return (grad, v), has_nan_or_inf
+  else:
+    return (grad, v), None
+
+
+def extract_ranges(index_list, range_size_limit=32):
+  """Extract consecutive ranges and singles from index_list.
+
+  Args:
+    index_list: List of monotone increasing non-negative integers.
+    range_size_limit: Largest size range to return.  If a larger
+      consecutive range exists, it will be returned as multiple
+      ranges.
+
+  Returns:
+    (ranges, singles) where ranges is a list of [first, last] pairs of
+      consecutive elements in index_list, and singles is all of the
+      other elements, in original order.
+  """
+  if not index_list:
+    return [], []
+  first = index_list[0]
+  last = first
+  ranges = []
+  singles = []
+  for i in index_list[1:]:
+    if i == last + 1 and (last - first) <= range_size_limit:
+      last = i
+    else:
+      if last > first:
+        ranges.append([first, last])
+      else:
+        singles.append(first)
+      first = i
+      last = i
+  if last > first:
+    ranges.append([first, last])
+  else:
+    singles.append(first)
+  return ranges, singles
+
+
+GradPackTuple = pycoll.namedtuple('GradPackTuple', 'indices vars shapes')
+
+
+def pack_range(key, packing, grad_vars, rng):
+  """Form the concatenation of a specified range of gradient tensors.
+
+  Args:
+    key: Value under which to store meta-data in packing that will be used
+      later to restore the grad_var list structure.
+    packing: Dict holding data describing packed ranges of small tensors.
+    grad_vars: List of (grad, var) pairs for one tower.
+    rng: A pair of integers giving the first, last indices of a consecutive
+      range of tensors to be packed.
+
+  Returns:
+    A tensor that is the concatenation of all the specified small tensors.
+  """
+  to_pack = grad_vars[rng[0]:rng[1] + 1]
+  members = []
+  variables = []
+  restore_shapes = []
+  with ops.name_scope('pack'):
+    for g, v in to_pack:
+      variables.append(v)
+      restore_shapes.append(g.shape)
+      with ops.device(g.device):
+        members.append(array_ops.reshape(g, [-1]))
+    packing[key] = GradPackTuple(
+        indices=range(rng[0], rng[1] + 1),
+        vars=variables,
+        shapes=restore_shapes)
+    with ops.device(members[0].device):
+      return array_ops.concat(members, 0)
+
+
+def unpack_grad_tuple(gv, gpt):
+  """Unpack a previously packed collection of gradient tensors.
+
+  Args:
+    gv: A (grad, var) pair to be unpacked.
+    gpt: A GradPackTuple describing the packing operation that produced gv.
+
+  Returns:
+    A list of (grad, var) pairs corresponding to the values that were
+     originally packed into gv, maybe following subsequent operations like
+     reduction.
+  """
+  elt_widths = [x.num_elements() for x in gpt.shapes]
+  with ops.device(gv[0][0].device):
+    with ops.name_scope('unpack'):
+      splits = array_ops.split(gv[0], elt_widths)
+      unpacked_gv = []
+      for idx, s in enumerate(splits):
+        unpacked_gv.append((array_ops.reshape(s, gpt.shapes[idx]),
+                            gpt.vars[idx]))
+  return unpacked_gv
+
+
+def pack_small_tensors(tower_grads, max_bytes=0, max_group=0):
+  """Concatenate small gradient tensors together for reduction.
+
+  Args:
+    tower_grads: List of lists of (gradient, variable) tuples.
+    max_bytes: Int giving max number of bytes in a tensor that
+      may be considered small.
+    max_group: Int giving max number of small tensors that may be
+      concatenated into one new tensor.
+
+  Returns:
+    new_tower_grads, packing where new_tower_grads is identical to
+      tower_grads except that all feasible small_tensors have been removed
+      from their places and concatenated into larger tensors that are
+      now in the front of the list for each tower, and packing contains
+      the data necessary to restore the tower_grads structure.
+
+  Look through the first tower for gradients of the same type (float),
+  and small size, that are all sequential.  For each such group,
+  replace by a new tensor that is a flattened concatenation.  Note
+  that the corresponding variable will be absent, which doesn't matter
+  because it isn't used during all-reduce.
+
+  Requires:
+    Every gv_list in towers must have isomorphic structure including identical
+      tensor sizes and types.
+  """
+  small_indices = []
+  large_indices = []
+  for idx, (g, _) in enumerate(tower_grads[0]):
+    if g.dtype == dtypes.float32 and (4 * g.shape.num_elements()) <= max_bytes:
+      small_indices.append(idx)
+    else:
+      large_indices.append(idx)
+  small_ranges, small_singles = extract_ranges(
+      small_indices, range_size_limit=max_group)
+  large_indices = sorted(large_indices + small_singles)
+  num_gv = len(tower_grads[0])
+  packing = {}
+  if small_ranges:
+    new_tower_grads = []
+    for dev_idx, gv_list in enumerate(tower_grads):
+      assert len(gv_list) == num_gv
+      new_gv_list = []
+      for r in small_ranges:
+        key = '%d:%d' % (dev_idx, len(new_gv_list))
+        new_gv_list.append((pack_range(key, packing, gv_list, r),
+                            'packing_var_placeholder'))
+      for i in large_indices:
+        new_gv_list.append(gv_list[i])
+      new_tower_grads.append(new_gv_list)
+    return new_tower_grads, packing
+  else:
+    return tower_grads, None
+
+
+def unpack_small_tensors(tower_grads, packing):
+  """Undo the structure alterations to tower_grads done by pack_small_tensors.
+
+  Args:
+    tower_grads: List of List of (grad, var) tuples.
+    packing: A dict generated by pack_small_tensors describing the changes
+      it made to tower_grads.
+
+  Returns:
+    new_tower_grads: identical to tower_grads except that concatenations
+      of small tensors have been split apart and returned to their original
+      positions, paired with their original variables.
+  """
+  if not packing:
+    return tower_grads
+  new_tower_grads = []
+  num_devices = len(tower_grads)
+  num_packed = len(packing.keys()) // num_devices
+  for dev_idx, gv_list in enumerate(tower_grads):
+    gv_list = list(gv_list)
+    new_gv_list = gv_list[num_packed:]
+    for i in xrange(0, num_packed):
+      k = '%d:%d' % (dev_idx, i)
+      gpt = packing[k]
+      gv = unpack_grad_tuple(gv_list[i], gpt)
+      for gi, idx in enumerate(gpt.indices):
+        assert idx == gpt.indices[gi]
+        new_gv_list.insert(idx, gv[gi])
+    new_tower_grads.append(new_gv_list)
+  return new_tower_grads
diff --git a/tensorflow/contrib/distribute/python/estimator_integration_test.py b/tensorflow/contrib/distribute/python/estimator_integration_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..c5a520ab5aeafb932092ebbbaaf07480cf40403b
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/estimator_integration_test.py
@@ -0,0 +1,127 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests that show that DistributionStrategy works with canned Estimator."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import shutil
+import tempfile
+from absl.testing import parameterized
+import numpy as np
+import six
+
+from tensorflow.contrib.distribute.python import combinations
+from tensorflow.contrib.optimizer_v2 import adagrad
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.eager import test
+from tensorflow.python.estimator import run_config
+from tensorflow.python.estimator.canned import dnn_linear_combined
+from tensorflow.python.estimator.canned import prediction_keys
+from tensorflow.python.estimator.export import export
+from tensorflow.python.estimator.inputs import numpy_io
+from tensorflow.python.feature_column import feature_column
+from tensorflow.python.framework import ops
+from tensorflow.python.platform import gfile
+from tensorflow.python.summary.writer import writer_cache
+
+
+class DNNLinearCombinedClassifierIntegrationTest(test.TestCase,
+                                                 parameterized.TestCase):
+
+  def setUp(self):
+    self._model_dir = tempfile.mkdtemp()
+
+  def dataset_input_fn(self, x, y, batch_size, shuffle):
+
+    def input_fn():
+      dataset = dataset_ops.Dataset.from_tensor_slices((x, y))
+      if shuffle:
+        dataset = dataset.shuffle(batch_size)
+      dataset = dataset.repeat(10).batch(batch_size)
+      return dataset
+
+    return input_fn
+
+  @combinations.generate(
+      combinations.combine(
+          mode=['graph'],
+          distribution=[
+              combinations.one_device_strategy,
+              combinations.mirrored_strategy_with_gpu_and_cpu
+          ]))
+  def test_complete_flow_with_mode(self, distribution):
+    label_dimension = 2
+    input_dimension = label_dimension
+    batch_size = 10
+    data = np.linspace(0., 2., batch_size * label_dimension, dtype=np.float32)
+    data = data.reshape(batch_size, label_dimension)
+    train_input_fn = self.dataset_input_fn(
+        x={'x': data},
+        y=data,
+        batch_size=batch_size // len(distribution.worker_devices),
+        shuffle=True)
+    eval_input_fn = numpy_io.numpy_input_fn(
+        x={'x': data}, y=data, batch_size=batch_size, shuffle=False)
+    predict_input_fn = numpy_io.numpy_input_fn(
+        x={'x': data}, batch_size=batch_size, shuffle=False)
+
+    linear_feature_columns = [
+        feature_column.numeric_column('x', shape=(input_dimension,))
+    ]
+    dnn_feature_columns = [
+        feature_column.numeric_column('x', shape=(input_dimension,))
+    ]
+    feature_columns = linear_feature_columns + dnn_feature_columns
+    estimator = dnn_linear_combined.DNNLinearCombinedRegressor(
+        linear_feature_columns=linear_feature_columns,
+        dnn_hidden_units=(2, 2),
+        dnn_feature_columns=dnn_feature_columns,
+        label_dimension=label_dimension,
+        model_dir=self._model_dir,
+        # TODO(isaprykin): Work around the colocate_with error.
+        dnn_optimizer=adagrad.AdagradOptimizer(0.001),
+        linear_optimizer=adagrad.AdagradOptimizer(0.001),
+        config=run_config.RunConfig(train_distribute=distribution))
+
+    num_steps = 10
+    estimator.train(train_input_fn, steps=num_steps)
+
+    scores = estimator.evaluate(eval_input_fn)
+    self.assertEqual(num_steps, scores[ops.GraphKeys.GLOBAL_STEP])
+    self.assertIn('loss', six.iterkeys(scores))
+
+    predictions = np.array([
+        x[prediction_keys.PredictionKeys.PREDICTIONS]
+        for x in estimator.predict(predict_input_fn)
+    ])
+    self.assertAllEqual((batch_size, label_dimension), predictions.shape)
+
+    feature_spec = feature_column.make_parse_example_spec(feature_columns)
+    serving_input_receiver_fn = export.build_parsing_serving_input_receiver_fn(
+        feature_spec)
+    export_dir = estimator.export_savedmodel(tempfile.mkdtemp(),
+                                             serving_input_receiver_fn)
+    self.assertTrue(gfile.Exists(export_dir))
+
+  def tearDown(self):
+    if self._model_dir:
+      writer_cache.FileWriterCache.clear()
+      shutil.rmtree(self._model_dir)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/distribute/python/examples/BUILD b/tensorflow/contrib/distribute/python/examples/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..cbfd17850212a1c007e2edb9dd3986b3109f040d
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/examples/BUILD
@@ -0,0 +1,30 @@
+# Example TensorFlow models that use DistributionStrategy for training.
+
+package(
+    default_visibility = [
+        "//tensorflow:internal",
+    ],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+py_binary(
+    name = "simple_estimator_example",
+    srcs = ["simple_estimator_example.py"],
+    deps = [
+        "//tensorflow:tensorflow_py",
+    ],
+)
+
+py_binary(
+    name = "simple_tfkeras_example",
+    srcs = [
+        "simple_tfkeras_example.py",
+    ],
+    deps = [
+        "//tensorflow:tensorflow_py",
+        "//third_party/py/numpy",
+    ],
+)
diff --git a/tensorflow/contrib/distribute/python/examples/simple_estimator_example.py b/tensorflow/contrib/distribute/python/examples/simple_estimator_example.py
new file mode 100644
index 0000000000000000000000000000000000000000..00c25c7a2482a559c8b94ff3be86c4961dfb439f
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/examples/simple_estimator_example.py
@@ -0,0 +1,87 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""A simple example to test the a DistributionStrategy with Estimators.
+
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import tensorflow as tf
+
+
+def build_model_fn_optimizer():
+  """Simple model_fn with optimizer."""
+  # TODO(anjalisridhar): Move this inside the model_fn once OptimizerV2 is
+  # done?
+  optimizer = tf.train.GradientDescentOptimizer(0.2)
+
+  def model_fn(features, labels, mode):  # pylint: disable=unused-argument
+    """model_fn which uses a single unit Dense layer."""
+    # You can also use the Flatten layer if you want to test a model without any
+    # weights.
+    layer = tf.layers.Dense(1, use_bias=True)
+    logits = layer(features)
+
+    if mode == tf.estimator.ModeKeys.PREDICT:
+      predictions = {"logits": logits}
+      return tf.estimator.EstimatorSpec(mode, predictions=predictions)
+
+    def loss_fn():
+      y = tf.reshape(logits, []) - tf.constant(1.)
+      return y * y
+
+    if mode == tf.estimator.ModeKeys.EVAL:
+      return tf.estimator.EstimatorSpec(mode, loss=loss_fn())
+
+    assert mode == tf.estimator.ModeKeys.TRAIN
+
+    global_step = tf.train.get_global_step()
+    train_op = optimizer.minimize(loss_fn(), global_step=global_step)
+    return tf.estimator.EstimatorSpec(mode, loss=loss_fn(), train_op=train_op)
+
+  return model_fn
+
+
+def main(_):
+  distribution = tf.contrib.distribute.MirroredStrategy(
+      ["/device:GPU:0", "/device:GPU:1"])
+  config = tf.estimator.RunConfig(train_distribute=distribution)
+
+  def input_fn():
+    features = tf.data.Dataset.from_tensors([[1.]]).repeat(10)
+    labels = tf.data.Dataset.from_tensors([1.]).repeat(10)
+    return tf.data.Dataset.zip((features, labels))
+
+  estimator = tf.estimator.Estimator(
+      model_fn=build_model_fn_optimizer(), config=config)
+  estimator.train(input_fn=input_fn, steps=10)
+
+  eval_result = estimator.evaluate(input_fn=input_fn)
+  print("Eval result: {}".format(eval_result))
+
+  def predict_input_fn():
+    predict_features = tf.data.Dataset.from_tensors([[1.]]).repeat(10)
+    return predict_features
+
+  predictions = estimator.predict(input_fn=predict_input_fn)
+  # TODO(anjalsridhar): This returns a generator object, figure out how to get
+  # meaningful results here.
+  print("Prediction results: {}".format(predictions))
+
+
+if __name__ == "__main__":
+  tf.app.run()
diff --git a/tensorflow/contrib/distribute/python/examples/simple_tfkeras_example.py b/tensorflow/contrib/distribute/python/examples/simple_tfkeras_example.py
new file mode 100644
index 0000000000000000000000000000000000000000..b87224251ca3844fc81c6f32a893d2c71664a955
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/examples/simple_tfkeras_example.py
@@ -0,0 +1,62 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""An example tf.keras model that is trained using MirroredStrategy."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from sys import argv
+import numpy as np
+import tensorflow as tf
+
+
+def input_fn():
+  x = np.random.random((1024, 10))
+  y = np.random.randint(2, size=(1024, 1))
+  x = tf.cast(x, tf.float32)
+  dataset = tf.data.Dataset.from_tensor_slices((x, y))
+  dataset = dataset.repeat(10)
+  dataset = dataset.batch(32)
+  return dataset
+
+
+def main(args):
+  if len(args) < 2:
+    print('You must specify  model_dir for checkpoints such as'
+          ' /tmp/tfkeras_example./')
+    return
+
+  print('Using %s to store checkpoints.' % args[1])
+
+  strategy = tf.contrib.distribute.MirroredStrategy(
+      ['/device:GPU:0', '/device:GPU:1'])
+  config = tf.estimator.RunConfig(train_distribute=strategy)
+  optimizer = tf.train.GradientDescentOptimizer(0.2)
+
+  model = tf.keras.Sequential()
+  model.add(tf.keras.layers.Dense(16, activation='relu', input_shape=(10,)))
+  model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
+
+  model.compile(loss='binary_crossentropy', optimizer=optimizer)
+  model.summary()
+  tf.keras.backend.set_learning_phase(True)
+  keras_estimator = tf.keras.estimator.model_to_estimator(
+      keras_model=model, config=config, model_dir=args[1])
+
+  keras_estimator.train(input_fn=input_fn, steps=10)
+  eval_result = keras_estimator.evaluate(input_fn=input_fn)
+  print('Eval result: {}'.format(eval_result))
+
+if __name__ == '__main__':
+  tf.app.run(argv=argv)
diff --git a/tensorflow/contrib/distribute/python/minimize_loss_test.py b/tensorflow/contrib/distribute/python/minimize_loss_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..0fa90df79bbcd621fe7b7d0da04256b7a59d5bfe
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/minimize_loss_test.py
@@ -0,0 +1,279 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for running legacy optimizer code with DistributionStrategy."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+import numpy
+
+from tensorflow.contrib.distribute.python import combinations
+from tensorflow.contrib.distribute.python import mirrored_strategy
+from tensorflow.contrib.distribute.python.single_loss_example import batchnorm_example
+from tensorflow.contrib.distribute.python.single_loss_example import minimize_loss_example
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.eager import context
+from tensorflow.python.eager import test
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops import variables as variables_lib
+from tensorflow.python.ops.losses import losses_impl
+
+
+class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
+
+  @combinations.generate(
+      combinations.times(
+          combinations.distributions_and_v1_optimizers(),
+          combinations.combine(mode=["graph"], use_callable_loss=[True, False])
+          + combinations.combine(mode=["eager"], use_callable_loss=[True])))
+  def testTrainNetwork(self, distribution, optimizer_fn,
+                       use_callable_loss=True):
+    with distribution.scope():
+      model_fn, dataset, layer = minimize_loss_example(
+          optimizer_fn,
+          use_bias=True,
+          use_callable_loss=use_callable_loss)
+
+      iterator = distribution.distribute_dataset(dataset)
+
+      def run_step():
+        return distribution.group(
+            distribution.call_for_each_tower(
+                model_fn, iterator.get_next(), run_concurrently=layer.built))
+
+      if not context.executing_eagerly():
+        with self.test_session() as sess:
+          run_step = sess.make_callable(run_step())
+        self.evaluate(variables_lib.global_variables_initializer())
+
+      weights, biases = [], []
+      for _ in range(10):
+        run_step()
+
+        weights.append(self.evaluate(distribution.fetch(layer.kernel)))
+        biases.append(self.evaluate(distribution.fetch(layer.bias)))
+
+      error = abs(numpy.add(numpy.squeeze(weights), numpy.squeeze(biases)) - 1)
+      is_not_increasing = all(y <= x for x, y in zip(error, error[1:]))
+      self.assertTrue(is_not_increasing)
+
+  @combinations.generate(
+      combinations.times(
+          combinations.distributions_and_v1_optimizers() +
+          combinations.distributions_and_v2_optimizers(),
+          combinations.combine(mode=["graph", "eager"])))
+  def testOptimizerInsideModelFn(self, distribution, optimizer_fn):
+    created_variables = []
+    trainable_variables = []
+
+    def appending_creator(next_creator, *args, **kwargs):
+      v = next_creator(*args, **kwargs)
+      created_variables.append(v.name)
+      if "trainable" in kwargs and kwargs["trainable"]:
+        trainable_variables.append(v.name)
+      return v
+
+    # Creator scope needs to be set before it's used inside
+    # `distribution.scope`.
+    with variable_scope.variable_creator_scope(
+        appending_creator), distribution.scope():
+      model_fn, dataset, layer = minimize_loss_example(
+          optimizer_fn,
+          use_bias=True,
+          use_callable_loss=True,
+          create_optimizer_inside_model_fn=True)
+
+      iterator = distribution.distribute_dataset(dataset)
+
+      def run_step():
+        return distribution.group(
+            distribution.call_for_each_tower(
+                model_fn, iterator.get_next(), run_concurrently=layer.built))
+
+      if not context.executing_eagerly():
+        with self.test_session() as sess:
+          run_step = sess.make_callable(run_step())
+        self.evaluate(variables_lib.global_variables_initializer())
+
+      run_step()
+
+      def get_expected_variables(optimizer_fn, num_parameter_devices):
+        variables_map = {
+            "GradientDescent": ["dense/kernel", "dense/bias"],
+            "Adam": [
+                "dense/kernel", "dense/bias", "beta1_power", "beta2_power",
+                "dense/kernel/Adam", "dense/kernel/Adam_1", "dense/bias/Adam",
+                "dense/bias/Adam_1"
+            ]
+        }
+        variables = variables_map[optimizer_fn().get_name()]
+        variables.extend([
+            v + "/replica_{}".format(replica)
+            for v in variables
+            for replica in range(1, num_parameter_devices)
+        ])
+        return set([v + ":0" for v in variables])
+
+      self.assertEqual(
+          get_expected_variables(optimizer_fn,
+                                 len(distribution.parameter_devices)),
+          set(created_variables))
+
+  @combinations.generate(
+      combinations.times(combinations.distributions_and_v1_optimizers(),
+                         combinations.combine(
+                             mode=["graph", "eager"],
+                             momentum=[0.8, 0.9, 0.99],
+                             renorm=[False, True])))
+  def testTrainNetworkWithBatchNorm(self, distribution, optimizer_fn, momentum,
+                                    renorm):
+    """Verifies that moving mean updates are reduced across towers."""
+    with distribution.scope():
+      num_towers = len(distribution.worker_devices)
+      model_fn, dataset, batchnorm = batchnorm_example(
+          optimizer_fn,
+          batch_per_epoch=num_towers,
+          momentum=momentum,
+          renorm=renorm)
+
+      # Disable prefetching since that makes the specific input on each device
+      # to be non deterministic, and this test relies on specific input being
+      # on each device.
+      if isinstance(distribution, mirrored_strategy.MirroredStrategy):
+        distribution._prefetch_on_device = False
+      iterator = distribution.distribute_dataset(dataset)
+
+      def run_step():
+        return control_flow_ops.group(
+            distribution.unwrap(
+                distribution.call_for_each_tower(
+                    model_fn,
+                    iterator.get_next(),
+                    run_concurrently=batchnorm.built)) +
+            ops.get_collection(ops.GraphKeys.UPDATE_OPS))
+
+      if not context.executing_eagerly():
+        with self.test_session() as sess:
+          run_step = sess.make_callable(run_step())
+        self.evaluate(variables_lib.global_variables_initializer())
+
+      expected_moving_means = [0.] * 8
+
+      def averaged_batch_mean(i):
+        # Each batch has shape [16, 8] where the ith element in jth list is
+        # (8 * j + i + tower_id * 100). So the batch mean in each tower is
+        # (60 + i + tower_id * 100). So here comes its batch mean over all
+        # towers:
+        return 60. + i + (num_towers - 1.) / 2. * 100.
+
+      for _ in range(10):
+        run_step()
+        moving_means = self.evaluate(distribution.fetch(batchnorm.moving_mean))
+
+        # We make sure that the moving_mean is updated as if the sample mean is
+        # calculated over all towers.
+        for i, expected_moving_mean in enumerate(expected_moving_means):
+          expected_moving_means[i] -= ((
+              expected_moving_mean - averaged_batch_mean(i)) * (1.0 - momentum))
+          self.assertNear(expected_moving_means[i], moving_means[i], 0.0001)
+
+  @combinations.generate(
+      combinations.times(
+          combinations.combine(
+              distribution=[combinations.one_device_strategy,
+                            combinations.mirrored_strategy_with_gpu_and_cpu,
+                            combinations.mirrored_strategy_with_two_gpus],
+              optimizer_fn=[combinations.gradient_descent_optimizer_v1_fn,
+                            combinations.gradient_descent_optimizer_v2_fn],
+              loss_reduction=[losses_impl.Reduction.SUM,
+                              losses_impl.Reduction.MEAN,
+                              losses_impl.Reduction.SUM_OVER_BATCH_SIZE,
+                              losses_impl.Reduction.SUM_OVER_NONZERO_WEIGHTS]),
+          combinations.combine(mode=["graph"], use_callable_loss=[True, False])
+          + combinations.combine(mode=["eager"], use_callable_loss=[True])))
+  def testMeanVsSum(self, distribution, optimizer_fn, loss_reduction,
+                    use_callable_loss):
+    with distribution.scope():
+      all_vars = []
+
+      def model_fn(x, y):
+
+        def loss_fn():
+          # Use fixed initialization to make the steps deterministic.
+          w = variable_scope.get_variable("w", initializer=[[2.]])
+          all_vars.append(w)
+          predict = math_ops.matmul(x, w)
+          return losses_impl.mean_squared_error(
+              y, predict, reduction=loss_reduction)
+
+        optimizer = optimizer_fn()  # GradientDescent with 0.2 learning rate
+
+        if use_callable_loss:
+          return optimizer.minimize(loss_fn)
+        else:
+          return optimizer.minimize(loss_fn())
+
+      features = dataset_ops.Dataset.from_tensors([[2.], [7.]])
+      labels = dataset_ops.Dataset.from_tensors([[6.], [21.]])
+      dataset = dataset_ops.Dataset.zip((features, labels)).repeat()
+      iterator = distribution.distribute_dataset(dataset)
+
+      def run_step():
+        return distribution.group(
+            distribution.call_for_each_tower(
+                model_fn, *iterator.get_next(), run_concurrently=False))
+
+      if not context.executing_eagerly():
+        with self.test_session() as sess:
+          run_step = sess.make_callable(run_step())
+        self.evaluate(variables_lib.global_variables_initializer())
+
+      run_step()
+
+      self.assertEqual(distribution.num_towers, len(all_vars))
+      v = all_vars[0]
+      self.assertTrue(all([v is vi for vi in all_vars[1:]]))
+      weight = numpy.squeeze(self.evaluate(distribution.fetch(v)))
+      # Our model is:
+      #   predict = x * w
+      #   loss = (predict - y)^2
+      #   dloss/dpredict = 2*(predict - y)
+      #   dloss/dw = 2 * x^T @ (predict - y)
+      # For our batch size of 2, assuming sum loss reduction:
+      #   x = [2, 7]
+      #   y = [6, 21]
+      #   w_initial = 2
+      #   predict = [4, 14]
+      #   predict - y = [-2, -7]
+      #   dloss/dw = 2 <[2, 7], [-2, -7]> = - 2(4 + 49) = -106
+      # So unreplicated the update to w with lr=0.2 is -0.2 * -106 = 21.2
+      # with sum loss reduction, or 10.6 with mean.
+      if loss_reduction == losses_impl.Reduction.SUM:
+        # Note that the "distribution.num_towers" factor will go away once
+        # we split the input across towers, instead of pulling a complete
+        # batch of input per tower.
+        self.assertNear(weight, 2 + 21.2 * distribution.num_towers, 0.0001)
+      else:
+        # One of the mean loss reductions.
+        self.assertNear(weight, 2 + 10.6, 0.0001)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/distribute/python/mirrored_strategy.py b/tensorflow/contrib/distribute/python/mirrored_strategy.py
new file mode 100644
index 0000000000000000000000000000000000000000..eb0edb3a11df7788991ca14f957494d87593a449
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/mirrored_strategy.py
@@ -0,0 +1,497 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Class MirroredStrategy implementing DistributionStrategy."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import threading
+import six
+
+from tensorflow.contrib.distribute.python import cross_tower_ops as cross_tower_ops_lib
+from tensorflow.contrib.distribute.python import shared_variable_creator
+from tensorflow.contrib.distribute.python import values
+from tensorflow.python import pywrap_tensorflow
+from tensorflow.python.eager import context
+from tensorflow.python.eager import tape
+from tensorflow.python.framework import device as tf_device
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.training import coordinator
+from tensorflow.python.training import device_util
+from tensorflow.python.training import distribute as distribute_lib
+
+
+# TODO(josh11b): Replace asserts in this file with if ...: raise ...
+
+
+def _cpu_device(device):
+  cpu_device = tf_device.DeviceSpec.from_string(device)
+  cpu_device.merge_from(tf_device.DeviceSpec(device_type="CPU", device_index=0))
+  return cpu_device.to_string()
+
+
+class _RequestedStop(Exception):
+  pass
+
+
+class MirroredStrategy(distribute_lib.DistributionStrategy):
+  """Mirrors vars to distribute across multiple devices on a single machine.
+
+  This strategy uses one tower per device and sync replication.
+  """
+
+  def __init__(self,
+               devices=None,
+               num_gpus=None,
+               cross_tower_ops=None,
+               prefetch_on_device=None):
+    super(MirroredStrategy, self).__init__()
+    # Convert `num_gpus` into `devices`, shouldn't specify both.
+    if devices is None:
+      if num_gpus is None:
+        num_gpus = context.num_gpus()
+      devices = ["/device:GPU:%d" % d for d in range(num_gpus)]
+    elif num_gpus is not None:
+      raise ValueError("Must only specify one of `devices` and `num_gpus`.")
+
+    assert devices, "Must specify at least one device."
+    assert len(set(devices)) == len(devices), (
+        "No duplicates allowed in `devices` argument.")
+    # TODO(josh11b): Require at least 2 devices?
+    self._devices = devices
+    self._canonical_device_set = set(
+        [device_util.canonicalize(d) for d in devices])
+    self._device_index = values.PerDevice(
+        dict((d, i) for i, d in enumerate(devices)))
+    self._cross_tower_ops = cross_tower_ops
+    self._prefetch_on_device = prefetch_on_device
+
+  def _create_variable(self, next_creator, *args, **kwargs):
+    """Create a mirrored variable. See `DistributionStrategy.scope`."""
+    # Figure out what collections this variable should be added to.
+    # We'll add the MirroredVariable to those collections instead.
+    collections = kwargs.pop("collections", None)
+    if collections is None:
+      collections = [ops.GraphKeys.GLOBAL_VARIABLES]
+    kwargs["collections"] = []
+
+    colocate_with = kwargs.pop("colocate_with", None)
+    devices = self._get_devices_from(colocate_with)
+
+    tower_local = kwargs.pop("tower_local_reduce_method", None)
+    if tower_local is not None:
+      kwargs["trainable"] = False
+
+    # TODO(josh11b,apassos): It would be better if variable initialization
+    # was never recorded on the tape instead of having to do this manually
+    # here.
+    with tape.stop_recording():
+      index = {}
+      for i, d in enumerate(devices):
+        with ops.device(d):
+          if i > 0:
+            # Give replicas meaningful distinct names:
+            var0name = index[devices[0]].name.split(":")[0]
+            kwargs["name"] = "%s/replica_%d" % (var0name, i)
+            # Initialize replicas with the same value:
+            if context.executing_eagerly():
+              initial_value = index[devices[0]].value()
+            else:
+              initial_value = index[devices[0]].initial_value
+            kwargs["initial_value"] = array_ops.identity(initial_value)
+          with context.context().device_policy(context.DEVICE_PLACEMENT_SILENT):
+            v = next_creator(*args, **kwargs)
+          assert not isinstance(v, values.DistributedVariable)
+          index[d] = v
+
+      if tower_local is None:
+        result = values.MirroredVariable(index, index[devices[0]])
+      else:
+        result = values.TowerLocalVariable(
+            index, index[devices[0]], tower_local)
+
+    if not context.executing_eagerly():
+      g = ops.get_default_graph()
+      # If "trainable" is True, next_creator() will add the member variables
+      # to the TRAINABLE_VARIABLES collection, so we manually remove
+      # them and replace with the MirroredVariable. We can't set
+      # "trainable" to False for next_creator() since that causes functions
+      # like implicit_gradients to skip those variables.
+      if kwargs.get("trainable", True):
+        collections.append(ops.GraphKeys.TRAINABLE_VARIABLES)
+        l = g.get_collection_ref(ops.GraphKeys.TRAINABLE_VARIABLES)
+        for v in index.values():
+          l.remove(v)
+      g.add_to_collections(collections, result)
+    return result
+
+  def distribute_dataset(self, dataset):
+    per_device_dataset = values.PerDeviceDataset(
+        dataset, self._devices, self._prefetch_on_device)
+    return per_device_dataset.make_one_shot_iterator()
+
+  def _broadcast(self, tensor, destinations):
+    # TODO(josh11b): In eager mode, use one thread per device, or async mode.
+    return self._get_cross_tower_ops().broadcast(tensor, destinations or
+                                                 self._devices)
+
+  def _call_for_each_tower(self, fn, *args, **kwargs):
+    """Run `fn` in separate threads, once per tower/worker device.
+
+    Args:
+      fn: function to run (will be run once per device, each in its own thread).
+      *args: positional arguments for `fn`
+      **kwargs: keyword arguments for `fn`.
+          `"run_concurrently"`: Boolean indicating whether executions of `fn`
+             can be run concurrently (under eager execution only), defaults to
+             `True`.
+
+    Returns:
+      Merged return value of `fn` across all towers.
+
+    Raises:
+      RuntimeError: If fn() calls get_tower_context().merge_call() a different
+          number of times for when called for different devices.
+    """
+    run_concurrently = kwargs.pop("run_concurrently", True)
+    if not context.executing_eagerly():
+      # Lots of TF library code isn't thread-safe in graph mode, and
+      # there is little to be gained by turning on multithreading when
+      # constructing a graph.
+      run_concurrently = False
+      # Needed for per-thread device, etc. contexts in graph mode.
+      ops.get_default_graph().switch_to_thread_local()
+    elif run_concurrently is None:
+      run_concurrently = True
+
+    coord = coordinator.Coordinator(
+        clean_stop_exception_types=(_RequestedStop,))
+
+    shared_variable_store = {}
+
+    # TODO(isaprykin): Create these threads once instead of during every run()
+    # call.
+    threads = []
+    for index, d in enumerate(self._devices):
+      variable_creator_fn = shared_variable_creator.make_fn(
+          shared_variable_store, index)
+      t = MirroredStrategy._MirroredTowerThread(
+          self, coord, d, variable_creator_fn, fn,
+          *values.select_device(d, args), **values.select_device(d, kwargs))
+      threads.append(t)
+
+    for t in threads:
+      t.start()
+
+    # When `fn` starts `should_run` event is set on _MirroredTowerThread
+    # (`MTT`) threads. The execution waits until
+    # `MTT.has_paused` is set, which indicates that either `fn` is
+    # complete or a `get_tower_context().merge_call()` is called.  If `fn` is
+    # complete, then `MTT.done` is set to True.  Otherwise, arguments
+    # of `get_tower_context().merge_call` from all paused threads are grouped
+    # and the `merge_fn` is performed.  Results of the
+    # `get_tower_context().merge_call` are then set to `MTT.merge_result`.
+    # Each such `get_tower_context().merge_call` call returns the
+    # `MTT.merge_result` for that thread when `MTT.should_run` event
+    # is reset again. Execution of `fn` resumes.
+
+    try:
+      with coord.stop_on_exception():
+        all_done = False
+        while not all_done and not coord.should_stop():
+          done = []
+          if run_concurrently:
+            for t in threads:
+              t.should_run.set()
+            for t in threads:
+              t.has_paused.wait()
+              t.has_paused.clear()
+              if coord.should_stop():
+                return None
+              done.append(t.done)
+          else:
+            for t in threads:
+              t.should_run.set()
+              t.has_paused.wait()
+              t.has_paused.clear()
+              if coord.should_stop():
+                return None
+              done.append(t.done)
+          if coord.should_stop():
+            return None
+          all_done = all(done)
+          if not all_done:
+            if any(done):
+              raise RuntimeError("Some towers made a different number of "
+                                 "tower_context().merge_call() calls.")
+            # get_tower_context().merge_call() case
+            merge_args = values.regroup(
+                {t.device: t.merge_args for t in threads})
+            merge_kwargs = values.regroup(
+                {t.device: t.merge_kwargs for t in threads})
+            merge_result = threads[0].merge_fn(
+                self, *merge_args, **merge_kwargs)
+            for t in threads:
+              t.merge_result = values.select_device(t.device, merge_result)
+    finally:
+      for t in threads:
+        t.should_run.set()
+      coord.join(threads)
+
+    return values.regroup({t.device: t.main_result for t in threads})
+
+  def map(self, map_over, fn, *args, **kwargs):
+    # TODO(josh11b): In eager mode, use one thread per device.
+    index = {}
+    i = 0
+    for m in map_over:
+      d = self._devices[i % len(self._devices)]
+      with ops.device(d):
+        l = index.get(d, [])
+        l.append(fn(m,
+                    *values.select_device_mirrored(d, args),
+                    **values.select_device_mirrored(d, kwargs)))
+        index[d] = l
+    # TODO(josh11b): Need a values.regroup equivalent that handles MapOutput
+    # in addition to PerDevice data.
+    return values.PerDevice({k: values.MapOutput(v) for k, v in index.items()})
+
+  def configure(self, session_config=None):
+    if self._cross_tower_ops is None:
+      self._cross_tower_ops = cross_tower_ops_lib.choose_the_best(
+          self._devices, session_config=session_config)
+
+  def _get_cross_tower_ops(self):
+    if self._cross_tower_ops is None:
+      self._cross_tower_ops = (
+          cross_tower_ops_lib.ReductionToOneDeviceCrossTowerOps())
+    return self._cross_tower_ops
+
+  def _reduce(self, method_string, value, destinations):
+    if len(self._devices) == 1 and not isinstance(value, values.PerDevice):
+      value = values.PerDevice({self._devices[0]: value})
+    assert isinstance(value, values.PerDevice)
+
+    return self._get_cross_tower_ops().reduce(
+        method_string, value, destinations=destinations)
+
+  def _batch_reduce(self, method_string, value_destination_pairs):
+    return self._get_cross_tower_ops().batch_reduce(method_string,
+                                                    value_destination_pairs)
+
+  def _update(self, var, fn, *args, **kwargs):
+    # TODO(josh11b): Also support TowerLocalVariables here? If so, args and
+    # kwargs don't need to be mirrored.
+    assert isinstance(var, values.MirroredVariable)
+    # TODO(josh11b): In eager mode, use one thread per device.
+    updates = {}
+    for d, v in var._index.items():  # pylint: disable=protected-access
+      name = "update_%d" % self._device_index.get(d)
+      with ops.device(d), distribute_lib.UpdateContext(d), ops.name_scope(name):
+        updates[d] = fn(v,
+                        *values.select_device_mirrored(d, args),
+                        **values.select_device_mirrored(d, kwargs))
+    return values.regroup(updates, values.Mirrored)
+
+  def _update_non_slot(self, colocate_with, fn, *args, **kwargs):
+    assert isinstance(colocate_with, list)
+    # TODO(josh11b): In eager mode, use one thread per device.
+    updates = {}
+    for d in colocate_with:
+      name = "update_%d" % self._device_index.get(d)
+      with ops.device(d), distribute_lib.UpdateContext(d), ops.name_scope(name):
+        updates[d] = fn(*values.select_device_mirrored(d, args),
+                        **values.select_device_mirrored(d, kwargs))
+    return values.regroup(updates, values.Mirrored)
+
+  def _fetch(self, val, destination, fn):
+    """Return a copy of `val` or `fn(val)` on `destination`."""
+    assert isinstance(destination, six.string_types)
+    if isinstance(val, values.TowerLocalVariable):
+      val = self.reduce(val.reduce_method, val, destinations=destination)
+      with ops.device(destination):
+        return fn(self.unwrap(val)[0])
+
+    assert isinstance(val, values.Mirrored), (
+        "val = %s (type %s)" % (val, val.__class__.__name__))
+    if val.on_device(destination):
+      with ops.device(destination):
+        # Use an identity here to make sure we are returning a tensor
+        # instead of e.g. a variable object.
+        return array_ops.identity(fn(val.get(destination)))
+    device = None
+    for d in self._devices:
+      if val.on_device(d):
+        device = d
+        break
+    assert device is not None, (
+        "Could not find destination %s in list of devices %s." %
+        (destination, val.devices))
+    with ops.device(device):
+      v = fn(val.get(device))
+    with ops.device(destination):
+      return array_ops.identity(v)
+
+  def _unwrap(self, val):
+    if isinstance(val, values.DistributedValues):
+      # Return in a deterministic order.
+      if set(val.devices) == self._canonical_device_set:
+        return [val.get(device=d) for d in self._devices]
+      return [val.get(device=d) for d in sorted(val.devices)]
+    return [val]
+
+  @property
+  def is_single_tower(self):
+    return len(self._devices) == 1
+
+  @property
+  def num_towers(self):
+    return len(self._devices)
+
+  def _worker_device_index(self):
+    return self._device_index
+
+  @property
+  def worker_devices(self):
+    # Make a copy to prevent users from accidentally mutating our copy.
+    return list(self._devices)
+
+  @property
+  def parameter_devices(self):
+    return list(self._devices)
+
+  def non_slot_devices(self, var_list):
+    del var_list
+    return list(self._devices)
+
+  def _get_devices_from(self, colocate_with=None):
+    if colocate_with is None:
+      return self._devices
+    elif isinstance(colocate_with, values.DistributedValues):
+      # pylint: disable=protected-access
+      return list(colocate_with._index.keys())
+    elif isinstance(colocate_with, six.string_types):
+      return [colocate_with]
+    else:
+      return colocate_with
+
+  class _MirroredTowerThread(threading.Thread):
+    """A thread that runs() a function on a device."""
+
+    def __init__(self, dist, coord, device, variable_creator_fn, fn, *args,
+                 **kwargs):
+      super(MirroredStrategy._MirroredTowerThread, self).__init__()  # pylint: disable=protected-access
+      self.coord = coord
+      self.distribution = dist
+      self.device = device
+      self.tower_id = dist.worker_devices.index(device)
+      self.variable_creator_fn = variable_creator_fn
+      # State needed to run and return the results of `fn`.
+      self.main_fn = fn
+      self.main_args = args
+      self.main_kwargs = kwargs
+      self.main_result = None
+      self.done = False
+      # State needed to run the next merge_call() (if any) requested via
+      # TowerContext.
+      self.merge_fn = None
+      self.merge_args = None
+      self.merge_kwargs = None
+      self.merge_result = None
+      # We use a thread.Event for the main thread to signal when this
+      # thread should start running (`should_run`), and another for
+      # this thread to transfer control back to the main thread
+      # (`has_paused`, either when it gets to a
+      # `get_tower_context().merge_call` or when `fn` returns). In
+      # either case the event starts cleared, is signaled by calling
+      # set(). The receiving thread waits for the signal by calling
+      # wait() and then immediately clearing the event using clear().
+      self.should_run = threading.Event()
+      self.has_paused = threading.Event()
+      # These fields have to do with inheriting various contexts from the
+      # parent thread:
+      # pylint: disable=protected-access
+      self.context_mode = context.context()._eager_context.mode
+      if not context.context()._context_handle:
+        context.context()._initialize_handle_and_devices()
+      self.context_device_policy = (
+          pywrap_tensorflow.TFE_ContextGetDevicePlacementPolicy(
+              context.context()._context_handle))
+      self.graph = ops.get_default_graph()
+      self._variable_creator_stack = self.graph._variable_creator_stack[:]
+      self._captured_var_scope = variable_scope.get_variable_scope()
+      # Adding a "/" at end lets us re-enter this scope later.
+      self._captured_name_scope = self.graph.get_name_scope()
+      if self._captured_name_scope:
+        self._captured_name_scope += "/"
+      if self.tower_id > 0:
+        if not self._captured_name_scope:
+          self._captured_name_scope = ""
+        self._captured_name_scope += "tower_%d/" % self.tower_id
+
+    def run(self):
+      # pylint: disable=protected-access
+      self.graph._variable_creator_stack = self._variable_creator_stack
+      self.should_run.wait()
+      self.should_run.clear()
+      try:
+        if self.coord.should_stop():
+          return
+        with self.coord.stop_on_exception(), \
+            context.context()._mode(self.context_mode), \
+            context.context().device_policy(self.context_device_policy), \
+            self.graph.as_default(), \
+            MirroredTowerContext(self.distribution, self.tower_id), \
+            ops.device(self.device), \
+            ops.name_scope(self._captured_name_scope), \
+            variable_scope.variable_scope(
+                self._captured_var_scope, reuse=self.tower_id > 0), \
+            variable_scope.variable_creator_scope(self.variable_creator_fn):
+          self.main_result = self.main_fn(*self.main_args, **self.main_kwargs)
+          self.done = True
+      finally:
+        self.has_paused.set()
+
+
+class MirroredTowerContext(distribute_lib.TowerContext):
+  """TowerContext used in MirroredStrategy.call_for_each_tower().
+
+  Opened in `_MirroredTowerThread`, to allow the user to invoke
+  `MirroredStrategy`'s specific implementation of `merge_call()`,
+  which works by delegating the function and its arguments to
+  the main thread (the one that invoked
+  `MirroredStrategy.call_for_each_tower()`).
+  """
+
+  def _merge_call(self, fn, *args, **kwargs):
+    """Delegate to the main thread to actually perform merge_call()."""
+    t = threading.current_thread()  # a _MirroredTowerThread
+    t.merge_fn = fn
+    t.merge_args = args
+    t.merge_kwargs = kwargs
+    t.has_paused.set()
+    t.should_run.wait()
+    t.should_run.clear()
+    if t.coord.should_stop():
+      raise _RequestedStop()
+    return t.merge_result
+
+  @property
+  def device(self):
+    distribute_lib.require_tower_context(self)
+    return self._distribution_strategy.worker_devices[self._tower_id]
diff --git a/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py b/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..9e9f06da8e2ed185c2c32f79a5a4f5407165fb1d
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py
@@ -0,0 +1,435 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Multi-GPU tests for MirroredStrategy."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import sys
+
+from tensorflow.contrib.distribute.python import mirrored_strategy
+from tensorflow.contrib.distribute.python import strategy_test_lib
+from tensorflow.contrib.distribute.python import values
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.eager import context
+from tensorflow.python.eager import test
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.layers import core
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops import variables
+from tensorflow.python.training import distribute as distribute_lib
+
+GPU_TEST = "test_gpu" in sys.argv[0]
+
+
+class MirroredTwoDeviceDistributionTest(strategy_test_lib.DistributionTestBase):
+
+  def _get_distribution_strategy(self):
+    devices = ["/device:CPU:0", "/device:GPU:0"]
+    if GPU_TEST:
+      self.assertGreater(context.num_gpus(), 0)
+      if context.num_gpus() > 1:
+        devices = ["/device:GPU:0", "/device:GPU:1"]
+    print(self.id().split(".")[-1], "devices:", ", ".join(devices))
+    return mirrored_strategy.MirroredStrategy(devices)
+
+  def testMinimizeLossEager(self):
+    if not GPU_TEST:
+      self.skipTest("Not GPU test")
+    self._test_minimize_loss_eager(self._get_distribution_strategy())
+
+  def testMinimizeLossGraph(self):
+    soft_placement = not GPU_TEST
+    print("testMinimizeLossGraph soft_placement:", soft_placement)
+    self._test_minimize_loss_graph(
+        self._get_distribution_strategy(), soft_placement=soft_placement)
+
+  def testMapReduce(self):
+    if not GPU_TEST:
+      self.skipTest("Not GPU test")
+    self._test_map_reduce(self._get_distribution_strategy())
+
+  def testDeviceIndex(self):
+    if not GPU_TEST:
+      self.skipTest("Not GPU test")
+    self._test_device_index(self._get_distribution_strategy())
+
+  def testTowerId(self):
+    if not GPU_TEST:
+      self.skipTest("Not GPU test")
+    self._test_tower_id(self._get_distribution_strategy())
+
+  def testNumTowers(self):
+    if not GPU_TEST:
+      self.skipTest("Not GPU test")
+    self.assertEqual(2, self._get_distribution_strategy().num_towers)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testCallAndMergeExceptions(self):
+    if not GPU_TEST:
+      self.skipTest("Not GPU test")
+    self._test_call_and_merge_exceptions(self._get_distribution_strategy())
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testRunRegroupError(self):
+
+    def run_fn(device_id):
+      # Generates a list with different lengths on different devices.
+      # Will fail in _regroup() (if more than one device).
+      return list(range(device_id))
+
+    dist = self._get_distribution_strategy()
+    with dist.scope(), self.assertRaises(AssertionError):
+      dist.call_for_each_tower(run_fn, dist.worker_device_index)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testReduceToCpu(self):
+    if not GPU_TEST:
+      self.skipTest("Not GPU test")
+
+    def run_fn(device_id):
+      return device_id
+
+    dist = self._get_distribution_strategy()
+    with dist.scope():
+      result = dist.call_for_each_tower(run_fn, dist.worker_device_index)
+      reduced = dist.reduce("sum", result, destinations="/device:CPU:0")
+      unwrapped = dist.unwrap(reduced)
+      self.assertEqual(1, len(unwrapped))
+      expected = sum(range(len(dist.worker_devices)))
+      self.assertEqual(expected, self.evaluate(unwrapped[0]))
+
+
+@test_util.with_c_api
+class MirroredStrategyVariableCreationTest(test.TestCase):
+
+  config = config_pb2.ConfigProto()
+  config.allow_soft_placement = True
+
+  def _skip_eager_if_gpus_less_than(self, num_gpus):
+    if context.num_gpus() < num_gpus and context.executing_eagerly():
+      self.skipTest("Enough GPUs not available for this test in eager mode.")
+
+  @test_util.run_in_graph_and_eager_modes(config=config)
+  def testSingleVariable(self):
+    self._skip_eager_if_gpus_less_than(1)
+
+    def model_fn():
+      # This variable should be created only once across the threads because of
+      # special variable_creator functions used by `dist.call_for_each_tower`.
+      v = variable_scope.variable(1.0, name="foo")
+      distribute_lib.get_tower_context().merge_call(lambda _: _)
+      return v
+
+    dist = mirrored_strategy.MirroredStrategy(
+        ["/device:GPU:0", "/device:CPU:0"])
+
+    with dist.scope():
+      result = dist.call_for_each_tower(model_fn, run_concurrently=False)
+      self.assertIsInstance(result, values.MirroredVariable)
+      self.assertEquals("foo:0", result.name)
+
+  @test_util.run_in_graph_and_eager_modes(config=config)
+  def testUnnamedVariable(self):
+    self._skip_eager_if_gpus_less_than(1)
+
+    def model_fn():
+      v = variable_scope.variable(1.0)
+      distribute_lib.get_tower_context().merge_call(lambda _: _)
+      return v
+
+    dist = mirrored_strategy.MirroredStrategy(
+        ["/device:GPU:0", "/device:CPU:0"])
+
+    with dist.scope():
+      result = dist.call_for_each_tower(model_fn, run_concurrently=False)
+      self.assertIsInstance(result, values.MirroredVariable)
+      # Default name of "Variable" will be used.
+      self.assertEquals("Variable:0", result.name)
+
+  @test_util.run_in_graph_and_eager_modes(config=config)
+  def testMultipleVariables(self):
+    self._skip_eager_if_gpus_less_than(1)
+
+    def model_fn():
+      vs = []
+      for i in range(5):
+        vs.append(variable_scope.variable(1.0, name="foo" + str(i)))
+      distribute_lib.get_tower_context().merge_call(lambda _: _)
+      return vs
+
+    dist = mirrored_strategy.MirroredStrategy(
+        ["/device:GPU:0", "/device:CPU:0"])
+
+    with dist.scope():
+      result = dist.call_for_each_tower(model_fn, run_concurrently=False)
+      for i, v in enumerate(result):
+        self.assertIsInstance(v, values.MirroredVariable)
+        self.assertEquals("foo" + str(i) + ":0", v.name)
+
+  @test_util.run_in_graph_and_eager_modes(config=config)
+  def testMultipleVariablesWithSameCanonicalName(self):
+    self._skip_eager_if_gpus_less_than(1)
+
+    def model_fn():
+      vs = []
+      vs.append(variable_scope.variable(1.0, name="foo/bar"))
+      vs.append(variable_scope.variable(1.0, name="foo_1/bar"))
+      vs.append(variable_scope.variable(1.0, name="foo_1/bar_1"))
+      vs.append(variable_scope.variable(1.0, name="foo/bar_1"))
+      distribute_lib.get_tower_context().merge_call(lambda _: _)
+      return vs
+
+    dist = mirrored_strategy.MirroredStrategy(
+        ["/device:GPU:0", "/device:CPU:0"])
+
+    with dist.scope():
+      result = dist.call_for_each_tower(model_fn, run_concurrently=False)
+      for v in result:
+        self.assertIsInstance(v, values.MirroredVariable)
+      self.assertEquals(4, len(result))
+      self.assertEquals("foo/bar:0", result[0].name)
+      self.assertEquals("foo_1/bar:0", result[1].name)
+      self.assertEquals("foo_1/bar_1:0", result[2].name)
+      self.assertEquals("foo/bar_1:0", result[3].name)
+
+  @test_util.run_in_graph_and_eager_modes(config=config)
+  def testVariableWithSameCanonicalNameAcrossThreads(self):
+    self._skip_eager_if_gpus_less_than(1)
+
+    def model_fn(device_id):
+      v = variable_scope.variable(1.0, name="foo_" + str(device_id))
+      distribute_lib.get_tower_context().merge_call(lambda _: _)
+      return v
+
+    dist = mirrored_strategy.MirroredStrategy(
+        ["/device:GPU:0", "/device:CPU:0"])
+
+    with dist.scope():
+      result = dist.call_for_each_tower(
+          model_fn, dist.worker_device_index, run_concurrently=False)
+      self.assertIsInstance(result, values.MirroredVariable)
+      # The resulting mirrored variable will use the name from the first device.
+      self.assertEquals("foo_0:0", result.name)
+
+  @test_util.run_in_graph_and_eager_modes(config=config)
+  def testWithLayers(self):
+    self._skip_eager_if_gpus_less_than(1)
+    def model_fn(features):
+      with variable_scope.variable_scope("common"):
+        layer1 = core.Dense(1)
+        layer1(features)
+        layer2 = core.Dense(1)
+        layer2(features)
+        # This will pause the current thread, and execute the other thread.
+        distribute_lib.get_tower_context().merge_call(lambda _: _)
+        layer3 = core.Dense(1)
+        layer3(features)
+        return [(layer1.kernel, layer1.bias),
+                (layer2.kernel, layer2.bias),
+                (layer3.kernel, layer3.bias)]
+
+    dist = mirrored_strategy.MirroredStrategy(
+        ["/device:GPU:0", "/device:CPU:0"])
+    features = dataset_ops.Dataset.from_tensors([[1.]]).repeat(10)
+    features = dist.distribute_dataset(features).get_next()
+
+    with dist.scope():
+      result = dist.call_for_each_tower(
+          model_fn, features, run_concurrently=False)
+      suffixes = ["", "_1", "_2"]
+      for (kernel, bias), suffix in zip(result, suffixes):
+        self.assertIsInstance(kernel, values.MirroredVariable)
+        self.assertEquals("common/dense" + suffix + "/kernel:0", kernel.name)
+        self.assertIsInstance(bias, values.MirroredVariable)
+        self.assertEquals("common/dense" + suffix + "/bias:0", bias.name)
+
+  @test_util.run_in_graph_and_eager_modes(config=config)
+  def testWithGetVariableAndVariableScope(self):
+    self._skip_eager_if_gpus_less_than(1)
+
+    def model_fn():
+      v0 = variable_scope.get_variable("var-thread0", [1])
+      with variable_scope.variable_scope("common"):
+        v1 = variable_scope.get_variable("var-thread1", [1])
+        # This will pause the current thread, and execute the other thread.
+        distribute_lib.get_tower_context().merge_call(lambda _: _)
+        v2 = variable_scope.get_variable("var-thread2", [1])
+
+      return v0, v1, v2
+
+    devices = ["/device:CPU:0", "/device:GPU:0"]
+    dist = mirrored_strategy.MirroredStrategy(devices)
+    with dist.scope():
+      with variable_scope.variable_scope("main"):
+        v = variable_scope.get_variable("var-main0", [1])
+        self.assertEquals("main/var-main0:0", v.name)
+
+        result = dist.call_for_each_tower(model_fn, run_concurrently=False)
+        self.assertEquals(3, len(result))
+        v0, v1, v2 = result
+        self.assertIsInstance(v0, values.MirroredVariable)
+        self.assertEquals("main/var-thread0:0", v0.name)
+        self.assertIsInstance(v1, values.MirroredVariable)
+        self.assertEquals("main/common/var-thread1:0", v1.name)
+        self.assertIsInstance(v2, values.MirroredVariable)
+        self.assertEquals("main/common/var-thread2:0", v2.name)
+
+  @test_util.run_in_graph_and_eager_modes(config=config)
+  def testThreeDevices(self):
+    self._skip_eager_if_gpus_less_than(2)
+
+    def model_fn():
+      v = variable_scope.variable(1.0, name="foo")
+      distribute_lib.get_tower_context().merge_call(lambda _: _)
+      return v
+
+    dist = mirrored_strategy.MirroredStrategy(
+        ["/device:GPU:0", "/device:GPU:1", "/device:CPU:0"])
+
+    with dist.scope():
+      result = dist.call_for_each_tower(model_fn, run_concurrently=False)
+      self.assertIsInstance(result, values.MirroredVariable)
+      self.assertEquals("foo:0", result.name)
+
+  @test_util.run_in_graph_and_eager_modes(config=config)
+  def testNonMatchingVariableCreation(self):
+    self._skip_eager_if_gpus_less_than(1)
+
+    def model_fn(name):
+      v = variable_scope.variable(1.0, name=name)
+      distribute_lib.get_tower_context().merge_call(lambda _: _)
+      return v
+
+    dist = mirrored_strategy.MirroredStrategy(
+        ["/device:GPU:0", "/device:CPU:0"])
+
+    with dist.scope():
+      names = values.DistributedValues({
+          "/device:CPU:0": "foo",
+          "/device:GPU:0": "bar"
+      })
+      with self.assertRaises(RuntimeError):
+        _ = dist.call_for_each_tower(model_fn, names, run_concurrently=False)
+
+  @test_util.run_in_graph_and_eager_modes(config=config)
+  def testTowerLocalVariable(self):
+    self._skip_eager_if_gpus_less_than(1)
+
+    all_v_sum = {}
+    all_v_mean = {}
+
+    def model_fn(device_id):
+      tower_context = distribute_lib.get_tower_context()
+      with tower_context.tower_local_var_scope("sum"):
+        v_sum = variable_scope.variable(1.0)
+      with tower_context.tower_local_var_scope("mean"):
+        v_mean = variable_scope.variable(4.0)
+      self.assertTrue(isinstance(v_sum, values.TowerLocalVariable))
+      self.assertTrue(isinstance(v_mean, values.TowerLocalVariable))
+      updates = [v_sum.assign_add(2.0 + device_id),
+                 v_mean.assign(6.0 * device_id)]
+      all_v_sum[device_id] = v_sum
+      all_v_mean[device_id] = v_mean
+      return updates, v_sum, v_mean
+
+    dist = mirrored_strategy.MirroredStrategy(
+        ["/device:GPU:0", "/device:CPU:0"])
+
+    with dist.scope():
+      # Create "sum" and "mean" versions of TowerLocalVariables.
+      ret_ops, ret_v_sum, ret_v_mean = dist.call_for_each_tower(
+          model_fn, dist.worker_device_index, run_concurrently=False)
+      # Should see the same wrapping instance in all towers.
+      self.assertIs(all_v_sum[0], ret_v_sum)
+      self.assertIs(all_v_mean[0], ret_v_mean)
+      for i in range(1, dist.num_towers):
+        self.assertIs(all_v_sum[0], all_v_sum[1])
+        self.assertIs(all_v_mean[0], all_v_mean[1])
+
+      # Apply updates
+      self.evaluate(variables.global_variables_initializer())
+      self.evaluate([y for x in ret_ops for y in dist.unwrap(x)])
+      expected_sum = 0.0
+      expected_mean = 0.0
+      for i, d in enumerate(dist.worker_devices):
+        # Test access within a device scope, should see different values.
+        with ops.device(d):
+          v_sum_value = self.evaluate(ret_v_sum.read_value())
+          v_mean_value = self.evaluate(ret_v_mean.read_value())
+          expected = i + 3.0
+          self.assertEqual(expected, v_sum_value)
+          expected_sum += expected
+          expected = i * 6.0
+          self.assertEqual(expected, v_mean_value)
+          expected_mean += expected
+
+      # fetch() should return the value you get by applying the
+      # reduction across all towers.
+      self.assertEqual(expected_sum, self.evaluate(dist.fetch(ret_v_sum)))
+      expected_mean /= len(dist.worker_devices)
+      self.assertEqual(expected_mean, self.evaluate(dist.fetch(ret_v_mean)))
+
+  # NOTE(priyag): Names and name scopes are ignored in eager, hence we are not
+  # testing this in eager mode.
+
+  def testNameScope(self):
+    def model_fn():
+      with ops.name_scope("foo"):
+        a = constant_op.constant(1.0, name="a")
+        distribute_lib.get_tower_context().merge_call(lambda _: _)
+        b = constant_op.constant(1.0, name="b")
+      return a, b
+
+    dist = mirrored_strategy.MirroredStrategy(
+        ["/device:GPU:0", "/device:CPU:0"])
+
+    with context.graph_mode(), dist.scope():
+      with ops.name_scope("main"):
+        result = dist.call_for_each_tower(model_fn, run_concurrently=False)
+        self.assertEquals(2, len(result))
+        for v, name in zip(result, ["a", "b"]):
+          self.assertIsInstance(v, values.DistributedValues)
+          v0, v1 = dist.unwrap(v)
+          self.assertEquals("main/foo/" + name + ":0", v0.name)
+          self.assertEquals("main/tower_1/foo/" + name + ":0", v1.name)
+
+  def testWithDefaultName(self):
+    def model_fn():
+      with ops.name_scope(None, "foo"):
+        a = constant_op.constant(1.0, name="a")
+        distribute_lib.get_tower_context().merge_call(lambda _: _)
+        b = constant_op.constant(2.0, name="b")
+      return a, b
+
+    dist = mirrored_strategy.MirroredStrategy(
+        ["/device:GPU:0", "/device:CPU:0"])
+
+    with context.graph_mode(), dist.scope():
+      result = dist.call_for_each_tower(model_fn, run_concurrently=False)
+      self.assertEquals(2, len(result))
+      for v, name in zip(result, ["a", "b"]):
+        self.assertIsInstance(v, values.DistributedValues)
+        v0, v1 = dist.unwrap(v)
+        self.assertEquals("foo/" + name + ":0", v0.name)
+        self.assertEquals("tower_1/foo/" + name + ":0", v1.name)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/distribute/python/mirrored_strategy_test.py b/tensorflow/contrib/distribute/python/mirrored_strategy_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..a1ef0ecc77a8e8432dfa4eb6da7c324b371dab70
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/mirrored_strategy_test.py
@@ -0,0 +1,91 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for class MirroredStrategy."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.distribute.python import mirrored_strategy
+from tensorflow.contrib.distribute.python import strategy_test_lib
+from tensorflow.python.eager import context
+from tensorflow.python.eager import test
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.training import distribute as distribute_lib
+
+
+@test_util.with_c_api
+class MirroredOneCPUDistributionTest(strategy_test_lib.DistributionTestBase):
+
+  def _get_distribution_strategy(self):
+    return mirrored_strategy.MirroredStrategy(["/device:CPU:0"])
+
+  def testMinimizeLossEager(self):
+    self._test_minimize_loss_eager(self._get_distribution_strategy())
+
+  def testMinimizeLossGraph(self):
+    self._test_minimize_loss_graph(self._get_distribution_strategy())
+
+  def testMapReduce(self):
+    self._test_map_reduce(self._get_distribution_strategy())
+
+  def testDeviceIndex(self):
+    self._test_device_index(self._get_distribution_strategy())
+
+  def testTowerId(self):
+    self._test_tower_id(self._get_distribution_strategy())
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testCallAndMergeExceptions(self):
+    self._test_call_and_merge_exceptions(self._get_distribution_strategy())
+
+
+@test_util.with_c_api
+class VariableCreatorStackTest(test.TestCase):
+
+  def testCreatorStacksAreThreadLocal(self):
+    devices = ["/device:CPU:0", "/device:GPU:0"]
+    dist = mirrored_strategy.MirroredStrategy(devices)
+
+    def model_fn(device_id):
+      assert isinstance(device_id, int)
+      def thread_creator_fn(next_creator, *args, **kwargs):
+        return next_creator(*args, **kwargs) + ":thread_" + str(device_id)
+
+      with variable_scope.variable_creator_scope(thread_creator_fn):
+        # Create a variable in this scope.
+        v = variable_scope.variable(1.0)
+
+        # This will pause the current thread, and execute the other thread.
+        distribute_lib.get_tower_context().merge_call(lambda _: _)
+      return v
+
+    def main_thread_creator(next_creator, *args, **kwargs):
+      # We are not using the underlying next_creator for test purposes.
+      del next_creator, args, kwargs
+      return "main_thread"
+
+    with context.graph_mode(), \
+        dist.scope(), \
+        variable_scope.variable_creator_scope(main_thread_creator):
+      result = dist.call_for_each_tower(model_fn, dist.worker_device_index)
+      result = dist.unwrap(result)
+      expected = ["main_thread:thread_0", "main_thread:thread_1"]
+      self.assertEquals(expected, result)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/distribute/python/monitor.py b/tensorflow/contrib/distribute/python/monitor.py
new file mode 100644
index 0000000000000000000000000000000000000000..7644acedc99361d7287a91832d76bc68cbc6ac0a
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/monitor.py
@@ -0,0 +1,64 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Monitor is responsible for training, checkpointing and recovery."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.eager import context
+from tensorflow.python.framework import errors
+from tensorflow.python.ops import variables
+
+
+class Monitor(object):
+  """Executes training steps, recovers and checkpoints.
+
+  Note that this class is particularly preliminary, experimental, and
+  expected to change.
+  """
+  # TODO(isaprykin): Support step functions that need multiple session calls.
+  # TODO(isaprykin): Support extra arguments to the step function.
+  # TODO(isaprykin): Support recovery, checkpointing and summaries.
+
+  def __init__(self, step_callable, session=None):
+    """Initialize the Monitor with components for executing training steps.
+
+    Args:
+      step_callable: a training `Step` that's capable of signaling when done.
+      session: a `Session` instance that's needed for graph mode.
+
+    Raises:
+      ValueError: if `session` was provided for eager mode or not provided for
+        graph mode.
+    """
+    if context.executing_eagerly():
+      if session is not None:
+        raise ValueError("Should not provide a `session` in Eager mode.")
+      self._run_step = step_callable
+    else:
+      if session is None:
+        raise ValueError("Should provide a `session` in Graph mode.")
+      self._run_step = session.make_callable(step_callable())
+      session.run(variables.global_variables_initializer())
+
+  def run_steps(self, num_steps=None):
+    step = 0
+    while num_steps is None or step < num_steps:
+      try:
+        self._run_step()
+        step += 1
+      except errors.OutOfRangeError:
+        break
diff --git a/tensorflow/contrib/distribute/python/monitor_test.py b/tensorflow/contrib/distribute/python/monitor_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..8277e1e7919e86ef616b31d0986589dcc9c49bbd
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/monitor_test.py
@@ -0,0 +1,84 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for class Monitor."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+import numpy
+
+from tensorflow.contrib.distribute.python import combinations
+from tensorflow.contrib.distribute.python import monitor as monitor_lib
+from tensorflow.contrib.distribute.python import one_device_strategy
+from tensorflow.contrib.distribute.python.single_loss_example import single_loss_example
+from tensorflow.python.eager import context
+from tensorflow.python.eager import test
+from tensorflow.python.framework import ops
+from tensorflow.python.training import gradient_descent
+
+
+class MonitorTest(test.TestCase, parameterized.TestCase):
+
+  @combinations.generate(
+      combinations.times(
+          combinations.distributions_and_v1_optimizers(),
+          combinations.combine(mode=combinations.graph_and_eager_modes)))
+  def testTrainNetwork(self, distribution, optimizer_fn):
+    with distribution.scope():
+      single_loss_step, layer = single_loss_example(optimizer_fn, distribution)
+
+      if context.executing_eagerly():
+        monitor = monitor_lib.Monitor(single_loss_step, None)
+      else:
+        with self.test_session() as sess:
+          monitor = monitor_lib.Monitor(single_loss_step, sess)
+
+      monitor.run_steps(1)
+
+      self.assertEqual(1, len(layer.trainable_variables))
+      mirrored_weight_variable = layer.trainable_variables[0]
+      start_error = self.evaluate(distribution.fetch(mirrored_weight_variable))
+      start_error = abs(numpy.array(start_error) - 1)
+
+      monitor.run_steps(9)
+      end_error = self.evaluate(distribution.fetch(mirrored_weight_variable))
+      end_error = abs(numpy.array(end_error) - 1)
+      self.assertGreaterEqual(start_error, end_error)
+
+  def testPassingASessionInEager(self):
+    distribution = one_device_strategy.OneDeviceStrategy(
+        "/device:CPU:0")
+    step_function, _ = single_loss_example(
+        lambda: gradient_descent.GradientDescentOptimizer(0.2), distribution)
+
+    with self.test_session() as sess:
+      with self.assertRaisesRegexp(ValueError, "Should not provide"):
+        _ = monitor_lib.Monitor(step_function, sess)
+
+  def testNotPassingASessionInGraph(self):
+    distribution = one_device_strategy.OneDeviceStrategy(
+        "/device:CPU:0")
+    step_function, _ = single_loss_example(
+        lambda: gradient_descent.GradientDescentOptimizer(0.2), distribution)
+
+    with context.graph_mode(), ops.Graph().as_default():
+      with self.assertRaisesRegexp(ValueError, "Should provide"):
+        _ = monitor_lib.Monitor(step_function, session=None)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/distribute/python/one_device_strategy.py b/tensorflow/contrib/distribute/python/one_device_strategy.py
new file mode 100644
index 0000000000000000000000000000000000000000..39c49442b9c3245cfd0b67a51be68773a6fd3ff4
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/one_device_strategy.py
@@ -0,0 +1,148 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Class OneDeviceStrategy implementing DistributionStrategy."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import six
+
+from tensorflow.contrib.distribute.python import values
+from tensorflow.contrib.eager.python import datasets
+from tensorflow.python.eager import context
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.training import distribute as distribute_lib
+
+
+# TODO(josh11b): Replace asserts in this file with if ...: raise ...
+
+
+class OneDeviceStrategy(distribute_lib.DistributionStrategy):
+  """A distribution strategy for running on a single device."""
+  # TODO(josh11b): Do we wrap values in types to generate errors if you are
+  # doing something that won't work with other DistributionStrategy
+  # implementations?
+
+  def __init__(self, device):
+    super(OneDeviceStrategy, self).__init__()
+    self._device = device
+
+  def _create_variable(self, next_creator, *args, **kwargs):
+    # No need to distinguish tower-local variables when not mirroring,
+    # we just enforce that they are not trainable.
+    if kwargs.pop("tower_local_reduce_method", None) is not None:
+      kwargs["trainable"] = False
+
+    colocate_with = kwargs.pop("colocate_with", None)
+    if colocate_with is None:
+      with ops.device(self._device):
+        return next_creator(*args, **kwargs)
+    if isinstance(colocate_with, six.string_types):
+      with ops.device(colocate_with):
+        return next_creator(*args, **kwargs)
+    if (isinstance(colocate_with, list) and len(colocate_with) == 1 and
+        isinstance(colocate_with[0], six.string_types)):
+      with ops.device(colocate_with[0]):
+        return next_creator(*args, **kwargs)
+    with ops.colocate_with(colocate_with):
+      return next_creator(*args, **kwargs)
+
+  def distribute_dataset(self, dataset):
+    if context.executing_eagerly():
+      return datasets.Iterator(dataset)
+    else:
+      return dataset.make_one_shot_iterator()
+
+  def _broadcast(self, tensor, destinations):
+    return tensor
+
+  def _call_for_each_tower(self, fn, *args, **kwargs):
+    # We don't run `fn` in multiple threads in OneDeviceStrategy.
+    kwargs.pop("run_concurrently", None)
+    with ops.device(self._device), _OneDeviceTowerContext(self):
+      return fn(*args, **kwargs)
+
+  def map(self, map_over, fn, *args, **kwargs):
+    with ops.device(self._device):
+      return values.MapOutput([fn(m, *args, **kwargs) for m in map_over])
+
+  def _reduce(self, method_string, value, destinations):
+    if not isinstance(value, values.MapOutput):
+      return value
+    l = value.get()
+    assert l
+    with ops.device(self._device):
+      if method_string == "sum":
+        return math_ops.add_n(l)
+      elif method_string == "mean":
+        return math_ops.add_n(l) / len(l)
+      else:
+        assert False
+
+  def _update(self, var, fn, *args, **kwargs):
+    with ops.device(self._device), distribute_lib.UpdateContext(self._device):
+      return fn(var, *args, **kwargs)
+
+  def _update_non_slot(self, colocate_with, fn, *args, **kwargs):
+    del colocate_with
+    with ops.device(self._device), distribute_lib.UpdateContext(self._device):
+      return fn(*args, **kwargs)
+
+  def _fetch(self, val, destination, fn):
+    """Return a copy of `val` or `fn(val)` on `destination`."""
+    with ops.device(self._device):
+      v = fn(val)
+    with ops.device(destination):
+      return array_ops.identity(v)
+
+  def _unwrap(self, value):
+    return [value]
+
+  @property
+  def is_single_tower(self):
+    return True
+
+  @property
+  def num_towers(self):
+    return 1
+
+  @property
+  def worker_devices(self):
+    return [self._device]
+
+  @property
+  def parameter_devices(self):
+    return [self._device]
+
+  def non_slot_devices(self, var_list):
+    del var_list
+    return [self._device]
+
+  def _worker_device_index(self):
+    return 0
+
+
+class _OneDeviceTowerContext(distribute_lib.TowerContext):
+
+  def __init__(self, distribution_strategy):
+    distribute_lib.TowerContext.__init__(
+        self, distribution_strategy, tower_id=0)
+
+  @property
+  def device(self):
+    return self._distribution_strategy.worker_devices[0]
diff --git a/tensorflow/contrib/distribute/python/one_device_strategy_test.py b/tensorflow/contrib/distribute/python/one_device_strategy_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..7101ed0756f44b846f10ddc6d429afe005a2f196
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/one_device_strategy_test.py
@@ -0,0 +1,54 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for class OneDeviceStrategy."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.distribute.python import one_device_strategy
+from tensorflow.contrib.distribute.python import strategy_test_lib
+from tensorflow.python.eager import test
+from tensorflow.python.framework import test_util
+
+
+@test_util.with_c_api
+class OneDeviceStrategyTest(strategy_test_lib.DistributionTestBase):
+
+  def _get_distribution_strategy(self):
+    return one_device_strategy.OneDeviceStrategy("/device:CPU:0")
+
+  def testMinimizeLossEager(self):
+    self._test_minimize_loss_eager(self._get_distribution_strategy())
+
+  def testMinimizeLossGraph(self):
+    self._test_minimize_loss_graph(self._get_distribution_strategy())
+
+  def testMapReduce(self):
+    self._test_map_reduce(self._get_distribution_strategy())
+
+  def testDeviceIndex(self):
+    self._test_device_index(self._get_distribution_strategy())
+
+  def testTowerId(self):
+    self._test_tower_id(self._get_distribution_strategy())
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testCallAndMergeExceptions(self):
+    self._test_call_and_merge_exceptions(self._get_distribution_strategy())
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/distribute/python/optimizer_v2_test.py b/tensorflow/contrib/distribute/python/optimizer_v2_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..a0912b625f44342d22acc0ce9bb52a6b632c75a0
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/optimizer_v2_test.py
@@ -0,0 +1,70 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for running legacy optimizer code with DistributionStrategy."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+import numpy
+
+from tensorflow.contrib.distribute.python import combinations
+from tensorflow.contrib.distribute.python.single_loss_example import minimize_loss_example
+from tensorflow.python.eager import context
+from tensorflow.python.eager import test
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import variables
+
+
+class MinimizeLossOptimizerV2Test(test.TestCase, parameterized.TestCase):
+
+  @combinations.generate(
+      combinations.times(
+          combinations.distributions_and_v2_optimizers(),
+          combinations.combine(mode=["graph"], use_callable_loss=[True, False])
+          + combinations.combine(mode=["eager"], use_callable_loss=[True])))
+  def testTrainNetwork(self, distribution, optimizer_fn,
+                       use_callable_loss=True):
+    with distribution.scope():
+      model_fn, dataset, layer = minimize_loss_example(
+          optimizer_fn, use_bias=True, use_callable_loss=use_callable_loss)
+
+      iterator = distribution.distribute_dataset(dataset)
+
+      def run_step():
+        return control_flow_ops.group(distribution.unwrap(
+            distribution.call_for_each_tower(
+                model_fn, iterator.get_next(), run_concurrently=layer.built)))
+
+      if not context.executing_eagerly():
+        with self.test_session() as sess:
+          run_step = sess.make_callable(run_step())
+        self.evaluate(variables.global_variables_initializer())
+
+      weights, biases = [], []
+      for _ in range(10):
+        run_step()
+
+        weights.append(self.evaluate(distribution.fetch(layer.kernel)))
+        biases.append(self.evaluate(distribution.fetch(layer.bias)))
+
+      error = abs(numpy.add(numpy.squeeze(weights), numpy.squeeze(biases)) - 1)
+      is_not_increasing = all(y <= x for x, y in zip(error, error[1:]))
+      self.assertTrue(is_not_increasing)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/distribute/python/prefetching_ops_v2.py b/tensorflow/contrib/distribute/python/prefetching_ops_v2.py
new file mode 100644
index 0000000000000000000000000000000000000000..dfcbb8568f92ebabbeeedb45ee677e4ee23d77dc
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/prefetching_ops_v2.py
@@ -0,0 +1,168 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Extension of prefetching_ops to support more than one device."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import warnings
+
+from tensorflow.contrib.data.python.ops import contrib_op_loader  # pylint: disable=unused-import
+from tensorflow.contrib.data.python.ops import gen_dataset_ops
+from tensorflow.contrib.data.python.ops import prefetching_ops
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.ops import iterator_ops
+from tensorflow.python.data.util import nest as data_nest
+from tensorflow.python.data.util import sparse
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import function
+from tensorflow.python.framework import ops
+from tensorflow.python.util import nest
+
+
+# pylint: disable=protected-access
+class _PrefetchToDeviceIterator(object):
+  """A replacement for @{tf.data.Iterator} that prefetches to another device."""
+
+  def __init__(self, input_dataset, devices, buffer_size):
+    self._input_dataset = input_dataset
+    self._get_next_call_count = 0
+    self._devices = devices
+    input_iterator = input_dataset.make_one_shot_iterator()
+    input_iterator_handle = input_iterator.string_handle()
+
+    @function.Defun(dtypes.string)
+    def _prefetch_fn(handle):
+      """Prefetches one element from `input_iterator`."""
+      remote_iterator = iterator_ops.Iterator.from_string_handle(
+          handle, input_iterator.output_types, input_iterator.output_shapes,
+          input_iterator.output_classes)
+      ret = remote_iterator.get_next()
+      return nest.flatten(sparse.serialize_sparse_tensors(ret))
+
+    target_device = gen_dataset_ops.iterator_get_device(
+        input_iterator._iterator_resource)
+    self._buffering_resources = []
+    for device in nest.flatten(self._devices):
+      with ops.device(device):
+        buffer_resource_handle = prefetching_ops.function_buffering_resource(
+            f=_prefetch_fn,
+            target_device=target_device,
+            string_arg=input_iterator_handle,
+            buffer_size=buffer_size)
+        self._buffering_resources.append(buffer_resource_handle)
+
+  def get_next(self, name=None):
+    """See @{tf.data.Iterator.get_next}."""
+    self._get_next_call_count += 1
+    if self._get_next_call_count > iterator_ops.GET_NEXT_CALL_WARNING_THRESHOLD:
+      warnings.warn(iterator_ops.GET_NEXT_CALL_WARNING_MESSAGE)
+
+    flat_result = []
+    # TODO(priyag): This will fail if the input size (typically number of
+    # batches) is not divisible by number of devices.
+    # How do we handle that more gracefully / let the user know?
+    for buffer_resource in self._buffering_resources:
+      flat_ret = gen_dataset_ops.function_buffering_resource_get_next(
+          buffer_resource,
+          output_types=data_nest.flatten(sparse.as_dense_types(
+              self.output_types, self.output_classes)), name=name)
+
+      ret = sparse.deserialize_sparse_tensors(
+          data_nest.pack_sequence_as(self.output_types, flat_ret),
+          self.output_types, self.output_shapes, self.output_classes)
+
+      for tensor, shape in zip(
+          data_nest.flatten(ret), data_nest.flatten(self.output_shapes)):
+        if isinstance(tensor, ops.Tensor):
+          tensor.set_shape(shape)
+      flat_result.append(ret)
+
+    return nest.pack_sequence_as(self._devices, flat_result)
+
+  @property
+  def output_classes(self):
+    return self._input_dataset.output_classes
+
+  @property
+  def output_shapes(self):
+    return self._input_dataset.output_shapes
+
+  @property
+  def output_types(self):
+    return self._input_dataset.output_types
+# pylint: enable=protected-access
+
+
+class _PrefetchToDeviceDataset(dataset_ops.Dataset):
+  """A `Dataset` whose iterator prefetches elements to other device(s)."""
+
+  def __init__(self, input_dataset, devices, buffer_size):
+    self._input_dataset = input_dataset
+    self._devices = devices
+    self._buffer_size = buffer_size if buffer_size is not None else 1
+
+  def make_one_shot_iterator(self):
+    return _PrefetchToDeviceIterator(self._input_dataset, self._devices,
+                                     self._buffer_size)
+
+  def make_initializable_iterator(self, shared_name=None):
+    raise NotImplementedError("`prefetch_to_devices()` is not currently "
+                              "compatible with initializable iterators. Use "
+                              "`make_one_shot_iterator()` instead.")
+
+  def _as_variant_tensor(self):
+    # TODO(mrry): Raise this error earlier (e.g. when one of the Dataset
+    # transformation methods is called.
+    # TODO(mrry): Investigate support for chaining further transformations after
+    # the prefetch, including GPU support.
+    raise NotImplementedError("`prefetch_to_devices()` must be the last "
+                              "transformation in a dataset pipeline.")
+
+  # TODO(priyag): Fix the output types, shapes and classes to match the result
+  # of get_next (which has the additional nesting layer of devices now).
+  @property
+  def output_types(self):
+    return self._input_dataset.output_types
+
+  @property
+  def output_shapes(self):
+    return self._input_dataset.output_shapes
+
+  @property
+  def output_classes(self):
+    return self._input_dataset.output_classes
+
+
+def prefetch_to_devices(devices, buffer_size=None):
+  """A transformation that prefetches dataset values to the given `devices`.
+
+  NOTE: Although the transformation creates a @{tf.data.Dataset}, the
+  transformation must be the final `Dataset` in the input pipeline.
+
+  Args:
+    devices: A nested structure of devices on which to prefetch the data. It can
+      be a single device name, or a tuple or list of device names.
+    buffer_size: (Optional.) The number of elements to buffer on each device.
+      Defaults to an automatically chosen value.
+
+  Returns:
+    A `Dataset` transformation function, which can be passed to
+    @{tf.data.Dataset.apply}.
+  """
+  def _apply_fn(dataset):
+    return _PrefetchToDeviceDataset(dataset, devices, buffer_size)
+
+  return _apply_fn
diff --git a/tensorflow/contrib/distribute/python/prefetching_ops_v2_test.py b/tensorflow/contrib/distribute/python/prefetching_ops_v2_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..8ed16f4607881f2864479c04b4c25e95d9fa1850
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/prefetching_ops_v2_test.py
@@ -0,0 +1,68 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for prefetching_ops_v2."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.distribute.python import prefetching_ops_v2
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import test_util
+from tensorflow.python.platform import test
+
+
+class PrefetchingOpsV2Test(test.TestCase):
+
+  def testPrefetchToOneDevice(self):
+    if not test_util.is_gpu_available():
+      self.skipTest("No GPU available")
+
+    host_dataset = dataset_ops.Dataset.range(10)
+    device_dataset = host_dataset.apply(
+        prefetching_ops_v2.prefetch_to_devices("/gpu:0"))
+
+    iterator = device_dataset.make_one_shot_iterator()
+    next_element = iterator.get_next()
+
+    with self.test_session() as sess:
+      for i in range(10):
+        self.assertEqual(i, sess.run(next_element))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(next_element)
+
+  def testPrefetchToTwoDevicesInAList(self):
+    if not test_util.is_gpu_available():
+      self.skipTest("No GPU available")
+
+    host_dataset = dataset_ops.Dataset.range(10)
+    device_dataset = host_dataset.apply(
+        prefetching_ops_v2.prefetch_to_devices(["/cpu:0", "/gpu:0"]))
+
+    iterator = device_dataset.make_one_shot_iterator()
+    next_element = iterator.get_next()
+
+    output = []
+    with self.test_session() as sess:
+      for _ in range(5):
+        result = sess.run(next_element)
+        self.assertEqual(2, len(result))
+        output.extend(result)
+      self.assertEquals(set(range(10)), set(output))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(next_element)
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/distribute/python/shared_variable_creator.py b/tensorflow/contrib/distribute/python/shared_variable_creator.py
new file mode 100644
index 0000000000000000000000000000000000000000..a7083e279f20803b227dcd52f6420ae832aa2df4
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/shared_variable_creator.py
@@ -0,0 +1,97 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utility to re-use variables created on first device on subsequent devices."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import re
+
+_VARIABLE_UNIQUIFYING_REGEX = re.compile(r"_\d/")
+_VARIABLE_UNIQUIFYING_REGEX_AT_END = re.compile(r"_\d$")
+
+
+def _canonicalize_variable_name(name):
+  # If no name is specified, uses default name "Variable".
+  if name is None:
+    return "Variable"
+  # Replace all instances of "_<num>/" with "/"
+  name = _VARIABLE_UNIQUIFYING_REGEX.sub("/", name)
+  # Replace any instances of "_<num>" at the end of the string with ""
+  name = _VARIABLE_UNIQUIFYING_REGEX_AT_END.sub("", name)
+  return name
+
+
+def make_fn(shared_variable_store, device_id):
+  """Construct the variable creator function for device `device_id`.
+
+  Constructs custom variable creator functions for the given device.
+  On first device (device_id == 0), it creates the variable using the
+  `next_creator`, and stores it in the provided `shared_variable_store`.
+  On all other devices (device_id > 0), it tries to re-use the variable
+  already created with the same name. If no such variable exists, it throws an
+  error.
+  Additionally, we de-uniquify variable names before checking for matches. This
+  helps re-use variables which are intended to be the same but have different
+  names due to variable uniquification happening upstream. Since this might
+  mean we may have multiple variables with the same canonical name, we store
+  them in a list per canonical name and return them in the same order as well.
+
+  Args:
+    shared_variable_store: A dictionary that we will use to store variables
+      created on the first device, and re-used by creators for other devices.
+    device_id: Integer index of the device whose creator should be
+      constructed.
+
+  Returns:
+    An appropriate creator function based on device_id.
+
+  """
+  variable_scope_access_index = {}
+  assert isinstance(device_id, int)
+
+  def create_new_variable(next_creator, *args, **kwargs):
+    """Create the variable using `next_creator` and store it."""
+    canonical_name = _canonicalize_variable_name(kwargs.get("name"))
+    v = next_creator(*args, **kwargs)
+
+    if canonical_name not in shared_variable_store:
+      shared_variable_store[canonical_name] = []
+    shared_variable_store[canonical_name].append(v)
+    return v
+
+  def reuse_variable(next_creator, *args, **kwargs):
+    """Re-use existing variable from store with same name (in order)."""
+    del next_creator, args
+    name = kwargs.get("name")
+    canonical_name = _canonicalize_variable_name(name)
+
+    try:
+      variable_index = variable_scope_access_index.get(canonical_name, 0)
+      v = shared_variable_store[canonical_name][variable_index]
+      # TODO(priyag): Make this variable re-use more robust by adding checks
+      # that the requested shape and dtype match the existing variable.
+      variable_scope_access_index[canonical_name] = variable_index + 1
+      return v
+    except (KeyError, IndexError):
+      raise RuntimeError(
+          "Tried to create variable {} with mismatching name on device {}".
+          format(name, device_id))
+
+  if device_id == 0:
+    return create_new_variable
+  else:
+    return reuse_variable
diff --git a/tensorflow/contrib/distribute/python/shared_variable_creator_test.py b/tensorflow/contrib/distribute/python/shared_variable_creator_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..713494d603b855be2863af9f24ab98d4cf048042
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/shared_variable_creator_test.py
@@ -0,0 +1,75 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for SharedVariableCreator."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.distribute.python import shared_variable_creator
+from tensorflow.python.eager import test
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import variable_scope
+
+
+class CanonicalizeVariableNameTest(test.TestCase):
+
+  def _canonicalize(self, name):
+    return shared_variable_creator._canonicalize_variable_name(name)
+
+  def testNoName(self):
+    self.assertEquals("Variable", self._canonicalize(None))
+
+  def testPatternInMiddle(self):
+    self.assertEquals("foo/bar/baz", self._canonicalize("foo_1/bar_1/baz"))
+
+  def testPatternAtEnd(self):
+    self.assertEquals("foo", self._canonicalize("foo_1"))
+
+  def testWrongPatterns(self):
+    self.assertEquals("foo_1:0", self._canonicalize("foo_1:0"))
+    self.assertEquals("foo1", self._canonicalize("foo1"))
+    self.assertEquals("foo_a", self._canonicalize("foo_a"))
+
+
+@test_util.with_c_api
+class SharedVariableCreatorTest(test.TestCase):
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testSharedVariable(self):
+
+    shared_variable_store = {}
+    num_devices = 3
+    creator_fns = []
+    for i in range(num_devices):
+      creator_fn = shared_variable_creator.make_fn(shared_variable_store, i)
+      creator_fns.append(creator_fn)
+
+    with variable_scope.variable_creator_scope(creator_fns[0]):
+      v0 = variable_scope.variable(1.0, name="foo")
+
+    with variable_scope.variable_creator_scope(creator_fns[1]):
+      v1 = variable_scope.variable(1.0, name="foo")
+
+    with variable_scope.variable_creator_scope(creator_fns[2]):
+      v2 = variable_scope.variable(1.0, name="foo")
+
+    # v1 and v2 should be same as v0
+    self.assertIs(v1, v0)
+    self.assertIs(v2, v0)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/distribute/python/single_loss_example.py b/tensorflow/contrib/distribute/python/single_loss_example.py
new file mode 100644
index 0000000000000000000000000000000000000000..cef5fd2f8943d348a0721cd72032bf6cb2199ad9
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/single_loss_example.py
@@ -0,0 +1,102 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""A simple network to use in tests and examples."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.distribute.python import step_fn
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import constant_op
+from tensorflow.python.layers import core
+from tensorflow.python.layers import normalization
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+
+
+def single_loss_example(optimizer_fn, distribution, use_bias=False):
+  """Build a very simple network to use in tests and examples."""
+  dataset = dataset_ops.Dataset.from_tensors([[1.]]).repeat()
+  optimizer = optimizer_fn()
+  layer = core.Dense(1, use_bias=use_bias)
+
+  def loss_fn(x):
+    y = array_ops.reshape(layer(x), []) - constant_op.constant(1.)
+    return y * y
+
+  single_loss_step = step_fn.StandardSingleLossStep(dataset, loss_fn, optimizer,
+                                                    distribution)
+
+  # Layer is returned for inspecting the kernels in tests.
+  return single_loss_step, layer
+
+
+def minimize_loss_example(optimizer_fn,
+                          use_bias=False,
+                          use_callable_loss=True,
+                          create_optimizer_inside_model_fn=False):
+  """Example of non-distribution-aware legacy code."""
+  dataset = dataset_ops.Dataset.from_tensors([[1.]]).repeat()
+  # An Optimizer instance is created either outside or inside model_fn.
+  outer_optimizer = None
+  if not create_optimizer_inside_model_fn:
+    outer_optimizer = optimizer_fn()
+
+  layer = core.Dense(1, use_bias=use_bias)
+
+  def model_fn(x):
+    """A very simple model written by the user."""
+
+    def loss_fn():
+      y = array_ops.reshape(layer(x), []) - constant_op.constant(1.)
+      return y * y
+
+    optimizer = outer_optimizer or optimizer_fn()
+
+    if use_callable_loss:
+      return optimizer.minimize(loss_fn)
+    else:
+      return optimizer.minimize(loss_fn())
+
+  return model_fn, dataset, layer
+
+
+def batchnorm_example(optimizer_fn,
+                      batch_per_epoch=1,
+                      momentum=0.9,
+                      renorm=False):
+  """Example of non-distribution-aware legacy code with batch normalization."""
+  # input shape is [16, 8], input values are increasing in both dimensions.
+  dataset = dataset_ops.Dataset.from_tensor_slices(
+      [[[float(x * 8 + y + z * 100)
+         for y in range(8)]
+        for x in range(16)]
+       for z in range(batch_per_epoch)]).repeat()
+  optimizer = optimizer_fn()
+  batchnorm = normalization.BatchNormalization(
+      renorm=renorm, momentum=momentum, fused=False)
+
+  def model_fn(x):
+
+    def loss_fn():
+      y = math_ops.reduce_sum(batchnorm(x, training=True), axis=1)
+      loss = math_ops.reduce_mean(y - constant_op.constant(1.))
+      return loss
+
+    # Callable loss.
+    return optimizer.minimize(loss_fn)
+
+  return model_fn, dataset, batchnorm
diff --git a/tensorflow/contrib/distribute/python/step_fn.py b/tensorflow/contrib/distribute/python/step_fn.py
new file mode 100644
index 0000000000000000000000000000000000000000..82514c64be40b421c4a9887932f2cfb8e1ac4be0
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/step_fn.py
@@ -0,0 +1,103 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""The step function abstraction represents a single training step."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.eager import backprop
+from tensorflow.python.training import optimizer as optimizer_lib
+
+
+class Step(object):
+  """Interface for performing each step of a training algorithm."""
+
+  def __init__(self, distribution):
+    self._distribution = distribution
+
+  @property
+  def distribution(self):
+    return self._distribution
+
+  def __call__(self):
+    """Perform one step of this training algorithm."""
+    return self.step(self.inputs())
+
+  def inputs(self):
+    """For the generating the input to be passed to `step()`."""
+    raise NotImplementedError("must be implemented in descendants")
+
+  def step(self, inputs):
+    """Perform the main computation of this training algorithm."""
+    raise NotImplementedError("must be implemented in descendants")
+
+
+class StandardInputStep(Step):
+  """Step with a standard implementation of input handling.
+
+  Args:
+    input_dataset: a tf.data Dataset that provides input.
+  """
+
+  def __init__(self, input_dataset, distribution):
+    Step.__init__(self, distribution)
+    self._distributed_input = distribution.distribute_dataset(input_dataset)
+
+  def inputs(self):
+    return self._distributed_input.get_next()
+
+
+class StandardSingleLossStep(StandardInputStep):
+  """A step function that implements a training step for a feed forward network.
+
+  An instance of this class is intended to be used as a callable:
+
+  ```python
+  ...
+  step = step_fn.StandardSingleLossStep(dataset, loss_fn, optimizer)
+  step.initialize(distribution)
+
+  # Run a single training step on a given DistributionStrategy:
+  step(distribution)
+  ...
+  ```
+
+  Args:
+    input_dataset: a tf.data Dataset that provides input.
+    loss_fn: a function that returns loss.
+    optimizer: an optimizer that implements an update rule.
+    distribution: a `DistributionStrategy` object.
+  """
+
+  def __init__(self, input_dataset, loss_fn, optimizer, distribution):
+    StandardInputStep.__init__(self, input_dataset, distribution)
+    self._loss_fn = loss_fn
+    self._optimizer = optimizer
+    self._is_run_concurrently = False
+
+  def step(self, inputs):
+    with self._distribution.scope():
+      gradients_fn = backprop.implicit_grad(self._loss_fn)
+      gradients_fn = optimizer_lib.get_filtered_grad_fn(gradients_fn)
+
+      grads_and_vars = self.distribution.call_for_each_tower(
+          gradients_fn, inputs, run_concurrently=self._is_run_concurrently)
+      # If threads use layers, then we need to run the first step sequentially,
+      # so that layers.build() is not executed in parallel.  Otherwise, multiple
+      # sets of mirrored variables are going to be created.
+      self._is_run_concurrently = True
+      return self._optimizer._distributed_apply(  # pylint: disable=protected-access
+          self.distribution, grads_and_vars)
diff --git a/tensorflow/contrib/distribute/python/step_fn_test.py b/tensorflow/contrib/distribute/python/step_fn_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..75c5ec9659d193e77d219ba79977615d58841d64
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/step_fn_test.py
@@ -0,0 +1,62 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for class Step."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+import numpy
+
+from tensorflow.contrib.distribute.python import combinations
+from tensorflow.contrib.distribute.python.single_loss_example import single_loss_example
+from tensorflow.python.eager import context
+from tensorflow.python.eager import test
+from tensorflow.python.ops import variables
+
+
+class SingleLossStepTest(test.TestCase, parameterized.TestCase):
+
+  @combinations.generate(
+      combinations.times(
+          combinations.distributions_and_v1_optimizers(),
+          combinations.combine(mode=combinations.graph_and_eager_modes)))
+  def testTrainNetwork(self, distribution, optimizer_fn):
+    with distribution.scope():
+      single_loss_step, layer = single_loss_example(
+          optimizer_fn, distribution, use_bias=True)
+
+      if context.executing_eagerly():
+        run_step = single_loss_step
+      else:
+        with self.test_session() as sess:
+          run_step = sess.make_callable(single_loss_step())
+      self.evaluate(variables.global_variables_initializer())
+
+      weights, biases = [], []
+      for _ in range(10):
+        run_step()
+
+        weights.append(self.evaluate(distribution.fetch(layer.kernel)))
+        biases.append(self.evaluate(distribution.fetch(layer.bias)))
+
+      error = abs(numpy.add(numpy.squeeze(weights), numpy.squeeze(biases)) - 1)
+      is_not_increasing = all(y <= x for x, y in zip(error, error[1:]))
+      self.assertTrue(is_not_increasing)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/distribute/python/strategy_test_lib.py b/tensorflow/contrib/distribute/python/strategy_test_lib.py
new file mode 100644
index 0000000000000000000000000000000000000000..2b4ad9f146bc1d6a987fbeecbb05122946137154
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/strategy_test_lib.py
@@ -0,0 +1,225 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Library for testing DistributionStrategy descendants."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.python.eager import backprop
+from tensorflow.python.eager import context
+from tensorflow.python.eager import test
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import ops
+from tensorflow.python.layers import core
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.training import distribute as distribute_lib
+from tensorflow.python.training import optimizer
+
+
+class _TestException(Exception):
+  pass
+
+
+# May be the argument to either distribution.call_for_each_tower() or
+# get_tower_context().merge_call()
+def _raise_exception_fn(_=None):
+  raise _TestException()
+
+
+# Must be the argument to a distribution.call_for_each_tower() call, calls a
+# get_tower_context().merge_call() that raises an exception.
+def _merge_raises_fn():
+  distribute_lib.get_tower_context().merge_call(_raise_exception_fn)
+
+
+# Must be the argument to a get_tower_context().merge_call() call, calls
+# dist.call_for_each_tower() with a function that raises an exception.
+def _call_raises_fn(dist):
+  dist.call_for_each_tower(_raise_exception_fn)
+
+
+# Must be the argument to a distribution.call_for_each_tower() call,
+# calls a get_tower_context().merge_call() that calls a
+# call_for_each_tower() that raises an exception.
+def _merge_call_raises_fn():
+  distribute_lib.get_tower_context().merge_call(_call_raises_fn)
+
+
+# Must be the argument to a get_tower_context().merge_call() call, calls
+# dist.call_for_each_tower() with a function that calls a
+# get_tower_context().merge_call() that raises an exception.
+def _call_merge_raises_fn(dist):
+  dist.call_for_each_tower(_merge_raises_fn)
+
+
+# Must be the argument to a distribution.call_for_each_tower() call, calls a
+# get_tower_context().merge_call() that calls a call_for_each_tower() that
+# calls a get_tower_context().merge_call() that raises an exception.
+def _merge_call_merge_raises_fn():
+  distribute_lib.get_tower_context().merge_call(_call_merge_raises_fn)
+
+
+class DistributionTestBase(test.TestCase):
+  """Some tests that should work with any DistributionStrategy."""
+
+  def _test_minimize_loss_eager(self, d):
+    with d.scope():
+      l = core.Dense(1, use_bias=False)
+
+      def loss(x):
+        # TODO(josh11b): What if this constant was instead a captured
+        # value?  Would it need to be a value that has been passed
+        # through d.broadcast()?
+        y = array_ops.reshape(l(x), []) - constant_op.constant(1.)
+        return y * y
+      # TODO(isaprykin): Extract implicit_grad+get_filtered_grad_fn into a
+      # common `implicit_grad` function and put it in DistributionStrategy.
+      grad_fn = backprop.implicit_grad(loss)
+      grad_fn = optimizer.get_filtered_grad_fn(grad_fn)
+
+      def update(v, g):
+        return v.assign_sub(0.2 * g)
+
+      one = d.broadcast(constant_op.constant([[1.]]))
+
+      def step():
+        """Perform one optimization step."""
+        # Run forward & backward to get gradients, variables list.
+        g_v = d.call_for_each_tower(grad_fn, one, run_concurrently=l.built)
+
+        # Update the variables using the gradients and the update() function.
+        before_list = []
+        after_list = []
+        for g, v in g_v:
+          fetched = d.fetch(v)
+          before_list.append(fetched)
+          # control_dependencies irrelevant but harmless in eager execution
+          with ops.control_dependencies([fetched]):
+            g = d.reduce("sum", g, destinations=v)
+            with ops.control_dependencies(d.unwrap(d.update(v, update, g))):
+              after_list.append(d.fetch(v))
+        return before_list, after_list
+
+      for i in range(10):
+        b, a = step()
+        if i == 0:
+          before, = b  # pylint: disable=unbalanced-tuple-unpacking
+        after, = a  # pylint: disable=unbalanced-tuple-unpacking
+
+      error_before = abs(before.numpy() - 1)
+      error_after = abs(after.numpy() - 1)
+      # Error should go down
+      self.assertLess(error_after, error_before)
+
+  def _test_minimize_loss_graph(self, d, soft_placement=False):
+    config = config_pb2.ConfigProto()
+    config.allow_soft_placement = soft_placement
+    config.gpu_options.per_process_gpu_memory_fraction = 0.3
+    with context.graph_mode(), \
+         ops.Graph().as_default(), \
+         self.test_session(config=config) as sess, \
+         d.scope():
+      l = core.Dense(1, use_bias=False)
+
+      def loss(x):
+        # TODO(josh11b): What if this constant was instead a captured
+        # value?  Would it need to be a value that has been passed
+        # through d.broadcast()?
+        y = array_ops.reshape(l(x), []) - constant_op.constant(1.)
+        return y * y
+
+      grad_fn = backprop.implicit_grad(loss)
+
+      def update(v, g):
+        return v.assign_sub(0.2 * g)
+
+      one = d.broadcast(constant_op.constant([[1.]]))
+
+      def step():
+        """Perform one optimization step."""
+        # Run forward & backward to get gradients, variables list.
+        g_v = d.call_for_each_tower(grad_fn, one)
+
+        # Update the variables using the gradients and the update() function.
+        before_list = []
+        after_list = []
+        for g, v in g_v:
+          fetched = d.fetch(v)
+          before_list.append(fetched)
+          with ops.control_dependencies([fetched]):
+            g = d.reduce("sum", g, destinations=v)
+            with ops.control_dependencies(d.unwrap(d.update(v, update, g))):
+              after_list.append(d.fetch(v))
+        return before_list, after_list
+
+      before_out, after_out = step()
+      variables.global_variables_initializer().run()
+      for i in range(10):
+        b, a = sess.run((before_out, after_out))
+        if i == 0:
+          before, = b
+        after, = a
+
+      error_before = abs(before - 1)
+      error_after = abs(after - 1)
+      # Error should go down
+      self.assertLess(error_after, error_before)
+
+  def _test_map_reduce(self, d, in_graph=None):
+    with d.scope():
+      map_in = [constant_op.constant(i) for i in range(10)]
+      map_out = d.map(map_in, lambda x, y: x * y, 2)
+      observed = d.fetch(d.reduce("sum", map_out))
+      expected = 90  # 2 * (0 + 1 + ... + 9)
+      self.assertEqual(expected, observed.numpy())
+
+  def _test_device_index(self, d):
+    with d.scope():
+      expected_devices = [False] * len(d.worker_devices)
+
+      def mark_devices_fn(device_id):
+        self.assertLess(device_id, len(d.worker_devices))
+        self.assertFalse(expected_devices[device_id])
+        expected_devices[device_id] = True
+
+      d.call_for_each_tower(mark_devices_fn, d.worker_device_index)
+      self.assertAllEqual(expected_devices, [True] * len(d.worker_devices))
+
+  def _test_tower_id(self, d):
+    with d.scope():
+      expected_devices = [False] * len(d.worker_devices)
+
+      def mark_devices_fn():
+        tower_id = distribute_lib.get_tower_context().tower_id
+        self.assertLess(tower_id, len(d.worker_devices))
+        self.assertFalse(expected_devices[tower_id])
+        expected_devices[tower_id] = True
+
+      d.call_for_each_tower(mark_devices_fn)
+      self.assertAllEqual(expected_devices, [True] * len(d.worker_devices))
+
+  def _test_call_and_merge_exceptions(self, dist):
+    with dist.scope():
+      with self.assertRaises(_TestException):
+        dist.call_for_each_tower(_raise_exception_fn)
+      with self.assertRaises(_TestException):
+        dist.call_for_each_tower(_merge_raises_fn)
+      with self.assertRaises(_TestException):
+        dist.call_for_each_tower(_merge_call_raises_fn)
+      with self.assertRaises(_TestException):
+        dist.call_for_each_tower(_merge_call_merge_raises_fn)
diff --git a/tensorflow/contrib/distribute/python/values.py b/tensorflow/contrib/distribute/python/values.py
new file mode 100644
index 0000000000000000000000000000000000000000..87bf0590384cc74ca0f0575bcef4e84599a8b666
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/values.py
@@ -0,0 +1,578 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Various classes representing distributed values.
+
+See go/tf-distribution-strategy.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import weakref
+
+import six
+
+from tensorflow.contrib.data.python.ops import batching
+from tensorflow.contrib.distribute.python import prefetching_ops_v2
+from tensorflow.contrib.eager.python import datasets
+from tensorflow.python.eager import context
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.training import checkpointable
+from tensorflow.python.training import device_util
+from tensorflow.python.training import distribute as distribute_lib
+from tensorflow.python.training import saver
+from tensorflow.python.util import nest
+
+
+# pylint: disable=line-too-long
+# TODO(josh11b): Should device values be strings or DeviceSpec objects
+# Not sure DeviceSpec objects are usable as a dict key.
+class DistributedValues(object):
+  """Holds a map from device to values. Either PerDevice or Mirrored."""
+
+  def __init__(self, index):
+    self._index = {device_util.canonicalize(key): value
+                   for key, value in six.iteritems(index)}
+
+  def get(self, device=None):
+    """Returns the value for the current device or raises a ValueError."""
+    if device is None:
+      tower_context = distribute_lib.get_tower_context()
+      if tower_context:
+        device = tower_context.device
+      else:
+        device = distribute_lib.get_update_device()
+        if device is None:
+          device = device_util.current()
+    device = device_util.canonicalize(device)
+    try:
+      return self._index[device]
+    except KeyError:
+      raise ValueError("Device %s not found in %s (current device %s)" %
+                       (device, self._index.keys(), device_util.current()))
+
+  def on_device(self, device):
+    device = device_util.canonicalize(device)
+    return device in self._index
+
+  @property
+  def devices(self):
+    return list(self._index.keys())
+
+  def __str__(self):
+    return "%s:%s" % (self.__class__.__name__, self._index)
+
+  def __repr__(self):
+    return "%s(%r)" % (self.__class__.__name__, self._index)
+
+  # TODO(josh11b): Possibly make an accessor for _index for use by
+  # DistributionStrategy implementations.
+
+
+class DistributedDelegate(DistributedValues):
+  """A map from device to values; acts as the same type as the values."""
+
+  def __init__(self, index):
+    super(DistributedDelegate, self).__init__(index)
+
+  def __getattr__(self, name):
+    return getattr(self.get(), name)
+
+  # pylint: disable=multiple-statements
+  def __add__(self, o): return self.get() + o
+  def __radd__(self, o): return o + self.get()
+  def __sub__(self, o): return self.get() - o
+  def __rsub__(self, o): return o - self.get()
+  def __mul__(self, o): return self.get() * o
+  def __rmul__(self, o): return o * self.get()
+  def __truediv__(self, o): return self.get() / o
+  def __rtruediv__(self, o): return o / self.get()
+  def __floordiv__(self, o): return self.get() // o
+  def __rfloordiv__(self, o): return o // self.get()
+  def __mod__(self, o): return self.get() % o
+  def __rmod__(self, o): return o % self.get()
+  def __lt__(self, o): return self.get() < o
+  def __le__(self, o): return self.get() <= o
+  def __gt__(self, o): return self.get() > o
+  def __ge__(self, o): return self.get() >= o
+  def __and__(self, o): return self.get() & o
+  def __rand__(self, o): return o & self.get()
+  def __or__(self, o): return self.get() | o
+  def __ror__(self, o): return o | self.get()
+  def __xor__(self, o): return self.get() ^ o
+  def __rxor__(self, o): return o ^ self.get()
+  def __getitem__(self, o): return self.get()[o]
+  def __pow__(self, o, modulo=None): return pow(self.get(), o, modulo)
+  def __rpow__(self, o): return pow(o, self.get())
+  def __invert__(self): return ~self.get()
+  def __neg__(self): return -self.get()
+  def __abs__(self): return abs(self.get())
+
+  def __div__(self, o):
+    try:
+      return self.get().__div__(o)
+    except AttributeError:
+      # See https://docs.python.org/3/library/constants.html#NotImplemented
+      return NotImplemented
+
+  def __rdiv__(self, o):
+    try:
+      return self.get().__rdiv__(o)
+    except AttributeError:
+      # See https://docs.python.org/3/library/constants.html#NotImplemented
+      return NotImplemented
+
+  def __matmul__(self, o):
+    try:
+      return self.get().__matmul__(o)
+    except AttributeError:
+      # See https://docs.python.org/3/library/constants.html#NotImplemented
+      return NotImplemented
+
+  def __rmatmul__(self, o):
+    try:
+      return self.get().__rmatmul__(o)
+    except AttributeError:
+      # See https://docs.python.org/3/library/constants.html#NotImplemented
+      return NotImplemented
+
+  # TODO(josh11b): Even more operator overloads.
+
+
+class PerDevice(DistributedValues):
+  """Holds a map from device to unsynchronized values."""
+  pass
+
+
+class Mirrored(DistributedValues):
+  """Holds a map from device to values which are kept in sync."""
+  pass
+
+
+def _assign_on_device(device, variable, tensor):
+  with ops.device(device):
+    return variable.assign(array_ops.identity(tensor))
+
+
+DistributedVarOp = collections.namedtuple(
+    "DistributedVarOp", ["name", "graph", "type"])
+
+
+class DistributedVariable(DistributedDelegate):
+  """Holds a map from device to variables."""
+  # TODO(josh11b): Support changing the set of variables if e.g. if new
+  # devices are joining or a device is to leave.
+
+  def __init__(self, index):
+    # Child class must set self._primary_var before calling
+    # super(...).__init__(index).
+    self._common_name = self._primary_var.name.split(":")[0]
+    super(DistributedVariable, self).__init__(index)
+
+  @property
+  def initializer(self):
+    return control_flow_ops.group([v.initializer for v in self._index.values()])
+
+  @property
+  def graph(self):
+    return self._primary_var.graph
+
+  @property
+  def _shared_name(self):
+    return self._common_name
+
+  @property
+  def _unique_id(self):
+    return self._primary_var._unique_id   # pylint: disable=protected-access
+
+  @property
+  def name(self):
+    return self._primary_var.name
+
+  @property
+  def dtype(self):
+    return self._primary_var.dtype
+
+  @property
+  def shape(self):
+    return self._primary_var.shape
+
+  def get_shape(self):
+    return self._primary_var.get_shape()
+
+  def to_proto(self, export_scope=None):
+    return self._primary_var.to_proto(export_scope=export_scope)
+
+  @property
+  def op(self):
+    # We want cross-tower code that does some var.op.X calls
+    # to work (even if the current device isn't in self.devices), but
+    # other uses of var.op in a cross-tower context to fail.
+    if distribute_lib.get_cross_tower_context():
+      return DistributedVarOp(self._primary_var.op.name,
+                              self._primary_var.op.graph,
+                              self._primary_var.op.type)
+    return self.get().op
+
+  def _should_act_as_resource_variable(self):
+    """Pass resource_variable_ops.is_resource_variable check."""
+    pass
+
+
+# Register a conversion function which reads the value of the variable,
+# allowing instances of the class to be used as tensors.
+def _tensor_conversion(var, dtype=None, name=None, as_ref=False):
+  # Try to avoid assignments to and other mutations of MirroredVariable
+  # state except through a DistributionStrategy.update() call.
+  assert not as_ref
+  return ops.internal_convert_to_tensor(
+      var.get(), dtype=dtype, name=name, as_ref=as_ref)
+
+
+ops.register_tensor_conversion_function(DistributedVariable, _tensor_conversion)
+ops.register_dense_tensor_like_type(DistributedVariable)
+
+
+class _MirroredSaveable(saver.BaseSaverBuilder.ResourceVariableSaveable):
+  """Class for defining how to restore a MirroredVariable."""
+
+  def __init__(self, mirrored_variable, primary_variable, name):
+    self._mirrored_variable = mirrored_variable
+    super(_MirroredSaveable, self).__init__(primary_variable, "", name)
+
+  def restore(self, restored_tensors, restored_shapes):
+    """Restore the same value into all variables."""
+    tensor, = restored_tensors
+    return control_flow_ops.group([
+        _assign_on_device(d, v, tensor)
+        for d, v in six.iteritems(self._mirrored_variable._index)])  # pylint: disable=protected-access
+
+
+def _get_update_device():
+  """Validate we are in update/update_non_slot() and return current device.
+
+  This is used in MirroredVariable.assign* members, to make sure they
+  are only called via an update method, to make sure all components of the
+  variable are being updated in a consistent way.
+
+  Returns:
+    A string device.
+
+  Raises:
+    RuntimeError: If not in distribution.update()/.update_non_slot().
+  """
+  device = distribute_lib.get_update_device()
+  if device is None:
+    raise RuntimeError(
+        "Use DistributionStrategy.update() to modify a MirroredVariable.")
+  return device
+
+
+class MirroredVariable(DistributedVariable, Mirrored,
+                       checkpointable.CheckpointableBase):
+  """Holds a map from device to variables whose values are kept in sync."""
+
+  def __init__(self, index, primary_var):
+    # Use a weakref to make it easy to map from the contained values
+    # to the container without introducing a reference cycle.
+    for v in six.itervalues(index):
+      v._mirrored_container = weakref.ref(self)  # pylint: disable=protected-access
+    self._primary_var = primary_var
+    super(MirroredVariable, self).__init__(index)
+
+  # We use _get_update_device() for the assign* methods to enforce
+  # that we are in an update() function. The arguments to update() are
+  # automatically unwrapped so the update() function would normally
+  # see regular variables, not MirroredVariables. However, the update
+  # function can still operate on wrapped MirroredVariables through
+  # object members, captured arguments, etc. This is more likely in an
+  # update_non_slot() function (like OptimizerV2._finish), which can
+  # update several non-slot variables in one call.
+  def assign_sub(self, *args, **kwargs):
+    return self.get(device=_get_update_device()).assign_sub(*args, **kwargs)
+
+  def assign_add(self, *args, **kwargs):
+    return self.get(device=_get_update_device()).assign_add(*args, **kwargs)
+
+  def assign(self, *args, **kwargs):
+    return self.get(device=_get_update_device()).assign(*args, **kwargs)
+
+  def _gather_saveables_for_checkpoint(self):
+    """Overrides CheckpointableBase method.
+
+    This allows both name-based and object-based save and restore of
+    MirroredVariables.
+
+    Returns:
+      A dictionary mapping attribute names to `SaveableObject` factories.
+    """
+    def _saveable_factory(name=self._common_name):
+      return _MirroredSaveable(self, self._primary_var, name)
+    return {checkpointable.VARIABLE_VALUE_KEY: _saveable_factory}
+
+
+class _TowerLocalSaveable(saver.BaseSaverBuilder.SaveableObject):
+  """Class for defining how to restore a TowerLocalVariable."""
+
+  def __init__(self, tower_local_variable, name):
+    self._tower_local_variable = tower_local_variable
+    # We use a callable so that we don't have to evaluate this expression
+    # in the case where we are trying to restore instead of save.
+    def tensor():
+      return distribute_lib.get_distribution_strategy().fetch(
+          tower_local_variable)
+    spec = saver.BaseSaverBuilder.SaveSpec(
+        tensor=tensor,
+        slice_spec="",
+        name=name,
+        dtype=tower_local_variable.dtype)
+    super(_TowerLocalSaveable, self).__init__(tensor, [spec], name)
+
+  def restore(self, restored_tensors, restored_shapes):
+    """Restore the same value into all variables."""
+    tensor, = restored_tensors
+    # To preserve the sum across save and restore, we have to divide the
+    # total across all devices when restoring a variable that was summed
+    # when saving.
+    if self._tower_local_variable.reduce_method == "sum":
+      tensor *= 1. / len(self._tower_local_variable.devices)
+    return control_flow_ops.group([
+        _assign_on_device(d, v, tensor)
+        for d, v in six.iteritems(self._tower_local_variable._index)])  # pylint: disable=protected-access
+
+
+class TowerLocalVariable(DistributedVariable, PerDevice,
+                         checkpointable.CheckpointableBase):
+  """Holds a map from device to variables whose values are reduced on save."""
+
+  def __init__(self, index, primary_var, reduce_method):
+    self._primary_var = primary_var
+    self._reduce_method = reduce_method
+    super(TowerLocalVariable, self).__init__(index)
+
+  def assign_sub(self, *args, **kwargs):
+    return self.get().assign_sub(*args, **kwargs)
+
+  def assign_add(self, *args, **kwargs):
+    return self.get().assign_add(*args, **kwargs)
+
+  def assign(self, *args, **kwargs):
+    return self.get().assign(*args, **kwargs)
+
+  @property
+  def reduce_method(self):
+    return self._reduce_method
+
+  def _gather_saveables_for_checkpoint(self):
+    """Overrides CheckpointableBase method.
+
+    This allows both name-based and object-based save and restore of
+    TowerLocalVariables.
+
+    Returns:
+      A dictionary mapping attribute names to `SaveableObject` factories.
+    """
+    def _saveable_factory(name=self._common_name):
+      return _TowerLocalSaveable(self, name)
+    return {checkpointable.VARIABLE_VALUE_KEY: _saveable_factory}
+
+
+def _devices_match(d1, d2):
+  return device_util.canonicalize(d1) == device_util.canonicalize(d2)
+
+
+def regroup(per_device, wrap_class=PerDevice):
+  """Makes device->nest map into a nest of PerDevice/Mirrored values."""
+  items = list(per_device.items())
+  assert items
+  v0 = items[0][1]  # First value
+
+  if isinstance(v0, list):
+    for _, v in items[1:]:
+      assert isinstance(v, list)
+      assert len(v) == len(v0), ("len(v) == %d, len(v0) == %d, v: %s, v0: %s" %
+                                 (len(v), len(v0), v, v0))
+    return [regroup({k: v[i] for k, v in items}, wrap_class)
+            for i in range(len(v0))]
+
+  if isinstance(v0, tuple):
+    for _, v in items[1:]:
+      assert isinstance(v, tuple)
+      assert len(v) == len(v0)
+    regrouped_tuple = tuple(regroup({k: v[i] for k, v in items}, wrap_class)
+                            for i in range(len(v0)))
+    if hasattr(v0, "_fields"):
+      # This tuple is in fact a namedtuple! Create a new namedtuple instance
+      # and initialize it with the regrouped values:
+      assert hasattr(type(v0), "_make")
+      return type(v0)._make(regrouped_tuple)
+    else:
+      return regrouped_tuple
+
+  if isinstance(v0, dict):
+    v0keys = set(v0.keys())
+    for _, v in items[1:]:
+      assert isinstance(v, dict)
+      assert set(v.keys()) == v0keys
+    return {key: regroup({k: v[key] for k, v in items}, wrap_class)
+            for key in v0keys}
+
+  # If exactly the same object across all devices, return it unwrapped.
+  same_id = True
+  for _, v in items[1:]:
+    if v is not v0:
+      same_id = False
+      break
+  # Consider three cases where same_id is true:
+  # * If v0 is a MirroredVariable (and same_id means it is the same
+  #   across all devices), we want to return it. We check
+  #   MirroredVariable specifically since it can look like it
+  #   has a _mirrored_container member since its members do.
+  # * If v0 is a member of a mirrored variable, in which case
+  #   hasattr(v0, "_mirrored_container") is true, we want to
+  #   return the MirroredVariable that contains it using the
+  #   _mirrored_container logic below. This case can trigger
+  #   same_id when there is only one device.
+  # * In any other situation, same_id means we return v0.
+  if same_id and (isinstance(v0, MirroredVariable) or
+                  not hasattr(v0, "_mirrored_container")):
+    return v0
+
+  # Detect the case where each device has a parallel component of the
+  # same MirroredVariable. In this case we want to return the
+  # containing MirroredVariable, after a bunch of sanity checking.
+  # In particular, each component should have the same container,
+  # and the devices of the variables should match the keys of the
+  # per-device dictionary.
+  # TODO(josh11b): Do we need similar logic for TowerLocalVariables?
+  if hasattr(v0, "_mirrored_container"):
+    # pylint: disable=protected-access
+    assert not isinstance(v0, MirroredVariable), (
+        "ids = %s, items = %s" % ([id(v[1]) for v in items], items))
+    assert _devices_match(v0.device, items[0][0]), (
+        "v0.device = %s, items = %s" % (v0.device, items))
+    mirrored_container = v0._mirrored_container()
+    assert mirrored_container is not None
+    for d, v in items[1:]:
+      assert _devices_match(v.device, d), (
+          "v.device = %s, d = %s, items = %s" % (v.device, d, items))
+      assert mirrored_container is v._mirrored_container()
+    return mirrored_container
+  # pylint: enable=protected-access
+
+  return wrap_class(per_device)
+
+
+def select_device(device, structured):
+  """Specialize a nest of regular & per-device values for one device."""
+  def _get(x):
+    return x.get(device) if isinstance(x, DistributedValues) else x
+
+  return nest.map_structure(_get, structured)
+
+
+def select_device_mirrored(device, structured):
+  """Specialize a nest of regular & mirrored values for one device."""
+  def _get_mirrored(x):
+    if isinstance(x, DistributedValues):
+      if not isinstance(x, Mirrored):
+        raise TypeError(
+            "Expected value to be mirrored across towers: %s in %s." %
+            (x, structured))
+      return x.get(device)
+    else:
+      return x
+
+  return nest.map_structure(_get_mirrored, structured)
+
+
+class PerDeviceDataIterator(object):
+  """An iterator (like `tf.data.Iterator`) into a `PerDeviceDataset`."""
+
+  def __init__(self, iterator, devices, prefetch_on_device=None):
+    self._iterator = iterator
+    self._devices = devices
+    self._prefetch_on_device = prefetch_on_device
+
+  def get_next(self, name=None):
+    """Scatter the input across devices."""
+    if self._prefetch_on_device:
+      data_list = self._iterator.get_next(name=name)
+      index = dict(zip(self._devices, data_list))
+    else:
+      batch = self._iterator.get_next(name=name)
+      index = {}
+      def get_ith(i):
+        return lambda x: x[i]
+
+      for i, d in enumerate(self._devices):
+        index[d] = nest.map_structure(get_ith(i), batch)
+        if context.executing_eagerly():
+          with ops.device(d):
+            index[d] = nest.map_structure(array_ops.identity, index[d])
+
+    return regroup(index)
+
+
+class PerDeviceDataset(object):
+  """Like `tf.data.Dataset` split devices, producing `PerDevice` data."""
+
+  def __init__(self, dataset, devices, prefetch_on_device=None):
+    self._devices = devices
+
+    # Default to using prefetching in graph mode, unless specified.
+    # TODO(priyag): Enable prefetching in eager mode.
+    self._prefetch_on_device = prefetch_on_device
+    if self._prefetch_on_device is None:
+      self._prefetch_on_device = not context.executing_eagerly()
+    assert not (self._prefetch_on_device and context.executing_eagerly()), (
+        "Prefetching is only supported in graph mode currently")
+
+    if self._prefetch_on_device:
+      self._dataset = dataset
+    else:
+      # TODO(priyag): If dropping remainder is not appropriate, find another
+      # approach to distributing the dataset when not possible to divide evenly.
+      # Possibly not an issue when we start using PartitionedDataset.
+      self._dataset = dataset.apply(
+          batching.batch_and_drop_remainder(len(devices)))
+
+  def make_one_shot_iterator(self):
+    """Get a one time use iterator for the distributed PerDeviceDataset."""
+    if self._prefetch_on_device:
+      on_device_dataset = self._dataset.apply(
+          prefetching_ops_v2.prefetch_to_devices(self._devices))
+      dataset_iterator = on_device_dataset.make_one_shot_iterator()
+    elif context.executing_eagerly():
+      dataset_iterator = datasets.Iterator(self._dataset)
+    else:
+      dataset_iterator = self._dataset.make_one_shot_iterator()
+
+    return PerDeviceDataIterator(
+        dataset_iterator, self._devices, self._prefetch_on_device)
+
+
+class MapOutput(object):
+  """Map can result in multiple outputs per device."""
+
+  def __init__(self, l):
+    self._l = l
+
+  def get(self):
+    return self._l
diff --git a/tensorflow/contrib/distribute/python/values_test.py b/tensorflow/contrib/distribute/python/values_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..5c0d4b7d6c78b7cf63c613201d83d4793ecfe76b
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/values_test.py
@@ -0,0 +1,807 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for the distributed values library."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+from tensorflow.contrib.distribute.python import mirrored_strategy
+from tensorflow.contrib.distribute.python import values
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.eager import context
+from tensorflow.python.eager import test
+from tensorflow.python.estimator import model_fn as model_fn_lib
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.training import device_util
+from tensorflow.python.training import saver as saver_lib
+
+
+@test_util.with_c_api
+class DistributedValuesTest(test.TestCase):
+
+  def testGetEager(self):
+    with ops.device("/device:CPU:0"):
+      one = constant_op.constant(1)
+      two = constant_op.constant(2)
+      v = values.DistributedValues({"/device:CPU:0": one, "/device:GPU:0": two})
+      self.assertEqual(two, v.get("/device:GPU:0"))
+      self.assertEqual(one, v.get())
+      with self.assertRaises(ValueError):
+        self.assertIsNone(v.get("/device:GPU:2"))
+
+  def testGetGraph(self):
+    with context.graph_mode(), \
+        ops.Graph().as_default(), \
+        ops.device("/device:CPU:0"):
+      one = constant_op.constant(1)
+      two = constant_op.constant(2)
+      v = values.DistributedValues({"/device:CPU:0": one, "/device:GPU:0": two})
+      self.assertEqual(two, v.get("/device:GPU:0"))
+      self.assertEqual(one, v.get())
+      with self.assertRaises(ValueError):
+        self.assertIsNone(v.get("/device:GPU:2"))
+
+  def testCanonicalization(self):
+    canonical_cpu = ["/job:localhost/replica:0/task:0/device:CPU:0"]
+    v = values.DistributedValues({"": 42})
+    self.assertEqual(canonical_cpu, list(v._index.keys()))
+    v = values.DistributedValues({"/device:CPU:0": 42})
+    self.assertEqual(canonical_cpu, list(v._index.keys()))
+    v = values.DistributedValues({"/cpu:0": 42})
+    self.assertEqual(canonical_cpu, list(v._index.keys()))
+    v = values.DistributedValues({"/CPU:0": 42})
+    self.assertEqual(canonical_cpu, list(v._index.keys()))
+    with self.assertRaises(AssertionError):
+      v = values.DistributedValues({"/device:cpu:0": 42})
+
+
+@test_util.with_c_api
+class DistributedDelegateTest(test.TestCase):
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testGetAttr(self):
+    with ops.device("/device:CPU:0"):
+
+      class Foo(object):
+
+        def __init__(self, x):
+          self.x = x
+
+      v = values.DistributedDelegate(
+          {"/device:CPU:0": Foo(7), "/device:GPU:0": Foo(8)})
+      self.assertEqual(7, v.x)
+      with self.assertRaises(AttributeError):
+        _ = v.y
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testOperatorOverride(self):
+    with ops.device("/device:CPU:0"):
+      v = values.DistributedDelegate({"/device:CPU:0": 7, "/device:GPU:0": 8})
+      # v should act like int(7).
+      self.assertEqual(8, v + 1)
+      self.assertEqual(10, 3 + v)
+      self.assertEqual(14, v + v)
+      self.assertEqual(5, v - 2)
+      self.assertEqual(6, 13 - v)
+      self.assertEqual(0, v - v)
+      self.assertEqual(14, v * 2)
+      self.assertEqual(21, 3 * v)
+      self.assertEqual(49, v * v)
+      self.assertEqual(3.5, v / 2)
+      self.assertEqual(1.5, 10.5 / v)
+      self.assertEqual(3, v // 2)
+      self.assertEqual(2, 15 // v)
+      self.assertEqual(1, v % 2)
+      self.assertEqual(2, 16 % v)
+      self.assertTrue(v < 12)
+      self.assertTrue(v <= 12)
+      self.assertFalse(v > 12)
+      self.assertFalse(v >= 12)
+      self.assertFalse(12 < v)
+      self.assertFalse(12 <= v)
+      self.assertTrue(12 > v)
+      self.assertTrue(12 >= v)
+      self.assertEqual(3, v & 3)
+      self.assertEqual(3, 11 & v)
+      self.assertEqual(15, v | 8)
+      self.assertEqual(23, 16 | v)
+      self.assertEqual(4, v ^ 3)
+      self.assertEqual(12, 11 ^ v)
+      self.assertEqual(343, pow(v, 3))
+      self.assertEqual(3, pow(v, 3, 10))
+      self.assertEqual(128, pow(2, v))
+      self.assertEqual(-7, -v)
+      self.assertEqual(~7, ~v)
+      self.assertEqual(7, abs(v))
+      with self.assertRaises(TypeError):
+        _ = v[2]
+
+
+def _device_str(d):
+  return "/device:GPU:" + str(d)
+
+
+def _nested_value(d):
+  return ("a" + d, ["b" + d, {"c": "d" + d, "e": "f" + d}, "g" + d], "h" + d)
+
+
+def _make_mirrored():
+  v = []
+  index = {}
+  devices = ["/device:GPU:0", "/device:CPU:0"]
+  for d, n, init in zip(devices, ["v", "v/replica"], [1., 2.]):
+    with ops.device(d):
+      v.append(variable_scope.get_variable(
+          name=n, initializer=init, use_resource=True))
+      index[d] = v[-1]
+  mirrored = values.MirroredVariable(index, v[0])
+  return v, devices, mirrored
+
+
+@test_util.with_c_api
+class RegroupAndSelectDeviceTest(test.TestCase):
+
+  def _is_per_device(self, result, expected, klass=values.PerDevice):
+    self.assertIsInstance(result, klass)
+    # We canonicalize the devices to match the device strings returned
+    # by PerDevice, which also does device string canonicalization.
+    devices = [device_util.canonicalize(_device_str(i))
+               for i in range(len(expected))]
+    self.assertEqual(set(devices), set(result.devices))
+    for i, d in enumerate(devices):
+      self.assertEqual(expected[i], result.get(d))
+      self.assertEqual(expected[i], result.get(_device_str(i)))
+
+  def testNested(self):
+    result = values.regroup({_device_str(0): _nested_value("1"),
+                             _device_str(1): _nested_value("2")})
+    self.assertIsInstance(result, tuple)
+    self.assertEqual(3, len(result))
+    self._is_per_device(result[0], ["a1", "a2"])
+    self._is_per_device(result[2], ["h1", "h2"])
+
+    self.assertIsInstance(result[1], list)
+    self.assertEqual(3, len(result[1]))
+    self._is_per_device(result[1][0], ["b1", "b2"])
+    self._is_per_device(result[1][2], ["g1", "g2"])
+
+    self.assertIsInstance(result[1][1], dict)
+    self.assertEqual(set(["c", "e"]), set(result[1][1].keys()))
+    self._is_per_device(result[1][1]["c"], ["d1", "d2"])
+    self._is_per_device(result[1][1]["e"], ["f1", "f2"])
+
+    # Also test that we can undo the merge using select_device()
+    self.assertEqual(_nested_value("1"),
+                     values.select_device(_device_str(0), result))
+    self.assertEqual(_nested_value("2"),
+                     values.select_device(_device_str(1), result))
+    # select_device_mirrored() should fail due to non-mirrored values
+    with self.assertRaises(TypeError):
+      values.select_device_mirrored(_device_str(0), result)
+    with self.assertRaises(TypeError):
+      values.select_device_mirrored(_device_str(1), result)
+
+  def testWrapClass(self):
+    # Normally a mirrored value would be the same across devices, but
+    # for a test it is convenient to be able to tell the values apart.
+    result = values.regroup({_device_str(0): _nested_value("1"),
+                             _device_str(1): _nested_value("2")},
+                            values.Mirrored)
+    self.assertIsInstance(result, tuple)
+    self.assertEqual(3, len(result))
+    self._is_per_device(result[0], ["a1", "a2"], values.Mirrored)
+    self._is_per_device(result[2], ["h1", "h2"], values.Mirrored)
+
+    self.assertIsInstance(result[1], list)
+    self.assertEqual(3, len(result[1]))
+    self._is_per_device(result[1][0], ["b1", "b2"], values.Mirrored)
+    self._is_per_device(result[1][2], ["g1", "g2"], values.Mirrored)
+
+    self.assertIsInstance(result[1][1], dict)
+    self.assertEqual(set(["c", "e"]), set(result[1][1].keys()))
+    self._is_per_device(result[1][1]["c"], ["d1", "d2"], values.Mirrored)
+    self._is_per_device(result[1][1]["e"], ["f1", "f2"], values.Mirrored)
+
+    # Also test that we can undo the merge using select_device()
+    self.assertEqual(_nested_value("1"),
+                     values.select_device(_device_str(0), result))
+    self.assertEqual(_nested_value("2"),
+                     values.select_device(_device_str(1), result))
+    # Values are marked as mirrored, so select_device_mirrored() is allowed.
+    self.assertEqual(_nested_value("1"),
+                     values.select_device_mirrored(_device_str(0), result))
+    self.assertEqual(_nested_value("2"),
+                     values.select_device_mirrored(_device_str(1), result))
+
+  def testMirroredContainer(self):
+    if context.num_gpus() < 1 and context.executing_eagerly():
+      self.skipTest("A GPU is not available for this test in eager mode.")
+    v, devices, mirrored = _make_mirrored()
+    result = values.regroup(dict(zip(devices, v)))
+    self.assertIs(mirrored, result)
+
+  def testSameId(self):
+    foo = object()
+    result = values.regroup({_device_str(0): ("a", foo),
+                             _device_str(1): ("b", foo)})
+    self.assertIsInstance(result, tuple)
+    self.assertEqual(2, len(result))
+    self._is_per_device(result[0], ["a", "b"])
+    self.assertIs(foo, result[1])
+
+    # Test select_device(), should undo the merge done by regroup().
+    result_0 = values.select_device(_device_str(0), result)
+    self.assertIsInstance(result_0, tuple)
+    self.assertEqual(2, len(result_0))
+    self.assertEqual("a", result_0[0])
+    self.assertIs(foo, result_0[1])
+    result_1 = values.select_device(_device_str(1), result)
+    self.assertIsInstance(result_1, tuple)
+    self.assertEqual(2, len(result_1))
+    self.assertEqual("b", result_1[0])
+    self.assertIs(foo, result_1[1])
+
+  def testOneDevice(self):
+    result = values.regroup({_device_str(0): _nested_value("1")})
+    # On one device regroup() and select_device() are basically identity.
+    self.assertEqual(_nested_value("1"), result)
+    self.assertEqual(_nested_value("1"),
+                     values.select_device(_device_str(0), result))
+
+    # The one exception has to do with MirroredVariables.
+    d = "/device:CPU:0"
+    with ops.device(d):
+      v = variable_scope.get_variable(
+          name="v", initializer=1., use_resource=True)
+      index = {d: v}
+    mirrored = values.MirroredVariable(index, v)
+    result = values.regroup(index)
+    self.assertIs(mirrored, result)
+
+  def testNamedTupleEstimatorSpec(self):
+    with context.graph_mode(), ops.Graph().as_default():
+      created_estimator_specs = {}
+      to_regroup = {}
+
+      for device_id in range(3):
+        spec = model_fn_lib.EstimatorSpec(
+            mode=model_fn_lib.ModeKeys.TRAIN,
+            loss=constant_op.constant(device_id / 2),
+            train_op=array_ops.identity(constant_op.constant(device_id)))
+        created_estimator_specs[device_id] = spec
+        to_regroup[_device_str(device_id)] = spec
+
+      merged_estimator_spec = values.regroup(to_regroup)
+
+      self.assertTrue(
+          isinstance(merged_estimator_spec, model_fn_lib.EstimatorSpec))
+      self.assertEquals(model_fn_lib.ModeKeys.TRAIN, merged_estimator_spec.mode)
+      for device_id in range(3):
+        d = _device_str(device_id)
+        self.assertEquals(created_estimator_specs[device_id].loss,
+                          merged_estimator_spec.loss.get(d))
+        self.assertEquals(created_estimator_specs[device_id].train_op,
+                          merged_estimator_spec.train_op.get(d))
+        # Scaffold is populated by `EstimatorSpec.__new__`.
+        self.assertEquals(created_estimator_specs[device_id].scaffold,
+                          merged_estimator_spec.scaffold.get(d))
+        # Also test that we can undo the merge using select_device()
+        self.assertEquals(created_estimator_specs[device_id],
+                          values.select_device(_device_str(device_id),
+                                               merged_estimator_spec))
+
+
+@test_util.with_c_api
+class PerDeviceDatasetTest(test.TestCase):
+
+  config = config_pb2.ConfigProto()
+  config.allow_soft_placement = True
+
+  def _test_iterator_no_prefetch(self, devices, dataset, expected_values):
+    per_device_dataset = values.PerDeviceDataset(
+        dataset, devices, prefetch_on_device=False)
+    iterator = per_device_dataset.make_one_shot_iterator()
+
+    for expected_value in expected_values:
+      next_element = iterator.get_next()
+      actual = self.evaluate([
+          values.select_device(d, next_element) for d in devices])
+      self.assertEqual(expected_value, actual)
+
+    with self.assertRaises(errors.OutOfRangeError):
+      next_element = iterator.get_next()
+      self.evaluate([
+          values.select_device(d, next_element) for d in devices])
+
+  def _test_iterator_with_prefetch(self, devices, dataset, expected_values):
+    if not context.executing_eagerly():
+      per_device_dataset = values.PerDeviceDataset(
+          dataset, devices, prefetch_on_device=True)
+      iterator = per_device_dataset.make_one_shot_iterator()
+
+      # With prefetching, we cannot guarantee which input ends up on which
+      # device, so we verify that the complete set seen on all devices is
+      # correct, and equal numbers are distributed to each device.
+      combined_actual = []
+      combined_expected = []
+      for expected_value in expected_values:
+        next_element = iterator.get_next()
+        combined_actual.extend(self.evaluate([
+            values.select_device(d, next_element) for d in devices]))
+        combined_expected.extend(expected_value)
+
+      self.assertEqual(set(combined_expected), set(combined_actual))
+
+      with self.assertRaises(errors.OutOfRangeError):
+        next_element = iterator.get_next()
+        self.evaluate([
+            values.select_device(d, next_element) for d in devices])
+
+  def _test_iterator(self, devices, dataset, expected_values):
+    self._test_iterator_no_prefetch(devices, dataset, expected_values)
+    self._test_iterator_with_prefetch(devices, dataset, expected_values)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testOneDevice(self):
+    devices = ["/device:CPU:0"]
+    dataset = dataset_ops.Dataset.range(10)
+
+    expected_values = [[i] for i in range(10)]
+
+    self._test_iterator(devices, dataset, expected_values)
+
+  @test_util.run_in_graph_and_eager_modes(config=config)
+  def testMultipleDevices(self):
+    if context.num_gpus() < 1 and context.executing_eagerly():
+      self.skipTest("A GPU is not available for this test in eager mode.")
+
+    devices = ["/device:CPU:0", "/device:GPU:0"]
+    dataset = dataset_ops.Dataset.range(10)
+
+    expected_values = [[i, i+1] for i in range(0, 10, 2)]
+
+    self._test_iterator(devices, dataset, expected_values)
+
+  @test_util.run_in_graph_and_eager_modes(config=config)
+  def testTupleDataset(self):
+    if context.num_gpus() < 1 and context.executing_eagerly():
+      self.skipTest("A GPU is not available for this test in eager mode.")
+
+    devices = ["/device:CPU:0", "/device:GPU:0"]
+    dataset1 = dataset_ops.Dataset.range(10)
+    dataset2 = dataset_ops.Dataset.range(10).map(lambda x: x**2)
+    dataset = dataset_ops.Dataset.zip((dataset1, dataset2))
+
+    expected_values = [[(i, i**2), (i+1, (i+1)**2)] for i in range(0, 10, 2)]
+
+    self._test_iterator(devices, dataset, expected_values)
+
+  @test_util.run_in_graph_and_eager_modes(config=config)
+  def testUnevenDatasetBatches(self):
+    if context.num_gpus() < 1 and context.executing_eagerly():
+      self.skipTest("A GPU is not available for this test in eager mode.")
+
+    devices = ["/device:CPU:0", "/device:GPU:0"]
+    dataset = dataset_ops.Dataset.range(11)
+
+    expected_values = [[i, i+1] for i in range(0, 10, 2)]
+    self._test_iterator(devices, dataset, expected_values)
+
+
+@test_util.with_c_api
+class MirroredVariableTest(test.TestCase):
+
+  config = config_pb2.ConfigProto()
+  config.allow_soft_placement = True
+
+  @test_util.run_in_graph_and_eager_modes(config=config)
+  def testProperties(self):
+    if context.num_gpus() < 1 and context.executing_eagerly():
+      self.skipTest("A GPU is not available for this test in eager mode.")
+
+    v, _, mirrored = _make_mirrored()
+
+    self.assertEquals(v[0].name, mirrored.name)
+    self.assertEquals(v[0].dtype, mirrored.dtype)
+    self.assertEquals(v[0].shape, mirrored.shape)
+
+  @test_util.run_in_graph_and_eager_modes(config=config)
+  def testVariableOnAnotherDevice(self):
+    v = variable_scope.get_variable(
+        name="v", initializer=[1.], use_resource=True)
+    index = {"/job:foo/device:CPU:0": v}
+    mirrored = values.MirroredVariable(index, v)
+
+    self.assertEquals(v.name, mirrored.name)
+    self.assertEquals(v.dtype, mirrored.dtype)
+    self.assertEquals(v.shape, mirrored.shape)
+
+  def _assign_mirrored(self, devices, v, new):
+    for d, var, n in zip(devices, v, new):
+      with ops.device(d):
+        self.evaluate(var.assign(n))
+
+  def _save_return_saver(self, sess, var):
+    saver = saver_lib.Saver(var_list=[var])
+    test_dir = self.get_temp_dir()
+    prefix = os.path.join(test_dir, "ckpt")
+    return saver.save(sess, prefix), saver
+
+  def _save(self, sess, var):
+    save_path, _ = self._save_return_saver(sess, var)
+    return save_path
+
+  @test_util.run_in_graph_and_eager_modes(config=config)
+  def testSaveAndRestoreMirroredOneGraph(self):
+    if context.num_gpus() < 1 and context.executing_eagerly():
+      self.skipTest("A GPU is not available for this test in eager mode.")
+
+    with self.test_session() as sess:
+      v, devices, mirrored = _make_mirrored()
+
+      # Overwrite the initial values.
+      self._assign_mirrored(devices, v, [3., 4.])
+
+      # Saves the current value of v[0], 3.
+      save_path, saver = self._save_return_saver(sess, mirrored)
+
+      # Change the values between save and restore.
+      self._assign_mirrored(devices, v, [5., 6.])
+
+      # Restores the saved value of 3. to both variables.
+      saver.restore(sess, save_path)
+      self.assertEqual([3., 3.], self.evaluate([v[0], v[1]]))
+
+  def _save_mirrored(self):
+    """Save variables with mirroring, returns save_path."""
+    with self.test_session(graph=ops.Graph()) as sess:
+      v, devices, mirrored = _make_mirrored()
+
+      # Overwrite the initial values.
+      self._assign_mirrored(devices, v, [3., 4.])
+
+      # Saves the current value of v[0], 3.
+      save_path = self._save(sess, mirrored)
+
+      # Change the values between save and restore.
+      self._assign_mirrored(devices, v, [5., 6.])
+    return save_path
+
+  def _save_normal(self):
+    """Save variables without mirroring, returns save_path."""
+    with self.test_session(graph=ops.Graph()) as sess:
+      var = variable_scope.get_variable(
+          name="v", initializer=1., use_resource=True)
+
+      # Overwrite the initial value.
+      self.evaluate(var.assign(3.))
+
+      # Saves the current value of var, 3.
+      save_path = self._save(sess, var)
+
+      # Change the values between save and restore.
+      self.evaluate(var.assign(5.))
+    return save_path
+
+  def _restore_normal(self, save_path):
+    """Restore to variables without mirroring in a fresh graph."""
+    with self.test_session(graph=ops.Graph()) as sess:
+      var = variable_scope.get_variable(
+          name="v", initializer=7., use_resource=True)
+
+      # Overwrite the initial value.
+      self.evaluate(var.assign(8.))
+
+      # Restores the saved value of 3. to `var`.
+      saver = saver_lib.Saver(var_list=[var])
+      saver.restore(sess, save_path)
+      self.assertEqual(3., self.evaluate(var))
+
+  def _restore_mirrored(self, save_path):
+    """Restore to variables with mirroring in a fresh graph."""
+    with self.test_session(graph=ops.Graph()) as sess:
+      v, devices, mirrored = _make_mirrored()
+
+      # Overwrite the initial values.
+      self._assign_mirrored(devices, v, [7., 8.])
+
+      # Restores the saved value of 3. to both variables.
+      saver = saver_lib.Saver(var_list=[mirrored])
+      saver.restore(sess, save_path)
+      self.assertEqual([3., 3.], self.evaluate([v[0], v[1]]))
+
+  @test_util.run_in_graph_and_eager_modes(config=config)
+  def testSaveMirroredRestoreMirrored(self):
+    if context.num_gpus() < 1 and context.executing_eagerly():
+      self.skipTest("A GPU is not available for this test in eager mode.")
+
+    save_path = self._save_mirrored()
+    self._restore_mirrored(save_path)
+
+  @test_util.run_in_graph_and_eager_modes(config=config)
+  def testSaveMirroredRestoreNormal(self):
+    if context.num_gpus() < 1 and context.executing_eagerly():
+      self.skipTest("A GPU is not available for this test in eager mode.")
+
+    save_path = self._save_mirrored()
+    self._restore_normal(save_path)
+
+  @test_util.run_in_graph_and_eager_modes(config=config)
+  def testSaveNormalRestoreMirrored(self):
+    if context.num_gpus() < 1 and context.executing_eagerly():
+      self.skipTest("A GPU is not available for this test in eager mode.")
+
+    save_path = self._save_normal()
+    self._restore_mirrored(save_path)
+
+
+_devices = ["/device:GPU:0", "/device:CPU:0"]
+
+
+def _make_tower_local(method):
+  v = []
+  index = {}
+  for d, n, init in zip(_devices, ["v", "v/replica"], [1., 2.]):
+    with ops.device(d):
+      v.append(variable_scope.get_variable(
+          name=n, initializer=init, use_resource=True))
+      index[d] = v[-1]
+  tower_local = values.TowerLocalVariable(index, v[0], method)
+  return v, tower_local
+
+
+@test_util.with_c_api
+class TowerLocalVariableTest(test.TestCase):
+
+  config = config_pb2.ConfigProto()
+  config.allow_soft_placement = True
+
+  @test_util.run_in_graph_and_eager_modes(config=config)
+  def testProperties(self):
+    if context.num_gpus() < 1 and context.executing_eagerly():
+      self.skipTest("A GPU is not available for this test in eager mode.")
+
+    v, tower_local = _make_tower_local("sum")
+
+    self.assertEquals(v[0].name, tower_local.name)
+    self.assertEquals(v[0].dtype, tower_local.dtype)
+    self.assertEquals(v[0].shape, tower_local.shape)
+    self.assertEquals("sum", tower_local.reduce_method)
+
+  @test_util.run_in_graph_and_eager_modes(config=config)
+  def testVariableOnAnotherDevice(self):
+    v = variable_scope.get_variable(
+        name="v", initializer=[1.], use_resource=True)
+    index = {"/job:foo/device:CPU:0": v}
+    tower_local = values.TowerLocalVariable(index, v, "mean")
+
+    self.assertEquals(v.name, tower_local.name)
+    self.assertEquals(v.dtype, tower_local.dtype)
+    self.assertEquals(v.shape, tower_local.shape)
+    self.assertEquals("mean", tower_local.reduce_method)
+
+  def _assign_tower_local(self, devices, v, new):
+    for d, var, n in zip(devices, v, new):
+      with ops.device(d):
+        self.evaluate(var.assign(n))
+
+  def _save_return_saver(self, sess, var):
+    saver = saver_lib.Saver(var_list=[var])
+    test_dir = self.get_temp_dir()
+    prefix = os.path.join(test_dir, "ckpt")
+    return saver.save(sess, prefix), saver
+
+  def _save(self, sess, var):
+    save_path, _ = self._save_return_saver(sess, var)
+    return save_path
+
+  def _dist_scope(self):
+    return mirrored_strategy.MirroredStrategy(_devices).scope()
+
+  @test_util.run_in_graph_and_eager_modes(config=config)
+  def testSaveAndRestoreTowerLocalSumOneGraph(self):
+    if context.num_gpus() < 1 and context.executing_eagerly():
+      self.skipTest("A GPU is not available for this test in eager mode.")
+
+    with self.test_session() as sess:
+      v, tower_local = _make_tower_local("sum")
+
+      # Overwrite the initial values.
+      self._assign_tower_local(_devices, v, [3., 4.])
+
+      with self._dist_scope():
+        # Saves the current value of v[0] + v[1], 7.
+        save_path, saver = self._save_return_saver(sess, tower_local)
+
+        # Change the values between save and restore.
+        self._assign_tower_local(_devices, v, [5., 6.])
+
+        # Restores the saved value of 7. which gets divided equally
+        # between the variables.
+        saver.restore(sess, save_path)
+        self.assertEqual([3.5, 3.5], self.evaluate([v[0], v[1]]))
+
+  @test_util.run_in_graph_and_eager_modes(config=config)
+  def testSaveAndRestoreTowerLocalMeanOneGraph(self):
+    if context.num_gpus() < 1 and context.executing_eagerly():
+      self.skipTest("A GPU is not available for this test in eager mode.")
+
+    with self.test_session() as sess:
+      v, tower_local = _make_tower_local("mean")
+
+      # Overwrite the initial values.
+      self._assign_tower_local(_devices, v, [3., 4.])
+
+      with self._dist_scope():
+        # Saves the current value of (v[0] + v[1])/2, 3.5.
+        save_path, saver = self._save_return_saver(sess, tower_local)
+
+        # Change the values between save and restore.
+        self._assign_tower_local(_devices, v, [5., 6.])
+
+        # Restores the saved value of 3.5 to both variables.
+        saver.restore(sess, save_path)
+        self.assertEqual([3.5, 3.5], self.evaluate([v[0], v[1]]))
+
+  def _save_tower_local_mean(self):
+    """Save variables with mirroring, returns save_path."""
+    with self.test_session(graph=ops.Graph()) as sess:
+      v, tower_local = _make_tower_local("mean")
+
+      # Overwrite the initial values.
+      self._assign_tower_local(_devices, v, [3., 4.])
+
+      with self._dist_scope():
+        # Saves the current value of (v[0] + v[1])/2, 3.5
+        save_path = self._save(sess, tower_local)
+
+        # Change the values between save and restore.
+        self._assign_tower_local(_devices, v, [5., 6.])
+    return save_path
+
+  def _save_tower_local_sum(self):
+    """Save variables with mirroring, returns save_path."""
+    with self.test_session(graph=ops.Graph()) as sess:
+      v, tower_local = _make_tower_local("sum")
+
+      # Overwrite the initial values.
+      self._assign_tower_local(_devices, v, [1.5, 2.])
+
+      with self._dist_scope():
+        # Saves the current value of v[0] + v[1], 3.5
+        save_path = self._save(sess, tower_local)
+
+        # Change the values between save and restore.
+        self._assign_tower_local(_devices, v, [5., 6.])
+    return save_path
+
+  def _save_normal(self):
+    """Save variables without mirroring, returns save_path."""
+    with self.test_session(graph=ops.Graph()) as sess:
+      var = variable_scope.get_variable(
+          name="v", initializer=1., use_resource=True)
+
+      # Overwrite the initial value.
+      self.evaluate(var.assign(3.5))
+
+      # Saves the current value of var, 3.5.
+      save_path = self._save(sess, var)
+
+      # Change the values between save and restore.
+      self.evaluate(var.assign(5.))
+    return save_path
+
+  def _restore_normal(self, save_path):
+    """Restore to variables without mirroring in a fresh graph."""
+    with self.test_session(graph=ops.Graph()) as sess:
+      var = variable_scope.get_variable(
+          name="v", initializer=7., use_resource=True)
+
+      # Overwrite the initial value.
+      self.evaluate(var.assign(8.))
+
+      # Restores the saved value of 3.5 to `var`.
+      saver = saver_lib.Saver(var_list=[var])
+      saver.restore(sess, save_path)
+      self.assertEqual(3.5, self.evaluate(var))
+
+  def _restore_tower_local_mean(self, save_path):
+    """Restore to variables with mirroring in a fresh graph."""
+    with self.test_session(graph=ops.Graph()) as sess:
+      v, tower_local = _make_tower_local("mean")
+
+      # Overwrite the initial values.
+      self._assign_tower_local(_devices, v, [7., 8.])
+
+      with self._dist_scope():
+        # Restores the saved value of 3.5 to both variables.
+        saver = saver_lib.Saver(var_list=[tower_local])
+        saver.restore(sess, save_path)
+        self.assertEqual([3.5, 3.5], self.evaluate([v[0], v[1]]))
+
+  def _restore_tower_local_sum(self, save_path):
+    """Restore to variables with mirroring in a fresh graph."""
+    with self.test_session(graph=ops.Graph()) as sess:
+      v, tower_local = _make_tower_local("sum")
+
+      # Overwrite the initial values.
+      self._assign_tower_local(_devices, v, [7., 8.])
+
+      with self._dist_scope():
+        # Restores the saved value of 3.5 to both variables.
+        saver = saver_lib.Saver(var_list=[tower_local])
+        saver.restore(sess, save_path)
+        self.assertEqual([1.75, 1.75], self.evaluate([v[0], v[1]]))
+
+  @test_util.run_in_graph_and_eager_modes(config=config)
+  def testSaveTowerLocalRestoreTowerLocalMean(self):
+    if context.num_gpus() < 1 and context.executing_eagerly():
+      self.skipTest("A GPU is not available for this test in eager mode.")
+
+    save_path = self._save_tower_local_mean()
+    self._restore_tower_local_mean(save_path)
+
+  @test_util.run_in_graph_and_eager_modes(config=config)
+  def testSaveTowerLocalRestoreTowerLocalSum(self):
+    if context.num_gpus() < 1 and context.executing_eagerly():
+      self.skipTest("A GPU is not available for this test in eager mode.")
+
+    save_path = self._save_tower_local_sum()
+    self._restore_tower_local_sum(save_path)
+
+  @test_util.run_in_graph_and_eager_modes(config=config)
+  def testSaveTowerLocalMeanRestoreNormal(self):
+    if context.num_gpus() < 1 and context.executing_eagerly():
+      self.skipTest("A GPU is not available for this test in eager mode.")
+
+    save_path = self._save_tower_local_mean()
+    self._restore_normal(save_path)
+
+  @test_util.run_in_graph_and_eager_modes(config=config)
+  def testSaveTowerLocalSumRestoreNormal(self):
+    if context.num_gpus() < 1 and context.executing_eagerly():
+      self.skipTest("A GPU is not available for this test in eager mode.")
+
+    save_path = self._save_tower_local_sum()
+    self._restore_normal(save_path)
+
+  @test_util.run_in_graph_and_eager_modes(config=config)
+  def testSaveNormalRestoreTowerLocalMean(self):
+    if context.num_gpus() < 1 and context.executing_eagerly():
+      self.skipTest("A GPU is not available for this test in eager mode.")
+
+    save_path = self._save_normal()
+    self._restore_tower_local_mean(save_path)
+
+  @test_util.run_in_graph_and_eager_modes(config=config)
+  def testSaveNormalRestoreTowerLocalSum(self):
+    if context.num_gpus() < 1 and context.executing_eagerly():
+      self.skipTest("A GPU is not available for this test in eager mode.")
+
+    save_path = self._save_normal()
+    self._restore_tower_local_sum(save_path)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/distributions/BUILD b/tensorflow/contrib/distributions/BUILD
index 1c381cc354fa4e5a630cfb5025dfd4bddf04a71c..fec6eafd4a66d38e9c99163b059bfeb81d8ad120 100644
--- a/tensorflow/contrib/distributions/BUILD
+++ b/tensorflow/contrib/distributions/BUILD
@@ -454,6 +454,21 @@ cuda_py_test(
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:platform_test",
     ],
+    tags = ["no_windows"],  # TODO: needs investigation on Windows
+)
+
+cuda_py_test(
+    name = "batch_reshape_test",
+    size = "small",
+    srcs = ["python/kernel_tests/batch_reshape_test.py"],
+    additional_deps = [
+        ":distributions_py",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+    ],
 )
 
 cuda_py_test(
@@ -475,6 +490,16 @@ cuda_py_test(
     tags = ["nomsan"],  # disable to avoid false positives from scipy.
 )
 
+cuda_py_test(
+    name = "seed_stream_test",
+    size = "small",
+    srcs = ["python/kernel_tests/seed_stream_test.py"],
+    additional_deps = [
+        ":distributions_py",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
 cuda_py_test(
     name = "statistical_testing_test",
     size = "medium",
@@ -486,12 +511,7 @@ cuda_py_test(
         "//third_party/py/numpy",
         "//tensorflow/python:client_testlib",
     ],
-    tags = [
-        "manual",
-        "noasan",
-        "noguitar",
-        "optonly",
-    ],
+    shard_count = 4,
 )
 
 cuda_py_test(
@@ -745,18 +765,6 @@ cuda_py_test(
     ],
 )
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 # === Bijector Tests ==========================================================
 
 cuda_py_test(
@@ -1125,6 +1133,7 @@ cuda_py_test(
         "//tensorflow/python:math_ops",
         "//tensorflow/python:platform_test",
     ],
+    tags = ["no_windows"],  # TODO: needs investigation on Windows
 )
 
 cuda_py_test(
diff --git a/tensorflow/contrib/distributions/__init__.py b/tensorflow/contrib/distributions/__init__.py
index 61c411271d0bb8d7b4cc3b14992b82ec1e5674ed..ddf59891e626a85e6c917ac74b3cfaabf16eb15d 100644
--- a/tensorflow/contrib/distributions/__init__.py
+++ b/tensorflow/contrib/distributions/__init__.py
@@ -24,6 +24,7 @@ from __future__ import print_function
 
 from tensorflow.contrib.distributions.python.ops import bijectors
 from tensorflow.contrib.distributions.python.ops.autoregressive import *
+from tensorflow.contrib.distributions.python.ops.batch_reshape import *
 from tensorflow.contrib.distributions.python.ops.binomial import *
 from tensorflow.contrib.distributions.python.ops.cauchy import *
 from tensorflow.contrib.distributions.python.ops.chi2 import *
@@ -58,6 +59,7 @@ from tensorflow.contrib.distributions.python.ops.quantized_distribution import *
 from tensorflow.contrib.distributions.python.ops.relaxed_bernoulli import *
 from tensorflow.contrib.distributions.python.ops.relaxed_onehot_categorical import *
 from tensorflow.contrib.distributions.python.ops.sample_stats import *
+from tensorflow.contrib.distributions.python.ops.seed_stream import *
 from tensorflow.contrib.distributions.python.ops.sinh_arcsinh import *
 from tensorflow.contrib.distributions.python.ops.test_util import *
 from tensorflow.contrib.distributions.python.ops.vector_diffeomixture import *
@@ -96,9 +98,10 @@ _allowed_symbols = [
     'ReparameterizationType',
     'Distribution',
     'Autoregressive',
-    'Binomial',
+    'BatchReshape',
     'Bernoulli',
     'Beta',
+    'Binomial',
     'BetaWithSoftplusConcentration',
     'Categorical',
     'Chi2',
@@ -124,6 +127,7 @@ _allowed_symbols = [
     'NormalWithSoftplusScale',
     'Poisson',
     'PoissonLogNormalQuadratureCompound',
+    'SeedStream',
     'SinhArcsinh',
     'StudentT',
     'StudentTWithAbsDfSoftplusScale',
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/batch_reshape_test.py b/tensorflow/contrib/distributions/python/kernel_tests/batch_reshape_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..59d549b7b80a3d80d0b8409542eb6583f645bdaa
--- /dev/null
+++ b/tensorflow/contrib/distributions/python/kernel_tests/batch_reshape_test.py
@@ -0,0 +1,568 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for BatchReshape."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.distributions.python.ops import batch_reshape as batch_reshape_lib
+from tensorflow.contrib.distributions.python.ops import mvn_diag as mvn_lib
+from tensorflow.contrib.distributions.python.ops import poisson as poisson_lib
+from tensorflow.contrib.distributions.python.ops import wishart as wishart_lib
+from tensorflow.python.framework import constant_op
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops.distributions import normal as normal_lib
+from tensorflow.python.platform import test
+
+
+class _BatchReshapeTest(object):
+
+  def make_wishart(self, dims, new_batch_shape, old_batch_shape):
+    new_batch_shape_ph = (
+        constant_op.constant(np.int32(new_batch_shape)) if self.is_static_shape
+        else array_ops.placeholder_with_default(
+            np.int32(new_batch_shape), shape=None))
+
+    scale = self.dtype([
+        [[1., 0.5],
+         [0.5, 1.]],
+        [[0.5, 0.25],
+         [0.25, 0.75]],
+    ])
+    scale = np.reshape(np.concatenate([scale, scale], axis=0),
+                       old_batch_shape + [dims, dims])
+    scale_ph = array_ops.placeholder_with_default(
+        scale, shape=scale.shape if self.is_static_shape else None)
+    wishart = wishart_lib.WishartFull(df=5, scale=scale_ph)
+    reshape_wishart = batch_reshape_lib.BatchReshape(
+        distribution=wishart,
+        batch_shape=new_batch_shape_ph,
+        validate_args=True)
+
+    return wishart, reshape_wishart
+
+  def test_matrix_variate_sample_and_log_prob(self):
+    dims = 2
+    new_batch_shape = [4]
+    old_batch_shape = [2, 2]
+    wishart, reshape_wishart = self.make_wishart(
+        dims, new_batch_shape, old_batch_shape)
+
+    batch_shape = reshape_wishart.batch_shape_tensor()
+    event_shape = reshape_wishart.event_shape_tensor()
+
+    expected_sample_shape = [3, 1] + new_batch_shape + [dims, dims]
+    x = wishart.sample([3, 1], seed=42)
+    expected_sample = array_ops.reshape(x, expected_sample_shape)
+    actual_sample = reshape_wishart.sample([3, 1], seed=42)
+
+    expected_log_prob_shape = [3, 1] + new_batch_shape
+    expected_log_prob = array_ops.reshape(
+        wishart.log_prob(x), expected_log_prob_shape)
+    actual_log_prob = reshape_wishart.log_prob(expected_sample)
+
+    with self.test_session() as sess:
+      [
+          batch_shape_,
+          event_shape_,
+          expected_sample_, actual_sample_,
+          expected_log_prob_, actual_log_prob_,
+      ] = sess.run([
+          batch_shape,
+          event_shape,
+          expected_sample, actual_sample,
+          expected_log_prob, actual_log_prob,
+      ])
+
+    self.assertAllEqual(new_batch_shape, batch_shape_)
+    self.assertAllEqual([dims, dims], event_shape_)
+    self.assertAllClose(expected_sample_, actual_sample_,
+                        atol=0., rtol=1e-6)
+    self.assertAllClose(expected_log_prob_, actual_log_prob_,
+                        atol=0., rtol=1e-6)
+    if not self.is_static_shape:
+      return
+    self.assertAllEqual(new_batch_shape, reshape_wishart.batch_shape)
+    self.assertAllEqual([dims, dims], reshape_wishart.event_shape)
+    self.assertAllEqual(expected_sample_shape, actual_sample.shape)
+    self.assertAllEqual(expected_log_prob_shape, actual_log_prob.shape)
+
+  def test_matrix_variate_stats(self):
+    dims = 2
+    new_batch_shape = [4]
+    old_batch_shape = [2, 2]
+    wishart, reshape_wishart = self.make_wishart(
+        dims, new_batch_shape, old_batch_shape)
+
+    expected_scalar_stat_shape = new_batch_shape
+    expected_matrix_stat_shape = new_batch_shape + [dims, dims]
+
+    expected_entropy = array_ops.reshape(
+        wishart.entropy(), expected_scalar_stat_shape)
+    actual_entropy = reshape_wishart.entropy()
+
+    expected_mean = array_ops.reshape(
+        wishart.mean(), expected_matrix_stat_shape)
+    actual_mean = reshape_wishart.mean()
+
+    expected_mode = array_ops.reshape(
+        wishart.mode(), expected_matrix_stat_shape)
+    actual_mode = reshape_wishart.mode()
+
+    expected_stddev = array_ops.reshape(
+        wishart.stddev(), expected_matrix_stat_shape)
+    actual_stddev = reshape_wishart.stddev()
+
+    expected_variance = array_ops.reshape(
+        wishart.variance(), expected_matrix_stat_shape)
+    actual_variance = reshape_wishart.variance()
+
+    with self.test_session() as sess:
+      [
+          expected_entropy_, actual_entropy_,
+          expected_mean_, actual_mean_,
+          expected_mode_, actual_mode_,
+          expected_stddev_, actual_stddev_,
+          expected_variance_, actual_variance_,
+      ] = sess.run([
+          expected_entropy, actual_entropy,
+          expected_mean, actual_mean,
+          expected_mode, actual_mode,
+          expected_stddev, actual_stddev,
+          expected_variance, actual_variance,
+      ])
+
+    self.assertAllClose(expected_entropy_, actual_entropy_,
+                        atol=0., rtol=1e-6)
+    self.assertAllClose(expected_mean_, actual_mean_,
+                        atol=0., rtol=1e-6)
+    self.assertAllClose(expected_mode_, actual_mode_,
+                        atol=0., rtol=1e-6)
+    self.assertAllClose(expected_stddev_, actual_stddev_,
+                        atol=0., rtol=1e-6)
+    self.assertAllClose(expected_variance_, actual_variance_,
+                        atol=0., rtol=1e-6)
+    if not self.is_static_shape:
+      return
+    self.assertAllEqual(expected_scalar_stat_shape, actual_entropy.shape)
+    self.assertAllEqual(expected_matrix_stat_shape, actual_mean.shape)
+    self.assertAllEqual(expected_matrix_stat_shape, actual_mode.shape)
+    self.assertAllEqual(expected_matrix_stat_shape, actual_stddev.shape)
+    self.assertAllEqual(expected_matrix_stat_shape, actual_variance.shape)
+
+  def make_normal(self, new_batch_shape, old_batch_shape):
+    new_batch_shape_ph = (
+        constant_op.constant(np.int32(new_batch_shape)) if self.is_static_shape
+        else array_ops.placeholder_with_default(
+            np.int32(new_batch_shape), shape=None))
+
+    scale = self.dtype(0.5 + np.arange(
+        np.prod(old_batch_shape)).reshape(old_batch_shape))
+    scale_ph = array_ops.placeholder_with_default(
+        scale, shape=scale.shape if self.is_static_shape else None)
+    normal = normal_lib.Normal(loc=self.dtype(0), scale=scale_ph)
+    reshape_normal = batch_reshape_lib.BatchReshape(
+        distribution=normal,
+        batch_shape=new_batch_shape_ph,
+        validate_args=True)
+    return normal, reshape_normal
+
+  def test_scalar_variate_sample_and_log_prob(self):
+    new_batch_shape = [2, 2]
+    old_batch_shape = [4]
+
+    normal, reshape_normal = self.make_normal(
+        new_batch_shape, old_batch_shape)
+
+    batch_shape = reshape_normal.batch_shape_tensor()
+    event_shape = reshape_normal.event_shape_tensor()
+
+    expected_sample_shape = new_batch_shape
+    x = normal.sample(seed=52)
+    expected_sample = array_ops.reshape(x, expected_sample_shape)
+    actual_sample = reshape_normal.sample(seed=52)
+
+    expected_log_prob_shape = new_batch_shape
+    expected_log_prob = array_ops.reshape(
+        normal.log_prob(x), expected_log_prob_shape)
+    actual_log_prob = reshape_normal.log_prob(expected_sample)
+
+    with self.test_session() as sess:
+      [
+          batch_shape_,
+          event_shape_,
+          expected_sample_, actual_sample_,
+          expected_log_prob_, actual_log_prob_,
+      ] = sess.run([
+          batch_shape,
+          event_shape,
+          expected_sample, actual_sample,
+          expected_log_prob, actual_log_prob,
+      ])
+    self.assertAllEqual(new_batch_shape, batch_shape_)
+    self.assertAllEqual([], event_shape_)
+    self.assertAllClose(expected_sample_, actual_sample_,
+                        atol=0., rtol=1e-6)
+    self.assertAllClose(expected_log_prob_, actual_log_prob_,
+                        atol=0., rtol=1e-6)
+    if not self.is_static_shape:
+      return
+    self.assertAllEqual(new_batch_shape, reshape_normal.batch_shape)
+    self.assertAllEqual([], reshape_normal.event_shape)
+    self.assertAllEqual(expected_sample_shape, actual_sample.shape)
+    self.assertAllEqual(expected_log_prob_shape, actual_log_prob.shape)
+
+  def test_scalar_variate_stats(self):
+    new_batch_shape = [2, 2]
+    old_batch_shape = [4]
+
+    normal, reshape_normal = self.make_normal(new_batch_shape, old_batch_shape)
+
+    expected_scalar_stat_shape = new_batch_shape
+
+    expected_entropy = array_ops.reshape(
+        normal.entropy(), expected_scalar_stat_shape)
+    actual_entropy = reshape_normal.entropy()
+
+    expected_mean = array_ops.reshape(
+        normal.mean(), expected_scalar_stat_shape)
+    actual_mean = reshape_normal.mean()
+
+    expected_mode = array_ops.reshape(
+        normal.mode(), expected_scalar_stat_shape)
+    actual_mode = reshape_normal.mode()
+
+    expected_stddev = array_ops.reshape(
+        normal.stddev(), expected_scalar_stat_shape)
+    actual_stddev = reshape_normal.stddev()
+
+    expected_variance = array_ops.reshape(
+        normal.variance(), expected_scalar_stat_shape)
+    actual_variance = reshape_normal.variance()
+
+    with self.test_session() as sess:
+      [
+          expected_entropy_, actual_entropy_,
+          expected_mean_, actual_mean_,
+          expected_mode_, actual_mode_,
+          expected_stddev_, actual_stddev_,
+          expected_variance_, actual_variance_,
+      ] = sess.run([
+          expected_entropy, actual_entropy,
+          expected_mean, actual_mean,
+          expected_mode, actual_mode,
+          expected_stddev, actual_stddev,
+          expected_variance, actual_variance,
+      ])
+    self.assertAllClose(expected_entropy_, actual_entropy_,
+                        atol=0., rtol=1e-6)
+    self.assertAllClose(expected_mean_, actual_mean_,
+                        atol=0., rtol=1e-6)
+    self.assertAllClose(expected_mode_, actual_mode_,
+                        atol=0., rtol=1e-6)
+    self.assertAllClose(expected_stddev_, actual_stddev_,
+                        atol=0., rtol=1e-6)
+    self.assertAllClose(expected_variance_, actual_variance_,
+                        atol=0., rtol=1e-6)
+    if not self.is_static_shape:
+      return
+    self.assertAllEqual(expected_scalar_stat_shape, actual_entropy.shape)
+    self.assertAllEqual(expected_scalar_stat_shape, actual_mean.shape)
+    self.assertAllEqual(expected_scalar_stat_shape, actual_mode.shape)
+    self.assertAllEqual(expected_scalar_stat_shape, actual_stddev.shape)
+    self.assertAllEqual(expected_scalar_stat_shape, actual_variance.shape)
+
+  def make_mvn(self, dims, new_batch_shape, old_batch_shape):
+    new_batch_shape_ph = (
+        constant_op.constant(np.int32(new_batch_shape)) if self.is_static_shape
+        else array_ops.placeholder_with_default(
+            np.int32(new_batch_shape), shape=None))
+
+    scale = np.ones(old_batch_shape + [dims], self.dtype)
+    scale_ph = array_ops.placeholder_with_default(
+        scale, shape=scale.shape if self.is_static_shape else None)
+    mvn = mvn_lib.MultivariateNormalDiag(scale_diag=scale_ph)
+    reshape_mvn = batch_reshape_lib.BatchReshape(
+        distribution=mvn,
+        batch_shape=new_batch_shape_ph,
+        validate_args=True)
+    return mvn, reshape_mvn
+
+  def test_vector_variate_sample_and_log_prob(self):
+    dims = 3
+    new_batch_shape = [2, 1]
+    old_batch_shape = [2]
+    mvn, reshape_mvn = self.make_mvn(
+        dims, new_batch_shape, old_batch_shape)
+
+    batch_shape = reshape_mvn.batch_shape_tensor()
+    event_shape = reshape_mvn.event_shape_tensor()
+
+    expected_sample_shape = [3] + new_batch_shape + [dims]
+    x = mvn.sample(3, seed=62)
+    expected_sample = array_ops.reshape(x, expected_sample_shape)
+    actual_sample = reshape_mvn.sample(3, seed=62)
+
+    expected_log_prob_shape = [3] + new_batch_shape
+    expected_log_prob = array_ops.reshape(
+        mvn.log_prob(x), expected_log_prob_shape)
+    actual_log_prob = reshape_mvn.log_prob(expected_sample)
+
+    with self.test_session() as sess:
+      [
+          batch_shape_,
+          event_shape_,
+          expected_sample_, actual_sample_,
+          expected_log_prob_, actual_log_prob_,
+      ] = sess.run([
+          batch_shape,
+          event_shape,
+          expected_sample, actual_sample,
+          expected_log_prob, actual_log_prob,
+      ])
+    self.assertAllEqual(new_batch_shape, batch_shape_)
+    self.assertAllEqual([dims], event_shape_)
+    self.assertAllClose(expected_sample_, actual_sample_,
+                        atol=0., rtol=1e-6)
+    self.assertAllClose(expected_log_prob_, actual_log_prob_,
+                        atol=0., rtol=1e-6)
+    if not self.is_static_shape:
+      return
+    self.assertAllEqual(new_batch_shape, reshape_mvn.batch_shape)
+    self.assertAllEqual([dims], reshape_mvn.event_shape)
+    self.assertAllEqual(expected_sample_shape, actual_sample.shape)
+    self.assertAllEqual(expected_log_prob_shape, actual_log_prob.shape)
+
+  def test_vector_variate_stats(self):
+    dims = 3
+    new_batch_shape = [2, 1]
+    old_batch_shape = [2]
+    mvn, reshape_mvn = self.make_mvn(
+        dims, new_batch_shape, old_batch_shape)
+
+    expected_scalar_stat_shape = new_batch_shape
+
+    expected_entropy = array_ops.reshape(
+        mvn.entropy(), expected_scalar_stat_shape)
+    actual_entropy = reshape_mvn.entropy()
+
+    expected_vector_stat_shape = new_batch_shape + [dims]
+
+    expected_mean = array_ops.reshape(
+        mvn.mean(), expected_vector_stat_shape)
+    actual_mean = reshape_mvn.mean()
+
+    expected_mode = array_ops.reshape(
+        mvn.mode(), expected_vector_stat_shape)
+    actual_mode = reshape_mvn.mode()
+
+    expected_stddev = array_ops.reshape(
+        mvn.stddev(), expected_vector_stat_shape)
+    actual_stddev = reshape_mvn.stddev()
+
+    expected_variance = array_ops.reshape(
+        mvn.variance(), expected_vector_stat_shape)
+    actual_variance = reshape_mvn.variance()
+
+    expected_matrix_stat_shape = new_batch_shape + [dims, dims]
+
+    expected_covariance = array_ops.reshape(
+        mvn.covariance(), expected_matrix_stat_shape)
+    actual_covariance = reshape_mvn.covariance()
+
+    with self.test_session() as sess:
+      [
+          expected_entropy_, actual_entropy_,
+          expected_mean_, actual_mean_,
+          expected_mode_, actual_mode_,
+          expected_stddev_, actual_stddev_,
+          expected_variance_, actual_variance_,
+          expected_covariance_, actual_covariance_,
+      ] = sess.run([
+          expected_entropy, actual_entropy,
+          expected_mean, actual_mean,
+          expected_mode, actual_mode,
+          expected_stddev, actual_stddev,
+          expected_variance, actual_variance,
+          expected_covariance, actual_covariance,
+      ])
+    self.assertAllClose(expected_entropy_, actual_entropy_,
+                        atol=0., rtol=1e-6)
+    self.assertAllClose(expected_mean_, actual_mean_,
+                        atol=0., rtol=1e-6)
+    self.assertAllClose(expected_mode_, actual_mode_,
+                        atol=0., rtol=1e-6)
+    self.assertAllClose(expected_stddev_, actual_stddev_,
+                        atol=0., rtol=1e-6)
+    self.assertAllClose(expected_variance_, actual_variance_,
+                        atol=0., rtol=1e-6)
+    self.assertAllClose(expected_covariance_, actual_covariance_,
+                        atol=0., rtol=1e-6)
+    if not self.is_static_shape:
+      return
+    self.assertAllEqual(expected_scalar_stat_shape, actual_entropy.shape)
+    self.assertAllEqual(expected_vector_stat_shape, actual_mean.shape)
+    self.assertAllEqual(expected_vector_stat_shape, actual_mode.shape)
+    self.assertAllEqual(expected_vector_stat_shape, actual_stddev.shape)
+    self.assertAllEqual(expected_vector_stat_shape, actual_variance.shape)
+    self.assertAllEqual(expected_matrix_stat_shape, actual_covariance.shape)
+
+  def test_bad_reshape_size(self):
+    dims = 2
+    new_batch_shape = [2, 3]
+    old_batch_shape = [2]   # 2 != 2*3
+
+    new_batch_shape_ph = (
+        constant_op.constant(np.int32(new_batch_shape)) if self.is_static_shape
+        else array_ops.placeholder_with_default(
+            np.int32(new_batch_shape), shape=None))
+
+    scale = np.ones(old_batch_shape + [dims], self.dtype)
+    scale_ph = array_ops.placeholder_with_default(
+        scale, shape=scale.shape if self.is_static_shape else None)
+    mvn = mvn_lib.MultivariateNormalDiag(scale_diag=scale_ph)
+
+    if self.is_static_shape:
+      with self.assertRaisesRegexp(
+          ValueError, (r"`batch_shape` size \(6\) must match "
+                       r"`distribution\.batch_shape` size \(2\)")):
+        batch_reshape_lib.BatchReshape(
+            distribution=mvn,
+            batch_shape=new_batch_shape_ph,
+            validate_args=True)
+
+    else:
+      with self.test_session():
+        with self.assertRaisesOpError(r"`batch_shape` size must match "
+                                      r"`distributions.batch_shape` size"):
+          batch_reshape_lib.BatchReshape(
+              distribution=mvn,
+              batch_shape=new_batch_shape_ph,
+              validate_args=True).sample().eval()
+
+  def test_non_positive_shape(self):
+    dims = 2
+    new_batch_shape = [-1, -2]   # -1*-2=2 so will pass size check.
+    old_batch_shape = [2]
+
+    new_batch_shape_ph = (
+        constant_op.constant(np.int32(new_batch_shape)) if self.is_static_shape
+        else array_ops.placeholder_with_default(
+            np.int32(new_batch_shape), shape=None))
+
+    scale = np.ones(old_batch_shape + [dims], self.dtype)
+    scale_ph = array_ops.placeholder_with_default(
+        scale, shape=scale.shape if self.is_static_shape else None)
+    mvn = mvn_lib.MultivariateNormalDiag(scale_diag=scale_ph)
+
+    if self.is_static_shape:
+      with self.assertRaisesRegexp(ValueError, r".*must be positive.*"):
+        batch_reshape_lib.BatchReshape(
+            distribution=mvn,
+            batch_shape=new_batch_shape_ph,
+            validate_args=True)
+
+    else:
+      with self.test_session():
+        with self.assertRaisesOpError(r".*must be positive.*"):
+          batch_reshape_lib.BatchReshape(
+              distribution=mvn,
+              batch_shape=new_batch_shape_ph,
+              validate_args=True).sample().eval()
+
+  def test_non_vector_shape(self):
+    dims = 2
+    new_batch_shape = 2
+    old_batch_shape = [2]
+
+    new_batch_shape_ph = (
+        constant_op.constant(np.int32(new_batch_shape)) if self.is_static_shape
+        else array_ops.placeholder_with_default(
+            np.int32(new_batch_shape), shape=None))
+
+    scale = np.ones(old_batch_shape + [dims], self.dtype)
+    scale_ph = array_ops.placeholder_with_default(
+        scale, shape=scale.shape if self.is_static_shape else None)
+    mvn = mvn_lib.MultivariateNormalDiag(scale_diag=scale_ph)
+
+    if self.is_static_shape:
+      with self.assertRaisesRegexp(ValueError, r".*must be a vector.*"):
+        batch_reshape_lib.BatchReshape(
+            distribution=mvn,
+            batch_shape=new_batch_shape_ph,
+            validate_args=True)
+
+    else:
+      with self.test_session():
+        with self.assertRaisesOpError(r".*must be a vector.*"):
+          batch_reshape_lib.BatchReshape(
+              distribution=mvn,
+              batch_shape=new_batch_shape_ph,
+              validate_args=True).sample().eval()
+
+  def test_broadcasting_explicitly_unsupported(self):
+    old_batch_shape = [4]
+    new_batch_shape = [1, 4, 1]
+    rate_ = self.dtype([1, 10, 2, 20])
+
+    rate = array_ops.placeholder_with_default(
+        rate_,
+        shape=old_batch_shape if self.is_static_shape else None)
+    poisson_4 = poisson_lib.Poisson(rate)
+    new_batch_shape_ph = (
+        constant_op.constant(np.int32(new_batch_shape)) if self.is_static_shape
+        else array_ops.placeholder_with_default(
+            np.int32(new_batch_shape), shape=None))
+    poisson_141_reshaped = batch_reshape_lib.BatchReshape(
+        poisson_4, new_batch_shape_ph, validate_args=True)
+
+    x_4 = self.dtype([2, 12, 3, 23])
+    x_114 = self.dtype([2, 12, 3, 23]).reshape(1, 1, 4)
+
+    if self.is_static_shape:
+      with self.assertRaisesRegexp(NotImplementedError,
+                                   "too few batch and event dims"):
+        poisson_141_reshaped.log_prob(x_4)
+      with self.assertRaisesRegexp(NotImplementedError,
+                                   "unexpected batch and event shape"):
+        poisson_141_reshaped.log_prob(x_114)
+      return
+
+    with self.assertRaisesOpError("too few batch and event dims"):
+      with self.test_session():
+        poisson_141_reshaped.log_prob(x_4).eval()
+
+    with self.assertRaisesOpError("unexpected batch and event shape"):
+      with self.test_session():
+        poisson_141_reshaped.log_prob(x_114).eval()
+
+
+class BatchReshapeStaticTest(_BatchReshapeTest, test.TestCase):
+
+  dtype = np.float32
+  is_static_shape = True
+
+
+class BatchReshapeDynamicTest(_BatchReshapeTest, test.TestCase):
+
+  dtype = np.float64
+  is_static_shape = False
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/kumaraswamy_bijector_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/kumaraswamy_bijector_test.py
index ad11d9f2484c4b08c67c5f82aec1320475d1d983..074b5f275d107fa49de42df262476bd4aa48ffae 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/kumaraswamy_bijector_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/kumaraswamy_bijector_test.py
@@ -69,7 +69,7 @@ class KumaraswamyBijectorTest(test.TestCase):
       bijector = Kumaraswamy(
           concentration1=concentration1,
           concentration0=concentration0, validate_args=True)
-      # Omitting the endpoints 0 and 1, since idlj will be inifinity at these
+      # Omitting the endpoints 0 and 1, since idlj will be infinity at these
       # endpoints.
       y = np.linspace(.01, 0.99, num=10).astype(np.float32)
       x = 1 - (1 - y ** concentration1) ** concentration0
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/sample_stats_test.py b/tensorflow/contrib/distributions/python/kernel_tests/sample_stats_test.py
index 4186cf129dbf31724c84133734da3f226817c71a..ea04e8c29a2c94d4939bad277afa380401067ff2 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/sample_stats_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/sample_stats_test.py
@@ -23,6 +23,7 @@ import numpy as np
 from tensorflow.contrib.distributions.python.ops import sample_stats
 from tensorflow.python.framework import dtypes
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import spectral_ops_test_util
 from tensorflow.python.platform import test
 
@@ -455,6 +456,16 @@ class PercentileTestWithNearestInterpolation(test.TestCase):
       with self.assertRaisesOpError("rank"):
         pct.eval(feed_dict={q_ph: [0.5]})
 
+  def test_finds_max_of_long_array(self):
+    # d - 1 == d in float32 and d = 3e7.
+    # So this test only passes if we use double for the percentile indices.
+    # If float is used, it fails with InvalidArgumentError about an index out of
+    # bounds.
+    x = math_ops.linspace(0., 3e7, num=int(3e7))
+    with self.test_session():
+      minval = sample_stats.percentile(x, q=0, validate_args=True)
+      self.assertAllEqual(0, minval.eval())
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/seed_stream_test.py b/tensorflow/contrib/distributions/python/kernel_tests/seed_stream_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..968057331787059240110b90545f70c0ab128aa8
--- /dev/null
+++ b/tensorflow/contrib/distributions/python/kernel_tests/seed_stream_test.py
@@ -0,0 +1,70 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for the SeedStream class."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.distributions.python.ops import seed_stream
+from tensorflow.python.platform import test
+
+
+class SeedStreamTest(test.TestCase):
+
+  def assertAllUnique(self, items):
+    self.assertEqual(len(items), len(set(items)))
+
+  def testNonRepetition(self):
+    # The probability of repetitions in a short stream from a correct
+    # PRNG is negligible; this test catches bugs that prevent state
+    # updates.
+    strm = seed_stream.SeedStream(seed=4, salt="salt")
+    output = [strm() for _ in range(50)]
+    self.assertEqual(sorted(output), sorted(list(set(output))))
+
+  def testReproducibility(self):
+    strm1 = seed_stream.SeedStream(seed=4, salt="salt")
+    strm2 = seed_stream.SeedStream(seed=4, salt="salt")
+    strm3 = seed_stream.SeedStream(seed=4, salt="salt")
+    outputs = [strm1() for _ in range(50)]
+    self.assertEqual(outputs, [strm2() for _ in range(50)])
+    self.assertEqual(outputs, [strm3() for _ in range(50)])
+
+  def testSeededDistinctness(self):
+    strm1 = seed_stream.SeedStream(seed=4, salt="salt")
+    strm2 = seed_stream.SeedStream(seed=5, salt="salt")
+    self.assertAllUnique(
+        [strm1() for _ in range(50)] + [strm2() for _ in range(50)])
+
+  def testSaltedDistinctness(self):
+    strm1 = seed_stream.SeedStream(seed=4, salt="salt")
+    strm2 = seed_stream.SeedStream(seed=4, salt="another salt")
+    self.assertAllUnique(
+        [strm1() for _ in range(50)] + [strm2() for _ in range(50)])
+
+  def testNestingRobustness(self):
+    # SeedStreams started from generated seeds should not collide with
+    # the master or with each other, even if the salts are the same.
+    strm1 = seed_stream.SeedStream(seed=4, salt="salt")
+    strm2 = seed_stream.SeedStream(strm1(), salt="salt")
+    strm3 = seed_stream.SeedStream(strm1(), salt="salt")
+    outputs = [strm1() for _ in range(50)]
+    self.assertAllUnique(
+        outputs + [strm2() for _ in range(50)] + [strm3() for _ in range(50)])
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/statistical_testing_test.py b/tensorflow/contrib/distributions/python/kernel_tests/statistical_testing_test.py
index 3548ac18078a0b40f117c2bf9e2b34d20cee163b..c4fb669ebbc0b3d75da227e56f240980971efd82 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/statistical_testing_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/statistical_testing_test.py
@@ -21,40 +21,75 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.contrib.distributions.python.ops import statistical_testing as st
-from tensorflow.python.framework import errors
-from tensorflow.python.ops import check_ops
 from tensorflow.python.platform import test
 
 
 class StatisticalTestingTest(test.TestCase):
 
   def test_dkwm_design_mean_one_sample_soundness(self):
-    numbers = [1e-5, 1e-2, 1.1e-1, 0.9, 1., 1.02, 2., 10., 1e2, 1e5, 1e10]
+    thresholds = [1e-5, 1e-2, 1.1e-1, 0.9, 1., 1.02, 2., 10., 1e2, 1e5, 1e10]
     rates = [1e-6, 1e-3, 1e-2, 1.1e-1, 0.2, 0.5, 0.7, 1.]
-    with self.test_session() as sess:
-      for ff in rates:
-        for fp in rates:
-          sufficient_n = st.min_num_samples_for_dkwm_mean_test(
-              numbers, 0., 1., false_fail_rate=ff, false_pass_rate=fp)
-          detectable_d = st.min_discrepancy_of_true_means_detectable_by_dkwm(
-              sufficient_n, 0., 1., false_fail_rate=ff, false_pass_rate=fp)
-          sess.run(check_ops.assert_less_equal(detectable_d, numbers))
+    false_fail_rates, false_pass_rates = np.meshgrid(rates, rates)
+    false_fail_rates = false_fail_rates.flatten().astype(np.float32)
+    false_pass_rates = false_pass_rates.flatten().astype(np.float32)
+
+    detectable_discrepancies = []
+    for false_pass_rate, false_fail_rate in zip(
+        false_pass_rates, false_fail_rates):
+      sufficient_n = st.min_num_samples_for_dkwm_mean_test(
+          thresholds, low=0., high=1., false_fail_rate=false_fail_rate,
+          false_pass_rate=false_pass_rate)
+      detectable_discrepancies.append(
+          st.min_discrepancy_of_true_means_detectable_by_dkwm(
+              sufficient_n, low=0., high=1., false_fail_rate=false_fail_rate,
+              false_pass_rate=false_pass_rate))
+
+    detectable_discrepancies_ = self.evaluate(detectable_discrepancies)
+    for discrepancies, false_pass_rate, false_fail_rate in zip(
+        detectable_discrepancies_, false_pass_rates, false_fail_rates):
+      below_threshold = discrepancies <= thresholds
+      self.assertAllEqual(
+          np.ones_like(below_threshold, np.bool), below_threshold,
+          msg='false_pass_rate({}), false_fail_rate({})'.format(
+              false_pass_rate, false_fail_rate))
 
   def test_dkwm_design_mean_two_sample_soundness(self):
-    numbers = [1e-5, 1e-2, 1.1e-1, 0.9, 1., 1.02, 2., 10., 1e2, 1e5, 1e10]
+    thresholds = [1e-5, 1e-2, 1.1e-1, 0.9, 1., 1.02, 2., 10., 1e2, 1e5, 1e10]
     rates = [1e-6, 1e-3, 1e-2, 1.1e-1, 0.2, 0.5, 0.7, 1.]
-    with self.test_session() as sess:
-      for ff in rates:
-        for fp in rates:
-          (sufficient_n1,
-           sufficient_n2) = st.min_num_samples_for_dkwm_mean_two_sample_test(
-               numbers, 0., 1., 0., 1.,
-               false_fail_rate=ff, false_pass_rate=fp)
-          d_fn = st.min_discrepancy_of_true_means_detectable_by_dkwm_two_sample
-          detectable_d = d_fn(
-              sufficient_n1, 0., 1., sufficient_n2, 0., 1.,
-              false_fail_rate=ff, false_pass_rate=fp)
-          sess.run(check_ops.assert_less_equal(detectable_d, numbers))
+    false_fail_rates, false_pass_rates = np.meshgrid(rates, rates)
+    false_fail_rates = false_fail_rates.flatten().astype(np.float32)
+    false_pass_rates = false_pass_rates.flatten().astype(np.float32)
+
+    detectable_discrepancies = []
+    for false_pass_rate, false_fail_rate in zip(
+        false_pass_rates, false_fail_rates):
+      [
+          sufficient_n1,
+          sufficient_n2
+      ] = st.min_num_samples_for_dkwm_mean_two_sample_test(
+          thresholds, low1=0., high1=1., low2=0., high2=1.,
+          false_fail_rate=false_fail_rate,
+          false_pass_rate=false_pass_rate)
+
+      detectable_discrepancies.append(
+          st.min_discrepancy_of_true_means_detectable_by_dkwm_two_sample(
+              n1=sufficient_n1,
+              low1=0.,
+              high1=1.,
+              n2=sufficient_n2,
+              low2=0.,
+              high2=1.,
+              false_fail_rate=false_fail_rate,
+              false_pass_rate=false_pass_rate))
+
+    detectable_discrepancies_ = self.evaluate(detectable_discrepancies)
+    for discrepancies, false_pass_rate, false_fail_rate in zip(
+        detectable_discrepancies_, false_pass_rates, false_fail_rates):
+      below_threshold = discrepancies <= thresholds
+      self.assertAllEqual(
+          np.ones_like(below_threshold, np.bool), below_threshold,
+          msg='false_pass_rate({}), false_fail_rate({})'.format(
+              false_pass_rate, false_fail_rate))
 
   def test_true_mean_confidence_interval_by_dkwm_one_sample(self):
     rng = np.random.RandomState(seed=0)
@@ -93,28 +128,28 @@ class StatisticalTestingTest(test.TestCase):
 
       # Test that the test assertion confirms that the mean of the
       # standard uniform distribution is not 0.4.
-      with self.assertRaises(errors.InvalidArgumentError):
+      with self.assertRaisesOpError("Mean confidence interval too high"):
         sess.run(st.assert_true_mean_equal_by_dkwm(
             samples, 0., 1., 0.4, false_fail_rate=1e-6))
 
       # Test that the test assertion confirms that the mean of the
       # standard uniform distribution is not 0.6.
-      with self.assertRaises(errors.InvalidArgumentError):
+      with self.assertRaisesOpError("Mean confidence interval too low"):
         sess.run(st.assert_true_mean_equal_by_dkwm(
             samples, 0., 1., 0.6, false_fail_rate=1e-6))
 
   def test_dkwm_mean_two_sample_assertion(self):
     rng = np.random.RandomState(seed=0)
-    num_samples = 15000
+    num_samples = 4000
 
-    # 15000 samples is chosen to be enough to find discrepancies of
-    # size 0.1 or more with assurance 1e-6, as confirmed here:
+    # 4000 samples is chosen to be enough to find discrepancies of
+    # size 0.2 or more with assurance 1e-6, as confirmed here:
     with self.test_session() as sess:
       d = st.min_discrepancy_of_true_means_detectable_by_dkwm_two_sample(
           num_samples, 0., 1., num_samples, 0., 1.,
           false_fail_rate=1e-6, false_pass_rate=1e-6)
       d = sess.run(d)
-      self.assertLess(d, 0.1)
+      self.assertLess(d, 0.2)
 
     # Test that the test assertion agrees that the standard
     # uniform distribution has the same mean as itself.
@@ -124,19 +159,37 @@ class StatisticalTestingTest(test.TestCase):
       sess.run(st.assert_true_mean_equal_by_dkwm_two_sample(
           samples1, 0., 1., samples2, 0., 1., false_fail_rate=1e-6))
 
+  def test_dkwm_mean_two_sample_assertion_beta_2_1_false(self):
+    rng = np.random.RandomState(seed=0)
+    num_samples = 4000
+    samples1 = rng.uniform(size=num_samples).astype(np.float32)
+
+    # As established above, 4000 samples is enough to find discrepancies
+    # of size 0.2 or more with assurance 1e-6.
+
+    with self.test_session() as sess:
       # Test that the test assertion confirms that the mean of the
       # standard uniform distribution is different from the mean of beta(2, 1).
       beta_high_samples = rng.beta(2, 1, size=num_samples).astype(np.float32)
-      with self.assertRaises(errors.InvalidArgumentError):
+      with self.assertRaisesOpError("samples1 has a smaller mean"):
         sess.run(st.assert_true_mean_equal_by_dkwm_two_sample(
             samples1, 0., 1.,
             beta_high_samples, 0., 1.,
             false_fail_rate=1e-6))
 
+  def test_dkwm_mean_two_sample_assertion_beta_1_2_false(self):
+    rng = np.random.RandomState(seed=0)
+    num_samples = 4000
+    samples1 = rng.uniform(size=num_samples).astype(np.float32)
+
+    # As established above, 4000 samples is enough to find discrepancies
+    # of size 0.2 or more with assurance 1e-6.
+
+    with self.test_session() as sess:
       # Test that the test assertion confirms that the mean of the
       # standard uniform distribution is different from the mean of beta(1, 2).
       beta_low_samples = rng.beta(1, 2, size=num_samples).astype(np.float32)
-      with self.assertRaises(errors.InvalidArgumentError):
+      with self.assertRaisesOpError("samples2 has a smaller mean"):
         sess.run(st.assert_true_mean_equal_by_dkwm_two_sample(
             samples1, 0., 1.,
             beta_low_samples, 0., 1.,
@@ -144,21 +197,22 @@ class StatisticalTestingTest(test.TestCase):
 
   def test_dkwm_argument_validity_checking(self):
     rng = np.random.RandomState(seed=0)
-    samples = rng.uniform(size=5000).astype(np.float32)
+    samples = rng.uniform(
+        low=[0., 1.], high=[1., 2.], size=(2500, 1, 2)).astype(np.float32)
 
     # Test that the test library complains if the given samples fall
     # outside the purported bounds.
     with self.test_session() as sess:
-      with self.assertRaises(errors.InvalidArgumentError):
+      with self.assertRaisesOpError("maximum value exceeds expectations"):
         sess.run(st.true_mean_confidence_interval_by_dkwm(
-            samples, 0., 0.5, error_rate=0.5))
-      with self.assertRaises(errors.InvalidArgumentError):
+            samples, [[0., 1.]], [[0.5, 1.5]], error_rate=0.5))
+      with self.assertRaisesOpError("minimum value falls below expectations"):
         sess.run(st.true_mean_confidence_interval_by_dkwm(
-            samples, 0.5, 1., error_rate=0.5))
+            samples, [[0.5, 1.5]], [[1., 2.]], error_rate=0.5))
 
       # But doesn't complain if they don't.
       op = st.true_mean_confidence_interval_by_dkwm(
-          samples, 0., 1., error_rate=0.5)
+          samples, [[0., 1.]], [[1., 2.]], error_rate=0.5)
       _ = sess.run(op)
 
 
diff --git a/tensorflow/contrib/distributions/python/ops/batch_reshape.py b/tensorflow/contrib/distributions/python/ops/batch_reshape.py
new file mode 100644
index 0000000000000000000000000000000000000000..bf5590cd552a915a3ecfc1912ee530baf79665a6
--- /dev/null
+++ b/tensorflow/contrib/distributions/python/ops/batch_reshape.py
@@ -0,0 +1,416 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""The BatchReshape distribution."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.distributions import distribution as distribution_lib
+
+
+__all__ = [
+    "BatchReshape",
+]
+
+
+class BatchReshape(distribution_lib.Distribution):
+  """The Batch-Reshaping distribution.
+
+  This "meta-distribution" reshapes the batch dimensions of another
+  distribution.
+
+  Note: Unlike `tf.reshape`, the `BatchReshape` distribution does not support
+  `-1` for flattening.
+
+  #### Examples
+
+  ```python
+  tfd = tf.contrib.distributions
+
+  dtype = np.float32
+  dims = 2
+  new_batch_shape = [1, 2, 3]
+  old_batch_shape = [6]
+
+  scale = np.ones(old_batch_shape + [dims], dtype)
+  mvn = tfd.MultivariateNormalDiag(scale_diag=scale)
+  reshape_mvn = tfd.BatchReshape(
+      distribution=mvn,
+      batch_shape=new_batch_shape,
+      validate_args=True)
+
+  reshape_mvn.batch_shape
+  # ==> [1, 2, 3]
+
+  x = reshape_mvn.sample(sample_shape=[4, 5])
+  x.shape
+  # ==> [4, 5, 1, 2, 3, 2] == sample_shape + new_batch_shape + [dims]
+
+  reshape_mvn.log_prob(x).shape
+  # ==> [4, 5, 1, 2, 3] == sample_shape + new_batch_shape
+  ```
+
+  """
+
+  def __init__(self,
+               distribution,
+               batch_shape,
+               validate_args=False,
+               allow_nan_stats=True,
+               name=None):
+    """Construct BatchReshape distribution.
+
+    Args:
+      distribution: The base distribution instance to reshape. Typically an
+        instance of `Distribution`.
+      batch_shape: Positive `int`-like vector-shaped `Tensor` representing the
+        new shape of the batch dimensions.
+      validate_args: Python `bool`, default `False`. When `True` distribution
+        parameters are checked for validity despite possibly degrading runtime
+        performance. When `False` invalid inputs may silently render incorrect
+        outputs.
+      allow_nan_stats: Python `bool`, default `True`. When `True`, statistics
+        (e.g., mean, mode, variance) use the value "`NaN`" to indicate the
+        result is undefined. When `False`, an exception is raised if one or
+        more of the statistic's batch members are undefined.
+      name: The name to give Ops created by the initializer.
+        Default value: `"BatchReshape" + distribution.name`.
+
+    Raises:
+      ValueError: if `batch_shape` is not a vector.
+      ValueError: if `batch_shape` has non-positive elements.
+      ValueError: if `batch_shape` size is not the same as a
+        `distribution.batch_shape` size.
+    """
+    parameters = locals()
+    name = name or "BatchReshape" + distribution.name
+    self._distribution = distribution
+    with ops.name_scope(name, values=[batch_shape]) as name:
+      self._batch_shape_ = ops.convert_to_tensor(
+          batch_shape,
+          dtype=dtypes.int32,
+          name="batch_shape")
+      self._batch_shape_static = tensor_util.constant_value(self._batch_shape_)
+      if self._batch_shape_static is not None:
+        self._batch_shape_static = np.int32(self._batch_shape_static)
+      self._runtime_assertions = validate_init_args(
+          self._distribution,
+          self._batch_shape_,
+          validate_args,
+          self._batch_shape_static)
+      super(BatchReshape, self).__init__(
+          dtype=self._distribution.dtype,
+          reparameterization_type=self._distribution.reparameterization_type,
+          validate_args=validate_args,
+          allow_nan_stats=allow_nan_stats,
+          parameters=parameters,
+          graph_parents=(
+              [self._batch_shape_] +
+              self._distribution._graph_parents),  # pylint: disable=protected-access
+          name=name)
+
+  @property
+  def distribution(self):
+    return self._distribution
+
+  def _batch_shape_tensor(self):
+    with ops.control_dependencies(self._runtime_assertions):
+      return array_ops.identity(self._batch_shape_)
+
+  def _batch_shape(self):
+    return tensor_shape.TensorShape(self._batch_shape_static)
+
+  def _event_shape_tensor(self):
+    with ops.control_dependencies(self._runtime_assertions):
+      return array_ops.identity(self.distribution.event_shape_tensor())
+
+  def _event_shape(self):
+    return self.distribution.event_shape
+
+  def _sample_n(self, n, seed=None):
+    with ops.control_dependencies(self._runtime_assertions):
+      x = self.distribution.sample(sample_shape=n, seed=seed)
+      new_shape = array_ops.concat([
+          [n],
+          self.batch_shape_tensor(),
+          self.event_shape_tensor(),
+      ], axis=0)
+      return array_ops.reshape(x, new_shape)
+
+  def _log_prob(self, x):
+    return self._call_reshape_input_output(
+        self.distribution.log_prob, x)
+
+  def _prob(self, x):
+    return self._call_reshape_input_output(
+        self.distribution.prob, x)
+
+  def _log_cdf(self, x):
+    return self._call_reshape_input_output(
+        self.distribution.log_cdf, x)
+
+  def _cdf(self, x):
+    return self._call_reshape_input_output(
+        self.distribution.cdf, x)
+
+  def _log_survival_function(self, x):
+    return self._call_reshape_input_output(
+        self.distribution.log_survival_function, x)
+
+  def _survival_function(self, x):
+    return self._call_reshape_input_output(
+        self.distribution.survival_function, x)
+
+  def _entropy(self):
+    return self._call_and_reshape_output(
+        self.distribution.entropy,
+        [],
+        [tensor_shape.scalar()])
+
+  def _mean(self):
+    return self._call_and_reshape_output(self.distribution.mean)
+
+  def _mode(self):
+    return self._call_and_reshape_output(self.distribution.mode)
+
+  def _stddev(self):
+    return self._call_and_reshape_output(self.distribution.stddev)
+
+  def _variance(self):
+    return self._call_and_reshape_output(self.distribution.variance)
+
+  def _covariance(self):
+    return self._call_and_reshape_output(
+        self.distribution.covariance,
+        [self.event_shape_tensor()]*2,
+        [self.event_shape]*2)
+
+  def _sample_shape(self, x):
+    """Computes graph and static `sample_shape`."""
+    x_ndims = (array_ops.rank(x) if x.shape.ndims is None else x.shape.ndims)
+    event_ndims = (array_ops.size(self.event_shape_tensor())
+                   if self.event_shape.ndims is None
+                   else self.event_shape.ndims)
+    batch_ndims = (array_ops.size(self.batch_shape_tensor())
+                   if self.batch_shape.ndims is None
+                   else self.batch_shape.ndims)
+    sample_ndims = x_ndims - batch_ndims - event_ndims
+    if isinstance(sample_ndims, int):
+      static_sample_shape = x.shape[:sample_ndims]
+    else:
+      static_sample_shape = tensor_shape.TensorShape(None)
+    if static_sample_shape.is_fully_defined():
+      sample_shape = np.int32(static_sample_shape.as_list())
+    else:
+      sample_shape = array_ops.shape(x)[:sample_ndims]
+    return sample_shape, static_sample_shape
+
+  def _call_reshape_input_output(self, fn, x):
+    """Calls `fn`, appropriately reshaping its input `x` and output."""
+    with ops.control_dependencies(
+        self._runtime_assertions + self._validate_sample_arg(x)):
+      sample_shape, static_sample_shape = self._sample_shape(x)
+      old_shape = array_ops.concat([
+          sample_shape,
+          self.distribution.batch_shape_tensor(),
+          self.event_shape_tensor(),
+      ], axis=0)
+      result = fn(array_ops.reshape(x, old_shape))
+      new_shape = array_ops.concat([
+          sample_shape,
+          self.batch_shape_tensor(),
+      ], axis=0)
+      result = array_ops.reshape(result, new_shape)
+      if (static_sample_shape.ndims is not None and
+          self.batch_shape.ndims is not None):
+        new_shape = static_sample_shape.concatenate(self.batch_shape)
+        result.set_shape(result.shape.merge_with(new_shape))
+      return result
+
+  def _call_and_reshape_output(
+      self,
+      fn,
+      event_shape_list=None,
+      static_event_shape_list=None):
+    """Calls `fn` and appropriately reshapes its output."""
+    with ops.control_dependencies(self._runtime_assertions):
+      if event_shape_list is None:
+        event_shape_list = [self._event_shape_tensor()]
+      if static_event_shape_list is None:
+        static_event_shape_list = [self.event_shape]
+      new_shape = array_ops.concat(
+          [self.batch_shape_tensor()] + event_shape_list,
+          axis=0)
+      result = array_ops.reshape(fn(), new_shape)
+      if (self.batch_shape.ndims is not None and
+          self.event_shape.ndims is not None):
+        event_shape = tensor_shape.TensorShape([])
+        for rss in static_event_shape_list:
+          event_shape = event_shape.concatenate(rss)
+        static_shape = result.shape.merge_with(
+            self.batch_shape.concatenate(event_shape))
+        result.set_shape(static_shape)
+      return result
+
+  def _validate_sample_arg(self, x):
+    """Helper which validates sample arg, e.g., input to `log_prob`."""
+    with ops.name_scope(name="validate_sample_arg", values=[x]):
+      x_ndims = (array_ops.rank(x) if x.shape.ndims is None else x.shape.ndims)
+      event_ndims = (array_ops.size(self.event_shape_tensor())
+                     if self.event_shape.ndims is None
+                     else self.event_shape.ndims)
+      batch_ndims = (array_ops.size(self.batch_shape_tensor())
+                     if self.batch_shape.ndims is None
+                     else self.batch_shape.ndims)
+      expected_batch_event_ndims = batch_ndims + event_ndims
+
+      if (isinstance(x_ndims, int) and
+          isinstance(expected_batch_event_ndims, int)):
+        if x_ndims < expected_batch_event_ndims:
+          raise NotImplementedError(
+              "Broadcasting is not supported; too few batch and event dims "
+              "(expected at least {}, saw {}).".format(
+                  expected_batch_event_ndims, x_ndims))
+        ndims_assertion = []
+      elif self.validate_args:
+        ndims_assertion = [
+            check_ops.assert_greater_equal(
+                x_ndims,
+                expected_batch_event_ndims,
+                message=("Broadcasting is not supported; too few "
+                         "batch and event dims."),
+                name="assert_batch_and_event_ndims_large_enough"),
+        ]
+
+      if (self.batch_shape.is_fully_defined() and
+          self.event_shape.is_fully_defined()):
+        expected_batch_event_shape = np.int32(self.batch_shape.concatenate(
+            self.event_shape).as_list())
+      else:
+        expected_batch_event_shape = array_ops.concat([
+            self.batch_shape_tensor(),
+            self.event_shape_tensor(),
+        ], axis=0)
+
+      sample_ndims = x_ndims - expected_batch_event_ndims
+      if isinstance(sample_ndims, int):
+        sample_ndims = max(sample_ndims, 0)
+      if (isinstance(sample_ndims, int) and
+          x.shape[sample_ndims:].is_fully_defined()):
+        actual_batch_event_shape = np.int32(x.shape[sample_ndims:].as_list())
+      else:
+        sample_ndims = math_ops.maximum(sample_ndims, 0)
+        actual_batch_event_shape = array_ops.shape(x)[sample_ndims:]
+
+      if (isinstance(expected_batch_event_shape, np.ndarray) and
+          isinstance(actual_batch_event_shape, np.ndarray)):
+        if any(expected_batch_event_shape != actual_batch_event_shape):
+          raise NotImplementedError("Broadcasting is not supported; "
+                                    "unexpected batch and event shape "
+                                    "(expected {}, saw {}).".format(
+                                        expected_batch_event_shape,
+                                        actual_batch_event_shape))
+        # We need to set the final runtime-assertions to `ndims_assertion` since
+        # its possible this assertion was created. We could add a condition to
+        # only do so if `self.validate_args == True`, however this is redundant
+        # as `ndims_assertion` already encodes this information.
+        runtime_assertions = ndims_assertion
+      elif self.validate_args:
+        # We need to make the `ndims_assertion` a control dep because otherwise
+        # TF itself might raise an exception owing to this assertion being
+        # ill-defined, ie, one cannot even compare different rank Tensors.
+        with ops.control_dependencies(ndims_assertion):
+          shape_assertion = check_ops.assert_equal(
+              expected_batch_event_shape,
+              actual_batch_event_shape,
+              message=("Broadcasting is not supported; "
+                       "unexpected batch and event shape."),
+              name="assert_batch_and_event_shape_same")
+        runtime_assertions = [shape_assertion]
+      else:
+        runtime_assertions = []
+
+      return runtime_assertions
+
+
+def validate_init_args(
+    distribution,
+    batch_shape,
+    validate_args,
+    batch_shape_static):
+  """Helper to __init__ which makes or raises assertions."""
+  with ops.name_scope(name="validate_init_args",
+                      values=[batch_shape] + distribution._graph_parents):  # pylint: disable=protected-access
+    runtime_assertions = []
+
+    if batch_shape.shape.ndims is not None:
+      if batch_shape.shape.ndims != 1:
+        raise ValueError("`batch_shape` must be a vector "
+                         "(saw rank: {}).".format(
+                             batch_shape.shape.ndims))
+    elif validate_args:
+      runtime_assertions += [
+          check_ops.assert_rank(
+              batch_shape,
+              1,
+              message="`batch_shape` must be a vector.",
+              name="assert_batch_shape_is_vector"),
+      ]
+
+    batch_size_static = np.prod(batch_shape_static)
+    dist_batch_size_static = (
+        None if not distribution.batch_shape.is_fully_defined()
+        else np.prod(distribution.batch_shape).value)
+
+    if batch_size_static is not None and dist_batch_size_static is not None:
+      if batch_size_static != dist_batch_size_static:
+        raise ValueError("`batch_shape` size ({}) must match "
+                         "`distribution.batch_shape` size ({}).".format(
+                             batch_size_static,
+                             dist_batch_size_static))
+    elif validate_args:
+      runtime_assertions += [
+          check_ops.assert_equal(
+              math_ops.reduce_prod(batch_shape),
+              math_ops.reduce_prod(distribution.batch_shape_tensor()),
+              message=("`batch_shape` size must match "
+                       "`distributions.batch_shape` size."),
+              name="assert_batch_size"),
+      ]
+
+    if batch_shape_static is not None:
+      if np.any(batch_shape_static < 1):
+        raise ValueError("`batch_shape` elements must be positive "
+                         "(i.e., larger than zero).")
+    elif validate_args:
+      runtime_assertions += [
+          check_ops.assert_positive(
+              batch_shape,
+              message=("`batch_shape` elements must be positive "
+                       "(i.e., larger than zero)."),
+              name="assert_batch_shape_positive")
+      ]
+
+    return runtime_assertions
diff --git a/tensorflow/contrib/distributions/python/ops/estimator.py b/tensorflow/contrib/distributions/python/ops/estimator.py
index 6b53338c4542c75d3977c075b7750c780080ac48..98edd337fe02ffbf53c6ecd9ebda9424231ea2fe 100644
--- a/tensorflow/contrib/distributions/python/ops/estimator.py
+++ b/tensorflow/contrib/distributions/python/ops/estimator.py
@@ -75,7 +75,7 @@ def estimator_head_distribution_regression(make_distribution_fn,
 
 
 class _DistributionRegressionHead(_RegressionHead):
-  """Creates a _RegressionHead instance from an arbitray `Distribution`."""
+  """Creates a _RegressionHead instance from an arbitrary `Distribution`."""
 
   def __init__(self,
                make_distribution_fn,
diff --git a/tensorflow/contrib/distributions/python/ops/independent.py b/tensorflow/contrib/distributions/python/ops/independent.py
index 7dcb3e3ac4db1855adacb7ec0fa8554c45d9c859..b1bacb91b03093fa93a7e5f7eb855dc944dafb44 100644
--- a/tensorflow/contrib/distributions/python/ops/independent.py
+++ b/tensorflow/contrib/distributions/python/ops/independent.py
@@ -36,7 +36,7 @@ class Independent(distribution_lib.Distribution):
 
   This distribution is useful for regarding a collection of independent,
   non-identical distributions as a single random variable. For example, the
-  `Indpendent` distribution composed of a collection of `Bernoulli`
+  `Independent` distribution composed of a collection of `Bernoulli`
   distributions might define a distribution over an image (where each
   `Bernoulli` is a distribution over each pixel).
 
diff --git a/tensorflow/contrib/distributions/python/ops/onehot_categorical.py b/tensorflow/contrib/distributions/python/ops/onehot_categorical.py
index 46c2cc8b7a8c536a90176fbb2b2d52fed61e4705..e3e40b2e9ca232b9970768f21fb95887fdf0df2d 100644
--- a/tensorflow/contrib/distributions/python/ops/onehot_categorical.py
+++ b/tensorflow/contrib/distributions/python/ops/onehot_categorical.py
@@ -52,7 +52,7 @@ class OneHotCategorical(distribution.Distribution):
 
   #### Examples
 
-  Creates a 3-class distiribution, with the 2nd class, the most likely to be
+  Creates a 3-class distribution, with the 2nd class, the most likely to be
   drawn from.
 
   ```python
@@ -60,7 +60,7 @@ class OneHotCategorical(distribution.Distribution):
   dist = OneHotCategorical(probs=p)
   ```
 
-  Creates a 3-class distiribution, with the 2nd class the most likely to be
+  Creates a 3-class distribution, with the 2nd class the most likely to be
   drawn from, using logits.
 
   ```python
diff --git a/tensorflow/contrib/distributions/python/ops/relaxed_bernoulli.py b/tensorflow/contrib/distributions/python/ops/relaxed_bernoulli.py
index b525809015537ac8c7ee701c100fba6541fe2e92..e454a53c6275e0c60edd8c87b1c3be670f2b22de 100644
--- a/tensorflow/contrib/distributions/python/ops/relaxed_bernoulli.py
+++ b/tensorflow/contrib/distributions/python/ops/relaxed_bernoulli.py
@@ -35,10 +35,10 @@ class RelaxedBernoulli(transformed_distribution.TransformedDistribution):
 
   The RelaxedBernoulli is a distribution over the unit interval (0,1), which
   continuously approximates a Bernoulli. The degree of approximation is
-  controlled by a temperature: as the temperaturegoes to 0 the RelaxedBernoulli
-  becomes discrete with a distribution described by the `logits` or `probs`
-  parameters, as the temperature goes to infinity the RelaxedBernoulli
-  becomes the constant distribution that is identically 0.5.
+  controlled by a temperature: as the temperature goes to 0 the
+  RelaxedBernoulli becomes discrete with a distribution described by the
+  `logits` or `probs` parameters, as the temperature goes to infinity the
+  RelaxedBernoulli becomes the constant distribution that is identically 0.5.
 
   The RelaxedBernoulli distribution is a reparameterized continuous
   distribution that is the binary special case of the RelaxedOneHotCategorical
diff --git a/tensorflow/contrib/distributions/python/ops/relaxed_onehot_categorical.py b/tensorflow/contrib/distributions/python/ops/relaxed_onehot_categorical.py
index ff33f327c7a77597e516208cacad8c4aed65d1c9..f56ba0781604cb5a4fb3070b79aa86e09ceb6766 100644
--- a/tensorflow/contrib/distributions/python/ops/relaxed_onehot_categorical.py
+++ b/tensorflow/contrib/distributions/python/ops/relaxed_onehot_categorical.py
@@ -303,7 +303,7 @@ class RelaxedOneHotCategorical(
   The RelaxedOneHotCategorical is a distribution over random probability
   vectors, vectors of positive real values that sum to one, which continuously
   approximates a OneHotCategorical. The degree of approximation is controlled by
-  a temperature: as the temperaturegoes to 0 the RelaxedOneHotCategorical
+  a temperature: as the temperature goes to 0 the RelaxedOneHotCategorical
   becomes discrete with a distribution described by the `logits` or `probs`
   parameters, as the temperature goes to infinity the RelaxedOneHotCategorical
   becomes the constant distribution that is identically the constant vector of
diff --git a/tensorflow/contrib/distributions/python/ops/sample_stats.py b/tensorflow/contrib/distributions/python/ops/sample_stats.py
index dfc813361977c159d8d48f9d5b9ff03db5b4acdc..f5aaa5cf34abde3ea4d25de1ecf3adaef3f2a770 100644
--- a/tensorflow/contrib/distributions/python/ops/sample_stats.py
+++ b/tensorflow/contrib/distributions/python/ops/sample_stats.py
@@ -25,6 +25,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import clip_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
@@ -301,13 +302,16 @@ def percentile(x,
 
   with ops.name_scope(name, [x, q]):
     x = ops.convert_to_tensor(x, name="x")
-    q = math_ops.to_float(q, name="q")
+    # Double is needed here and below, else we get the wrong index if the array
+    # is huge along axis.
+    q = math_ops.to_double(q, name="q")
     _get_static_ndims(q, expect_ndims=0)
 
     if validate_args:
       q = control_flow_ops.with_dependencies([
-          check_ops.assert_rank(q, 0), check_ops.assert_greater_equal(q, 0.),
-          check_ops.assert_less_equal(q, 100.)
+          check_ops.assert_rank(q, 0),
+          check_ops.assert_greater_equal(q, math_ops.to_double(0.)),
+          check_ops.assert_less_equal(q, math_ops.to_double(100.))
       ], q)
 
     if axis is None:
@@ -332,7 +336,7 @@ def percentile(x,
       y = _move_dims_to_flat_end(x, axis, x_ndims)
 
     frac_at_q_or_above = 1. - q / 100.
-    d = math_ops.to_float(array_ops.shape(y)[-1])
+    d = math_ops.to_double(array_ops.shape(y)[-1])
 
     if interpolation == "lower":
       index = math_ops.ceil((d - 1) * frac_at_q_or_above)
@@ -341,12 +345,18 @@ def percentile(x,
     elif interpolation == "nearest":
       index = math_ops.round((d - 1) * frac_at_q_or_above)
 
+    # If d is gigantic, then we would have d == d - 1, even in double... So
+    # let's use max/min to avoid out of bounds errors.
+    d = array_ops.shape(y)[-1]
+    # d - 1 will be distinct from d in int32.
+    index = clip_ops.clip_by_value(math_ops.to_int32(index), 0, d - 1)
+
     # Sort everything, not just the top 'k' entries, which allows multiple calls
     # to sort only once (under the hood) and use CSE.
     sorted_y = _sort_tensor(y)
 
     # result.shape = B
-    result = sorted_y[..., math_ops.to_int32(index)]
+    result = sorted_y[..., index]
     result.set_shape(y.get_shape()[:-1])
 
     if keep_dims:
diff --git a/tensorflow/contrib/distributions/python/ops/seed_stream.py b/tensorflow/contrib/distributions/python/ops/seed_stream.py
new file mode 100644
index 0000000000000000000000000000000000000000..056d349688511e19a4fa3d58a5b3c1c8355671a3
--- /dev/null
+++ b/tensorflow/contrib/distributions/python/ops/seed_stream.py
@@ -0,0 +1,228 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Local PRNG for amplifying seed entropy into seeds for base operations."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import hashlib
+
+
+class SeedStream(object):
+  """Local PRNG for amplifying seed entropy into seeds for base operations.
+
+  Writing sampling code which correctly sets the pseudo-random number
+  generator (PRNG) seed is surprisingly difficult.  This class serves as
+  a helper for the TensorFlow Probability coding pattern designed to
+  avoid common mistakes.
+
+  # Motivating Example
+
+  A common first-cut implementation of a sampler for the beta
+  distribution is to compute the ratio of a gamma with itself plus
+  another gamma.  This code snippet tries to do that, but contains a
+  surprisingly common error:
+
+  ```python
+  def broken_beta(shape, alpha, beta, seed):
+    x = tf.random_gamma(shape, alpha, seed=seed)
+    y = tf.random_gamma(shape, beta, seed=seed)
+    return x / (x + y)
+  ```
+
+  The mistake is that the two gamma draws are seeded with the same
+  seed.  This causes them to always produce the same results, which,
+  in turn, leads this code snippet to always return `0.5`.  Because it
+  can happen across abstraction boundaries, this kind of error is
+  surprisingly easy to make when handling immutable seeds.
+
+  # Goals
+
+  TensorFlow Probability adopts a code style designed to eliminate the
+  above class of error, without exacerbating others.  The goals of
+  this code style are:
+
+  - Support reproducibility of results (by encouraging seeding of all
+    pseudo-random operations).
+
+  - Avoid shared-write global state (by not relying on a global PRNG).
+
+  - Prevent accidental seed reuse by TF Probability implementers.  This
+    goal is served with the local pseudo-random seed generator provided
+    in this module.
+
+  - Mitigate potential accidental seed reuse by TF Probability clients
+    (with a salting scheme).
+
+  - Prevent accidental resonances with downstream PRNGs (by hashing the
+    output).
+
+  ## Non-goals
+
+  - Implementing a high-performance PRNG for generating large amounts of
+    entropy.  That's the job of the underlying TensorFlow PRNG we are
+    seeding.
+
+  - Avoiding random seed collisions, aka "birthday attacks".
+
+  # Code pattern
+
+  ```python
+  def random_beta(shape, alpha, beta, seed):        # (a)
+    seed = SeedStream(seed, salt="random_beta")     # (b)
+    x = tf.random_gamma(shape, alpha, seed=seed())  # (c)
+    y = tf.random_gamma(shape, beta, seed=seed())   # (c)
+    return x / (x + y)
+  ```
+
+  The elements of this pattern are:
+
+  - Accept an explicit seed (line a) as an argument in all public
+    functions, and write the function to be deterministic (up to any
+    numerical issues) for fixed seed.
+
+    - Rationale: This provides the client with the ability to reproduce
+      results.  Accepting an immutable seed rather than a mutable PRNG
+      object reduces code coupling, permitting different sections to be
+      reproducible independently.
+
+  - Use that seed only to initialize a local `SeedStream` instance (line b).
+
+    - Rationale: Avoids accidental seed reuse.
+
+  - Supply the name of the function being implemented as a salt to the
+    `SeedStream` instance (line b).  This serves to keep the salts
+    unique; unique salts ensure that clients of TF Probability will see
+    different functions always produce independent results even if
+    called with the same seeds.
+
+  - Seed each callee operation with the output of a unique call to the
+    `SeedStream` instance (lines c).  This ensures reproducibility of
+    results while preventing seed reuse across callee invocations.
+
+  # Why salt?
+
+  Salting the `SeedStream` instances (with unique salts) is defensive
+  programming against a client accidentally committing a mistake
+  similar to our motivating example.  Consider the following situation
+  that might arise without salting:
+
+  ```python
+  def tfp_foo(seed):
+    seed = SeedStream(seed, salt="")
+    foo_stuff = tf.random_normal(seed=seed())
+    ...
+
+  def tfp_bar(seed):
+    seed = SeedStream(seed, salt="")
+    bar_stuff = tf.random_normal(seed=seed())
+    ...
+
+  def client_baz(seed):
+    foo = tfp_foo(seed=seed)
+    bar = tfp_bar(seed=seed)
+    ...
+  ```
+
+  The client should have used different seeds as inputs to `foo` and
+  `bar`.  However, because they didn't, *and because `foo` and `bar`
+  both sample a Gaussian internally as their first action*, the
+  internal `foo_stuff` and `bar_stuff` will be the same, and the
+  returned `foo` and `bar` will not be independent, leading to subtly
+  incorrect answers from the client's simulation.  This kind of bug is
+  particularly insidious for the client, because it depends on a
+  Distributions implementation detail, namely the order in which `foo`
+  and `bar` invoke the samplers they depend on.  In particular, a
+  Bayesflow team member can introduce such a bug in previously
+  (accidentally) correct client code by performing an internal
+  refactoring that causes this operation order alignment.
+
+  A salting discipline eliminates this problem by making sure that the
+  seeds seen by `foo`'s callees will differ from those seen by `bar`'s
+  callees, even if `foo` and `bar` are invoked with the same input
+  seed.
+  """
+
+  def __init__(self, seed, salt):
+    """Initializes a `SeedStream`.
+
+    Args:
+      seed: Any Python object convertible to string, supplying the
+        initial entropy.  If `None`, operations seeded with seeds
+        drawn from this `SeedStream` will follow TensorFlow semantics
+        for not being seeded.
+      salt: Any Python object convertible to string, supplying
+        auxiliary entropy.  Must be unique across the Distributions
+        and TensorFlow Probability code base.  See class docstring for
+        rationale.
+    """
+    self._seed = seed
+    self._salt = salt
+    self._counter = 0
+
+  def __call__(self):
+    """Returns a fresh integer usable as a seed in downstream operations.
+
+    If this `SeedStream` was initialized with `seed=None`, returns
+    `None`.  This has the effect that downstream operations (both
+    `SeedStream`s and primitive TensorFlow ops) will behave as though
+    they were unseeded.
+
+    The returned integer is non-negative, and uniformly distributed in
+    the half-open interval `[0, 2**512)`.  This is consistent with
+    TensorFlow, as TensorFlow operations internally use the residue of
+    the given seed modulo `2**31 - 1` (see
+    `tensorflow/python/framework/random_seed.py`).
+
+    Returns:
+      seed: A fresh integer usable as a seed in downstream operations,
+        or `None`.
+    """
+    self._counter += 1
+    if self._seed is None:
+      return None
+    composite = str((self._seed, self._counter, self._salt)).encode("utf-8")
+    return int(hashlib.sha512(composite).hexdigest(), 16)
+
+  @property
+  def original_seed(self):
+    return self._seed
+
+  @property
+  def salt(self):
+    return self._salt
+
+# Design rationales for the SeedStream class
+#
+# - Salts are accepted for the reason given above to supply them.
+#
+# - A `None` seed propagates to downstream seeds, so they exhibit
+#   their "unseeded" behavior.
+#
+# - The return value is a Python int so it can be passed directly to
+#   TensorFlow operations as a seed.  It is large to avoid losing seed
+#   space needlessly (TF will internally read only the last 31 bits).
+#
+# - The output is hashed with a crypto-grade hash function as a form
+#   of defensive programming: this reliably prevents all possible
+#   accidental resonances with all possible downstream PRNGs.  The
+#   specific function used is not important; SHA512 was ready to hand.
+#
+# - The internal state update is a simple counter because (a) given
+#   that the output is hashed anyway, this is enough, and (b) letting
+#   it be this predictable permits a future "generate many seeds in
+#   parallel" operation whose results would agree with running
+#   sequentially.
diff --git a/tensorflow/contrib/distributions/python/ops/statistical_testing.py b/tensorflow/contrib/distributions/python/ops/statistical_testing.py
index d66c34cc1a45cc09da5138a5f72ae3817690db49..9b9fff0afabda7e08e4cadbd8b524c3ecceb73a2 100644
--- a/tensorflow/contrib/distributions/python/ops/statistical_testing.py
+++ b/tensorflow/contrib/distributions/python/ops/statistical_testing.py
@@ -12,7 +12,114 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Statistical test assertions calibrated for their error rates."""
+"""Statistical test assertions calibrated for their error rates.
+
+Statistical tests have an inescapable probability of error: a correct
+sampler can still fail a test by chance, and an incorrect sampler can
+still pass a test by chance.  This library is about bounding both of
+those error rates.  This requires admitting a task-specific notion of
+"discrepancy": Correct code will fail rarely, code that misbehaves by
+more than the discrepancy will pass rarely, and nothing reliable can
+be said about code that misbehaves, but misbehaves by less than the
+discrepancy.
+
+# Example
+
+Consider testing that the mean of a scalar probability distribution P
+is some expected constant.  Suppose the support of P is the interval
+`[0, 1]`.  Then you might do this:
+
+```python
+tfd = tf.contrib.distributions
+
+expected_mean = ...
+num_samples = 5000
+samples = ... draw 5000 samples from P
+
+# Check that the mean looks right
+check1 = tfd.assert_true_mean_equal_by_dkwm(
+    samples, low=0., high=1., expected=expected_mean,
+    false_fail_rate=1e-6)
+
+# Check that the difference in means detectable with 5000 samples is
+# small enough
+check2 = tf.assert_less(
+    tfd.min_discrepancy_of_true_means_detectable_by_dkwm(
+        num_samples, low=0., high=1.0,
+        false_fail_rate=1e-6, false_pass_rate=1e-6),
+    0.01)
+
+# Be sure to execute both assertion ops
+sess.run([check1, check2])
+```
+
+The second assertion is an instance of experiment design.  It's a
+deterministic computation (independent of the code under test) that
+checks that `5000` samples is enough to reliably resolve mean
+differences of `0.01` or more.  Here "reliably" means that if the code
+under test is correct, the probability of drawing an unlucky sample
+that causes this test to fail is at most 1e-6; and if the code under
+test is incorrect enough that its true mean is 0.01 more or less than
+expected, then the probability of drawing a "lucky" sample that causes
+the test to false-pass is also at most 1e-6.
+
+# Overview
+
+Every function in this library can be characterized in terms of:
+
+- The property being tested, such as the full density of the
+  distribution under test, or just its true mean, or a single
+  Bernoulli probability, etc.
+
+- The relation being asserted, e.g., whether the mean is less, more,
+  or equal to the given expected value.
+
+- The stochastic bound being relied upon, such as the
+  [Dvoretzky-Kiefer-Wolfowitz-Massart inequality]
+  (https://en.wikipedia.org/wiki/CDF-based_nonparametric_confidence_interval)
+  or the CDF of the binomial distribution (for assertions about
+  Bernoulli probabilities).
+
+- The number of sample sets in the statistical test.  For example,
+  testing equality of means has a one-sample variant, where the
+  expected mean is given exactly, and a two-sample variant, where the
+  expected mean is itself given by a set of samples (e.g., from an
+  alternative algorithm).
+
+- What operation(s) of the test are to be performed.  Each test has
+  three of these:
+
+  1. `assert` executes the test.  Specifically, it creates a TF op that
+     produces an error if it has enough evidence to prove that the
+     property under test is violated.  These functions depend on the
+     desired false failure rate, because that determines the sizes of
+     appropriate confidence intervals, etc.
+
+  2. `min_discrepancy` computes the smallest difference reliably
+     detectable by that test, given the sample count and error rates.
+     What it's a difference of is test-specific.  For example, a test
+     for equality of means would make detection guarantees about the
+     difference the true means.
+
+  3. `min_num_samples` computes the minimum number of samples needed
+     to reliably detect a given discrepancy with given error rates.
+
+  The latter two are for experimental design, and are meant to be
+  usable either interactively or inline in the overall test method.
+
+This library follows a naming convention, to make room for every
+combination of the above.  A name mentions the operation first, then
+the property, then the relation, then the bound, then, if the test
+takes more than one set of samples, a token indicating this.  For
+example, `assert_true_mean_equal_by_dkwm` (which is implicitly
+one-sample).  Each name is a grammatically sound noun phrase (or verb
+phrase, for the asserts).
+
+# Asymptotic properties
+
+The number of samples needed tends to scale as `O(1/discrepancy**2)` and
+as `O(log(1/error_rate))`.
+"""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -40,7 +147,7 @@ __all__ = [
 
 
 def _batch_sort_vector(x, ascending=True, name=None):
-  with ops.name_scope(name, "sort_each_row", [x]):
+  with ops.name_scope(name, "_batch_sort_vector", [x]):
     x = ops.convert_to_tensor(x, name="x")
     n = array_ops.shape(x)[-1]
     if ascending:
@@ -127,7 +234,7 @@ def _maximum_mean(samples, envelope, high, name=None):
     envelope = ops.convert_to_tensor(envelope, name="envelope")
     high = ops.convert_to_tensor(high, name="high")
 
-    xmax = math_ops.reduce_max(samples, axis=[-1])
+    xmax = math_ops.reduce_max(samples, axis=[0])
     msg = "Given sample maximum value exceeds expectations"
     check_op = check_ops.assert_less_equal(xmax, high, message=msg)
     with ops.control_dependencies([check_op]):
@@ -172,7 +279,7 @@ def _minimum_mean(samples, envelope, low, name=None):
     envelope = ops.convert_to_tensor(envelope, name="envelope")
     low = ops.convert_to_tensor(low, name="low")
 
-    xmin = math_ops.reduce_min(samples, axis=[-1])
+    xmin = math_ops.reduce_min(samples, axis=[0])
     msg = "Given sample minimum value falls below expectations"
     check_op = check_ops.assert_greater_equal(xmin, low, message=msg)
     with ops.control_dependencies([check_op]):
@@ -212,8 +319,8 @@ def _dkwm_cdf_envelope(n, error_rate, name=None):
     return math_ops.sqrt(-gen_math_ops.log(error_rate / 2.) / (2. * n))
 
 
-def _check_shape_dominates(tensor, tensors):
-  """Check that broadcasting `tensor` against `tensors` does not expand it.
+def _check_shape_dominates(samples, parameters):
+  """Check that broadcasting `samples` against `parameters` does not expand it.
 
   Why?  Because I want to be very sure that the samples tensor is not
   accidentally enlarged by broadcasting against tensors that are
@@ -221,24 +328,27 @@ def _check_shape_dominates(tensor, tensors):
   sample counts end up inflated.
 
   Args:
-    tensor: A Tensor whose shape is to be protected against broadcasting.
-    tensors: A list of Tensors to check
+    samples: A Tensor whose shape is to be protected against broadcasting.
+    parameters: A list of Tensors who are parameters for the statistical test.
 
   Returns:
-    tensor: `tf.identity(tensor)` with control dependencies attached;
-      be sure to use that downstream.
+    samples: Return original `samples` with control dependencies attached
+      to ensure no broadcasting.
   """
   def check(t):
-    target = array_ops.shape(tensor)[1:]
-    result = array_ops.broadcast_dynamic_shape(target, array_ops.shape(t))
+    samples_batch_shape = array_ops.shape(samples)[1:]
+    broadcasted_batch_shape = array_ops.broadcast_dynamic_shape(
+        samples_batch_shape, array_ops.shape(t))
     # This rank check ensures that I don't get a wrong answer from the
     # _shapes_ broadcasting against each other.
-    gt = check_ops.assert_greater(array_ops.rank(target), array_ops.rank(t))
-    eq = check_ops.assert_equal(target, result)
-    return gt, eq
-  checks = list(itertools.chain(*[check(t) for t in tensors]))
+    samples_batch_ndims = array_ops.size(samples_batch_shape)
+    ge = check_ops.assert_greater_equal(
+        samples_batch_ndims, array_ops.rank(t))
+    eq = check_ops.assert_equal(samples_batch_shape, broadcasted_batch_shape)
+    return ge, eq
+  checks = list(itertools.chain(*[check(t) for t in parameters]))
   with ops.control_dependencies(checks):
-    return array_ops.identity(array_ops.identity(tensor))
+    return array_ops.identity(samples)
 
 
 def true_mean_confidence_interval_by_dkwm(
@@ -577,9 +687,13 @@ def assert_true_mean_equal_by_dkwm_two_sample(
       # I want to assert
       #   not (max_mean_1 < min_mean_2 or min_mean_1 > max_mean_2),
       # but I think I only have and-combination of asserts, so use DeMorgan.
-      clause1_op = check_ops.assert_greater_equal(max_mean_1, min_mean_2)
-      with ops.control_dependencies([clause1_op]):
-        return check_ops.assert_less_equal(min_mean_1, max_mean_2)
+      check_confidence_intervals_can_intersect = check_ops.assert_greater_equal(
+          max_mean_1, min_mean_2, message="Confidence intervals do not "
+          "intersect: samples1 has a smaller mean than samples2")
+      with ops.control_dependencies([check_confidence_intervals_can_intersect]):
+        return check_ops.assert_less_equal(
+            min_mean_1, max_mean_2, message="Confidence intervals do not "
+            "intersect: samples2 has a smaller mean than samples1")
 
 
 def min_discrepancy_of_true_means_detectable_by_dkwm_two_sample(
diff --git a/tensorflow/contrib/distributions/python/ops/vector_student_t.py b/tensorflow/contrib/distributions/python/ops/vector_student_t.py
index 8c67647a618d22a58428d78865c4ebf7d98bdf9e..887981d64ef077e2636f8031581c390f177edac8 100644
--- a/tensorflow/contrib/distributions/python/ops/vector_student_t.py
+++ b/tensorflow/contrib/distributions/python/ops/vector_student_t.py
@@ -66,7 +66,7 @@ class _VectorStudentT(transformed_distribution.TransformedDistribution):
   This distribution is an Affine transformation of iid
   [Student's t-distributions](
   https://en.wikipedia.org/wiki/Student%27s_t-distribution)
-  and should not be confused with the [Multivate Student's t-distribution](
+  and should not be confused with the [Multivariate Student's t-distribution](
   https://en.wikipedia.org/wiki/Multivariate_t-distribution). The
   traditional Multivariate Student's t-distribution is type of
   [elliptical distribution](
diff --git a/tensorflow/contrib/eager/README.md b/tensorflow/contrib/eager/README.md
index 9d2ca07c3a25fa7acb9b0f5806b763d9a57b51fa..9a3b780af888a597d2440b243ffb8dc98d764f18 100644
--- a/tensorflow/contrib/eager/README.md
+++ b/tensorflow/contrib/eager/README.md
@@ -1,12 +1,8 @@
 # Eager Execution
 
-> *WARNING*: This is a preview/pre-alpha version. The API and performance
-> characteristics are subject to change.
-
-Eager execution is an experimental interface to TensorFlow that provides an
-imperative programming style (à la [NumPy](http://www.numpy.org)). When you
-enable eager execution, TensorFlow operations execute immediately; you do not
-execute a pre-constructed graph with
+Eager execution provides an imperative interface to TensorFlow (similiar to
+[NumPy](http://www.numpy.org)). When you enable eager execution, TensorFlow
+operations execute immediately; you do not execute a pre-constructed graph with
 [`Session.run()`](https://www.tensorflow.org/api_docs/python/tf/Session).
 
 For example, consider a simple computation in TensorFlow:
@@ -33,7 +29,7 @@ print(m)
 ## Caveats
 
 This feature is in early stages and work remains to be done in terms of smooth
-support for distributed and multi-GPU training and CPU performance.
+support for distributed and multi-GPU training and performance.
 
 - [Known issues](https://github.com/tensorflow/tensorflow/issues?q=is%3Aissue%20is%3Aopen%20label%3Acomp%3Aeager)
 - Feedback is welcome, please consider
@@ -41,21 +37,23 @@ support for distributed and multi-GPU training and CPU performance.
 
 ## Installation
 
-Eager execution is included in TensorFlow versions 1.5 and above.
+Eager execution is included in TensorFlow versions 1.7 and above.
 Installation instructions at https://www.tensorflow.org/install/
 
 ## Documentation
 
 For an introduction to eager execution in TensorFlow, see:
 
-- [User Guide](python/g3doc/guide.md)
+- [User Guide](https://www.tensorflow.org/programmers_guide/eager) ([source](../../docs_src/programmers_guide/eager.md))
 - Notebook: [Basic Usage](python/examples/notebooks/1_basics.ipynb)
 - Notebook: [Gradients](python/examples/notebooks/2_gradients.ipynb)
 - Notebook: [Importing Data](python/examples/notebooks/3_datasets.ipynb)
 
 ## Changelog
 
-- 2017/10/31: Initial preview release.
+- 2017/10/31: Initial preview release (in TensorFlow 1.5)
 - 2017/12/01: Example of dynamic neural network:
   [SPINN: Stack-augmented Parser-Interpreter Neural Network](https://arxiv.org/abs/1603.06021).
   See [README.md](python/examples/spinn/README.md) for details.
+- 2017/03: Core functionality moved out of the experimental tf.contrib namespace
+  in TensorFlow 1.7.
diff --git a/tensorflow/contrib/eager/proto/BUILD b/tensorflow/contrib/eager/proto/BUILD
index aedfec8924e7314addd22349c0576a84a58d9aa3..b016d2dcb504044372c895e1eedf3511751bc13e 100644
--- a/tensorflow/contrib/eager/proto/BUILD
+++ b/tensorflow/contrib/eager/proto/BUILD
@@ -4,17 +4,6 @@ exports_files(["LICENSE"])
 
 load("//tensorflow/core:platform/default/build_config.bzl", "tf_proto_library")
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 tf_proto_library(
     name = "checkpointable_object_graph_proto",
     srcs = [
diff --git a/tensorflow/contrib/eager/python/BUILD b/tensorflow/contrib/eager/python/BUILD
index 80176397c02f22095a3a9be3d12c2115ec4eca29..d97048405d1cd50dad857ec03fc81bb00466b2e1 100644
--- a/tensorflow/contrib/eager/python/BUILD
+++ b/tensorflow/contrib/eager/python/BUILD
@@ -80,6 +80,7 @@ cuda_py_test(
         "//tensorflow/python/data",
         "//tensorflow/python/eager:test",
     ],
+    tags = ["noguitar"],
 )
 
 py_library(
@@ -119,13 +120,13 @@ py_library(
     visibility = ["//tensorflow:internal"],
     deps = [
         "//tensorflow/contrib/eager/python:checkpointable_utils",
-        "//tensorflow/contrib/summary:summary_ops",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:init_ops",
         "//tensorflow/python:math_ops",
+        "//tensorflow/python:summary_ops_v2",
         "//tensorflow/python:util",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python/eager:context",
@@ -139,11 +140,11 @@ py_test(
     srcs_version = "PY2AND3",
     deps = [
         ":metrics",
-        "//tensorflow/contrib/summary:summary_ops",
         "//tensorflow/contrib/summary:summary_test_util",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
+        "//tensorflow/python:summary_ops_v2",
         "//tensorflow/python:training",
         "//tensorflow/python/eager:context",
         "//tensorflow/python/eager:test",
@@ -160,10 +161,10 @@ py_library(
     deps = [
         ":datasets",
         ":metrics",
-        "//tensorflow/contrib/summary:summary_ops",
         "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:errors",
         "//tensorflow/python:framework_ops",
+        "//tensorflow/python:summary_ops_v2",
         "//tensorflow/python/eager:context",
         "//tensorflow/python/eager:function",
         "@six_archive//:six",
@@ -271,21 +272,7 @@ cuda_py_test(
         "//tensorflow/python/keras",
     ],
     tags = [
-        "no_oss",  # b/74395663
         "no_windows",  # TODO: needs investigation on Windows
-        "notsan",
+        "notsan",  # b/74395663
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-            "g3doc/sitemap.md",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/eager/python/checkpointable_utils.py b/tensorflow/contrib/eager/python/checkpointable_utils.py
index 91a7aded11db6b4c8bcb061da6d6c69253603c85..34cb8d0e0887bd5e440873bae117bf27597de11b 100644
--- a/tensorflow/contrib/eager/python/checkpointable_utils.py
+++ b/tensorflow/contrib/eager/python/checkpointable_utils.py
@@ -19,6 +19,7 @@ from __future__ import print_function
 
 import abc
 import collections
+import functools
 import weakref
 
 from tensorflow.contrib.eager.proto import checkpointable_object_graph_pb2
@@ -867,3 +868,115 @@ class Checkpoint(core_checkpointable.Checkpointable):
     # initialization when executing eagerly.
     self._maybe_create_save_counter()
     return status
+
+
+class _CallbackSaveable(saver_lib.BaseSaverBuilder.SaveableObject):
+  """Wraps save and restore callbacks as a `SaveableObject`."""
+
+  def __init__(self, name, dtype, save_callback, restore_callback):
+    self._restore_callback = restore_callback
+    spec = saver_lib.BaseSaverBuilder.SaveSpec(
+        tensor=save_callback,
+        slice_spec="",
+        name=name,
+        dtype=dtype)
+    super(_CallbackSaveable, self).__init__(
+        save_callback, [spec], name)
+
+  def restore(self, restored_tensors, restored_shapes):
+    """Restore the same value into both variables."""
+    tensor, = restored_tensors
+    return self._restore_callback(tensor)
+
+
+class _SplitDependency(core_checkpointable.CheckpointableBase):
+  """Looks like a regular variable while synchronizing save/restores."""
+
+  def __init__(self, save_buffer, restore_buffer, name, dtype, num_components,
+               fill_save_buffer_fn, consume_restore_buffer_fn):
+    self._save_buffer = save_buffer
+    self._restore_buffer = restore_buffer
+    self._name = name
+    self._dtype = dtype
+    self._num_components = num_components
+    self._fill_save_buffer_fn = fill_save_buffer_fn
+    self._consume_restore_buffer_fn = consume_restore_buffer_fn
+
+  def _save(self):
+    """Pull from the shared buffer, populating it if necessary."""
+    if self._name not in self._save_buffer:
+      if self._save_buffer:
+        raise AssertionError(
+            ("Split dependency %s (%s) unsynchronized. Split dependencies must "
+             "be saved together.") % (self._name, self))
+      self._fill_save_buffer_fn(self._save_buffer)
+    return self._save_buffer.pop(self._name)
+
+  def _restore(self, tensor):
+    """Push into the shared buffer, flushing it if necessary."""
+    if self._name in self._restore_buffer:
+      raise AssertionError(
+          ("Split dependency %s (%s) unsynchronized. Split dependencies must "
+           "be restored together.") % (self._name, self))
+    self._restore_buffer[self._name] = tensor
+    if len(self._restore_buffer) == self._num_components:
+      op = self._consume_restore_buffer_fn(self._restore_buffer)
+      self._restore_buffer.clear()
+      return op
+    else:
+      return control_flow_ops.no_op()
+
+  def _gather_saveables_for_checkpoint(self):
+    """Looks to Checkpointable like a regular variable."""
+    return {
+        core_checkpointable.VARIABLE_VALUE_KEY:
+        functools.partial(_CallbackSaveable,
+                          dtype=self._dtype,
+                          save_callback=self._save,
+                          restore_callback=self._restore)
+    }
+
+
+def split_dependency(component_names, component_dtypes,
+                     fill_save_buffer_fn, consume_restore_buffer_fn):
+  """Creates multiple dependencies with a synchronized save/restore.
+
+  Useful when a single op produces `Tensor`s which should each be saved under
+  different objects, or when `Tensor`s saved with many different objects need to
+  be restored together as inputs to a single op (i.e. an object which uses a
+  single fused op may be swapped out for a subgraph of objects, and these two
+  programs are checkpoint compatible).
+
+  Args:
+    component_names: A sequence of names for the split
+      dependencies. `fill_save_buffer_fn` must add these keys to the dictionary
+      it is passed, and `consume_restore_buffer_fn` will receive a dictionary
+      with these keys.
+    component_dtypes: Data types for the `Tensor`s being saved and restored, a
+      sequence corresponding to `component_names`.
+    fill_save_buffer_fn: A function which takes an empty dictionary as an
+      argument and adds `Tensor`s with `component_names` as keys. These
+      `Tensor`s will be saved as if they were individual variables.
+    consume_restore_buffer_fn: A function which takes a dictionary with
+      `component_names` as keys mapping to restored individual `Tensor`s and
+      returns a restore op (or if executing eagerly, runs the restoration and
+      may return `None`).
+
+  Returns:
+    A dictionary mapping from names to Checkpointable objects. If one is
+    reachable from an object as a dependency, the others should be too; adding
+    dependencies on some but not all of the objects will result in errors.
+  """
+  save_buffer = {}
+  restore_buffer = {}
+  split_dependencies = {}
+  for name, dtype in zip(component_names, component_dtypes):
+    split_dependencies[name] = _SplitDependency(
+        save_buffer=save_buffer,
+        restore_buffer=restore_buffer,
+        name=name,
+        dtype=dtype,
+        num_components=len(component_names),
+        fill_save_buffer_fn=fill_save_buffer_fn,
+        consume_restore_buffer_fn=consume_restore_buffer_fn)
+  return split_dependencies
diff --git a/tensorflow/contrib/eager/python/checkpointable_utils_test.py b/tensorflow/contrib/eager/python/checkpointable_utils_test.py
index a8c47d76d1682296850c488f09aa6c358c5e6ee1..b344d50e7f3e407d22e5ae35c65c702c9916136a 100644
--- a/tensorflow/contrib/eager/python/checkpointable_utils_test.py
+++ b/tensorflow/contrib/eager/python/checkpointable_utils_test.py
@@ -25,13 +25,16 @@ from tensorflow.contrib.eager.python import checkpointable_utils
 from tensorflow.python.client import session as session_lib
 from tensorflow.python.eager import backprop
 from tensorflow.python.eager import context
+from tensorflow.python.eager import function
 from tensorflow.python.eager import test
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
+from tensorflow.python.keras._impl.keras.engine import sequential
 from tensorflow.python.keras._impl.keras.engine import training
-from tensorflow.python.layers import core
+from tensorflow.python.keras._impl.keras.layers import core
+from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import resource_variable_ops
@@ -68,6 +71,87 @@ class MyModel(training.Model):
     return ret
 
 
+def _split_variable_closure(variable):
+  def _fill_save_buffer_fn(save_buffer):
+    save_buffer["first_half"] = variable[:2]
+    save_buffer["second_half"] = variable[2:]
+  return _fill_save_buffer_fn
+
+
+def _combine_variable_closure(variable):
+  def _consume_restore_buffer_fn(restore_buffer):
+    return variable.assign(
+        array_ops.concat([restore_buffer["first_half"],
+                          restore_buffer["second_half"]],
+                         axis=0))
+  return _consume_restore_buffer_fn
+
+
+class SaveTensorSlicesAsDeps(checkpointable.CheckpointableBase):
+
+  def __init__(self):
+    self.combined = resource_variable_ops.ResourceVariable([0., 0., 0., 0.])
+    split_dependencies = checkpointable_utils.split_dependency(
+        component_names=("first_half", "second_half"),
+        component_dtypes=(self.combined.dtype,) * 2,
+        fill_save_buffer_fn=_split_variable_closure(
+            self.combined),
+        consume_restore_buffer_fn=_combine_variable_closure(
+            self.combined))
+    for name, dep in split_dependencies.items():
+      self._track_checkpointable(dep, name=name)
+
+
+class HasRegularDeps(checkpointable.Checkpointable):
+
+  def __init__(self):
+    self.first_half = resource_variable_ops.ResourceVariable([0., 0.])
+    self.second_half = resource_variable_ops.ResourceVariable([0., 0.])
+
+
+class OnlyOneDep(checkpointable.Checkpointable):
+
+  def __init__(self):
+    self.first_half = resource_variable_ops.ResourceVariable([0., 0.])
+
+
+class SplitTests(test.TestCase):
+
+  @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
+  def testSaveRestoreSplitDep(self):
+    save_checkpoint = checkpointable_utils.Checkpoint(
+        dep=SaveTensorSlicesAsDeps())
+    self.evaluate(save_checkpoint.dep.combined.assign([1., 2., 3., 4.]))
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    save_path = save_checkpoint.save(checkpoint_prefix)
+
+    regular_deps = HasRegularDeps()
+    regular_restore_checkpoint = checkpointable_utils.Checkpoint(
+        dep=regular_deps)
+    regular_restore_checkpoint.restore(
+        save_path).assert_consumed().run_restore_ops()
+    self.assertAllEqual([1., 2.], self.evaluate(regular_deps.first_half))
+    self.assertAllEqual([3., 4.], self.evaluate(regular_deps.second_half))
+
+    one_dep = OnlyOneDep()
+    one_dep_restore_checkpoint = checkpointable_utils.Checkpoint(dep=one_dep)
+    status = one_dep_restore_checkpoint.restore(save_path)
+    with self.assertRaises(AssertionError):
+      # Missing the second dependency.
+      status.assert_consumed()
+    status.run_restore_ops()
+    self.assertAllEqual([1., 2.], self.evaluate(one_dep.first_half))
+
+    restore_checkpoint = checkpointable_utils.Checkpoint()
+    status = restore_checkpoint.restore(save_path)
+    restore_checkpoint.dep = SaveTensorSlicesAsDeps()
+    status.assert_consumed().run_restore_ops()
+    self.assertAllEqual(
+        [1., 2., 3., 4.],
+        self.evaluate(restore_checkpoint.dep.combined))
+
+
 class InterfaceTests(test.TestCase):
 
   @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
@@ -497,6 +581,50 @@ class CheckpointingTests(test.TestCase):
         self.assertEqual(training_continuation + 1,
                          self.evaluate(root.save_counter))
 
+  # pylint: disable=cell-var-from-loop
+  @test_util.run_in_graph_and_eager_modes()
+  def testWithDefun(self):
+    num_training_steps = 2
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    for training_continuation in range(3):
+      with ops.Graph().as_default(), self.test_session(
+          graph=ops.get_default_graph()), test_util.device(use_gpu=True):
+        model = MyModel()
+        # Don't actually train so we can test variable values
+        optimizer = adam.AdamOptimizer(0.)
+        root = checkpointable_utils.Checkpoint(
+            optimizer=optimizer, model=model,
+            global_step=training_util.get_or_create_global_step())
+        checkpoint_path = core_saver.latest_checkpoint(checkpoint_directory)
+        status = root.restore(save_path=checkpoint_path)
+        def train_fn():
+          @function.defun
+          def _call_model(x):
+            return model(x)
+          with backprop.GradientTape() as tape:
+            loss = _call_model(constant_op.constant([[3.]]))
+          gradients = tape.gradient(loss, model.variables)
+          return optimizer.apply_gradients(zip(gradients, model.variables),
+                                           global_step=root.global_step)
+        if not context.executing_eagerly():
+          train_fn = functools.partial(
+              self.evaluate, train_fn())
+        status.initialize_or_restore()
+        for _ in range(num_training_steps):
+          train_fn()
+        if training_continuation > 0:
+          status.assert_consumed()
+          self.assertAllClose([[42.]], self.evaluate(model.variables[0]))
+        else:
+          self.evaluate(model.variables[0].assign([[42.]]))
+        root.save(file_prefix=checkpoint_prefix)
+        self.assertEqual((training_continuation + 1) * num_training_steps,
+                         self.evaluate(root.global_step))
+        self.assertEqual(training_continuation + 1,
+                         self.evaluate(root.save_counter))
+  # pylint: enable=cell-var-from-loop
+
   def _get_checkpoint_name(self, name):
     root = checkpointable.Checkpointable()
     checkpointable_utils.add_variable(
@@ -631,7 +759,7 @@ class CheckpointingTests(test.TestCase):
     status.run_restore_ops()
     self.assertEqual(-14., self.evaluate(loaded_dep_after_var.dep.var))
 
-  @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
+  @test_util.run_in_graph_and_eager_modes()
   def testDeferredSlotRestoration(self):
     checkpoint_directory = self.get_temp_dir()
 
@@ -696,7 +824,7 @@ class CheckpointingTests(test.TestCase):
       self.evaluate(train_op)
     slot_status.assert_consumed()
 
-  @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
+  @test_util.run_in_graph_and_eager_modes()
   def testOverlappingRestores(self):
     checkpoint_directory = self.get_temp_dir()
     save_root = checkpointable.Checkpointable()
@@ -747,7 +875,7 @@ class CheckpointingTests(test.TestCase):
     second_status.run_restore_ops()
     self.assertEqual(12., self.evaluate(load_dep.var))
 
-  @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
+  @test_util.run_in_graph_and_eager_modes()
   def testAmbiguousLoad(self):
     # Not OK to split one checkpoint object into two
     checkpoint_directory = self.get_temp_dir()
@@ -770,7 +898,7 @@ class CheckpointingTests(test.TestCase):
                                  "resolved to different objects"):
       load_root.dep_two.dep_three = checkpointable.Checkpointable()
 
-  @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
+  @test_util.run_in_graph_and_eager_modes()
   def testObjectsCombined(self):
     # Currently fine to load two checkpoint objects into one Python object
     checkpoint_directory = self.get_temp_dir()
@@ -1036,10 +1164,42 @@ class CheckpointingTests(test.TestCase):
         beta1_power, _ = optimizer._get_beta_accumulators()
         self.assertAllEqual(3., self.evaluate(beta1_power))
 
+  @test_util.run_in_graph_and_eager_modes()
+  def test_sequential(self):
+    model = sequential.Sequential()
+    checkpoint = checkpointable_utils.Checkpoint(model=model)
+    model.add(core.Dense(4))
+    second_dense = core.Dense(5)
+    model.add(second_dense)
+    model(constant_op.constant([[1.]]))
+    checkpoint.restore(None).initialize_or_restore()
+    self.evaluate(second_dense.bias.assign(
+        constant_op.constant([1., 2., 3., 4., 5.])))
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    save_path = checkpoint.save(checkpoint_prefix)
+    self.evaluate(second_dense.bias.assign(
+        constant_op.constant([5., 6., 7., 8., 9.])))
+    checkpoint.restore(save_path).assert_consumed().run_restore_ops()
+    self.assertAllEqual([1., 2., 3., 4., 5.], self.evaluate(second_dense.bias))
+
+    deferred_sequential = sequential.Sequential()
+    deferred_sequential_checkpoint = checkpointable_utils.Checkpoint(
+        model=deferred_sequential)
+    status = deferred_sequential_checkpoint.restore(save_path)
+    deferred_sequential.add(core.Dense(4))
+    deferred_sequential(constant_op.constant([[1.]]))
+    deferred_second_dense = core.Dense(5)
+    deferred_sequential.add(deferred_second_dense)
+    deferred_sequential(constant_op.constant([[1.]]))
+    status.run_restore_ops()
+    self.assertAllEqual([1., 2., 3., 4., 5.],
+                        self.evaluate(deferred_second_dense.bias))
+
 
 class TemplateTests(test.TestCase):
 
-  @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
+  @test_util.run_in_graph_and_eager_modes()
   def test_checkpointable_save_restore(self):
 
     def _templated():
@@ -1070,7 +1230,7 @@ class TemplateTests(test.TestCase):
     self.assertAllEqual([13.], self.evaluate(var_plus_one))
     self.assertAllEqual([14.], self.evaluate(var2))
 
-  @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
+  @test_util.run_in_graph_and_eager_modes()
   def test_checkpointable_save_restore_nested(self):
 
     def _inner_template():
diff --git a/tensorflow/contrib/eager/python/datasets.py b/tensorflow/contrib/eager/python/datasets.py
index a4c3283dac9194880a1297371ea7591af6dddb2b..0783d1b5d70e502e6edd80b59f37fdd93b413e12 100644
--- a/tensorflow/contrib/eager/python/datasets.py
+++ b/tensorflow/contrib/eager/python/datasets.py
@@ -71,8 +71,15 @@ class Iterator(iterator_ops.EagerIterator, checkpointable.CheckpointableBase):
       dataset: A `tf.data.Dataset` object.
 
     Raises:
+      TypeError: If `dataset` is an unsupported type.
       RuntimeError: When invoked without eager execution enabled.
     """
+    if isinstance(dataset, prefetching_ops._PrefetchToDeviceDataset):  # pylint: disable=protected-access
+      raise TypeError(
+          "`tf.contrib.data.prefetch_to_device()` is not compatible with "
+          "`tf.contrib.eager.Iterator`. Use `for ... in dataset:` to iterate "
+          "over the dataset instead.")
+
     super(Iterator, self).__init__(dataset)
     if not context.context().device_spec.device_type:
       is_remote_device = False
@@ -98,7 +105,6 @@ class Iterator(iterator_ops.EagerIterator, checkpointable.CheckpointableBase):
             f=remote_fn,
             target_device=target,
             buffer_size=10,
-            thread_pool_size=1,
             container="",
             shared_name=_generate_shared_name("function_buffer_resource"))
         self._buffer_resource_deleter = resource_variable_ops.EagerResourceDeleter(  # pylint: disable=line-too-long
@@ -108,16 +114,20 @@ class Iterator(iterator_ops.EagerIterator, checkpointable.CheckpointableBase):
   def _next_internal(self):
     """Returns a nested structure of `tf.Tensor`s containing the next element.
     """
-    if self._buffer_resource_handle is not None:
-      with ops.device(self._device):
-        ret = prefetching_ops.function_buffering_resource_get_next(
-            function_buffer_resource=self._buffer_resource_handle,
-            output_types=self._flat_output_types)
-      return sparse.deserialize_sparse_tensors(
-          nest.pack_sequence_as(self._output_types, ret), self._output_types,
-          self._output_shapes, self._output_classes)
-    else:
-      return super(Iterator, self)._next_internal()
+    # This runs in sync mode as iterators use an error status to communicate
+    # that there is no more data to iterate over.
+    # TODO(b/77291417): Fix
+    with context.execution_mode(context.SYNC):
+      if self._buffer_resource_handle is not None:
+        with ops.device(self._device):
+          ret = prefetching_ops.function_buffering_resource_get_next(
+              function_buffer_resource=self._buffer_resource_handle,
+              output_types=self._flat_output_types)
+        return sparse.deserialize_sparse_tensors(
+            nest.pack_sequence_as(self._output_types, ret), self._output_types,
+            self._output_shapes, self._output_classes)
+      else:
+        return super(Iterator, self)._next_internal()
 
   # TODO(shivaniagrawal): Expose checkpointable stateful objects from dataset
   # attributes(potential).
diff --git a/tensorflow/contrib/eager/python/datasets_test.py b/tensorflow/contrib/eager/python/datasets_test.py
index c658505de41bb6a0007440f4850fef720c3e97f1..f76a896d3d8d795b5a7a0e97b5f688fb0291575a 100644
--- a/tensorflow/contrib/eager/python/datasets_test.py
+++ b/tensorflow/contrib/eager/python/datasets_test.py
@@ -24,6 +24,7 @@ import time
 import numpy as np
 
 from tensorflow.contrib import lookup
+from tensorflow.contrib.data.python.ops import prefetching_ops
 from tensorflow.contrib.data.python.ops import threadpool
 from tensorflow.contrib.data.python.ops import unique
 from tensorflow.contrib.eager.python import checkpointable_utils
@@ -192,6 +193,18 @@ class IteratorTest(test.TestCase):
       x = math_ops.add(x, x)
     self.assertAllEqual([0., 2.], x.numpy())
 
+  def testTensorsExplicitPrefetchToDevice(self):
+    ds = Dataset.from_tensor_slices([0., 1.])
+    ds = ds.apply(prefetching_ops.prefetch_to_device(test.gpu_device_name()))
+
+    with self.assertRaisesRegexp(TypeError, 'prefetch_to_device'):
+      datasets.Iterator(ds)
+
+    for i, x in enumerate(ds):
+      with ops.device(test.gpu_device_name()):
+        x = math_ops.add(x, x)
+        self.assertEqual(float(i) + float(i), x.numpy())
+
   def testOverrideThreadPool(self):
 
     def get_thread_id(_):
diff --git a/tensorflow/contrib/eager/python/evaluator.py b/tensorflow/contrib/eager/python/evaluator.py
index 37c8f0d47adbde6932bf409cdcae9a1845d700b5..7949a3f6da293abdd85512209242bae76ab4d816 100644
--- a/tensorflow/contrib/eager/python/evaluator.py
+++ b/tensorflow/contrib/eager/python/evaluator.py
@@ -22,12 +22,12 @@ import six
 
 from tensorflow.contrib.eager.python import datasets
 from tensorflow.contrib.eager.python import metrics
-from tensorflow.contrib.summary import summary_ops
 from tensorflow.python.eager import context
 from tensorflow.python.eager import function
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import summary_ops_v2 as summary_ops
 
 
 class Evaluator(object):
diff --git a/tensorflow/contrib/eager/python/examples/linear_regression/BUILD b/tensorflow/contrib/eager/python/examples/linear_regression/BUILD
index f86331af6f7928f0f86c888e22706c6e0a5978b2..2f6cfdf31e852d5d69a7a87980c9a441da504cf2 100644
--- a/tensorflow/contrib/eager/python/examples/linear_regression/BUILD
+++ b/tensorflow/contrib/eager/python/examples/linear_regression/BUILD
@@ -22,6 +22,7 @@ cuda_py_test(
         ":linear_regression",
         "//tensorflow:tensorflow_py",
     ],
+    tags = ["no_windows"],  # TODO: needs investigation on Windows
 )
 
 cuda_py_test(
diff --git a/tensorflow/contrib/eager/python/examples/rnn_ptb/rnn_ptb.py b/tensorflow/contrib/eager/python/examples/rnn_ptb/rnn_ptb.py
index a90048d813bf345e8be32e9674a452175471b268..be5d60449d7e08c99cc28e76befce56f468c77fd 100644
--- a/tensorflow/contrib/eager/python/examples/rnn_ptb/rnn_ptb.py
+++ b/tensorflow/contrib/eager/python/examples/rnn_ptb/rnn_ptb.py
@@ -315,32 +315,37 @@ def main(_):
   have_gpu = tfe.num_gpus() > 0
   use_cudnn_rnn = not FLAGS.no_use_cudnn_rnn and have_gpu
 
-  with tfe.restore_variables_on_create(
-      tf.train.latest_checkpoint(FLAGS.logdir)):
-    with tf.device("/device:GPU:0" if have_gpu else None):
-      # Make learning_rate a Variable so it can be included in the checkpoint
-      # and we can resume training with the last saved learning_rate.
-      learning_rate = tfe.Variable(20.0, name="learning_rate")
-      sys.stderr.write("learning_rate=%f\n" % learning_rate.numpy())
-      model = PTBModel(corpus.vocab_size(), FLAGS.embedding_dim,
-                       FLAGS.hidden_dim, FLAGS.num_layers, FLAGS.dropout,
-                       use_cudnn_rnn)
-      optimizer = tf.train.GradientDescentOptimizer(learning_rate)
-
-      best_loss = None
-      for _ in range(FLAGS.epoch):
-        train(model, optimizer, train_data, FLAGS.seq_len, FLAGS.clip)
-        eval_loss = evaluate(model, eval_data)
-        if not best_loss or eval_loss < best_loss:
-          if FLAGS.logdir:
-            tfe.Saver(model.trainable_weights + [learning_rate]).save(
-                os.path.join(FLAGS.logdir, "ckpt"))
-          best_loss = eval_loss
-        else:
-          learning_rate.assign(learning_rate / 4.0)
-          sys.stderr.write("eval_loss did not reduce in this epoch, "
-                           "changing learning rate to %f for the next epoch\n" %
-                           learning_rate.numpy())
+  with tf.device("/device:GPU:0" if have_gpu else None):
+    # Make learning_rate a Variable so it can be included in the checkpoint
+    # and we can resume training with the last saved learning_rate.
+    learning_rate = tfe.Variable(20.0, name="learning_rate")
+    model = PTBModel(corpus.vocab_size(), FLAGS.embedding_dim,
+                     FLAGS.hidden_dim, FLAGS.num_layers, FLAGS.dropout,
+                     use_cudnn_rnn)
+    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
+    checkpoint = tfe.Checkpoint(
+        learning_rate=learning_rate, model=model,
+        # GradientDescentOptimizer has no state to checkpoint, but noting it
+        # here lets us swap in an optimizer that does.
+        optimizer=optimizer)
+    # Restore existing variables now (learning_rate), and restore new variables
+    # on creation if a checkpoint exists.
+    checkpoint.restore(tf.train.latest_checkpoint(FLAGS.logdir))
+    sys.stderr.write("learning_rate=%f\n" % learning_rate.numpy())
+
+    best_loss = None
+    for _ in range(FLAGS.epoch):
+      train(model, optimizer, train_data, FLAGS.seq_len, FLAGS.clip)
+      eval_loss = evaluate(model, eval_data)
+      if not best_loss or eval_loss < best_loss:
+        if FLAGS.logdir:
+          checkpoint.save(os.path.join(FLAGS.logdir, "ckpt"))
+        best_loss = eval_loss
+      else:
+        learning_rate.assign(learning_rate / 4.0)
+        sys.stderr.write("eval_loss did not reduce in this epoch, "
+                         "changing learning rate to %f for the next epoch\n" %
+                         learning_rate.numpy())
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/eager/python/examples/spinn/spinn_test.py b/tensorflow/contrib/eager/python/examples/spinn/spinn_test.py
index 667365341829124060b724b8a5d6e542149ba704..9adf47d505fc2933d9c009e5863351bd123c3797 100644
--- a/tensorflow/contrib/eager/python/examples/spinn/spinn_test.py
+++ b/tensorflow/contrib/eager/python/examples/spinn/spinn_test.py
@@ -173,7 +173,7 @@ class SpinnTest(test_util.TensorFlowTestCase):
         right_in.append(tf.random_normal((1, size * 2)))
         tracking.append(tf.random_normal((1, tracker_size * 2)))
 
-      out = reducer(left_in, right_in=right_in, tracking=tracking)
+      out = reducer(left_in, right_in, tracking=tracking)
       self.assertEqual(batch_size, len(out))
       self.assertEqual(tf.float32, out[0].dtype)
       self.assertEqual((1, size * 2), out[0].shape)
@@ -227,7 +227,7 @@ class SpinnTest(test_util.TensorFlowTestCase):
       self.assertEqual((batch_size, size * 2), stacks[0][0].shape)
 
       for _ in range(2):
-        out1, out2 = tracker(bufs, stacks=stacks)
+        out1, out2 = tracker(bufs, stacks)
         self.assertIsNone(out2)
         self.assertEqual(batch_size, len(out1))
         self.assertEqual(tf.float32, out1[0].dtype)
@@ -260,7 +260,7 @@ class SpinnTest(test_util.TensorFlowTestCase):
       self.assertEqual(tf.int64, transitions.dtype)
       self.assertEqual((num_transitions, 1), transitions.shape)
 
-      out = s(buffers, transitions=transitions, training=True)
+      out = s(buffers, transitions, training=True)
       self.assertEqual(tf.float32, out.dtype)
       self.assertEqual((1, embedding_dims), out.shape)
 
@@ -286,15 +286,12 @@ class SpinnTest(test_util.TensorFlowTestCase):
                                                          vocab_size)
 
       # Invoke model under non-training mode.
-      logits = model(
-          prem, premise_transition=prem_trans, hypothesis=hypo,
-          hypothesis_transition=hypo_trans, training=False)
+      logits = model(prem, prem_trans, hypo, hypo_trans, training=False)
       self.assertEqual(tf.float32, logits.dtype)
       self.assertEqual((batch_size, d_out), logits.shape)
 
       # Invoke model under training model.
-      logits = model(prem, premise_transition=prem_trans, hypothesis=hypo,
-                     hypothesis_transition=hypo_trans, training=True)
+      logits = model(prem, prem_trans, hypo, hypo_trans, training=True)
       self.assertEqual(tf.float32, logits.dtype)
       self.assertEqual((batch_size, d_out), logits.shape)
 
diff --git a/tensorflow/contrib/eager/python/g3doc/guide.md b/tensorflow/contrib/eager/python/g3doc/guide.md
index 11064981c6257a607f88c6f4414418c8d1f8eac7..2d2aba6908b168e0bf63f4706b6344cbb4ca82bd 100644
--- a/tensorflow/contrib/eager/python/g3doc/guide.md
+++ b/tensorflow/contrib/eager/python/g3doc/guide.md
@@ -1,892 +1,18 @@
-# TensorFlow Eager Execution
-
-## What is this?
+# Eager execution
 
 Eager execution is a feature that makes TensorFlow execute operations
-immediately: concrete values are returned, instead of a computational graph to
-be executed later.
-
-As a result, enabling eager execution provides:
-
--   A [NumPy](http://www.numpy.org/)-like library for numerical computation with
-    support for GPU acceleration and automatic differentiation.
--   A flexible platform for machine learning research and experimentation.
-
-Eager execution is under active development. This guide walks through an
-alpha/preview release. In particular, not all TensorFlow APIs currently work
-with eager execution enabled, and some models may be slow to execute, compared
-to models defined without using eager execution.
-
-## Installation
-
-Eager execution is included in TensorFlow versions 1.5 and above.
-Installation instructions at https://www.tensorflow.org/install/
-
-The contents of this guide are compatible with TensorFlow 1.5. However, if you
-run into bugs that are fixed in source but not the release, you may want to
-either [build from source](https://www.tensorflow.org/install/install_sources)
-or try a nightly build. The nightly builds are available as:
-
-- [`pip` packages](https://github.com/tensorflow/tensorflow/blob/master/README.md#installation) and
-
-- [docker](https://hub.docker.com/r/tensorflow/tensorflow/) images.
-
-For example, to run the latest nightly docker image:
-
-```sh
-# If you have a GPU, use https://github.com/NVIDIA/nvidia-docker
-docker pull tensorflow/tensorflow:nightly-gpu
-docker run --runtime=nvidia -it -p 8888:8888 tensorflow/tensorflow:nightly-gpu
-
-# If you do not have a GPU, use the CPU-only image
-docker pull tensorflow/tensorflow:nightly
-docker run -it -p 8888:8888 tensorflow/tensorflow:nightly
-```
-
-And then visit http://localhost:8888 in your browser for a Jupyter notebook
-environment.
-
-## Getting Started
-
-With TensorFlow installed, eager execution is enabled via a single call:
-
-```python
-import tensorflow as tf
-
-import tensorflow.contrib.eager as tfe
-
-tfe.enable_eager_execution()
-```
-
-Enabling eager execution changes how TensorFlow functions behave (in particular,
-`Tensor` objects will reference concrete values instead of being symbolic
-handles to nodes in a computational graph). As a result, eager execution should
-be enabled at the beginning of a program and cannot be disabled afterwards in
-the same program.
-
-Code examples in the rest of this guide assume that eager execution has been
-enabled.
-
-## A library for numerical computation
-
-A significant fraction of the [TensorFlow
-API](https://www.tensorflow.org/api_docs/python/) consists of numerical
-operations:
-[arithmetic operations](https://www.tensorflow.org/api_guides/python/math_ops#Arithmetic_Operators),
-[matrix operations](https://www.tensorflow.org/api_guides/python/math_ops#Matrix_Math_Functions),
-[linear algebra operations](https://www.tensorflow.org/versions/master/api_docs/python/tf/linalg),
-etc.
-
-With eager execution enabled, these operations consume and return
-multi-dimensional arrays as `Tensor` objects, similar to NumPy
-[`ndarray`s](https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.ndarray.html).
-For example:
-
-```python
-# Multiply two 2x2 matrices
-x = tf.matmul([[1, 2],
-               [3, 4]],
-              [[4, 5],
-               [6, 7]])
-# Add one to each element
-# (tf.add supports broadcasting)
-y = tf.add(x, 1)
-
-# Create a random random 5x3 matrix
-z = tf.random_uniform([5, 3])
-
-print(x)
-print(y)
-print(z)
-```
-
-Output:
-
-```
-tf.Tensor(
-[[16 19]
- [36 43]], shape=(2, 2), dtype=int32)
-tf.Tensor(
-[[17 20]
- [37 44]], shape=(2, 2), dtype=int32)
-tf.Tensor(
-[[ 0.25058532  0.0929395   0.54113817]
- [ 0.3108716   0.93350542  0.84909797]
- [ 0.53081679  0.12788558  0.01767385]
- [ 0.29725885  0.33540785  0.83588314]
- [ 0.38877153  0.39720535  0.78914213]], shape=(5, 3), dtype=float32)
-```
-
-For convenience, these operations can also be triggered via operator overloading
-of the `Tensor` object. For example, the `+` operator is equivalent to `tf.add`,
-`-` to `tf.subtract`, `*` to `tf.multiply`, etc.:
-
-```python
-x = (tf.ones([1], dtype=tf.float32) + 1) * 2 - 1
-print(x)
-```
-
-Output:
-
-```
-tf.Tensor([ 3.], shape=(1,), dtype=float32)
-```
-
-### Converting to and from NumPy
-
-The operations above automatically convert Python objects (like lists of
-numbers) and NumPy arrays to `Tensor` objects. `Tensor` objects can also be used
-as NumPy arrays by numpy operations.
-
-```python
-import numpy as np
-
-x = tf.add(1, 1)                     # tf.Tensor with a value of 2
-y = tf.add(np.array(1), np.array(1)) # tf.Tensor with a value of 2
-z = np.multiply(x, y)                # numpy.int64 with a value of 4
-```
-
-Alternatively, they can be explicitly converted using
-[`tf.constant`](https://www.tensorflow.org/api_docs/python/tf/constant), as
-shown in the next example.
-
-Conversely, you can call the `numpy()` method of a `Tensor` object' to obtain
-its NumPy `ndarray` value. For example:
-
-```python
-import numpy as np
-
-np_x = np.array(2., dtype=np.float32)
-x = tf.constant(np_x)
-
-py_y = 3.
-y = tf.constant(py_y)
-
-z = x + y + 1
-
-print(z)
-print(z.numpy())
-```
-
-Output:
-
-```
-tf.Tensor(6.0, shape=(), dtype=float32)
-6.0
-```
-
-### GPU acceleration
-
-Many TensorFlow operations support GPU acceleration. With eager execution
-enabled, [computation is *not* automatically
-offloaded](https://www.tensorflow.org/tutorials/using_gpu) to GPUs. Instead, you
-must explicitly specify when GPUs should be used.
-
-The simplest way to do this is to enclose your computation in a `with
-tf.device('/gpu:0')` block. Also of interest is the `tfe.num_gpus()` function,
-which returns the number of available GPUs.
-
-For example, consider this snippet to measure the time to multiply two 1000x1000
-matrices on CPU:
-
-```python
-import time
-
-def measure(x):
-  # The very first time a GPU is used by TensorFlow, it is initialized.
-  # So exclude the first run from timing.
-  tf.matmul(x, x)
-
-  start = time.time()
-  for i in range(10):
-    tf.matmul(x, x)
-  end = time.time()
-
-  return "Took %s seconds to multiply a %s matrix by itself 10 times" % (end - start, x.shape)
-
-# Run on CPU:
-with tf.device("/cpu:0"):
-  print("CPU: %s" % measure(tf.random_normal([1000, 1000])))
-
-# If a GPU is available, run on GPU:
-if tfe.num_gpus() > 0:
-  with tf.device("/gpu:0"):
-    print("GPU: %s" % measure(tf.random_normal([1000, 1000])))
-```
-
-Output (exact numbers will depend on the characteristics of the hardware):
-
-```python
-CPU: Took 0.145531892776 seconds to multiply a (1000, 1000) matrix by itself 10 times
-GPU: Took 0.000458955764771 seconds to multiply a (1000, 1000) matrix by itself 10 times
-```
-
-Alternatively, methods on the `Tensor` object can be used to explicitly copy the
-`Tensor` to a different device. Operations are typically executed on the device
-on which the inputs are placed. For example:
-
-```python
-x = tf.random_normal([10, 10])
-
-x_gpu0 = x.gpu()
-x_cpu = x.cpu()
-
-_ = tf.matmul(x_cpu, x_cpu)  # Runs on CPU
-_ = tf.matmul(x_gpu0, x_gpu0)  # Runs on GPU:0
-
-if tfe.num_gpus() > 1:
-  x_gpu1 = x.gpu(1)
-  _ = tf.matmul(x_gpu1, x_gpu1)  # Runs on GPU:1
-```
-
-### Automatic Differentiation
-
-[Automatic
-differentiation](https://en.wikipedia.org/wiki/Automatic_differentiation) is
-very useful when implementing many machine learning algorithms (e.g.,
-[backpropagation](https://en.wikipedia.org/wiki/Backpropagation) for training
-neural networks). For this purpose, TensorFlow eager execution provides an
-[autograd](https://github.com/HIPS/autograd)-style API for automatic
-differentiation. Specifically, the functions:
-
--   `tfe.gradients_function(f)`: Returns a Python function that computes the
-    derivatives of the Python function `f` with respect to its arguments. `f`
-    must return a scalar value. When the returned function is invoked, it
-    returns a list of `Tensor` objects (one element for each argument of `f`).
--   `tfe.value_and_gradients_function(f)`: Similar to `tfe.gradients_function`,
-    except that when the returned function is invoked, it returns the value of
-    `f` in addition to the list of derivatives of `f` with respect to its
-    arguments.
-
-These functions naturally apply to higher order differentiation as well. For
-example:
-
-```python
-def f(x):
-  return tf.multiply(x, x)  # Or x * x
-assert 9 == f(3.).numpy()
-
-df = tfe.gradients_function(f)
-assert 6 == df(3.)[0].numpy()
-
-# Second order deriviative.
-d2f = tfe.gradients_function(lambda x: df(x)[0])
-assert 2 == d2f(3.)[0].numpy()
-
-# Third order derivative: Will be None
-d3f = tfe.gradients_function(lambda x : d2f(x)[0])
-assert None == d3f(3.)[0]
-```
-
-These functions can be used to train models. For example, consider the following
-simple linear regression model:
-
-```python
-def prediction(input, weight, bias):
-  return input * weight + bias
-
-# A toy dataset of points around 3 * x + 2
-NUM_EXAMPLES = 1000
-training_inputs = tf.random_normal([NUM_EXAMPLES])
-noise = tf.random_normal([NUM_EXAMPLES])
-training_outputs = training_inputs * 3 + 2 + noise
-
-# A loss function: Mean-squared error
-def loss(weight, bias):
-  error = prediction(training_inputs, weight, bias) - training_outputs
-  return tf.reduce_mean(tf.square(error))
-
-# Function that returns the derivative of loss with respect to
-# weight and bias
-grad = tfe.gradients_function(loss)
-
-# Train for 200 steps (starting from some random choice for W and B, on the same
-# batch of data).
-W = 5.
-B = 10.
-learning_rate = 0.01
-print("Initial loss: %f" % loss(W, B).numpy())
-for i in range(200):
-  (dW, dB) = grad(W, B)
-  W -= dW * learning_rate
-  B -= dB * learning_rate
-  if i % 20 == 0:
-    print("Loss at step %d: %f" % (i, loss(W, B).numpy()))
-print("Final loss: %f" % loss(W, B).numpy())
-print("W, B = %f, %f" % (W.numpy(), B.numpy()))
-```
-
-Output: (the exact numbers may vary depending on the randomness in noise)
-
-```
-Initial loss: 66.730003
-Loss at step 0: 64.200096
-Loss at step 20: 29.872814
-Loss at step 40: 14.233772
-Loss at step 60: 7.090570
-Loss at step 80: 3.819887
-Loss at step 100: 2.318821
-Loss at step 120: 1.628385
-Loss at step 140: 1.310142
-Loss at step 160: 1.163167
-Loss at step 180: 1.095162
-Final loss: 1.064711
-W, B = 3.094944, 2.161383
-```
-
-To utilize the GPU, place the code above within a `with tf.device("/gpu:0"):`
-block. (However, this particular model, with only two floating point parameters,
-is unlikely to benefit from GPU acceleration.)
-
-### Customizing gradients
-
-One may want to define custom gradients for an operation, or for a function.
-This may be useful for multiple reasons, including providing a more efficient
-or more [numerically stable](https://en.wikipedia.org/wiki/Numerical_stability)
-gradient for a sequence of operations.
-
-For example, consider the function `log(1 + e^x)`, which commonly occurs in the
-computation of cross entropy and log likelihoods.
-
-```python
-def log1pexp(x):
-  return tf.log(1 + tf.exp(x))
-grad_log1pexp = tfe.gradients_function(log1pexp)
-
-# Works fine at x = 0.
-assert 0.5 == float(grad_log1pexp(0.)[0])
-
-# Returns a `nan` at x = 100 due to numerical instability.
-import math
-assert math.isnan(float(grad_log1pexp(100.)[0]))
-```
-
-We can define a custom gradient for the above function that analytically
-simplifies the gradient expression.
-
-```python
-@tfe.custom_gradient
-def log1pexp(x):
-  e = tf.exp(x)
-  def grad(dy):
-    return dy * (1 - 1 / (1 + e))
-  return tf.log(1 + e), grad
-grad_log1pexp = tfe.gradients_function(log1pexp)
-
-# Works as before at x = 0.
-assert 0.5 == float(grad_log1pexp(0.)[0])
-
-# But now works at x = 100 as well.
-assert 1.0 == float(grad_log1pexp(100.)[0])
-```
-Also notice how the gradient function implementation reuses an expression
-(`tf.exp(x)`) computed during the forward pass, hence making the gradient
-computation more efficient by avoiding redundant computation.
-
-## Building and training models
-
-In practice, your computation may have many parameters to be optimized (by
-computing derivatives). Encapsulating them into re-usable classes/objects
-makes the code easier to follow than writing a single top-level function with
-many arguments.
-
-In fact, eager execution encourages use of the [Keras](https://keras.io)-style
-"Layer" classes in the
-[`tf.layers`](https://www.tensorflow.org/api_docs/python/tf/layers)
-module.
-
-Furthermore, you may want to apply more sophisticated techniques to compute
-parameter updates, such as those in
-[`tf.train.Optimizer`](https://www.tensorflow.org/api_guides/python/train#Optimizers)
-implementations.
-
-This next section walks through using the same `Optimizer` and `Layer` APIs used
-to build trainable TensorFlow graphs in an environment where eager execution is
-enabled.
-
-### Variables and Optimizers
-
-`tfe.Variable` objects store mutable `Tensor` values that can be accessed during
-training, making automatic differentiation easier. In particular, parameters of
-a model can be encapsulated in Python classes as variables.
-
-`tfe.gradients_function(f)` introduced earlier computes the derivatives of `f`
-with respect to its arguments. However, it requires all parameters of interest
-to be arguments of `f`, which becomes cumbersome when `f` depends on a large
-number of trainable parameters.
-
-`tfe.implicit_gradients` is an alternative function with some useful properties:
-
--   It computes the derivatives of `f` with respect to all the `tfe.Variable`s
-    used by `f`.
--   When the returned function is invoked, it returns a list of
-    (gradient value, Variable object) tuples.
-
-Representing model parameters as `Variable` objects, along with the use of
-`tfe.implicit_gradients`, typically results in better encapsulation. For
-example, the linear regression model described above can be written into a
-class:
-
-```python
-class Model(object):
-  def __init__(self):
-    self.W = tfe.Variable(5., name='weight')
-    self.B = tfe.Variable(10., name='bias')
-
-  def predict(self, inputs):
-    return inputs * self.W + self.B
-
-
-# The loss function to be optimized
-def loss(model, inputs, targets):
-  error = model.predict(inputs) - targets
-  return tf.reduce_mean(tf.square(error))
-
-# A toy dataset of points around 3 * x + 2
-NUM_EXAMPLES = 1000
-training_inputs = tf.random_normal([NUM_EXAMPLES])
-noise = tf.random_normal([NUM_EXAMPLES])
-training_outputs = training_inputs * 3 + 2 + noise
-
-# Define:
-# 1. A model
-# 2. Derivatives of a loss function with respect to model parameters
-# 3. A strategy for updating the variables based on the derivatives
-model = Model()
-grad = tfe.implicit_gradients(loss)
-optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.01)
-
-# The training loop
-print("Initial loss: %f" %
-      loss(model, training_inputs, training_outputs).numpy())
-for i in range(201):
-  optimizer.apply_gradients(grad(model, training_inputs, training_outputs))
-  if i % 20 == 0:
-    print("Loss at step %d: %f" %
-          (i, loss(model, training_inputs, training_outputs).numpy()))
-print("Final loss: %f" % loss(model, training_inputs, training_outputs).numpy())
-print("W, B = %s, %s" % (model.W.numpy(), model.B.numpy()))
-```
-
-Output:
-
-```
-Initial loss: 69.693184
-Loss at step 0: 66.987854
-Loss at step 20: 30.553387
-Loss at step 40: 14.250237
-Loss at step 60: 6.955020
-Loss at step 80: 3.690550
-Loss at step 100: 2.229739
-Loss at step 120: 1.576032
-Loss at step 140: 1.283496
-Loss at step 160: 1.152584
-Loss at step 180: 1.093999
-Final loss: 1.067780
-W, B = 3.0114281, 2.0865183
-```
-
-Using `implicit_gradients` avoids the need to provide all the trainable
-parameters of the model as arguments to the `loss` function.
-
-### Using Keras and the Layers API
-
-[Keras](https://keras.io) is a popular API for defining model structures. The
-[`tf.keras.layers`](https://www.tensorflow.org/api_docs/python/tf/keras/layers)
-module provides a set of building blocks for models and is implemented using the
-`tf.layers.Layer` subclasses in the
-[`tf.layers`](https://www.tensorflow.org/api_docs/python/tf/layers)
-module. We encourage the use of these same building blocks when using
-TensorFlow's eager execution feature. For example, the very same linear
-regression model can be built using `tf.layers.Dense`:
-
-```python
-class Model(object):
-  def __init__(self):
-    self.layer = tf.layers.Dense(1)
-
-  def predict(self, inputs):
-    return self.layer(inputs)
-```
-
-The `tf.layers` API makes it more convenient to define more sophisticated
-models. For example, the following will train an MNIST model:
-
-```python
-class MNISTModel(object):
-  def __init__(self, data_format):
-    # 'channels_first' is typically faster on GPUs
-    # while 'channels_last' is typically faster on CPUs.
-    # See: https://www.tensorflow.org/performance/performance_guide#data_formats
-    if data_format == 'channels_first':
-      self._input_shape = [-1, 1, 28, 28]
-    else:
-      self._input_shape = [-1, 28, 28, 1]
-    self.conv1 = tf.layers.Conv2D(32, 5,
-                                  padding='same',
-                                  activation=tf.nn.relu,
-                                  data_format=data_format)
-    self.max_pool2d = tf.layers.MaxPooling2D(
-        (2, 2), (2, 2), padding='same', data_format=data_format)
-    self.conv2 = tf.layers.Conv2D(64, 5,
-                                  padding='same',
-                                  activation=tf.nn.relu,
-                                  data_format=data_format)
-    self.dense1 = tf.layers.Dense(1024, activation=tf.nn.relu)
-    self.dropout = tf.layers.Dropout(0.5)
-    self.dense2 = tf.layers.Dense(10)
-
-  def predict(self, inputs):
-    x = tf.reshape(inputs, self._input_shape)
-    x = self.max_pool2d(self.conv1(x))
-    x = self.max_pool2d(self.conv2(x))
-    x = tf.layers.flatten(x)
-    x = self.dropout(self.dense1(x))
-    return self.dense2(x)
-
-def loss(model, inputs, targets):
-  return tf.reduce_mean(
-      tf.nn.softmax_cross_entropy_with_logits(
-          logits=model.predict(inputs), labels=targets))
-
-
-# Load the training and validation data
-from tensorflow.examples.tutorials.mnist import input_data
-data = input_data.read_data_sets("./mnist_data", one_hot=True)
-
-# Train
-device = "gpu:0" if tfe.num_gpus() else "cpu:0"
-model = MNISTModel('channels_first' if tfe.num_gpus() else 'channels_last')
-optimizer = tf.train.AdamOptimizer(learning_rate=1e-4)
-grad = tfe.implicit_gradients(loss)
-for i in range(20001):
-  with tf.device(device):
-    (inputs, targets) = data.train.next_batch(50)
-    optimizer.apply_gradients(grad(model, inputs, targets))
-    if i % 100 == 0:
-      print("Step %d: Loss on training set : %f" %
-            (i, loss(model, inputs, targets).numpy()))
-print("Loss on test set: %f" % loss(model, data.test.images, data.test.labels).numpy())
-```
-
-For a more complete example, see [the example in the tensorflow/models
-repository](https://github.com/tensorflow/models/tree/master/official/mnist/mnist_eager.py).
-
-### Checkpointing trained variables
-
-TensorFlow Variables (`tfe.Variable`) provide a way to represent shared,
-persistent state of your model. The `tfe.Checkpoint` class provides a means to
-save and restore variables to and from _checkpoints_.
-
-For example:
-
-```python
-# Create variables.
-x = tfe.Variable(10.)
-y = tfe.Variable(5.)
-
-# Indicate that the variables should be saved as "x" and "y".
-checkpoint = tfe.Checkpoint(x=x, y=y)
-
-# Assign new values to the variables and save.
-x.assign(2.)
-save_path = checkpoint.save('/tmp/ckpt')
-
-# Change the variable after saving.
-x.assign(11.)
-assert 16. == (x + y).numpy()  # 11 + 5
-
-# Restore the values in the checkpoint.
-checkpoint.restore(save_path)  # save_path='/tmp/ckpt-1'
-
-assert 7. == (x + y).numpy()  # 2 + 5
-```
-
-### `tf.keras.Model`
-
-You may often want to organize your models using classes, like the `MNISTModel`
-class described above. We recommend inheriting from the `tf.keras.Model` class
-as it provides conveniences like keeping track of all model variables.
-
-Sub-classes of `tf.keras.Model` may register `Layer`s (like classes in
-[`tf.layers`](https://www.tensorflow.org/api_docs/python/tf/layers), or [Keras
-layers](https://www.tensorflow.org/api_docs/python/tf/keras/layers)) by
-assigning them to attributes (`self.name = layer_object`) and define the
-computation in an implementation of `call()`.
-
-Note that `tf.layers.Layer` objects (like `tf.layers.Dense`) create variables
-lazily, when the first input is encountered.
-
-For example, consider the following two-layer neural network:
-
-```python
-class TwoLayerNet(tf.keras.Model):
-  def __init__(self):
-    super(TwoLayerNet, self).__init__()
-    self.layer1 = tf.layers.Dense(2, activation=tf.nn.relu, use_bias=False)
-    self.layer2 = tf.layers.Dense(3, use_bias=False)
-
-  def call(self, x):
-    return self.layer2(self.layer1(x))
-
-net = TwoLayerNet()
-
-# No variables created yet
-assert 0 == len(net.variables)
-
-# They are created on first input:
-inp = tf.constant([[1.]])
-
-# Since input is a 1x1 matrix, net.l1 has 2 units and net.l2 has 3 units,
-# the output is the product of a 1x1 matrix with a 1x2 matrix with a 2x3
-# matrix.
-assert [1, 3] == net(inp).shape.as_list()  # Invoke net; get output shape.
-assert 1 == len(net.layer1.variables)
-assert 1 == len(net.layer2.variables)
-assert 2 == len(net.variables)  # weights for each layer.
-assert [1, 2] == net.variables[0].shape.as_list()  # weights of layer1.
-assert [2, 3] == net.variables[1].shape.as_list()  # weights of layer2.
-```
-
-The `tf.keras.Model` class is itself a sub-class of `tf.layers.Layer`. This
-allows instances of `tf.keras.Model` to be embedded in other models. For
-example:
-
-```python
-class ThreeLayerNet(tf.keras.Model):
-  def __init__(self):
-    super(ThreeLayerNet, self).__init__()
-    self.a = TwoLayerNet()
-    self.b = tf.layers.Dense(4, use_bias=False)
-
-  def call(self, x):
-    return self.b(self.a(x))
-
-net = ThreeLayerNet()
-
-assert [1, 4] == net(inp).shape.as_list()
-assert 3 == len(net.variables)
-assert [1, 2] == net.variables[0].shape.as_list()
-assert [2, 3] == net.variables[1].shape.as_list()
-assert [3, 4] == net.variables[2].shape.as_list()
-```
-
-See more examples in
-[`tensorflow/contrib/eager/python/examples`](https://www.tensorflow.org/code/tensorflow/contrib/eager/python/examples).
-
-`tfe.Checkpoint` provides a convenient way to save and load training
-checkpoints. Let's define something simple to train. We set an objective for the
-output of our network, choose an optimizer, and a location for the checkpoint:
-
-```python
-objective = tf.constant([[2., 3., 4., 5.]])
-optimizer = tf.train.AdamOptimizer(0.01)
-checkpoint_directory = '/tmp/tfe_example'
-checkpoint_prefix = os.path.join(checkpoint_directory, 'ckpt')
-net = ThreeLayerNet()
-```
-
-We group them in a `tfe.Checkpoint` and request that it be restored. This
-ensures that variables created by these objects are restored before their values
-are used. Our training loop is the same whether starting training or resuming
-from a previous checkpoint:
-
-```python
-global_step = tf.train.get_or_create_global_step()
-checkpoint = tfe.Checkpoint(
-    global_step=global_step, optimizer=optimizer, network=net)
-checkpoint.restore(tf.train.latest_checkpoint(checkpoint_directory))
-for _ in range(100):
-  loss_fn = lambda: tf.norm(net(inp) - objective)
-  optimizer.minimize(loss_fn, global_step=global_step)
-  if tf.equal(global_step % 20, 0):
-    print("Step %d, output %s" % (global_step.numpy(),
-                                  net(inp).numpy()))
-    # Save the checkpoint.
-    checkpoint.save(checkpoint_prefix)
-```
-
-The first time it runs, `Model` variables are initialized randomly. Then the
-output is trained to match the objective we've set:
-
-```
-Step 20, output [[ 0.03575622  0.29863232  0.03474367  0.24735749]]
-Step 40, output [[ 0.40646029  0.9856872   0.46851286  0.95358551]]
-Step 60, output [[ 1.74541104  2.800704    1.79055595  2.74783421]]
-Step 80, output [[ 2.14977384  3.44340849  3.96120024  5.16242075]]
-Step 100, output [[ 1.99943113  3.02364397  3.93500996  4.9610076 ]]
-```
-
-In subsequent iterations, variables are initialized with the values read from
-the latest checkpoint. Running the same code again, we continue from where we
-left off:
-
-```
-Step 120, output [[ 1.99234128  3.0271616   3.98732996  4.96401167]]
-Step 140, output [[ 2.00133467  3.01270437  4.00616646  5.00406504]]
-Step 160, output [[ 1.99647415  2.9956708   3.99064088  4.99632359]]
-Step 180, output [[ 2.00699997  3.00904822  4.00706148  5.01193142]]
-Step 200, output [[ 1.98334622  2.98249531  3.97375059  4.97123432]]
-```
-
-
-### Summaries, metrics and TensorBoard
-
-[TensorBoard](https://www.tensorflow.org/get_started/summaries_and_tensorboard)
-is a popular tool for understanding, debugging and optimizing the model training
-process. To benefit from the visualizations offered by TensorBoard, summary
-events need to be written during the course of execution of your program. You
-might find many Tensorflow programs that include the
-[`tf.summary`](https://www.tensorflow.org/api_guides/python/summary) operations
-during graph construction.
-
-`tf.summary` operations are *not* compatible with eager execution, but an
-equivalent alternative exists in
-[`tf.contrib.summary`](https://www.tensorflow.org/versions/master/api_docs/python/tf/contrib/summary)
-that is compatible with both eager execution and graph construction.
-
-During model construction simply insert summary operations like
-`tf.contrib.summary.scalar`. These operations do nothing by default, unless a
-summary writer is currently active and a writing policy is set.
-
-For example, to record summaries once every 100 global steps, use:
-
-```python
-tf.train.get_or_create_global_step()  # Ensuring the global step variable exists
-writer = tf.contrib.summary.create_file_writer(logdir)
-
-for _ in range(iterations):
-  with writer.as_default():
-    with tf.contrib.summary.record_summaries_every_n_global_steps(100):
-      # your model code goes here
-      tf.contrib.summary.scalar('loss', loss)
-      # ...
-```
-
-See the full mnist example in
-[`tensorflow/contrib/eager/python/examples/mnist`](https://www.tensorflow.org/code/tensorflow/contrib/eager/python/examples/mnist)
-for a full model using `tf.contrib.summary`.
-
-Similarly to summaries, the metrics in `tf.metrics` are currently not compatible
-with eager execution. We instead provide object-oriented metrics in the
-`tfe.metrics` package, which are compatible with graph construction as well.
-
-Metrics in the `tfe.metrics`, such as `tfe.metrics.Mean` and
-`tfe.Metrics.Accuracy`, all implement an intuitive object-oriented
-interface. Here's an example of how to use the `tfe.metrics.Mean` metric:
-
-```python
-# Metrics are objects, which can be created and destroyed.
-my_mean = tfe.metrics.Mean(name='my_mean')
-# While a metric is active, you can call it as a function to accumulate into its
-# internal state.
-my_mean(0.0)
-my_mean(10.0)
-# Once you've finished updating the metric, you can get its result. In this case
-# a simple average over all the calls to it. If a summary writer is active the
-# metric will write the appropriate summaries using the metric name.
-assert 5.0 == my_mean.result().numpy()
-```
-
-For a full example of a model using metrics for evaluation, see the mnist
-example in
-[`tensorflow/contrib/eager/python/examples/mnist`](https://www.tensorflow.org/code/tensorflow/contrib/eager/python/examples/mnist).
-
-### Input Pipelines
-
-The discussion above has been centered around the computation executed by your
-model. The
-[`tf.data`](https://www.tensorflow.org/api_docs/python/tf/data)
-module provides APIs to build complex input pipelines from simple, reusable
-pieces.
-
-If you're familiar with constructing `tf.data.Dataset` objects when building
-TensorFlow graphs, the same API calls are used when eager execution is enabled.
-However, the process of iterating over elements of the dataset differs between
-eager execution and graph construction. When eager execution is enabled, the
-discussion on iterator creation using `make_one_shot_iterator()` and
-`get_next()` in the
-[Programmer's Guide](https://www.tensorflow.org/programmers_guide/datasets) is
-*not* applicable. Instead, a more Pythonic `Iterator` class is available.
-
-For example:
-
-```python
-# Create a source Dataset from in-memory numpy arrays.
-# For reading from files on disk, you may want to use other Dataset classes
-# like the TextLineDataset or the TFRecordDataset.
-dataset = tf.data.Dataset.from_tensor_slices([1, 2, 3, 4, 5, 6])
-
-# Apply transformations, shuffling, batching etc.
-dataset = dataset.map(tf.square).shuffle(2).batch(2)
-
-# Use tfe.Iterator to iterate over the dataset.
-for x in tfe.Iterator(dataset):
-  print(x)
-```
-
-Output:
-
-```
-tf.Tensor([4 9], shape=(2,), dtype=int32)
-tf.Tensor([16 25], shape=(2,), dtype=int32)
-tf.Tensor([36  1], shape=(2,), dtype=int32)
-```
-
-## Interoperating with Graphs
-
-Eager execution improves the process of model development in Python; however,
-because it is in its earliest stages, it does not yet support some features
-available to [TensorFlow
-graphs](https://www.tensorflow.org/get_started/get_started#the_computational_graph)
-that are desirable when deploying models in production. In particular, eager
-execution does not yet support distributed training, exporting models (to other
-[programming languages](https://www.tensorflow.org/api_docs/), [TensorFlow
-serving](https://www.tensorflow.org/serving/), and mobile applications), and
-various memory and computation optimizations that are applied to TensorFlow's
-dataflow graphs.
-
-That said, the APIs used to build modes are exactly the same whether executing
-eagerly or constructing graphs. This means that you can iteratively develop your
-model with eager execution enabled and later, if needed, use the same code to
-reap the benefits of representing models as computational graphs.
-
-For example, the same model definition used to construct a graph in
-[mnist.py`](https://github.com/tensorflow/models/tree/master/official/mnist/mnist.py)
-can be trained with eager execution enabled as in [`mnist_eager.py`](https://github.com/tensorflow/models/tree/master/official/mnist/mnist_eager.py).
-
-Other models in the [examples
-directory](https://www.tensorflow.org/code/tensorflow/contrib/eager/python/examples/)
-demonstrate this as well.
-
-Some differences worth noting:
-
--   There is no notion of a `tf.placeholder` or a `tf.Session` when eager
-    execution is enabled.
--   Many properties on the `tf.Tensor` object, like `tf.Tensor.name`,
-    `tf.Tensor.op`, `tf.Tensor.inputs` are not meaningful when eager execution
-    is enabled and their use will raise an `AttributeError`.
--   To use `tfe.implicit_gradients` in graph construction, variables must be
-    created with [`use_resource=True`] provided to
-    [`tf.get_variable()`](https://www.tensorflow.org/api_docs/python/tf/get_variable)
-    or
-    [`tf.variable_scope()`](https://www.tensorflow.org/api_docs/python/tf/variable_scope).
--   Some API calls (such as the functional-style `tf.layers.dense`,
-    `tf.layers.conv2d`) are not compatible with eager execution. Use of such
-    methods should raise an error indicating the alternative (e.g., the
-    `tf.layers.Dense` and `tf.layers.Conv2D` classes).
-
-## What next?
+immediately: concrete values are returned, instead of creating a computational
+graph that is executed later.
 
-Please give eager execution a spin. This feature is in early stages and is
-evolving, so we welcome your feedback via issues on GitHub (see [known
-issues](https://github.com/tensorflow/tensorflow/labels/comp:eager)).
+A user guide is available: https://www.tensorflow.org/programmers_guide/eager
+([source file](../../../../docs_src/programmers_guide/eager.md))
 
-You may want to browse through some sample code, including benchmarks for some:
+We welcome feedback through [GitHub issues](https://github.com/tensorflow/tensorflow/labels/comp:eager).
 
--   [Linear Regression](https://www.tensorflow.org/code/tensorflow/contrib/eager/python/examples/linear_regression)
--   [MNIST handwritten digit classifier](https://www.tensorflow.org/code/tensorflow/contrib/eager/python/examples/mnist)
--   [ResNet50 image classification](https://www.tensorflow.org/code/tensorflow/contrib/eager/python/examples/resnet50)
--   [RNN to generate colors](https://www.tensorflow.org/code/tensorflow/contrib/eager/python/examples/rnn_colorbot)
--   [RNN language model](https://www.tensorflow.org/code/tensorflow/contrib/eager/python/examples/rnn_ptb)
+Sample code is available, including benchmarks for some:
 
+- [Linear Regression](https://www.tensorflow.org/code/tensorflow/contrib/eager/python/examples/linear_regression)
+- [MNIST handwritten digit classifier](https://www.tensorflow.org/code/tensorflow/contrib/eager/python/examples/mnist)
+- [ResNet50 image classification](https://www.tensorflow.org/code/tensorflow/contrib/eager/python/examples/resnet50)
+- [RNN to generate colors](https://www.tensorflow.org/code/tensorflow/contrib/eager/python/examples/rnn_colorbot)
+- [RNN language model](https://www.tensorflow.org/code/tensorflow/contrib/eager/python/examples/rnn_ptb)
diff --git a/tensorflow/contrib/eager/python/metrics_impl.py b/tensorflow/contrib/eager/python/metrics_impl.py
index 2f2347736a073c7d9b3fb6685f52f8d58cc40570..907f9204c2d31a652ca2a0539a23db4722b4e154 100644
--- a/tensorflow/contrib/eager/python/metrics_impl.py
+++ b/tensorflow/contrib/eager/python/metrics_impl.py
@@ -20,7 +20,6 @@ from __future__ import print_function
 
 import re
 
-from tensorflow.contrib.summary import summary_ops
 from tensorflow.python.eager import context
 from tensorflow.python.eager import function
 from tensorflow.python.framework import dtypes
@@ -29,6 +28,7 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import summary_ops_v2 as summary_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.training import checkpointable
 
diff --git a/tensorflow/contrib/eager/python/metrics_test.py b/tensorflow/contrib/eager/python/metrics_test.py
index 15ac889191e0fe51269bc5740d5e0ab1bc0e2b72..28f5f286eb767ca199dc27c43537a976ef4ebaad 100644
--- a/tensorflow/contrib/eager/python/metrics_test.py
+++ b/tensorflow/contrib/eager/python/metrics_test.py
@@ -23,7 +23,6 @@ import tempfile
 
 from tensorflow.contrib.eager.python import checkpointable_utils
 from tensorflow.contrib.eager.python import metrics
-from tensorflow.contrib.summary import summary_ops
 from tensorflow.contrib.summary import summary_test_util
 from tensorflow.python.eager import context
 from tensorflow.python.eager import test
@@ -31,6 +30,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import summary_ops_v2 as summary_ops
 from tensorflow.python.training import training_util
 
 
diff --git a/tensorflow/contrib/eager/python/network.py b/tensorflow/contrib/eager/python/network.py
index e55a9276ab53f44f76dc5e537b3bdde7c975f463..2f8721324f5fc12565d047a64af22b8df215a92b 100644
--- a/tensorflow/contrib/eager/python/network.py
+++ b/tensorflow/contrib/eager/python/network.py
@@ -25,6 +25,7 @@ import weakref
 from tensorflow.python.eager import context
 from tensorflow.python.estimator import util as estimator_util
 from tensorflow.python.framework import ops
+from tensorflow.python.keras._impl.keras.engine import base_layer as keras_base_layer
 from tensorflow.python.layers import base
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.training import checkpoint_utils
@@ -176,7 +177,7 @@ class Network(base.Layer):
         avoid_names = parent_network._owned_layers
         name_uid_map = parent_network._sub_layer_name_uids
       else:
-        name_uid_map = base._get_default_graph_uid_map()
+        name_uid_map = keras_base_layer.get_default_graph_uid_map()
         # Figure out which names we have to avoid based on which variable scope
         # we're nested in.
         strip_name = self._default_parent_variable_scope.name
@@ -326,6 +327,8 @@ class Network(base.Layer):
       raise TypeError(
           "Network.track_layer() passed type %s, not a tf.layers.Layer" %
           (type(layer),))
+    # Always use `ResourceVariable` with legacy layers.
+    layer._use_resource_variables = True
     if isinstance(layer, Network):
       layer._finalize_name(parent_network=self)
     else:
diff --git a/tensorflow/contrib/eager/python/network_test.py b/tensorflow/contrib/eager/python/network_test.py
index 3329fc6c513265deff41a368f5688dd605209c14..f43376d5d777a7f17d975e07b746f7b1c731e8ea 100644
--- a/tensorflow/contrib/eager/python/network_test.py
+++ b/tensorflow/contrib/eager/python/network_test.py
@@ -20,12 +20,10 @@ import gc
 
 from tensorflow.contrib.eager.python import network
 from tensorflow.contrib.layers.python.layers import regularizers
-from tensorflow.python.eager import context
 from tensorflow.python.eager import function
 from tensorflow.python.eager import test
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import errors_impl
-from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.layers import core
 from tensorflow.python.ops import math_ops
@@ -469,36 +467,6 @@ class NetworkTest(test.TestCase):
     self.assertIsInstance(net.trainable_weights[0],
                           resource_variable_ops.ResourceVariable)
 
-  def testGraphOpNames(self):
-    """Network operation names should match variable naming."""
-
-    def _check_op_prefixes(expected_prefix, checked_ops):
-      for operation in ops.get_default_graph().get_operations():
-        if operation.name == "ignore":
-          continue
-        if operation.name in checked_ops:
-          continue
-        checked_ops.add(operation.name)
-        self.assertStartsWith(expected_start=expected_prefix,
-                              actual=operation.name)
-        self.assertNotIn("my_network", operation.name[len(expected_prefix):])
-        self.assertNotIn("dense", operation.name[len(expected_prefix):])
-
-    with context.graph_mode():
-      net = MyNetwork()
-      zero = constant_op.constant([[0.]], name="ignore")
-      net(zero)
-      checked_ops = set()
-      _check_op_prefixes(expected_prefix="my_network/dense/",
-                         checked_ops=checked_ops)
-      net.net2 = net.track_layer(MyNetwork())
-      net.net2(zero)
-      _check_op_prefixes(expected_prefix="my_network/my_network/dense/",
-                         checked_ops=checked_ops)
-      MyNetwork()(zero)
-      _check_op_prefixes(expected_prefix="my_network_1/dense/",
-                         checked_ops=checked_ops)
-
   @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
   def testVariableRegularizers(self):
     net = RegularizedNetwork()
diff --git a/tensorflow/contrib/estimator/BUILD b/tensorflow/contrib/estimator/BUILD
index c846343d6d23198726153e6b693660f61232bee5..9f4cd44afbede286966ba0e7357c5dac92a2b729 100644
--- a/tensorflow/contrib/estimator/BUILD
+++ b/tensorflow/contrib/estimator/BUILD
@@ -9,23 +9,12 @@ licenses(["notice"])  # Apache 2.0
 load("//tensorflow:tensorflow.bzl", "py_test")
 load("//tensorflow:tensorflow.bzl", "cuda_py_test")
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 py_library(
     name = "estimator_py",
     srcs = ["__init__.py"],
     srcs_version = "PY2AND3",
     deps = [
+        ":boosted_trees",
         ":dnn",
         ":dnn_linear_combined",
         ":extenders",
@@ -34,10 +23,41 @@ py_library(
         ":logit_fns",
         ":multi_head",
         ":replicate_model_fn",
+        ":rnn",
         "//tensorflow/python:util",
     ],
 )
 
+py_library(
+    name = "boosted_trees",
+    srcs = ["python/estimator/boosted_trees.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python/estimator",
+        "//tensorflow/python/estimator:boosted_trees",
+    ],
+)
+
+py_test(
+    name = "boosted_trees_test",
+    size = "medium",
+    srcs = ["python/estimator/boosted_trees_test.py"],
+    srcs_version = "PY2AND3",
+    tags = [
+        "no_pip",
+        "notsan",
+    ],
+    deps = [
+        ":boosted_trees",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:training",
+        "//tensorflow/python/estimator:numpy_io",
+        "//tensorflow/python/feature_column",
+        "//third_party/py/numpy",
+    ],
+)
+
 py_library(
     name = "dnn",
     srcs = ["python/estimator/dnn.py"],
@@ -70,6 +90,7 @@ py_test(
         "//tensorflow/python/estimator:numpy_io",
         "//tensorflow/python/estimator:prediction_keys",
         "//tensorflow/python/feature_column",
+        "//tensorflow/python/ops/losses",
         "//third_party/py/numpy",
         "@six_archive//:six",
     ],
@@ -110,6 +131,7 @@ py_test(
         "//tensorflow/python/estimator:numpy_io",
         "//tensorflow/python/estimator:prediction_keys",
         "//tensorflow/python/feature_column",
+        "//tensorflow/python/ops/losses",
         "//third_party/py/numpy",
         "@six_archive//:six",
     ],
@@ -247,6 +269,7 @@ py_test(
         "//tensorflow/python/estimator:numpy_io",
         "//tensorflow/python/estimator:prediction_keys",
         "//tensorflow/python/feature_column",
+        "//tensorflow/python/ops/losses",
         "//third_party/py/numpy",
         "@six_archive//:six",
     ],
@@ -390,3 +413,57 @@ cuda_py_test(
         "notap",
     ],
 )
+
+py_library(
+    name = "rnn",
+    srcs = ["python/estimator/rnn.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":extenders",
+        "//tensorflow/contrib/feature_column:feature_column_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:check_ops",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:init_ops",
+        "//tensorflow/python:layers",
+        "//tensorflow/python:partitioned_variables",
+        "//tensorflow/python:rnn",
+        "//tensorflow/python:rnn_cell",
+        "//tensorflow/python:summary",
+        "//tensorflow/python:training",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python/estimator",
+        "//tensorflow/python/estimator:head",
+        "//tensorflow/python/estimator:optimizers",
+        "//tensorflow/python/feature_column",
+        "@six_archive//:six",
+    ],
+)
+
+py_test(
+    name = "rnn_test",
+    size = "medium",
+    srcs = ["python/estimator/rnn_test.py"],
+    srcs_version = "PY2AND3",
+    tags = [
+        "no_pip",
+        "notsan",
+    ],
+    deps = [
+        ":rnn",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:check_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:state_ops",
+        "//tensorflow/python:summary",
+        "//tensorflow/python:training",
+        "//tensorflow/python:variables",
+        "//tensorflow/python/estimator:numpy_io",
+        "//tensorflow/python/feature_column",
+        "//third_party/py/numpy",
+        "@six_archive//:six",
+    ],
+)
diff --git a/tensorflow/contrib/estimator/__init__.py b/tensorflow/contrib/estimator/__init__.py
index 6b9f9575b606f1822d760e8597c55994dd8af04c..be20d1b7770d3f3df21ac9c0f811d924bf4152ee 100644
--- a/tensorflow/contrib/estimator/__init__.py
+++ b/tensorflow/contrib/estimator/__init__.py
@@ -19,6 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 # pylint: disable=unused-import,line-too-long,wildcard-import
+from tensorflow.contrib.estimator.python.estimator.boosted_trees import *
 from tensorflow.contrib.estimator.python.estimator.dnn import *
 from tensorflow.contrib.estimator.python.estimator.dnn_linear_combined import *
 from tensorflow.contrib.estimator.python.estimator.extenders import *
@@ -27,6 +28,7 @@ from tensorflow.contrib.estimator.python.estimator.linear import *
 from tensorflow.contrib.estimator.python.estimator.logit_fns import *
 from tensorflow.contrib.estimator.python.estimator.multi_head import *
 from tensorflow.contrib.estimator.python.estimator.replicate_model_fn import *
+from tensorflow.contrib.estimator.python.estimator.rnn import *
 
 from tensorflow.python.util.all_util import remove_undocumented
 # pylint: enable=unused-import,line-too-long,wildcard-import
@@ -44,11 +46,14 @@ _allowed_symbols = [
     'DNNEstimator',
     'DNNLinearCombinedEstimator',
     'LinearEstimator',
+    'boosted_trees_classifier_train_in_memory',
+    'boosted_trees_regressor_train_in_memory',
     'call_logit_fn',
     'dnn_logit_fn_builder',
     'linear_logit_fn_builder',
     'replicate_model_fn',
     'TowerOptimizer',
+    'RNNClassifier',
 ]
 
 remove_undocumented(__name__, allowed_exception_list=_allowed_symbols)
diff --git a/tensorflow/contrib/estimator/python/estimator/boosted_trees.py b/tensorflow/contrib/estimator/python/estimator/boosted_trees.py
new file mode 100644
index 0000000000000000000000000000000000000000..314c54ed00372eca62ffc6930e6d492dd7d57163
--- /dev/null
+++ b/tensorflow/contrib/estimator/python/estimator/boosted_trees.py
@@ -0,0 +1,323 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Boosted Trees estimators."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.estimator import estimator
+from tensorflow.python.estimator.canned import boosted_trees as canned_boosted_trees
+
+
+class _BoostedTreesEstimator(estimator.Estimator):
+  """An Estimator for Tensorflow Boosted Trees models."""
+
+  def __init__(self,
+               feature_columns,
+               n_batches_per_layer,
+               head,
+               model_dir=None,
+               weight_column=None,
+               n_trees=100,
+               max_depth=6,
+               learning_rate=0.1,
+               l1_regularization=0.,
+               l2_regularization=0.,
+               tree_complexity=0.,
+               config=None):
+    """Initializes a `BoostedTreesEstimator` instance.
+
+    Args:
+      feature_columns: An iterable containing all the feature columns used by
+        the model. All items in the set should be instances of classes derived
+        from `FeatureColumn`.
+      n_batches_per_layer: the number of batches to collect statistics per
+        layer.
+      head: the `Head` instance defined for Estimator.
+      model_dir: Directory to save model parameters, graph and etc. This can
+        also be used to load checkpoints from the directory into a estimator
+        to continue training a previously saved model.
+      weight_column: A string or a `_NumericColumn` created by
+        `tf.feature_column.numeric_column` defining feature column representing
+        weights. It is used to downweight or boost examples during training. It
+        will be multiplied by the loss of the example. If it is a string, it is
+        used as a key to fetch weight tensor from the `features`. If it is a
+        `_NumericColumn`, raw tensor is fetched by key `weight_column.key`,
+        then weight_column.normalizer_fn is applied on it to get weight tensor.
+      n_trees: number trees to be created.
+      max_depth: maximum depth of the tree to grow.
+      learning_rate: shrinkage parameter to be used when a tree added to the
+        model.
+      l1_regularization: regularization multiplier applied to the absolute
+        weights of the tree leafs.
+      l2_regularization: regularization multiplier applied to the square weights
+        of the tree leafs.
+      tree_complexity: regularization factor to penalize trees with more leaves.
+      config: `RunConfig` object to configure the runtime settings.
+    """
+    # pylint:disable=protected-access
+    # HParams for the model.
+    tree_hparams = canned_boosted_trees._TreeHParams(
+        n_trees, max_depth, learning_rate, l1_regularization, l2_regularization,
+        tree_complexity)
+
+    def _model_fn(features, labels, mode, config):
+      return canned_boosted_trees._bt_model_fn(
+          features, labels, mode, head, feature_columns, tree_hparams,
+          n_batches_per_layer, config)
+
+    super(_BoostedTreesEstimator, self).__init__(
+        model_fn=_model_fn, model_dir=model_dir, config=config)
+    # pylint:enable=protected-access
+
+
+def boosted_trees_classifier_train_in_memory(
+    train_input_fn,
+    feature_columns,
+    model_dir=None,
+    n_classes=canned_boosted_trees._HOLD_FOR_MULTI_CLASS_SUPPORT,
+    weight_column=None,
+    label_vocabulary=None,
+    n_trees=100,
+    max_depth=6,
+    learning_rate=0.1,
+    l1_regularization=0.,
+    l2_regularization=0.,
+    tree_complexity=0.,
+    config=None,
+    train_hooks=None):
+  """Trains a boosted tree classifier with in memory dataset.
+
+  Example:
+
+  ```python
+  bucketized_feature_1 = bucketized_column(
+    numeric_column('feature_1'), BUCKET_BOUNDARIES_1)
+  bucketized_feature_2 = bucketized_column(
+    numeric_column('feature_2'), BUCKET_BOUNDARIES_2)
+
+  def input_fn_train():
+    dataset = create-dataset-from-training-data
+    # Don't use repeat or cache, since it is assumed to be one epoch
+    # This is either tf.data.Dataset, or a tuple of feature dict and label.
+    return dataset
+
+  classifier = boosted_trees_classifier_train_in_memory(
+      train_input_fn,
+      feature_columns=[bucketized_feature_1, bucketized_feature_2],
+      n_trees=100,
+      ... <some other params>
+  )
+
+  def input_fn_eval():
+    ...
+    return dataset
+
+  metrics = classifier.evaluate(input_fn=input_fn_eval, steps=10)
+  ```
+
+  Args:
+    train_input_fn: the input function returns a dataset containing a single
+      epoch of *unbatched* features and labels.
+    feature_columns: An iterable containing all the feature columns used by
+      the model. All items in the set should be instances of classes derived
+      from `FeatureColumn`.
+    model_dir: Directory to save model parameters, graph and etc. This can
+      also be used to load checkpoints from the directory into a estimator
+      to continue training a previously saved model.
+    n_classes: number of label classes. Default is binary classification.
+      Multiclass support is not yet implemented.
+    weight_column: A string or a `_NumericColumn` created by
+      `tf.feature_column.numeric_column` defining feature column representing
+      weights. It is used to downweight or boost examples during training. It
+      will be multiplied by the loss of the example. If it is a string, it is
+      used as a key to fetch weight tensor from the `features`. If it is a
+      `_NumericColumn`, raw tensor is fetched by key `weight_column.key`,
+      then weight_column.normalizer_fn is applied on it to get weight tensor.
+    label_vocabulary: A list of strings represents possible label values. If
+      given, labels must be string type and have any value in
+      `label_vocabulary`. If it is not given, that means labels are
+      already encoded as integer or float within [0, 1] for `n_classes=2` and
+      encoded as integer values in {0, 1,..., n_classes-1} for `n_classes`>2 .
+      Also there will be errors if vocabulary is not provided and labels are
+      string.
+    n_trees: number trees to be created.
+    max_depth: maximum depth of the tree to grow.
+    learning_rate: shrinkage parameter to be used when a tree added to the
+      model.
+    l1_regularization: regularization multiplier applied to the absolute
+      weights of the tree leafs.
+    l2_regularization: regularization multiplier applied to the square weights
+      of the tree leafs.
+    tree_complexity: regularization factor to penalize trees with more leaves.
+    config: `RunConfig` object to configure the runtime settings.
+    train_hooks: a list of Hook instances to be passed to estimator.train().
+
+  Returns:
+    a `BoostedTreesClassifier` instance created with the given arguments and
+      trained with the data loaded up on memory from the input_fn.
+
+  Raises:
+    ValueError: when wrong arguments are given or unsupported functionalities
+       are requested.
+  """
+  # pylint: disable=protected-access
+  # TODO(nponomareva): Support multi-class cases.
+  if n_classes == canned_boosted_trees._HOLD_FOR_MULTI_CLASS_SUPPORT:
+    n_classes = 2
+  head, closed_form = (
+      canned_boosted_trees._create_classification_head_and_closed_form(
+          n_classes, weight_column, label_vocabulary=label_vocabulary))
+
+  # HParams for the model.
+  tree_hparams = canned_boosted_trees._TreeHParams(
+      n_trees, max_depth, learning_rate, l1_regularization, l2_regularization,
+      tree_complexity)
+
+  def _model_fn(features, labels, mode, config):
+    return canned_boosted_trees._bt_model_fn(
+        features,
+        labels,
+        mode,
+        head,
+        feature_columns,
+        tree_hparams,
+        n_batches_per_layer=1,
+        config=config,
+        closed_form_grad_and_hess_fn=closed_form,
+        train_in_memory=True)
+
+  in_memory_classifier = estimator.Estimator(
+      model_fn=_model_fn, model_dir=model_dir, config=config)
+
+  in_memory_classifier.train(input_fn=train_input_fn, hooks=train_hooks)
+
+  return in_memory_classifier
+  # pylint: enable=protected-access
+
+
+def boosted_trees_regressor_train_in_memory(
+    train_input_fn,
+    feature_columns,
+    model_dir=None,
+    label_dimension=canned_boosted_trees._HOLD_FOR_MULTI_DIM_SUPPORT,
+    weight_column=None,
+    n_trees=100,
+    max_depth=6,
+    learning_rate=0.1,
+    l1_regularization=0.,
+    l2_regularization=0.,
+    tree_complexity=0.,
+    config=None,
+    train_hooks=None):
+  """Trains a boosted tree regressor with in memory dataset.
+
+  Example:
+
+  ```python
+  bucketized_feature_1 = bucketized_column(
+    numeric_column('feature_1'), BUCKET_BOUNDARIES_1)
+  bucketized_feature_2 = bucketized_column(
+    numeric_column('feature_2'), BUCKET_BOUNDARIES_2)
+
+  def input_fn_train():
+    dataset = create-dataset-from-training-data
+    # Don't use repeat or cache, since it is assumed to be one epoch
+    # This is either tf.data.Dataset, or a tuple of feature dict and label.
+    return dataset
+
+  regressor = boosted_trees_regressor_train_in_memory(
+      train_input_fn,
+      feature_columns=[bucketized_feature_1, bucketized_feature_2],
+      n_trees=100,
+      ... <some other params>
+  )
+
+  def input_fn_eval():
+    ...
+    return dataset
+
+  metrics = regressor.evaluate(input_fn=input_fn_eval, steps=10)
+  ```
+
+  Args:
+    train_input_fn: the input function returns a dataset containing a single
+      epoch of *unbatched* features and labels.
+    feature_columns: An iterable containing all the feature columns used by
+      the model. All items in the set should be instances of classes derived
+      from `FeatureColumn`.
+    model_dir: Directory to save model parameters, graph and etc. This can
+      also be used to load checkpoints from the directory into a estimator
+      to continue training a previously saved model.
+    label_dimension: Number of regression targets per example.
+      Multi-dimensional support is not yet implemented.
+    weight_column: A string or a `_NumericColumn` created by
+      `tf.feature_column.numeric_column` defining feature column representing
+      weights. It is used to downweight or boost examples during training. It
+      will be multiplied by the loss of the example. If it is a string, it is
+      used as a key to fetch weight tensor from the `features`. If it is a
+      `_NumericColumn`, raw tensor is fetched by key `weight_column.key`,
+      then weight_column.normalizer_fn is applied on it to get weight tensor.
+    n_trees: number trees to be created.
+    max_depth: maximum depth of the tree to grow.
+    learning_rate: shrinkage parameter to be used when a tree added to the
+      model.
+    l1_regularization: regularization multiplier applied to the absolute
+      weights of the tree leafs.
+    l2_regularization: regularization multiplier applied to the square weights
+      of the tree leafs.
+    tree_complexity: regularization factor to penalize trees with more leaves.
+    config: `RunConfig` object to configure the runtime settings.
+    train_hooks: a list of Hook instances to be passed to estimator.train().
+
+  Returns:
+    a `BoostedTreesClassifier` instance created with the given arguments and
+      trained with the data loaded up on memory from the input_fn.
+
+  Raises:
+    ValueError: when wrong arguments are given or unsupported functionalities
+       are requested.
+  """
+  # pylint: disable=protected-access
+  # TODO(nponomareva): Extend it to multi-dimension cases.
+  if label_dimension == canned_boosted_trees._HOLD_FOR_MULTI_DIM_SUPPORT:
+    label_dimension = 1
+  head = canned_boosted_trees._create_regression_head(label_dimension,
+                                                      weight_column)
+
+  # HParams for the model.
+  tree_hparams = canned_boosted_trees._TreeHParams(
+      n_trees, max_depth, learning_rate, l1_regularization, l2_regularization,
+      tree_complexity)
+
+  def _model_fn(features, labels, mode, config):
+    return canned_boosted_trees._bt_model_fn(
+        features,
+        labels,
+        mode,
+        head,
+        feature_columns,
+        tree_hparams,
+        n_batches_per_layer=1,
+        config=config,
+        train_in_memory=True)
+
+  in_memory_regressor = estimator.Estimator(
+      model_fn=_model_fn, model_dir=model_dir, config=config)
+
+  in_memory_regressor.train(input_fn=train_input_fn, hooks=train_hooks)
+
+  return in_memory_regressor
+  # pylint: enable=protected-access
diff --git a/tensorflow/contrib/estimator/python/estimator/boosted_trees_test.py b/tensorflow/contrib/estimator/python/estimator/boosted_trees_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..eee59106876f6c44725bcbba1ef3d3c803475dbf
--- /dev/null
+++ b/tensorflow/contrib/estimator/python/estimator/boosted_trees_test.py
@@ -0,0 +1,176 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests boosted_trees estimators."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.estimator.python.estimator import boosted_trees
+from tensorflow.core.kernels.boosted_trees import boosted_trees_pb2
+from tensorflow.python.estimator.canned import boosted_trees as canned_boosted_trees
+from tensorflow.python.estimator.inputs import numpy_io
+from tensorflow.python.feature_column import feature_column
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.platform import googletest
+from tensorflow.python.training import checkpoint_utils
+
+NUM_FEATURES = 3
+
+BUCKET_BOUNDARIES = [-2., .5, 12.]  # Boundaries for all the features.
+INPUT_FEATURES = np.array(
+    [
+        [12.5, 1.0, -2.001, -2.0001, -1.999],  # feature_0 quantized:[3,2,0,0,1]
+        [2.0, -3.0, 0.5, 0.0, 0.4995],         # feature_1 quantized:[2,0,2,1,1]
+        [3.0, 20.0, 50.0, -100.0, 102.75],     # feature_2 quantized:[2,3,3,0,3]
+    ],
+    dtype=np.float32)
+CLASSIFICATION_LABELS = [[0.], [1.], [1.], [0.], [0.]]
+REGRESSION_LABELS = [[1.5], [0.3], [0.2], [2.], [5.]]
+FEATURES_DICT = {'f_%d' % i: INPUT_FEATURES[i] for i in range(NUM_FEATURES)}
+
+
+def _make_train_input_fn(is_classification):
+  """Makes train input_fn for classification/regression."""
+
+  def _input_fn():
+    features = dict(FEATURES_DICT)
+    if is_classification:
+      labels = CLASSIFICATION_LABELS
+    else:
+      labels = REGRESSION_LABELS
+    return features, labels
+
+  return _input_fn
+
+
+class BoostedTreesEstimatorTest(test_util.TensorFlowTestCase):
+
+  def setUp(self):
+    self._head = canned_boosted_trees._create_regression_head(label_dimension=1)
+    self._feature_columns = {
+        feature_column.bucketized_column(
+            feature_column.numeric_column('f_%d' % i, dtype=dtypes.float32),
+            BUCKET_BOUNDARIES)
+        for i in range(NUM_FEATURES)
+    }
+
+  def _assert_checkpoint(self, model_dir, global_step, finalized_trees,
+                         attempted_layers):
+    reader = checkpoint_utils.load_checkpoint(model_dir)
+    self.assertEqual(global_step, reader.get_tensor(ops.GraphKeys.GLOBAL_STEP))
+    serialized = reader.get_tensor('boosted_trees:0_serialized')
+    ensemble_proto = boosted_trees_pb2.TreeEnsemble()
+    ensemble_proto.ParseFromString(serialized)
+    self.assertEqual(
+        finalized_trees,
+        sum([1 for t in ensemble_proto.tree_metadata if t.is_finalized]))
+    self.assertEqual(attempted_layers,
+                     ensemble_proto.growing_metadata.num_layers_attempted)
+
+  def testTrainAndEvaluateEstimator(self):
+    input_fn = _make_train_input_fn(is_classification=False)
+
+    est = boosted_trees._BoostedTreesEstimator(
+        feature_columns=self._feature_columns,
+        n_batches_per_layer=1,
+        n_trees=2,
+        head=self._head,
+        max_depth=5)
+
+    # It will stop after 10 steps because of the max depth and num trees.
+    num_steps = 100
+    # Train for a few steps, and validate final checkpoint.
+    est.train(input_fn, steps=num_steps)
+    self._assert_checkpoint(
+        est.model_dir, global_step=10, finalized_trees=2, attempted_layers=10)
+    eval_res = est.evaluate(input_fn=input_fn, steps=1)
+    self.assertAllClose(eval_res['average_loss'], 1.008551)
+
+  def testInferEstimator(self):
+    train_input_fn = _make_train_input_fn(is_classification=False)
+    predict_input_fn = numpy_io.numpy_input_fn(
+        x=FEATURES_DICT, y=None, batch_size=1, num_epochs=1, shuffle=False)
+
+    est = boosted_trees._BoostedTreesEstimator(
+        feature_columns=self._feature_columns,
+        n_batches_per_layer=1,
+        n_trees=1,
+        max_depth=5,
+        head=self._head)
+
+    # It will stop after 5 steps because of the max depth and num trees.
+    num_steps = 100
+    # Train for a few steps, and validate final checkpoint.
+    est.train(train_input_fn, steps=num_steps)
+    self._assert_checkpoint(
+        est.model_dir, global_step=5, finalized_trees=1, attempted_layers=5)
+    # Validate predictions.
+    predictions = list(est.predict(input_fn=predict_input_fn))
+    self.assertAllClose(
+        [[0.571619], [0.262821], [0.124549], [0.956801], [1.769801]],
+        [pred['predictions'] for pred in predictions])
+
+  def testBinaryClassifierTrainInMemoryAndEvalAndInfer(self):
+    train_input_fn = _make_train_input_fn(is_classification=True)
+    predict_input_fn = numpy_io.numpy_input_fn(
+        x=FEATURES_DICT, y=None, batch_size=1, num_epochs=1, shuffle=False)
+
+    est = boosted_trees.boosted_trees_classifier_train_in_memory(
+        train_input_fn=train_input_fn,
+        feature_columns=self._feature_columns,
+        n_trees=1,
+        max_depth=5)
+    # It will stop after 5 steps because of the max depth and num trees.
+    self._assert_checkpoint(
+        est.model_dir, global_step=5, finalized_trees=1, attempted_layers=5)
+
+    # Check eval.
+    eval_res = est.evaluate(input_fn=train_input_fn, steps=1)
+    self.assertAllClose(eval_res['accuracy'], 1.0)
+    # Validate predictions.
+    predictions = list(est.predict(input_fn=predict_input_fn))
+    self.assertAllClose([[0], [1], [1], [0], [0]],
+                        [pred['class_ids'] for pred in predictions])
+
+  def testRegressorTrainInMemoryAndEvalAndInfer(self):
+    train_input_fn = _make_train_input_fn(is_classification=False)
+    predict_input_fn = numpy_io.numpy_input_fn(
+        x=FEATURES_DICT, y=None, batch_size=1, num_epochs=1, shuffle=False)
+
+    est = boosted_trees.boosted_trees_regressor_train_in_memory(
+        train_input_fn=train_input_fn,
+        feature_columns=self._feature_columns,
+        n_trees=1,
+        max_depth=5)
+    # It will stop after 5 steps because of the max depth and num trees.
+    self._assert_checkpoint(
+        est.model_dir, global_step=5, finalized_trees=1, attempted_layers=5)
+
+    # Check eval.
+    eval_res = est.evaluate(input_fn=train_input_fn, steps=1)
+    self.assertAllClose(eval_res['average_loss'], 2.478283)
+    # Validate predictions.
+    predictions = list(est.predict(input_fn=predict_input_fn))
+    self.assertAllClose(
+        [[0.571619], [0.262821], [0.124549], [0.956801], [1.769801]],
+        [pred['predictions'] for pred in predictions])
+
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/contrib/estimator/python/estimator/dnn_linear_combined_test.py b/tensorflow/contrib/estimator/python/estimator/dnn_linear_combined_test.py
index b5e4d34dc70ccaa4806ae8b8ed5001bd971ee7b4..dd009a6753f3231638f93e50fc8f19eae8820139 100644
--- a/tensorflow/contrib/estimator/python/estimator/dnn_linear_combined_test.py
+++ b/tensorflow/contrib/estimator/python/estimator/dnn_linear_combined_test.py
@@ -34,6 +34,7 @@ from tensorflow.python.estimator.inputs import numpy_io
 from tensorflow.python.feature_column import feature_column
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import nn
+from tensorflow.python.ops.losses import losses
 from tensorflow.python.platform import gfile
 from tensorflow.python.platform import test
 from tensorflow.python.summary.writer import writer_cache
@@ -52,7 +53,9 @@ def _dnn_only_estimator_fn(
     config=None):
   return dnn_linear_combined.DNNLinearCombinedEstimator(
       head=head_lib.regression_head(
-          weight_column=weight_column, label_dimension=label_dimension),
+          weight_column=weight_column, label_dimension=label_dimension,
+          # Tests in core (from which this test inherits) test the sum loss.
+          loss_reduction=losses.Reduction.SUM),
       model_dir=model_dir,
       dnn_feature_columns=feature_columns,
       dnn_optimizer=optimizer,
@@ -100,7 +103,9 @@ def _linear_only_estimator_fn(
     partitioner=None):
   return dnn_linear_combined.DNNLinearCombinedEstimator(
       head=head_lib.regression_head(
-          weight_column=weight_column, label_dimension=label_dimension),
+          weight_column=weight_column, label_dimension=label_dimension,
+          # Tests in core (from which this test inherits) test the sum loss.
+          loss_reduction=losses.Reduction.SUM),
       model_dir=model_dir,
       linear_feature_columns=feature_columns,
       linear_optimizer=optimizer,
diff --git a/tensorflow/contrib/estimator/python/estimator/dnn_test.py b/tensorflow/contrib/estimator/python/estimator/dnn_test.py
index 71f810acec856d42d389260e7b9fea32123348b4..75e3107670d658e55ce23d983e47311f1c180104 100644
--- a/tensorflow/contrib/estimator/python/estimator/dnn_test.py
+++ b/tensorflow/contrib/estimator/python/estimator/dnn_test.py
@@ -32,6 +32,7 @@ from tensorflow.python.estimator.export import export
 from tensorflow.python.estimator.inputs import numpy_io
 from tensorflow.python.feature_column import feature_column
 from tensorflow.python.framework import ops
+from tensorflow.python.ops.losses import losses
 from tensorflow.python.platform import gfile
 from tensorflow.python.platform import test
 from tensorflow.python.summary.writer import writer_cache
@@ -41,7 +42,9 @@ def _dnn_estimator_fn(weight_column=None, label_dimension=1, *args, **kwargs):
   """Returns a DNNEstimator that uses regression_head."""
   return dnn.DNNEstimator(
       head=head_lib.regression_head(
-          weight_column=weight_column, label_dimension=label_dimension),
+          weight_column=weight_column, label_dimension=label_dimension,
+          # Tests in core (from which this test inherits) test the sum loss.
+          loss_reduction=losses.Reduction.SUM),
       *args, **kwargs)
 
 
diff --git a/tensorflow/contrib/estimator/python/estimator/extenders.py b/tensorflow/contrib/estimator/python/estimator/extenders.py
index 266ae933052b11b9ab3edb662e95c90aae207dae..201699ed775f701bc9f215fff11a688175d51645 100644
--- a/tensorflow/contrib/estimator/python/estimator/extenders.py
+++ b/tensorflow/contrib/estimator/python/estimator/extenders.py
@@ -97,7 +97,10 @@ def add_metrics(estimator, metric_fn):
   return estimator_lib.Estimator(
       model_fn=new_model_fn,
       model_dir=estimator.model_dir,
-      config=estimator.config)
+      config=estimator.config,
+      # pylint: disable=protected-access
+      warm_start_from=estimator._warm_start_settings)
+      # pylint: enable=protected-access
 
 
 def clip_gradients_by_norm(optimizer, clip_norm):
diff --git a/tensorflow/contrib/estimator/python/estimator/head.py b/tensorflow/contrib/estimator/python/estimator/head.py
index 74da2cbb3f4557b4ddbbeb6debaae085407a0023..ae2fd8b4902e850292bd6672ec34f084a230dfbe 100644
--- a/tensorflow/contrib/estimator/python/estimator/head.py
+++ b/tensorflow/contrib/estimator/python/estimator/head.py
@@ -41,11 +41,10 @@ from tensorflow.python.training import training_util
 _DEFAULT_SERVING_KEY = signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
 
 
-# TODO(b/65403806): Switch loss_reduction default to SUM_OVER_BATCH_SIZE.
 def multi_class_head(n_classes,
                      weight_column=None,
                      label_vocabulary=None,
-                     loss_reduction=losses.Reduction.SUM,
+                     loss_reduction=losses.Reduction.SUM_OVER_BATCH_SIZE,
                      loss_fn=None,
                      name=None):
   """Creates a `_Head` for multi class classification.
@@ -86,7 +85,8 @@ def multi_class_head(n_classes,
       have any value in `label_vocabulary`. Note that errors will be raised if
       `label_vocabulary` is not provided but labels are strings.
     loss_reduction: One of `tf.losses.Reduction` except `NONE`. Describes how to
-      reduce training loss over batch. Defaults to `SUM`.
+      reduce training loss over batch. Defaults to `SUM_OVER_BATCH_SIZE`, namely
+      weighted sum of losses divided by batch size. See `tf.losses.Reduction`.
     loss_fn: Optional loss function.
     name: name of the head. If provided, summary and metrics keys will be
       suffixed by `"/" + name`. Also used as `name_scope` when creating ops.
@@ -111,7 +111,7 @@ def binary_classification_head(
     weight_column=None,
     thresholds=None,
     label_vocabulary=None,
-    loss_reduction=losses.Reduction.SUM,
+    loss_reduction=losses.Reduction.SUM_OVER_BATCH_SIZE,
     loss_fn=None,
     name=None):
   """Creates a `_Head` for single label binary classification.
@@ -155,7 +155,8 @@ def binary_classification_head(
       `label_vocabulary`. Note that errors will be raised if `label_vocabulary`
       is not provided but labels are strings.
     loss_reduction: One of `tf.losses.Reduction` except `NONE`. Describes how to
-      reduce training loss over batch. Defaults to `SUM`.
+      reduce training loss over batch. Defaults to `SUM_OVER_BATCH_SIZE`, namely
+      weighted sum of losses divided by batch size. See `tf.losses.Reduction`.
     loss_fn: Optional loss function.
     name: name of the head. If provided, summary and metrics keys will be
       suffixed by `"/" + name`. Also used as `name_scope` when creating ops.
@@ -178,7 +179,7 @@ def binary_classification_head(
 
 def regression_head(weight_column=None,
                     label_dimension=1,
-                    loss_reduction=losses.Reduction.SUM,
+                    loss_reduction=losses.Reduction.SUM_OVER_BATCH_SIZE,
                     loss_fn=None,
                     inverse_link_fn=None,
                     name=None):
@@ -218,7 +219,9 @@ def regression_head(weight_column=None,
       of the last dimension of the labels `Tensor` (typically, this has shape
       `[batch_size, label_dimension]`).
     loss_reduction: One of `tf.losses.Reduction` except `NONE`. Describes how to
-      reduce training loss over batch. Defaults to `SUM`.
+      reduce training loss over batch and label dimension. Defaults to
+      `SUM_OVER_BATCH_SIZE`, namely weighted sum of losses divided by
+      `batch size * label_dimension`. See `tf.losses.Reduction`.
     loss_fn: Optional loss function. Defaults to `mean_squared_error`.
     inverse_link_fn: Optional inverse link function, also known as 'mean
       function'. Defaults to identity.
@@ -243,7 +246,7 @@ def regression_head(weight_column=None,
 def poisson_regression_head(
     weight_column=None,
     label_dimension=1,
-    loss_reduction=losses.Reduction.SUM,
+    loss_reduction=losses.Reduction.SUM_OVER_BATCH_SIZE,
     compute_full_loss=True,
     name=None):
   """Creates a `_Head` for poisson regression using `tf.nn.log_poisson_loss`.
@@ -275,7 +278,9 @@ def poisson_regression_head(
       of the last dimension of the labels `Tensor` (typically, this has shape
       `[batch_size, label_dimension]`).
     loss_reduction: One of `tf.losses.Reduction` except `NONE`. Describes how to
-      reduce training loss over batch. Defaults to `SUM`.
+      reduce training loss over batch and label dimension. Defaults to
+      `SUM_OVER_BATCH_SIZE`, namely weighted sum of losses divided by
+      `batch size * label_dimension`. See `tf.losses.Reduction`.
     compute_full_loss: Whether to include the constant `log(z!)` term in
       computing the poisson loss. See `tf.nn.log_poisson_loss` for the full
       documentation.
diff --git a/tensorflow/contrib/estimator/python/estimator/head_test.py b/tensorflow/contrib/estimator/python/estimator/head_test.py
index 8837dfdc6c2d83495157f0d30b80ac8f6f245c60..98962ca4277a3e8fbbdb3fb2d26df9acc45168b5 100644
--- a/tensorflow/contrib/estimator/python/estimator/head_test.py
+++ b/tensorflow/contrib/estimator/python/estimator/head_test.py
@@ -1162,8 +1162,8 @@ class PoissonRegressionHead(test.TestCase):
     #         exp(-1) - 2 * (-1) + 2*ln(2) - 2 + 0.5*ln(2*pi*2),
     #         exp(1) - 3 * 1 + 3*ln(3) - 3 + 0.5*ln(2*pi*3)]
     #      = [1.0, 3.020, 1.482]
-    # sum_loss = 5.502
-    expected_loss = 5.502
+    # training_loss = (1.0 + 3.020 + 1.482) / 3
+    expected_loss = 1.834
     atol = 0.001
     expected_train_result = b'my_train_op'
     def _train_op_fn(loss):
diff --git a/tensorflow/contrib/estimator/python/estimator/linear_test.py b/tensorflow/contrib/estimator/python/estimator/linear_test.py
index c63514eb688af48577f0a3b7ce9e7478309f2c30..c41996b9c6871d294f157411662f2eb9d4c09e5c 100644
--- a/tensorflow/contrib/estimator/python/estimator/linear_test.py
+++ b/tensorflow/contrib/estimator/python/estimator/linear_test.py
@@ -32,6 +32,7 @@ from tensorflow.python.estimator.export import export
 from tensorflow.python.estimator.inputs import numpy_io
 from tensorflow.python.feature_column import feature_column
 from tensorflow.python.framework import ops
+from tensorflow.python.ops.losses import losses
 from tensorflow.python.platform import gfile
 from tensorflow.python.platform import test
 from tensorflow.python.summary.writer import writer_cache
@@ -42,7 +43,9 @@ def _linear_estimator_fn(
   """Returns a LinearEstimator that uses regression_head."""
   return linear.LinearEstimator(
       head=head_lib.regression_head(
-          weight_column=weight_column, label_dimension=label_dimension),
+          weight_column=weight_column, label_dimension=label_dimension,
+          # Tests in core (from which this test inherits) test the sum loss.
+          loss_reduction=losses.Reduction.SUM),
       *args, **kwargs)
 
 
diff --git a/tensorflow/contrib/estimator/python/estimator/multi_head.py b/tensorflow/contrib/estimator/python/estimator/multi_head.py
index bbbc19cc4dfb4b23f9b707023fbfdd124f1f48de..ce758992140d43529037b14cbbf958d5aa763fb4 100644
--- a/tensorflow/contrib/estimator/python/estimator/multi_head.py
+++ b/tensorflow/contrib/estimator/python/estimator/multi_head.py
@@ -345,7 +345,7 @@ class _MultiHead(head_lib._Head):  # pylint:disable=protected-access
         if k == _DEFAULT_SERVING_KEY:
           key = head_name
         else:
-          key = '%s/%s' % (k, head_name)
+          key = '%s/%s' % (head_name, k)
         export_outputs[key] = v
         if (k == head_lib._PREDICT_SERVING_KEY and  # pylint:disable=protected-access
             isinstance(v, export_output_lib.PredictOutput)):
diff --git a/tensorflow/contrib/estimator/python/estimator/multi_head_test.py b/tensorflow/contrib/estimator/python/estimator/multi_head_test.py
index 74d3d6d728554587290301b6ddd5b9aaeb8cebac..3d6fccb1180c435f64552667306be004437f62ba 100644
--- a/tensorflow/contrib/estimator/python/estimator/multi_head_test.py
+++ b/tensorflow/contrib/estimator/python/estimator/multi_head_test.py
@@ -127,8 +127,8 @@ class MultiHeadTest(test.TestCase):
         logits=logits)
 
     self.assertItemsEqual(
-        (_DEFAULT_SERVING_KEY, 'predict', 'head1', 'classification/head1',
-         'predict/head1', 'head2', 'classification/head2', 'predict/head2'),
+        (_DEFAULT_SERVING_KEY, 'predict', 'head1', 'head1/classification',
+         'head1/predict', 'head2', 'head2/classification', 'head2/predict'),
         spec.export_outputs.keys())
 
     # Assert predictions and export_outputs.
@@ -169,11 +169,11 @@ class MultiHeadTest(test.TestCase):
       self.assertAllClose(
           expected_probabilities['head1'],
           sess.run(
-              spec.export_outputs['predict/head1'].outputs['probabilities']))
+              spec.export_outputs['head1/predict'].outputs['probabilities']))
       self.assertAllClose(
           expected_probabilities['head2'],
           sess.run(
-              spec.export_outputs['predict/head2'].outputs['probabilities']))
+              spec.export_outputs['head2/predict'].outputs['probabilities']))
 
   def test_predict_two_heads_logits_tensor(self):
     """Tests predict with logits as Tensor."""
@@ -197,8 +197,8 @@ class MultiHeadTest(test.TestCase):
         logits=logits)
 
     self.assertItemsEqual(
-        (_DEFAULT_SERVING_KEY, 'predict', 'head1', 'classification/head1',
-         'predict/head1', 'head2', 'classification/head2', 'predict/head2'),
+        (_DEFAULT_SERVING_KEY, 'predict', 'head1', 'head1/classification',
+         'head1/predict', 'head2', 'head2/classification', 'head2/predict'),
         spec.export_outputs.keys())
 
     # Assert predictions and export_outputs.
@@ -254,8 +254,8 @@ class MultiHeadTest(test.TestCase):
         logits=logits)
 
     self.assertItemsEqual(
-        (_DEFAULT_SERVING_KEY, 'predict', 'head1', 'regression/head1',
-         'predict/head1', 'head2', 'regression/head2', 'predict/head2'),
+        (_DEFAULT_SERVING_KEY, 'predict', 'head1', 'head1/regression',
+         'head1/predict', 'head2', 'head2/regression', 'head2/predict'),
         spec.export_outputs.keys())
 
     # Assert predictions and export_outputs.
@@ -483,14 +483,14 @@ class MultiHeadTest(test.TestCase):
                            [[2., 2., 0.], [2., 2., 0.]]], dtype=np.float32),
     }
     # Loss for the first head:
-    # loss1 = (1+1)^2 + (0-1)^2 + (1+1)^2 + (0-1)^2 +
-    #         (1.5+1.5)^2 + (1.5-1.5)^2 + (1.5+1.5)^2 + (1.5-1.5)^2
-    #       = 28
+    # loss1 = ((1+1)^2 + (0-1)^2 + (1+1)^2 + (0-1)^2 +
+    #          (1.5+1.5)^2 + (1.5-1.5)^2 + (1.5+1.5)^2 + (1.5-1.5)^2) / 8
+    #       = 3.5
     # Loss for the second head:
-    # loss2 = (0-2)^2 + (1+2)^2 + (0-2)^2 + (0-2)^2 + (1+2)^2 + (0-2)^2 +
-    #         (2+2)^2 + (2-2)^2 + (0+2)^2 + (2+2)^2 + (2-2)^2 + (0+2)^2
-    #       = 74
-    expected_training_loss = 28. + 74.
+    # loss2 = ((0-2)^2 + (1+2)^2 + (0-2)^2 + (0-2)^2 + (1+2)^2 + (0-2)^2 +
+    #          (2+2)^2 + (2-2)^2 + (0+2)^2 + (2+2)^2 + (2-2)^2 + (0+2)^2) / 12
+    #       = 6.167
+    expected_training_loss = 3.5 + 6.167
 
     training_loss = multi_head.create_loss(
         features={},
diff --git a/tensorflow/contrib/estimator/python/estimator/rnn.py b/tensorflow/contrib/estimator/python/estimator/rnn.py
new file mode 100644
index 0000000000000000000000000000000000000000..b475c12f5af3aedc766a0880a98c5c1e29bddbb7
--- /dev/null
+++ b/tensorflow/contrib/estimator/python/estimator/rnn.py
@@ -0,0 +1,481 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Recurrent Neural Network estimators."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import six
+
+from tensorflow.contrib.estimator.python.estimator import extenders
+from tensorflow.contrib.feature_column.python.feature_column import sequence_feature_column as seq_fc
+from tensorflow.python.estimator import estimator
+from tensorflow.python.estimator.canned import head as head_lib
+from tensorflow.python.estimator.canned import optimizers
+from tensorflow.python.feature_column import feature_column as feature_column_lib
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.layers import core as core_layers
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import partitioned_variables
+from tensorflow.python.ops import rnn
+from tensorflow.python.ops import rnn_cell
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.summary import summary
+from tensorflow.python.training import optimizer as optimizer_lib
+from tensorflow.python.training import training_util
+
+
+# The defaults are historical artifacts of the initial implementation, but seem
+# reasonable choices.
+_DEFAULT_LEARNING_RATE = 0.05
+_DEFAULT_CLIP_NORM = 5.0
+
+_CELL_TYPES = {'basic_rnn': rnn_cell.BasicRNNCell,
+               'lstm': rnn_cell.BasicLSTMCell,
+               'gru': rnn_cell.GRUCell}
+
+# Indicates no value was provided by the user to a kwarg.
+USE_DEFAULT = object()
+
+
+def _single_rnn_cell(num_units, cell_type):
+  cell_type = _CELL_TYPES.get(cell_type, cell_type)
+  if not cell_type or not issubclass(cell_type, rnn_cell.RNNCell):
+    raise ValueError('Supported cell types are {}; got {}'.format(
+        list(_CELL_TYPES.keys()), cell_type))
+  return cell_type(num_units=num_units)
+
+
+def _make_rnn_cell_fn(num_units, cell_type='basic_rnn'):
+  """Convenience function to create `rnn_cell_fn` for canned RNN Estimators.
+
+  Args:
+    num_units: Iterable of integer number of hidden units per RNN layer.
+    cell_type: A subclass of `tf.nn.rnn_cell.RNNCell` or a string specifying
+      the cell type. Supported strings are: `'basic_rnn'`, `'lstm'`, and
+      `'gru'`.
+
+  Returns:
+    A function that takes a single argument, an instance of
+    `tf.estimator.ModeKeys`, and returns an instance derived from
+    `tf.nn.rnn_cell.RNNCell`.
+
+  Raises:
+    ValueError: If cell_type is not supported.
+  """
+  def rnn_cell_fn(mode):
+    # Unused. Part of the rnn_cell_fn interface since user specified functions
+    # may need different behavior across modes (e.g. dropout).
+    del mode
+    cells = [_single_rnn_cell(n, cell_type) for n in num_units]
+    if len(cells) == 1:
+      return cells[0]
+    return rnn_cell.MultiRNNCell(cells)
+  return rnn_cell_fn
+
+
+def _concatenate_context_input(sequence_input, context_input):
+  """Replicates `context_input` across all timesteps of `sequence_input`.
+
+  Expands dimension 1 of `context_input` then tiles it `sequence_length` times.
+  This value is appended to `sequence_input` on dimension 2 and the result is
+  returned.
+
+  Args:
+    sequence_input: A `Tensor` of dtype `float32` and shape `[batch_size,
+      padded_length, d0]`.
+    context_input: A `Tensor` of dtype `float32` and shape `[batch_size, d1]`.
+
+  Returns:
+    A `Tensor` of dtype `float32` and shape `[batch_size, padded_length,
+    d0 + d1]`.
+
+  Raises:
+    ValueError: If `sequence_input` does not have rank 3 or `context_input` does
+      not have rank 2.
+  """
+  seq_rank_check = check_ops.assert_rank(
+      sequence_input,
+      3,
+      message='sequence_input must have rank 3',
+      data=[array_ops.shape(sequence_input)])
+  seq_type_check = check_ops.assert_type(
+      sequence_input,
+      dtypes.float32,
+      message='sequence_input must have dtype float32; got {}.'.format(
+          sequence_input.dtype))
+  ctx_rank_check = check_ops.assert_rank(
+      context_input,
+      2,
+      message='context_input must have rank 2',
+      data=[array_ops.shape(context_input)])
+  ctx_type_check = check_ops.assert_type(
+      context_input,
+      dtypes.float32,
+      message='context_input must have dtype float32; got {}.'.format(
+          context_input.dtype))
+  with ops.control_dependencies(
+      [seq_rank_check, seq_type_check, ctx_rank_check, ctx_type_check]):
+    padded_length = array_ops.shape(sequence_input)[1]
+    tiled_context_input = array_ops.tile(
+        array_ops.expand_dims(context_input, 1),
+        array_ops.concat([[1], [padded_length], [1]], 0))
+  return array_ops.concat([sequence_input, tiled_context_input], 2)
+
+
+def _select_last_activations(activations, sequence_lengths):
+  """Selects the nth set of activations for each n in `sequence_length`.
+
+  Returns a `Tensor` of shape `[batch_size, k]`. If `sequence_length` is not
+  `None`, then `output[i, :] = activations[i, sequence_length[i] - 1, :]`. If
+  `sequence_length` is `None`, then `output[i, :] = activations[i, -1, :]`.
+
+  Args:
+    activations: A `Tensor` with shape `[batch_size, padded_length, k]`.
+    sequence_lengths: A `Tensor` with shape `[batch_size]` or `None`.
+  Returns:
+    A `Tensor` of shape `[batch_size, k]`.
+  """
+  with ops.name_scope(
+      'select_last_activations', values=[activations, sequence_lengths]):
+    activations_shape = array_ops.shape(activations)
+    batch_size = activations_shape[0]
+    padded_length = activations_shape[1]
+    output_units = activations_shape[2]
+    if sequence_lengths is None:
+      sequence_lengths = padded_length
+    start_indices = math_ops.to_int64(
+        math_ops.range(batch_size) * padded_length)
+    last_indices = start_indices + sequence_lengths - 1
+    reshaped_activations = array_ops.reshape(
+        activations, [batch_size * padded_length, output_units])
+
+    last_activations = array_ops.gather(reshaped_activations, last_indices)
+    last_activations.set_shape([activations.shape[0], activations.shape[2]])
+    return last_activations
+
+
+def _rnn_logit_fn_builder(output_units, rnn_cell_fn, sequence_feature_columns,
+                          context_feature_columns, input_layer_partitioner):
+  """Function builder for a rnn logit_fn.
+
+  Args:
+    output_units: An int indicating the dimension of the logit layer.
+    rnn_cell_fn: A function with one argument, a `tf.estimator.ModeKeys`, and
+      returns an object of type `tf.nn.rnn_cell.RNNCell`.
+    sequence_feature_columns: An iterable containing the `FeatureColumn`s
+      that represent sequential input.
+    context_feature_columns: An iterable containing the `FeatureColumn`s
+      that represent contextual input.
+    input_layer_partitioner: Partitioner for input layer.
+
+  Returns:
+    A logit_fn (see below).
+
+  Raises:
+    ValueError: If output_units is not an int.
+  """
+  if not isinstance(output_units, int):
+    raise ValueError('output_units must be an int.  Given type: {}'.format(
+        type(output_units)))
+
+  def rnn_logit_fn(features, mode):
+    """Recurrent Neural Network logit_fn.
+
+    Args:
+      features: This is the first item returned from the `input_fn`
+                passed to `train`, `evaluate`, and `predict`. This should be a
+                single `Tensor` or `dict` of same.
+      mode: Optional. Specifies if this training, evaluation or prediction. See
+            `ModeKeys`.
+
+    Returns:
+      A `Tensor` representing the logits.
+    """
+    with variable_scope.variable_scope(
+        'sequence_input_layer',
+        values=tuple(six.itervalues(features)),
+        partitioner=input_layer_partitioner):
+      sequence_input, sequence_length = seq_fc.sequence_input_layer(
+          features=features, feature_columns=sequence_feature_columns)
+      summary.histogram('sequence_length', sequence_length)
+
+      if context_feature_columns:
+        context_input = feature_column_lib.input_layer(
+            features=features,
+            feature_columns=context_feature_columns)
+        sequence_input = _concatenate_context_input(sequence_input,
+                                                    context_input)
+
+    cell = rnn_cell_fn(mode)
+    # Ignore output state.
+    rnn_outputs, _ = rnn.dynamic_rnn(
+        cell=cell,
+        inputs=sequence_input,
+        dtype=dtypes.float32,
+        time_major=False)
+    last_activations = _select_last_activations(rnn_outputs, sequence_length)
+
+    with variable_scope.variable_scope('logits', values=(rnn_outputs,)):
+      logits = core_layers.dense(
+          last_activations,
+          units=output_units,
+          activation=None,
+          kernel_initializer=init_ops.glorot_uniform_initializer())
+    return logits
+
+  return rnn_logit_fn
+
+
+def _rnn_model_fn(features,
+                  labels,
+                  mode,
+                  head,
+                  rnn_cell_fn,
+                  sequence_feature_columns,
+                  context_feature_columns,
+                  optimizer='Adagrad',
+                  input_layer_partitioner=None,
+                  config=None):
+  """Recurrent Neural Net model_fn.
+
+  Args:
+    features: dict of `Tensor` and `SparseTensor` objects returned from
+      `input_fn`.
+    labels: `Tensor` of shape [batch_size, 1] or [batch_size] with labels.
+    mode: Defines whether this is training, evaluation or prediction.
+      See `ModeKeys`.
+    head: A `head_lib._Head` instance.
+    rnn_cell_fn: A function with one argument, a `tf.estimator.ModeKeys`, and
+      returns an object of type `tf.nn.rnn_cell.RNNCell`.
+    sequence_feature_columns: Iterable containing `FeatureColumn`s that
+      represent sequential model inputs.
+    context_feature_columns: Iterable containing `FeatureColumn`s that
+      represent model inputs not associated with a specific timestep.
+    optimizer: String, `tf.Optimizer` object, or callable that creates the
+      optimizer to use for training. If not specified, will use the Adagrad
+      optimizer with a default learning rate of 0.05 and gradient clip norm of
+      5.0.
+    input_layer_partitioner: Partitioner for input layer. Defaults
+      to `min_max_variable_partitioner` with `min_slice_size` 64 << 20.
+    config: `RunConfig` object to configure the runtime settings.
+
+  Returns:
+    An `EstimatorSpec` instance.
+
+  Raises:
+    ValueError: If mode or optimizer is invalid, or features has the wrong type.
+  """
+  if not isinstance(features, dict):
+    raise ValueError('features should be a dictionary of `Tensor`s. '
+                     'Given type: {}'.format(type(features)))
+
+  # If user does not provide an optimizer instance, use the optimizer specified
+  # by the string with default learning rate and gradient clipping.
+  if not isinstance(optimizer, optimizer_lib.Optimizer):
+    optimizer = optimizers.get_optimizer_instance(
+        optimizer, learning_rate=_DEFAULT_LEARNING_RATE)
+    optimizer = extenders.clip_gradients_by_norm(optimizer, _DEFAULT_CLIP_NORM)
+
+  num_ps_replicas = config.num_ps_replicas if config else 0
+  partitioner = partitioned_variables.min_max_variable_partitioner(
+      max_partitions=num_ps_replicas)
+  with variable_scope.variable_scope(
+      'rnn',
+      values=tuple(six.itervalues(features)),
+      partitioner=partitioner):
+    input_layer_partitioner = input_layer_partitioner or (
+        partitioned_variables.min_max_variable_partitioner(
+            max_partitions=num_ps_replicas,
+            min_slice_size=64 << 20))
+
+    logit_fn = _rnn_logit_fn_builder(
+        output_units=head.logits_dimension,
+        rnn_cell_fn=rnn_cell_fn,
+        sequence_feature_columns=sequence_feature_columns,
+        context_feature_columns=context_feature_columns,
+        input_layer_partitioner=input_layer_partitioner)
+    logits = logit_fn(features=features, mode=mode)
+
+    def _train_op_fn(loss):
+      """Returns the op to optimize the loss."""
+      return optimizer.minimize(
+          loss,
+          global_step=training_util.get_global_step())
+
+    return head.create_estimator_spec(
+        features=features,
+        mode=mode,
+        labels=labels,
+        train_op_fn=_train_op_fn,
+        logits=logits)
+
+
+class RNNClassifier(estimator.Estimator):
+  """A classifier for TensorFlow RNN models.
+
+  Trains a recurrent neural network model to classify instances into one of
+  multiple classes.
+
+  Example:
+
+  ```python
+  token_sequence = sequence_categorical_column_with_hash_bucket(...)
+  token_emb = embedding_column(categorical_column=token_sequence, ...)
+
+  estimator = RNNClassifier(
+      num_units=[32, 16], cell_type='lstm',
+      sequence_feature_columns=[token_emb])
+
+  # Input builders
+  def input_fn_train: # returns x, y
+    pass
+  estimator.train(input_fn=input_fn_train, steps=100)
+
+  def input_fn_eval: # returns x, y
+    pass
+  metrics = estimator.evaluate(input_fn=input_fn_eval, steps=10)
+  def input_fn_predict: # returns x, None
+    pass
+  predictions = estimator.predict(input_fn=input_fn_predict)
+  ```
+
+  Input of `train` and `evaluate` should have following features,
+  otherwise there will be a `KeyError`:
+
+  * if `weight_column` is not `None`, a feature with
+    `key=weight_column` whose value is a `Tensor`.
+  * for each `column` in `sequence_feature_columns`:
+    - a feature with `key=column.name` whose `value` is a `SparseTensor`.
+  * for each `column` in `context_feature_columns`:
+    - if `column` is a `_CategoricalColumn`, a feature with `key=column.name`
+      whose `value` is a `SparseTensor`.
+    - if `column` is a `_WeightedCategoricalColumn`, two features: the first
+      with `key` the id column name, the second with `key` the weight column
+      name. Both features' `value` must be a `SparseTensor`.
+    - if `column` is a `_DenseColumn`, a feature with `key=column.name`
+      whose `value` is a `Tensor`.
+
+  Loss is calculated by using softmax cross entropy.
+
+  @compatibility(eager)
+  Estimators are not compatible with eager execution.
+  @end_compatibility
+  """
+
+  def __init__(self,
+               sequence_feature_columns,
+               context_feature_columns=None,
+               num_units=None,
+               cell_type=USE_DEFAULT,
+               rnn_cell_fn=None,
+               model_dir=None,
+               n_classes=2,
+               weight_column=None,
+               label_vocabulary=None,
+               optimizer='Adagrad',
+               input_layer_partitioner=None,
+               config=None):
+    """Initializes a `RNNClassifier` instance.
+
+    Args:
+      sequence_feature_columns: An iterable containing the `FeatureColumn`s
+        that represent sequential input. All items in the set should either be
+        sequence columns (e.g. `sequence_numeric_column`) or constructed from
+        one (e.g. `embedding_column` with `sequence_categorical_column_*` as
+        input).
+      context_feature_columns: An iterable containing the `FeatureColumn`s
+        for contextual input. The data represented by these columns will be
+        replicated and given to the RNN at each timestep. These columns must be
+        instances of classes derived from `_DenseColumn` such as
+        `numeric_column`, not the sequential variants.
+      num_units: Iterable of integer number of hidden units per RNN layer. If
+        set, `cell_type` must also be specified and `rnn_cell_fn` must be
+        `None`.
+      cell_type: A subclass of `tf.nn.rnn_cell.RNNCell` or a string specifying
+        the cell type. Supported strings are: `'basic_rnn'`, `'lstm'`, and
+        `'gru'`. If set, `num_units` must also be specified and `rnn_cell_fn`
+        must be `None`.
+      rnn_cell_fn: A function with one argument, a `tf.estimator.ModeKeys`, and
+        returns an object of type `tf.nn.rnn_cell.RNNCell` that will be used to
+        construct the RNN. If set, `num_units` and `cell_type` cannot be set.
+        This is for advanced users who need additional customization beyond
+        `num_units` and `cell_type`. Note that `tf.nn.rnn_cell.MultiRNNCell` is
+        needed for stacked RNNs.
+      model_dir: Directory to save model parameters, graph and etc. This can
+        also be used to load checkpoints from the directory into a estimator to
+        continue training a previously saved model.
+      n_classes: Number of label classes. Defaults to 2, namely binary
+        classification. Must be > 1.
+      weight_column: A string or a `_NumericColumn` created by
+        `tf.feature_column.numeric_column` defining feature column representing
+        weights. It is used to down weight or boost examples during training. It
+        will be multiplied by the loss of the example. If it is a string, it is
+        used as a key to fetch weight tensor from the `features`. If it is a
+        `_NumericColumn`, raw tensor is fetched by key `weight_column.key`,
+        then weight_column.normalizer_fn is applied on it to get weight tensor.
+      label_vocabulary: A list of strings represents possible label values. If
+        given, labels must be string type and have any value in
+        `label_vocabulary`. If it is not given, that means labels are
+        already encoded as integer or float within [0, 1] for `n_classes=2` and
+        encoded as integer values in {0, 1,..., n_classes-1} for `n_classes`>2 .
+        Also there will be errors if vocabulary is not provided and labels are
+        string.
+      optimizer: An instance of `tf.Optimizer` used to train the model. Defaults
+        to Adagrad optimizer.
+      input_layer_partitioner: Optional. Partitioner for input layer. Defaults
+        to `min_max_variable_partitioner` with `min_slice_size` 64 << 20.
+      config: `RunConfig` object to configure the runtime settings.
+
+    Raises:
+      ValueError: If `num_units`, `cell_type`, and `rnn_cell_fn` are not
+        compatible.
+    """
+    if rnn_cell_fn and (num_units or cell_type != USE_DEFAULT):
+      raise ValueError(
+          'num_units and cell_type must not be specified when using rnn_cell_fn'
+      )
+    if not rnn_cell_fn:
+      if cell_type == USE_DEFAULT:
+        cell_type = 'basic_rnn'
+      rnn_cell_fn = _make_rnn_cell_fn(num_units, cell_type)
+
+    if n_classes == 2:
+      head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(  # pylint: disable=protected-access
+          weight_column=weight_column,
+          label_vocabulary=label_vocabulary)
+    else:
+      head = head_lib._multi_class_head_with_softmax_cross_entropy_loss(  # pylint: disable=protected-access
+          n_classes, weight_column=weight_column,
+          label_vocabulary=label_vocabulary)
+    def _model_fn(features, labels, mode, config):
+      return _rnn_model_fn(
+          features=features,
+          labels=labels,
+          mode=mode,
+          head=head,
+          rnn_cell_fn=rnn_cell_fn,
+          sequence_feature_columns=tuple(sequence_feature_columns or []),
+          context_feature_columns=tuple(context_feature_columns or []),
+          optimizer=optimizer,
+          input_layer_partitioner=input_layer_partitioner,
+          config=config)
+    super(RNNClassifier, self).__init__(
+        model_fn=_model_fn, model_dir=model_dir, config=config)
diff --git a/tensorflow/contrib/estimator/python/estimator/rnn_test.py b/tensorflow/contrib/estimator/python/estimator/rnn_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..393f94f5c7de02c56d93993bbeb8aaec4ea8234c
--- /dev/null
+++ b/tensorflow/contrib/estimator/python/estimator/rnn_test.py
@@ -0,0 +1,1131 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for rnn.py."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import random
+import shutil
+import tempfile
+
+import numpy as np
+import six
+
+from tensorflow.contrib.estimator.python.estimator import rnn
+from tensorflow.contrib.feature_column.python.feature_column import sequence_feature_column as seq_fc
+from tensorflow.core.example import example_pb2
+from tensorflow.core.example import feature_pb2
+from tensorflow.python.estimator import model_fn
+from tensorflow.python.estimator.canned import metric_keys
+from tensorflow.python.estimator.canned import prediction_keys
+from tensorflow.python.estimator.export import export
+from tensorflow.python.estimator.inputs import numpy_io
+from tensorflow.python.feature_column import feature_column as fc
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import parsing_ops
+from tensorflow.python.ops import partitioned_variables
+from tensorflow.python.ops import rnn_cell
+from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops import variables as variables_lib
+from tensorflow.python.platform import gfile
+from tensorflow.python.platform import test
+from tensorflow.python.summary.writer import writer_cache
+from tensorflow.python.training import checkpoint_utils
+from tensorflow.python.training import input as input_lib
+from tensorflow.python.training import monitored_session
+from tensorflow.python.training import optimizer
+from tensorflow.python.training import training_util
+
+
+# Names of variables created by BasicRNNCell model.
+TOKEN_EMBEDDING_NAME = 'rnn/sequence_input_layer/input_layer/tokens_sequential_embedding/embedding_weights'
+CELL_WEIGHTS_NAME = 'rnn/rnn/basic_rnn_cell/kernel'
+CELL_BIAS_NAME = 'rnn/rnn/basic_rnn_cell/bias'
+MULTI_CELL_WEIGHTS_NAME_PATTERN = 'rnn/rnn/multi_rnn_cell/cell_%d/basic_rnn_cell/kernel'
+MULTI_CELL_BIAS_NAME_PATTERN = 'rnn/rnn/multi_rnn_cell/cell_%d/basic_rnn_cell/bias'
+LOGITS_WEIGHTS_NAME = 'rnn/logits/dense/kernel'
+LOGITS_BIAS_NAME = 'rnn/logits/dense/bias'
+
+
+def _assert_close(expected, actual, rtol=1e-04, name='assert_close'):
+  with ops.name_scope(name, 'assert_close', (expected, actual, rtol)) as scope:
+    expected = ops.convert_to_tensor(expected, name='expected')
+    actual = ops.convert_to_tensor(actual, name='actual')
+    rdiff = math_ops.abs(expected - actual, 'diff') / math_ops.abs(expected)
+    rtol = ops.convert_to_tensor(rtol, name='rtol')
+    return check_ops.assert_less(
+        rdiff,
+        rtol,
+        data=('Condition expected =~ actual did not hold element-wise:'
+              'expected = ', expected, 'actual = ', actual, 'rdiff = ', rdiff,
+              'rtol = ', rtol,),
+        name=scope)
+
+
+def create_checkpoint(rnn_weights, rnn_biases, logits_weights, logits_biases,
+                      global_step, model_dir):
+  """Create checkpoint file with provided model weights.
+
+  Args:
+    rnn_weights: Iterable of values of weights for the RNN cell.
+    rnn_biases: Iterable of values of biases for the RNN cell.
+    logits_weights: Iterable of values for matrix connecting RNN output to
+      logits.
+    logits_biases: Iterable of values for logits bias term.
+    global_step: Initial global step to save in checkpoint.
+    model_dir: Directory into which checkpoint is saved.
+  """
+  model_weights = {}
+  model_weights[CELL_WEIGHTS_NAME] = rnn_weights
+  model_weights[CELL_BIAS_NAME] = rnn_biases
+  model_weights[LOGITS_WEIGHTS_NAME] = logits_weights
+  model_weights[LOGITS_BIAS_NAME] = logits_biases
+
+  with ops.Graph().as_default():
+    # Create model variables.
+    for k, v in six.iteritems(model_weights):
+      variables_lib.Variable(v, name=k, dtype=dtypes.float32)
+
+    # Create non-model variables.
+    global_step_var = training_util.create_global_step()
+    assign_op = global_step_var.assign(global_step)
+
+    # Initialize vars and save checkpoint.
+    with monitored_session.MonitoredTrainingSession(
+        checkpoint_dir=model_dir) as sess:
+      sess.run(assign_op)
+
+
+class RNNLogitFnTest(test.TestCase):
+  """Tests correctness of logits calculated from _rnn_logit_fn_builder."""
+
+  def setUp(self):
+    self._model_dir = tempfile.mkdtemp()
+
+  def tearDown(self):
+    if self._model_dir:
+      writer_cache.FileWriterCache.clear()
+      shutil.rmtree(self._model_dir)
+
+  def _test_logits(self, mode, rnn_units, logits_dimension, features_fn,
+                   sequence_feature_columns, context_feature_columns,
+                   expected_logits):
+    """Tests that the expected logits are calculated."""
+    with ops.Graph().as_default():
+      # Global step needed for MonitoredSession, which is in turn used to
+      # explicitly set variable weights through a checkpoint.
+      training_util.create_global_step()
+      # Use a variable scope here with 'rnn', emulating the rnn model_fn, so
+      # the checkpoint naming is shared.
+      with variable_scope.variable_scope('rnn'):
+        input_layer_partitioner = (
+            partitioned_variables.min_max_variable_partitioner(
+                max_partitions=0, min_slice_size=64 << 20))
+        logit_fn = rnn._rnn_logit_fn_builder(
+            output_units=logits_dimension,
+            rnn_cell_fn=rnn._make_rnn_cell_fn(rnn_units),
+            sequence_feature_columns=sequence_feature_columns,
+            context_feature_columns=context_feature_columns,
+            input_layer_partitioner=input_layer_partitioner)
+        # Features are constructed within this function, otherwise the Tensors
+        # containing the features would be defined outside this graph.
+        logits = logit_fn(features=features_fn(), mode=mode)
+        with monitored_session.MonitoredTrainingSession(
+            checkpoint_dir=self._model_dir) as sess:
+          self.assertAllClose(expected_logits, sess.run(logits), atol=1e-4)
+
+  def testOneDimLogits(self):
+    """Tests one-dimensional logits.
+
+    Intermediate values are rounded for ease in reading.
+    input_layer = [[[10]], [[5]]]
+    initial_state = [0, 0]
+    rnn_output_timestep_1 = [[tanh(.1*10 + .2*0 + .3*0 +.2),
+                              tanh(-.2*10 - .3*0 - .4*0 +.5)]]
+                          = [[0.83, -0.91]]
+    rnn_output_timestep_2 = [[tanh(.1*5 + .2*.83 - .3*.91 +.2),
+                              tanh(-.2*5 - .3*.83 + .4*.91 +.5)]]
+                          = [[0.53, -0.37]]
+    logits = [[-1*0.53 - 1*0.37 + 0.3]] = [[-0.6033]]
+    """
+    base_global_step = 100
+    create_checkpoint(
+        rnn_weights=[[.1, -.2], [.2, -.3], [.3, -.4]],
+        rnn_biases=[.2, .5],
+        logits_weights=[[-1.], [1.]],
+        logits_biases=[0.3],
+        global_step=base_global_step,
+        model_dir=self._model_dir)
+
+    def features_fn():
+      return {
+          'price':
+              sparse_tensor.SparseTensor(
+                  values=[10., 5.],
+                  indices=[[0, 0], [0, 1]],
+                  dense_shape=[1, 2]),
+      }
+
+    sequence_feature_columns = [
+        seq_fc.sequence_numeric_column('price', shape=(1,))]
+    context_feature_columns = []
+    for mode in [
+        model_fn.ModeKeys.TRAIN, model_fn.ModeKeys.EVAL,
+        model_fn.ModeKeys.PREDICT
+    ]:
+      self._test_logits(
+          mode,
+          rnn_units=[2],
+          logits_dimension=1,
+          features_fn=features_fn,
+          sequence_feature_columns=sequence_feature_columns,
+          context_feature_columns=context_feature_columns,
+          expected_logits=[[-0.6033]])
+
+  def testMultiDimLogits(self):
+    """Tests multi-dimensional logits.
+
+    Intermediate values are rounded for ease in reading.
+    input_layer = [[[10]], [[5]]]
+    initial_state = [0, 0]
+    rnn_output_timestep_1 = [[tanh(.1*10 + .2*0 + .3*0 +.2),
+                              tanh(-.2*10 - .3*0 - .4*0 +.5)]]
+                          = [[0.83, -0.91]]
+    rnn_output_timestep_2 = [[tanh(.1*5 + .2*.83 - .3*.91 +.2),
+                              tanh(-.2*5 - .3*.83 + .4*.91 +.5)]]
+                          = [[0.53, -0.37]]
+    logits = [[-1*0.53 - 1*0.37 + 0.3],
+              [0.5*0.53 + 0.3*0.37 + 0.4],
+              [0.2*0.53 - 0.1*0.37 + 0.5]
+           = [[-0.6033, 0.7777, 0.5698]]
+    """
+    base_global_step = 100
+    create_checkpoint(
+        rnn_weights=[[.1, -.2], [.2, -.3], [.3, -.4]],
+        rnn_biases=[.2, .5],
+        logits_weights=[[-1., 0.5, 0.2], [1., -0.3, 0.1]],
+        logits_biases=[0.3, 0.4, 0.5],
+        global_step=base_global_step,
+        model_dir=self._model_dir)
+
+    def features_fn():
+      return {
+          'price':
+              sparse_tensor.SparseTensor(
+                  values=[10., 5.],
+                  indices=[[0, 0], [0, 1]],
+                  dense_shape=[1, 2]),
+      }
+
+    sequence_feature_columns = [
+        seq_fc.sequence_numeric_column('price', shape=(1,))]
+    context_feature_columns = []
+
+    for mode in [
+        model_fn.ModeKeys.TRAIN, model_fn.ModeKeys.EVAL,
+        model_fn.ModeKeys.PREDICT
+    ]:
+      self._test_logits(
+          mode,
+          rnn_units=[2],
+          logits_dimension=3,
+          features_fn=features_fn,
+          sequence_feature_columns=sequence_feature_columns,
+          context_feature_columns=context_feature_columns,
+          expected_logits=[[-0.6033, 0.7777, 0.5698]])
+
+  def testMultiExampleMultiDim(self):
+    """Tests multiple examples and multi-dimensional logits.
+
+    Intermediate values are rounded for ease in reading.
+    input_layer = [[[10], [5]], [[2], [7]]]
+    initial_state = [[0, 0], [0, 0]]
+    rnn_output_timestep_1 = [[tanh(.1*10 + .2*0 + .3*0 +.2),
+                              tanh(-.2*10 - .3*0 - .4*0 +.5)],
+                             [tanh(.1*2 + .2*0 + .3*0 +.2),
+                              tanh(-.2*2 - .3*0 - .4*0 +.5)]]
+                          = [[0.83, -0.91], [0.38, 0.10]]
+    rnn_output_timestep_2 = [[tanh(.1*5 + .2*.83 - .3*.91 +.2),
+                              tanh(-.2*5 - .3*.83 + .4*.91 +.5)],
+                             [tanh(.1*7 + .2*.38 + .3*.10 +.2),
+                              tanh(-.2*7 - .3*.38 - .4*.10 +.5)]]
+                          = [[0.53, -0.37], [0.76, -0.78]
+    logits = [[-1*0.53 - 1*0.37 + 0.3,
+               0.5*0.53 + 0.3*0.37 + 0.4,
+               0.2*0.53 - 0.1*0.37 + 0.5],
+              [-1*0.76 - 1*0.78 + 0.3,
+               0.5*0.76 +0.3*0.78 + 0.4,
+               0.2*0.76 -0.1*0.78 + 0.5]]
+           = [[-0.6033, 0.7777, 0.5698], [-1.2473, 1.0170, 0.5745]]
+    """
+    base_global_step = 100
+    create_checkpoint(
+        rnn_weights=[[.1, -.2], [.2, -.3], [.3, -.4]],
+        rnn_biases=[.2, .5],
+        logits_weights=[[-1., 0.5, 0.2], [1., -0.3, 0.1]],
+        logits_biases=[0.3, 0.4, 0.5],
+        global_step=base_global_step,
+        model_dir=self._model_dir)
+
+    def features_fn():
+      return {
+          'price':
+              sparse_tensor.SparseTensor(
+                  values=[10., 5., 2., 7.],
+                  indices=[[0, 0], [0, 1], [1, 0], [1, 1]],
+                  dense_shape=[2, 2]),
+      }
+
+    sequence_feature_columns = [
+        seq_fc.sequence_numeric_column('price', shape=(1,))
+    ]
+    context_feature_columns = []
+
+    for mode in [
+        model_fn.ModeKeys.TRAIN, model_fn.ModeKeys.EVAL,
+        model_fn.ModeKeys.PREDICT
+    ]:
+      self._test_logits(
+          mode,
+          rnn_units=[2],
+          logits_dimension=3,
+          features_fn=features_fn,
+          sequence_feature_columns=sequence_feature_columns,
+          context_feature_columns=context_feature_columns,
+          expected_logits=[[-0.6033, 0.7777, 0.5698],
+                           [-1.2473, 1.0170, 0.5745]])
+
+  def testMultiExamplesDifferentLength(self):
+    """Tests multiple examples with different lengths.
+
+    Intermediate values are rounded for ease in reading.
+    input_layer = [[[10], [5]], [[2], [0]]]
+    initial_state = [[0, 0], [0, 0]]
+    rnn_output_timestep_1 = [[tanh(.1*10 + .2*0 + .3*0 +.2),
+                              tanh(-.2*10 - .3*0 - .4*0 +.5)],
+                             [tanh(.1*2 + .2*0 + .3*0 +.2),
+                              tanh(-.2*2 - .3*0 - .4*0 +.5)]]
+                          = [[0.83, -0.91], [0.38, 0.10]]
+    rnn_output_timestep_2 = [[tanh(.1*5 + .2*.83 - .3*.91 +.2),
+                              tanh(-.2*5 - .3*.83 + .4*.91 +.5)],
+                             [<ignored-padding>]]
+                          = [[0.53, -0.37], [<ignored-padding>]]
+    logits = [[-1*0.53 - 1*0.37 + 0.3],
+              [-1*0.38 + 1*0.10 + 0.3]]
+           = [[-0.6033], [0.0197]]
+    """
+    base_global_step = 100
+    create_checkpoint(
+        rnn_weights=[[.1, -.2], [.2, -.3], [.3, -.4]],
+        rnn_biases=[.2, .5],
+        logits_weights=[[-1.], [1.]],
+        logits_biases=[0.3],
+        global_step=base_global_step,
+        model_dir=self._model_dir)
+
+    def features_fn():
+      return {
+          'price':
+              sparse_tensor.SparseTensor(
+                  values=[10., 5., 2.],
+                  indices=[[0, 0], [0, 1], [1, 0]],
+                  dense_shape=[2, 2]),
+      }
+
+    sequence_feature_columns = [
+        seq_fc.sequence_numeric_column('price', shape=(1,))]
+    context_feature_columns = []
+
+    for mode in [
+        model_fn.ModeKeys.TRAIN, model_fn.ModeKeys.EVAL,
+        model_fn.ModeKeys.PREDICT
+    ]:
+      self._test_logits(
+          mode,
+          rnn_units=[2],
+          logits_dimension=1,
+          features_fn=features_fn,
+          sequence_feature_columns=sequence_feature_columns,
+          context_feature_columns=context_feature_columns,
+          expected_logits=[[-0.6033], [0.0197]])
+
+  def testMultiExamplesWithContext(self):
+    """Tests multiple examples with context features.
+
+    Intermediate values are rounded for ease in reading.
+    input_layer = [[[10, -0.5], [5, -0.5]], [[2, 0.8], [0, 0]]]
+    initial_state = [[0, 0], [0, 0]]
+    rnn_output_timestep_1 = [[tanh(.1*10 - 1*.5 + .2*0 + .3*0 +.2),
+                              tanh(-.2*10 - 0.9*.5 - .3*0 - .4*0 +.5)],
+                             [tanh(.1*2 + 1*.8 + .2*0 + .3*0 +.2),
+                              tanh(-.2*2 + .9*.8 - .3*0 - .4*0 +.5)]]
+                          = [[0.60, -0.96], [0.83, 0.68]]
+    rnn_output_timestep_2 = [[tanh(.1*5 - 1*.5 + .2*.60 - .3*.96 +.2),
+                              tanh(-.2*5 - .9*.5 - .3*.60 + .4*.96 +.5)],
+                             [<ignored-padding>]]
+                          = [[0.03, -0.63], [<ignored-padding>]]
+    logits = [[-1*0.03 - 1*0.63 + 0.3],
+              [-1*0.83 + 1*0.68 + 0.3]]
+           = [[-0.3662], [0.1414]]
+    """
+    base_global_step = 100
+    create_checkpoint(
+        # Context features weights are inserted between input and state weights.
+        rnn_weights=[[.1, -.2], [1., 0.9], [.2, -.3], [.3, -.4]],
+        rnn_biases=[.2, .5],
+        logits_weights=[[-1.], [1.]],
+        logits_biases=[0.3],
+        global_step=base_global_step,
+        model_dir=self._model_dir)
+
+    def features_fn():
+      return {
+          'price':
+              sparse_tensor.SparseTensor(
+                  values=[10., 5., 2.],
+                  indices=[[0, 0], [0, 1], [1, 0]],
+                  dense_shape=[2, 2]),
+          'context': [[-0.5], [0.8]],
+      }
+
+    sequence_feature_columns = [
+        seq_fc.sequence_numeric_column('price', shape=(1,))]
+    context_feature_columns = [fc.numeric_column('context', shape=(1,))]
+
+    for mode in [
+        model_fn.ModeKeys.TRAIN, model_fn.ModeKeys.EVAL,
+        model_fn.ModeKeys.PREDICT
+    ]:
+      self._test_logits(
+          mode,
+          rnn_units=[2],
+          logits_dimension=1,
+          features_fn=features_fn,
+          sequence_feature_columns=sequence_feature_columns,
+          context_feature_columns=context_feature_columns,
+          expected_logits=[[-0.3662], [0.1414]])
+
+  def testMultiExamplesMultiFeatures(self):
+    """Tests examples with multiple sequential feature columns.
+
+    Intermediate values are rounded for ease in reading.
+    input_layer = [[[1, 0, 10], [0, 1, 5]], [[1, 0, 2], [0, 0, 0]]]
+    initial_state = [[0, 0], [0, 0]]
+    rnn_output_timestep_1 = [[tanh(.5*1 + 1*0 + .1*10 + .2*0 + .3*0 +.2),
+                              tanh(-.5*1 - 1*0 - .2*10 - .3*0 - .4*0 +.5)],
+                             [tanh(.5*1 + 1*0 + .1*2 + .2*0 + .3*0 +.2),
+                              tanh(-.5*1 - 1*0 - .2*2 - .3*0 - .4*0 +.5)]]
+                          = [[0.94, -0.96], [0.72, -0.38]]
+    rnn_output_timestep_2 = [[tanh(.5*0 + 1*1 + .1*5 + .2*.94 - .3*.96 +.2),
+                              tanh(-.5*0 - 1*1 - .2*5 - .3*.94 + .4*.96 +.5)],
+                             [<ignored-padding>]]
+                          = [[0.92, -0.88], [<ignored-padding>]]
+    logits = [[-1*0.92 - 1*0.88 + 0.3],
+              [-1*0.72 - 1*0.38 + 0.3]]
+           = [[-1.5056], [-0.7962]]
+    """
+    base_global_step = 100
+    create_checkpoint(
+        # FeatureColumns are sorted alphabetically, so on_sale weights are
+        # inserted before price.
+        rnn_weights=[[.5, -.5], [1., -1.], [.1, -.2], [.2, -.3], [.3, -.4]],
+        rnn_biases=[.2, .5],
+        logits_weights=[[-1.], [1.]],
+        logits_biases=[0.3],
+        global_step=base_global_step,
+        model_dir=self._model_dir)
+
+    def features_fn():
+      return {
+          'price':
+              sparse_tensor.SparseTensor(
+                  values=[10., 5., 2.],
+                  indices=[[0, 0], [0, 1], [1, 0]],
+                  dense_shape=[2, 2]),
+          'on_sale':
+              sparse_tensor.SparseTensor(
+                  values=[0, 1, 0],
+                  indices=[[0, 0], [0, 1], [1, 0]],
+                  dense_shape=[2, 2]),
+      }
+
+    price_column = seq_fc.sequence_numeric_column('price', shape=(1,))
+    on_sale_column = fc.indicator_column(
+        seq_fc.sequence_categorical_column_with_identity(
+            'on_sale', num_buckets=2))
+    sequence_feature_columns = [price_column, on_sale_column]
+    context_feature_columns = []
+
+    for mode in [
+        model_fn.ModeKeys.TRAIN, model_fn.ModeKeys.EVAL,
+        model_fn.ModeKeys.PREDICT
+    ]:
+      self._test_logits(
+          mode,
+          rnn_units=[2],
+          logits_dimension=1,
+          features_fn=features_fn,
+          sequence_feature_columns=sequence_feature_columns,
+          context_feature_columns=context_feature_columns,
+          expected_logits=[[-1.5056], [-0.7962]])
+
+
+class RNNClassifierTrainingTest(test.TestCase):
+
+  def setUp(self):
+    self._model_dir = tempfile.mkdtemp()
+
+  def tearDown(self):
+    if self._model_dir:
+      writer_cache.FileWriterCache.clear()
+      shutil.rmtree(self._model_dir)
+
+  def _assert_checkpoint(
+      self, n_classes, input_units, cell_units, expected_global_step):
+
+    shapes = {
+        name: shape for (name, shape) in
+        checkpoint_utils.list_variables(self._model_dir)
+    }
+
+    self.assertEqual([], shapes[ops.GraphKeys.GLOBAL_STEP])
+    self.assertEqual(
+        expected_global_step,
+        checkpoint_utils.load_variable(
+            self._model_dir, ops.GraphKeys.GLOBAL_STEP))
+
+    # RNN Cell variables.
+    if len(cell_units) > 1:
+      for i, cell_unit in enumerate(cell_units):
+        self.assertEqual([input_units + cell_unit, cell_unit],
+                         shapes[MULTI_CELL_WEIGHTS_NAME_PATTERN % i])
+        self.assertEqual([cell_unit],
+                         shapes[MULTI_CELL_BIAS_NAME_PATTERN % i])
+        input_units = cell_unit
+    elif len(cell_units) == 1:
+      self.assertEqual([input_units + cell_unit, cell_unit],
+                       shapes[CELL_WEIGHTS_NAME])
+      self.assertEqual([cell_unit], shapes[CELL_BIAS_NAME])
+
+    # Logits variables.
+    logits_dimension = n_classes if n_classes > 2 else 1
+    self.assertEqual([cell_units[-1], logits_dimension],
+                     shapes[LOGITS_WEIGHTS_NAME])
+    self.assertEqual([logits_dimension], shapes[LOGITS_BIAS_NAME])
+
+  def _mock_optimizer(self, expected_loss=None):
+    expected_var_names = [
+        '%s/part_0:0' % CELL_BIAS_NAME,
+        '%s/part_0:0' % CELL_WEIGHTS_NAME,
+        '%s/part_0:0' % LOGITS_BIAS_NAME,
+        '%s/part_0:0' % LOGITS_WEIGHTS_NAME,
+    ]
+
+    def _minimize(loss, global_step):
+      trainable_vars = ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
+      self.assertItemsEqual(
+          expected_var_names,
+          [var.name for var in trainable_vars])
+
+      # Verify loss. We can't check the value directly, so we add an assert op.
+      self.assertEquals(0, loss.shape.ndims)
+      if expected_loss is None:
+        return state_ops.assign_add(global_step, 1).op
+      assert_loss = _assert_close(
+          math_ops.to_float(expected_loss, name='expected'),
+          loss,
+          name='assert_loss')
+      with ops.control_dependencies((assert_loss,)):
+        return state_ops.assign_add(global_step, 1).op
+
+    mock_optimizer = test.mock.NonCallableMock(
+        spec=optimizer.Optimizer,
+        wraps=optimizer.Optimizer(use_locking=False, name='my_optimizer'))
+    mock_optimizer.minimize = test.mock.MagicMock(wraps=_minimize)
+
+    # NOTE: Estimator.params performs a deepcopy, which wreaks havoc with mocks.
+    # So, return mock_optimizer itself for deepcopy.
+    mock_optimizer.__deepcopy__ = lambda _: mock_optimizer
+    return mock_optimizer
+
+  def testConflictingRNNCellFn(self):
+    col = seq_fc.sequence_categorical_column_with_hash_bucket(
+        'tokens', hash_bucket_size=10)
+    embed = fc.embedding_column(col, dimension=2)
+    cell_units = [4, 2]
+
+    with self.assertRaisesRegexp(
+        ValueError,
+        'num_units and cell_type must not be specified when using rnn_cell_fn'):
+      rnn.RNNClassifier(
+          sequence_feature_columns=[embed],
+          rnn_cell_fn=lambda x: x,
+          num_units=cell_units)
+
+    with self.assertRaisesRegexp(
+        ValueError,
+        'num_units and cell_type must not be specified when using rnn_cell_fn'):
+      rnn.RNNClassifier(
+          sequence_feature_columns=[embed],
+          rnn_cell_fn=lambda x: x,
+          cell_type='lstm')
+
+  def _testFromScratchWithDefaultOptimizer(self, n_classes):
+    def train_input_fn():
+      return {
+          'tokens':
+              sparse_tensor.SparseTensor(
+                  values=['the', 'cat', 'sat'],
+                  indices=[[0, 0], [0, 1], [0, 2]],
+                  dense_shape=[1, 3]),
+      }, [[1]]
+
+    col = seq_fc.sequence_categorical_column_with_hash_bucket(
+        'tokens', hash_bucket_size=10)
+    embed = fc.embedding_column(col, dimension=2)
+    input_units = 2
+
+    cell_units = [4, 2]
+    est = rnn.RNNClassifier(
+        sequence_feature_columns=[embed],
+        num_units=cell_units,
+        n_classes=n_classes,
+        model_dir=self._model_dir)
+
+    # Train for a few steps, and validate final checkpoint.
+    num_steps = 10
+    est.train(input_fn=train_input_fn, steps=num_steps)
+    self._assert_checkpoint(n_classes, input_units, cell_units, num_steps)
+
+  def testBinaryClassFromScratchWithDefaultOptimizer(self):
+    self._testFromScratchWithDefaultOptimizer(n_classes=2)
+
+  def testMultiClassFromScratchWithDefaultOptimizer(self):
+    self._testFromScratchWithDefaultOptimizer(n_classes=4)
+
+  def testFromScratchWithCustomRNNCellFn(self):
+    def train_input_fn():
+      return {
+          'tokens':
+              sparse_tensor.SparseTensor(
+                  values=['the', 'cat', 'sat'],
+                  indices=[[0, 0], [0, 1], [0, 2]],
+                  dense_shape=[1, 3]),
+      }, [[1]]
+
+    col = seq_fc.sequence_categorical_column_with_hash_bucket(
+        'tokens', hash_bucket_size=10)
+    embed = fc.embedding_column(col, dimension=2)
+    input_units = 2
+    cell_units = [4, 2]
+    n_classes = 2
+
+    def rnn_cell_fn(mode):
+      del mode  # unused
+      cells = [rnn_cell.BasicRNNCell(num_units=n) for n in cell_units]
+      return rnn_cell.MultiRNNCell(cells)
+
+    est = rnn.RNNClassifier(
+        sequence_feature_columns=[embed],
+        rnn_cell_fn=rnn_cell_fn,
+        n_classes=n_classes,
+        model_dir=self._model_dir)
+
+    # Train for a few steps, and validate final checkpoint.
+    num_steps = 10
+    est.train(input_fn=train_input_fn, steps=num_steps)
+    self._assert_checkpoint(n_classes, input_units, cell_units, num_steps)
+
+  def _testExampleWeight(self, n_classes):
+    def train_input_fn():
+      return {
+          'tokens':
+              sparse_tensor.SparseTensor(
+                  values=['the', 'cat', 'sat', 'dog', 'barked'],
+                  indices=[[0, 0], [0, 1], [0, 2], [1, 0], [1, 1]],
+                  dense_shape=[2, 3]),
+          'w': [[1], [2]],
+      }, [[1], [0]]
+
+    col = seq_fc.sequence_categorical_column_with_hash_bucket(
+        'tokens', hash_bucket_size=10)
+    embed = fc.embedding_column(col, dimension=2)
+    input_units = 2
+
+    cell_units = [4, 2]
+    est = rnn.RNNClassifier(
+        num_units=cell_units,
+        sequence_feature_columns=[embed],
+        n_classes=n_classes,
+        weight_column='w',
+        model_dir=self._model_dir)
+
+    # Train for a few steps, and validate final checkpoint.
+    num_steps = 10
+    est.train(input_fn=train_input_fn, steps=num_steps)
+    self._assert_checkpoint(n_classes, input_units, cell_units, num_steps)
+
+  def testBinaryClassWithExampleWeight(self):
+    self._testExampleWeight(n_classes=2)
+
+  def testMultiClassWithExampleWeight(self):
+    self._testExampleWeight(n_classes=4)
+
+  def testBinaryClassFromCheckpoint(self):
+    initial_global_step = 100
+    create_checkpoint(
+        rnn_weights=[[.1, -.2], [.2, -.3], [.3, -.4]],
+        rnn_biases=[.2, .5],
+        logits_weights=[[-1.], [1.]],
+        logits_biases=[0.3],
+        global_step=initial_global_step,
+        model_dir=self._model_dir)
+
+    def train_input_fn():
+      return {
+          'price':
+              sparse_tensor.SparseTensor(
+                  values=[10., 5., 2.],
+                  indices=[[0, 0], [0, 1], [1, 0]],
+                  dense_shape=[2, 2]),
+      }, [[0], [1]]
+
+    # Uses same checkpoint and examples as testBinaryClassEvaluationMetrics.
+    # See that test for loss calculation.
+    mock_optimizer = self._mock_optimizer(expected_loss=1.119661)
+
+    sequence_feature_columns = [
+        seq_fc.sequence_numeric_column('price', shape=(1,))]
+    est = rnn.RNNClassifier(
+        num_units=[2],
+        sequence_feature_columns=sequence_feature_columns,
+        n_classes=2,
+        optimizer=mock_optimizer,
+        model_dir=self._model_dir)
+    self.assertEqual(0, mock_optimizer.minimize.call_count)
+    est.train(input_fn=train_input_fn, steps=10)
+    self.assertEqual(1, mock_optimizer.minimize.call_count)
+
+  def testMultiClassFromCheckpoint(self):
+    initial_global_step = 100
+    create_checkpoint(
+        rnn_weights=[[.1, -.2], [.2, -.3], [.3, -.4]],
+        rnn_biases=[.2, .5],
+        logits_weights=[[-1., 0.5, 0.2], [1., -0.3, 0.1]],
+        logits_biases=[0.3, 0.4, 0.5],
+        global_step=initial_global_step,
+        model_dir=self._model_dir)
+
+    def train_input_fn():
+      return {
+          'price':
+              sparse_tensor.SparseTensor(
+                  values=[10., 5., 2., 7.],
+                  indices=[[0, 0], [0, 1], [1, 0], [1, 1]],
+                  dense_shape=[2, 2]),
+      }, [[0], [1]]
+
+    # Uses same checkpoint and examples as testMultiClassEvaluationMetrics.
+    # See that test for loss calculation.
+    mock_optimizer = self._mock_optimizer(expected_loss=2.662932)
+
+    sequence_feature_columns = [
+        seq_fc.sequence_numeric_column('price', shape=(1,))]
+    est = rnn.RNNClassifier(
+        num_units=[2],
+        sequence_feature_columns=sequence_feature_columns,
+        n_classes=3,
+        optimizer=mock_optimizer,
+        model_dir=self._model_dir)
+    self.assertEqual(0, mock_optimizer.minimize.call_count)
+    est.train(input_fn=train_input_fn, steps=10)
+    self.assertEqual(1, mock_optimizer.minimize.call_count)
+
+
+def sorted_key_dict(unsorted_dict):
+  return {k: unsorted_dict[k] for k in sorted(unsorted_dict)}
+
+
+class RNNClassifierEvaluationTest(test.TestCase):
+
+  def setUp(self):
+    self._model_dir = tempfile.mkdtemp()
+
+  def tearDown(self):
+    if self._model_dir:
+      writer_cache.FileWriterCache.clear()
+      shutil.rmtree(self._model_dir)
+
+  def testBinaryClassEvaluationMetrics(self):
+    global_step = 100
+    create_checkpoint(
+        rnn_weights=[[.1, -.2], [.2, -.3], [.3, -.4]],
+        rnn_biases=[.2, .5],
+        logits_weights=[[-1.], [1.]],
+        logits_biases=[0.3],
+        global_step=global_step,
+        model_dir=self._model_dir)
+
+    def eval_input_fn():
+      return {
+          'price':
+              sparse_tensor.SparseTensor(
+                  values=[10., 5., 2.],
+                  indices=[[0, 0], [0, 1], [1, 0]],
+                  dense_shape=[2, 2]),
+      }, [[0], [1]]
+
+    sequence_feature_columns = [
+        seq_fc.sequence_numeric_column('price', shape=(1,))]
+
+    est = rnn.RNNClassifier(
+        num_units=[2],
+        sequence_feature_columns=sequence_feature_columns,
+        n_classes=2,
+        model_dir=self._model_dir)
+    eval_metrics = est.evaluate(eval_input_fn, steps=1)
+
+    # Uses identical numbers to testMultiExamplesWithDifferentLength.
+    # See that test for logits calculation.
+    # logits = [[-0.603282], [0.019719]]
+    # probability = exp(logits) / (1 + exp(logits)) = [[0.353593], [0.504930]]
+    # loss = -label * ln(p) - (1 - label) * ln(1 - p)
+    #      = [[0.436326], [0.683335]]
+    expected_metrics = {
+        ops.GraphKeys.GLOBAL_STEP: global_step,
+        metric_keys.MetricKeys.LOSS: 1.119661,
+        metric_keys.MetricKeys.LOSS_MEAN: 0.559831,
+        metric_keys.MetricKeys.ACCURACY: 1.0,
+        metric_keys.MetricKeys.PREDICTION_MEAN: 0.429262,
+        metric_keys.MetricKeys.LABEL_MEAN: 0.5,
+        metric_keys.MetricKeys.ACCURACY_BASELINE: 0.5,
+        # With default threshold of 0.5, the model is a perfect classifier.
+        metric_keys.MetricKeys.RECALL: 1.0,
+        metric_keys.MetricKeys.PRECISION: 1.0,
+        # Positive example is scored above negative, so AUC = 1.0.
+        metric_keys.MetricKeys.AUC: 1.0,
+        metric_keys.MetricKeys.AUC_PR: 1.0,
+    }
+    self.assertAllClose(
+        sorted_key_dict(expected_metrics), sorted_key_dict(eval_metrics))
+
+  def testMultiClassEvaluationMetrics(self):
+    global_step = 100
+    create_checkpoint(
+        rnn_weights=[[.1, -.2], [.2, -.3], [.3, -.4]],
+        rnn_biases=[.2, .5],
+        logits_weights=[[-1., 0.5, 0.2], [1., -0.3, 0.1]],
+        logits_biases=[0.3, 0.4, 0.5],
+        global_step=global_step,
+        model_dir=self._model_dir)
+
+    def eval_input_fn():
+      return {
+          'price':
+              sparse_tensor.SparseTensor(
+                  values=[10., 5., 2., 7.],
+                  indices=[[0, 0], [0, 1], [1, 0], [1, 1]],
+                  dense_shape=[2, 2]),
+      }, [[0], [1]]
+
+    sequence_feature_columns = [
+        seq_fc.sequence_numeric_column('price', shape=(1,))]
+
+    est = rnn.RNNClassifier(
+        num_units=[2],
+        sequence_feature_columns=sequence_feature_columns,
+        n_classes=3,
+        model_dir=self._model_dir)
+    eval_metrics = est.evaluate(eval_input_fn, steps=1)
+
+    # Uses identical numbers to testMultiExampleMultiDim.
+    # See that test for logits calculation.
+    # logits = [[-0.603282, 0.777708, 0.569756],
+    #           [-1.247356, 1.017018, 0.574481]]
+    # logits_exp = exp(logits) / (1 + exp(logits))
+    #            = [[0.547013, 2.176468, 1.767836],
+    #               [0.287263, 2.764937, 1.776208]]
+    # softmax_probabilities = logits_exp / logits_exp.sum()
+    #                       = [[0.121793, 0.484596, 0.393611],
+    #                          [0.059494, 0.572639, 0.367866]]
+    # loss = -1. * log(softmax[label])
+    #      = [[2.105432], [0.557500]]
+    expected_metrics = {
+        ops.GraphKeys.GLOBAL_STEP: global_step,
+        metric_keys.MetricKeys.LOSS: 2.662932,
+        metric_keys.MetricKeys.LOSS_MEAN: 1.331466,
+        metric_keys.MetricKeys.ACCURACY: 0.5,
+    }
+
+    self.assertAllClose(
+        sorted_key_dict(expected_metrics), sorted_key_dict(eval_metrics))
+
+
+class RNNClassifierPredictionTest(test.TestCase):
+
+  def setUp(self):
+    self._model_dir = tempfile.mkdtemp()
+
+  def tearDown(self):
+    if self._model_dir:
+      writer_cache.FileWriterCache.clear()
+      shutil.rmtree(self._model_dir)
+
+  def testBinaryClassPredictions(self):
+    create_checkpoint(
+        rnn_weights=[[.1, -.2], [.2, -.3], [.3, -.4]],
+        rnn_biases=[.2, .5],
+        logits_weights=[[-1.], [1.]],
+        logits_biases=[0.3],
+        global_step=0,
+        model_dir=self._model_dir)
+
+    def predict_input_fn():
+      return {
+          'price':
+              sparse_tensor.SparseTensor(
+                  values=[10., 5.],
+                  indices=[[0, 0], [0, 1]],
+                  dense_shape=[1, 2]),
+      }
+
+    sequence_feature_columns = [
+        seq_fc.sequence_numeric_column('price', shape=(1,))]
+    label_vocabulary = ['class_0', 'class_1']
+
+    est = rnn.RNNClassifier(
+        num_units=[2],
+        sequence_feature_columns=sequence_feature_columns,
+        n_classes=2,
+        label_vocabulary=label_vocabulary,
+        model_dir=self._model_dir)
+    # Uses identical numbers to testOneDimLogits.
+    # See that test for logits calculation.
+    # logits = [-0.603282]
+    # logistic = exp(-0.6033) / (1 + exp(-0.6033)) = [0.353593]
+    # probabilities = [0.646407, 0.353593]
+    # class_ids = argmax(probabilities) = [0]
+    predictions = next(est.predict(predict_input_fn))
+    self.assertAllClose([-0.603282],
+                        predictions[prediction_keys.PredictionKeys.LOGITS])
+    self.assertAllClose([0.353593],
+                        predictions[prediction_keys.PredictionKeys.LOGISTIC])
+    self.assertAllClose(
+        [0.646407, 0.353593],
+        predictions[prediction_keys.PredictionKeys.PROBABILITIES])
+    self.assertAllClose([0],
+                        predictions[prediction_keys.PredictionKeys.CLASS_IDS])
+    self.assertEqual([b'class_0'],
+                     predictions[prediction_keys.PredictionKeys.CLASSES])
+
+  def testMultiClassPredictions(self):
+    create_checkpoint(
+        rnn_weights=[[.1, -.2], [.2, -.3], [.3, -.4]],
+        rnn_biases=[.2, .5],
+        logits_weights=[[-1., 0.5, 0.2], [1., -0.3, 0.1]],
+        logits_biases=[0.3, 0.4, 0.5],
+        global_step=0,
+        model_dir=self._model_dir)
+
+    def predict_input_fn():
+      return {
+          'price':
+              sparse_tensor.SparseTensor(
+                  values=[10., 5.],
+                  indices=[[0, 0], [0, 1]],
+                  dense_shape=[1, 2]),
+      }
+
+    sequence_feature_columns = [
+        seq_fc.sequence_numeric_column('price', shape=(1,))]
+    label_vocabulary = ['class_0', 'class_1', 'class_2']
+
+    est = rnn.RNNClassifier(
+        num_units=[2],
+        sequence_feature_columns=sequence_feature_columns,
+        n_classes=3,
+        label_vocabulary=label_vocabulary,
+        model_dir=self._model_dir)
+    # Uses identical numbers to testMultiDimLogits.
+    # See that test for logits calculation.
+    # logits = [-0.603282, 0.777708, 0.569756]
+    # logits_exp = exp(logits) = [0.547013, 2.176468, 1.767836]
+    # softmax_probabilities = logits_exp / logits_exp.sum()
+    #                       = [0.121793, 0.484596, 0.393611]
+    # class_ids = argmax(probabilities) = [1]
+    predictions = next(est.predict(predict_input_fn))
+    self.assertAllClose([-0.603282, 0.777708, 0.569756],
+                        predictions[prediction_keys.PredictionKeys.LOGITS])
+    self.assertAllClose(
+        [0.121793, 0.484596, 0.393611],
+        predictions[prediction_keys.PredictionKeys.PROBABILITIES])
+    self.assertAllClose([1],
+                        predictions[prediction_keys.PredictionKeys.CLASS_IDS])
+    self.assertEqual([b'class_1'],
+                     predictions[prediction_keys.PredictionKeys.CLASSES])
+
+
+class RNNClassifierIntegrationTest(test.TestCase):
+
+  def setUp(self):
+    self._model_dir = tempfile.mkdtemp()
+
+  def tearDown(self):
+    if self._model_dir:
+      writer_cache.FileWriterCache.clear()
+      shutil.rmtree(self._model_dir)
+
+  def _test_complete_flow(
+      self, train_input_fn, eval_input_fn, predict_input_fn, n_classes,
+      batch_size):
+    col = seq_fc.sequence_categorical_column_with_hash_bucket(
+        'tokens', hash_bucket_size=10)
+    embed = fc.embedding_column(col, dimension=2)
+    feature_columns = [embed]
+
+    cell_units = [4, 2]
+    est = rnn.RNNClassifier(
+        num_units=cell_units,
+        sequence_feature_columns=feature_columns,
+        n_classes=n_classes,
+        model_dir=self._model_dir)
+
+    # TRAIN
+    num_steps = 10
+    est.train(train_input_fn, steps=num_steps)
+
+    # EVALUATE
+    scores = est.evaluate(eval_input_fn)
+    self.assertEqual(num_steps, scores[ops.GraphKeys.GLOBAL_STEP])
+    self.assertIn('loss', six.iterkeys(scores))
+
+    # PREDICT
+    predicted_proba = np.array([
+        x[prediction_keys.PredictionKeys.PROBABILITIES]
+        for x in est.predict(predict_input_fn)
+    ])
+    self.assertAllEqual((batch_size, n_classes), predicted_proba.shape)
+
+    # EXPORT
+    feature_spec = {
+        'tokens': parsing_ops.VarLenFeature(dtypes.string),
+        'label': parsing_ops.FixedLenFeature([1], dtypes.int64),
+    }
+    serving_input_receiver_fn = export.build_parsing_serving_input_receiver_fn(
+        feature_spec)
+    export_dir = est.export_savedmodel(tempfile.mkdtemp(),
+                                       serving_input_receiver_fn)
+    self.assertTrue(gfile.Exists(export_dir))
+
+  def testNumpyInputFn(self):
+    """Tests complete flow with numpy_input_fn."""
+    n_classes = 3
+    batch_size = 10
+    words = ['dog', 'cat', 'bird', 'the', 'a', 'sat', 'flew', 'slept']
+    # Numpy only supports dense input, so all examples will have same length.
+    # TODO(b/73160931): Update test when support for prepadded data exists.
+    sequence_length = 3
+
+    features = []
+    for _ in range(batch_size):
+      sentence = random.sample(words, sequence_length)
+      features.append(sentence)
+
+    x_data = np.array(features)
+    y_data = np.random.randint(n_classes, size=batch_size)
+
+    train_input_fn = numpy_io.numpy_input_fn(
+        x={'tokens': x_data},
+        y=y_data,
+        batch_size=batch_size,
+        num_epochs=None,
+        shuffle=True)
+    eval_input_fn = numpy_io.numpy_input_fn(
+        x={'tokens': x_data},
+        y=y_data,
+        batch_size=batch_size,
+        shuffle=False)
+    predict_input_fn = numpy_io.numpy_input_fn(
+        x={'tokens': x_data},
+        batch_size=batch_size,
+        shuffle=False)
+
+    self._test_complete_flow(
+        train_input_fn=train_input_fn,
+        eval_input_fn=eval_input_fn,
+        predict_input_fn=predict_input_fn,
+        n_classes=n_classes,
+        batch_size=batch_size)
+
+  def testParseExampleInputFn(self):
+    """Tests complete flow with input_fn constructed from parse_example."""
+    n_classes = 3
+    batch_size = 10
+    words = [b'dog', b'cat', b'bird', b'the', b'a', b'sat', b'flew', b'slept']
+
+    serialized_examples = []
+    for _ in range(batch_size):
+      sequence_length = random.randint(1, len(words))
+      sentence = random.sample(words, sequence_length)
+      label = random.randint(0, n_classes - 1)
+      example = example_pb2.Example(features=feature_pb2.Features(
+          feature={
+              'tokens':
+                  feature_pb2.Feature(bytes_list=feature_pb2.BytesList(
+                      value=sentence)),
+              'label':
+                  feature_pb2.Feature(int64_list=feature_pb2.Int64List(
+                      value=[label])),
+          }))
+      serialized_examples.append(example.SerializeToString())
+
+    feature_spec = {
+        'tokens': parsing_ops.VarLenFeature(dtypes.string),
+        'label': parsing_ops.FixedLenFeature([1], dtypes.int64),
+    }
+    def _train_input_fn():
+      features = parsing_ops.parse_example(serialized_examples, feature_spec)
+      labels = features.pop('label')
+      return features, labels
+    def _eval_input_fn():
+      features = parsing_ops.parse_example(
+          input_lib.limit_epochs(serialized_examples, num_epochs=1),
+          feature_spec)
+      labels = features.pop('label')
+      return features, labels
+    def _predict_input_fn():
+      features = parsing_ops.parse_example(
+          input_lib.limit_epochs(serialized_examples, num_epochs=1),
+          feature_spec)
+      features.pop('label')
+      return features, None
+
+    self._test_complete_flow(
+        train_input_fn=_train_input_fn,
+        eval_input_fn=_eval_input_fn,
+        predict_input_fn=_predict_input_fn,
+        n_classes=n_classes,
+        batch_size=batch_size)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/factorization/BUILD b/tensorflow/contrib/factorization/BUILD
index ad8568ad44ea84f96b97e98567a276c70520d53d..0a648d5d40e431bedb42017b15cabe078ac22fa7 100644
--- a/tensorflow/contrib/factorization/BUILD
+++ b/tensorflow/contrib/factorization/BUILD
@@ -347,16 +347,3 @@ cuda_py_test(
     ],
     main = "python/kernel_tests/masked_matmul_benchmark.py",
 )
-
-# All files
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/factorization/examples/BUILD b/tensorflow/contrib/factorization/examples/BUILD
index bbe842bd5ccc7357805adda1df42ba8799fcd8f2..363baa121ab3854a802ca3606e35597d31b35a57 100644
--- a/tensorflow/contrib/factorization/examples/BUILD
+++ b/tensorflow/contrib/factorization/examples/BUILD
@@ -21,14 +21,3 @@ tf_py_test(
     ],
     tags = ["notsan"],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-)
diff --git a/tensorflow/contrib/factorization/kernels/BUILD b/tensorflow/contrib/factorization/kernels/BUILD
index 44eab56011dad2f6fbe843b3569b4acc5c5e542a..ea8b9a17a27093cb57564861815edd6ecb18a014 100644
--- a/tensorflow/contrib/factorization/kernels/BUILD
+++ b/tensorflow/contrib/factorization/kernels/BUILD
@@ -67,14 +67,3 @@ tf_cc_test(
         "//tensorflow/core:testlib",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-)
diff --git a/tensorflow/contrib/factorization/python/ops/clustering_ops.py b/tensorflow/contrib/factorization/python/ops/clustering_ops.py
index 23137e0a973c0bdd2cdbd97159f7fd310178bf54..84e80791f4991ad2b67d0a00ee1e00cf0d0daadc 100644
--- a/tensorflow/contrib/factorization/python/ops/clustering_ops.py
+++ b/tensorflow/contrib/factorization/python/ops/clustering_ops.py
@@ -41,11 +41,12 @@ from tensorflow.python.platform import resource_loader
 _clustering_ops = loader.load_op_library(
     resource_loader.get_path_to_datafile('_clustering_ops.so'))
 
-# Euclidean distance between vectors U and V is defined as ||U - V||_F which is
-# the square root of the sum of the absolute squares of the elements difference.
+# Euclidean distance between vectors U and V is defined as \\(||U - V||_F\\)
+# which is the square root of the sum of the absolute squares of the elements
+# difference.
 SQUARED_EUCLIDEAN_DISTANCE = 'squared_euclidean'
 # Cosine distance between vectors U and V is defined as
-# 1 - (U \dot V) / (||U||_F ||V||_F)
+# \\(1 - (U \dot V) / (||U||_F ||V||_F)\\)
 COSINE_DISTANCE = 'cosine'
 
 RANDOM_INIT = 'random'
@@ -472,8 +473,8 @@ class KMeans(object):
         # Locally compute the sum of inputs mapped to each id.
         # For a cluster with old cluster value x, old count n, and with data
         # d_1,...d_k newly assigned to it, we recompute the new value as
-        # x += (sum_i(d_i) - k * x) / (n + k).
-        # Compute sum_i(d_i), see comment above.
+        # \\(x += (sum_i(d_i) - k * x) / (n + k)\\).
+        # Compute \\(sum_i(d_i)\\), see comment above.
         cluster_center_updates = math_ops.unsorted_segment_sum(
             inp, unique_idx, num_unique_cluster_idx)
         # Shape to enable broadcasting count_updates and learning_rate to inp.
diff --git a/tensorflow/contrib/factorization/python/ops/factorization_ops.py b/tensorflow/contrib/factorization/python/ops/factorization_ops.py
index 8e0ed1d80ec2603862aedb19cef1532626edb37c..811fa89bc38c61b16710a441b99d9e5dfac67668 100644
--- a/tensorflow/contrib/factorization/python/ops/factorization_ops.py
+++ b/tensorflow/contrib/factorization/python/ops/factorization_ops.py
@@ -51,9 +51,9 @@ class WALSModel(object):
   r"""A model for Weighted Alternating Least Squares matrix factorization.
 
   It minimizes the following loss function over U, V:
-   \\(
-   \|\sqrt W \odot (A - U V^T) \|_F^2 + \lambda (\|U\|_F^2 + \|V\|_F^2)
-   )\\
+  $$
+   \|\sqrt W \odot (A - U V^T)\|_F^2 + \lambda (\|U\|_F^2 + \|V\|_F^2)
+  $$
     where,
     A: input matrix,
     W: weight matrix. Note that the (element-wise) square root of the weights
@@ -61,12 +61,12 @@ class WALSModel(object):
     U, V: row_factors and column_factors matrices,
     \\(\lambda)\\: regularization.
   Also we assume that W is of the following special form:
-  \\( W_{ij} = W_0 + R_i * C_j )\\  if \\(A_{ij} \ne 0)\\,
-  \\(W_{ij} = W_0)\\ otherwise.
+  \\( W_{ij} = W_0 + R_i * C_j \\)  if \\(A_{ij} \ne 0\\),
+  \\(W_{ij} = W_0\\) otherwise.
   where,
-  \\(W_0)\\: unobserved_weight,
-  \\(R_i)\\: row_weights,
-  \\(C_j)\\: col_weights.
+  \\(W_0\\): unobserved_weight,
+  \\(R_i\\): row_weights,
+  \\(C_j\\): col_weights.
 
   Note that the current implementation supports two operation modes: The default
   mode is for the condition where row_factors and col_factors can individually
@@ -82,14 +82,15 @@ class WALSModel(object):
   normalized as follows:
     _, _, unregularized_loss, regularization, sum_weights =
         update_row_factors(sp_input)
-  if sp_input contains the rows {A_i, i \in I}, and the input matrix A has n
-  total rows, then the minibatch loss = unregularized_loss + regularization is
-   \\(
+  if sp_input contains the rows \\({A_i, i \in I}\\), and the input matrix A
+  has n total rows, then the minibatch loss = unregularized_loss +
+  regularization is
+   $$
    (\|\sqrt W_I \odot (A_I - U_I V^T)\|_F^2 + \lambda \|U_I\|_F^2) * n / |I| +
    \lambda \|V\|_F^2
-   )\\
+   $$
   The sum_weights tensor contains the normalized sum of weights
-  sum(W_I) * n / |I|.
+  \\(sum(W_I) * n / |I|\\).
 
   A typical usage example (pseudocode):
 
@@ -223,7 +224,7 @@ class WALSModel(object):
         factor shard. In this case,  w_ij = unobserved_weight +
                                             row_weights[i] * col_weights[j].
         - If this is a single non-negative real number, this value is used for
-        all row weights and w_ij = unobserved_weight + row_weights *
+        all row weights and \\(w_ij\\) = unobserved_weight + row_weights *
                                    col_weights[j].
         Note that it is allowed to have row_weights as a list while col_weights
         a single number or vice versa.
@@ -665,18 +666,18 @@ class WALSModel(object):
         factors.
       unregularized_loss: A tensor (scalar) that contains the normalized
         minibatch loss corresponding to sp_input, without the regularization
-        term. If sp_input contains the rows {A_{i, :}, i \in I}, and the input
-        matrix A has n total rows, then the unregularized loss is:
-        (\|\sqrt W_I \odot (A_I - U_I V^T)\|_F^2 * n / |I|
+        term. If sp_input contains the rows \\({A_{i, :}, i \in I}\\), and the
+        input matrix A has n total rows, then the unregularized loss is:
+        \\(\|\sqrt W_I \odot (A_I - U_I V^T)\|_F^2 * n / |I|\\)
         The total loss is unregularized_loss + regularization.
       regularization: A tensor (scalar) that contains the normalized
         regularization term for the minibatch loss corresponding to sp_input.
-        If sp_input contains the rows {A_{i, :}, i \in I}, and the input matrix
-        A has n total rows, then the regularization term is:
-        \lambda \|U_I\|_F^2) * n / |I| + \lambda \|V\|_F^2.
+        If sp_input contains the rows \\({A_{i, :}, i \in I}\\), and the input
+        matrix A has n total rows, then the regularization term is:
+        \\(\lambda \|U_I\|_F^2) * n / |I| + \lambda \|V\|_F^2\\).
       sum_weights: The sum of the weights W_I corresponding to sp_input,
-        normalized by a factor of n / |I|. The root weighted squared error is:
-        \sqrt(unregularized_loss / sum_weights).
+        normalized by a factor of \\(n / |I|\\). The root weighted squared
+        error is: \sqrt(unregularized_loss / sum_weights).
     """
     return self._process_input_helper(
         True, sp_input=sp_input, transpose_input=transpose_input)
@@ -698,18 +699,18 @@ class WALSModel(object):
         factors.
       unregularized_loss: A tensor (scalar) that contains the normalized
         minibatch loss corresponding to sp_input, without the regularization
-        term. If sp_input contains the columns {A_{:, j}, j \in J}, and the
-        input matrix A has m total columns, then the unregularized loss is:
-        (\|\sqrt W_J \odot (A_J - U V_J^T)\|_F^2 * m / |I|
+        term. If sp_input contains the columns \\({A_{:, j}, j \in J}\\), and
+        the input matrix A has m total columns, then the unregularized loss is:
+        \\(\|\sqrt W_J \odot (A_J - U V_J^T)\|_F^2 * m / |I|\\)
         The total loss is unregularized_loss + regularization.
       regularization: A tensor (scalar) that contains the normalized
         regularization term for the minibatch loss corresponding to sp_input.
-        If sp_input contains the columns {A_{:, j}, j \in J}, and the input
-        matrix A has m total columns, then the regularization term is:
-        \lambda \|V_J\|_F^2) * m / |J| + \lambda \|U\|_F^2.
+        If sp_input contains the columns \\({A_{:, j}, j \in J}\\), and the
+        input matrix A has m total columns, then the regularization term is:
+        \\(\lambda \|V_J\|_F^2) * m / |J| + \lambda \|U\|_F^2\\).
       sum_weights: The sum of the weights W_J corresponding to sp_input,
-        normalized by a factor of m / |J|. The root weighted squared error is:
-        \sqrt(unregularized_loss / sum_weights).
+        normalized by a factor of \\(m / |J|\\). The root weighted squared
+        error is: \sqrt(unregularized_loss / sum_weights).
     """
     return self._process_input_helper(
         False, sp_input=sp_input, transpose_input=transpose_input)
@@ -720,8 +721,8 @@ class WALSModel(object):
                           projection_weights=None):
     """Projects the row factors.
 
-    This computes the row embedding u_i for an observed row a_i by solving
-    one iteration of the update equations.
+    This computes the row embedding \\(u_i\\) for an observed row \\(a_i\\) by
+    solving one iteration of the update equations.
 
     Args:
       sp_input: A SparseTensor representing a set of rows. Please note that the
@@ -753,8 +754,8 @@ class WALSModel(object):
                           projection_weights=None):
     """Projects the column factors.
 
-    This computes the column embedding v_j for an observed column a_j by solving
-    one iteration of the update equations.
+    This computes the column embedding \\(v_j\\) for an observed column
+    \\(a_j\\) by solving one iteration of the update equations.
 
     Args:
       sp_input: A SparseTensor representing a set of columns. Please note that
@@ -938,7 +939,7 @@ class WALSModel(object):
     loss_sp_input = (sparse_ops.sparse_transpose(new_sp_input)
                      if transpose_input else new_sp_input)
     # sp_approx is the low rank estimate of the input matrix, formed by
-    # computing the product <u_i, v_j> for (i, j) in loss_sp_input.indices.
+    # computing the product <\\(u_i, v_j\\)> for (i, j) in loss_sp_input.indices.
     sp_approx_vals = gen_factorization_ops.masked_matmul(
         new_left_values,
         right,
diff --git a/tensorflow/contrib/factorization/python/ops/factorization_ops_test.py b/tensorflow/contrib/factorization/python/ops/factorization_ops_test.py
index 002f9cfbddd67b6b124f4e22dd43b808c4d48b2a..bb5140aeb3bf0238ca7cb52067ea6328dd1736d5 100644
--- a/tensorflow/contrib/factorization/python/ops/factorization_ops_test.py
+++ b/tensorflow/contrib/factorization/python/ops/factorization_ops_test.py
@@ -283,8 +283,8 @@ class WalsModelTest(test.TestCase):
 
       # Test column projection.
       # Using the specified projection weights for the 3 column feature vectors.
-      # This is expected to reproduce the same column factors in the model as the
-      # weights and feature vectors are identical to that used in model
+      # This is expected to reproduce the same column factors in the model as
+      # the weights and feature vectors are identical to that used in model
       # training.
       projected_cols = wals_model.project_col_factors(
           sp_input=sp_feeder,
@@ -462,8 +462,8 @@ class WalsModelTest(test.TestCase):
 
       # Test column projection.
       # Using the specified projection weights for the 2 column feature vectors.
-      # This is expected to reproduce the same column factors in the model as the
-      # weights and feature vectors are identical to that used in model
+      # This is expected to reproduce the same column factors in the model as
+      # the weights and feature vectors are identical to that used in model
       # training.
       projected_cols = wals_model.project_col_factors(
           sp_input=sp_feeder,
diff --git a/tensorflow/contrib/factorization/python/ops/gmm_ops.py b/tensorflow/contrib/factorization/python/ops/gmm_ops.py
index 14d4c733e379a35d1ea3085bc633df174d12b01c..5d77bc77e124378e13667673e4e841c0a1135b31 100644
--- a/tensorflow/contrib/factorization/python/ops/gmm_ops.py
+++ b/tensorflow/contrib/factorization/python/ops/gmm_ops.py
@@ -357,8 +357,8 @@ class GmmAlgorithm(object):
     # Shape broadcasting.
     probs = array_ops.expand_dims(self._probs[shard_id], 0)
     # Membership weights are computed as:
-    # w_{ik} = \frac{\alpha_k f(\mathbf{y_i}|\mathbf{\theta}_k)}
-    #               {\sum_{m=1}^{K}\alpha_mf(\mathbf{y_i}|\mathbf{\theta}_m)}
+    # $$w_{ik} = \frac{\alpha_k f(\mathbf{y_i}|\mathbf{\theta}_k)}$$
+    # $$            {\sum_{m=1}^{K}\alpha_mf(\mathbf{y_i}|\mathbf{\theta}_m)}$$
     # where "i" is the i-th example, "k" is the k-th mixture, theta are
     # the model parameters and y_i the observations.
     # These are defined for each shard.
diff --git a/tensorflow/contrib/factorization/python/ops/kmeans.py b/tensorflow/contrib/factorization/python/ops/kmeans.py
index 38faca119d0b5ee883de3b215428a0db8a021016..bfe338c9f9a7b761cfcd627b92f1682af97630c9 100644
--- a/tensorflow/contrib/factorization/python/ops/kmeans.py
+++ b/tensorflow/contrib/factorization/python/ops/kmeans.py
@@ -374,11 +374,11 @@ class KMeansClustering(estimator.Estimator):
               than `num_clusters`, a TensorFlow runtime error occurs.
       distance_metric: The distance metric used for clustering. One of:
         * `KMeansClustering.SQUARED_EUCLIDEAN_DISTANCE`: Euclidean distance
-             between vectors `u` and `v` is defined as `||u - v||_2` which is
-             the square root of the sum of the absolute squares of the elements'
-             difference.
+             between vectors `u` and `v` is defined as `\\(||u - v||_2\\)`
+             which is the square root of the sum of the absolute squares of
+             the elements' difference.
         * `KMeansClustering.COSINE_DISTANCE`: Cosine distance between vectors
-             `u` and `v` is defined as `1 - (u . v) / (||u||_2 ||v||_2)`.
+             `u` and `v` is defined as `\\(1 - (u . v) / (||u||_2 ||v||_2)\\)`.
       random_seed: Python integer. Seed for PRNG used to initialize centers.
       use_mini_batch: A boolean specifying whether to use the mini-batch k-means
         algorithm. See explanation above.
diff --git a/tensorflow/contrib/factorization/python/ops/wals.py b/tensorflow/contrib/factorization/python/ops/wals.py
index 62db3bb4c40e0b1e7adfeb682734f8efbfff9cdb..ca46c39baa16a7fddb96121e0402fc35d24ce1c2 100644
--- a/tensorflow/contrib/factorization/python/ops/wals.py
+++ b/tensorflow/contrib/factorization/python/ops/wals.py
@@ -216,7 +216,7 @@ def _wals_factorization_model_function(features, labels, mode, params):
         name=WALSMatrixFactorization.LOSS,
         collections=[ops.GraphKeys.GLOBAL_VARIABLES])
     # The root weighted squared error =
-    #   \sqrt( \sum_{i,j} w_ij * (a_ij - r_ij)^2 / \sum_{i,j} w_ij )
+    #   \\(\sqrt( \sum_{i,j} w_ij * (a_ij - r_ij)^2 / \sum_{i,j} w_ij )\\)
     rwse_var = variable_scope.variable(
         0.,
         trainable=False,
@@ -490,11 +490,11 @@ class WALSMatrixFactorization(estimator.Estimator):
           and the problem simplifies to ALS. Note that, in this case,
           col_weights must also be set to "None".
         - List of lists of non-negative scalars, of the form
-          [[w_0, w_1, ...], [w_k, ... ], [...]],
+          \\([[w_0, w_1, ...], [w_k, ... ], [...]]\\),
           where the number of inner lists equal to the number of row factor
           shards and the elements in each inner list are the weights for the
           rows of that shard. In this case,
-          w_ij = unonbserved_weight + row_weights[i] * col_weights[j].
+          \\(w_ij = unonbserved_weight + row_weights[i] * col_weights[j]\\).
         - A non-negative scalar: This value is used for all row weights.
           Note that it is allowed to have row_weights as a list and col_weights
           as a scalar, or vice-versa.
diff --git a/tensorflow/contrib/feature_column/BUILD b/tensorflow/contrib/feature_column/BUILD
index 3614b2b15a6cbdd73f9f24c7e4e4534228d31499..aab7d0c9e8874269bfa5f33193b0dc0ba4bbc9cd 100644
--- a/tensorflow/contrib/feature_column/BUILD
+++ b/tensorflow/contrib/feature_column/BUILD
@@ -8,18 +8,6 @@ licenses(["notice"])  # Apache 2.0
 
 load("//tensorflow:tensorflow.bzl", "py_test")
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 py_library(
     name = "feature_column_py",
     srcs = ["__init__.py"],
diff --git a/tensorflow/contrib/ffmpeg/BUILD b/tensorflow/contrib/ffmpeg/BUILD
index eccce99071dc1477cf4f3bb152f3304b3b0fc35a..f7b3273a4d35eadb9fad49399b7bf18d4bd33503 100644
--- a/tensorflow/contrib/ffmpeg/BUILD
+++ b/tensorflow/contrib/ffmpeg/BUILD
@@ -180,15 +180,3 @@ py_library(
         "//tensorflow/python:util",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/ffmpeg/default/BUILD b/tensorflow/contrib/ffmpeg/default/BUILD
index 6b455567d766dbe6d380a498bd7f521db27e077b..59bad8982dd163f89f37e1a0a9d5017d0c495de3 100644
--- a/tensorflow/contrib/ffmpeg/default/BUILD
+++ b/tensorflow/contrib/ffmpeg/default/BUILD
@@ -74,15 +74,3 @@ tf_cc_test(
         "//tensorflow/core:test",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/framework/BUILD b/tensorflow/contrib/framework/BUILD
index ac043fda0638e61f422e769ab3047a53a1b377bd..b1c8ad49eaf8d2400e431fcf4820fca6e0314557 100644
--- a/tensorflow/contrib/framework/BUILD
+++ b/tensorflow/contrib/framework/BUILD
@@ -321,15 +321,3 @@ py_test(
         "//third_party/py/numpy",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/framework/python/ops/arg_scope.py b/tensorflow/contrib/framework/python/ops/arg_scope.py
index 3cad1fee1984042e3a9ab91a0af70cbaca25cece..5b150339953f961c756c0909dd1795341159b9cd 100644
--- a/tensorflow/contrib/framework/python/ops/arg_scope.py
+++ b/tensorflow/contrib/framework/python/ops/arg_scope.py
@@ -68,7 +68,7 @@ from tensorflow.python.util import tf_decorator
 
 __all__ = [
     'arg_scope', 'add_arg_scope', 'current_arg_scope', 'has_arg_scope',
-    'arg_scoped_arguments'
+    'arg_scoped_arguments', 'arg_scope_func_key'
 ]
 
 _ARGSTACK = [{}]
@@ -89,7 +89,7 @@ def current_arg_scope():
   return stack[-1]
 
 
-def _key_op(op):
+def arg_scope_func_key(op):
   return getattr(op, '_key_op', str(op))
 
 
@@ -103,9 +103,9 @@ def _kwarg_names(func):
 
 
 def _add_op(op):
-  key_op = _key_op(op)
-  if key_op not in _DECORATED_OPS:
-    _DECORATED_OPS[key_op] = _kwarg_names(op)
+  key = arg_scope_func_key(op)
+  if key not in _DECORATED_OPS:
+    _DECORATED_OPS[key] = _kwarg_names(op)
 
 
 @tf_contextlib.contextmanager
@@ -147,16 +147,16 @@ def arg_scope(list_ops_or_scope, **kwargs):
     try:
       current_scope = current_arg_scope().copy()
       for op in list_ops_or_scope:
-        key_op = _key_op(op)
+        key = arg_scope_func_key(op)
         if not has_arg_scope(op):
           raise ValueError('%s is not decorated with @add_arg_scope',
                            _name_op(op))
-        if key_op in current_scope:
-          current_kwargs = current_scope[key_op].copy()
+        if key in current_scope:
+          current_kwargs = current_scope[key].copy()
           current_kwargs.update(kwargs)
-          current_scope[key_op] = current_kwargs
+          current_scope[key] = current_kwargs
         else:
-          current_scope[key_op] = kwargs.copy()
+          current_scope[key] = kwargs.copy()
       _get_arg_stack().append(current_scope)
       yield current_scope
     finally:
@@ -176,14 +176,14 @@ def add_arg_scope(func):
   def func_with_args(*args, **kwargs):
     current_scope = current_arg_scope()
     current_args = kwargs
-    key_func = _key_op(func)
+    key_func = arg_scope_func_key(func)
     if key_func in current_scope:
       current_args = current_scope[key_func].copy()
       current_args.update(kwargs)
     return func(*args, **current_args)
 
   _add_op(func)
-  setattr(func_with_args, '_key_op', _key_op(func))
+  setattr(func_with_args, '_key_op', arg_scope_func_key(func))
   return tf_decorator.make_decorator(func, func_with_args)
 
 
@@ -196,7 +196,7 @@ def has_arg_scope(func):
   Returns:
     a boolean.
   """
-  return _key_op(func) in _DECORATED_OPS
+  return arg_scope_func_key(func) in _DECORATED_OPS
 
 
 def arg_scoped_arguments(func):
@@ -209,4 +209,4 @@ def arg_scoped_arguments(func):
     a list of kwargs names.
   """
   assert has_arg_scope(func)
-  return _DECORATED_OPS[_key_op(func)]
+  return _DECORATED_OPS[arg_scope_func_key(func)]
diff --git a/tensorflow/contrib/framework/python/ops/arg_scope_test.py b/tensorflow/contrib/framework/python/ops/arg_scope_test.py
index 7ba9d4ffa90f6860629b15a2ea91e0c573bf6368..4c3879d4fc08b53ea8be5f1256a830a64fb39af6 100644
--- a/tensorflow/contrib/framework/python/ops/arg_scope_test.py
+++ b/tensorflow/contrib/framework/python/ops/arg_scope_test.py
@@ -170,6 +170,30 @@ class ArgScopeTest(test.TestCase):
         self.assertTupleEqual(args, func1_args)
         self.assertDictEqual(kwargs, func1_kwargs)
 
+  def testNestedArgScopeObjectCreatedOutsideScopeOverridesArgScope(self):
+
+    def get_scope_object():
+      with arg_scope([func1], a=1, b=None, c=[1]) as sc:
+        return sc
+
+    scope_object = get_scope_object()
+    with arg_scope([func1], b=2, d=10):
+      with arg_scope(scope_object):
+        args, kwargs = func1(0)
+        self.assertTupleEqual(args, (0,))
+        self.assertDictEqual(kwargs, {'a': 1, 'b': None, 'c': [1]})
+
+  def testArgScopeObjectCreatedWithinScopeInheritsArgScope(self):
+    def get_scope_object():
+      with arg_scope([func1], a=1, b=None, c=[1]) as sc:
+        return sc
+
+    with arg_scope([func1], b=2, d=10):
+      with arg_scope(get_scope_object()):
+        args, kwargs = func1(0)
+        self.assertTupleEqual(args, (0,))
+        self.assertDictEqual(kwargs, {'a': 1, 'b': None, 'c': [1], 'd': 10})
+
   def testSharedArgScope(self):
     func1_args = (0,)
     func1_kwargs = {'a': 1, 'b': None, 'c': [1]}
diff --git a/tensorflow/contrib/fused_conv/BUILD b/tensorflow/contrib/fused_conv/BUILD
index ce37672895b37275770d2f5410f662e9acf1de9d..0eb6889db1fae1c74aeb4392441b308392b091a5 100644
--- a/tensorflow/contrib/fused_conv/BUILD
+++ b/tensorflow/contrib/fused_conv/BUILD
@@ -157,15 +157,3 @@ cuda_py_test(
         "requires_cudnn6",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/gan/BUILD b/tensorflow/contrib/gan/BUILD
index 0eb0e3cbe20f5804db5476c08167d4e1c9080cfa..b305f37791d71f5a6edeada2bb710a2e5f23087d 100644
--- a/tensorflow/contrib/gan/BUILD
+++ b/tensorflow/contrib/gan/BUILD
@@ -354,6 +354,7 @@ py_test(
     name = "classifier_metrics_test",
     srcs = ["python/eval/python/classifier_metrics_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["no_windows"],  # TODO: needs investigation on Windows
     deps = [
         ":classifier_metrics",
         "//tensorflow/core:protos_all_py",
@@ -363,6 +364,7 @@ py_test(
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:variables",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -544,15 +546,3 @@ py_test(
         "//third_party/py/numpy",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/gan/python/estimator/python/gan_estimator_impl.py b/tensorflow/contrib/gan/python/estimator/python/gan_estimator_impl.py
index 082c42eba180917e732bb7890129dfa94bf00fec..e3fc6bf0f034051fc33ff5966e2f4ea85aa538db 100644
--- a/tensorflow/contrib/gan/python/estimator/python/gan_estimator_impl.py
+++ b/tensorflow/contrib/gan/python/estimator/python/gan_estimator_impl.py
@@ -88,8 +88,8 @@ class GANEstimator(estimator.Estimator):
           discriminator_fn=discriminator_fn,
           generator_loss_fn=tfgan.losses.wasserstein_generator_loss,
           discriminator_loss_fn=tfgan.losses.wasserstein_discriminator_loss,
-          generator_optimizer=tf.train.AdamOptimizier(0.1, 0.5),
-          discriminator_optimizer=tf.train.AdamOptimizier(0.1, 0.5))
+          generator_optimizer=tf.train.AdamOptimizer(0.1, 0.5),
+          discriminator_optimizer=tf.train.AdamOptimizer(0.1, 0.5))
 
       # Train estimator.
       gan_estimator.train(train_input_fn, steps)
diff --git a/tensorflow/contrib/gan/python/eval/python/classifier_metrics_test.py b/tensorflow/contrib/gan/python/eval/python/classifier_metrics_test.py
index 663e49bdca3cb2dd9257da326488c877fcc4256d..4fb8d58bc9125664d42260de72b83b2362eff9ba 100644
--- a/tensorflow/contrib/gan/python/eval/python/classifier_metrics_test.py
+++ b/tensorflow/contrib/gan/python/eval/python/classifier_metrics_test.py
@@ -22,6 +22,7 @@ import os
 import tarfile
 import tempfile
 
+from absl.testing import parameterized
 import numpy as np
 from scipy import linalg as scp_linalg
 
@@ -182,13 +183,20 @@ def _run_with_mock(function, *args, **kwargs):
     return function(*args, **kwargs)
 
 
-class ClassifierMetricsTest(test.TestCase):
+class ClassifierMetricsTest(test.TestCase, parameterized.TestCase):
 
-  def test_run_inception_graph(self):
+  @parameterized.named_parameters(
+      ('GraphDef', False),
+      ('DefaultGraphDefFn', True))
+  def test_run_inception_graph(self, use_default_graph_def):
     """Test `run_inception` graph construction."""
     batch_size = 7
     img = array_ops.ones([batch_size, 299, 299, 3])
-    logits = _run_with_mock(classifier_metrics.run_inception, img)
+
+    if use_default_graph_def:
+      logits = _run_with_mock(classifier_metrics.run_inception, img)
+    else:
+      logits = classifier_metrics.run_inception(img, _get_dummy_graphdef())
 
     self.assertTrue(isinstance(logits, ops.Tensor))
     logits.shape.assert_is_compatible_with([batch_size, 1001])
@@ -196,14 +204,23 @@ class ClassifierMetricsTest(test.TestCase):
     # Check that none of the model variables are trainable.
     self.assertListEqual([], variables.trainable_variables())
 
-  def test_run_inception_graph_pool_output(self):
+  @parameterized.named_parameters(
+      ('GraphDef', False),
+      ('DefaultGraphDefFn', True))
+  def test_run_inception_graph_pool_output(self, use_default_graph_def):
     """Test `run_inception` graph construction with pool output."""
     batch_size = 3
     img = array_ops.ones([batch_size, 299, 299, 3])
-    pool = _run_with_mock(
-        classifier_metrics.run_inception,
-        img,
-        output_tensor=classifier_metrics.INCEPTION_FINAL_POOL)
+
+    if use_default_graph_def:
+      pool = _run_with_mock(
+          classifier_metrics.run_inception,
+          img,
+          output_tensor=classifier_metrics.INCEPTION_FINAL_POOL)
+    else:
+      pool = classifier_metrics.run_inception(
+          img, _get_dummy_graphdef(),
+          output_tensor=classifier_metrics.INCEPTION_FINAL_POOL)
 
     self.assertTrue(isinstance(pool, ops.Tensor))
     pool.shape.assert_is_compatible_with([batch_size, 2048])
diff --git a/tensorflow/contrib/gan/python/eval/python/summaries_impl.py b/tensorflow/contrib/gan/python/eval/python/summaries_impl.py
index 0d1afad72da8a8e087239868e25ddebe23490d1e..508f487722fba89cc8391a340f73673a526e86c4 100644
--- a/tensorflow/contrib/gan/python/eval/python/summaries_impl.py
+++ b/tensorflow/contrib/gan/python/eval/python/summaries_impl.py
@@ -31,6 +31,7 @@ __all__ = [
     'add_image_comparison_summaries',
     'add_gan_model_summaries',
     'add_regularization_loss_summaries',
+    'add_cyclegan_image_summaries',
 ]
 
 
@@ -51,14 +52,9 @@ def add_gan_model_image_summaries(gan_model, grid_size=4, model_summaries=True):
     ValueError: If real and generated data aren't images.
   """
   if isinstance(gan_model, namedtuples.CycleGANModel):
-    saved_params = locals()
-    saved_params.pop('gan_model', None)
-    with ops.name_scope('cyclegan_x2y_image_summaries'):
-      add_gan_model_image_summaries(gan_model.model_x2y, **saved_params)
-    with ops.name_scope('cyclegan_y2x_image_summaries'):
-      add_gan_model_image_summaries(gan_model.model_y2x, **saved_params)
-    return
-
+    raise ValueError(
+        '`add_gan_model_image_summaries` does not take CycleGANModels. Please '
+        'use `add_cyclegan_image_summaries` instead.')
   _assert_is_image(gan_model.real_data)
   _assert_is_image(gan_model.generated_data)
 
@@ -89,6 +85,49 @@ def add_gan_model_image_summaries(gan_model, grid_size=4, model_summaries=True):
     add_gan_model_summaries(gan_model)
 
 
+def add_cyclegan_image_summaries(cyclegan_model):
+  """Adds image summaries for CycleGAN.
+
+  There are two summaries, one for each generator. The first image is the
+  generator input, the second is the generator output, and the third is G(F(x)).
+
+  Args:
+    cyclegan_model: A CycleGANModel tuple.
+
+  Raises:
+    ValueError: If `cyclegan_model` isn't a CycleGANModel.
+    ValueError: If generated data, generator inputs, and reconstructions aren't
+      images.
+    ValueError: If the generator input, generated data, and reconstructions
+      aren't all the same size.
+  """
+  if not isinstance(cyclegan_model, namedtuples.CycleGANModel):
+    raise ValueError('`cyclegan_model` was not a CycleGANModel. Instead, was '
+                     '%s' % type(cyclegan_model))
+
+  _assert_is_image(cyclegan_model.model_x2y.generator_inputs)
+  _assert_is_image(cyclegan_model.model_x2y.generated_data)
+  _assert_is_image(cyclegan_model.reconstructed_x)
+  _assert_is_image(cyclegan_model.model_y2x.generator_inputs)
+  _assert_is_image(cyclegan_model.model_y2x.generated_data)
+  _assert_is_image(cyclegan_model.reconstructed_y)
+
+  def _add_comparison_summary(gan_model, reconstructions):
+    image_list = (array_ops.unstack(gan_model.generator_inputs[:1]) +
+                  array_ops.unstack(gan_model.generated_data[:1]) +
+                  array_ops.unstack(reconstructions[:1]))
+    summary.image(
+        'image_comparison', eval_utils.image_reshaper(
+            image_list, num_cols=len(image_list)), max_outputs=1)
+
+  with ops.name_scope('x2y_image_comparison_summaries'):
+    _add_comparison_summary(
+        cyclegan_model.model_x2y, cyclegan_model.reconstructed_x)
+  with ops.name_scope('y2x_image_comparison_summaries'):
+    _add_comparison_summary(
+        cyclegan_model.model_y2x, cyclegan_model.reconstructed_y)
+
+
 def add_image_comparison_summaries(gan_model, num_comparisons=2,
                                    display_diffs=False):
   """Adds image summaries to compare triplets of images.
@@ -109,15 +148,6 @@ def add_image_comparison_summaries(gan_model, num_comparisons=2,
     ValueError: If the generator input, real, and generated data aren't all the
       same size.
   """
-  if isinstance(gan_model, namedtuples.CycleGANModel):
-    saved_params = locals()
-    saved_params.pop('gan_model', None)
-    with ops.name_scope('cyclegan_x2y_image_comparison_summaries'):
-      add_image_comparison_summaries(gan_model.model_x2y, **saved_params)
-    with ops.name_scope('cyclegan_y2x_image_comparison_summaries'):
-      add_image_comparison_summaries(gan_model.model_y2x, **saved_params)
-    return
-
   _assert_is_image(gan_model.generator_inputs)
   _assert_is_image(gan_model.generated_data)
   _assert_is_image(gan_model.real_data)
diff --git a/tensorflow/contrib/gan/python/eval/python/summaries_test.py b/tensorflow/contrib/gan/python/eval/python/summaries_test.py
index 45eb108586bed07434ac29595164745eac6054c1..33d51bfc218ab93fb52439b1eefed98a4568c4a1 100644
--- a/tensorflow/contrib/gan/python/eval/python/summaries_test.py
+++ b/tensorflow/contrib/gan/python/eval/python/summaries_test.py
@@ -65,15 +65,14 @@ def get_cyclegan_model():
   return namedtuples.CycleGANModel(
       model_x2y=model_x2y,
       model_y2x=model_y2x,
-      reconstructed_x=array_ops.zeros([3, 30, 35, 6]),
-      reconstructed_y=array_ops.zeros([3, 30, 35, 6]))
+      reconstructed_x=array_ops.zeros([4, 32, 32, 3]),
+      reconstructed_y=array_ops.zeros([4, 32, 32, 3]))
 
 
 class SummariesTest(test.TestCase):
 
-  def _test_add_gan_model_image_summaries_impl(self, get_model_fn,
-                                               expected_num_summary_ops,
-                                               model_summaries):
+  def _test_add_gan_model_image_summaries_impl(
+      self, get_model_fn, expected_num_summary_ops, model_summaries):
     summaries.add_gan_model_image_summaries(get_model_fn(), grid_size=2,
                                             model_summaries=model_summaries)
 
@@ -89,8 +88,9 @@ class SummariesTest(test.TestCase):
   def test_add_gan_model_image_summaries_no_model(self):
     self._test_add_gan_model_image_summaries_impl(get_gan_model, 2, False)
 
-  def test_add_gan_model_image_summaries_for_cyclegan(self):
-    self._test_add_gan_model_image_summaries_impl(get_cyclegan_model, 10, True)
+  def test_cyclegan_image_summaries_dont_work(self):
+    with self.assertRaises(ValueError):
+      summaries.add_gan_model_image_summaries(get_cyclegan_model())
 
   def _test_add_gan_model_summaries_impl(self, get_model_fn,
                                          expected_num_summary_ops):
@@ -137,7 +137,11 @@ class SummariesTest(test.TestCase):
     self._test_add_image_comparison_summaries_impl(get_gan_model, 1)
 
   def test_add_image_comparison_summaries_for_cyclegan(self):
-    self._test_add_image_comparison_summaries_impl(get_cyclegan_model, 2)
+    summaries.add_cyclegan_image_summaries(get_cyclegan_model())
+
+    self.assertEquals(2, len(ops.get_collection(ops.GraphKeys.SUMMARIES)))
+    with self.test_session(use_gpu=True):
+      summary.merge_all().eval()
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/contrib/gan/python/losses/python/losses_impl.py b/tensorflow/contrib/gan/python/losses/python/losses_impl.py
index 39588b7219ebac1cc4855532be3fcc38e6381134..1ba3a641671c7f2a411a0c5f99228ca16eee1080 100644
--- a/tensorflow/contrib/gan/python/losses/python/losses_impl.py
+++ b/tensorflow/contrib/gan/python/losses/python/losses_impl.py
@@ -306,6 +306,7 @@ def wasserstein_gradient_penalty(
     discriminator_scope,
     epsilon=1e-10,
     target=1.0,
+    one_sided=False,
     weights=1.0,
     scope=None,
     loss_collection=ops.GraphKeys.LOSSES,
@@ -327,6 +328,8 @@ def wasserstein_gradient_penalty(
       computing the gradient norm.
     target: Optional Python number or `Tensor` indicating the target value of
       gradient norm. Defaults to 1.0.
+    one_sided: If `True`, penalty proposed in https://arxiv.org/abs/1709.08894
+      is used. Defaults to `False`.
     weights: Optional `Tensor` whose rank is either 0, or the same rank as
       `real_data` and `generated_data`, and must be broadcastable to
       them (i.e., all dimensions must be either `1`, or the same as the
@@ -377,10 +380,13 @@ def wasserstein_gradient_penalty(
     # For numerical stability, add epsilon to the sum before taking the square
     # root. Note tf.norm does not add epsilon.
     slopes = math_ops.sqrt(gradient_squares + epsilon)
-    penalties = math_ops.square(slopes / target - 1.0)
+    penalties = slopes / target - 1.0
+    if one_sided:
+      penalties = math_ops.maximum(0., penalties)
+    penalties_squared = math_ops.square(penalties)
     penalty = losses.compute_weighted_loss(
-        penalties, weights, scope=scope, loss_collection=loss_collection,
-        reduction=reduction)
+        penalties_squared, weights, scope=scope,
+        loss_collection=loss_collection, reduction=reduction)
 
     if add_summaries:
       summary.scalar('gradient_penalty_loss', penalty)
@@ -665,7 +671,7 @@ def least_squares_discriminator_loss(
     loss_collection=ops.GraphKeys.LOSSES,
     reduction=losses.Reduction.SUM_BY_NONZERO_WEIGHTS,
     add_summaries=False):
-  """Least squares generator loss.
+  """Least squares discriminator loss.
 
   This loss comes from `Least Squares Generative Adversarial Networks`
   (https://arxiv.org/abs/1611.04076).
diff --git a/tensorflow/contrib/gan/python/losses/python/losses_impl_test.py b/tensorflow/contrib/gan/python/losses/python/losses_impl_test.py
index dbaa624ae9d6a5a5949db692e52c0c1deb18b8df..2889e937436d2faa66b5693c19046e122cbaf652 100644
--- a/tensorflow/contrib/gan/python/losses/python/losses_impl_test.py
+++ b/tensorflow/contrib/gan/python/losses/python/losses_impl_test.py
@@ -481,6 +481,28 @@ class GradientPenaltyTest(test.TestCase, _PenaltyTest):
                       })
       self.assertAlmostEqual(self._expected_loss, loss, 5)
 
+  def test_loss_using_one_sided_mode(self):
+    generated_data = array_ops.placeholder(dtypes.float32, shape=(None, None))
+    real_data = array_ops.placeholder(dtypes.float32, shape=(None, None))
+
+    loss = tfgan_losses.wasserstein_gradient_penalty(
+        generated_data,
+        real_data,
+        self._kwargs['generator_inputs'],
+        self._kwargs['discriminator_fn'],
+        self._kwargs['discriminator_scope'],
+        one_sided=True)
+    self.assertEqual(generated_data.dtype, loss.dtype)
+
+    with self.test_session() as sess:
+      variables.global_variables_initializer().run()
+      loss = sess.run(loss,
+                      feed_dict={
+                          generated_data: self._generated_data_np,
+                          real_data: self._real_data_np,
+                      })
+      self.assertAlmostEqual(self._expected_loss, loss, 5)
+
   def test_loss_with_gradient_norm_target(self):
     """Test loss value with non default gradient norm target."""
     generated_data = array_ops.placeholder(dtypes.float32, shape=(None, None))
diff --git a/tensorflow/contrib/gan/python/train.py b/tensorflow/contrib/gan/python/train.py
index 776eb11ecb1624544d24611d8fe6ca19768b8313..73acd05b60a5fb02601423fd9234a56a34f75276 100644
--- a/tensorflow/contrib/gan/python/train.py
+++ b/tensorflow/contrib/gan/python/train.py
@@ -461,6 +461,7 @@ def gan_loss(
     gradient_penalty_weight=None,
     gradient_penalty_epsilon=1e-10,
     gradient_penalty_target=1.0,
+    gradient_penalty_one_sided=False,
     mutual_information_penalty_weight=None,
     aux_cond_generator_weight=None,
     aux_cond_discriminator_weight=None,
@@ -485,6 +486,8 @@ def gan_loss(
     gradient_penalty_target: If `gradient_penalty_weight` is not None, a Python
       number or `Tensor` indicating the target value of gradient norm. See the
       CIFAR10 section of https://arxiv.org/abs/1710.10196. Defaults to 1.0.
+    gradient_penalty_one_sided: If `True`, penalty proposed in
+      https://arxiv.org/abs/1709.08894 is used. Defaults to `False`.
     mutual_information_penalty_weight: If not `None`, must be a non-negative
       Python number or Tensor indicating how much to weight the mutual
       information penalty. See https://arxiv.org/abs/1606.03657 for more
@@ -546,6 +549,7 @@ def gan_loss(
         model,
         epsilon=gradient_penalty_epsilon,
         target=gradient_penalty_target,
+        one_sided=gradient_penalty_one_sided,
         add_summaries=add_summaries)
     dis_loss += gradient_penalty_weight * gp_loss
   if _use_aux_loss(mutual_information_penalty_weight):
diff --git a/tensorflow/contrib/gan/python/train_test.py b/tensorflow/contrib/gan/python/train_test.py
index f9bdaa74c948ecee11d5cfd89f06087924f8dace..3ebbe55d059e5e72607bc4efdbf95a6c96d99f11 100644
--- a/tensorflow/contrib/gan/python/train_test.py
+++ b/tensorflow/contrib/gan/python/train_test.py
@@ -359,10 +359,12 @@ class GANLossTest(test.TestCase):
     self.assertGreater(len(ops.get_collection(ops.GraphKeys.SUMMARIES)), 0)
 
   # Test gradient penalty option.
-  def _test_grad_penalty_helper(self, create_gan_model_fn):
+  def _test_grad_penalty_helper(self, create_gan_model_fn, one_sided=False):
     model = create_gan_model_fn()
     loss = train.gan_loss(model)
-    loss_gp = train.gan_loss(model, gradient_penalty_weight=1.0)
+    loss_gp = train.gan_loss(model,
+                             gradient_penalty_weight=1.0,
+                             gradient_penalty_one_sided=one_sided)
     self.assertTrue(isinstance(loss_gp, namedtuples.GANLoss))
 
     # Check values.
@@ -394,6 +396,25 @@ class GANLossTest(test.TestCase):
   def test_grad_penalty_callable_acgan(self):
     self._test_grad_penalty_helper(create_callable_acgan_model)
 
+  def test_grad_penalty_one_sided_gan(self):
+    self._test_grad_penalty_helper(create_gan_model, one_sided=True)
+
+  def test_grad_penalty_one_sided_callable_gan(self):
+    self._test_grad_penalty_helper(create_callable_gan_model, one_sided=True)
+
+  def test_grad_penalty_one_sided_infogan(self):
+    self._test_grad_penalty_helper(create_infogan_model, one_sided=True)
+
+  def test_grad_penalty_one_sided_callable_infogan(self):
+    self._test_grad_penalty_helper(
+        create_callable_infogan_model, one_sided=True)
+
+  def test_grad_penalty_one_sided_acgan(self):
+    self._test_grad_penalty_helper(create_acgan_model, one_sided=True)
+
+  def test_grad_penalty_one_sided_callable_acgan(self):
+    self._test_grad_penalty_helper(create_callable_acgan_model, one_sided=True)
+
   # Test mutual information penalty option.
   def _test_mutual_info_penalty_helper(self, create_gan_model_fn):
     train.gan_loss(create_gan_model_fn(),
diff --git a/tensorflow/contrib/gdr/BUILD b/tensorflow/contrib/gdr/BUILD
index 707ae25d485c64f15694ee0e357f32b619d3cd33..e534fdc17749974ebe713c2730682bea6d7a85e4 100644
--- a/tensorflow/contrib/gdr/BUILD
+++ b/tensorflow/contrib/gdr/BUILD
@@ -9,18 +9,6 @@ licenses(["notice"])  # Apache 2.0
 
 exports_files(["LICENSE"])
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 filegroup(
     name = "c_srcs",
     data = glob([
diff --git a/tensorflow/contrib/graph_editor/BUILD b/tensorflow/contrib/graph_editor/BUILD
index 967ad2fc090906e93f22c777816eede37f9a1b04..1711100e3a857dba0d15c5b4f6c96cddc568e800 100644
--- a/tensorflow/contrib/graph_editor/BUILD
+++ b/tensorflow/contrib/graph_editor/BUILD
@@ -39,18 +39,6 @@ py_library(
     ],
 )
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 py_library(
     name = "match",
     srcs = ["tests/match.py"],
diff --git a/tensorflow/contrib/graph_editor/select.py b/tensorflow/contrib/graph_editor/select.py
index 3ea6ff4d6163b107ca0daaf3b9ad1daf0ccc1f6f..d700e6e1a7523622f845acbbc353eb0f438c9bc2 100644
--- a/tensorflow/contrib/graph_editor/select.py
+++ b/tensorflow/contrib/graph_editor/select.py
@@ -383,6 +383,7 @@ def get_within_boundary_ops(ops,
 def get_forward_walk_ops(seed_ops,
                          inclusive=True,
                          within_ops=None,
+                         within_ops_fn=None,
                          stop_at_ts=(),
                          control_outputs=None):
   """Do a forward graph walk and return all the visited ops.
@@ -395,6 +396,9 @@ def get_forward_walk_ops(seed_ops,
     within_ops: an iterable of `tf.Operation` within which the search is
       restricted. If `within_ops` is `None`, the search is performed within
       the whole graph.
+    within_ops_fn: if provided, a function on ops that should return True iff
+      the op is within the graph traversal. This can be used along within_ops,
+      in which case an op is within if it is also in within_ops.
     stop_at_ts: an iterable of tensors at which the graph walk stops.
     control_outputs: a `util.ControlOutputs` instance or None.
       If not `None`, it will be used while walking the graph forward.
@@ -423,7 +427,8 @@ def get_forward_walk_ops(seed_ops,
     seed_ops &= within_ops
 
   def is_within(op):
-    return within_ops is None or op in within_ops
+    return (within_ops is None or op in within_ops) and (
+        within_ops_fn is None or within_ops_fn(op))
 
   result = list(seed_ops)
   wave = set(seed_ops)
@@ -450,6 +455,7 @@ def get_forward_walk_ops(seed_ops,
 def get_backward_walk_ops(seed_ops,
                           inclusive=True,
                           within_ops=None,
+                          within_ops_fn=None,
                           stop_at_ts=(),
                           control_inputs=False):
   """Do a backward graph walk and return all the visited ops.
@@ -462,6 +468,9 @@ def get_backward_walk_ops(seed_ops,
     within_ops: an iterable of `tf.Operation` within which the search is
       restricted. If `within_ops` is `None`, the search is performed within
       the whole graph.
+    within_ops_fn: if provided, a function on ops that should return True iff
+      the op is within the graph traversal. This can be used along within_ops,
+      in which case an op is within if it is also in within_ops.
     stop_at_ts: an iterable of tensors at which the graph walk stops.
     control_inputs: if True, control inputs will be used while moving backward.
   Returns:
@@ -488,7 +497,8 @@ def get_backward_walk_ops(seed_ops,
     seed_ops &= within_ops
 
   def is_within(op):
-    return within_ops is None or op in within_ops
+    return (within_ops is None or op in within_ops) and (
+        within_ops_fn is None or within_ops_fn(op))
 
   result = list(seed_ops)
   wave = set(seed_ops)
@@ -516,6 +526,7 @@ def get_walks_intersection_ops(forward_seed_ops,
                                forward_inclusive=True,
                                backward_inclusive=True,
                                within_ops=None,
+                               within_ops_fn=None,
                                control_inputs=False,
                                control_outputs=None,
                                control_ios=None):
@@ -535,6 +546,9 @@ def get_walks_intersection_ops(forward_seed_ops,
     within_ops: an iterable of tf.Operation within which the search is
       restricted. If within_ops is None, the search is performed within
       the whole graph.
+    within_ops_fn: if provided, a function on ops that should return True iff
+      the op is within the graph traversal. This can be used along within_ops,
+      in which case an op is within if it is also in within_ops.
     control_inputs: A boolean indicating whether control inputs are enabled.
     control_outputs: An instance of util.ControlOutputs or None. If not None,
       control outputs are enabled.
@@ -555,11 +569,13 @@ def get_walks_intersection_ops(forward_seed_ops,
       forward_seed_ops,
       inclusive=forward_inclusive,
       within_ops=within_ops,
+      within_ops_fn=within_ops_fn,
       control_outputs=control_outputs)
   backward_ops = get_backward_walk_ops(
       backward_seed_ops,
       inclusive=backward_inclusive,
       within_ops=within_ops,
+      within_ops_fn=within_ops_fn,
       control_inputs=control_inputs)
   return [op for op in forward_ops if op in backward_ops]
 
@@ -569,6 +585,7 @@ def get_walks_union_ops(forward_seed_ops,
                         forward_inclusive=True,
                         backward_inclusive=True,
                         within_ops=None,
+                        within_ops_fn=None,
                         control_inputs=False,
                         control_outputs=None,
                         control_ios=None):
@@ -587,6 +604,9 @@ def get_walks_union_ops(forward_seed_ops,
       resulting set.
     within_ops: restrict the search within those operations. If within_ops is
       None, the search is done within the whole graph.
+    within_ops_fn: if provided, a function on ops that should return True iff
+      the op is within the graph traversal. This can be used along within_ops,
+      in which case an op is within if it is also in within_ops.
     control_inputs: A boolean indicating whether control inputs are enabled.
     control_outputs: An instance of util.ControlOutputs or None. If not None,
       control outputs are enabled.
@@ -607,11 +627,13 @@ def get_walks_union_ops(forward_seed_ops,
       forward_seed_ops,
       inclusive=forward_inclusive,
       within_ops=within_ops,
+      within_ops_fn=within_ops_fn,
       control_outputs=control_outputs)
   backward_ops = get_backward_walk_ops(
       backward_seed_ops,
       inclusive=backward_inclusive,
       within_ops=within_ops,
+      within_ops_fn=within_ops_fn,
       control_inputs=control_inputs)
   return util.concatenate_unique(forward_ops, backward_ops)
 
diff --git a/tensorflow/contrib/graph_editor/tests/select_test.py b/tensorflow/contrib/graph_editor/tests/select_test.py
index 82f999637d0c1866a5a329974f021fe2e30fd33f..d12c6d3cbd11dde2b609a59154297a8907b0cadc 100644
--- a/tensorflow/contrib/graph_editor/tests/select_test.py
+++ b/tensorflow/contrib/graph_editor/tests/select_test.py
@@ -77,12 +77,10 @@ class SelectTest(test.TestCase):
     """Test for ge.get_ops_ios."""
     control_outputs = ge.util.ControlOutputs(self.graph)
     self.assertEqual(
-        len(ge.get_ops_ios(
-            self.h.op, control_ios=control_outputs)), 3)
+        len(ge.get_ops_ios(self.h.op, control_ios=control_outputs)), 3)
     self.assertEqual(len(ge.get_ops_ios(self.h.op)), 2)
     self.assertEqual(
-        len(ge.get_ops_ios(
-            self.c.op, control_ios=control_outputs)), 6)
+        len(ge.get_ops_ios(self.c.op, control_ios=control_outputs)), 6)
     self.assertEqual(len(ge.get_ops_ios(self.c.op)), 5)
 
   def test_compute_boundary_ts_0(self):
@@ -135,16 +133,49 @@ class SelectTest(test.TestCase):
     ops = ge.get_walks_intersection_ops([self.c.op], [self.g.op])
     self.assertEqual(len(ops), 2)
 
+    ops = ge.get_walks_intersection_ops([self.a.op], [self.f.op])
+    self.assertEqual(len(ops), 3)
+    self.assertTrue(self.a.op in ops)
+    self.assertTrue(self.c.op in ops)
+    self.assertTrue(self.f.op in ops)
+
+    within_ops = [self.a.op, self.f.op]
+    ops = ge.get_walks_intersection_ops(
+        [self.a.op], [self.f.op], within_ops=within_ops)
+    self.assertEqual(len(ops), 0)
+
+    within_ops_fn = lambda op: op in [self.a.op, self.f.op]
+    ops = ge.get_walks_intersection_ops(
+        [self.a.op], [self.f.op], within_ops_fn=within_ops_fn)
+    self.assertEqual(len(ops), 0)
+
   def test_get_walks_union(self):
     """Test for ge.get_walks_union_ops."""
     ops = ge.get_walks_union_ops([self.f.op], [self.g.op])
     self.assertEqual(len(ops), 6)
 
+    ops = ge.get_walks_union_ops([self.a.op], [self.f.op])
+    self.assertEqual(len(ops), 8)
+
+    within_ops = [self.a.op, self.c.op, self.d.op, self.f.op]
+    ops = ge.get_walks_union_ops([self.a.op], [self.f.op],
+                                 within_ops=within_ops)
+    self.assertEqual(len(ops), 4)
+    self.assertTrue(self.b.op not in ops)
+
+    within_ops_fn = lambda op: op in [self.a.op, self.c.op, self.f.op]
+    ops = ge.get_walks_union_ops([self.a.op], [self.f.op],
+                                 within_ops_fn=within_ops_fn)
+    self.assertEqual(len(ops), 3)
+    self.assertTrue(self.b.op not in ops)
+    self.assertTrue(self.d.op not in ops)
+
   def test_select_ops(self):
     parameters = (
         (("^foo/",), 7),
         (("^foo/bar/",), 4),
-        (("^foo/bar/", "a"), 5),)
+        (("^foo/bar/", "a"), 5),
+    )
     for param, length in parameters:
       ops = ge.select_ops(*param, graph=self.graph)
       self.assertEqual(len(ops), length)
@@ -152,7 +183,8 @@ class SelectTest(test.TestCase):
   def test_select_ts(self):
     parameters = (
         (".*:0", 8),
-        (r".*/bar/\w+:0", 4),)
+        (r".*/bar/\w+:0", 4),
+    )
     for regex, length in parameters:
       ts = ge.select_ts(regex, graph=self.graph)
       self.assertEqual(len(ts), length)
@@ -160,12 +192,121 @@ class SelectTest(test.TestCase):
   def test_select_ops_and_ts(self):
     parameters = (
         (("^foo/.*",), 7, 0),
-        (("^foo/.*", "(?#ts)^foo/bar/.*"), 7, 4),)
+        (("^foo/.*", "(?#ts)^foo/bar/.*"), 7, 4),
+    )
     for param, l0, l1 in parameters:
       ops, ts = ge.select_ops_and_ts(*param, graph=self.graph)
       self.assertEqual(len(ops), l0)
       self.assertEqual(len(ts), l1)
 
+  def test_forward_walk_ops(self):
+    seed_ops = [self.a.op, self.d.op]
+    # Include all ops except for self.g.op
+    within_ops = [
+        x.op for x in [self.a, self.b, self.c, self.d, self.e, self.f, self.h]
+    ]
+    # For the fn, exclude self.e.op.
+    within_ops_fn = lambda op: op not in (self.e.op,)
+    stop_at_ts = (self.f,)
+
+    with self.graph.as_default():
+      # No b.op since it's an independent source node.
+      # No g.op from within_ops.
+      # No e.op from within_ops_fn.
+      # No h.op from stop_at_ts and within_ops.
+      ops = ge.select.get_forward_walk_ops(
+          seed_ops,
+          inclusive=True,
+          within_ops=within_ops,
+          within_ops_fn=within_ops_fn,
+          stop_at_ts=stop_at_ts)
+      self.assertEqual(
+          set(ops), set([self.a.op, self.c.op, self.d.op, self.f.op]))
+
+      # Also no a.op and d.op when inclusive=False
+      ops = ge.select.get_forward_walk_ops(
+          seed_ops,
+          inclusive=False,
+          within_ops=within_ops,
+          within_ops_fn=within_ops_fn,
+          stop_at_ts=stop_at_ts)
+      self.assertEqual(set(ops), set([self.c.op, self.f.op]))
+
+      # Not using within_ops_fn adds e.op.
+      ops = ge.select.get_forward_walk_ops(
+          seed_ops,
+          inclusive=False,
+          within_ops=within_ops,
+          stop_at_ts=stop_at_ts)
+      self.assertEqual(set(ops), set([self.c.op, self.e.op, self.f.op]))
+
+      # Not using stop_at_ts adds back h.op.
+      ops = ge.select.get_forward_walk_ops(
+          seed_ops, inclusive=False, within_ops=within_ops)
+      self.assertEqual(
+          set(ops), set([self.c.op, self.e.op, self.f.op, self.h.op]))
+
+      # Starting just form a (the tensor, not op) omits a, b, d.
+      ops = ge.select.get_forward_walk_ops([self.a], inclusive=True)
+      self.assertEqual(
+          set(ops), set([self.c.op, self.e.op, self.f.op, self.g.op,
+                         self.h.op]))
+
+  def test_backward_walk_ops(self):
+    seed_ops = [self.h.op]
+    # Include all ops except for self.g.op
+    within_ops = [
+        x.op for x in [self.a, self.b, self.c, self.d, self.e, self.f, self.h]
+    ]
+    # For the fn, exclude self.c.op.
+    within_ops_fn = lambda op: op not in (self.c.op,)
+    stop_at_ts = (self.f,)
+
+    with self.graph.as_default():
+      # Backward walk only includes h since we stop at f and g is not within.
+      ops = ge.select.get_backward_walk_ops(
+          seed_ops,
+          inclusive=True,
+          within_ops=within_ops,
+          within_ops_fn=within_ops_fn,
+          stop_at_ts=stop_at_ts)
+      self.assertEqual(set(ops), set([self.h.op]))
+
+      # If we do inclusive=False, the result is empty.
+      ops = ge.select.get_backward_walk_ops(
+          seed_ops,
+          inclusive=False,
+          within_ops=within_ops,
+          within_ops_fn=within_ops_fn,
+          stop_at_ts=stop_at_ts)
+      self.assertEqual(set(ops), set())
+
+      # Removing stop_at_fs adds f.op, d.op.
+      ops = ge.select.get_backward_walk_ops(
+          seed_ops,
+          inclusive=True,
+          within_ops=within_ops,
+          within_ops_fn=within_ops_fn)
+      self.assertEqual(set(ops), set([self.d.op, self.f.op, self.h.op]))
+
+      # Not using within_ops_fn adds back ops for a, b, c.
+      ops = ge.select.get_backward_walk_ops(
+          seed_ops, inclusive=True, within_ops=within_ops)
+      self.assertEqual(
+          set(ops),
+          set([
+              self.a.op, self.b.op, self.c.op, self.d.op, self.f.op, self.h.op
+          ]))
+
+      # Vanially backward search via self.h.op includes everything excpet e.op.
+      ops = ge.select.get_backward_walk_ops(seed_ops, inclusive=True)
+      self.assertEqual(
+          set(ops),
+          set([
+              self.a.op, self.b.op, self.c.op, self.d.op, self.f.op, self.g.op,
+              self.h.op
+          ]))
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/grid_rnn/BUILD b/tensorflow/contrib/grid_rnn/BUILD
index d601a1ec6f7a219bcd461d819ab2dfc64135a3ae..d0b44640667010b58c017d933d50ae5f87e8b275 100644
--- a/tensorflow/contrib/grid_rnn/BUILD
+++ b/tensorflow/contrib/grid_rnn/BUILD
@@ -41,15 +41,3 @@ cuda_py_tests(
         "//tensorflow/python:variables",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/hooks/BUILD b/tensorflow/contrib/hooks/BUILD
index 1b528d7afc1112f5dc0667ae299ade02bc8fd04b..d65b2d6026dd89959aa62b57e07b073eef84572c 100644
--- a/tensorflow/contrib/hooks/BUILD
+++ b/tensorflow/contrib/hooks/BUILD
@@ -23,14 +23,3 @@ py_library(
         "//tensorflow/python:util",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-)
diff --git a/tensorflow/contrib/hvx/clock_cycle_profiling/BUILD b/tensorflow/contrib/hvx/clock_cycle_profiling/BUILD
index 324035100df366b80f57af9052c4bd935655b248..e39c60b252a1b49a68d51302fff47734869dddfe 100644
--- a/tensorflow/contrib/hvx/clock_cycle_profiling/BUILD
+++ b/tensorflow/contrib/hvx/clock_cycle_profiling/BUILD
@@ -13,18 +13,6 @@ exports_files(["LICENSE"])
 
 package(default_visibility = ["//visibility:public"])
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 tf_cc_binary(
     name = "clock_cycle_profiling",
     testonly = 1,
diff --git a/tensorflow/contrib/hvx/hvx_ops_support_checker/BUILD b/tensorflow/contrib/hvx/hvx_ops_support_checker/BUILD
index 909dc396a33b6fef1b2d51c3f52fab7782fc8ea5..0081fb61770075a2c36e92f65e01126f657edeb4 100644
--- a/tensorflow/contrib/hvx/hvx_ops_support_checker/BUILD
+++ b/tensorflow/contrib/hvx/hvx_ops_support_checker/BUILD
@@ -10,17 +10,6 @@ exports_files(["LICENSE"])
 
 load("//tensorflow:tensorflow.bzl", "tf_cc_binary")
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-)
-
 tf_cc_binary(
     name = "hvx_ops_support_checker",
     testonly = 1,
diff --git a/tensorflow/contrib/image/BUILD b/tensorflow/contrib/image/BUILD
index 79eb3762edbc17e5c4682ac42dff87ae423bddfe..da450480b30b548484e69c61c85667d6dd390417 100755
--- a/tensorflow/contrib/image/BUILD
+++ b/tensorflow/contrib/image/BUILD
@@ -384,15 +384,3 @@ cuda_py_test(
         "//tensorflow/python:platform_test",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op_gpu.cu.cc b/tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op_gpu.cu.cc
index b71ff9cd507faac66b3a33d3c02ec9b5901d814a..645abbf0b0ea5465dadf55d065e997e16940c18d 100644
--- a/tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op_gpu.cu.cc
+++ b/tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op_gpu.cu.cc
@@ -53,7 +53,7 @@ void AdjustHsvInYiqGPU::operator()(OpKernelContext* ctx, int channel_count,
   OP_REQUIRES_OK(ctx, ctx->allocate_temp(
                           DT_FLOAT, TensorShape({kChannelSize * kChannelSize}),
                           &tranformation_matrix));
-  // TODO(huangyp): It takes about 3.5 us to comute tranformation_matrix
+  // TODO(huangyp): It takes about 3.5 us to compute tranformation_matrix
   // with one thread. Improve its performance if necessary.
   internal::compute_tranformation_matrix_cuda<<<1, 1, 0, cu_stream>>>(
       delta_h, scale_s, scale_v, tranformation_matrix.flat<float>().data(),
diff --git a/tensorflow/contrib/image/ops/distort_image_ops.cc b/tensorflow/contrib/image/ops/distort_image_ops.cc
index b169b0b2b22ad6432baed2cc96711da5ca995875..ca49635d5d0bc7bb84b19508a74be74362d96ddf 100644
--- a/tensorflow/contrib/image/ops/distort_image_ops.cc
+++ b/tensorflow/contrib/image/ops/distort_image_ops.cc
@@ -36,9 +36,9 @@ REGISTER_OP("AdjustHsvInYiq")
 Adjust the YIQ hue of one or more images.
 
 `images` is a tensor of at least 3 dimensions.  The last dimension is
-interpretted as channels, and must be three.
+interpreted as channels, and must be three.
 
-We used linear transfomation described in:
+We used linear transformation described in:
  beesbuzz.biz/code/hsv_color_transforms.php
 The input image is considered in the RGB colorspace. Conceptually, the RGB
 colors are first mapped into YIQ space, rotated around the Y channel by
diff --git a/tensorflow/contrib/image/ops/image_ops.cc b/tensorflow/contrib/image/ops/image_ops.cc
index 68771b3d054a64ba94141c092e20df1ed6b2339b..ebdcaea7abae2a967786831b62b331897aa3f6a3 100644
--- a/tensorflow/contrib/image/ops/image_ops.cc
+++ b/tensorflow/contrib/image/ops/image_ops.cc
@@ -93,7 +93,7 @@ row_to_col_match_indices: A vector of length num_rows, which is the number of
   If `row_to_col_match_indices[i]` is not -1, row i is matched to column
   `row_to_col_match_indices[i]`.
 col_to_row_match_indices: A vector of length num_columns, which is the number
-  of columns of the input ditance matrix.
+  of columns of the input distance matrix.
   If `col_to_row_match_indices[j]` is not -1, column j is matched to row
   `col_to_row_match_indices[j]`.
 )doc");
diff --git a/tensorflow/contrib/image/ops/single_image_random_dot_stereograms_ops.cc b/tensorflow/contrib/image/ops/single_image_random_dot_stereograms_ops.cc
index 8139d4272d6950815bd39a64e86e0f7422e6f799..bd784c6bda0344c092c1ae0af2c60be50fdff102 100755
--- a/tensorflow/contrib/image/ops/single_image_random_dot_stereograms_ops.cc
+++ b/tensorflow/contrib/image/ops/single_image_random_dot_stereograms_ops.cc
@@ -69,7 +69,7 @@ Outputs a single image random dot stereogram for export via encode_PNG/JPG OP.
 Given the 2-D tensor 'depth_values' with encoded Z values, this operation will
 encode 3-D data into a 2-D image.  The output of this Op is suitable for the
 encode_PNG/JPG ops.  Be careful with image compression as this may corrupt the
-encode 3-D data witin the image.
+encode 3-D data within the image.
 
 This Op is based upon:
 'http://www.learningace.com/doc/4331582/b6ab058d1e206d68ab60e4e1ead2fe6e/sirds-paper'
@@ -111,7 +111,7 @@ output_image_shape: Output size of returned image in X,Y, Channels 1-grayscale,
 output_data_window: Size of "DATA" window, must be equal to or smaller than 'output_image_shape', will be centered
   and use 'convergence_dots_size' for best fit to avoid overlap if possible
 
-image:= A tensor of size 'output_image_shape' with the encloded 'depth_values'
+image:= A tensor of size 'output_image_shape' with the encoded 'depth_values'
 )doc");
 
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/image/python/ops/image_ops.py b/tensorflow/contrib/image/python/ops/image_ops.py
index c139ae89d8d682d6b87813c3a21703ffa762f28e..cd984c80543886be1f682933e2e003bd3374e425 100644
--- a/tensorflow/contrib/image/python/ops/image_ops.py
+++ b/tensorflow/contrib/image/python/ops/image_ops.py
@@ -433,7 +433,7 @@ def bipartite_match(distance_mat,
       of rows of the input `distance_matrix`. If `row_to_col_match_indices[i]`
       is not -1, row i is matched to column `row_to_col_match_indices[i]`.
     col_to_row_match_indices: A vector of length num_columns, which is the
-      number of columns of the input ditance matrix.
+      number of columns of the input distance matrix.
       If `col_to_row_match_indices[j]` is not -1, column j is matched to row
       `col_to_row_match_indices[j]`.
   """
diff --git a/tensorflow/contrib/image/python/ops/single_image_random_dot_stereograms.py b/tensorflow/contrib/image/python/ops/single_image_random_dot_stereograms.py
index d4a6a5bcbb52511d4093587814100b2a0e8b2420..0ceb683ff4c6965a5ee4bcb04846a69d4d8ea0a5 100755
--- a/tensorflow/contrib/image/python/ops/single_image_random_dot_stereograms.py
+++ b/tensorflow/contrib/image/python/ops/single_image_random_dot_stereograms.py
@@ -45,7 +45,7 @@ def single_image_random_dot_stereograms(depth_values,
   Given the 2-D tensor 'depth_values' with encoded Z values, this operation
   will encode 3-D data into a 2-D image.  The output of this Op is suitable
   for the encode_PNG/JPG ops.  Be careful with image compression as this may
-  corrupt the encode 3-D data witin the image.
+  corrupt the encode 3-D data within the image.
 
   Based upon [this
   paper](http://www.learningace.com/doc/4331582/b6ab058d1e206d68ab60e4e1ead2fe6e/sirds-paper).
diff --git a/tensorflow/contrib/input_pipeline/BUILD b/tensorflow/contrib/input_pipeline/BUILD
index 9d6b4d5d87e24d72b29ab33ee805fe0d068cc30a..0e34315db45d61282af1882631dc769a72965c3e 100644
--- a/tensorflow/contrib/input_pipeline/BUILD
+++ b/tensorflow/contrib/input_pipeline/BUILD
@@ -114,14 +114,3 @@ tf_cc_tests(
         "//tensorflow/core:testlib",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-)
diff --git a/tensorflow/contrib/input_pipeline/kernels/BUILD b/tensorflow/contrib/input_pipeline/kernels/BUILD
index f20a6e38d4e80f869e9274d6fc49338a95fc6788..797605b8fe66e8375edcc70668a07a8d2a6d73f3 100644
--- a/tensorflow/contrib/input_pipeline/kernels/BUILD
+++ b/tensorflow/contrib/input_pipeline/kernels/BUILD
@@ -17,14 +17,3 @@ cc_library(
     ],
     alwayslink = 1,
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-)
diff --git a/tensorflow/contrib/integrate/BUILD b/tensorflow/contrib/integrate/BUILD
index 66948c1ea1f3f239d3f43a57626f8c229fe24ad9..0b7d64f4edd7587000ca5b9ecae257fe8fedd4a1 100644
--- a/tensorflow/contrib/integrate/BUILD
+++ b/tensorflow/contrib/integrate/BUILD
@@ -42,14 +42,3 @@ py_test(
         "//third_party/py/numpy",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-)
diff --git a/tensorflow/contrib/integrate/__init__.py b/tensorflow/contrib/integrate/__init__.py
index 68bf511099ab473d158108df6ff07827819297d9..694f0c14bd4e74535c70fab76c5f7ac58f452559 100644
--- a/tensorflow/contrib/integrate/__init__.py
+++ b/tensorflow/contrib/integrate/__init__.py
@@ -18,6 +18,7 @@
 See the @{$python/contrib.integrate} guide.
 
 @@odeint
+@@odeint_fixed
 """
 
 from __future__ import absolute_import
diff --git a/tensorflow/contrib/kafka/BUILD b/tensorflow/contrib/kafka/BUILD
index 1c3974871c62911c0cb47677eb92d28286837142..3913c9dc7abfba2829bde5e86fe2927e8fc29a9d 100644
--- a/tensorflow/contrib/kafka/BUILD
+++ b/tensorflow/contrib/kafka/BUILD
@@ -119,17 +119,3 @@ tf_py_test(
         "notap",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        include = [
-            "**/*",
-        ],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/keras/BUILD b/tensorflow/contrib/keras/BUILD
index 7e0019ce4ad6c96e09ac9e222e2f4e2840273983..7a4cab20d1a3471af2a2a402a6d1443a90fa7f9b 100644
--- a/tensorflow/contrib/keras/BUILD
+++ b/tensorflow/contrib/keras/BUILD
@@ -52,15 +52,3 @@ py_library(
         "//tensorflow/python/keras",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/kernel_methods/BUILD b/tensorflow/contrib/kernel_methods/BUILD
index eff7dfeb4c1117e40f4faf43c5e92a52cffd6528..87c2dcd89b63fa9f92d93c87abce91fd3460d44e 100644
--- a/tensorflow/contrib/kernel_methods/BUILD
+++ b/tensorflow/contrib/kernel_methods/BUILD
@@ -90,15 +90,3 @@ py_test(
         "//third_party/py/numpy",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/kernel_methods/python/losses.py b/tensorflow/contrib/kernel_methods/python/losses.py
index f182fef067b7f523bc5ca63227265be40528b171..4ef0a66a52429233c6e6f70667a451466493629c 100644
--- a/tensorflow/contrib/kernel_methods/python/losses.py
+++ b/tensorflow/contrib/kernel_methods/python/losses.py
@@ -43,10 +43,10 @@ def sparse_multiclass_hinge_loss(
 
   This is a generalization of standard (binary) hinge loss. For a given instance
   with correct label c*, the loss is given by:
-    loss = max_{c != c*} logits_c - logits_{c*} + 1.
+    $$loss = max_{c != c*} logits_c - logits_{c*} + 1.$$
   or equivalently
-    loss = max_c { logits_c - logits_{c*} + I_{c != c*} }
-  where I_{c != c*} = 1 if c != c* and 0 otherwise.
+    $$loss = max_c { logits_c - logits_{c*} + I_{c != c*} }$$
+  where \\(I_{c != c*} = 1\ \text{if}\ c != c*\\) and 0 otherwise.
 
   Args:
     labels: `Tensor` of shape [batch_size] or [batch_size, 1]. Corresponds to
diff --git a/tensorflow/contrib/kernel_methods/python/mappers/random_fourier_features.py b/tensorflow/contrib/kernel_methods/python/mappers/random_fourier_features.py
index 9dc01124ab195ae17b8795a11e4ebefe3f2c746b..9a721a9d440e66eb30bb94daf2b6878318f1e75f 100644
--- a/tensorflow/contrib/kernel_methods/python/mappers/random_fourier_features.py
+++ b/tensorflow/contrib/kernel_methods/python/mappers/random_fourier_features.py
@@ -34,33 +34,31 @@ class RandomFourierFeatureMapper(dkm.DenseKernelMapper):
   r"""Class that implements Random Fourier Feature Mapping (RFFM) in TensorFlow.
 
   The RFFM mapping is used to approximate the Gaussian (RBF) kernel:
-  ```
-  exp(-||x-y||_2^2 / (2 * sigma^2))
-  ```
+  $$(exp(-||x-y||_2^2 / (2 * \sigma^2))$$
 
   The implementation of RFFM is based on the following paper:
   "Random Features for Large-Scale Kernel Machines" by Ali Rahimi and Ben Recht.
   (link: https://people.eecs.berkeley.edu/~brecht/papers/07.rah.rec.nips.pdf)
 
-  The mapping uses a matrix `Omega \in R^{d x D}` and a bias vector `b \in R^D`
-  where `d` is the input dimension (number of dense input features) and `D` is
-  the output dimension (i.e., dimension of the feature space the input is mapped
-  to). Each entry of `Omega` is sampled i.i.d. from a (scaled) Gaussian
-  distribution and each entry of `b` is sampled independently and uniformly from
-  [0, 2 * pi].
-
-  For a single input feature vector x in R^d, its RFFM is defined as:
-  ```
-      sqrt(2/D) * cos(x * Omega + b)
-  ```
-  where `cos` is the element-wise cosine function and `x, b` are represented as
-  row vectors. The aforementioned paper shows that the linear kernel of
-  RFFM-mapped vectors approximates the Gaussian kernel of the initial vectors.
+  The mapping uses a matrix \\(\Omega \in R^{d x D}\\) and a bias vector
+  \\(b \in R^D\\) where \\(d\\) is the input dimension (number of dense input
+  features) and \\(D\\) is the output dimension (i.e., dimension of the feature
+  space the input is mapped to). Each entry of \\(\Omega\\) is sampled i.i.d.
+  from a (scaled) Gaussian distribution and each entry of \\(b\\) is sampled
+  independently and uniformly from [0, \\(2 * \pi\\)].
+
+  For a single input feature vector \\(x \in R^d\\), its RFFM is defined as:
+  $$\sqrt(2/D) * cos(x * \Omega + b)$$
+
+  where \\(cos\\) is the element-wise cosine function and \\(x, b\\) are
+  represented as row vectors. The aforementioned paper shows that the linear
+  kernel of RFFM-mapped vectors approximates the Gaussian kernel of the initial
+  vectors.
 
   """
 
   def __init__(self, input_dim, output_dim, stddev=1.0, seed=1, name=None):
-    """Constructs a RandomFourierFeatureMapper instance.
+    r"""Constructs a RandomFourierFeatureMapper instance.
 
     Args:
       input_dim: The dimension (number of features) of the tensors to be mapped.
@@ -68,11 +66,11 @@ class RandomFourierFeatureMapper(dkm.DenseKernelMapper):
       stddev: The standard deviation of the Gaussian kernel to be approximated.
         The error of the classifier trained using this approximation is very
         sensitive to this parameter.
-      seed: An integer used to initialize the parameters (`Omega` and `b`) of
-        the mapper. For repeatable sequences across different invocations of the
-        mapper object (for instance, to ensure consistent mapping both at
-        training and eval/inference if these happen in different invocations),
-        set this to the same integer.
+      seed: An integer used to initialize the parameters (\\(\Omega\\) and
+        \\(b\\)) of the mapper. For repeatable sequences across different
+        invocations of the mapper object (for instance, to ensure consistent
+        mapping both at training and eval/inference if these happen in
+        different invocations), set this to the same integer.
       name: name for the mapper object.
     """
     # TODO(sibyl-vie3Poto): Maybe infer input_dim and/or output_dim (if not explicitly
diff --git a/tensorflow/contrib/kernel_methods/python/mappers/random_fourier_features_test.py b/tensorflow/contrib/kernel_methods/python/mappers/random_fourier_features_test.py
index 6f4a264485993ab737723171409042b4a9673669..91929184a2e6f3cccae92cb819501a7c6ef81673 100644
--- a/tensorflow/contrib/kernel_methods/python/mappers/random_fourier_features_test.py
+++ b/tensorflow/contrib/kernel_methods/python/mappers/random_fourier_features_test.py
@@ -34,7 +34,7 @@ def _inner_product(x, y):
   """Inner product between tensors x and y.
 
   The input tensors are assumed to be in ROW representation, that is, the method
-  returns x * y^T.
+  returns \\(x * y^T\\).
 
   Args:
     x: input tensor in row format
diff --git a/tensorflow/contrib/kfac/BUILD b/tensorflow/contrib/kfac/BUILD
index 9a5759bf14f753bbc50d3ef8f54ceab7daf745ab..b719046b37ac761d56e8d5aa34772103be691cd6 100644
--- a/tensorflow/contrib/kfac/BUILD
+++ b/tensorflow/contrib/kfac/BUILD
@@ -24,15 +24,3 @@ py_library(
         "//tensorflow/python:util",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/kfac/examples/BUILD b/tensorflow/contrib/kfac/examples/BUILD
index 89965eda374b2b403f680fc77eb923d0e660d1e2..8186fa1c62cb952f86614a96c3965bcddae1686e 100644
--- a/tensorflow/contrib/kfac/examples/BUILD
+++ b/tensorflow/contrib/kfac/examples/BUILD
@@ -28,8 +28,28 @@ py_library(
 )
 
 py_binary(
-    name = "convnet_mnist_main",
-    srcs = ["convnet_mnist_main.py"],
+    name = "convnet_mnist_single_main",
+    srcs = ["convnet_mnist_single_main.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":convnet",
+        "//tensorflow:tensorflow_py",
+    ],
+)
+
+py_binary(
+    name = "convnet_mnist_multi_tower_main",
+    srcs = ["convnet_mnist_multi_tower_main.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":convnet",
+        "//tensorflow:tensorflow_py",
+    ],
+)
+
+py_binary(
+    name = "convnet_mnist_distributed_main",
+    srcs = ["convnet_mnist_distributed_main.py"],
     srcs_version = "PY2AND3",
     deps = [
         ":convnet",
@@ -58,15 +78,3 @@ py_library(
         "//third_party/py/numpy",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/kfac/examples/convnet.py b/tensorflow/contrib/kfac/examples/convnet.py
index 39d80addaac1fe855a37255b32bf4412b99df46a..e8e3353091df25e135b1247bf976bb9ce177d1a7 100644
--- a/tensorflow/contrib/kfac/examples/convnet.py
+++ b/tensorflow/contrib/kfac/examples/convnet.py
@@ -37,6 +37,8 @@ import tensorflow as tf
 
 from tensorflow.contrib.kfac.examples import mlp
 from tensorflow.contrib.kfac.examples import mnist
+from tensorflow.contrib.kfac.python.ops import optimizer as opt
+
 
 lc = tf.contrib.kfac.layer_collection
 oq = tf.contrib.kfac.op_queue
@@ -48,12 +50,18 @@ __all__ = [
     "linear_layer",
     "build_model",
     "minimize_loss_single_machine",
-    "minimize_loss_distributed",
+    "distributed_grads_only_and_ops_chief_worker",
+    "distributed_grads_and_ops_dedicated_workers",
     "train_mnist_single_machine",
-    "train_mnist_distributed",
+    "train_mnist_distributed_sync_replicas",
+    "train_mnist_multitower"
 ]
 
 
+# Inverse update ops will be run every _INVERT_EVRY iterations.
+_INVERT_EVERY = 10
+
+
 def conv_layer(layer_id, inputs, kernel_size, out_channels):
   """Builds a convolutional layer with ReLU non-linearity.
 
@@ -161,8 +169,9 @@ def build_model(examples, labels, num_labels, layer_collection):
   accuracy = tf.reduce_mean(
       tf.cast(tf.equal(labels, tf.argmax(logits, axis=1)), dtype=tf.float32))
 
-  tf.summary.scalar("loss", loss)
-  tf.summary.scalar("accuracy", accuracy)
+  with tf.device("/cpu:0"):
+    tf.summary.scalar("loss", loss)
+    tf.summary.scalar("accuracy", accuracy)
 
   # Register parameters. K-FAC needs to know about the inputs, outputs, and
   # parameters of each conv/fully connected layer and the logits powering the
@@ -181,41 +190,59 @@ def build_model(examples, labels, num_labels, layer_collection):
 def minimize_loss_single_machine(loss,
                                  accuracy,
                                  layer_collection,
+                                 device="/gpu:0",
                                  session_config=None):
   """Minimize loss with K-FAC on a single machine.
 
-  A single Session is responsible for running all of K-FAC's ops.
+  A single Session is responsible for running all of K-FAC's ops. The covariance
+  and inverse update ops are placed on `device`. All model variables are on CPU.
 
   Args:
     loss: 0-D Tensor. Loss to be minimized.
     accuracy: 0-D Tensor. Accuracy of classifier on current minibatch.
     layer_collection: LayerCollection instance describing model architecture.
       Used by K-FAC to construct preconditioner.
+    device: string, Either '/cpu:0' or '/gpu:0'. The covaraince and invserse
+      update ops are run on this device.
     session_config: None or tf.ConfigProto. Configuration for tf.Session().
 
   Returns:
     final value for 'accuracy'.
   """
   # Train with K-FAC.
-  global_step = tf.train.get_or_create_global_step()
+  g_step = tf.train.get_or_create_global_step()
   optimizer = opt.KfacOptimizer(
       learning_rate=0.0001,
       cov_ema_decay=0.95,
       damping=0.001,
       layer_collection=layer_collection,
+      placement_strategy="round_robin",
+      cov_devices=[device],
+      inv_devices=[device],
       momentum=0.9)
-  train_op = optimizer.minimize(loss, global_step=global_step)
+  (cov_update_thunks,
+   inv_update_thunks) = optimizer.make_vars_and_create_op_thunks()
+
+  with tf.device(device):
+    train_op = optimizer.minimize(loss, global_step=g_step)
+
+  def make_update_op(update_thunks):
+    update_op = [thunk() for thunk in update_thunks]
+    return tf.group(*update_op)
+
+  cov_update_op = make_update_op(cov_update_thunks)
+  with tf.control_dependencies([train_op, cov_update_op]):
+    inverse_op = tf.cond(
+        tf.equal(tf.mod(g_step + 1, _INVERT_EVERY), 0),
+        lambda: make_update_op(inv_update_thunks), tf.no_op)
 
   tf.logging.info("Starting training.")
   with tf.train.MonitoredTrainingSession(config=session_config) as sess:
     while not sess.should_stop():
-      global_step_, loss_, accuracy_, _, _ = sess.run(
-          [global_step, loss, accuracy, train_op, optimizer.cov_update_op])
-
-      if global_step_ % 100 == 0:
-        sess.run(optimizer.inv_update_op)
+      global_step_, loss_, accuracy_, _ = sess.run(
+          [g_step, loss, accuracy, inverse_op])
 
-      if global_step_ % 100 == 0:
+      if (global_step_ + 1) % _INVERT_EVERY == 0:
         tf.logging.info("global_step: %d | loss: %f | accuracy: %s",
                         global_step_, loss_, accuracy_)
 
@@ -250,16 +277,62 @@ def _num_gradient_tasks(num_tasks):
   return int(np.ceil(0.6 * num_tasks))
 
 
-def minimize_loss_distributed(task_id, num_worker_tasks, num_ps_tasks, master,
-                              checkpoint_dir, loss, accuracy, layer_collection):
-  """Minimize loss with an synchronous implementation of K-FAC.
+def _make_distributed_train_op(
+    task_id,
+    num_worker_tasks,
+    num_ps_tasks,
+    layer_collection
+):
+  """Creates optimizer and distributed training op.
 
-  Different tasks are responsible for different parts of K-FAC's Ops. The first
-  60% of tasks update weights; the next 20% accumulate covariance statistics;
-  the last 20% invert the matrices used to precondition gradients.
+  Constructs KFAC optimizer and wraps it in `sync_replicas` optimizer. Makes
+  the train op.
+
+  Args:
+   task_id: int. Integer in [0, num_worker_tasks). ID for this worker.
+    num_worker_tasks: int. Number of workers in this distributed training setup.
+    num_ps_tasks: int. Number of parameter servers holding variables. If 0,
+      parameter servers are not used.
+    layer_collection: LayerCollection instance describing model architecture.
+      Used by K-FAC to construct preconditioner.
+
+  Returns:
+    sync_optimizer: `tf.train.SyncReplicasOptimizer` instance which wraps KFAC
+      optimizer.
+    optimizer: Instance of `opt.KfacOptimizer`.
+    global_step: `tensor`, Global step.
+  """
+  tf.logging.info("Task id : %d", task_id)
+  with tf.device(tf.train.replica_device_setter(num_ps_tasks)):
+    global_step = tf.train.get_or_create_global_step()
+    optimizer = opt.KfacOptimizer(
+        learning_rate=0.0001,
+        cov_ema_decay=0.95,
+        damping=0.001,
+        layer_collection=layer_collection,
+        momentum=0.9)
+    sync_optimizer = tf.train.SyncReplicasOptimizer(
+        opt=optimizer,
+        replicas_to_aggregate=_num_gradient_tasks(num_worker_tasks),
+        total_num_replicas=num_worker_tasks)
+    return sync_optimizer, optimizer, global_step
+
+
+def distributed_grads_only_and_ops_chief_worker(
+    task_id, is_chief, num_worker_tasks, num_ps_tasks, master, checkpoint_dir,
+    loss, accuracy, layer_collection, invert_every=10):
+  """Minimize loss with a synchronous implementation of K-FAC.
+
+  All workers perform gradient computation. Chief worker applies gradient after
+  averaging the gradients obtained from all the workers. All workers block
+  execution untill the update is applied. Chief worker runs covariance and
+  inverse update ops. Covariance and inverse matrices are placed on parameter
+  servers in a round robin manner. For further details on synchronous
+  distributed optimization check `tf.train.SyncReplicasOptimizer`.
 
   Args:
     task_id: int. Integer in [0, num_worker_tasks). ID for this worker.
+    is_chief: `boolean`, `True` if the worker is chief worker.
     num_worker_tasks: int. Number of workers in this distributed training setup.
     num_ps_tasks: int. Number of parameter servers holding variables. If 0,
       parameter servers are not used.
@@ -271,6 +344,7 @@ def minimize_loss_distributed(task_id, num_worker_tasks, num_ps_tasks, master,
       run with each step.
     layer_collection: LayerCollection instance describing model architecture.
       Used by K-FAC to construct preconditioner.
+    invert_every: `int`, Number of steps between update the inverse.
 
   Returns:
     final value for 'accuracy'.
@@ -278,19 +352,80 @@ def minimize_loss_distributed(task_id, num_worker_tasks, num_ps_tasks, master,
   Raises:
     ValueError: if task_id >= num_worker_tasks.
   """
-  with tf.device(tf.train.replica_device_setter(num_ps_tasks)):
-    global_step = tf.train.get_or_create_global_step()
-    optimizer = opt.KfacOptimizer(
-        learning_rate=0.0001,
-        cov_ema_decay=0.95,
-        damping=0.001,
-        layer_collection=layer_collection,
-        momentum=0.9)
-    inv_update_queue = oq.OpQueue(optimizer.inv_update_ops)
-    sync_optimizer = tf.train.SyncReplicasOptimizer(
-        opt=optimizer,
-        replicas_to_aggregate=_num_gradient_tasks(num_worker_tasks))
-    train_op = sync_optimizer.minimize(loss, global_step=global_step)
+
+  sync_optimizer, optimizer, global_step = _make_distributed_train_op(
+      task_id, num_worker_tasks, num_ps_tasks, layer_collection)
+  (cov_update_thunks,
+   inv_update_thunks) = optimizer.make_vars_and_create_op_thunks()
+  train_op = sync_optimizer.minimize(loss, global_step=global_step)
+
+  tf.logging.info("Starting training.")
+  hooks = [sync_optimizer.make_session_run_hook(is_chief)]
+
+  def make_update_op(update_thunks):
+    update_op = [thunk() for thunk in update_thunks]
+    return tf.group(*update_op)
+
+  if is_chief:
+    cov_update_op = make_update_op(cov_update_thunks)
+    with tf.control_dependencies([train_op, cov_update_op]):
+      update_op = tf.cond(
+          tf.equal(tf.mod(global_step + 1, invert_every), 0),
+          lambda: make_update_op(inv_update_thunks),
+          tf.no_op)
+  else:
+    update_op = train_op
+
+  with tf.train.MonitoredTrainingSession(
+      master=master,
+      is_chief=is_chief,
+      checkpoint_dir=checkpoint_dir,
+      hooks=hooks,
+      stop_grace_period_secs=0) as sess:
+    while not sess.should_stop():
+      global_step_, loss_, accuracy_, _ = sess.run(
+          [global_step, loss, accuracy, update_op])
+      tf.logging.info("global_step: %d | loss: %f | accuracy: %s", global_step_,
+                      loss_, accuracy_)
+  return accuracy_
+
+
+def distributed_grads_and_ops_dedicated_workers(
+    task_id, is_chief, num_worker_tasks, num_ps_tasks, master, checkpoint_dir,
+    loss, accuracy, layer_collection):
+  """Minimize loss with a synchronous implementation of K-FAC.
+
+  Different workers are responsible for different parts of K-FAC's Ops. The
+  first 60% of tasks compute gradients; the next 20% accumulate covariance
+  statistics; the last 20% invert the matrices used to precondition gradients.
+  The chief worker applies the gradient .
+
+  Args:
+    task_id: int. Integer in [0, num_worker_tasks). ID for this worker.
+    is_chief: `boolean`, `True` if the worker is chief worker.
+    num_worker_tasks: int. Number of workers in this distributed training setup.
+    num_ps_tasks: int. Number of parameter servers holding variables. If 0,
+      parameter servers are not used.
+    master: string. IP and port of TensorFlow runtime process. Set to empty
+      string to run locally.
+    checkpoint_dir: string or None. Path to store checkpoints under.
+    loss: 0-D Tensor. Loss to be minimized.
+    accuracy: dict mapping strings to 0-D Tensors. Additional accuracy to
+      run with each step.
+    layer_collection: LayerCollection instance describing model architecture.
+      Used by K-FAC to construct preconditioner.
+
+  Returns:
+    final value for 'accuracy'.
+
+  Raises:
+    ValueError: if task_id >= num_worker_tasks.
+  """
+  sync_optimizer, optimizer, global_step = _make_distributed_train_op(
+      task_id, num_worker_tasks, num_ps_tasks, layer_collection)
+  _, cov_update_op, inv_update_ops, _, _, _ = optimizer.make_ops_and_vars()
+  train_op = sync_optimizer.minimize(loss, global_step=global_step)
+  inv_update_queue = oq.OpQueue(inv_update_ops)
 
   tf.logging.info("Starting training.")
   is_chief = (task_id == 0)
@@ -306,7 +441,7 @@ def minimize_loss_distributed(task_id, num_worker_tasks, num_ps_tasks, master,
       if _is_gradient_task(task_id, num_worker_tasks):
         learning_op = train_op
       elif _is_cov_update_task(task_id, num_worker_tasks):
-        learning_op = optimizer.cov_update_op
+        learning_op = cov_update_op
       elif _is_inv_update_task(task_id, num_worker_tasks):
         # TODO(duckworthd): Running this op before cov_update_op has been run a
         # few times can result in "InvalidArgumentError: Cholesky decomposition
@@ -324,13 +459,18 @@ def minimize_loss_distributed(task_id, num_worker_tasks, num_ps_tasks, master,
   return accuracy_
 
 
-def train_mnist_single_machine(data_dir, num_epochs, use_fake_data=False):
+def train_mnist_single_machine(data_dir,
+                               num_epochs,
+                               use_fake_data=False,
+                               device="/gpu:0"):
   """Train a ConvNet on MNIST.
 
   Args:
     data_dir: string. Directory to read MNIST examples from.
     num_epochs: int. Number of passes to make over the training set.
     use_fake_data: bool. If True, generate a synthetic dataset.
+    device: string, Either '/cpu:0' or '/gpu:0'. The covaraince and inverse
+      update ops are run on this device.
 
   Returns:
     accuracy of model on the final minibatch of training data.
@@ -350,22 +490,38 @@ def train_mnist_single_machine(data_dir, num_epochs, use_fake_data=False):
       examples, labels, num_labels=10, layer_collection=layer_collection)
 
   # Fit model.
-  return minimize_loss_single_machine(loss, accuracy, layer_collection)
+  return minimize_loss_single_machine(
+      loss, accuracy, layer_collection, device=device)
 
 
 def train_mnist_multitower(data_dir, num_epochs, num_towers,
-                           use_fake_data=True):
+                           use_fake_data=True, devices=None):
   """Train a ConvNet on MNIST.
 
+  Training data is split equally among the towers. Each tower computes loss on
+  its own batch of data and the loss is aggregated on the CPU. The model
+  variables are placed on first tower. The covariance and inverse update ops
+  and variables are placed on GPUs in a round robin manner.
+
   Args:
     data_dir: string. Directory to read MNIST examples from.
     num_epochs: int. Number of passes to make over the training set.
     num_towers: int. Number of CPUs to split inference across.
     use_fake_data: bool. If True, generate a synthetic dataset.
+    devices: string, Either list of CPU or GPU. The covaraince and inverse
+      update ops are run on this device.
 
   Returns:
     accuracy of model on the final minibatch of training data.
   """
+  if devices:
+    device_count = {"GPU": num_towers}
+  else:
+    device_count = {"CPU": num_towers}
+
+  devices = devices or [
+      "/cpu:{}".format(tower_id) for tower_id in range(num_towers)
+  ]
   # Load a dataset.
   tf.logging.info("Loading MNIST into memory.")
   tower_batch_size = 128
@@ -388,7 +544,7 @@ def train_mnist_multitower(data_dir, num_epochs, num_towers,
   layer_collection = lc.LayerCollection()
   tower_results = []
   for tower_id in range(num_towers):
-    with tf.device("/cpu:%d" % tower_id):
+    with tf.device(devices[tower_id]):
       with tf.name_scope("tower%d" % tower_id):
         with tf.variable_scope(tf.get_variable_scope(), reuse=(tower_id > 0)):
           tf.logging.info("Building tower %d." % tower_id)
@@ -402,34 +558,79 @@ def train_mnist_multitower(data_dir, num_epochs, num_towers,
   accuracy = tf.reduce_mean(accuracies)
 
   # Fit model.
+
   session_config = tf.ConfigProto(
-      allow_soft_placement=False, device_count={
-          "CPU": num_towers
-      })
-  return minimize_loss_single_machine(
-      loss, accuracy, layer_collection, session_config=session_config)
+      allow_soft_placement=False,
+      device_count=device_count,
+  )
+
+  g_step = tf.train.get_or_create_global_step()
+  optimizer = opt.KfacOptimizer(
+      learning_rate=0.0001,
+      cov_ema_decay=0.95,
+      damping=0.001,
+      layer_collection=layer_collection,
+      placement_strategy="round_robin",
+      cov_devices=devices,
+      inv_devices=devices,
+      momentum=0.9)
+  (cov_update_thunks,
+   inv_update_thunks) = optimizer.make_vars_and_create_op_thunks()
 
+  train_op = optimizer.minimize(loss, global_step=g_step)
 
-def train_mnist_distributed(task_id,
-                            num_worker_tasks,
-                            num_ps_tasks,
-                            master,
-                            data_dir,
-                            num_epochs,
-                            use_fake_data=False):
-  """Train a ConvNet on MNIST.
+  def make_update_op(update_thunks):
+    update_op = [thunk() for thunk in update_thunks]
+    return tf.group(*update_op)
+
+  cov_update_op = make_update_op(cov_update_thunks)
+  with tf.control_dependencies([train_op, cov_update_op]):
+    inverse_op = tf.cond(
+        tf.equal(tf.mod(g_step + 1, _INVERT_EVERY), 0),
+        lambda: make_update_op(inv_update_thunks), tf.no_op)
+
+  tf.logging.info("Starting training.")
+  with tf.train.MonitoredTrainingSession(config=session_config) as sess:
+    while not sess.should_stop():
+      global_step_, loss_, accuracy_, _ = sess.run(
+          [g_step, loss, accuracy, inverse_op])
+
+      if (global_step_ + 1) % _INVERT_EVERY == 0:
+        tf.logging.info("global_step: %d | loss: %f | accuracy: %s",
+                        global_step_, loss_, accuracy_)
+
+
+def train_mnist_distributed_sync_replicas(task_id,
+                                          is_chief,
+                                          num_worker_tasks,
+                                          num_ps_tasks,
+                                          master,
+                                          data_dir,
+                                          num_epochs,
+                                          op_strategy,
+                                          use_fake_data=False):
+  """Train a ConvNet on MNIST using Sync replicas optimizer.
 
   Args:
     task_id: int. Integer in [0, num_worker_tasks). ID for this worker.
+    is_chief: `boolean`, `True` if the worker is chief worker.
     num_worker_tasks: int. Number of workers in this distributed training setup.
     num_ps_tasks: int. Number of parameter servers holding variables.
     master: string. IP and port of TensorFlow runtime process.
     data_dir: string. Directory to read MNIST examples from.
     num_epochs: int. Number of passes to make over the training set.
+    op_strategy: `string`, Strategy to run the covariance and inverse
+      ops. If op_strategy == `chief_worker` then covaraiance and inverse
+      update ops are run on chief worker otherwise they are run on dedicated
+      workers.
+
     use_fake_data: bool. If True, generate a synthetic dataset.
 
   Returns:
     accuracy of model on the final minibatch of training data.
+
+  Raises:
+    ValueError: If `op_strategy` not in ["chief_worker", "dedicated_workers"].
   """
   # Load a dataset.
   tf.logging.info("Loading MNIST into memory.")
@@ -448,9 +649,17 @@ def train_mnist_distributed(task_id,
 
   # Fit model.
   checkpoint_dir = None if data_dir is None else os.path.join(data_dir, "kfac")
-  return minimize_loss_distributed(task_id, num_worker_tasks, num_ps_tasks,
-                                   master, checkpoint_dir, loss, accuracy,
-                                   layer_collection)
+  if op_strategy == "chief_worker":
+    return distributed_grads_only_and_ops_chief_worker(
+        task_id, is_chief, num_worker_tasks, num_ps_tasks, master,
+        checkpoint_dir, loss, accuracy, layer_collection)
+  elif op_strategy == "dedicated_workers":
+    return distributed_grads_and_ops_dedicated_workers(
+        task_id, is_chief, num_worker_tasks, num_ps_tasks, master,
+        checkpoint_dir, loss, accuracy, layer_collection)
+  else:
+    raise ValueError("Only supported op strategies are : {}, {}".format(
+        "chief_worker", "dedicated_workers"))
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/kfac/examples/convnet_mnist_distributed_main.py b/tensorflow/contrib/kfac/examples/convnet_mnist_distributed_main.py
new file mode 100644
index 0000000000000000000000000000000000000000..b4c2d4a9e9bfcc4bfb55a25d2f23e66afe5b1375
--- /dev/null
+++ b/tensorflow/contrib/kfac/examples/convnet_mnist_distributed_main.py
@@ -0,0 +1,62 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+r"""Train a ConvNet on MNIST using K-FAC.
+
+Distributed training with sync replicas optimizer. See
+`convnet.train_mnist_distributed_sync_replicas` for details.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+
+from absl import flags
+import tensorflow as tf
+
+from tensorflow.contrib.kfac.examples import convnet
+
+FLAGS = flags.FLAGS
+flags.DEFINE_integer("task", -1, "Task identifier")
+flags.DEFINE_string("data_dir", "/tmp/mnist", "local mnist dir")
+flags.DEFINE_string(
+    "cov_inv_op_strategy", "chief_worker",
+    "In dist training mode run the cov, inv ops on chief or dedicated workers."
+)
+flags.DEFINE_string("master", "local", "Session master.")
+flags.DEFINE_integer("ps_tasks", 2,
+                     "Number of tasks in the parameter server job.")
+flags.DEFINE_integer("replicas_to_aggregate", 5,
+                     "Number of replicas to aggregate.")
+flags.DEFINE_integer("worker_replicas", 5, "Number of replicas in worker job.")
+flags.DEFINE_integer("num_epochs", None, "Number of epochs.")
+
+
+def _is_chief():
+  """Determines whether a job is the chief worker."""
+  if "chief_worker" in FLAGS.brain_jobs:
+    return FLAGS.brain_job_name == "chief_worker"
+  else:
+    return FLAGS.task == 0
+
+
+def main(unused_argv):
+  _ = unused_argv
+  convnet.train_mnist_distributed_sync_replicas(
+      FLAGS.task, _is_chief(), FLAGS.worker_replicas, FLAGS.ps_tasks,
+      FLAGS.master, FLAGS.data_dir, FLAGS.num_epochs, FLAGS.cov_inv_op_strategy)
+
+if __name__ == "__main__":
+  tf.app.run(main=main)
diff --git a/tensorflow/contrib/kfac/examples/convnet_mnist_multi_tower_main.py b/tensorflow/contrib/kfac/examples/convnet_mnist_multi_tower_main.py
new file mode 100644
index 0000000000000000000000000000000000000000..4249bf8a8d9d3a5beb87d4140a55b0ee6eadbc64
--- /dev/null
+++ b/tensorflow/contrib/kfac/examples/convnet_mnist_multi_tower_main.py
@@ -0,0 +1,48 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+r"""Train a ConvNet on MNIST using K-FAC.
+
+Multi tower training mode. See `convnet.train_mnist_multitower` for details.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+
+from absl import flags
+import tensorflow as tf
+
+from tensorflow.contrib.kfac.examples import convnet
+
+FLAGS = flags.FLAGS
+flags.DEFINE_string("data_dir", "/tmp/multitower_1/mnist", "local mnist dir")
+flags.DEFINE_integer("num_towers", 2,
+                     "Number of towers for multi tower training.")
+
+
+def main(unused_argv):
+  _ = unused_argv
+  assert FLAGS.num_towers > 1
+  devices = ["/gpu:{}".format(tower_id) for tower_id in range(FLAGS.num_towers)]
+  convnet.train_mnist_multitower(
+      FLAGS.data_dir,
+      num_epochs=200,
+      num_towers=FLAGS.num_towers,
+      devices=devices)
+
+
+if __name__ == "__main__":
+  tf.app.run(main=main)
diff --git a/tensorflow/contrib/kfac/examples/convnet_mnist_main.py b/tensorflow/contrib/kfac/examples/convnet_mnist_single_main.py
similarity index 57%
rename from tensorflow/contrib/kfac/examples/convnet_mnist_main.py
rename to tensorflow/contrib/kfac/examples/convnet_mnist_single_main.py
index b0c6fbde198850c76af0bc1600dc23e926227229..3aa52aff196fd2699559f80b0c226f470c94b2a3 100644
--- a/tensorflow/contrib/kfac/examples/convnet_mnist_main.py
+++ b/tensorflow/contrib/kfac/examples/convnet_mnist_single_main.py
@@ -14,44 +14,26 @@
 # ==============================================================================
 r"""Train a ConvNet on MNIST using K-FAC.
 
-See convnet.py for details.
+Train on single machine. See `convnet.train_mnist_single_machine` for details.
 """
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import argparse
-import sys
 
+from absl import flags
 import tensorflow as tf
 
 from tensorflow.contrib.kfac.examples import convnet
 
-FLAGS = None
+FLAGS = flags.FLAGS
+flags.DEFINE_string("data_dir", "/tmp/mnist", "local mnist dir")
 
 
-def main(argv):
-  _ = argv
-
-  if FLAGS.num_towers > 1:
-    convnet.train_mnist_multitower(
-        FLAGS.data_dir, num_epochs=200, num_towers=FLAGS.num_towers)
-  else:
-    convnet.train_mnist_single_machine(FLAGS.data_dir, num_epochs=200)
+def main(unused_argv):
+  convnet.train_mnist_single_gpu(FLAGS.data_dir, num_epochs=200)
 
 
 if __name__ == "__main__":
-  parser = argparse.ArgumentParser()
-  parser.add_argument(
-      "--data_dir",
-      type=str,
-      default="/tmp/mnist",
-      help="Directory to store dataset in.")
-  parser.add_argument(
-      "--num_towers",
-      type=int,
-      default=1,
-      help="Number of CPUs to split minibatch across.")
-  FLAGS, unparsed = parser.parse_known_args()
-  tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
+  tf.app.run(main=main)
diff --git a/tensorflow/contrib/kfac/examples/tests/BUILD b/tensorflow/contrib/kfac/examples/tests/BUILD
index ce7da95c124beaed4773d68ce0d0c41f187f7c9d..ede7f183fe24f26bd86e232e831dea5f8ea1fdc4 100644
--- a/tensorflow/contrib/kfac/examples/tests/BUILD
+++ b/tensorflow/contrib/kfac/examples/tests/BUILD
@@ -50,15 +50,3 @@ py_test(
         "//third_party/py/numpy",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/kfac/examples/tests/convnet_test.py b/tensorflow/contrib/kfac/examples/tests/convnet_test.py
index 8d86c2bb5150cd4bc8a2b21ba050e904929e0fe9..6de775cc79953ba548c766e861d6d88e0455a508 100644
--- a/tensorflow/contrib/kfac/examples/tests/convnet_test.py
+++ b/tensorflow/contrib/kfac/examples/tests/convnet_test.py
@@ -112,15 +112,16 @@ class ConvNetTest(tf.test.TestCase):
   def testMinimizeLossSingleMachine(self):
     with tf.Graph().as_default():
       loss, accuracy, layer_collection = self._build_toy_problem()
-      accuracy_ = convnet.minimize_loss_single_machine(loss, accuracy,
-                                                       layer_collection)
-      self.assertLess(accuracy_, 1.0)
+      accuracy_ = convnet.minimize_loss_single_machine(
+          loss, accuracy, layer_collection, device="/cpu:0")
+      self.assertLess(accuracy_, 2.0)
 
   def testMinimizeLossDistributed(self):
     with tf.Graph().as_default():
       loss, accuracy, layer_collection = self._build_toy_problem()
-      accuracy_ = convnet.minimize_loss_distributed(
+      accuracy_ = convnet.distributed_grads_only_and_ops_chief_worker(
           task_id=0,
+          is_chief=True,
           num_worker_tasks=1,
           num_ps_tasks=0,
           master="",
@@ -128,7 +129,7 @@ class ConvNetTest(tf.test.TestCase):
           loss=loss,
           accuracy=accuracy,
           layer_collection=layer_collection)
-      self.assertLess(accuracy_, 1.0)
+      self.assertLess(accuracy_, 2.0)
 
   def testTrainMnistSingleMachine(self):
     with tf.Graph().as_default():
@@ -138,7 +139,7 @@ class ConvNetTest(tf.test.TestCase):
       # but there are too few parameters for the model to effectively memorize
       # the training set the way an MLP can.
       convnet.train_mnist_single_machine(
-          data_dir=None, num_epochs=1, use_fake_data=True)
+          data_dir=None, num_epochs=1, use_fake_data=True, device="/cpu:0")
 
   def testTrainMnistMultitower(self):
     with tf.Graph().as_default():
@@ -149,13 +150,15 @@ class ConvNetTest(tf.test.TestCase):
   def testTrainMnistDistributed(self):
     with tf.Graph().as_default():
       # Ensure model training doesn't crash.
-      convnet.train_mnist_distributed(
+      convnet.train_mnist_distributed_sync_replicas(
           task_id=0,
+          is_chief=True,
           num_worker_tasks=1,
           num_ps_tasks=0,
           master="",
           data_dir=None,
           num_epochs=1,
+          op_strategy="chief_worker",
           use_fake_data=True)
 
 
diff --git a/tensorflow/contrib/kfac/python/kernel_tests/BUILD b/tensorflow/contrib/kfac/python/kernel_tests/BUILD
index 146ae8b7e2a3b2b479d5b8db7b8bffaca59a358f..2477d2bfc12c2df64a672fd457e9634009ccd129 100644
--- a/tensorflow/contrib/kfac/python/kernel_tests/BUILD
+++ b/tensorflow/contrib/kfac/python/kernel_tests/BUILD
@@ -114,6 +114,7 @@ py_test(
     name = "utils_test",
     srcs = ["utils_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["no_windows"],  # TODO: needs investigation on Windows
     deps = [
         "//tensorflow/contrib/kfac/python/ops:utils",
         "//tensorflow/contrib/tpu",
@@ -155,15 +156,3 @@ py_test(
         "//third_party/py/numpy",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/kfac/python/ops/BUILD b/tensorflow/contrib/kfac/python/ops/BUILD
index d721ad08afaa416f86ce881d4cdd968cd1809b5a..b897fd68a080e819042cd36f2a1acfcf175e656b 100644
--- a/tensorflow/contrib/kfac/python/ops/BUILD
+++ b/tensorflow/contrib/kfac/python/ops/BUILD
@@ -244,15 +244,3 @@ py_library(
         "//tensorflow/python:util",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/kfac/python/ops/estimator.py b/tensorflow/contrib/kfac/python/ops/estimator.py
index ced1110676754b6c8bba813ace743b3f3daddb26..d11c9c828810742cd176e4c5b7b77cf9a5cf87d9 100644
--- a/tensorflow/contrib/kfac/python/ops/estimator.py
+++ b/tensorflow/contrib/kfac/python/ops/estimator.py
@@ -85,9 +85,9 @@ class FisherEstimator(object):
     """Create a FisherEstimator object.
 
     Args:
-      variables: A list of the variables for which to estimate the Fisher. This
-          must match the variables registered in layer_collection (if it is not
-          None).
+      variables: A `list` of variables or `callable` which returns the variables
+          for which to estimate the Fisher. This must match the variables
+          registered in layer_collection (if it is not None).
       cov_ema_decay: The decay factor used when calculating the covariance
           estimate moving averages.
       damping: float. The damping factor used to stabilize training due to
@@ -147,7 +147,10 @@ class FisherEstimator(object):
 
   @property
   def variables(self):
-    return self._variables
+    if callable(self._variables):
+      return self._variables()
+    else:
+      return self._variables
 
   @property
   def damping(self):
diff --git a/tensorflow/contrib/kfac/python/ops/fisher_blocks.py b/tensorflow/contrib/kfac/python/ops/fisher_blocks.py
index b04bf76a886049e876a8dde647dc7b718d03da9d..00b3673a742e92057b0a1673d3f42a19379111fe 100644
--- a/tensorflow/contrib/kfac/python/ops/fisher_blocks.py
+++ b/tensorflow/contrib/kfac/python/ops/fisher_blocks.py
@@ -19,11 +19,11 @@ Information matrix. Suppose one has a model that parameterizes a posterior
 distribution over 'y' given 'x' with parameters 'params', p(y | x, params). Its
 Fisher Information matrix is given by,
 
-  F(params) = E[ v(x, y, params) v(x, y, params)^T ]
+  $$F(params) = E[ v(x, y, params) v(x, y, params)^T ]$$
 
 where,
 
-  v(x, y, params) = (d / d params) log p(y | x, params)
+  $$v(x, y, params) = (d / d params) log p(y | x, params)$$
 
 and the expectation is taken with respect to the data's distribution for 'x' and
 the model's posterior distribution for 'y',
@@ -85,7 +85,7 @@ def normalize_damping(damping, num_replications):
 def compute_pi_tracenorm(left_cov, right_cov):
   """Computes the scalar constant pi for Tikhonov regularization/damping.
 
-  pi = sqrt( (trace(A) / dim(A)) / (trace(B) / dim(B)) )
+  $$\pi = \sqrt{ (trace(A) / dim(A)) / (trace(B) / dim(B)) }$$
   See section 6.3 of https://arxiv.org/pdf/1503.05671.pdf for details.
 
   Args:
@@ -462,14 +462,14 @@ class FullyConnectedDiagonalFB(InputOutputMultiTower, FisherBlock):
   Let 'params' be a vector parameterizing a model and 'i' an arbitrary index
   into it. We are interested in Fisher(params)[i, i]. This is,
 
-    Fisher(params)[i, i] = E[ v(x, y, params) v(x, y, params)^T ][i, i]
-                         = E[ v(x, y, params)[i] ^ 2 ]
+    $$Fisher(params)[i, i] = E[ v(x, y, params) v(x, y, params)^T ][i, i]
+                         = E[ v(x, y, params)[i] ^ 2 ]$$
 
   Consider fully connected layer in this model with (unshared) weight matrix
   'w'. For an example 'x' that produces layer inputs 'a' and output
   preactivations 's',
 
-    v(x, y, w) = vec( a (d loss / d s)^T )
+    $$v(x, y, w) = vec( a (d loss / d s)^T )$$
 
   This FisherBlock tracks Fisher(params)[i, i] for all indices 'i' corresponding
   to the layer's parameters 'w'.
@@ -532,14 +532,14 @@ class ConvDiagonalFB(InputOutputMultiTower, FisherBlock):
   Let 'params' be a vector parameterizing a model and 'i' an arbitrary index
   into it. We are interested in Fisher(params)[i, i]. This is,
 
-    Fisher(params)[i, i] = E[ v(x, y, params) v(x, y, params)^T ][i, i]
-                         = E[ v(x, y, params)[i] ^ 2 ]
+    $$Fisher(params)[i, i] = E[ v(x, y, params) v(x, y, params)^T ][i, i]
+                         = E[ v(x, y, params)[i] ^ 2 ]$$
 
   Consider a convoluational layer in this model with (unshared) filter matrix
   'w'. For an example image 'x' that produces layer inputs 'a' and output
   preactivations 's',
 
-    v(x, y, w) = vec( sum_{loc} a_{loc} (d loss / d s_{loc})^T )
+    $$v(x, y, w) = vec( sum_{loc} a_{loc} (d loss / d s_{loc})^T )$$
 
   where 'loc' is a single (x, y) location in an image.
 
@@ -805,12 +805,12 @@ class ConvKFCBasicFB(InputOutputMultiTower, KroneckerProductFB):
   'w'. For a minibatch that produces inputs 'a' and output preactivations 's',
   this FisherBlock estimates,
 
-    F(w) = #locations * kronecker(E[flat(a) flat(a)^T],
-                                  E[flat(ds) flat(ds)^T])
+    $$F(w) = \#locations * kronecker(E[flat(a) flat(a)^T],
+                                  E[flat(ds) flat(ds)^T])$$
 
   where
 
-    ds = (d / ds) log p(y | x, w)
+    $$ds = (d / ds) log p(y | x, w)$$
     #locations = number of (x, y) locations where 'w' is applied.
 
   where the expectation is taken over all examples and locations and flat()
@@ -861,12 +861,12 @@ class ConvKFCBasicFB(InputOutputMultiTower, KroneckerProductFB):
     super(ConvKFCBasicFB, self).__init__(layer_collection)
 
   def instantiate_factors(self, grads_list, damping):
+    inputs, grads_list = self._process_data(grads_list)
+
     # Infer number of locations upon which convolution is applied.
-    self._num_locations = num_conv_locations(self._inputs[0].shape.as_list(),
+    self._num_locations = num_conv_locations(inputs[0].shape.as_list(),
                                              self._strides)
 
-    inputs, grads_list = self._process_data(grads_list)
-
     self._input_factor = self._layer_collection.make_or_get_factor(
         fisher_factors.ConvInputKroneckerFactor,
         (inputs, self._filter_shape, self._padding, self._strides,
@@ -1391,7 +1391,7 @@ class ConvKFCBasicMultiIndepFB(InputOutputMultiTowerMultiUse,
     inputs, grads_list = self._process_data(grads_list)
 
     # Infer number of locations upon which convolution is applied.
-    self._num_locations = num_conv_locations(inputs.shape.as_list(),
+    self._num_locations = num_conv_locations(inputs[0].shape.as_list(),
                                              self._strides)
 
     self._input_factor = self._layer_collection.make_or_get_factor(
@@ -1567,7 +1567,7 @@ class FullyConnectedSeriesFB(InputOutputMultiTowerMultiUse,
 
     if self._option == SeriesFBApproximation.option1:
 
-      # Note that L_A = A0^(-1/2) * U_A and L_G = G0^(-1/2) * U_G.
+      # Note that \\(L_A = A0^{-1/2} * U_A and L_G = G0^{-1/2} * U_G.\\)
       L_A, psi_A = self._input_factor.get_option1quants(
           self._input_damping_func)
       L_G, psi_G = self._output_factor.get_option1quants(
@@ -1581,33 +1581,33 @@ class FullyConnectedSeriesFB(InputOutputMultiTowerMultiUse,
         T = self._num_timesteps
         return (1 - x)**2 / (T * (1 - x**2) - 2 * x * (1 - x**T))
 
-      # Y = gamma( psi_G*psi_A^T ) (computed element-wise)
+      # \\(Y = \gamma( psi_G*psi_A^T )\\) (computed element-wise)
       # Even though Y is Z-independent we are recomputing it from the psi's
       # each since Y depends on both A and G quantities, and it is relatively
       # cheap to compute.
       Y = gamma(array_ops.reshape(psi_G, [int(psi_G.shape[0]), -1]) * psi_A)
 
-      # Z = L_G^T * Z * L_A
+      # \\(Z = L_G^T * Z * L_A\\)
       # This is equivalent to the following computation from the original
       # pseudo-code:
-      # Z = G0^(-1/2) * Z * A0^(-1/2)
-      # Z = U_G^T * Z * U_A
+      # \\(Z = G0^{-1/2} * Z * A0^{-1/2}\\)
+      # \\(Z = U_G^T * Z * U_A\\)
       Z = math_ops.matmul(L_G, math_ops.matmul(Z, L_A), transpose_a=True)
 
-      # Z = Z .* Y
+      # \\(Z = Z .* Y\\)
       Z *= Y
 
-      # Z = L_G * Z * L_A^T
+      # \\(Z = L_G * Z * L_A^T\\)
       # This is equivalent to the following computation from the original
       # pseudo-code:
-      # Z = U_G * Z * U_A^T
-      # Z = G0^(-1/2) * Z * A0^(-1/2)
+      # \\(Z = U_G * Z * U_A^T\\)
+      # \\(Z = G0^{-1/2} * Z * A0^{-1/2}\\)
       Z = math_ops.matmul(L_G, math_ops.matmul(Z, L_A, transpose_b=True))
 
     elif self._option == SeriesFBApproximation.option2:
 
-      # Note that P_A = A_1^T * A_0^(-1) and P_G = G_1^T * G_0^(-1),
-      # and K_A = A_0^(-1/2) * E_A and K_G = G_0^(-1/2) * E_G.
+      # Note that \\(P_A = A_1^T * A_0^{-1} and P_G = G_1^T * G_0^{-1}\\),
+      # and \\(K_A = A_0^{-1/2} * E_A\ and\ K_G = G_0^{-1/2} * E_G.\\)
       P_A, K_A, mu_A = self._input_factor.get_option2quants(
           self._input_damping_func)
       P_G, K_G, mu_G = self._output_factor.get_option2quants(
@@ -1616,26 +1616,26 @@ class FullyConnectedSeriesFB(InputOutputMultiTowerMultiUse,
       # Our approach differs superficially from the pseudo-code in the paper
       # in order to reduce the total number of matrix-matrix multiplies.
       # In particular, the first three computations in the pseudo code are
-      # Z = G0^(-1/2) * Z * A0^(-1/2)
-      # Z = Z - hPsi_G^T * Z * hPsi_A
-      # Z = E_G^T * Z * E_A
-      # Noting that hPsi = C0^(-1/2) * C1 * C0^(-1/2), so that
-      # C0^(-1/2) * hPsi = C0^(-1) * C1 * C0^(-1/2) = P^T * C0^(-1/2)
+      # \\(Z = G0^{-1/2} * Z * A0^{-1/2}\\)
+      # \\(Z = Z - hPsi_G^T * Z * hPsi_A\\)
+      # \\(Z = E_G^T * Z * E_A\\)
+      # Noting that hPsi = C0^{-1/2} * C1 * C0^{-1/2}\\), so that
+      # \\(C0^{-1/2} * hPsi = C0^{-1} * C1 * C0^{-1/2} = P^T * C0^{-1/2}\\)
       # the entire computation can be written as
-      # Z = E_G^T * (G0^(-1/2) * Z * A0^(-1/2)
-      #     - hPsi_G^T * G0^(-1/2) * Z * A0^(-1/2) * hPsi_A) * E_A
-      #   = E_G^T * (G0^(-1/2) * Z * A0^(-1/2)
-      #     - G0^(-1/2) * P_G * Z * P_A^T * A0^(-1/2)) * E_A
-      #   = E_G^T * G0^(-1/2) * Z * A0^(-1/2) * E_A
-      #     -  E_G^T* G0^(-1/2) * P_G * Z * P_A^T * A0^(-1/2) * E_A
-      #   = K_G^T * Z * K_A  -  K_G^T * P_G * Z * P_A^T * K_A
+      # \\(Z = E_G^T * (G0^{-1/2} * Z * A0^{-1/2}\\)
+      # \\(    - hPsi_G^T * G0^{-1/2} * Z * A0^{-1/2} * hPsi_A) * E_A\\)
+      # \\(  = E_G^T * (G0^{-1/2} * Z * A0^{-1/2}\\)
+      # \\(    - G0^{-1/2} * P_G * Z * P_A^T * A0^{-1/2}) * E_A\\)
+      # \\(  = E_G^T * G0^{-1/2} * Z * A0^{-1/2} * E_A\\)
+      # \\(    -  E_G^T* G0^{-1/2} * P_G * Z * P_A^T * A0^{-1/2} * E_A\\)
+      # \\(  = K_G^T * Z * K_A  -  K_G^T * P_G * Z * P_A^T * K_A\\)
       # This final expression is computed by the following two lines:
-      # Z = Z - P_G * Z * P_A^T
+      # \\(Z = Z - P_G * Z * P_A^T\\)
       Z -= math_ops.matmul(P_G, math_ops.matmul(Z, P_A, transpose_b=True))
-      # Z = K_G^T * Z * K_A
+      # \\(Z = K_G^T * Z * K_A\\)
       Z = math_ops.matmul(K_G, math_ops.matmul(Z, K_A), transpose_a=True)
 
-      # Z = Z ./ (1*1^T - mu_G*mu_A^T)
+      # \\(Z = Z ./ (1*1^T - mu_G*mu_A^T)\\)
       # Be careful with the outer product.  We don't want to accidentally
       # make it an inner-product instead.
       tmp = 1.0 - array_ops.reshape(mu_G, [int(mu_G.shape[0]), -1]) * mu_A
@@ -1646,13 +1646,13 @@ class FullyConnectedSeriesFB(InputOutputMultiTowerMultiUse,
       # We now perform the transpose/reverse version of the operations
       # derived above, whose derivation from the original pseudo-code is
       # analgous.
-      # Z = K_G * Z * K_A^T
+      # \\(Z = K_G * Z * K_A^T\\)
       Z = math_ops.matmul(K_G, math_ops.matmul(Z, K_A, transpose_b=True))
 
-      # Z = Z - P_G^T * Z * P_A
+      # \\(Z = Z - P_G^T * Z * P_A\\)
       Z -= math_ops.matmul(P_G, math_ops.matmul(Z, P_A), transpose_a=True)
 
-      # Z = normalize (1/E[T]) * Z
+      # \\(Z = normalize (1/E[T]) * Z\\)
       # Note that this normalization is done because we compute the statistics
       # by averaging, not summing, over time. (And the gradient is presumably
       # summed over time, not averaged, and thus their scales are different.)
diff --git a/tensorflow/contrib/kfac/python/ops/fisher_factors.py b/tensorflow/contrib/kfac/python/ops/fisher_factors.py
index 353e1c6abb738cf3ef59d3e188da2727b712b21a..0d40d265a1727075d0ba721b0d9a756c38269a96 100644
--- a/tensorflow/contrib/kfac/python/ops/fisher_factors.py
+++ b/tensorflow/contrib/kfac/python/ops/fisher_factors.py
@@ -336,12 +336,16 @@ class FisherFactor(object):
 
     new_cov = math_ops.add_n(new_cov_contribs) / float(self._num_towers)
 
-    # I have no idea if the TPU code below is still correct since I don't know
-    # what it actually does.  Also, this code is not present in some of the
-    # other versions of make_covariance_update_op.  Does it matter?
-    # Synchronize value across all TPU cores.
+    # Compute average of 'new_cov' across all TPU cores. On a TPU, each
+    # instance of 'new_cov' will be based on a different minibatch. This ensures
+    # that by the end of assign_moving_average(), all TPU cores see the same
+    # value for self._cov.
+    #
+    # Other implementations of make_covariance_update_op() that accumulate
+    # statistics in other variables should mimic this behavior.
     if utils.on_tpu():
       new_cov = utils.cross_replica_mean(new_cov)
+
     return moving_averages.assign_moving_average(
         self._cov, new_cov, ema_decay, zero_debias=ZERO_DEBIAS)
 
@@ -1398,6 +1402,10 @@ class FullyConnectedMultiKF(FullyConnectedKroneckerFactor):
       new_cov_dt1 = (math_ops.add_n(new_cov_dt1_contribs)
                      / float(self._num_towers))
 
+      # See comments in FisherFactor.make_covariance_update_op() for details.
+      if utils.on_tpu():
+        new_cov_dt1 = utils.cross_replica_mean(new_cov_dt1)
+
       op2 = moving_averages.assign_moving_average(
           self._cov_dt1, new_cov_dt1, ema_decay, zero_debias=ZERO_DEBIAS)
 
diff --git a/tensorflow/contrib/kfac/python/ops/layer_collection.py b/tensorflow/contrib/kfac/python/ops/layer_collection.py
index 586a004f880e7bea2a772c53091285c2907ca31a..411da033c3a0d5e2148c02207f6e04efcd2a0efc 100644
--- a/tensorflow/contrib/kfac/python/ops/layer_collection.py
+++ b/tensorflow/contrib/kfac/python/ops/layer_collection.py
@@ -84,7 +84,7 @@ _EMBEDDING_MULTI_APPROX_TO_BLOCK_TYPES = {
     APPROX_KRONECKER_INDEP_NAME: fb.EmbeddingKFACMultiIndepFB
 }
 
-# Possible value for 'reuse' keyword argument. Sets 'reuse' to
+# Possible value for `reuse` keyword argument. Sets `reuse` to
 # tf.get_variable_scope().reuse.
 VARIABLE_SCOPE = "VARIABLE_SCOPE"
 
@@ -294,8 +294,8 @@ class LayerCollection(object):
       layer_key: A variable or tuple of variables. The key to check for in
           existing registrations and to register if valid.
       fisher_block: The associated `FisherBlock`.
-      reuse: Method to use for inserting new `FisherBlock`s. One of True, False,
-        or 'VARIABLE_SCOPE'.
+      reuse: Method to use for inserting new `FisherBlock's. One of True, False,
+        or `VARIABLE_SCOPE`.
 
     Raises:
       ValueError: If `layer_key` was already registered and reuse is `False`,
@@ -359,15 +359,14 @@ class LayerCollection(object):
         is None.
       name: (OPTIONAL) str or None. Unique name for this loss function. If None,
         a new name is generated. (Default: None)
-      reuse: (OPTIONAL) bool or str.  If True, reuse an existing FisherBlock.
-        If False, create a new FisherBlock.  If VARIABLE_SCOPE, use
-        tf.get_variable_scope().reuse.
+      reuse: (OPTIONAL) bool or str.  If True, adds `loss` as an additional
+        tower for the existing loss function.
 
     Raises:
       ValueError: If reuse == True and name == None.
       ValueError: If reuse == True and seed != None.
-      KeyError: If reuse == True and no existing LossFunction with 'name' found.
-      KeyError: If reuse == False and existing LossFunction with 'name' found.
+      KeyError: If reuse == True and no existing LossFunction with `name` found.
+      KeyError: If reuse == False and existing LossFunction with `name` found.
     """
 
     name = name or self._graph.unique_name(base_name)
@@ -491,24 +490,24 @@ class LayerCollection(object):
     """
     params = frozenset(utils.ensure_sequence(params))
 
-    # Check if any of the variables in 'params' is already in
-    # 'self.fisher_blocks.keys()'.
+    # Check if any of the variables in `params` is already in
+    # 'self.fisher_blocks.keys()`.
     for registered_params, fisher_block in self.fisher_blocks.items():
       registered_params_set = set(utils.ensure_sequence(registered_params))
       for variable in params:
         if (variable in registered_params_set and
             params != registered_params_set):
           raise ValueError(
-              "Can't link parameters {}, variable {} was already registered in "
+              "Can`t link parameters {}, variable {} was already registered in "
               "group {} with layer {}".format(params, variable,
                                               registered_params, fisher_block))
 
-    # Check if any of the variables in 'params' is already in
-    # 'self.linked_parameters'.
+    # Check if any of the variables in `params` is already in
+    # 'self.linked_parameters`.
     for variable in params:
       for other_linked_params in self.linked_parameters:
         if variable in other_linked_params:
-          raise ValueError("Can't link parameters {}, variable {} was already "
+          raise ValueError("Can`t link parameters {}, variable {} was already "
                            "linked in group {}.".format(params, variable,
                                                         other_linked_params))
     self._linked_parameters[params] = approximation
@@ -576,15 +575,15 @@ class LayerCollection(object):
         produced by layer.
       approx: str or None. If not None must be "kron".  The Fisher
         approximation to use. If None the default value is used. (Default: None)
-      reuse: bool or str.  If True, this adds 'inputs' and 'outputs' as an
+      reuse: bool or str.  If True, this adds `inputs` and `outputs` as an
         additional mini-batch/tower of data to use when estimating the Fisher
         block for this layer (which must have already been registered). If
         "VARIABLE_SCOPE", use tf.get_variable_scope().reuse.
         (Default: "VARIABLE_SCOPE")
 
     Raises:
-      ValueError: For improper value to 'approx'.
-      KeyError: If reuse == True but no FisherBlock found for 'params'.
+      ValueError: For improper value to `approx`.
+      KeyError: If reuse == True but no FisherBlock found for `params`.
       ValueError: If reuse == True and FisherBlock found but of the wrong type.
     """
     block_type, approx = self._get_block_type(
@@ -618,15 +617,15 @@ class LayerCollection(object):
       approx: str or None. If not None must be one of "kron" or "diagonal".
         The Fisher approximation to use. If None the default value is used.
         (Default: None)
-      reuse: bool or str.  If True, this adds 'inputs' and 'outputs' as an
+      reuse: bool or str.  If True, this adds `inputs` and `outputs` as an
         additional mini-batch/tower of data to use when estimating the Fisher
         block for this layer (which must have already been registered). If
         "VARIABLE_SCOPE", use tf.get_variable_scope().reuse.
         (Default: "VARIABLE_SCOPE")
 
     Raises:
-      ValueError: For improper value to 'approx'.
-      KeyError: If reuse == True but no FisherBlock found for 'params'.
+      ValueError: For improper value to `approx`.
+      KeyError: If reuse == True but no FisherBlock found for `params`.
       ValueError: If reuse == True and FisherBlock found but of the wrong type.
     """
 
@@ -669,15 +668,15 @@ class LayerCollection(object):
       approx: str or None. If not None must be one of "kron" or "diagonal".
         The Fisher approximation to use. If None the default value is used.
         (Default: None)
-      reuse: bool or str.  If True, this adds 'inputs' and 'outputs' as an
+      reuse: bool or str.  If True, this adds `inputs` and `outputs` as an
         additional mini-batch/tower of data to use when estimating the Fisher
         block for this layer (which must have already been registered). If
         "VARIABLE_SCOPE", use tf.get_variable_scope().reuse.
         (Default: "VARIABLE_SCOPE")
 
     Raises:
-      ValueError: For improper value to 'approx'.
-      KeyError: If reuse == True but no FisherBlock found for 'params'.
+      ValueError: For improper value to `approx`.
+      KeyError: If reuse == True but no FisherBlock found for `params`.
       ValueError: If reuse == True and FisherBlock found but of the wrong type.
     """
 
@@ -686,7 +685,7 @@ class LayerCollection(object):
         _CONV2D_APPROX_TO_BLOCK_TYPES)
 
     # It feels bad to pass in configuration that has to do with the internal
-    # implementation.  And then we can't use the same constructor for both
+    # implementation.  And then we can`t use the same constructor for both
     # anymore and are thus forced to use this ugly if-statement.
     # TODO(b/74793309): Clean this up?
     if approx == APPROX_KRONECKER_NAME:
@@ -749,15 +748,15 @@ class LayerCollection(object):
       approx: str or None. If not None must be one of "kron" or "diagonal".
         The Fisher approximation to use. If None the default value is used.
         (Default: None)
-      reuse: bool or str.  If True, this adds 'inputs' and 'outputs' as an
+      reuse: bool or str.  If True, this adds `inputs` and `outputs` as an
         additional mini-batch/tower of data to use when estimating the Fisher
         block for this layer (which must have already been registered). If
         "VARIABLE_SCOPE", use tf.get_variable_scope().reuse.
         (Default: "VARIABLE_SCOPE")
 
     Raises:
-      ValueError: For improper value to 'approx'.
-      KeyError: If reuse == True but no FisherBlock found for 'params'.
+      ValueError: For improper value to `approx`.
+      KeyError: If reuse == True but no FisherBlock found for `params`.
       ValueError: If reuse == True and FisherBlock found but of the wrong type.
     """
     # TODO(b/74793309): Have this use _get_block_type like the other
@@ -804,15 +803,15 @@ class LayerCollection(object):
       data_format: str or None. Format of data.
       approx: str or None. If not None must "diagonal".  The Fisher
         approximation to use. If None the default value is used. (Default: None)
-      reuse: bool or str.  If True, this adds 'inputs' and 'outputs' as an
+      reuse: bool or str.  If True, this adds `inputs` and `outputs` as an
         additional mini-batch/tower of data to use when estimating the Fisher
         block for this layer (which must have already been registered). If
         "VARIABLE_SCOPE", use tf.get_variable_scope().reuse.
         (Default: "VARIABLE_SCOPE")
 
     Raises:
-      ValueError: For improper value to 'approx'.
-      KeyError: If reuse == True but no FisherBlock found for 'params'.
+      ValueError: For improper value to `approx`.
+      KeyError: If reuse == True but no FisherBlock found for `params`.
       ValueError: If reuse == True and FisherBlock found but of the wrong type.
     """
     # TODO(b/74793309): Have this use _get_block_type like the other
@@ -872,15 +871,15 @@ class LayerCollection(object):
       approx: str or None. If not None must be one of "kron" or "diagonal".
         The Fisher approximation to use. If None the default value is used.
         (Default: None)
-      reuse: bool or str.  If True, this adds 'inputs' and 'outputs' as an
+      reuse: bool or str.  If True, this adds `inputs` and `outputs` as an
         additional mini-batch/tower of data to use when estimating the Fisher
         block for this layer (which must have already been registered). If
         "VARIABLE_SCOPE", use tf.get_variable_scope().reuse.
         (Default: "VARIABLE_SCOPE")
 
     Raises:
-      ValueError: For improper value to 'approx'.
-      KeyError: If reuse == True but no FisherBlock found for 'params'.
+      ValueError: For improper value to `approx`.
+      KeyError: If reuse == True but no FisherBlock found for `params`.
       ValueError: If reuse == True and FisherBlock found but of the wrong type.
     """
     self.register_depthwise_conv2d(
@@ -917,14 +916,14 @@ class LayerCollection(object):
       approx: str or None. It not None, must be one of "full" or "diagonal".
         The Fisher approximation to use. If None the default value is used.
         (Default: None)
-      reuse: bool or str. If True, this adds 'batch_size' to the total
+      reuse: bool or str. If True, this adds `batch_size` to the total
         mini-batch size use when estimating the Fisher block for this layer
         (which must have already been registered). If "VARIABLE_SCOPE", use
         tf.get_variable_scope().reuse. (Default: "VARIABLE_SCOPE")
 
     Raises:
-      ValueError: For improper value to 'approx'.
-      KeyError: If reuse == True but no FisherBlock found for 'params'.
+      ValueError: For improper value to `approx`.
+      KeyError: If reuse == True but no FisherBlock found for `params`.
       ValueError: If reuse == True and FisherBlock found but of the wrong type.
     """
     block_type, approx = self._get_block_type(
@@ -954,10 +953,10 @@ class LayerCollection(object):
         correspond to a "time-step" in an RNN). OR, can be single Tensor, of
         shape [num_uses * batch_size , input_size], which is a reshaped version
         of a Tensor of shape [num_uses, batch_size, input_size].
-      outputs: A list of Tensors, the same length as 'inputs', each of shape
+      outputs: A list of Tensors, the same length as `inputs`, each of shape
         [batch_size, output_size]. Outputs produced by layer. The list indexes
         each use in the graph (which might correspond to a "time-step" in an
-        RNN). Needs to correspond with the order used in 'inputs'.  OR, can be
+        RNN). Needs to correspond with the order used in `inputs`.  OR, can be
         a single Tensor of shape [num_uses * batch_size, output_size], which is
         a reshaped version of a Tensor of shape [num_uses, batch_size,
         output_size].
@@ -967,16 +966,16 @@ class LayerCollection(object):
       approx: str or None. If not None, must be of "kron_indep", "kron_series_1"
         or "kron_series_2". The Fisher approximation to use. If None the default
         value is used. (Default: None)
-      reuse: bool or str.  If True, this adds inputs and outputs as an
+      reuse: bool or str.  If True, this adds `inputs` and `outputs` as an
         additional mini-batch/tower of data to use when estimating the Fisher
         block for this layer (which must have already been registered). If
         "VARIABLE_SCOPE", use tf.get_variable_scope().reuse.  (Note that the
-        word 'use' here has a completely different meaning to "use in the graph"
-        as it perturns to the 'inputs', 'outputs', and 'num_uses' arguments.)
+        word `use` here has a completely different meaning to "use in the graph"
+        as it perturns to the `inputs`, `outputs`, and `num_uses` arguments.)
         (Default: "VARIABLE_SCOPE")
 
     Raises:
-      ValueError: For improper value to 'approx'.
+      ValueError: For improper value to `approx`.
     """
     block_type, approx = self._get_block_type(
         params, approx, self.default_fully_connected_multi_approximation,
@@ -990,9 +989,11 @@ class LayerCollection(object):
                                                    num_uses=num_uses),
                                 reuse=reuse)
     block.register_additional_tower(inputs, outputs)
-
-    assert len(inputs) == len(outputs)
-    self._add_uses(params, len(inputs))
+    if isinstance(inputs, (tuple, list)):
+      assert len(inputs) == len(outputs)
+      self._add_uses(params, len(inputs))
+    else:
+      self._add_uses(params, 1)
 
   def register_conv2d_multi(self,
                             params,
@@ -1023,7 +1024,7 @@ class LayerCollection(object):
       outputs: A list of Tensors, each of shape [batch_size, height, width,
         out_channels]. Output produced by layer. The list indexes each use
         in the graph (which might correspond to a "time-step" in an RNN).
-        Needs to correspond with the order used in 'inputs'.  OR, can be a
+        Needs to correspond with the order used in `inputs`.  OR, can be a
         single Tensor, of shape [num_uses * batch_size, height, width,
         out_channels], which is a reshaped version of a Tensor of shape
         [num_uses, batch_size, height, width, out_channels].
@@ -1035,17 +1036,17 @@ class LayerCollection(object):
       approx: str or None. If not None must by "kron_indep". The Fisher
         approximation to use. If None the default value is used.
         (Default: None)
-      reuse: bool or str.  If True, this adds inputs and outputs as an
+      reuse: bool or str.  If True, this adds `inputs` and `outputs` as an
         additional mini-batch/tower of data to use when estimating the Fisher
         block for this layer (which must have already been registered). If
         "VARIABLE_SCOPE", use tf.get_variable_scope().reuse.  (Note that the
-        word 'use' here has a completely different meaning to "use in the graph"
-        as it perturns to the 'inputs', 'outputs', and 'num_uses' arguments.)
+        word `use` here has a completely different meaning to "use in the graph"
+        as it perturns to the `inputs`, `outputs`, and `num_uses` arguments.)
         (Default: "VARIABLE_SCOPE")
 
     Raises:
-      ValueError: For improper value to 'approx'.
-      KeyError: If reuse == True but no FisherBlock found for 'params'.
+      ValueError: For improper value to `approx`.
+      KeyError: If reuse == True but no FisherBlock found for `params`.
       ValueError: If reuse == True and FisherBlock found but of the wrong type.
     """
     block_type, approx = self._get_block_type(
@@ -1066,9 +1067,11 @@ class LayerCollection(object):
         reuse=reuse)
 
     block.register_additional_tower(inputs, outputs)
-
-    assert len(inputs) == len(outputs)
-    self._add_uses(params, len(inputs))
+    if isinstance(inputs, (tuple, list)):
+      assert len(inputs) == len(outputs)
+      self._add_uses(params, len(inputs))
+    else:
+      self._add_uses(params, 1)
 
   # TODO(b/74108452): change the loss registration functions names to refer
   # to "loss functions" instead of distributions.  Following naming convention
@@ -1088,13 +1091,13 @@ class LayerCollection(object):
       inputs: A list of Tensors, each of shape [batch_size, input_size] and
         dtype int32. Indices into embedding matrix. The list indexes each use
         in the graph (which might correspond to a "time-step" in an RNN).
-        OR, can be single Tensor, of shape [num_uses, batch_size, input_size],
+        OR, can be single Tensor, of shape [num_uses*batch_size, input_size],
         which is a reshaped version of a Tensor of shape [num_uses, batch_size,
         input_size].
       outputs: A list of Tensors, each of shape [batch_size, embedding_size].
         Outputs produced by layer. The list indexes each use in the graph
         (which might correspond to a "time-step" in an RNN). Needs to
-        correspond with the order used in 'inputs'. OR, can be a
+        correspond with the order used in `inputs`. OR, can be a
         single Tensor, of shape [num_uses * batch_size, embedding_size], which
         is a reshaped version of a Tensor of shape [num_uses, batch_size,
         embedding_size].
@@ -1104,17 +1107,17 @@ class LayerCollection(object):
       approx: str or None. If not None must by "kron_indep". The Fisher
         approximation to use. If None the default value is used.
         (Default: None)
-      reuse: bool or str.  If True, this adds inputs and outputs as an
+      reuse: bool or str.  If True, this adds `inputs` and `outputs` as an
         additional mini-batch/tower of data to use when estimating the Fisher
         block for this layer (which must have already been registered). If
         "VARIABLE_SCOPE", use tf.get_variable_scope().reuse.  (Note that the
-        word 'use' here has a completely different meaning to "use in the graph"
-        as it perturns to the 'inputs', 'outputs', and 'num_uses' arguments.)
+        word `use` here has a completely different meaning to "use in the graph"
+        as it perturns to the `inputs`, `outputs`, and `num_uses` arguments.)
         (Default: "VARIABLE_SCOPE")
 
     Raises:
-      ValueError: For improper value to 'approx'.
-      KeyError: If reuse == True but no FisherBlock found for 'params'.
+      ValueError: For improper value to `approx`.
+      KeyError: If reuse == True but no FisherBlock found for `params`.
       ValueError: If reuse == True and FisherBlock found but of the wrong type.
     """
     block_type, approx = self._get_block_type(
@@ -1129,7 +1132,10 @@ class LayerCollection(object):
         params, block_type(self, vocab_size, num_uses=num_uses), reuse=reuse)
     block.register_additional_tower(inputs, outputs)
 
-    self._add_uses(params, len(inputs))
+    if isinstance(inputs, (tuple, list)):
+      self._add_uses(params, len(inputs))
+    else:
+      self._add_uses(params, 1)
 
   def register_categorical_predictive_distribution(self,
                                                    logits,
@@ -1149,7 +1155,7 @@ class LayerCollection(object):
         (Default: None)
       name: (OPTIONAL) str or None. Unique name for this loss function. If None,
         a new name is generated. (Default: None)
-      reuse: bool or str.  If True, this adds 'logits' as an additional
+      reuse: bool or str.  If True, this adds `logits` as an additional
         mini-batch/tower of inputs to the loss-function/predictive distribution
         (which must have already been registered). If "VARIABLE_SCOPE", use
         tf.get_variable_scope().reuse. (Default: "VARIABLE_SCOPE")
@@ -1183,7 +1189,7 @@ class LayerCollection(object):
         (Default: None)
       name: (OPTIONAL) str or None. Unique name for this loss function. If None,
         a new name is generated. (Default: None)
-      reuse: bool or str.  If True, this adds 'mean' and 'var' as an additional
+      reuse: bool or str.  If True, this adds `mean` and `var` as an additional
         mini-batch/tower of inputs to the loss-function/predictive distribution
         (which must have already been registered). If "VARIABLE_SCOPE", use
         tf.get_variable_scope().reuse. (Default: "VARIABLE_SCOPE")
@@ -1212,7 +1218,7 @@ class LayerCollection(object):
         (Default: None)
       name: (OPTIONAL) str or None. Unique name for this loss function. If None,
         a new name is generated. (Default: None)
-      reuse: bool or str.  If True, this adds 'logits' as an additional
+      reuse: bool or str.  If True, this adds `logits` as an additional
         mini-batch/tower of inputs to the loss-function/predictive distribution
         (which must have already been registered). If "VARIABLE_SCOPE", use
         tf.get_variable_scope().reuse. (Default: "VARIABLE_SCOPE")
@@ -1224,18 +1230,18 @@ class LayerCollection(object):
                                 name=name, reuse=reuse)
 
   def make_or_get_factor(self, cls, args):
-    """Insert 'cls(args)' into 'self.fisher_factors' if not already present.
+    """Insert `cls(args)` into 'self.fisher_factors` if not already present.
 
-    Wraps constructor in 'tf.variable_scope()' to ensure variables constructed
-    in 'cls.__init__' are placed under this LayerCollection's scope.
+    Wraps constructor in `tf.variable_scope()` to ensure variables constructed
+    in `cls.__init__` are placed under this LayerCollection's scope.
 
     Args:
       cls: Class that implements FisherFactor.
-      args: Tuple of arguments to pass into 'cls's constructor. Must be
+      args: Tuple of arguments to pass into `cls's constructor. Must be
         hashable.
 
     Returns:
-      Instance of 'cls' found in self.fisher_factors.
+      Instance of `cls` found in self.fisher_factors.
     """
     try:
       hash(args)
diff --git a/tensorflow/contrib/kfac/python/ops/optimizer.py b/tensorflow/contrib/kfac/python/ops/optimizer.py
index 843aeef7d82df064b757ab4618f2b0ccbbec4cbe..f01c5a832212f88d80529672b652ca04d45c0f0e 100644
--- a/tensorflow/contrib/kfac/python/ops/optimizer.py
+++ b/tensorflow/contrib/kfac/python/ops/optimizer.py
@@ -108,13 +108,8 @@ class KfacOptimizer(gradient_descent.GradientDescentOptimizer):
       ValueError: If momentum is non-zero and momentum_type is not 'regular'
           or 'adam'.
     """
-
-    variables = var_list
-    if variables is None:
-      variables = tf_variables.trainable_variables()
-
     # Parameters to be passed to the Fisher estimator:
-    self._variables = variables
+    self._variables = var_list or tf_variables.trainable_variables
     self._cov_ema_decay = cov_ema_decay
     self._layers = layer_collection
     self._estimation_mode = estimation_mode
@@ -235,7 +230,7 @@ class KfacOptimizer(gradient_descent.GradientDescentOptimizer):
 
   @property
   def variables(self):
-    return self._variables
+    return self._fisher_est.variables
 
   @property
   def damping(self):
@@ -373,6 +368,7 @@ class KfacOptimizer(gradient_descent.GradientDescentOptimizer):
     else:
       kwargs["var_list"] = kwargs.get("var_list") or self.variables
       var_list = kwargs["var_list"]
+
     if set(var_list) != set(self.variables):
       raise ValueError("var_list doesn't match with set of Fisher-estimating "
                        "variables.")
diff --git a/tensorflow/contrib/labeled_tensor/BUILD b/tensorflow/contrib/labeled_tensor/BUILD
index 894e6f6946bb59810a9da2d304cc0dd43d25201d..c8812d4b23f94102d093db878a709b090a3318d6 100644
--- a/tensorflow/contrib/labeled_tensor/BUILD
+++ b/tensorflow/contrib/labeled_tensor/BUILD
@@ -70,6 +70,7 @@ py_test(
         "python/ops/core_test.py",
     ],
     srcs_version = "PY2AND3",
+    tags = ["no_windows"],  # TODO: needs investigation on Windows
     deps = [
         ":_typecheck",
         ":core",
@@ -213,14 +214,3 @@ py_test(
         "//tensorflow/python:math_ops",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-)
diff --git a/tensorflow/contrib/layers/BUILD b/tensorflow/contrib/layers/BUILD
index 852d06e1e3cc8f8deecd15b7436cd4e4a393ad66..d5b3b279a1b7327602790c0260349cb0c758aa86 100644
--- a/tensorflow/contrib/layers/BUILD
+++ b/tensorflow/contrib/layers/BUILD
@@ -188,6 +188,7 @@ py_test(
     size = "small",
     srcs = ["python/layers/normalization_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["no_windows"],  # TODO: needs investigation on Windows
     deps = [
         ":layers_py",
         "//tensorflow/contrib/framework:framework_py",
@@ -353,6 +354,7 @@ py_test(
     size = "small",
     srcs = ["python/ops/sparse_ops_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["no_windows"],  # TODO: needs investigation on Windows
     deps = [
         ":layers_py",
         "//tensorflow/python:array_ops",
@@ -390,15 +392,3 @@ py_test(
         "//tensorflow/python:variables",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/layers/__init__.py b/tensorflow/contrib/layers/__init__.py
index 337c9e06b870b2cca53fcdbf3d94225660e193c4..00f03a111ae8be7f49761ef5fb5a82810bcca182 100644
--- a/tensorflow/contrib/layers/__init__.py
+++ b/tensorflow/contrib/layers/__init__.py
@@ -104,6 +104,7 @@ See the @{$python/contrib.layers} guide.
 @@infer_real_valued_columns
 @@sequence_input_from_feature_columns
 
+@@group_norm
 @@instance_norm
 """
 
@@ -122,6 +123,7 @@ _allowed_symbols = ['bias_add',
                     'conv3d',
                     'elu',
                     'feature_column',
+                    'group_norm',
                     'instance_norm',
                     'legacy_fully_connected',
                     'legacy_linear',
diff --git a/tensorflow/contrib/layers/kernels/BUILD b/tensorflow/contrib/layers/kernels/BUILD
index e407a9ce015603094c7bbab72856403e2f0eb1a1..7aae09ff3e9995b2d92b05211b3bf8a94a26ff43 100644
--- a/tensorflow/contrib/layers/kernels/BUILD
+++ b/tensorflow/contrib/layers/kernels/BUILD
@@ -18,14 +18,3 @@ cc_library(
     ],
     alwayslink = 1,
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-)
diff --git a/tensorflow/contrib/layers/python/layers/embedding_ops.py b/tensorflow/contrib/layers/python/layers/embedding_ops.py
index ffa208540dae975cb139ad6d76dcf392678ba0ee..49c3faf3b7f5eaa3b1542a1fdddcfaff99737a24 100644
--- a/tensorflow/contrib/layers/python/layers/embedding_ops.py
+++ b/tensorflow/contrib/layers/python/layers/embedding_ops.py
@@ -140,6 +140,9 @@ def safe_embedding_lookup_sparse(embedding_weights,
 
     # Prune invalid ids and weights.
     sparse_ids, sparse_weights = _prune_invalid_ids(sparse_ids, sparse_weights)
+    if combiner != "sum":
+      sparse_ids, sparse_weights = _prune_invalid_weights(
+          sparse_ids, sparse_weights)
 
     # Fill in dummy values for empty features, if necessary.
     sparse_ids, is_row_empty = sparse_ops.sparse_fill_empty_rows(sparse_ids,
@@ -188,13 +191,23 @@ def _prune_invalid_ids(sparse_ids, sparse_weights):
   is_id_valid = math_ops.greater_equal(sparse_ids.values, 0)
   if sparse_weights is not None:
     is_id_valid = math_ops.logical_and(
-        is_id_valid, math_ops.greater(sparse_weights.values, 0))
+        is_id_valid,
+        array_ops.ones_like(sparse_weights.values, dtype=dtypes.bool))
   sparse_ids = sparse_ops.sparse_retain(sparse_ids, is_id_valid)
   if sparse_weights is not None:
     sparse_weights = sparse_ops.sparse_retain(sparse_weights, is_id_valid)
   return sparse_ids, sparse_weights
 
 
+def _prune_invalid_weights(sparse_ids, sparse_weights):
+  """Prune invalid weights (< 0) from the input ids and weights."""
+  if sparse_weights is not None:
+    is_weights_valid = math_ops.greater(sparse_weights.values, 0)
+    sparse_ids = sparse_ops.sparse_retain(sparse_ids, is_weights_valid)
+    sparse_weights = sparse_ops.sparse_retain(sparse_weights, is_weights_valid)
+  return sparse_ids, sparse_weights
+
+
 def scattered_embedding_lookup(params,
                                values,
                                dimension,
diff --git a/tensorflow/contrib/layers/python/layers/layers.py b/tensorflow/contrib/layers/python/layers/layers.py
index 350bcb3bca11b4cad18ce863ab1496076477aa3c..949e73deffc201e0392bb091025c057be79c556f 100644
--- a/tensorflow/contrib/layers/python/layers/layers.py
+++ b/tensorflow/contrib/layers/python/layers/layers.py
@@ -932,7 +932,8 @@ def convolution(inputs,
                 variables_collections=None,
                 outputs_collections=None,
                 trainable=True,
-                scope=None):
+                scope=None,
+                conv_dims=None):
   """Adds an N-D convolution followed by an optional batch_norm layer.
 
   It is required that 1 <= N <= 3.
@@ -993,6 +994,10 @@ def convolution(inputs,
     trainable: If `True` also add variables to the graph collection
       `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable).
     scope: Optional scope for `variable_scope`.
+    conv_dims: Optional convolution dimensionality, when set it would use the
+      corresponding convolution (e.g. 2 for Conv 2D, 3 for Conv 3D, ..). When
+      leaved to None it would select the convolution dimensionality based on
+      the input rank (i.e. Conv ND, with N = input_rank - 2).
 
   Returns:
     A tensor representing the output of the operation.
@@ -1015,6 +1020,9 @@ def convolution(inputs,
     inputs = ops.convert_to_tensor(inputs)
     input_rank = inputs.get_shape().ndims
 
+    if conv_dims is not None and conv_dims + 2 != input_rank:
+      raise ValueError('Convolution expects input with rank %d, got %d' %
+                       (conv_dims + 2, input_rank))
     if input_rank == 3:
       layer_class = convolutional_layers.Convolution1D
     elif input_rank == 4:
@@ -1061,10 +1069,134 @@ def convolution(inputs,
       outputs = activation_fn(outputs)
     return utils.collect_named_outputs(outputs_collections, sc.name, outputs)
 
+@add_arg_scope
+def convolution1d(inputs,
+                  num_outputs,
+                  kernel_size,
+                  stride=1,
+                  padding='SAME',
+                  data_format=None,
+                  rate=1,
+                  activation_fn=nn.relu,
+                  normalizer_fn=None,
+                  normalizer_params=None,
+                  weights_initializer=initializers.xavier_initializer(),
+                  weights_regularizer=None,
+                  biases_initializer=init_ops.zeros_initializer(),
+                  biases_regularizer=None,
+                  reuse=None,
+                  variables_collections=None,
+                  outputs_collections=None,
+                  trainable=True,
+                  scope=None):
+  return convolution(inputs,
+                     num_outputs,
+                     kernel_size,
+                     stride,
+                     padding,
+                     data_format,
+                     rate,
+                     activation_fn,
+                     normalizer_fn,
+                     normalizer_params,
+                     weights_initializer,
+                     weights_regularizer,
+                     biases_initializer,
+                     biases_regularizer,
+                     reuse,
+                     variables_collections,
+                     outputs_collections,
+                     trainable,
+                     scope,
+                     conv_dims=1)
+
+convolution1d.__doc__ = convolution.__doc__
 
-convolution2d = convolution
-convolution3d = convolution
+@add_arg_scope
+def convolution2d(inputs,
+                  num_outputs,
+                  kernel_size,
+                  stride=1,
+                  padding='SAME',
+                  data_format=None,
+                  rate=1,
+                  activation_fn=nn.relu,
+                  normalizer_fn=None,
+                  normalizer_params=None,
+                  weights_initializer=initializers.xavier_initializer(),
+                  weights_regularizer=None,
+                  biases_initializer=init_ops.zeros_initializer(),
+                  biases_regularizer=None,
+                  reuse=None,
+                  variables_collections=None,
+                  outputs_collections=None,
+                  trainable=True,
+                  scope=None):
+  return convolution(inputs,
+                     num_outputs,
+                     kernel_size,
+                     stride,
+                     padding,
+                     data_format,
+                     rate,
+                     activation_fn,
+                     normalizer_fn,
+                     normalizer_params,
+                     weights_initializer,
+                     weights_regularizer,
+                     biases_initializer,
+                     biases_regularizer,
+                     reuse,
+                     variables_collections,
+                     outputs_collections,
+                     trainable,
+                     scope,
+                     conv_dims=2)
+
+convolution2d.__doc__ = convolution.__doc__
 
+@add_arg_scope
+def convolution3d(inputs,
+                  num_outputs,
+                  kernel_size,
+                  stride=1,
+                  padding='SAME',
+                  data_format=None,
+                  rate=1,
+                  activation_fn=nn.relu,
+                  normalizer_fn=None,
+                  normalizer_params=None,
+                  weights_initializer=initializers.xavier_initializer(),
+                  weights_regularizer=None,
+                  biases_initializer=init_ops.zeros_initializer(),
+                  biases_regularizer=None,
+                  reuse=None,
+                  variables_collections=None,
+                  outputs_collections=None,
+                  trainable=True,
+                  scope=None):
+  return convolution(inputs,
+                     num_outputs,
+                     kernel_size,
+                     stride,
+                     padding,
+                     data_format,
+                     rate,
+                     activation_fn,
+                     normalizer_fn,
+                     normalizer_params,
+                     weights_initializer,
+                     weights_regularizer,
+                     biases_initializer,
+                     biases_regularizer,
+                     reuse,
+                     variables_collections,
+                     outputs_collections,
+                     trainable,
+                     scope,
+                     conv_dims=3)
+
+convolution3d.__doc__ = convolution.__doc__
 
 @add_arg_scope
 def convolution2d_in_plane(
@@ -3045,16 +3177,16 @@ def legacy_fully_connected(x,
   `activation_fn` is `None`, the result of `y = w * x + b` is
   returned.
 
-  If `x` has shape [\\\(\\text{dim}_0, \\text{dim}_1, ..., \\text{dim}_n\\\)]
-  with more than 2 dimensions (\\\(n > 1\\\)), then we repeat the matrix
+  If `x` has shape [\\(\text{dim}_0, \text{dim}_1, ..., \text{dim}_n\\)]
+  with more than 2 dimensions (\\(n > 1\\)), then we repeat the matrix
   multiply along the first dimensions. The result r is a tensor of shape
-  [\\\(\\text{dim}_0, ..., \\text{dim}_{n-1},\\\) `num_output_units`],
-  where \\\( r_{i_0, ..., i_{n-1}, k} =
-  \\sum_{0 \\leq j < \\text{dim}_n} x_{i_0, ... i_{n-1}, j} \cdot w_{j, k}\\\).
+  [\\(\text{dim}_0, ..., \text{dim}_{n-1},\\) `num_output_units`],
+  where \\( r_{i_0, ..., i_{n-1}, k} =
+  \sum_{0 \leq j < \text{dim}_n} x_{i_0, ... i_{n-1}, j} \cdot w_{j, k}\\).
   This is accomplished by reshaping `x` to 2-D
-  [\\\(\\text{dim}_0 \\cdot ... \\cdot \\text{dim}_{n-1}, \\text{dim}_n\\\)]
+  [\\(\text{dim}_0 \cdot ... \cdot \text{dim}_{n-1}, \text{dim}_n\\)]
   before the matrix multiply and afterwards reshaping it to
-  [\\\(\\text{dim}_0, ..., \\text{dim}_{n-1},\\\) `num_output_units`].
+  [\\(\text{dim}_0, ..., \text{dim}_{n-1},\\) `num_output_units`].
 
   This op creates `w` and optionally `b`. Bias (`b`) can be disabled by setting
   `bias_init` to `None`.
diff --git a/tensorflow/contrib/layers/python/layers/layers_test.py b/tensorflow/contrib/layers/python/layers/layers_test.py
index 997f910a2a97567adbd7ffa3e81a31d2ae0bad7e..b01fd5d5c95ac15c76f9dbe7c77f7e76f12149a9 100644
--- a/tensorflow/contrib/layers/python/layers/layers_test.py
+++ b/tensorflow/contrib/layers/python/layers/layers_test.py
@@ -310,6 +310,17 @@ class BiasAddTest(test.TestCase):
 
 class ConvolutionTest(test.TestCase):
 
+  def testInvalidShape(self):
+    with self.test_session():
+      images_2d = random_ops.random_uniform((5, 7, 9, 3), seed=1)
+      with self.assertRaisesRegexp(
+          ValueError, 'Convolution expects input with rank 5, got 4'):
+        layers_lib.convolution3d(images_2d, 32, 3)
+      images_3d = random_ops.random_uniform((5, 6, 7, 9, 3), seed=1)
+      with self.assertRaisesRegexp(
+          ValueError, 'Convolution expects input with rank 4, got 5'):
+        layers_lib.convolution2d(images_3d, 32, 3)
+
   def testInvalidDataFormat(self):
     height, width = 7, 9
     with self.test_session():
@@ -3155,7 +3166,7 @@ class RepeatTests(test.TestCase):
     with self.test_session():
       images = np.random.uniform(size=(5, height, width, 3)).astype(np.float32)
       output = _layers.repeat(images, 3, layers_lib.conv2d, 32, [3, 3])
-      self.assertEqual(output.op.name, 'Repeat/convolution_3/Relu')
+      self.assertEqual(output.op.name, 'Repeat/convolution2d_3/Relu')
       self.assertListEqual(output.get_shape().as_list(), [5, 3, 3, 32])
 
   def testRepeatWithScope(self):
@@ -3749,7 +3760,7 @@ class StackTests(test.TestCase):
           layers_lib.convolution2d, [10, 20, 30],
           kernel_size=[3, 3],
           padding='SAME')
-      self.assertEqual(output.op.name, 'Stack/convolution_3/Relu')
+      self.assertEqual(output.op.name, 'Stack/convolution2d_3/Relu')
       self.assertListEqual(output.get_shape().as_list(), [5, 3, 3, 30])
 
   def testStackWithScope(self):
diff --git a/tensorflow/contrib/layers/python/layers/normalization.py b/tensorflow/contrib/layers/python/layers/normalization.py
index e7d4080ff769327cc74b6629a7705ddfa552169b..c807ab0f2e5c8ac3ec2ae1d84a5b36b5f4ba76a4 100644
--- a/tensorflow/contrib/layers/python/layers/normalization.py
+++ b/tensorflow/contrib/layers/python/layers/normalization.py
@@ -24,11 +24,13 @@ from tensorflow.contrib.layers.python.layers import utils
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn
 from tensorflow.python.ops import variable_scope
 
 
 __all__ = [
+    'group_norm',
     'instance_norm',
 ]
 
@@ -158,3 +160,196 @@ def instance_norm(inputs,
     if activation_fn is not None:
       outputs = activation_fn(outputs)
     return utils.collect_named_outputs(outputs_collections, sc.name, outputs)
+
+
+@add_arg_scope
+def group_norm(inputs,
+               groups=32,
+               channels_axis=-1,
+               reduction_axes=(-3, -2),
+               center=True,
+               scale=True,
+               epsilon=1e-6,
+               activation_fn=None,
+               param_initializers=None,
+               reuse=None,
+               variables_collections=None,
+               outputs_collections=None,
+               trainable=True,
+               scope=None):
+  """Functional interface for the group normalization layer.
+
+  Reference: https://arxiv.org/abs/1803.08494.
+
+    "Group Normalization", Yuxin Wu, Kaiming He
+
+  Args:
+    inputs: A Tensor with at least 2 dimensions one which is channels. All
+     shape dimensions must be fully defined.
+    groups: Integer. Divide the channels into this number of groups over which
+      normalization statistics are computed. This number must be commensurate
+      with the number of channels in `inputs`.
+    channels_axis: An integer. Specifies index of channels axis which will be
+      broken into `groups`, each of which whose statistics will be computed
+      across. Must be mutually exclusive with `reduction_axes`. Preferred usage
+      is to specify negative integers to be agnostic as to whether a batch
+      dimension is included.
+    reduction_axes: Tuple of integers. Specifies dimensions over which
+       statistics will be accumulated. Must be mutually exclusive with
+       `channels_axis`. Statistics will not be accumulated across axes not
+       specified in `reduction_axes` nor `channel_axis`. Preferred usage is to
+       specify negative integers to be agnostic to whether a batch dimension is
+       included.
+
+      Some sample usage cases:
+        NHWC format: channels_axis=-1, reduction_axes=[-3, -2]
+        NCHW format: channels_axis=-3, reduction_axes=[-2, -1]
+
+    center: If True, add offset of `beta` to normalized tensor. If False, `beta`
+      is ignored.
+    scale: If True, multiply by `gamma`. If False, `gamma` is
+      not used. When the next layer is linear (also e.g. `nn.relu`), this can be
+      disabled since the scaling can be done by the next layer.
+    epsilon: Small float added to variance to avoid dividing by zero.
+    activation_fn: Activation function, default set to None to skip it and
+      maintain a linear activation.
+    param_initializers: Optional initializers for beta, gamma, moving mean and
+      moving variance.
+    reuse: Whether or not the layer and its variables should be reused. To be
+      able to reuse the layer scope must be given.
+    variables_collections: Optional collections for the variables.
+    outputs_collections: Collections to add the outputs.
+    trainable: If `True` also add variables to the graph collection
+      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
+    scope: Optional scope for `variable_scope`.
+
+  Returns:
+    A `Tensor` representing the output of the operation.
+
+  Raises:
+    ValueError: If the rank of `inputs` is undefined.
+    ValueError: If rank or channels dimension of `inputs` is undefined.
+    ValueError: If number of groups is not commensurate with number of channels.
+    ValueError: If reduction_axes or channels_axis are out of bounds.
+    ValueError: If reduction_axes are not mutually exclusive with channels_axis.
+  """
+  # TODO(shlens): Support partially defined shapes for the inputs.
+  inputs = ops.convert_to_tensor(inputs)
+  original_shape = inputs.shape
+
+  if inputs.shape.ndims is None:
+    raise ValueError('Inputs %s has undefined rank.' % inputs.name)
+  if channels_axis > (inputs.shape.ndims - 1):
+    raise ValueError('Axis is out of bounds.')
+
+  # Standardize the channels_axis to be positive and identify # of channels.
+  if channels_axis < 0:
+    channels_axis = inputs.shape.ndims + channels_axis
+  channels = inputs.shape[channels_axis].value
+
+  if channels is None:
+    raise ValueError('Inputs %s has undefined channel dimension: %d.' % (
+        inputs.name, channels_axis))
+
+  # Standardize the reduction_axes to be positive.
+  reduction_axes = list(reduction_axes)
+  for i in range(len(reduction_axes)):
+    if reduction_axes[i] < 0:
+      reduction_axes[i] += inputs.shape.ndims
+
+  for a in reduction_axes:
+    if a > inputs.shape.ndims:
+      raise ValueError('Axis is out of bounds.')
+    if inputs.shape[a].value is None:
+      raise ValueError('Inputs %s has undefined dimensions %d.' % (
+          inputs.name, a))
+    if channels_axis == a:
+      raise ValueError('reduction_axis must be mutually exclusive '
+                       'with channels_axis')
+  if groups > channels:
+    raise ValueError('Invalid groups %d for %d channels.' % (groups, channels))
+  if channels % groups != 0:
+    raise ValueError('%d channels is not commensurate with %d groups.' %
+                     (channels, groups))
+
+  # Determine axes before channels. Some examples of common image formats:
+  #  'NCHW': before = [N], after = [HW]
+  #  'NHWC': before = [NHW], after = []
+  axes_before_channels = inputs.shape.as_list()[:channels_axis]
+  axes_after_channels = inputs.shape.as_list()[channels_axis+1:]
+
+  # Manually broadcast the parameters to conform to the number of groups.
+  params_shape_broadcast = ([1] * len(axes_before_channels) +
+                            [groups, channels // groups] +
+                            [1] * len(axes_after_channels))
+
+  # Reshape the input by the group within the channel dimension.
+  inputs_shape = (axes_before_channels + [groups, channels // groups] +
+                  axes_after_channels)
+  inputs = array_ops.reshape(inputs, inputs_shape)
+
+  # Determine the dimensions across which moments are calculated.
+  moments_axes = [channels_axis + 1]
+  for a in reduction_axes:
+    if a > channels_axis:
+      moments_axes.append(a + 1)
+    else:
+      moments_axes.append(a)
+
+  with variable_scope.variable_scope(
+      scope, 'GroupNorm', [inputs], reuse=reuse) as sc:
+    # Note that the params_shape is the number of channels always.
+    params_shape = [channels]
+
+    # Allocate parameters for the beta and gamma of the normalization.
+    beta, gamma = None, None
+    dtype = inputs.dtype.base_dtype
+    if param_initializers is None:
+      param_initializers = {}
+    if center:
+      beta_collections = utils.get_variable_collections(
+          variables_collections, 'beta')
+      beta_initializer = param_initializers.get(
+          'beta', init_ops.zeros_initializer())
+      beta = variables.model_variable('beta',
+                                      shape=params_shape,
+                                      dtype=dtype,
+                                      initializer=beta_initializer,
+                                      collections=beta_collections,
+                                      trainable=trainable)
+      beta = array_ops.reshape(beta, params_shape_broadcast)
+
+    if scale:
+      gamma_collections = utils.get_variable_collections(
+          variables_collections, 'gamma')
+      gamma_initializer = param_initializers.get(
+          'gamma', init_ops.ones_initializer())
+      gamma = variables.model_variable('gamma',
+                                       shape=params_shape,
+                                       dtype=dtype,
+                                       initializer=gamma_initializer,
+                                       collections=gamma_collections,
+                                       trainable=trainable)
+      gamma = array_ops.reshape(gamma, params_shape_broadcast)
+
+    # Calculate the moments.
+    mean, variance = nn.moments(inputs, moments_axes, keep_dims=True)
+
+    # Compute normalization.
+    # TODO(shlens): Fix nn.batch_normalization to handle the 5-D Tensor
+    # appropriately so that this operation may be faster.
+    gain = math_ops.rsqrt(variance + epsilon)
+    offset = -mean * gain
+    if gamma is not None:
+      gain *= gamma
+      offset *= gamma
+    if beta is not None:
+      offset += beta
+    outputs = inputs * gain + offset
+
+    # Collapse the groups into the channel dimension.
+    outputs = array_ops.reshape(outputs, original_shape)
+
+    if activation_fn is not None:
+      outputs = activation_fn(outputs)
+    return utils.collect_named_outputs(outputs_collections, sc.name, outputs)
diff --git a/tensorflow/contrib/layers/python/layers/normalization_test.py b/tensorflow/contrib/layers/python/layers/normalization_test.py
index 5cff1bf0ebb2fe8bc6933de882ecd47a9edf0f94..b6e96350db92baf4770683273be7e5dde73dbcec 100644
--- a/tensorflow/contrib/layers/python/layers/normalization_test.py
+++ b/tensorflow/contrib/layers/python/layers/normalization_test.py
@@ -166,5 +166,231 @@ class InstanceNormTest(test.TestCase):
   def testOutputBigInput5DNCHW(self):
     self.doOutputTest((1, 100, 100, 1, 1), 'NCHW', tol=1e-3)
 
+
+class GroupNormTest(test.TestCase):
+
+  def testInvalidGroupSize(self):
+    inputs = array_ops.placeholder(dtypes.float32, shape=(5, 2, 10, 10))
+    with self.assertRaisesRegexp(ValueError,
+                                 'Invalid groups 10 for 2 channels.'):
+      normalization.group_norm(inputs, groups=10,
+                               reduction_axes=[-2, -1], channels_axis=-3)
+
+  def testBadCommensurateGroup(self):
+    inputs = array_ops.placeholder(dtypes.float32, shape=(5, 4, 10, 10))
+    with self.assertRaisesRegexp(ValueError,
+                                 '4 channels is not commensurate with '
+                                 '3 groups.'):
+      normalization.group_norm(inputs, groups=3,
+                               reduction_axes=[-2, -1], channels_axis=-3)
+
+  def testAxisIsBad(self):
+    inputs = array_ops.placeholder(dtypes.float32, shape=(1, 2, 4, 5))
+    with self.assertRaisesRegexp(ValueError,
+                                 'Axis is out of bounds.'):
+      normalization.group_norm(inputs, channels_axis=5)
+    with self.assertRaisesRegexp(ValueError,
+                                 'Axis is out of bounds.'):
+      normalization.group_norm(inputs, reduction_axes=[1, 5])
+
+  def testNotMutuallyExclusiveAxis(self):
+    inputs = array_ops.placeholder(dtypes.float32, shape=(10, 32, 32, 32))
+    # Specify axis with negative values.
+    with self.assertRaisesRegexp(ValueError, 'mutually exclusive'):
+      normalization.group_norm(inputs, channels_axis=-2, reduction_axes=[-2])
+    # Specify axis with positive values.
+    with self.assertRaisesRegexp(ValueError, 'mutually exclusive'):
+      normalization.group_norm(inputs, channels_axis=1, reduction_axes=[1, 3])
+    # Specify axis with mixed positive and negative values.
+    with self.assertRaisesRegexp(ValueError, 'mutually exclusive'):
+      normalization.group_norm(inputs, channels_axis=-2, reduction_axes=[2])
+
+  def testUnknownShape(self):
+    inputs = array_ops.placeholder(dtypes.float32)
+    with self.assertRaisesRegexp(ValueError, 'undefined rank'):
+      normalization.group_norm(inputs)
+
+  def testParamsShapeNotFullyDefinedReductionAxes(self):
+    inputs = array_ops.placeholder(dtypes.float32, shape=(1, 32, None, 4))
+    with self.assertRaisesRegexp(ValueError, 'undefined dimensions'):
+      normalization.group_norm(inputs)
+
+  def testParamsShapeNotFullyDefinedChannelsAxis(self):
+    inputs = array_ops.placeholder(dtypes.float32, shape=(1, 3, 4, None))
+    with self.assertRaisesRegexp(ValueError, 'undefined channel dimension'):
+      normalization.group_norm(inputs, channels_axis=-1,
+                               reduction_axes=[-3, -2])
+
+  def testCreateOp(self):
+    height, width, groups = 3, 3, 4
+    images = random_ops.random_uniform((5, height, width, 2*groups), seed=1)
+    output = normalization.group_norm(images, groups=groups, channels_axis=-1,
+                                      reduction_axes=[-3, -2])
+    print('name: ', output.op.name)
+    self.assertListEqual([5, height, width, 2*groups], output.shape.as_list())
+
+  def testCreateOpFloat64(self):
+    height, width, groups = 3, 3, 5
+    images = random_ops.random_uniform(
+        (5, height, width, 4*groups), dtype=dtypes.float64, seed=1)
+    output = normalization.group_norm(images, groups=groups)
+    self.assertEqual(dtypes.float64, output.dtype)
+    self.assertListEqual([5, height, width, 4*groups], output.shape.as_list())
+
+  def testCreateOpNoScaleCenter(self):
+    height, width, groups = 3, 3, 7
+    images = random_ops.random_uniform(
+        (5, height, width, 3*groups), dtype=dtypes.float32, seed=1)
+    output = normalization.group_norm(images, groups=groups, center=False,
+                                      scale=False)
+    self.assertListEqual([5, height, width, 3*groups], output.shape.as_list())
+    self.assertEqual(0, len(contrib_variables.get_variables_by_name('beta')))
+    self.assertEqual(0, len(contrib_variables.get_variables_by_name('gamma')))
+
+  def testCreateVariables_NHWC(self):
+    height, width = 3, 3
+    images = random_ops.random_uniform((5, height, width, 8), seed=1)
+    normalization.group_norm(images, groups=4,
+                             channels_axis=-1, reduction_axes=(-3, -2),
+                             center=True, scale=True)
+    beta = contrib_variables.get_variables_by_name('beta')[0]
+    gamma = contrib_variables.get_variables_by_name('gamma')[0]
+    self.assertEqual('GroupNorm/beta', beta.op.name)
+    self.assertEqual('GroupNorm/gamma', gamma.op.name)
+
+  def testCreateVariables_NCHW(self):
+    height, width, groups = 3, 3, 4
+    images = random_ops.random_uniform((5, 2*groups, height, width), seed=1)
+    normalization.group_norm(images, groups=4,
+                             channels_axis=-3, reduction_axes=(-2, -1),
+                             center=True, scale=True)
+    beta = contrib_variables.get_variables_by_name('beta')[0]
+    gamma = contrib_variables.get_variables_by_name('gamma')[0]
+    self.assertEqual('GroupNorm/beta', beta.op.name)
+    self.assertEqual('GroupNorm/gamma', gamma.op.name)
+
+  def testReuseVariables(self):
+    height, width = 3, 3
+    images = random_ops.random_uniform((5, height, width, 4), seed=1)
+    normalization.group_norm(images, groups=2, scale=True, scope='IN')
+    normalization.group_norm(images, groups=2, scale=True, scope='IN',
+                             reuse=True)
+    beta = contrib_variables.get_variables_by_name('beta')
+    gamma = contrib_variables.get_variables_by_name('gamma')
+    self.assertEqual(1, len(beta))
+    self.assertEqual(1, len(gamma))
+
+  def testValueCorrectWithReuseVars(self):
+    height, width = 3, 3
+    image_shape = (10, height, width, 4)
+    images = random_ops.random_uniform(image_shape, seed=1)
+    output_train = normalization.group_norm(images, groups=2, scope='IN')
+    output_eval = normalization.group_norm(images, groups=2, scope='IN',
+                                           reuse=True)
+    with self.test_session() as sess:
+      sess.run(variables.global_variables_initializer())
+      # output_train and output_eval should be the same.
+      train_np, eval_np = sess.run([output_train, output_eval])
+      self.assertAllClose(train_np, eval_np)
+
+  def doOutputTest(self, input_shape, channels_axis=None, reduction_axes=None,
+                   groups=2, tol=1e-2):
+    # Select the axis for the channel and the dimensions along which statistics
+    # are accumulated.
+    if channels_axis < 0:
+      channels_axis += len(input_shape)
+    reduced_axes = [channels_axis + 1]
+    for a in reduction_axes:
+      if a < 0:
+        a += len(input_shape)
+      if a < channels_axis:
+        reduced_axes.append(a)
+      else:
+        reduced_axes.append(a+1)
+    reduced_axes = tuple(reduced_axes)
+
+    # Calculate the final shape for the output Tensor.
+    axes_before_channels = input_shape[:channels_axis]
+    axes_after_channels = input_shape[channels_axis+1:]
+    channels = input_shape[channels_axis]
+    outputs_shape = (axes_before_channels + [groups, channels // groups] +
+                     axes_after_channels)
+
+    # Calculate the final shape for the output statistics.
+    reduced_shape = []
+    for i, a in enumerate(outputs_shape):
+      if i not in reduced_axes:
+        reduced_shape.append(a)
+
+    for mu in (0.0, 1e2):
+      for sigma in (1.0, 0.1):
+        # Determine shape of Tensor after normalization.
+        expected_mean = np.zeros(reduced_shape)
+        expected_var = np.ones(reduced_shape)
+
+        inputs = random_ops.random_uniform(input_shape, seed=0) * sigma + mu
+        output_op = normalization.group_norm(
+            inputs, groups=groups, center=False, scale=False,
+            channels_axis=channels_axis,
+            reduction_axes=reduction_axes)
+        with self.test_session() as sess:
+          sess.run(variables.global_variables_initializer())
+          outputs = sess.run(output_op)
+          # Make sure that there are no NaNs
+          self.assertFalse(np.isnan(outputs).any())
+
+          outputs = np.reshape(outputs, outputs_shape)
+          mean = np.mean(outputs, axis=reduced_axes)
+          var = np.var(outputs, axis=reduced_axes)
+          # The mean and variance of each example should be close to 0 and 1
+          # respectively.
+          self.assertAllClose(expected_mean, mean, rtol=tol, atol=tol)
+          self.assertAllClose(expected_var, var, rtol=tol, atol=tol)
+
+  def testOutputSmallInput4D_NHWC(self):
+    input_shape = [10, 10, 10, 30]
+    # Specify axes with positive values.
+    self.doOutputTest(input_shape, channels_axis=3, reduction_axes=[1, 2])
+    # Specify axes with negative values.
+    self.doOutputTest(input_shape, channels_axis=-1, reduction_axes=[-3, -2])
+
+  def testOutputSmallInput3D_NHWC(self):
+    input_shape = [10, 10, 30]
+    # Specify axes with positive values.
+    self.doOutputTest(input_shape, channels_axis=2, reduction_axes=[0, 1])
+    # Specify axes with negative values.
+    self.doOutputTest(input_shape, channels_axis=-1, reduction_axes=[-3, -2])
+
+  def testOutputSmallInput4D_NCHW(self):
+    input_shape = [10, 10, 10, 30]
+    # Specify axes with positive values.
+    self.doOutputTest(input_shape, channels_axis=1, reduction_axes=[2, 3])
+    # Specify axes with negative values.
+    self.doOutputTest(input_shape, channels_axis=-3, reduction_axes=[-2, -1])
+
+  def testOutputSmallInput3D_NCHW(self):
+    input_shape = [10, 10, 30]
+    # Specify axes with positive values.
+    self.doOutputTest(input_shape, channels_axis=0, reduction_axes=[1, 2])
+    # Specify axes with negative values.
+    self.doOutputTest(input_shape, channels_axis=-3, reduction_axes=[-2, -1])
+
+  def testOutputBigInput4D_NHWC(self):
+    self.doOutputTest([5, 100, 100, 1], channels_axis=3, reduction_axes=[1, 2],
+                      groups=1)
+
+  def testOutputBigInput4D_NCHW(self):
+    self.doOutputTest([1, 100, 100, 4], channels_axis=1, reduction_axes=[2, 3],
+                      groups=4)
+
+  def testOutputSmallInput2D_NC(self):
+    self.doOutputTest([10, 7*100], channels_axis=1, reduction_axes=[], groups=7)
+
+  def testOutputSmallInput5D_NCXXX(self):
+    self.doOutputTest([10, 10, 20, 40, 5],
+                      channels_axis=1,
+                      reduction_axes=[2, 3, 4],
+                      groups=5)
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/layers/python/layers/rev_block_lib.py b/tensorflow/contrib/layers/python/layers/rev_block_lib.py
index 0b38c0c3fdd84cf432c334554eba3a9b0e44084c..02d294c68f1e10108d774c5fe23b6371a7a9f0e6 100644
--- a/tensorflow/contrib/layers/python/layers/rev_block_lib.py
+++ b/tensorflow/contrib/layers/python/layers/rev_block_lib.py
@@ -33,6 +33,7 @@ import numpy as np
 from six.moves import xrange  # pylint: disable=redefined-builtin
 
 from tensorflow.contrib.framework.python import ops as contrib_framework_ops
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import function
 from tensorflow.python.framework import ops as framework_ops
 from tensorflow.python.layers import base
@@ -246,9 +247,7 @@ class RevBlock(base.Layer):
     f_vars_idxs = [[] for _ in range(self.num_layers)]
     g_vars_idxs = [[] for _ in range(self.num_layers)]
 
-    for i, t in enumerate(variables):
-      ref = _underlying_variable_ref(t)
-
+    for i, ref in enumerate(variables):
       # Use the name to identify the layer number and function (f or g)
       regex = LAYER_RE.match(ref.name)
       layer_no = int(regex.group(1))
@@ -603,6 +602,7 @@ def _fn_with_custom_grad_internal(fn, inputs, grad_fn, use_global_vars=False):
     """Custom grad fn applying grad_fn for identity Defun."""
     fn_inputs, fn_vars, fn_outputs = nest.pack_sequence_as(
         defun_inputs, list(op.inputs))
+    fn_vars = [_underlying_variable_ref(v) for v in fn_vars]
     dys = list(dys)
     assert len(fn_outputs) == len(outputs)
     assert len(fn_outputs) == len(dys)
@@ -660,7 +660,9 @@ def _force_data_dependency(first_compute, then_compute):
     if x.get_shape().ndims is None:
       raise ValueError("Rank of Tensor %s must be known" % x)
     ndims = x.get_shape().ndims
-    return array_ops.reshape(array_ops.slice(x, [0] * ndims, [1] * ndims), [])
+    begin = framework_ops.convert_to_tensor([0] * ndims, dtype=dtypes.int32)
+    size = framework_ops.convert_to_tensor([1] * ndims, dtype=dtypes.int32)
+    return array_ops.reshape(array_ops.slice(x, begin, size), [])
 
   first_compute_sum = math_ops.add_n(
       [_first_element(x) for x in first_compute if x is not None])
diff --git a/tensorflow/contrib/layers/python/layers/rev_block_lib_test.py b/tensorflow/contrib/layers/python/layers/rev_block_lib_test.py
index d1ad4e8c98de3e5c5ac212d55cc93707ba9c01cc..392a490be15f14d7c86db2136b71513e2f3fb051 100644
--- a/tensorflow/contrib/layers/python/layers/rev_block_lib_test.py
+++ b/tensorflow/contrib/layers/python/layers/rev_block_lib_test.py
@@ -304,6 +304,20 @@ class RecomputeTest(test.TestCase):
           self.assertAllClose(current, g)
           current = g
 
+  def testResourceVariable(self):
+    @rev_block_lib.recompute_grad(tupleize_grads=True)
+    def layer_with_recompute(inputs):
+      var = variable_scope.get_variable("var", ())
+      return var * inputs
+
+    inputs = array_ops.ones((), dtypes.float32)
+    with variable_scope.variable_scope("layer", use_resource=True):
+      outputs = layer_with_recompute(inputs)
+      loss = math_ops.square(outputs)
+      grads = gradients_impl.gradients(loss, variables.trainable_variables())
+      self.assertEqual(1, len(grads))
+      self.assertTrue(grads[0] is not None)
+
 
 class FnWithCustomGradTest(test.TestCase):
 
diff --git a/tensorflow/contrib/learn/BUILD b/tensorflow/contrib/learn/BUILD
index 16f80a876fac5e19bb8ce13074759c704c113947..d665fc9335cf22cdfa1e7330ab67003042502515 100644
--- a/tensorflow/contrib/learn/BUILD
+++ b/tensorflow/contrib/learn/BUILD
@@ -117,6 +117,7 @@ py_test(
     size = "small",
     srcs = ["python/learn/learn_io/data_feeder_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["no_windows"],  # TODO: needs investigation on Windows
     deps = [
         ":learn",
         "//tensorflow/python:client_testlib",
@@ -172,6 +173,7 @@ tf_py_test(
         "//tensorflow/python:variables",
         "//tensorflow/python/estimator",
     ],
+    tags = ["no_windows"],  # TODO: needs investigation on Windows
 )
 
 py_test(
@@ -190,6 +192,7 @@ py_test(
     size = "small",
     srcs = ["python/learn/graph_actions_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["no_windows"],  # TODO: needs investigation on Windows
     deps = [
         ":learn",
         "//tensorflow/contrib/framework:framework_py",
@@ -591,6 +594,7 @@ py_test(
     size = "small",
     srcs = ["python/learn/learn_io/io_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["no_windows"],  # TODO: needs investigation on Windows
     deps = [
         ":learn",
         "//tensorflow/contrib/learn/python/learn/datasets",
@@ -820,6 +824,7 @@ py_test(
     size = "small",
     srcs = ["python/learn/utils/saved_model_export_utils_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["no_windows"],  # TODO: needs investigation on Windows
     deps = [
         ":learn",
         "//tensorflow/contrib/layers:layers_py",
@@ -874,15 +879,3 @@ py_binary(
         "//tensorflow/python:platform",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/learn/python/learn/datasets/BUILD b/tensorflow/contrib/learn/python/learn/datasets/BUILD
index 8bf372841d04dc9e1339925474801d5aa3af4ccd..2c7215bba3816ff3762e5b7927f650d1c9cbf617 100644
--- a/tensorflow/contrib/learn/python/learn/datasets/BUILD
+++ b/tensorflow/contrib/learn/python/learn/datasets/BUILD
@@ -44,18 +44,6 @@ py_binary(
     ],
 )
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 py_test(
     name = "base_test",
     size = "small",
diff --git a/tensorflow/contrib/learn/python/learn/datasets/base.py b/tensorflow/contrib/learn/python/learn/datasets/base.py
index 3b5c9b97c08a388e1f35249967b6cab26861f100..4676eedb206147d178c6a652aa7c2cb48ef888c0 100644
--- a/tensorflow/contrib/learn/python/learn/datasets/base.py
+++ b/tensorflow/contrib/learn/python/learn/datasets/base.py
@@ -139,15 +139,48 @@ def retry(initial_delay,
 
   Args:
     initial_delay: the initial delay.
+    max_delay: the maximum delay allowed (actual max is
+        max_delay * (1 + jitter).
     factor: each subsequent retry, the delay is multiplied by this value.
         (must be >= 1).
     jitter: to avoid lockstep, the returned delay is multiplied by a random
         number between (1-jitter) and (1+jitter). To add a 20% jitter, set
         jitter = 0.2. Must be < 1.
+    is_retriable: (optional) a function that takes an Exception as an argument
+        and returns true if retry should be applied.
+
+  Returns:
+    A function that wraps another function to automatically retry it.
+  """
+  return _internal_retry(
+      initial_delay=initial_delay,
+      max_delay=max_delay,
+      factor=factor,
+      jitter=jitter,
+      is_retriable=is_retriable)
+
+
+def _internal_retry(initial_delay,
+                    max_delay,
+                    factor=2.0,
+                    jitter=0.25,
+                    is_retriable=None):
+  """Simple decorator for wrapping retriable functions, for internal use only.
+
+  Args:
+    initial_delay: the initial delay.
     max_delay: the maximum delay allowed (actual max is
         max_delay * (1 + jitter).
+    factor: each subsequent retry, the delay is multiplied by this value.
+        (must be >= 1).
+    jitter: to avoid lockstep, the returned delay is multiplied by a random
+        number between (1-jitter) and (1+jitter). To add a 20% jitter, set
+        jitter = 0.2. Must be < 1.
     is_retriable: (optional) a function that takes an Exception as an argument
         and returns true if retry should be applied.
+
+  Returns:
+    A function that wraps another function to automatically retry it.
   """
   if factor < 1:
     raise ValueError('factor must be >= 1; was %f' % (factor,))
@@ -195,7 +228,7 @@ def _is_retriable(e):
 
 
 @deprecated(None, 'Please use urllib or similar directly.')
-@retry(initial_delay=1.0, max_delay=16.0, is_retriable=_is_retriable)
+@_internal_retry(initial_delay=1.0, max_delay=16.0, is_retriable=_is_retriable)
 def urlretrieve_with_retry(url, filename=None):
   return urllib.request.urlretrieve(url, filename)
 
diff --git a/tensorflow/contrib/learn/python/learn/estimators/run_config.py b/tensorflow/contrib/learn/python/learn/estimators/run_config.py
index 1d161093de01ef838d0c75ec9a39574c7529bd57..8c85c431be69caaca6872111896b9487faf9e679 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/run_config.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/run_config.py
@@ -290,8 +290,15 @@ class RunConfig(ClusterConfig, core_run_config.RunConfig):
         Note - using this argument, it is easy to provide settings which break
         otherwise perfectly good models. Use with care.
     """
-    super(RunConfig, self).__init__(
-        master=master, evaluation_master=evaluation_master)
+    # Neither parent class calls super().__init__(), so here we have to
+    # manually call their __init__() methods.
+    ClusterConfig.__init__(
+        self, master=master, evaluation_master=evaluation_master)
+    # For too long this code didn't call:
+    #   core_run_config.RunConfig.__init__(self)
+    # so instead of breaking compatibility with that assumption, we
+    # just manually initialize this field:
+    self._train_distribute = None
 
     gpu_options = config_pb2.GPUOptions(
         per_process_gpu_memory_fraction=gpu_memory_fraction)
diff --git a/tensorflow/contrib/legacy_seq2seq/BUILD b/tensorflow/contrib/legacy_seq2seq/BUILD
index 1fa55132b1fc0cd3367ca2eb331b6870edc30c3b..8c2c4fd29c0502d4199f27a65e4827b2db973c3d 100644
--- a/tensorflow/contrib/legacy_seq2seq/BUILD
+++ b/tensorflow/contrib/legacy_seq2seq/BUILD
@@ -60,15 +60,3 @@ cuda_py_tests(
     ],
     tags = ["noasan"],  # times out b/63678675
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/libsvm/BUILD b/tensorflow/contrib/libsvm/BUILD
index df96402a4ffd51840f77d58d8066487030362340..4dccb9be7cd2e603edcf10c020cc0ee1675f518a 100644
--- a/tensorflow/contrib/libsvm/BUILD
+++ b/tensorflow/contrib/libsvm/BUILD
@@ -88,15 +88,3 @@ tf_py_test(
         "//tensorflow/python:platform_test",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/linalg/BUILD b/tensorflow/contrib/linalg/BUILD
index 359255374d2ea2d35fc4b8a8d72fccc280137979..a7812f74d1e69276a4bba597b41e442bc4dbbc4a 100644
--- a/tensorflow/contrib/linalg/BUILD
+++ b/tensorflow/contrib/linalg/BUILD
@@ -61,15 +61,3 @@ cuda_py_test(
     shard_count = 4,
     tags = ["noasan"],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/linalg/python/ops/linear_operator_block_diag.py b/tensorflow/contrib/linalg/python/ops/linear_operator_block_diag.py
index 80649bd52da76452e0427f341ff686c26d70a70f..9d3af66c92b59dd030d4b2a829ab733eec6cf0c1 100644
--- a/tensorflow/contrib/linalg/python/ops/linear_operator_block_diag.py
+++ b/tensorflow/contrib/linalg/python/ops/linear_operator_block_diag.py
@@ -138,8 +138,7 @@ class LinearOperatorBlockDiag(linear_operator.LinearOperator):
         meaning the quadratic form `x^H A x` has positive real part for all
         nonzero `x`.  Note that we do not require the operator to be
         self-adjoint to be positive-definite.  See:
-        https://en.wikipedia.org/wiki/Positive-definite_matrix\
-            #Extension_for_non_symmetric_matrices
+        https://en.wikipedia.org/wiki/Positive-definite_matrix#Extension_for_non-symmetric_matrices
       is_square:  Expect that this operator acts like square [batch] matrices.
         This is true by default, and will raise a `ValueError` otherwise.
       name: A name for this `LinearOperator`.  Default is the individual
diff --git a/tensorflow/contrib/linear_optimizer/BUILD b/tensorflow/contrib/linear_optimizer/BUILD
index cea3627ed565f0de86d8d9bb6b45c4b19c5b5558..5b89c6cef9fa9fdef7c26ddee1efa03f3056d881 100644
--- a/tensorflow/contrib/linear_optimizer/BUILD
+++ b/tensorflow/contrib/linear_optimizer/BUILD
@@ -138,14 +138,3 @@ py_test(
         "//third_party/py/numpy",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-)
diff --git a/tensorflow/contrib/linear_optimizer/python/kernel_tests/sdca_ops_test.py b/tensorflow/contrib/linear_optimizer/python/kernel_tests/sdca_ops_test.py
index cfe62fac43b35d863eb559b95057ae62a41bed49..ac50699f5984d04dc895205464cd6199dd4342f7 100644
--- a/tensorflow/contrib/linear_optimizer/python/kernel_tests/sdca_ops_test.py
+++ b/tensorflow/contrib/linear_optimizer/python/kernel_tests/sdca_ops_test.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import random
 import threading
 
 from tensorflow.contrib.linear_optimizer.python.ops.sdca_ops import SdcaModel
@@ -102,6 +103,33 @@ def make_example_dict(example_protos, example_weights):
       example_ids=['%d' % i for i in range(0, len(example_protos))])
 
 
+def make_random_examples_and_variables_dicts(num_examples, dim, num_non_zero):
+  random.seed(1)
+  sparse_features = [
+      SparseFeatureColumn(
+          [int(i / num_non_zero) for i in range(num_examples * num_non_zero)],
+          [int(random.random() * dim) for _ in range(
+              num_examples * num_non_zero)],
+          [num_non_zero**(-0.5) for _ in range(num_examples * num_non_zero)])
+  ]
+  examples_dict = dict(
+      sparse_features=sparse_features,
+      dense_features=[],
+      example_weights=[random.random() for _ in range(num_examples)],
+      example_labels=[
+          1. if random.random() > 0.5 else 0. for _ in range(num_examples)
+      ],
+      example_ids=[str(i) for i in range(num_examples)])
+
+  weights = variables_lib.Variable(
+      array_ops.zeros([dim], dtype=dtypes.float32))
+  variables_dict = dict(
+      sparse_features_weights=[weights],
+      dense_features_weights=[])
+
+  return examples_dict, variables_dict
+
+
 def make_variable_dict(max_age, max_gender):
   # TODO(sibyl-toe9oF2e):  Figure out how to derive max_age & max_gender from
   # examples_dict.
@@ -235,6 +263,32 @@ class SdcaWithLogisticLossTest(SdcaModelTest):
         self.assertAllClose(
             0.01, lr.approximate_duality_gap().eval(), rtol=1e-2, atol=1e-2)
 
+  def testSparseRandom(self):
+    dim = 20
+    num_examples = 1000
+    # Number of non-zero features per example.
+    non_zeros = 10
+    # Setup test data.
+    with self._single_threaded_test_session():
+      examples, variables = make_random_examples_and_variables_dicts(
+          num_examples, dim, non_zeros)
+      options = dict(
+          symmetric_l2_regularization=.1,
+          symmetric_l1_regularization=0,
+          num_table_shards=1,
+          adaptive=False,
+          loss_type='logistic_loss')
+
+      lr = SdcaModel(examples, variables, options)
+      variables_lib.global_variables_initializer().run()
+      train_op = lr.minimize()
+      for _ in range(4):
+        train_op.run()
+      lr.update_weights(train_op).run()
+      # Duality gap is 1.4e-5.
+      # It would be 0.01 without shuffling and 0.02 with adaptive sampling.
+      self.assertNear(0.0, lr.approximate_duality_gap().eval(), err=1e-3)
+
   def testDistributedSimple(self):
     # Setup test data
     example_protos = [
diff --git a/tensorflow/contrib/linear_optimizer/python/ops/sdca_ops.py b/tensorflow/contrib/linear_optimizer/python/ops/sdca_ops.py
index 3f5fdc18bb8f47cceee8f81dd5ded02059344b8b..f980746a19fb8e0a02b9d023c127da7ab33e457f 100644
--- a/tensorflow/contrib/linear_optimizer/python/ops/sdca_ops.py
+++ b/tensorflow/contrib/linear_optimizer/python/ops/sdca_ops.py
@@ -168,6 +168,10 @@ class SdcaModel(object):
     # of workers
     return self._options.get('num_loss_partitions', 1)
 
+  def _adaptive(self):
+    # Perform adaptive sampling.
+    return self._options.get('adaptive', True)
+
   def _num_table_shards(self):
     # Number of hash table shards.
     # Return 1 if not specified or if the value is 'None'
@@ -344,7 +348,8 @@ class SdcaModel(object):
           l1=self._options['symmetric_l1_regularization'],
           l2=self._symmetric_l2_regularization(),
           num_loss_partitions=self._num_loss_partitions(),
-          num_inner_iterations=1)
+          num_inner_iterations=1,
+          adaptative=self._adaptive())
       # pylint: enable=protected-access
 
       with ops.control_dependencies([esu]):
diff --git a/tensorflow/contrib/linear_optimizer/python/sdca_estimator_test.py b/tensorflow/contrib/linear_optimizer/python/sdca_estimator_test.py
index 79a5928a21cb9a2633b2aac178f185ba333790d6..bed3d5139fcbf9d9e8b85605c752736f26af6793 100644
--- a/tensorflow/contrib/linear_optimizer/python/sdca_estimator_test.py
+++ b/tensorflow/contrib/linear_optimizer/python/sdca_estimator_test.py
@@ -30,6 +30,13 @@ from tensorflow.python.platform import test
 
 class SDCALogisticClassifierTest(test.TestCase):
 
+  def _single_threaded_test_session(self):
+    # TODO(andreasst): figure out why SDCALinearRegressor needs a single
+    # threaded session to pass in tsan mode but SDCALogisticClassifier does not.
+    config = config_pb2.ConfigProto(
+        inter_op_parallelism_threads=1, intra_op_parallelism_threads=1)
+    return self.test_session(config=config)
+
   def testRealValuedFeatures(self):
     """Tests SDCALogisticClassifier works with real valued features."""
 
@@ -41,7 +48,7 @@ class SDCALogisticClassifierTest(test.TestCase):
           'weights': constant_op.constant([[1.0], [1.0]])
       }, constant_op.constant([[0], [1]])
 
-    with self.test_session():
+    with self._single_threaded_test_session():
       maintenance_cost = feature_column_lib.real_valued_column(
           'maintenance_cost')
       sq_footage = feature_column_lib.real_valued_column('sq_footage')
@@ -66,7 +73,7 @@ class SDCALogisticClassifierTest(test.TestCase):
               constant_op.constant([[500.0, 800.0], [200.0, 600.0]])
       }, constant_op.constant([[0], [1]])
 
-    with self.test_session():
+    with self._single_threaded_test_session():
       dense_feature = feature_column_lib.real_valued_column(
           'dense_feature', dimension=2)
       classifier = sdca_estimator.SDCALogisticClassifier(
@@ -86,7 +93,7 @@ class SDCALogisticClassifierTest(test.TestCase):
           'weights': constant_op.constant([[1.0], [1.0], [1.0]])
       }, constant_op.constant([[1], [0], [1]])
 
-    with self.test_session():
+    with self._single_threaded_test_session():
       price_bucket = feature_column_lib.bucketized_column(
           feature_column_lib.real_valued_column('price'),
           boundaries=[500.0, 700.0])
@@ -120,7 +127,7 @@ class SDCALogisticClassifierTest(test.TestCase):
               constant_op.constant([[1.0], [1.0], [1.0]])
       }, constant_op.constant([[1], [0], [1]])
 
-    with self.test_session():
+    with self._single_threaded_test_session():
       price = feature_column_lib.real_valued_column('price')
       country = feature_column_lib.sparse_column_with_hash_bucket(
           'country', hash_bucket_size=5)
@@ -151,7 +158,7 @@ class SDCALogisticClassifierTest(test.TestCase):
                   dense_shape=[3, 5])
       }, constant_op.constant([[1], [0], [1]])
 
-    with self.test_session():
+    with self._single_threaded_test_session():
       country = feature_column_lib.sparse_column_with_hash_bucket(
           'country', hash_bucket_size=5)
       country_weighted_by_price = feature_column_lib.weighted_sparse_column(
@@ -163,6 +170,38 @@ class SDCALogisticClassifierTest(test.TestCase):
       metrics = classifier.evaluate(input_fn=input_fn, steps=1)
       self.assertGreater(metrics['accuracy'], 0.9)
 
+  def testSparseFeaturesWithDuplicates(self):
+    """Tests SDCALogisticClassifier with duplicated sparse features."""
+
+    def input_fn():
+      return {
+          'example_id':
+              constant_op.constant(['1', '2']),
+          'age':
+              sparse_tensor.SparseTensor(
+                  values=['20-29'] * 5 + ['31-40'] * 5,
+                  indices=[[0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [1, 0],
+                           [1, 0], [1, 0], [1, 0], [1, 0]],
+                  dense_shape=[2, 1]),
+          'gender':
+              sparse_tensor.SparseTensor(
+                  values=['m'] * 5 + ['f'] * 5,
+                  indices=[[0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [1, 0],
+                           [1, 0], [1, 0], [1, 0], [1, 0]],
+                  dense_shape=[2, 1]),
+      }, constant_op.constant([[1], [0]])
+
+    with self._single_threaded_test_session():
+      age = feature_column_lib.sparse_column_with_hash_bucket(
+          'age', hash_bucket_size=10)
+      gender = feature_column_lib.sparse_column_with_hash_bucket(
+          'gender', hash_bucket_size=10)
+      classifier = sdca_estimator.SDCALogisticClassifier(
+          example_id_column='example_id', feature_columns=[age, gender])
+      classifier.fit(input_fn=input_fn, steps=50)
+      metrics = classifier.evaluate(input_fn=input_fn, steps=1)
+      self.assertLess(metrics['loss'], 0.060)
+
   def testCrossedFeatures(self):
     """Tests SDCALogisticClassifier with crossed features."""
 
@@ -182,7 +221,7 @@ class SDCALogisticClassifierTest(test.TestCase):
                   dense_shape=[3, 1])
       }, constant_op.constant([[0], [0], [1]])
 
-    with self.test_session():
+    with self._single_threaded_test_session():
       language = feature_column_lib.sparse_column_with_hash_bucket(
           'language', hash_bucket_size=5)
       country = feature_column_lib.sparse_column_with_hash_bucket(
@@ -215,7 +254,7 @@ class SDCALogisticClassifierTest(test.TestCase):
               constant_op.constant([[3.0], [1.0], [1.0]])
       }, constant_op.constant([[1], [0], [1]])
 
-    with self.test_session():
+    with self._single_threaded_test_session():
       price = feature_column_lib.real_valued_column('price')
       sq_footage_bucket = feature_column_lib.bucketized_column(
           feature_column_lib.real_valued_column('sq_footage'),
diff --git a/tensorflow/contrib/linear_optimizer/python/sdca_optimizer.py b/tensorflow/contrib/linear_optimizer/python/sdca_optimizer.py
index 92d022f2a30ffeb77e81d3bd01365afcd14826b5..5d4572bf6c761e0de2c9e6d7e17193abf0ebb170 100644
--- a/tensorflow/contrib/linear_optimizer/python/sdca_optimizer.py
+++ b/tensorflow/contrib/linear_optimizer/python/sdca_optimizer.py
@@ -19,6 +19,7 @@ from __future__ import print_function
 from tensorflow.contrib import layers
 from tensorflow.contrib.linear_optimizer.python.ops import sdca_ops
 from tensorflow.contrib.linear_optimizer.python.ops.sparse_feature_column import SparseFeatureColumn
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
@@ -71,12 +72,14 @@ class SDCAOptimizer(object):
                num_loss_partitions=1,
                num_table_shards=None,
                symmetric_l1_regularization=0.0,
-               symmetric_l2_regularization=1.0):
+               symmetric_l2_regularization=1.0,
+               adaptive=True):
     self._example_id_column = example_id_column
     self._num_loss_partitions = num_loss_partitions
     self._num_table_shards = num_table_shards
     self._symmetric_l1_regularization = symmetric_l1_regularization
     self._symmetric_l2_regularization = symmetric_l2_regularization
+    self._adaptive = adaptive
 
   def get_name(self):
     return 'SDCAOptimizer'
@@ -101,6 +104,10 @@ class SDCAOptimizer(object):
   def symmetric_l2_regularization(self):
     return self._symmetric_l2_regularization
 
+  @property
+  def adaptive(self):
+    return self._adaptive
+
   def get_train_step(self, columns_to_variables, weight_column_name, loss_type,
                      features, targets, global_step):
     """Returns the training operation of an SdcaModel optimizer."""
@@ -175,28 +182,42 @@ class SDCAOptimizer(object):
         elif isinstance(
             column,
             (
+                layers.feature_column._WeightedSparseColumn,  # pylint: disable=protected-access
                 layers.feature_column._CrossedColumn,  # pylint: disable=protected-access
                 layers.feature_column._SparseColumn)):  # pylint: disable=protected-access
-          sparse_features.append(
-              SparseFeatureColumn(
-                  array_ops.reshape(
-                      array_ops.split(
-                          value=transformed_tensor.indices,
-                          num_or_size_splits=2,
-                          axis=1)[0], [-1]),
-                  array_ops.reshape(transformed_tensor.values, [-1]), None))
-          sparse_feature_weights.append(columns_to_variables[column][0])
-        elif isinstance(column, layers.feature_column._WeightedSparseColumn):  # pylint: disable=protected-access
-          id_tensor = column.id_tensor(transformed_tensor)
-          weight_tensor = column.weight_tensor(transformed_tensor)
+
+          if isinstance(column, layers.feature_column._WeightedSparseColumn):  # pylint: disable=protected-access
+            id_tensor = column.id_tensor(transformed_tensor)
+            weight_tensor = array_ops.reshape(
+                column.weight_tensor(transformed_tensor).values, [-1])
+          else:
+            id_tensor = transformed_tensor
+            weight_tensor = array_ops.ones(
+                [array_ops.shape(id_tensor.indices)[0]], dtypes.float32)
+
+          example_ids = array_ops.reshape(id_tensor.indices[:, 0], [-1])
+
+          flat_ids = array_ops.reshape(id_tensor.values, [-1])
+          projection_length = math_ops.reduce_max(flat_ids) + 1
+          # project ids based on example ids so that we can dedup ids that
+          # occur multiple times for a single example.
+          projected_ids = projection_length * example_ids + flat_ids
+
+          # Remove any redudant ids.
+          ids, idx = array_ops.unique(projected_ids)
+          # Keep only one example id per duplicated ids.
+          example_ids_filtered = math_ops.unsorted_segment_min(
+              example_ids, idx,
+              array_ops.shape(ids)[0])
+
+          # reproject ids back feature id space.
+          reproject_ids = (ids - projection_length * example_ids_filtered)
+
+          weights = array_ops.reshape(
+              math_ops.unsorted_segment_sum(weight_tensor, idx,
+                                            array_ops.shape(ids)[0]), [-1])
           sparse_feature_with_values.append(
-              SparseFeatureColumn(
-                  array_ops.reshape(
-                      array_ops.split(
-                          value=id_tensor.indices, num_or_size_splits=2, axis=1)
-                      [0], [-1]),
-                  array_ops.reshape(id_tensor.values, [-1]),
-                  array_ops.reshape(weight_tensor.values, [-1])))
+              SparseFeatureColumn(example_ids_filtered, reproject_ids, weights))
           sparse_feature_with_values_weights.append(
               columns_to_variables[column][0])
         else:
@@ -228,6 +249,7 @@ class SDCAOptimizer(object):
         options=dict(
             symmetric_l1_regularization=self._symmetric_l1_regularization,
             symmetric_l2_regularization=self._symmetric_l2_regularization,
+            adaptive=self._adaptive,
             num_loss_partitions=self._num_loss_partitions,
             num_table_shards=self._num_table_shards,
             loss_type=loss_type))
diff --git a/tensorflow/contrib/lite/BUILD b/tensorflow/contrib/lite/BUILD
index 18efa64507c95ac7b8d37bd9a8b62c9335b7b5d0..9c4533079c72f5ed68c6f45582fb1cecaa3a3679 100644
--- a/tensorflow/contrib/lite/BUILD
+++ b/tensorflow/contrib/lite/BUILD
@@ -89,6 +89,7 @@ cc_library(
     hdrs = [
         "builtin_op_data.h",
     ],
+    deps = [":context"],
 )
 
 cc_library(
@@ -271,18 +272,3 @@ cc_test(
 #        ],
 #    }),
 #)
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-            "downloads",
-            "examples",
-            "gen",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/lite/README.md b/tensorflow/contrib/lite/README.md
index c15ae3f233ed6a697e2df7a539e0ba131d4dd1d9..a676b705f143b393c7e5bfa9e40d23f9adb68dcc 100644
--- a/tensorflow/contrib/lite/README.md
+++ b/tensorflow/contrib/lite/README.md
@@ -1,238 +1,8 @@
 # TensorFlow Lite
-TensorFlow Lite is TensorFlow's lightweight solution for mobile and embedded devices. It enables low-latency inference of on-device machine learning models with a small binary size and fast performance supporting hardware acceleration.
 
-TensorFlow Lite uses many techniques for achieving low latency like optimizing the kernels for specific mobile apps, pre-fused activations, quantized kernels that allow smaller and faster (fixed-point math) models, and in the future, leverage specialized machine learning hardware to get the best possible performance for a particular model on a particular device.
+TensorFlow Lite is TensorFlow's lightweight solution for mobile and embedded
+devices. It enables low-latency inference of on-device machine learning models
+with a small binary size and fast performance supporting hardware acceleration.
 
-![image](g3doc/TFLite-Architecture.jpg)
-# Getting Started with an Android Demo App
-
-This section contains an example application using TensorFlow Lite for Android devices. The demo is a sample camera app that classifies images continuously using either a quantized Mobilenet model or a floating point Inception-v3 model. A device running Android 5.0 ( API 21) or higher is required to run the demo.
-
-There are 3 ways to get the demo app to your device
- - Download the prebuilt binary or
- - Use Android Studio to build the application or
- - Download the source code for TensorFlow Lite and the demo and build it using bazel
-
-## Description
-In the demo app, inference is done using the TensorFlow Lite Java API. The demo app classifies frames in real-time, displaying the top most probable classifications. It also displays the time taken to detect the object.
-
-## Downloading the pre-built binary
-The fastest path to trying the demo, is to download the pre-built binary
-[TfLiteCameraDemo.apk](https://storage.googleapis.com/download.tensorflow.org/deps/tflite/TfLiteCameraDemo.apk)
-
-Once the apk is installed, click the app icon to start the app. The first-time the app is opened, the app asks for runtime permissions to access the device camera. The demo app opens the back-camera of the device and recognizes the objects in the camera's field of view. At the bottom of the image (or at the left of the image if the device is in landscape mode), it shows the latency of classification and the top three objects classified.
-
-## Building in Android Studio using TensorFlow Lite AAR from JCenter
-The simplest way to compile the demo app, and try out changes to the project code is to use AndroidStudio.
-
- - Install the latest version of Android Studio 3 as specified [here](https://developer.android.com/studio/index.html).
- - Make sure the Android SDK version is greater than 26 and NDK version is greater than 14 (in the Android Studio Settings).
- - Import the `tensorflow/contrib/lite/java/demo` directory as a new Android Studio project.
- - Click through installing all the Gradle extensions it requests.
- - Either
-     - Download the quantized Mobilenet TensorFlow Lite model from [here](https://storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_v1_224_android_quant_2017_11_08.zip)
-         - unzip and copy mobilenet_quant_v1_224.tflite to the assets directory:
-           `tensorflow/contrib/lite/java/demo/app/src/main/assets/`
-     - Or download the floating point Inception-v3 model from [here](https://storage.googleapis.com/download.tensorflow.org/models/tflite/inception_v3_slim_2016_android_2017_11_10.zip)
-         - unzip and copy inceptionv3_non_slim_2015.tflite to the assets directory
-         - change the chosen classifier in [Camera2BasicFragment.java](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/Camera2BasicFragment.java) from
-         `classifier = new ImageClassifierQuantizedMobileNet(getActivity());`
-         to
-         `classifier = new ImageClassifierFloatInception(getActivity());`
- - Build and run the demo app
-
-## Building TensorFlow Lite and the demo app from source
-
-### Clone the TensorFlow repo
-- git clone
-  [https://github.com/tensorflow/tensorflow](https://github.com/tensorflow/tensorflow)
-
-### Install Bazel
-If bazel is not installed on your system, install it now by following [these directions](https://bazel.build/versions/master/docs/install.html)
-
-NOTE: Bazel does not fully support building Android on Windows yet. Full support for Gradle/CMake builds is coming soon, but in the meantime Windows users should download the [prebuilt binary](https://storage.googleapis.com/download.tensorflow.org/deps/tflite/TfLiteCameraDemo.apk) instead.
-
-### Install Android NDK and SDK
-Bazel is the primary build system for TensorFlow. Bazel and the Android NDK and SDK must be installed on your system.
- - Install the latest version of Bazel as per the instructions on the [Bazel website](https://bazel.build/versions/master/docs/install.html)
- - The Android NDK is required to build the native (C/C++) TensorFlow Lite code. The current recommended version is 14b, which can be found [here](https://developer.android.com/ndk/downloads/older_releases.html#ndk-14b-downloads).
- - The Android SDK and build tools may be obtained [here](https://developer.android.com/tools/revisions/build-tools.html), or alternatively as part of [Android Studio](https://developer.android.com/studio/index.html). Build tools API >= 23 is required to build the TF Android demo (though it will run on API >= 21 devices).
- - In the root of the TensorFlow repository update the `WORKSPACE` file with the `api_level` and location of the SDK and NDK. If you installed it with AndroidStudio the SDK path can be found in the SDK manager, and the default NDK path is:`{SDK path}/ndk-bundle.`
-
-```
-android_sdk_repository (
-    name = "androidsdk",
-    api_level = 23,
-    build_tools_version = "23.0.2",
-    path = "/home/xxxx/android-sdk-linux/",
-)
-
-android_ndk_repository(
-    name = "androidndk",
-    path = "/home/xxxx/android-ndk-r10e/",
-    api_level = 19,
-)
-```
-
-Additional details on building with Android can be found [here](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/java/demo/README.md).
-
-### Build the source code
-Run bazel with the following command to build the demo.
-
-Build the demo app:
-
-```
-bazel build --cxxopt=--std=c++11 //tensorflow/contrib/lite/java/demo/app/src/main:TfLiteCameraDemo
-```
-
-### Note
-
-Currently, we only support building the Android demo app within a Python 2
-environment (due to a Bazel bug).
-
-### More about the demo
-The demo is resizing each camera image frame to (224 width * 224 height) to match the quantized Mobilenet model being used (299 * 299 for Inception-v3). The resized image is converted into a ByteBuffer row by row of size 1 * 224 * 224 * 3 bytes, where 1 is the number of images in a batch. 224 * 224 (299 * 299) is the width and height of the image. 3 bytes represents three colors of a pixel. This demo uses the TensorFlow Lite Java inference API for models which take a single input and provide a single output. This outputs a two-dimensional array, with the first dimension being the category index and the second dimension being the confidence of classification. Both models have 1001 unique categories and the app sorts the probabilities of all the categories and displays the top three. The model file must be downloaded and bundled within the assets directory of the app.
-
-# iOS Demo App
-
-Similar to the Android demo app, there's an iOS camera app that uses exactly the same model (224 * 224 quantized Mobilenet).
-
-This demo app requires a camera so it doesn't work with simulators. It need to be executed on a real iOS device. Follow the instructions to build and run the demo app:
-
-1.   Run `tensorflow/contrib/lite/examples/ios/download_models.sh` to download the model files used by the demo app.
-1.   Install [CocoaPods](https://cocoapods.org/) if it wasn't installed yet: `sudo gem install cocoapods`.
-1.   Run `pod install` in `tensorflow/contrib/lite/examples/ios/camera` to generate the workspace file.
-1.   Open the project by running `open tflite_camera_example.xcworkspace`, and build the app in XCode.
-
-# TensorFlow Lite Quick Start
-
-## Step 1. Decide which GraphDef to use
- Depending on the use case, the developer may choose to use one of the popular
- open-sourced models such as InceptionV3 or MobileNets, re-train these models
- with their own custom data set or even build their own custom model.
-
-### Using a pre-trained model
-
-[MobileNets](https://research.googleblog.com/2017/06/mobilenets-open-source-models-for.html) is a family of mobile-first computer vision models for [TensorFlow](https://www.tensorflow.org/) designed to effectively maximize accuracy while being mindful of the restricted resources for an on-device or embedded application. MobileNets are small, low-latency, low-power models parameterized to meet the resource constraints of a variety of use cases. They can be built upon for classification, detection, embeddings and segmentation similar to how other popular large scale models, such as [Inception](https://arxiv.org/pdf/1602.07261.pdf), are used. Google provides 16 pre-trained [ImageNet](http://www.image-net.org/challenges/LSVRC/)  classification checkpoints for MobileNets for use in mobile projects of all sizes.
-
-[Inception-v3](https://arxiv.org/abs/1512.00567) is an image recognition model which achieves fairly high accuracy in recognizing general objects with 1000 classes, like "Zebra", "Dalmatian", and "Dishwasher". The model extracts general features from input images using a convolutional neural network and classifies them based on those features with fully-connected and softmax layers.
-
-[On Device Smart Reply](https://research.googleblog.com/2017/02/on-device-machine-intelligence.html)  is an on-device model which provides one-touch replies for an incoming text message by suggesting contextually relevant messages. The model is built specifically for memory constrained devices such as watches & phones and it has been successfully used to surface [Smart Replies on Android Wear](https://research.googleblog.com/2017/02/on-device-machine-intelligence.html). Note that this model only works on Android as of now.
-
-These pre-trained models can be downloaded from [here](g3doc/models.md).
-
-### Retrain Inception-V3 or MobileNet for a custom data set
-The above pre-trained models have been trained on the ImageNet data set, which consists of 1000 predefined classes. A model will need to be re-trained if these classes are not relevant or useful for a given use case. This technique is called transfer learning, which starts with a model that has been already trained on a problem and will then be retrained on a similar problem. Deep learning from scratch can take days, but transfer learning can be done fairly quickly. In order to do this, a developer will need to generate their custom data set labeled with the relevant classes.
-
-The [TensorFlow for Poets](https://codelabs.developers.google.com/codelabs/tensorflow-for-poets/) codelab walks through this process step-by-step. The retraining code supports retraining for both floating point and quantized inference.
-
-# Getting started with RaspberryPi
-
-Using RaspberryPi can be accomplished by following the [Makefile instructions](g3doc/rpi.md). That will give a you a static library (.a) that you can build your app against. Python bindings will be coming soon as well as a demo app.
-
-### Train a custom model
-A developer may choose to train a custom model using Tensorflow. TensorFlow documentation has [several tutorials](https://www.tensorflow.org/tutorials/) for building and training models. If the user has written a model using TensorFlow's Slim Framework the first step is to export this to a GraphDef file. This is necessary because Slim does not store the model structure outside the code, so to communicate with other parts of the framework it needs to be exported. Documentation for the export can be found [here](https://github.com/tensorflow/models/tree/master/research/slim#Export). The output of this step will be a .pb file for the custom model.
-
-TensorFlow Lite currently supports a subset of TensorFlow operators. Please refer to [this document](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/g3doc/tf_ops_compatibility.md) for details of supported operators and their usage. This
-set will continue to expand in future releases of Tensorflow Lite.
-
-
-## Step 2. Model format conversion
-
-The model generated in Step 1 is a standard Tensorflow model. After the completion of Step 1 a user should have a standard .pb or .pbtxt GraphDef file. If the application developer is using a pre-trained model (as defined in Step 1 above), they can download a ready to use, already converted model for use from [here](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/g3doc/models.md). Models generated using retraining (aka transfer learning) or custom models will need to be converted using the steps mentioned below.
-
-A prerequisite to converting the model to the Tensorflow Lite format is to freeze the graph.
-
-Since we employ several formats, the following definitions may be useful:
- - GraphDef (.pb) - a protobuf that represents the TensorFlow training and or computation graph. This contains operators, tensors, and variables definitions.
-
- - CheckPoint (.ckpt) - Serialized variables from a TensorFlow graph. Note, this does not contain the graph structure, so alone it cannot typically be interpreted.
-
- - FrozenGraphDef - a subclass of GraphDef that contains no variables. A GraphDef can be converted to a frozen graphdef by taking a checkpoint and a graphdef and converting every variable into a constant with the value looked up in the checkpoint.
-
- - SavedModel - A collection of GraphDef and CheckPoint together with a signature that labels input and output arguments to a model. A GraphDef and Checkpoint can be extracted from a saved model.
-
- - TensorFlow lite model (.tflite) - a serialized flatbuffer, containing TensorFlow lite operators and Tensors for the TensorFlow lite interpreter. This is most analogous to TensorFlow frozen GraphDefs.
-
-### Freeze Graph
-To use this .pb GraphDef file within TensorFlow Lite, the application developer will need checkpoints containing trained weight parameters. The .pb contains only the structure of the graph. The process of merging the checkpoint values with the graph structure is known as "freezing" the graph.
-
-The developer should know where the checkpoints folder is present or checkpoints can also be downloaded for a pre-trained model (Example: Here is a link to the [MobileNets](https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet_v1.md)).
-
-Graph freezing can be done using the command below (and modifying the arguments appropriately)
-
-```
-bazel build tensorflow/python/tools:freeze_graph
-
-bazel-bin/tensorflow/python/tools/freeze_graph\
-    --input_graph=/tmp/mobilenet_v1_224.pb \
-    --input_checkpoint=/tmp/checkpoints/mobilenet-10202.ckpt \
-    --input_binary=true --output_graph=/tmp/frozen_mobilenet_v1_224.pb \
-    --output_node_names=MobilenetV1/Predictions/Reshape_1
-```
-
-The user has to first build the freeze_graph script using bazel and then run the script.  The input_binary flag has to be enabled to ensure that the protobuf is read and written in binary format.  The user has to input the .pb and the .ckpt files to freeze the graph The output_node_names may not be obvious outside of the code that built the model. The easiest way to find them is to visualize the graph, either with
-graphviz, or [in tensorboard](https://codelabs.developers.google.com/codelabs/tensorflow-for-poets-2/#3).
-
-This frozen Graphdef is now ready to be converted to flatbuffer format (.tflite) for use on Android or iOS.  On Android users have the flexibility to use either the float or quantized versions of the frozen graphdef, if available, using the Tensorflow Optimizing Converter tool.
-
-Here is a sample command line to convert the frozen Graphdef to '.tflite' format for  The Tensorflow Optimizing Converter supports both float and quantized models, however, different configuration parameters are needed depending on whether a FLOAT or QUANTIZED mode is being used.
-(Here is a link to the pb [file](https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_1.0_224_frozen.tgz)).
-
-```
-bazel build tensorflow/contrib/lite/toco:toco
-
-bazel-bin/tensorflow/contrib/lite/toco/toco \
-  --input_file=$(pwd)/mobilenet_v1_1.0_224/frozen_graph.pb \
-  --input_format=TENSORFLOW_GRAPHDEF  --output_format=TFLITE \
-  --output_file=/tmp/mobilenet_v1_1.0_224.tflite --inference_type=FLOAT \
-  --input_type=FLOAT --input_arrays=input \
-  --output_arrays=MobilenetV1/Predictions/Reshape_1 --input_shapes=1,224,224,3
-```
-
-- The input_file argument should point to the frozen GraphDef file that holds the model architecture.
-- The output_file argument should point to where the TensorFlow Lite model file should be generated.
-- The input_type and inference_type arguments should be set to FLOAT, unless converted a [quantized](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/g3doc/) model.
-- Setting the input_array, output_array and input_shape arguments are a bit trickier. The easiest way to find these values is to explore the graph in tensorboard .  The user should reuse the arguments that were used for specifying the output nodes for inference in the `freeze_graph`step.
-
-Note, it is also possible to use the Tensorflow Optimizing Converter through protos either from Python or from the command line see the
-documentation [here](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/toco/python/toco_from_protos.py). A developer can then integrate the conversion step into their model design workflow to ensure that a model will be easily convertible to a mobile inference graph. For example,
-
-```python
-import tensorflow as tf
-
-img = tf.placeholder(name="img", dtype=tf.float32, shape=(1, 64, 64, 3))
-val = img + tf.constant([1., 2., 3.]) + tf.constant([1., 4., 4.])
-out = tf.identity(val, name="out")
-with tf.Session() as sess:
-  tflite_model = tf.contrib.lite.toco_convert(sess.graph_def, [img], [out])
-  open("converteds_model.tflite", "wb").write(tflite_model)
-
-```
-For detailed instructions on how to use the Tensorflow Optimizing Converter, please see [here](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/toco/g3doc/cmdline_examples.md).
-
-You may refer to the [Ops compatibility guide](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/g3doc/tf_ops_compatibility.md) for troubleshooting help. If that doesn't help, please file an [issue](https://github.com/tensorflow/tensorflow/issues).
-
-If you would like to see a visual description of your TensorFlow Lite model after conversion, you can use tensorflow/contrib/lite/tools/visualize.py by running
-```sh
-bazel run tensorflow/contrib/lite/tools:visualize -- model.tflite model_viz.html
-```
-and then visualize the resulting HTML file in a browser.
-
-## Step 3. Use the TensorFlow Lite model for inference in a mobile app
-
-After completion of Step 2 the developer should have a .tflite model.
-
-### For Android
-Because Android apps need to be written in Java, and core TensorFlow is in C++, a JNI library is provided to interface between the two. Its interface is aimed only at inference, so it provides the ability to load a graph, set up inputs, and run the model to calculate particular outputs. The full documentation for the set of methods can be seen [here](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/lite/g3doc/). The demo app is also open sourced on [github](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/java/demo/app).
-
-The [demo app](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/java/demo/app)  uses this interface, so it's a good place to look for example usage. You can also download the prebuilt binary [here](http://download.tensorflow.org/deps/tflite/TfLiteCameraDemo.apk).
-
-Note that you'd need to follow instructions for installing TensorFlow on Android, setting up bazel and Android Studio outlined [here](https://www.tensorflow.org/mobile/android_build).
-
-### For iOS
-Follow the documentation [here](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/g3doc/ios.md) to get integrate a TFLite model into your app.
-
-## Core ML support
-
-Core ML is a machine learning framework used across Apple products. In addition to using Tensorflow Lite models directly in their applications, developers have the option to convert their trained Tensorflow models to the [CoreML](https://developer.apple.com/machine-learning/) format for use on Apple devices. For information on how to use the converter please refer to the [Tensorflow-CoreML converter documentation](https://github.com/tf-coreml/tf-coreml).
+See the documentation: https://www.tensorflow.org/mobile/tflite/
+Documentation edits can be made here: [tensorflow/docs_src/mobile/tflite](../../docs_src/mobile/tflite)
diff --git a/tensorflow/contrib/lite/allocation.cc b/tensorflow/contrib/lite/allocation.cc
index 4b322e027d48f4bf9f90d5b873c449d1ec31cc49..a4772731ecda92431c412672610a39c188dabf27 100644
--- a/tensorflow/contrib/lite/allocation.cc
+++ b/tensorflow/contrib/lite/allocation.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include <cstdarg>
 #include <cstdint>
 #include <cstring>
+#include <utility>
 
 #include "tensorflow/contrib/lite/allocation.h"
 #include "tensorflow/contrib/lite/context.h"
diff --git a/tensorflow/contrib/lite/arena_planner.cc b/tensorflow/contrib/lite/arena_planner.cc
index 8e47e2375e2e306c345a2b6caa2411abd9b3ceb0..4f836d367747e06de682b5764206d33f6e2fb983 100644
--- a/tensorflow/contrib/lite/arena_planner.cc
+++ b/tensorflow/contrib/lite/arena_planner.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 #include "tensorflow/contrib/lite/arena_planner.h"
+#include <utility>
 
 namespace tflite {
 
diff --git a/tensorflow/contrib/lite/arena_planner.h b/tensorflow/contrib/lite/arena_planner.h
index f84b3dad9550e789237c8e45971002c7d336b9d3..e9d0fbc5a9b5aec06e28da8757466b25f40da2f5 100644
--- a/tensorflow/contrib/lite/arena_planner.h
+++ b/tensorflow/contrib/lite/arena_planner.h
@@ -25,7 +25,7 @@ limitations under the License.
 
 namespace tflite {
 
-class AllocationInfo;
+struct AllocationInfo;
 
 // A memory planner that makes all the allocations using arenas.
 //
diff --git a/tensorflow/contrib/lite/build_def.bzl b/tensorflow/contrib/lite/build_def.bzl
index 2813d1c347163e67c70983d3dd49773f4a4b4544..b8f6b7fd59af9834edb4aa7aefa524c25ede66d2 100644
--- a/tensorflow/contrib/lite/build_def.bzl
+++ b/tensorflow/contrib/lite/build_def.bzl
@@ -200,8 +200,7 @@ def gen_zipped_test_files(name, files):
     native.genrule(
         name = name + "_" + f + ".files",
         cmd = ("$(locations :generate_examples) --toco $(locations %s) " % toco
-               + " --zip_to_output " + f +
-               " $(@D) zipped"),
+               + " --zip_to_output " + f + " $(@D)"),
         outs = [out_file],
         tools = [
             ":generate_examples",
diff --git a/tensorflow/contrib/lite/build_ios_universal_lib.sh b/tensorflow/contrib/lite/build_ios_universal_lib.sh
index 4a9023ff33de15dd384531d51e39de4ffeecdb8b..9f398f4a9f3dcafd7bd49fd5d95e9991b8b36b75 100755
--- a/tensorflow/contrib/lite/build_ios_universal_lib.sh
+++ b/tensorflow/contrib/lite/build_ios_universal_lib.sh
@@ -19,11 +19,16 @@ set -e
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 cd "$SCRIPT_DIR/../../.."
 
-make -f tensorflow/contrib/lite/Makefile TARGET=IOS IOS_ARCH=x86_64 -j 8
-make -f tensorflow/contrib/lite/Makefile TARGET=IOS IOS_ARCH=i386 -j 8
-make -f tensorflow/contrib/lite/Makefile TARGET=IOS IOS_ARCH=armv7 -j 8
-make -f tensorflow/contrib/lite/Makefile TARGET=IOS IOS_ARCH=armv7s -j 8
-make -f tensorflow/contrib/lite/Makefile TARGET=IOS IOS_ARCH=arm64 -j 8
+make -f tensorflow/contrib/lite/Makefile TARGET=IOS IOS_ARCH=x86_64 -j 8 \
+$SCRIPT_DIR/gen/lib/ios_x86_64/libtensorflow-lite.a
+make -f tensorflow/contrib/lite/Makefile TARGET=IOS IOS_ARCH=i386 -j 8 \
+$SCRIPT_DIR/gen/lib/ios_i386/libtensorflow-lite.a
+make -f tensorflow/contrib/lite/Makefile TARGET=IOS IOS_ARCH=armv7 -j 8 \
+$SCRIPT_DIR/gen/lib/ios_armv7/libtensorflow-lite.a
+make -f tensorflow/contrib/lite/Makefile TARGET=IOS IOS_ARCH=armv7s -j 8 \
+$SCRIPT_DIR/gen/lib/ios_armv7s/libtensorflow-lite.a
+make -f tensorflow/contrib/lite/Makefile TARGET=IOS IOS_ARCH=arm64 -j 8 \
+$SCRIPT_DIR/gen/lib/ios_arm64/libtensorflow-lite.a
 
 lipo \
 tensorflow/contrib/lite/gen/lib/ios_x86_64/libtensorflow-lite.a \
diff --git a/tensorflow/contrib/lite/builtin_op_data.h b/tensorflow/contrib/lite/builtin_op_data.h
index 5fc8954743e5b3b458e5c2004f4378cbad6056c0..f5fb2f15e3396cc664c0b4e7da1fea1e6a66d218 100644
--- a/tensorflow/contrib/lite/builtin_op_data.h
+++ b/tensorflow/contrib/lite/builtin_op_data.h
@@ -17,6 +17,8 @@ limitations under the License.
 
 #include <stdint.h>
 
+#include "tensorflow/contrib/lite/context.h"
+
 #ifdef __cplusplus
 extern "C" {
 #endif  // __cplusplus
@@ -174,6 +176,11 @@ typedef struct {
   int block_size;
 } TfLiteSpaceToDepthParams;
 
+typedef struct {
+  TfLiteType in_data_type;
+  TfLiteType out_data_type;
+} TfLiteCastParams;
+
 typedef enum {
   kTfLiteCombinerTypeSum = 0,
   kTfLiteCombinerTypeMean = 1,
@@ -214,6 +221,10 @@ typedef struct {
   int shrink_axis_mask;
 } TfLiteStridedSliceParams;
 
+typedef struct {
+  TfLiteType output_type;
+} TfLiteArgMaxParams;
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif  // __cplusplus
diff --git a/tensorflow/contrib/lite/builtin_ops.h b/tensorflow/contrib/lite/builtin_ops.h
index d7993e60cc77839b823e17ce11f8a57d3e0972db..1ceefafc5643ac1d7d2b94d222ea96894a2acce9 100644
--- a/tensorflow/contrib/lite/builtin_ops.h
+++ b/tensorflow/contrib/lite/builtin_ops.h
@@ -79,6 +79,9 @@ typedef enum {
   kTfLiteBuiltinBidirectionalSequenceLstm = 52,
   kTfLiteBuiltinCast = 53,
   kTfLiteBuiltinPrelu = 54,
+  kTfLiteBuiltinMaximum = 55,
+  kTfLiteBuiltinArgMax = 56,
+  kTfLiteBuiltinMinimum = 57,
 } TfLiteBuiltinOperator;
 
 #ifdef __cplusplus
diff --git a/tensorflow/contrib/lite/examples/android/AndroidManifest.xml b/tensorflow/contrib/lite/examples/android/AndroidManifest.xml
new file mode 100644
index 0000000000000000000000000000000000000000..bc9574d646b7661de8ac9b745bd53cbba1eb9f31
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/AndroidManifest.xml
@@ -0,0 +1,65 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<manifest xmlns:android="http://schemas.android.com/apk/res/android"
+    package="org.tensorflow.lite.demo">
+
+    <uses-permission android:name="android.permission.CAMERA" />
+    <uses-feature android:name="android.hardware.camera" />
+    <uses-feature android:name="android.hardware.camera.autofocus" />
+    <uses-permission android:name="android.permission.WRITE_EXTERNAL_STORAGE"/>
+    <uses-permission android:name="android.permission.RECORD_AUDIO" />
+
+    <uses-sdk
+        android:minSdkVersion="21"
+        android:targetSdkVersion="23" />
+
+    <application android:allowBackup="true"
+        android:debuggable="true"
+        android:label="@string/app_name"
+        android:icon="@drawable/ic_launcher"
+        android:theme="@style/MaterialTheme">
+
+        <activity android:name="org.tensorflow.demo.ClassifierActivity"
+                  android:screenOrientation="portrait"
+                  android:label="@string/activity_name_classification">
+            <intent-filter>
+                <action android:name="android.intent.action.MAIN" />
+                <category android:name="android.intent.category.LAUNCHER" />
+            </intent-filter>
+        </activity>
+
+        <activity android:name="org.tensorflow.demo.DetectorActivity"
+                  android:screenOrientation="portrait"
+                  android:label="@string/activity_name_detection">
+            <intent-filter>
+                <action android:name="android.intent.action.MAIN" />
+                <category android:name="android.intent.category.LAUNCHER" />
+            </intent-filter>
+        </activity>
+
+        <activity android:name="org.tensorflow.demo.SpeechActivity"
+            android:screenOrientation="portrait"
+            android:label="@string/activity_name_speech">
+            <intent-filter>
+                <action android:name="android.intent.action.MAIN" />
+                <category android:name="android.intent.category.LAUNCHER" />
+            </intent-filter>
+        </activity>
+    </application>
+
+</manifest>
diff --git a/tensorflow/contrib/lite/examples/android/BUILD b/tensorflow/contrib/lite/examples/android/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..49280129971e38247c2216d9422bc5de9176e13d
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/BUILD
@@ -0,0 +1,86 @@
+# Description:
+#   TensorFlow camera demo app for Android.
+
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+# Build the demo native demo lib from the original directory to reduce code
+# reuse. Note that the Java counterparts (ObjectTracker.java and
+# ImageUtils.java) are still duplicated.
+cc_library(
+    name = "tensorflow_native_libs",
+    srcs = [
+        "//tensorflow/examples/android:libtensorflow_demo.so",
+    ],
+    tags = [
+        "manual",
+        "notap",
+    ],
+)
+
+android_binary(
+    name = "tflite_demo",
+    srcs = glob([
+        "src/**/*.java",
+    ]),
+    # Package assets from assets dir as well as all model targets.
+    # Remove undesired models (and corresponding Activities in source)
+    # to reduce APK size.
+    assets = [
+        "//tensorflow/contrib/lite/examples/android/assets:labels_mobilenet_quant_v1_224.txt",
+        "@tflite_mobilenet//:mobilenet_quant_v1_224.tflite",
+        "@tflite_conv_actions_frozen//:conv_actions_frozen.tflite",
+        "//tensorflow/contrib/lite/examples/android/assets:conv_actions_labels.txt",
+        "@tflite_mobilenet_ssd//:mobilenet_ssd.tflite",
+        "//tensorflow/contrib/lite/examples/android/assets:box_priors.txt",
+        "//tensorflow/contrib/lite/examples/android/assets:coco_labels_list.txt",
+    ],
+    assets_dir = "",
+    custom_package = "org.tensorflow.lite.demo",
+    inline_constants = 1,
+    manifest = "AndroidManifest.xml",
+    manifest_merger = "android",
+    nocompress_extensions = [
+        ".tflite",
+    ],
+    resource_files = glob(["res/**"]),
+    tags = [
+        "manual",
+        "notap",
+    ],
+    deps = [
+        ":tensorflow_native_libs",
+        "//tensorflow/contrib/lite/java:tensorflowlite",
+    ],
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+            "bin/**",
+            "gen/**",
+            "gradleBuild/**",
+            "libs/**",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
+
+filegroup(
+    name = "java_files",
+    srcs = glob(["src/**/*.java"]),
+)
+
+filegroup(
+    name = "resource_files",
+    srcs = glob(["res/**"]),
+)
+
+exports_files(["AndroidManifest.xml"])
diff --git a/tensorflow/contrib/lite/examples/android/assets/BUILD b/tensorflow/contrib/lite/examples/android/assets/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..dd0cd6c98ff878e9c41875cab74c12191cadb173
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/assets/BUILD
@@ -0,0 +1,24 @@
+package(default_visibility = ["//visibility:private"])
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(
+    glob(
+        ["**/*"],
+        exclude = [
+            "BUILD",
+        ],
+    ),
+)
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
diff --git a/tensorflow/contrib/lite/examples/android/assets/box_priors.txt b/tensorflow/contrib/lite/examples/android/assets/box_priors.txt
new file mode 100644
index 0000000000000000000000000000000000000000..7246b073fe7fd8b1d1340536457c8aeac24cd5a3
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/assets/box_priors.txt
@@ -0,0 +1,5 @@
+        0.02631579 0.02631579 0.026315793 0.02631579 0.02631579 0.026315793 0.02631579 0.02631579 0.026315793 0.02631579 0.02631579 0.026315793 0.02631579 0.02631579 0.026315793 0.02631579 0.02631579 0.026315793 0.02631579 0.02631579 0.026315793 0.02631579 0.02631579 0.026315793 0.02631579 0.02631579 0.026315793 0.02631579 0.02631579 0.026315793 0.02631579 0.02631579 0.026315793 0.02631579 0.02631579 0.026315793 0.02631579 0.02631579 0.026315793 0.02631579 0.02631579 0.026315793 0.02631579 0.02631579 0.026315793 0.02631579 0.02631579 0.026315793 0.02631579 0.02631579 0.026315793 0.02631579 0.02631579 0.026315793 0.02631579 0.02631579 0.026315793 0.078947365 0.07894737 0.078947365 0.078947365 0.07894737 0.078947365 0.078947365 0.07894737 0.078947365 0.078947365 0.07894737 0.078947365 0.078947365 0.07894737 0.078947365 0.078947365 0.07894737 0.078947365 0.078947365 0.07894737 0.078947365 0.078947365 0.07894737 0.078947365 0.078947365 0.07894737 0.078947365 0.078947365 0.07894737 0.078947365 0.078947365 0.07894737 0.078947365 0.078947365 0.07894737 0.078947365 0.078947365 0.07894737 0.078947365 0.078947365 0.07894737 0.078947365 0.078947365 0.07894737 0.078947365 0.078947365 0.07894737 0.078947365 0.078947365 0.07894737 0.078947365 0.078947365 0.07894737 0.078947365 0.078947365 0.07894737 0.078947365 0.13157895 0.13157895 0.13157894 0.13157895 0.13157895 0.13157894 0.13157895 0.13157895 0.13157894 0.13157895 0.13157895 0.13157894 0.13157895 0.13157895 0.13157894 0.13157895 0.13157895 0.13157894 0.13157895 0.13157895 0.13157894 0.13157895 0.13157895 0.13157894 0.13157895 0.13157895 0.13157894 0.13157895 0.13157895 0.13157894 0.13157895 0.13157895 0.13157894 0.13157895 0.13157895 0.13157894 0.13157895 0.13157895 0.13157894 0.13157895 0.13157895 0.13157894 0.13157895 0.13157895 0.13157894 0.13157895 0.13157895 0.13157894 0.13157895 0.13157895 0.13157894 0.13157895 0.13157895 0.13157894 0.13157895 0.13157895 0.13157894 0.18421052 0.18421051 0.18421052 0.18421052 0.18421051 0.18421052 0.18421052 0.18421051 0.18421052 0.18421052 0.18421051 0.18421052 0.18421052 0.18421051 0.18421052 0.18421052 0.18421051 0.18421052 0.18421052 0.18421051 0.18421052 0.18421052 0.18421051 0.18421052 0.18421052 0.18421051 0.18421052 0.18421052 0.18421051 0.18421052 0.18421052 0.18421051 0.18421052 0.18421052 0.18421051 0.18421052 0.18421052 0.18421051 0.18421052 0.18421052 0.18421051 0.18421052 0.18421052 0.18421051 0.18421052 0.18421052 0.18421051 0.18421052 0.18421052 0.18421051 0.18421052 0.18421052 0.18421051 0.18421052 0.18421052 0.18421051 0.18421052 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.23684211 0.28947368 0.28947368 0.28947365 0.28947368 0.28947368 0.28947365 0.28947368 0.28947368 0.28947365 0.28947368 0.28947368 0.28947365 0.28947368 0.28947368 0.28947365 0.28947368 0.28947368 0.28947365 0.28947368 0.28947368 0.28947365 0.28947368 0.28947368 0.28947365 0.28947368 0.28947368 0.28947365 0.28947368 0.28947368 0.28947365 0.28947368 0.28947368 0.28947365 0.28947368 0.28947368 0.28947365 0.28947368 0.28947368 0.28947365 0.28947368 0.28947368 0.28947365 0.28947368 0.28947368 0.28947365 0.28947368 0.28947368 0.28947365 0.28947368 0.28947368 0.28947365 0.28947368 0.28947368 0.28947365 0.28947368 0.28947368 0.28947365 0.34210524 0.34210524 0.3421052 0.34210524 0.34210524 0.3421052 0.34210524 0.34210524 0.3421052 0.34210524 0.34210524 0.3421052 0.34210524 0.34210524 0.3421052 0.34210524 0.34210524 0.3421052 0.34210524 0.34210524 0.3421052 0.34210524 0.34210524 0.3421052 0.34210524 0.34210524 0.3421052 0.34210524 0.34210524 0.3421052 0.34210524 0.34210524 0.3421052 0.34210524 0.34210524 0.3421052 0.34210524 0.34210524 0.3421052 0.34210524 0.34210524 0.3421052 0.34210524 0.34210524 0.3421052 0.34210524 0.34210524 0.3421052 0.34210524 0.34210524 0.3421052 0.34210524 0.34210524 0.3421052 0.34210524 0.34210524 0.3421052 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.39473683 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.4473684 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.5526316 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.6052632 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.65789473 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.71052635 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.7631579 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8157895 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.8684211 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.92105263 0.97368425 0.9736843 0.97368425 0.97368425 0.9736843 0.97368425 0.97368425 0.9736843 0.97368425 0.97368425 0.9736843 0.97368425 0.97368425 0.9736843 0.97368425 0.97368425 0.9736843 0.97368425 0.97368425 0.9736843 0.97368425 0.97368425 0.9736843 0.97368425 0.97368425 0.9736843 0.97368425 0.97368425 0.9736843 0.97368425 0.97368425 0.9736843 0.97368425 0.97368425 0.9736843 0.97368425 0.97368425 0.9736843 0.97368425 0.97368425 0.9736843 0.97368425 0.97368425 0.9736843 0.97368425 0.97368425 0.9736843 0.97368425 0.97368425 0.9736843 0.97368425 0.97368425 0.9736843 0.97368425 0.97368425 0.9736843 0.97368425 0.049999997 0.049999997 0.049999997 0.05 0.050000012 0.049999997 0.049999997 0.049999997 0.049999997 0.05 0.050000012 0.049999997 0.049999997 0.049999997 0.049999997 0.05 0.050000012 0.049999997 0.049999997 0.049999997 0.049999997 0.05 0.050000012 0.049999997 0.049999997 0.049999997 0.049999997 0.05 0.050000012 0.049999997 0.049999997 0.049999997 0.049999997 0.05 0.050000012 0.049999997 0.049999997 0.049999997 0.049999997 0.05 0.050000012 0.049999997 0.049999997 0.049999997 0.049999997 0.05 0.050000012 0.049999997 0.049999997 0.049999997 0.049999997 0.05 0.050000012 0.049999997 0.049999997 0.049999997 0.049999997 0.05 0.050000012 0.049999997 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.15 0.25 0.25 0.25 0.25 0.25000003 0.25 0.25 0.25 0.25 0.25 0.25000003 0.25 0.25 0.25 0.25 0.25 0.25000003 0.25 0.25 0.25 0.25 0.25 0.25000003 0.25 0.25 0.25 0.25 0.25 0.25000003 0.25 0.25 0.25 0.25 0.25 0.25000003 0.25 0.25 0.25 0.25 0.25 0.25000003 0.25 0.25 0.25 0.25 0.25 0.25000003 0.25 0.25 0.25 0.25 0.25 0.25000003 0.25 0.25 0.25 0.25 0.25 0.25000003 0.25 0.35000002 0.35000002 0.35000002 0.35000002 0.35000005 0.35000002 0.35000002 0.35000002 0.35000002 0.35000002 0.35000005 0.35000002 0.35000002 0.35000002 0.35000002 0.35000002 0.35000005 0.35000002 0.35000002 0.35000002 0.35000002 0.35000002 0.35000005 0.35000002 0.35000002 0.35000002 0.35000002 0.35000002 0.35000005 0.35000002 0.35000002 0.35000002 0.35000002 0.35000002 0.35000005 0.35000002 0.35000002 0.35000002 0.35000002 0.35000002 0.35000005 0.35000002 0.35000002 0.35000002 0.35000002 0.35000002 0.35000005 0.35000002 0.35000002 0.35000002 0.35000002 0.35000002 0.35000005 0.35000002 0.35000002 0.35000002 0.35000002 0.35000002 0.35000005 0.35000002 0.45 0.45000002 0.45000002 0.45000002 0.45000002 0.45000002 0.45 0.45000002 0.45000002 0.45000002 0.45000002 0.45000002 0.45 0.45000002 0.45000002 0.45000002 0.45000002 0.45000002 0.45 0.45000002 0.45000002 0.45000002 0.45000002 0.45000002 0.45 0.45000002 0.45000002 0.45000002 0.45000002 0.45000002 0.45 0.45000002 0.45000002 0.45000002 0.45000002 0.45000002 0.45 0.45000002 0.45000002 0.45000002 0.45000002 0.45000002 0.45 0.45000002 0.45000002 0.45000002 0.45000002 0.45000002 0.45 0.45000002 0.45000002 0.45000002 0.45000002 0.45000002 0.45 0.45000002 0.45000002 0.45000002 0.45000002 0.45000002 0.55 0.55 0.55 0.55 0.54999995 0.55 0.55 0.55 0.55 0.55 0.54999995 0.55 0.55 0.55 0.55 0.55 0.54999995 0.55 0.55 0.55 0.55 0.55 0.54999995 0.55 0.55 0.55 0.55 0.55 0.54999995 0.55 0.55 0.55 0.55 0.55 0.54999995 0.55 0.55 0.55 0.55 0.55 0.54999995 0.55 0.55 0.55 0.55 0.55 0.54999995 0.55 0.55 0.55 0.55 0.55 0.54999995 0.55 0.55 0.55 0.55 0.55 0.54999995 0.55 0.65000004 0.65000004 0.65000004 0.65000004 0.65 0.65000004 0.65000004 0.65000004 0.65000004 0.65000004 0.65 0.65000004 0.65000004 0.65000004 0.65000004 0.65000004 0.65 0.65000004 0.65000004 0.65000004 0.65000004 0.65000004 0.65 0.65000004 0.65000004 0.65000004 0.65000004 0.65000004 0.65 0.65000004 0.65000004 0.65000004 0.65000004 0.65000004 0.65 0.65000004 0.65000004 0.65000004 0.65000004 0.65000004 0.65 0.65000004 0.65000004 0.65000004 0.65000004 0.65000004 0.65 0.65000004 0.65000004 0.65000004 0.65000004 0.65000004 0.65 0.65000004 0.65000004 0.65000004 0.65000004 0.65000004 0.65 0.65000004 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.75 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.85 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.099999994 0.1 0.099999994 0.1 0.099999994 0.099999994 0.099999994 0.1 0.099999994 0.1 0.099999994 0.099999994 0.099999994 0.1 0.099999994 0.1 0.099999994 0.099999994 0.099999994 0.1 0.099999994 0.1 0.099999994 0.099999994 0.099999994 0.1 0.099999994 0.1 0.099999994 0.099999994 0.30000004 0.3 0.3 0.3 0.3 0.30000004 0.30000004 0.3 0.3 0.3 0.3 0.30000004 0.30000004 0.3 0.3 0.3 0.3 0.30000004 0.30000004 0.3 0.3 0.3 0.3 0.30000004 0.30000004 0.3 0.3 0.3 0.3 0.30000004 0.49999997 0.5 0.5 0.5 0.5 0.49999997 0.49999997 0.5 0.5 0.5 0.5 0.49999997 0.49999997 0.5 0.5 0.5 0.5 0.49999997 0.49999997 0.5 0.5 0.5 0.5 0.49999997 0.49999997 0.5 0.5 0.5 0.5 0.49999997 0.70000005 0.70000005 0.70000005 0.70000005 0.70000005 0.70000005 0.70000005 0.70000005 0.70000005 0.70000005 0.70000005 0.70000005 0.70000005 0.70000005 0.70000005 0.70000005 0.70000005 0.70000005 0.70000005 0.70000005 0.70000005 0.70000005 0.70000005 0.70000005 0.70000005 0.70000005 0.70000005 0.70000005 0.70000005 0.70000005 0.90000004 0.90000004 0.9 0.90000004 0.90000004 0.90000004 0.90000004 0.90000004 0.9 0.90000004 0.90000004 0.90000004 0.90000004 0.90000004 0.9 0.90000004 0.90000004 0.90000004 0.90000004 0.90000004 0.9 0.90000004 0.90000004 0.90000004 0.90000004 0.90000004 0.9 0.90000004 0.90000004 0.90000004 0.16666667 0.16666667 0.16666666 0.16666667 0.16666669 0.16666667 0.16666667 0.16666667 0.16666666 0.16666667 0.16666669 0.16666667 0.16666667 0.16666667 0.16666666 0.16666667 0.16666669 0.16666667 0.5 0.5 0.49999997 0.5 0.5 0.5 0.5 0.5 0.49999997 0.5 0.5 0.5 0.5 0.5 0.49999997 0.5 0.5 0.5 0.8333334 0.8333334 0.8333334 0.8333334 0.8333334 0.8333334 0.8333334 0.8333334 0.8333334 0.8333334 0.8333334 0.8333334 0.8333334 0.8333334 0.8333334 0.8333334 0.8333334 0.8333334 0.25 0.25 0.25 0.24999999 0.25 0.25 0.25 0.25 0.25 0.24999999 0.25 0.25 0.75 0.75 0.75 0.75 0.74999994 0.75 0.75 0.75 0.75 0.75 0.74999994 0.75 0.5 0.5 0.5 0.5 0.5 0.5 
+        0.02631579 0.026315793 0.02631579 0.078947365 0.078947365 0.07894737 0.13157895 0.13157894 0.13157895 0.18421052 0.18421052 0.18421051 0.23684211 0.23684211 0.23684211 0.28947368 0.28947365 0.28947368 0.34210524 0.3421052 0.34210524 0.39473683 0.39473683 0.39473683 0.4473684 0.4473684 0.4473684 0.5 0.5 0.5 0.5526316 0.5526316 0.5526316 0.6052632 0.6052632 0.6052632 0.65789473 0.65789473 0.65789473 0.71052635 0.71052635 0.71052635 0.7631579 0.7631579 0.7631579 0.8157895 0.8157895 0.8157895 0.8684211 0.8684211 0.8684211 0.92105263 0.92105263 0.92105263 0.97368425 0.97368425 0.9736843 0.02631579 0.026315793 0.02631579 0.078947365 0.078947365 0.07894737 0.13157895 0.13157894 0.13157895 0.18421052 0.18421052 0.18421051 0.23684211 0.23684211 0.23684211 0.28947368 0.28947365 0.28947368 0.34210524 0.3421052 0.34210524 0.39473683 0.39473683 0.39473683 0.4473684 0.4473684 0.4473684 0.5 0.5 0.5 0.5526316 0.5526316 0.5526316 0.6052632 0.6052632 0.6052632 0.65789473 0.65789473 0.65789473 0.71052635 0.71052635 0.71052635 0.7631579 0.7631579 0.7631579 0.8157895 0.8157895 0.8157895 0.8684211 0.8684211 0.8684211 0.92105263 0.92105263 0.92105263 0.97368425 0.97368425 0.9736843 0.02631579 0.026315793 0.02631579 0.078947365 0.078947365 0.07894737 0.13157895 0.13157894 0.13157895 0.18421052 0.18421052 0.18421051 0.23684211 0.23684211 0.23684211 0.28947368 0.28947365 0.28947368 0.34210524 0.3421052 0.34210524 0.39473683 0.39473683 0.39473683 0.4473684 0.4473684 0.4473684 0.5 0.5 0.5 0.5526316 0.5526316 0.5526316 0.6052632 0.6052632 0.6052632 0.65789473 0.65789473 0.65789473 0.71052635 0.71052635 0.71052635 0.7631579 0.7631579 0.7631579 0.8157895 0.8157895 0.8157895 0.8684211 0.8684211 0.8684211 0.92105263 0.92105263 0.92105263 0.97368425 0.97368425 0.9736843 0.02631579 0.026315793 0.02631579 0.078947365 0.078947365 0.07894737 0.13157895 0.13157894 0.13157895 0.18421052 0.18421052 0.18421051 0.23684211 0.23684211 0.23684211 0.28947368 0.28947365 0.28947368 0.34210524 0.3421052 0.34210524 0.39473683 0.39473683 0.39473683 0.4473684 0.4473684 0.4473684 0.5 0.5 0.5 0.5526316 0.5526316 0.5526316 0.6052632 0.6052632 0.6052632 0.65789473 0.65789473 0.65789473 0.71052635 0.71052635 0.71052635 0.7631579 0.7631579 0.7631579 0.8157895 0.8157895 0.8157895 0.8684211 0.8684211 0.8684211 0.92105263 0.92105263 0.92105263 0.97368425 0.97368425 0.9736843 0.02631579 0.026315793 0.02631579 0.078947365 0.078947365 0.07894737 0.13157895 0.13157894 0.13157895 0.18421052 0.18421052 0.18421051 0.23684211 0.23684211 0.23684211 0.28947368 0.28947365 0.28947368 0.34210524 0.3421052 0.34210524 0.39473683 0.39473683 0.39473683 0.4473684 0.4473684 0.4473684 0.5 0.5 0.5 0.5526316 0.5526316 0.5526316 0.6052632 0.6052632 0.6052632 0.65789473 0.65789473 0.65789473 0.71052635 0.71052635 0.71052635 0.7631579 0.7631579 0.7631579 0.8157895 0.8157895 0.8157895 0.8684211 0.8684211 0.8684211 0.92105263 0.92105263 0.92105263 0.97368425 0.97368425 0.9736843 0.02631579 0.026315793 0.02631579 0.078947365 0.078947365 0.07894737 0.13157895 0.13157894 0.13157895 0.18421052 0.18421052 0.18421051 0.23684211 0.23684211 0.23684211 0.28947368 0.28947365 0.28947368 0.34210524 0.3421052 0.34210524 0.39473683 0.39473683 0.39473683 0.4473684 0.4473684 0.4473684 0.5 0.5 0.5 0.5526316 0.5526316 0.5526316 0.6052632 0.6052632 0.6052632 0.65789473 0.65789473 0.65789473 0.71052635 0.71052635 0.71052635 0.7631579 0.7631579 0.7631579 0.8157895 0.8157895 0.8157895 0.8684211 0.8684211 0.8684211 0.92105263 0.92105263 0.92105263 0.97368425 0.97368425 0.9736843 0.02631579 0.026315793 0.02631579 0.078947365 0.078947365 0.07894737 0.13157895 0.13157894 0.13157895 0.18421052 0.18421052 0.18421051 0.23684211 0.23684211 0.23684211 0.28947368 0.28947365 0.28947368 0.34210524 0.3421052 0.34210524 0.39473683 0.39473683 0.39473683 0.4473684 0.4473684 0.4473684 0.5 0.5 0.5 0.5526316 0.5526316 0.5526316 0.6052632 0.6052632 0.6052632 0.65789473 0.65789473 0.65789473 0.71052635 0.71052635 0.71052635 0.7631579 0.7631579 0.7631579 0.8157895 0.8157895 0.8157895 0.8684211 0.8684211 0.8684211 0.92105263 0.92105263 0.92105263 0.97368425 0.97368425 0.9736843 0.02631579 0.026315793 0.02631579 0.078947365 0.078947365 0.07894737 0.13157895 0.13157894 0.13157895 0.18421052 0.18421052 0.18421051 0.23684211 0.23684211 0.23684211 0.28947368 0.28947365 0.28947368 0.34210524 0.3421052 0.34210524 0.39473683 0.39473683 0.39473683 0.4473684 0.4473684 0.4473684 0.5 0.5 0.5 0.5526316 0.5526316 0.5526316 0.6052632 0.6052632 0.6052632 0.65789473 0.65789473 0.65789473 0.71052635 0.71052635 0.71052635 0.7631579 0.7631579 0.7631579 0.8157895 0.8157895 0.8157895 0.8684211 0.8684211 0.8684211 0.92105263 0.92105263 0.92105263 0.97368425 0.97368425 0.9736843 0.02631579 0.026315793 0.02631579 0.078947365 0.078947365 0.07894737 0.13157895 0.13157894 0.13157895 0.18421052 0.18421052 0.18421051 0.23684211 0.23684211 0.23684211 0.28947368 0.28947365 0.28947368 0.34210524 0.3421052 0.34210524 0.39473683 0.39473683 0.39473683 0.4473684 0.4473684 0.4473684 0.5 0.5 0.5 0.5526316 0.5526316 0.5526316 0.6052632 0.6052632 0.6052632 0.65789473 0.65789473 0.65789473 0.71052635 0.71052635 0.71052635 0.7631579 0.7631579 0.7631579 0.8157895 0.8157895 0.8157895 0.8684211 0.8684211 0.8684211 0.92105263 0.92105263 0.92105263 0.97368425 0.97368425 0.9736843 0.02631579 0.026315793 0.02631579 0.078947365 0.078947365 0.07894737 0.13157895 0.13157894 0.13157895 0.18421052 0.18421052 0.18421051 0.23684211 0.23684211 0.23684211 0.28947368 0.28947365 0.28947368 0.34210524 0.3421052 0.34210524 0.39473683 0.39473683 0.39473683 0.4473684 0.4473684 0.4473684 0.5 0.5 0.5 0.5526316 0.5526316 0.5526316 0.6052632 0.6052632 0.6052632 0.65789473 0.65789473 0.65789473 0.71052635 0.71052635 0.71052635 0.7631579 0.7631579 0.7631579 0.8157895 0.8157895 0.8157895 0.8684211 0.8684211 0.8684211 0.92105263 0.92105263 0.92105263 0.97368425 0.97368425 0.9736843 0.02631579 0.026315793 0.02631579 0.078947365 0.078947365 0.07894737 0.13157895 0.13157894 0.13157895 0.18421052 0.18421052 0.18421051 0.23684211 0.23684211 0.23684211 0.28947368 0.28947365 0.28947368 0.34210524 0.3421052 0.34210524 0.39473683 0.39473683 0.39473683 0.4473684 0.4473684 0.4473684 0.5 0.5 0.5 0.5526316 0.5526316 0.5526316 0.6052632 0.6052632 0.6052632 0.65789473 0.65789473 0.65789473 0.71052635 0.71052635 0.71052635 0.7631579 0.7631579 0.7631579 0.8157895 0.8157895 0.8157895 0.8684211 0.8684211 0.8684211 0.92105263 0.92105263 0.92105263 0.97368425 0.97368425 0.9736843 0.02631579 0.026315793 0.02631579 0.078947365 0.078947365 0.07894737 0.13157895 0.13157894 0.13157895 0.18421052 0.18421052 0.18421051 0.23684211 0.23684211 0.23684211 0.28947368 0.28947365 0.28947368 0.34210524 0.3421052 0.34210524 0.39473683 0.39473683 0.39473683 0.4473684 0.4473684 0.4473684 0.5 0.5 0.5 0.5526316 0.5526316 0.5526316 0.6052632 0.6052632 0.6052632 0.65789473 0.65789473 0.65789473 0.71052635 0.71052635 0.71052635 0.7631579 0.7631579 0.7631579 0.8157895 0.8157895 0.8157895 0.8684211 0.8684211 0.8684211 0.92105263 0.92105263 0.92105263 0.97368425 0.97368425 0.9736843 0.02631579 0.026315793 0.02631579 0.078947365 0.078947365 0.07894737 0.13157895 0.13157894 0.13157895 0.18421052 0.18421052 0.18421051 0.23684211 0.23684211 0.23684211 0.28947368 0.28947365 0.28947368 0.34210524 0.3421052 0.34210524 0.39473683 0.39473683 0.39473683 0.4473684 0.4473684 0.4473684 0.5 0.5 0.5 0.5526316 0.5526316 0.5526316 0.6052632 0.6052632 0.6052632 0.65789473 0.65789473 0.65789473 0.71052635 0.71052635 0.71052635 0.7631579 0.7631579 0.7631579 0.8157895 0.8157895 0.8157895 0.8684211 0.8684211 0.8684211 0.92105263 0.92105263 0.92105263 0.97368425 0.97368425 0.9736843 0.02631579 0.026315793 0.02631579 0.078947365 0.078947365 0.07894737 0.13157895 0.13157894 0.13157895 0.18421052 0.18421052 0.18421051 0.23684211 0.23684211 0.23684211 0.28947368 0.28947365 0.28947368 0.34210524 0.3421052 0.34210524 0.39473683 0.39473683 0.39473683 0.4473684 0.4473684 0.4473684 0.5 0.5 0.5 0.5526316 0.5526316 0.5526316 0.6052632 0.6052632 0.6052632 0.65789473 0.65789473 0.65789473 0.71052635 0.71052635 0.71052635 0.7631579 0.7631579 0.7631579 0.8157895 0.8157895 0.8157895 0.8684211 0.8684211 0.8684211 0.92105263 0.92105263 0.92105263 0.97368425 0.97368425 0.9736843 0.02631579 0.026315793 0.02631579 0.078947365 0.078947365 0.07894737 0.13157895 0.13157894 0.13157895 0.18421052 0.18421052 0.18421051 0.23684211 0.23684211 0.23684211 0.28947368 0.28947365 0.28947368 0.34210524 0.3421052 0.34210524 0.39473683 0.39473683 0.39473683 0.4473684 0.4473684 0.4473684 0.5 0.5 0.5 0.5526316 0.5526316 0.5526316 0.6052632 0.6052632 0.6052632 0.65789473 0.65789473 0.65789473 0.71052635 0.71052635 0.71052635 0.7631579 0.7631579 0.7631579 0.8157895 0.8157895 0.8157895 0.8684211 0.8684211 0.8684211 0.92105263 0.92105263 0.92105263 0.97368425 0.97368425 0.9736843 0.02631579 0.026315793 0.02631579 0.078947365 0.078947365 0.07894737 0.13157895 0.13157894 0.13157895 0.18421052 0.18421052 0.18421051 0.23684211 0.23684211 0.23684211 0.28947368 0.28947365 0.28947368 0.34210524 0.3421052 0.34210524 0.39473683 0.39473683 0.39473683 0.4473684 0.4473684 0.4473684 0.5 0.5 0.5 0.5526316 0.5526316 0.5526316 0.6052632 0.6052632 0.6052632 0.65789473 0.65789473 0.65789473 0.71052635 0.71052635 0.71052635 0.7631579 0.7631579 0.7631579 0.8157895 0.8157895 0.8157895 0.8684211 0.8684211 0.8684211 0.92105263 0.92105263 0.92105263 0.97368425 0.97368425 0.9736843 0.02631579 0.026315793 0.02631579 0.078947365 0.078947365 0.07894737 0.13157895 0.13157894 0.13157895 0.18421052 0.18421052 0.18421051 0.23684211 0.23684211 0.23684211 0.28947368 0.28947365 0.28947368 0.34210524 0.3421052 0.34210524 0.39473683 0.39473683 0.39473683 0.4473684 0.4473684 0.4473684 0.5 0.5 0.5 0.5526316 0.5526316 0.5526316 0.6052632 0.6052632 0.6052632 0.65789473 0.65789473 0.65789473 0.71052635 0.71052635 0.71052635 0.7631579 0.7631579 0.7631579 0.8157895 0.8157895 0.8157895 0.8684211 0.8684211 0.8684211 0.92105263 0.92105263 0.92105263 0.97368425 0.97368425 0.9736843 0.02631579 0.026315793 0.02631579 0.078947365 0.078947365 0.07894737 0.13157895 0.13157894 0.13157895 0.18421052 0.18421052 0.18421051 0.23684211 0.23684211 0.23684211 0.28947368 0.28947365 0.28947368 0.34210524 0.3421052 0.34210524 0.39473683 0.39473683 0.39473683 0.4473684 0.4473684 0.4473684 0.5 0.5 0.5 0.5526316 0.5526316 0.5526316 0.6052632 0.6052632 0.6052632 0.65789473 0.65789473 0.65789473 0.71052635 0.71052635 0.71052635 0.7631579 0.7631579 0.7631579 0.8157895 0.8157895 0.8157895 0.8684211 0.8684211 0.8684211 0.92105263 0.92105263 0.92105263 0.97368425 0.97368425 0.9736843 0.02631579 0.026315793 0.02631579 0.078947365 0.078947365 0.07894737 0.13157895 0.13157894 0.13157895 0.18421052 0.18421052 0.18421051 0.23684211 0.23684211 0.23684211 0.28947368 0.28947365 0.28947368 0.34210524 0.3421052 0.34210524 0.39473683 0.39473683 0.39473683 0.4473684 0.4473684 0.4473684 0.5 0.5 0.5 0.5526316 0.5526316 0.5526316 0.6052632 0.6052632 0.6052632 0.65789473 0.65789473 0.65789473 0.71052635 0.71052635 0.71052635 0.7631579 0.7631579 0.7631579 0.8157895 0.8157895 0.8157895 0.8684211 0.8684211 0.8684211 0.92105263 0.92105263 0.92105263 0.97368425 0.97368425 0.9736843 0.049999997 0.049999997 0.050000004 0.050000012 0.05 0.049999997 0.15 0.14999999 0.15 0.15 0.15 0.15 0.25 0.25 0.25 0.25 0.25 0.25 0.35000002 0.35000002 0.35000002 0.35000002 0.35000002 0.35000002 0.45000002 0.45 0.45000002 0.45000002 0.45 0.45000002 0.55 0.55 0.55 0.55 0.55 0.55 0.65000004 0.65000004 0.65000004 0.65000004 0.65000004 0.65000004 0.75 0.75 0.75 0.75 0.75 0.75 0.85 0.85 0.85 0.85 0.85 0.85 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.049999997 0.049999997 0.050000004 0.050000012 0.05 0.049999997 0.15 0.14999999 0.15 0.15 0.15 0.15 0.25 0.25 0.25 0.25 0.25 0.25 0.35000002 0.35000002 0.35000002 0.35000002 0.35000002 0.35000002 0.45000002 0.45 0.45000002 0.45000002 0.45 0.45000002 0.55 0.55 0.55 0.55 0.55 0.55 0.65000004 0.65000004 0.65000004 0.65000004 0.65000004 0.65000004 0.75 0.75 0.75 0.75 0.75 0.75 0.85 0.85 0.85 0.85 0.85 0.85 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.049999997 0.049999997 0.050000004 0.050000012 0.05 0.049999997 0.15 0.14999999 0.15 0.15 0.15 0.15 0.25 0.25 0.25 0.25 0.25 0.25 0.35000002 0.35000002 0.35000002 0.35000002 0.35000002 0.35000002 0.45000002 0.45 0.45000002 0.45000002 0.45 0.45000002 0.55 0.55 0.55 0.55 0.55 0.55 0.65000004 0.65000004 0.65000004 0.65000004 0.65000004 0.65000004 0.75 0.75 0.75 0.75 0.75 0.75 0.85 0.85 0.85 0.85 0.85 0.85 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.049999997 0.049999997 0.050000004 0.050000012 0.05 0.049999997 0.15 0.14999999 0.15 0.15 0.15 0.15 0.25 0.25 0.25 0.25 0.25 0.25 0.35000002 0.35000002 0.35000002 0.35000002 0.35000002 0.35000002 0.45000002 0.45 0.45000002 0.45000002 0.45 0.45000002 0.55 0.55 0.55 0.55 0.55 0.55 0.65000004 0.65000004 0.65000004 0.65000004 0.65000004 0.65000004 0.75 0.75 0.75 0.75 0.75 0.75 0.85 0.85 0.85 0.85 0.85 0.85 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.049999997 0.049999997 0.050000004 0.050000012 0.05 0.049999997 0.15 0.14999999 0.15 0.15 0.15 0.15 0.25 0.25 0.25 0.25 0.25 0.25 0.35000002 0.35000002 0.35000002 0.35000002 0.35000002 0.35000002 0.45000002 0.45 0.45000002 0.45000002 0.45 0.45000002 0.55 0.55 0.55 0.55 0.55 0.55 0.65000004 0.65000004 0.65000004 0.65000004 0.65000004 0.65000004 0.75 0.75 0.75 0.75 0.75 0.75 0.85 0.85 0.85 0.85 0.85 0.85 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.049999997 0.049999997 0.050000004 0.050000012 0.05 0.049999997 0.15 0.14999999 0.15 0.15 0.15 0.15 0.25 0.25 0.25 0.25 0.25 0.25 0.35000002 0.35000002 0.35000002 0.35000002 0.35000002 0.35000002 0.45000002 0.45 0.45000002 0.45000002 0.45 0.45000002 0.55 0.55 0.55 0.55 0.55 0.55 0.65000004 0.65000004 0.65000004 0.65000004 0.65000004 0.65000004 0.75 0.75 0.75 0.75 0.75 0.75 0.85 0.85 0.85 0.85 0.85 0.85 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.049999997 0.049999997 0.050000004 0.050000012 0.05 0.049999997 0.15 0.14999999 0.15 0.15 0.15 0.15 0.25 0.25 0.25 0.25 0.25 0.25 0.35000002 0.35000002 0.35000002 0.35000002 0.35000002 0.35000002 0.45000002 0.45 0.45000002 0.45000002 0.45 0.45000002 0.55 0.55 0.55 0.55 0.55 0.55 0.65000004 0.65000004 0.65000004 0.65000004 0.65000004 0.65000004 0.75 0.75 0.75 0.75 0.75 0.75 0.85 0.85 0.85 0.85 0.85 0.85 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.049999997 0.049999997 0.050000004 0.050000012 0.05 0.049999997 0.15 0.14999999 0.15 0.15 0.15 0.15 0.25 0.25 0.25 0.25 0.25 0.25 0.35000002 0.35000002 0.35000002 0.35000002 0.35000002 0.35000002 0.45000002 0.45 0.45000002 0.45000002 0.45 0.45000002 0.55 0.55 0.55 0.55 0.55 0.55 0.65000004 0.65000004 0.65000004 0.65000004 0.65000004 0.65000004 0.75 0.75 0.75 0.75 0.75 0.75 0.85 0.85 0.85 0.85 0.85 0.85 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.049999997 0.049999997 0.050000004 0.050000012 0.05 0.049999997 0.15 0.14999999 0.15 0.15 0.15 0.15 0.25 0.25 0.25 0.25 0.25 0.25 0.35000002 0.35000002 0.35000002 0.35000002 0.35000002 0.35000002 0.45000002 0.45 0.45000002 0.45000002 0.45 0.45000002 0.55 0.55 0.55 0.55 0.55 0.55 0.65000004 0.65000004 0.65000004 0.65000004 0.65000004 0.65000004 0.75 0.75 0.75 0.75 0.75 0.75 0.85 0.85 0.85 0.85 0.85 0.85 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.049999997 0.049999997 0.050000004 0.050000012 0.05 0.049999997 0.15 0.14999999 0.15 0.15 0.15 0.15 0.25 0.25 0.25 0.25 0.25 0.25 0.35000002 0.35000002 0.35000002 0.35000002 0.35000002 0.35000002 0.45000002 0.45 0.45000002 0.45000002 0.45 0.45000002 0.55 0.55 0.55 0.55 0.55 0.55 0.65000004 0.65000004 0.65000004 0.65000004 0.65000004 0.65000004 0.75 0.75 0.75 0.75 0.75 0.75 0.85 0.85 0.85 0.85 0.85 0.85 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.95000005 0.10000001 0.099999994 0.1 0.099999994 0.1 0.099999994 0.3 0.3 0.3 0.29999998 0.3 0.30000004 0.5 0.5 0.5 0.5 0.5 0.49999997 0.70000005 0.70000005 0.70000005 0.70000005 0.70000005 0.70000005 0.9 0.90000004 0.90000004 0.9 0.90000004 0.90000004 0.10000001 0.099999994 0.1 0.099999994 0.1 0.099999994 0.3 0.3 0.3 0.29999998 0.3 0.30000004 0.5 0.5 0.5 0.5 0.5 0.49999997 0.70000005 0.70000005 0.70000005 0.70000005 0.70000005 0.70000005 0.9 0.90000004 0.90000004 0.9 0.90000004 0.90000004 0.10000001 0.099999994 0.1 0.099999994 0.1 0.099999994 0.3 0.3 0.3 0.29999998 0.3 0.30000004 0.5 0.5 0.5 0.5 0.5 0.49999997 0.70000005 0.70000005 0.70000005 0.70000005 0.70000005 0.70000005 0.9 0.90000004 0.90000004 0.9 0.90000004 0.90000004 0.10000001 0.099999994 0.1 0.099999994 0.1 0.099999994 0.3 0.3 0.3 0.29999998 0.3 0.30000004 0.5 0.5 0.5 0.5 0.5 0.49999997 0.70000005 0.70000005 0.70000005 0.70000005 0.70000005 0.70000005 0.9 0.90000004 0.90000004 0.9 0.90000004 0.90000004 0.10000001 0.099999994 0.1 0.099999994 0.1 0.099999994 0.3 0.3 0.3 0.29999998 0.3 0.30000004 0.5 0.5 0.5 0.5 0.5 0.49999997 0.70000005 0.70000005 0.70000005 0.70000005 0.70000005 0.70000005 0.9 0.90000004 0.90000004 0.9 0.90000004 0.90000004 0.16666667 0.16666669 0.16666667 0.16666669 0.16666667 0.16666667 0.49999997 0.5 0.5 0.50000006 0.5 0.5 0.8333334 0.8333334 0.8333334 0.8333333 0.8333334 0.8333334 0.16666667 0.16666669 0.16666667 0.16666669 0.16666667 0.16666667 0.49999997 0.5 0.5 0.50000006 0.5 0.5 0.8333334 0.8333334 0.8333334 0.8333333 0.8333334 0.8333334 0.16666667 0.16666669 0.16666667 0.16666669 0.16666667 0.16666667 0.49999997 0.5 0.5 0.50000006 0.5 0.5 0.8333334 0.8333334 0.8333334 0.8333333 0.8333334 0.8333334 0.25 0.25 0.25 0.25 0.25 0.25 0.75 0.75 0.75 0.75 0.75 0.75 0.25 0.25 0.25 0.25 0.25 0.25 0.75 0.75 0.75 0.75 0.75 0.75 0.5 0.5 0.5 0.5 0.5 0.5 
+        0.1 0.14142136 0.28284273 0.1 0.14142136 0.28284273 0.1 0.14142136 0.28284273 0.1 0.14142136 0.28284273 0.1 0.14142136 0.28284273 0.1 0.14142136 0.28284273 0.1 0.14142136 0.28284273 0.1 0.14142136 0.28284273 0.1 0.14142136 0.28284273 0.1 0.14142136 0.28284273 0.1 0.14142136 0.28284273 0.1 0.14142136 0.28284273 0.1 0.14142136 0.28284273 0.1 0.14142136 0.28284273 0.1 0.14142136 0.28284273 0.1 0.14142136 0.28284273 0.1 0.14142136 0.28284273 0.1 0.14142136 0.28284273 0.1 0.14142136 0.28284273 0.099999994 0.14142138 0.28284273 0.099999994 0.14142138 0.28284273 0.099999994 0.14142138 0.28284273 0.099999994 0.14142138 0.28284273 0.099999994 0.14142138 0.28284273 0.099999994 0.14142138 0.28284273 0.099999994 0.14142138 0.28284273 0.099999994 0.14142138 0.28284273 0.099999994 0.14142138 0.28284273 0.099999994 0.14142138 0.28284273 0.099999994 0.14142138 0.28284273 0.099999994 0.14142138 0.28284273 0.099999994 0.14142138 0.28284273 0.099999994 0.14142138 0.28284273 0.099999994 0.14142138 0.28284273 0.099999994 0.14142138 0.28284273 0.099999994 0.14142138 0.28284273 0.099999994 0.14142138 0.28284273 0.099999994 0.14142138 0.28284273 0.099999994 0.14142138 0.2828427 0.099999994 0.14142138 0.2828427 0.099999994 0.14142138 0.2828427 0.099999994 0.14142138 0.2828427 0.099999994 0.14142138 0.2828427 0.099999994 0.14142138 0.2828427 0.099999994 0.14142138 0.2828427 0.099999994 0.14142138 0.2828427 0.099999994 0.14142138 0.2828427 0.099999994 0.14142138 0.2828427 0.099999994 0.14142138 0.2828427 0.099999994 0.14142138 0.2828427 0.099999994 0.14142138 0.2828427 0.099999994 0.14142138 0.2828427 0.099999994 0.14142138 0.2828427 0.099999994 0.14142138 0.2828427 0.099999994 0.14142138 0.2828427 0.099999994 0.14142138 0.2828427 0.099999994 0.14142138 0.2828427 0.099999994 0.14142135 0.28284273 0.099999994 0.14142135 0.28284273 0.099999994 0.14142135 0.28284273 0.099999994 0.14142135 0.28284273 0.099999994 0.14142135 0.28284273 0.099999994 0.14142135 0.28284273 0.099999994 0.14142135 0.28284273 0.099999994 0.14142135 0.28284273 0.099999994 0.14142135 0.28284273 0.099999994 0.14142135 0.28284273 0.099999994 0.14142135 0.28284273 0.099999994 0.14142135 0.28284273 0.099999994 0.14142135 0.28284273 0.099999994 0.14142135 0.28284273 0.099999994 0.14142135 0.28284273 0.099999994 0.14142135 0.28284273 0.099999994 0.14142135 0.28284273 0.099999994 0.14142135 0.28284273 0.099999994 0.14142135 0.28284273 0.099999994 0.14142135 0.28284273 0.099999994 0.14142135 0.28284273 0.099999994 0.14142135 0.28284273 0.099999994 0.14142135 0.28284273 0.099999994 0.14142135 0.28284273 0.099999994 0.14142135 0.28284273 0.099999994 0.14142135 0.28284273 0.099999994 0.14142135 0.28284273 0.099999994 0.14142135 0.28284273 0.099999994 0.14142135 0.28284273 0.099999994 0.14142135 0.28284273 0.099999994 0.14142135 0.28284273 0.099999994 0.14142135 0.28284273 0.099999994 0.14142135 0.28284273 0.099999994 0.14142135 0.28284273 0.099999994 0.14142135 0.28284273 0.099999994 0.14142135 0.28284273 0.099999994 0.14142135 0.28284273 0.099999994 0.14142135 0.28284273 0.10000001 0.14142138 0.2828427 0.10000001 0.14142138 0.2828427 0.10000001 0.14142138 0.2828427 0.10000001 0.14142138 0.2828427 0.10000001 0.14142138 0.2828427 0.10000001 0.14142138 0.2828427 0.10000001 0.14142138 0.2828427 0.10000001 0.14142138 0.2828427 0.10000001 0.14142138 0.2828427 0.10000001 0.14142138 0.2828427 0.10000001 0.14142138 0.2828427 0.10000001 0.14142138 0.2828427 0.10000001 0.14142138 0.2828427 0.10000001 0.14142138 0.2828427 0.10000001 0.14142138 0.2828427 0.10000001 0.14142138 0.2828427 0.10000001 0.14142138 0.2828427 0.10000001 0.14142138 0.2828427 0.10000001 0.14142138 0.2828427 0.100000024 0.14142138 0.2828427 0.100000024 0.14142138 0.2828427 0.100000024 0.14142138 0.2828427 0.100000024 0.14142138 0.2828427 0.100000024 0.14142138 0.2828427 0.100000024 0.14142138 0.2828427 0.100000024 0.14142138 0.2828427 0.100000024 0.14142138 0.2828427 0.100000024 0.14142138 0.2828427 0.100000024 0.14142138 0.2828427 0.100000024 0.14142138 0.2828427 0.100000024 0.14142138 0.2828427 0.100000024 0.14142138 0.2828427 0.100000024 0.14142138 0.2828427 0.100000024 0.14142138 0.2828427 0.100000024 0.14142138 0.2828427 0.100000024 0.14142138 0.2828427 0.100000024 0.14142138 0.2828427 0.100000024 0.14142138 0.2828427 0.100000024 0.14142138 0.28284276 0.100000024 0.14142138 0.28284276 0.100000024 0.14142138 0.28284276 0.100000024 0.14142138 0.28284276 0.100000024 0.14142138 0.28284276 0.100000024 0.14142138 0.28284276 0.100000024 0.14142138 0.28284276 0.100000024 0.14142138 0.28284276 0.100000024 0.14142138 0.28284276 0.100000024 0.14142138 0.28284276 0.100000024 0.14142138 0.28284276 0.100000024 0.14142138 0.28284276 0.100000024 0.14142138 0.28284276 0.100000024 0.14142138 0.28284276 0.100000024 0.14142138 0.28284276 0.100000024 0.14142138 0.28284276 0.100000024 0.14142138 0.28284276 0.100000024 0.14142138 0.28284276 0.100000024 0.14142138 0.28284276 0.100000024 0.14142138 0.2828427 0.100000024 0.14142138 0.2828427 0.100000024 0.14142138 0.2828427 0.100000024 0.14142138 0.2828427 0.100000024 0.14142138 0.2828427 0.100000024 0.14142138 0.2828427 0.100000024 0.14142138 0.2828427 0.100000024 0.14142138 0.2828427 0.100000024 0.14142138 0.2828427 0.100000024 0.14142138 0.2828427 0.100000024 0.14142138 0.2828427 0.100000024 0.14142138 0.2828427 0.100000024 0.14142138 0.2828427 0.100000024 0.14142138 0.2828427 0.100000024 0.14142138 0.2828427 0.100000024 0.14142138 0.2828427 0.100000024 0.14142138 0.2828427 0.100000024 0.14142138 0.2828427 0.100000024 0.14142138 0.2828427 0.100000024 0.14142135 0.28284276 0.100000024 0.14142135 0.28284276 0.100000024 0.14142135 0.28284276 0.100000024 0.14142135 0.28284276 0.100000024 0.14142135 0.28284276 0.100000024 0.14142135 0.28284276 0.100000024 0.14142135 0.28284276 0.100000024 0.14142135 0.28284276 0.100000024 0.14142135 0.28284276 0.100000024 0.14142135 0.28284276 0.100000024 0.14142135 0.28284276 0.100000024 0.14142135 0.28284276 0.100000024 0.14142135 0.28284276 0.100000024 0.14142135 0.28284276 0.100000024 0.14142135 0.28284276 0.100000024 0.14142135 0.28284276 0.100000024 0.14142135 0.28284276 0.100000024 0.14142135 0.28284276 0.100000024 0.14142135 0.28284276 0.100000024 0.14142135 0.28284276 0.100000024 0.14142135 0.28284276 0.100000024 0.14142135 0.28284276 0.100000024 0.14142135 0.28284276 0.100000024 0.14142135 0.28284276 0.100000024 0.14142135 0.28284276 0.100000024 0.14142135 0.28284276 0.100000024 0.14142135 0.28284276 0.100000024 0.14142135 0.28284276 0.100000024 0.14142135 0.28284276 0.100000024 0.14142135 0.28284276 0.100000024 0.14142135 0.28284276 0.100000024 0.14142135 0.28284276 0.100000024 0.14142135 0.28284276 0.100000024 0.14142135 0.28284276 0.100000024 0.14142135 0.28284276 0.100000024 0.14142135 0.28284276 0.100000024 0.14142135 0.28284276 0.100000024 0.14142135 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.2828427 0.100000024 0.14142132 0.2828427 0.100000024 0.14142132 0.2828427 0.100000024 0.14142132 0.2828427 0.100000024 0.14142132 0.2828427 0.100000024 0.14142132 0.2828427 0.100000024 0.14142132 0.2828427 0.100000024 0.14142132 0.2828427 0.100000024 0.14142132 0.2828427 0.100000024 0.14142132 0.2828427 0.100000024 0.14142132 0.2828427 0.100000024 0.14142132 0.2828427 0.100000024 0.14142132 0.2828427 0.100000024 0.14142132 0.2828427 0.100000024 0.14142132 0.2828427 0.100000024 0.14142132 0.2828427 0.100000024 0.14142132 0.2828427 0.100000024 0.14142132 0.2828427 0.100000024 0.14142132 0.2828427 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142132 0.28284276 0.100000024 0.14142138 0.28284276 0.100000024 0.14142138 0.28284276 0.100000024 0.14142138 0.28284276 0.100000024 0.14142138 0.28284276 0.100000024 0.14142138 0.28284276 0.100000024 0.14142138 0.28284276 0.100000024 0.14142138 0.28284276 0.100000024 0.14142138 0.28284276 0.100000024 0.14142138 0.28284276 0.100000024 0.14142138 0.28284276 0.100000024 0.14142138 0.28284276 0.100000024 0.14142138 0.28284276 0.100000024 0.14142138 0.28284276 0.100000024 0.14142138 0.28284276 0.100000024 0.14142138 0.28284276 0.100000024 0.14142138 0.28284276 0.100000024 0.14142138 0.28284276 0.100000024 0.14142138 0.28284276 0.100000024 0.14142138 0.28284276 0.35000002 0.2474874 0.4949748 0.20207259 0.6062481 0.41833 0.35000002 0.2474874 0.4949748 0.20207259 0.6062481 0.41833 0.35000002 0.2474874 0.4949748 0.20207259 0.6062481 0.41833 0.35000002 0.2474874 0.4949748 0.20207259 0.6062481 0.41833 0.35000002 0.2474874 0.4949748 0.20207259 0.6062481 0.41833 0.35000002 0.2474874 0.4949748 0.20207259 0.6062481 0.41833 0.35000002 0.2474874 0.4949748 0.20207259 0.6062481 0.41833 0.35000002 0.2474874 0.4949748 0.20207259 0.6062481 0.41833 0.35000002 0.2474874 0.4949748 0.20207259 0.6062481 0.41833 0.35000002 0.2474874 0.4949748 0.20207259 0.6062481 0.41833 0.35000002 0.24748738 0.4949748 0.20207258 0.6062481 0.41833 0.35000002 0.24748738 0.4949748 0.20207258 0.6062481 0.41833 0.35000002 0.24748738 0.4949748 0.20207258 0.6062481 0.41833 0.35000002 0.24748738 0.4949748 0.20207258 0.6062481 0.41833 0.35000002 0.24748738 0.4949748 0.20207258 0.6062481 0.41833 0.35000002 0.24748738 0.4949748 0.20207258 0.6062481 0.41833 0.35000002 0.24748738 0.4949748 0.20207258 0.6062481 0.41833 0.35000002 0.24748738 0.4949748 0.20207258 0.6062481 0.41833 0.35000002 0.24748738 0.4949748 0.20207258 0.6062481 0.41833 0.35000002 0.24748738 0.4949748 0.20207258 0.6062481 0.41833 0.35000002 0.24748741 0.4949748 0.2020726 0.60624814 0.41833 0.35000002 0.24748741 0.4949748 0.2020726 0.60624814 0.41833 0.35000002 0.24748741 0.4949748 0.2020726 0.60624814 0.41833 0.35000002 0.24748741 0.4949748 0.2020726 0.60624814 0.41833 0.35000002 0.24748741 0.4949748 0.2020726 0.60624814 0.41833 0.35000002 0.24748741 0.4949748 0.2020726 0.60624814 0.41833 0.35000002 0.24748741 0.4949748 0.2020726 0.60624814 0.41833 0.35000002 0.24748741 0.4949748 0.2020726 0.60624814 0.41833 0.35000002 0.24748741 0.4949748 0.2020726 0.60624814 0.41833 0.35000002 0.24748741 0.4949748 0.2020726 0.60624814 0.41833 0.35000002 0.24748741 0.49497482 0.2020726 0.60624814 0.41832998 0.35000002 0.24748741 0.49497482 0.2020726 0.60624814 0.41832998 0.35000002 0.24748741 0.49497482 0.2020726 0.60624814 0.41832998 0.35000002 0.24748741 0.49497482 0.2020726 0.60624814 0.41832998 0.35000002 0.24748741 0.49497482 0.2020726 0.60624814 0.41832998 0.35000002 0.24748741 0.49497482 0.2020726 0.60624814 0.41832998 0.35000002 0.24748741 0.49497482 0.2020726 0.60624814 0.41832998 0.35000002 0.24748741 0.49497482 0.2020726 0.60624814 0.41832998 0.35000002 0.24748741 0.49497482 0.2020726 0.60624814 0.41832998 0.35000002 0.24748741 0.49497482 0.2020726 0.60624814 0.41832998 0.35 0.24748737 0.4949748 0.20207256 0.6062481 0.41833 0.35 0.24748737 0.4949748 0.20207256 0.6062481 0.41833 0.35 0.24748737 0.4949748 0.20207256 0.6062481 0.41833 0.35 0.24748737 0.4949748 0.20207256 0.6062481 0.41833 0.35 0.24748737 0.4949748 0.20207256 0.6062481 0.41833 0.35 0.24748737 0.4949748 0.20207256 0.6062481 0.41833 0.35 0.24748737 0.4949748 0.20207256 0.6062481 0.41833 0.35 0.24748737 0.4949748 0.20207256 0.6062481 0.41833 0.35 0.24748737 0.4949748 0.20207256 0.6062481 0.41833 0.35 0.24748737 0.4949748 0.20207256 0.6062481 0.41833 0.35000002 0.24748743 0.49497476 0.20207262 0.606248 0.41833004 0.35000002 0.24748743 0.49497476 0.20207262 0.606248 0.41833004 0.35000002 0.24748743 0.49497476 0.20207262 0.606248 0.41833004 0.35000002 0.24748743 0.49497476 0.20207262 0.606248 0.41833004 0.35000002 0.24748743 0.49497476 0.20207262 0.606248 0.41833004 0.35000002 0.24748743 0.49497476 0.20207262 0.606248 0.41833004 0.35000002 0.24748743 0.49497476 0.20207262 0.606248 0.41833004 0.35000002 0.24748743 0.49497476 0.20207262 0.606248 0.41833004 0.35000002 0.24748743 0.49497476 0.20207262 0.606248 0.41833004 0.35000002 0.24748743 0.49497476 0.20207262 0.606248 0.41833004 0.35000002 0.24748743 0.49497476 0.20207262 0.606248 0.41833004 0.35000002 0.24748743 0.49497476 0.20207262 0.606248 0.41833004 0.35000002 0.24748743 0.49497476 0.20207262 0.606248 0.41833004 0.35000002 0.24748743 0.49497476 0.20207262 0.606248 0.41833004 0.35000002 0.24748743 0.49497476 0.20207262 0.606248 0.41833004 0.35000002 0.24748743 0.49497476 0.20207262 0.606248 0.41833004 0.35000002 0.24748743 0.49497476 0.20207262 0.606248 0.41833004 0.35000002 0.24748743 0.49497476 0.20207262 0.606248 0.41833004 0.35000002 0.24748743 0.49497476 0.20207262 0.606248 0.41833004 0.35000002 0.24748743 0.49497476 0.20207262 0.606248 0.41833004 0.35000002 0.24748743 0.49497485 0.20207262 0.60624814 0.41832995 0.35000002 0.24748743 0.49497485 0.20207262 0.60624814 0.41832995 0.35000002 0.24748743 0.49497485 0.20207262 0.60624814 0.41832995 0.35000002 0.24748743 0.49497485 0.20207262 0.60624814 0.41832995 0.35000002 0.24748743 0.49497485 0.20207262 0.60624814 0.41832995 0.35000002 0.24748743 0.49497485 0.20207262 0.60624814 0.41832995 0.35000002 0.24748743 0.49497485 0.20207262 0.60624814 0.41832995 0.35000002 0.24748743 0.49497485 0.20207262 0.60624814 0.41832995 0.35000002 0.24748743 0.49497485 0.20207262 0.60624814 0.41832995 0.35000002 0.24748743 0.49497485 0.20207262 0.60624814 0.41832995 0.35000008 0.24748743 0.49497485 0.20207262 0.60624814 0.41832995 0.35000008 0.24748743 0.49497485 0.20207262 0.60624814 0.41832995 0.35000008 0.24748743 0.49497485 0.20207262 0.60624814 0.41832995 0.35000008 0.24748743 0.49497485 0.20207262 0.60624814 0.41832995 0.35000008 0.24748743 0.49497485 0.20207262 0.60624814 0.41832995 0.35000008 0.24748743 0.49497485 0.20207262 0.60624814 0.41832995 0.35000008 0.24748743 0.49497485 0.20207262 0.60624814 0.41832995 0.35000008 0.24748743 0.49497485 0.20207262 0.60624814 0.41832995 0.35000008 0.24748743 0.49497485 0.20207262 0.60624814 0.41832995 0.35000008 0.24748743 0.49497485 0.20207262 0.60624814 0.41832995 0.34999996 0.24748737 0.49497485 0.20207262 0.60624814 0.41832995 0.34999996 0.24748737 0.49497485 0.20207262 0.60624814 0.41832995 0.34999996 0.24748737 0.49497485 0.20207262 0.60624814 0.41832995 0.34999996 0.24748737 0.49497485 0.20207262 0.60624814 0.41832995 0.34999996 0.24748737 0.49497485 0.20207262 0.60624814 0.41832995 0.34999996 0.24748737 0.49497485 0.20207262 0.60624814 0.41832995 0.34999996 0.24748737 0.49497485 0.20207262 0.60624814 0.41832995 0.34999996 0.24748737 0.49497485 0.20207262 0.60624814 0.41832995 0.34999996 0.24748737 0.49497485 0.20207262 0.60624814 0.41832995 0.34999996 0.24748737 0.49497485 0.20207262 0.60624814 0.41832995 0.50000006 0.3535534 0.7071068 0.28867513 0.8660687 0.57008773 0.50000006 0.3535534 0.7071068 0.28867513 0.8660687 0.57008773 0.50000006 0.3535534 0.7071068 0.28867513 0.8660687 0.57008773 0.50000006 0.3535534 0.7071068 0.28867513 0.8660687 0.57008773 0.50000006 0.3535534 0.7071068 0.28867513 0.8660687 0.57008773 0.5000001 0.3535534 0.7071068 0.28867513 0.8660687 0.5700878 0.5000001 0.3535534 0.7071068 0.28867513 0.8660687 0.5700878 0.5000001 0.3535534 0.7071068 0.28867513 0.8660687 0.5700878 0.5000001 0.3535534 0.7071068 0.28867513 0.8660687 0.5700878 0.5000001 0.3535534 0.7071068 0.28867513 0.8660687 0.5700878 0.5 0.3535534 0.7071068 0.2886751 0.8660687 0.5700877 0.5 0.3535534 0.7071068 0.2886751 0.8660687 0.5700877 0.5 0.3535534 0.7071068 0.2886751 0.8660687 0.5700877 0.5 0.3535534 0.7071068 0.2886751 0.8660687 0.5700877 0.5 0.3535534 0.7071068 0.2886751 0.8660687 0.5700877 0.5 0.3535534 0.7071068 0.28867507 0.8660688 0.5700877 0.5 0.3535534 0.7071068 0.28867507 0.8660688 0.5700877 0.5 0.3535534 0.7071068 0.28867507 0.8660688 0.5700877 0.5 0.3535534 0.7071068 0.28867507 0.8660688 0.5700877 0.5 0.3535534 0.7071068 0.28867507 0.8660688 0.5700877 0.5000001 0.3535534 0.70710677 0.2886752 0.8660687 0.5700878 0.5000001 0.3535534 0.70710677 0.2886752 0.8660687 0.5700878 0.5000001 0.3535534 0.70710677 0.2886752 0.8660687 0.5700878 0.5000001 0.3535534 0.70710677 0.2886752 0.8660687 0.5700878 0.5000001 0.3535534 0.70710677 0.2886752 0.8660687 0.5700878 0.65000004 0.45961943 0.91923887 0.37527767 1.1258893 0.7211102 0.65000004 0.45961943 0.91923887 0.37527767 1.1258893 0.7211102 0.65000004 0.45961943 0.91923887 0.37527767 1.1258893 0.7211102 0.6500001 0.4596194 0.9192388 0.37527764 1.1258893 0.7211102 0.6500001 0.4596194 0.9192388 0.37527764 1.1258893 0.7211102 0.6500001 0.4596194 0.9192388 0.37527764 1.1258893 0.7211102 0.6500001 0.45961946 0.9192388 0.3752777 1.1258893 0.72111017 0.6500001 0.45961946 0.9192388 0.3752777 1.1258893 0.72111017 0.6500001 0.45961946 0.9192388 0.3752777 1.1258893 0.72111017 0.8000001 0.5656855 1.131371 0.4618802 1.3857099 0.8717798 0.8000001 0.5656855 1.131371 0.4618802 1.3857099 0.8717798 0.80000013 0.5656855 1.131371 0.4618802 1.3857098 0.87177986 0.80000013 0.5656855 1.131371 0.4618802 1.3857098 0.87177986 0.95000005 0.6717515 1.343503 0.5484828 1.6455305 0.97467947 
+        0.1 0.28284273 0.14142136 0.099999994 0.28284273 0.14142138 0.099999994 0.2828427 0.14142138 0.099999994 0.28284273 0.14142135 0.099999994 0.28284273 0.14142135 0.10000001 0.2828427 0.14142138 0.100000024 0.2828427 0.14142138 0.100000024 0.28284276 0.14142138 0.100000024 0.2828427 0.14142138 0.100000024 0.28284276 0.14142135 0.100000024 0.28284276 0.14142135 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.2828427 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142138 0.1 0.28284273 0.14142136 0.099999994 0.28284273 0.14142138 0.099999994 0.2828427 0.14142138 0.099999994 0.28284273 0.14142135 0.099999994 0.28284273 0.14142135 0.10000001 0.2828427 0.14142138 0.100000024 0.2828427 0.14142138 0.100000024 0.28284276 0.14142138 0.100000024 0.2828427 0.14142138 0.100000024 0.28284276 0.14142135 0.100000024 0.28284276 0.14142135 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.2828427 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142138 0.1 0.28284273 0.14142136 0.099999994 0.28284273 0.14142138 0.099999994 0.2828427 0.14142138 0.099999994 0.28284273 0.14142135 0.099999994 0.28284273 0.14142135 0.10000001 0.2828427 0.14142138 0.100000024 0.2828427 0.14142138 0.100000024 0.28284276 0.14142138 0.100000024 0.2828427 0.14142138 0.100000024 0.28284276 0.14142135 0.100000024 0.28284276 0.14142135 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.2828427 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142138 0.1 0.28284273 0.14142136 0.099999994 0.28284273 0.14142138 0.099999994 0.2828427 0.14142138 0.099999994 0.28284273 0.14142135 0.099999994 0.28284273 0.14142135 0.10000001 0.2828427 0.14142138 0.100000024 0.2828427 0.14142138 0.100000024 0.28284276 0.14142138 0.100000024 0.2828427 0.14142138 0.100000024 0.28284276 0.14142135 0.100000024 0.28284276 0.14142135 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.2828427 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142138 0.1 0.28284273 0.14142136 0.099999994 0.28284273 0.14142138 0.099999994 0.2828427 0.14142138 0.099999994 0.28284273 0.14142135 0.099999994 0.28284273 0.14142135 0.10000001 0.2828427 0.14142138 0.100000024 0.2828427 0.14142138 0.100000024 0.28284276 0.14142138 0.100000024 0.2828427 0.14142138 0.100000024 0.28284276 0.14142135 0.100000024 0.28284276 0.14142135 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.2828427 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142138 0.1 0.28284273 0.14142136 0.099999994 0.28284273 0.14142138 0.099999994 0.2828427 0.14142138 0.099999994 0.28284273 0.14142135 0.099999994 0.28284273 0.14142135 0.10000001 0.2828427 0.14142138 0.100000024 0.2828427 0.14142138 0.100000024 0.28284276 0.14142138 0.100000024 0.2828427 0.14142138 0.100000024 0.28284276 0.14142135 0.100000024 0.28284276 0.14142135 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.2828427 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142138 0.1 0.28284273 0.14142136 0.099999994 0.28284273 0.14142138 0.099999994 0.2828427 0.14142138 0.099999994 0.28284273 0.14142135 0.099999994 0.28284273 0.14142135 0.10000001 0.2828427 0.14142138 0.100000024 0.2828427 0.14142138 0.100000024 0.28284276 0.14142138 0.100000024 0.2828427 0.14142138 0.100000024 0.28284276 0.14142135 0.100000024 0.28284276 0.14142135 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.2828427 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142138 0.1 0.28284273 0.14142136 0.099999994 0.28284273 0.14142138 0.099999994 0.2828427 0.14142138 0.099999994 0.28284273 0.14142135 0.099999994 0.28284273 0.14142135 0.10000001 0.2828427 0.14142138 0.100000024 0.2828427 0.14142138 0.100000024 0.28284276 0.14142138 0.100000024 0.2828427 0.14142138 0.100000024 0.28284276 0.14142135 0.100000024 0.28284276 0.14142135 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.2828427 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142138 0.1 0.28284273 0.14142136 0.099999994 0.28284273 0.14142138 0.099999994 0.2828427 0.14142138 0.099999994 0.28284273 0.14142135 0.099999994 0.28284273 0.14142135 0.10000001 0.2828427 0.14142138 0.100000024 0.2828427 0.14142138 0.100000024 0.28284276 0.14142138 0.100000024 0.2828427 0.14142138 0.100000024 0.28284276 0.14142135 0.100000024 0.28284276 0.14142135 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.2828427 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142138 0.1 0.28284273 0.14142136 0.099999994 0.28284273 0.14142138 0.099999994 0.2828427 0.14142138 0.099999994 0.28284273 0.14142135 0.099999994 0.28284273 0.14142135 0.10000001 0.2828427 0.14142138 0.100000024 0.2828427 0.14142138 0.100000024 0.28284276 0.14142138 0.100000024 0.2828427 0.14142138 0.100000024 0.28284276 0.14142135 0.100000024 0.28284276 0.14142135 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.2828427 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142138 0.1 0.28284273 0.14142136 0.099999994 0.28284273 0.14142138 0.099999994 0.2828427 0.14142138 0.099999994 0.28284273 0.14142135 0.099999994 0.28284273 0.14142135 0.10000001 0.2828427 0.14142138 0.100000024 0.2828427 0.14142138 0.100000024 0.28284276 0.14142138 0.100000024 0.2828427 0.14142138 0.100000024 0.28284276 0.14142135 0.100000024 0.28284276 0.14142135 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.2828427 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142138 0.1 0.28284273 0.14142136 0.099999994 0.28284273 0.14142138 0.099999994 0.2828427 0.14142138 0.099999994 0.28284273 0.14142135 0.099999994 0.28284273 0.14142135 0.10000001 0.2828427 0.14142138 0.100000024 0.2828427 0.14142138 0.100000024 0.28284276 0.14142138 0.100000024 0.2828427 0.14142138 0.100000024 0.28284276 0.14142135 0.100000024 0.28284276 0.14142135 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.2828427 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142138 0.1 0.28284273 0.14142136 0.099999994 0.28284273 0.14142138 0.099999994 0.2828427 0.14142138 0.099999994 0.28284273 0.14142135 0.099999994 0.28284273 0.14142135 0.10000001 0.2828427 0.14142138 0.100000024 0.2828427 0.14142138 0.100000024 0.28284276 0.14142138 0.100000024 0.2828427 0.14142138 0.100000024 0.28284276 0.14142135 0.100000024 0.28284276 0.14142135 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.2828427 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142138 0.1 0.28284273 0.14142136 0.099999994 0.28284273 0.14142138 0.099999994 0.2828427 0.14142138 0.099999994 0.28284273 0.14142135 0.099999994 0.28284273 0.14142135 0.10000001 0.2828427 0.14142138 0.100000024 0.2828427 0.14142138 0.100000024 0.28284276 0.14142138 0.100000024 0.2828427 0.14142138 0.100000024 0.28284276 0.14142135 0.100000024 0.28284276 0.14142135 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.2828427 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142138 0.1 0.28284273 0.14142136 0.099999994 0.28284273 0.14142138 0.099999994 0.2828427 0.14142138 0.099999994 0.28284273 0.14142135 0.099999994 0.28284273 0.14142135 0.10000001 0.2828427 0.14142138 0.100000024 0.2828427 0.14142138 0.100000024 0.28284276 0.14142138 0.100000024 0.2828427 0.14142138 0.100000024 0.28284276 0.14142135 0.100000024 0.28284276 0.14142135 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.2828427 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142138 0.1 0.28284273 0.14142136 0.099999994 0.28284273 0.14142138 0.099999994 0.2828427 0.14142138 0.099999994 0.28284273 0.14142135 0.099999994 0.28284273 0.14142135 0.10000001 0.2828427 0.14142138 0.100000024 0.2828427 0.14142138 0.100000024 0.28284276 0.14142138 0.100000024 0.2828427 0.14142138 0.100000024 0.28284276 0.14142135 0.100000024 0.28284276 0.14142135 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.2828427 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142138 0.1 0.28284273 0.14142136 0.099999994 0.28284273 0.14142138 0.099999994 0.2828427 0.14142138 0.099999994 0.28284273 0.14142135 0.099999994 0.28284273 0.14142135 0.10000001 0.2828427 0.14142138 0.100000024 0.2828427 0.14142138 0.100000024 0.28284276 0.14142138 0.100000024 0.2828427 0.14142138 0.100000024 0.28284276 0.14142135 0.100000024 0.28284276 0.14142135 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.2828427 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142138 0.1 0.28284273 0.14142136 0.099999994 0.28284273 0.14142138 0.099999994 0.2828427 0.14142138 0.099999994 0.28284273 0.14142135 0.099999994 0.28284273 0.14142135 0.10000001 0.2828427 0.14142138 0.100000024 0.2828427 0.14142138 0.100000024 0.28284276 0.14142138 0.100000024 0.2828427 0.14142138 0.100000024 0.28284276 0.14142135 0.100000024 0.28284276 0.14142135 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.2828427 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142138 0.1 0.28284273 0.14142136 0.099999994 0.28284273 0.14142138 0.099999994 0.2828427 0.14142138 0.099999994 0.28284273 0.14142135 0.099999994 0.28284273 0.14142135 0.10000001 0.2828427 0.14142138 0.100000024 0.2828427 0.14142138 0.100000024 0.28284276 0.14142138 0.100000024 0.2828427 0.14142138 0.100000024 0.28284276 0.14142135 0.100000024 0.28284276 0.14142135 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.2828427 0.14142132 0.100000024 0.28284276 0.14142132 0.100000024 0.28284276 0.14142138 0.34999996 0.4949747 0.24748735 0.60621774 0.20206249 0.41833 0.34999996 0.49497467 0.24748737 0.60621774 0.20206249 0.41833 0.34999996 0.49497473 0.24748737 0.60621774 0.20206249 0.41833 0.34999993 0.49497473 0.24748737 0.60621774 0.20206249 0.41832998 0.34999996 0.49497467 0.24748737 0.60621774 0.20206246 0.41833 0.35 0.49497473 0.24748734 0.60621774 0.20206249 0.41833004 0.35 0.49497473 0.2474873 0.60621774 0.20206249 0.41833004 0.3499999 0.49497473 0.2474873 0.6062178 0.20206249 0.41832995 0.3499999 0.49497467 0.2474873 0.6062177 0.20206249 0.41832995 0.3499999 0.49497467 0.2474873 0.6062178 0.20206255 0.41832995 0.34999996 0.4949747 0.24748735 0.60621774 0.20206249 0.41833 0.34999996 0.49497467 0.24748737 0.60621774 0.20206249 0.41833 0.34999996 0.49497473 0.24748737 0.60621774 0.20206249 0.41833 0.34999993 0.49497473 0.24748737 0.60621774 0.20206249 0.41832998 0.34999996 0.49497467 0.24748737 0.60621774 0.20206246 0.41833 0.35 0.49497473 0.24748734 0.60621774 0.20206249 0.41833004 0.35 0.49497473 0.2474873 0.60621774 0.20206249 0.41833004 0.3499999 0.49497473 0.2474873 0.6062178 0.20206249 0.41832995 0.3499999 0.49497467 0.2474873 0.6062177 0.20206249 0.41832995 0.3499999 0.49497467 0.2474873 0.6062178 0.20206255 0.41832995 0.34999996 0.4949747 0.24748735 0.60621774 0.20206249 0.41833 0.34999996 0.49497467 0.24748737 0.60621774 0.20206249 0.41833 0.34999996 0.49497473 0.24748737 0.60621774 0.20206249 0.41833 0.34999993 0.49497473 0.24748737 0.60621774 0.20206249 0.41832998 0.34999996 0.49497467 0.24748737 0.60621774 0.20206246 0.41833 0.35 0.49497473 0.24748734 0.60621774 0.20206249 0.41833004 0.35 0.49497473 0.2474873 0.60621774 0.20206249 0.41833004 0.3499999 0.49497473 0.2474873 0.6062178 0.20206249 0.41832995 0.3499999 0.49497467 0.2474873 0.6062177 0.20206249 0.41832995 0.3499999 0.49497467 0.2474873 0.6062178 0.20206255 0.41832995 0.34999996 0.4949747 0.24748735 0.60621774 0.20206249 0.41833 0.34999996 0.49497467 0.24748737 0.60621774 0.20206249 0.41833 0.34999996 0.49497473 0.24748737 0.60621774 0.20206249 0.41833 0.34999993 0.49497473 0.24748737 0.60621774 0.20206249 0.41832998 0.34999996 0.49497467 0.24748737 0.60621774 0.20206246 0.41833 0.35 0.49497473 0.24748734 0.60621774 0.20206249 0.41833004 0.35 0.49497473 0.2474873 0.60621774 0.20206249 0.41833004 0.3499999 0.49497473 0.2474873 0.6062178 0.20206249 0.41832995 0.3499999 0.49497467 0.2474873 0.6062177 0.20206249 0.41832995 0.3499999 0.49497467 0.2474873 0.6062178 0.20206255 0.41832995 0.34999996 0.4949747 0.24748735 0.60621774 0.20206249 0.41833 0.34999996 0.49497467 0.24748737 0.60621774 0.20206249 0.41833 0.34999996 0.49497473 0.24748737 0.60621774 0.20206249 0.41833 0.34999993 0.49497473 0.24748737 0.60621774 0.20206249 0.41832998 0.34999996 0.49497467 0.24748737 0.60621774 0.20206246 0.41833 0.35 0.49497473 0.24748734 0.60621774 0.20206249 0.41833004 0.35 0.49497473 0.2474873 0.60621774 0.20206249 0.41833004 0.3499999 0.49497473 0.2474873 0.6062178 0.20206249 0.41832995 0.3499999 0.49497467 0.2474873 0.6062177 0.20206249 0.41832995 0.3499999 0.49497467 0.2474873 0.6062178 0.20206255 0.41832995 0.34999996 0.4949747 0.24748735 0.60621774 0.20206249 0.41833 0.34999996 0.49497467 0.24748737 0.60621774 0.20206249 0.41833 0.34999996 0.49497473 0.24748737 0.60621774 0.20206249 0.41833 0.34999993 0.49497473 0.24748737 0.60621774 0.20206249 0.41832998 0.34999996 0.49497467 0.24748737 0.60621774 0.20206246 0.41833 0.35 0.49497473 0.24748734 0.60621774 0.20206249 0.41833004 0.35 0.49497473 0.2474873 0.60621774 0.20206249 0.41833004 0.3499999 0.49497473 0.2474873 0.6062178 0.20206249 0.41832995 0.3499999 0.49497467 0.2474873 0.6062177 0.20206249 0.41832995 0.3499999 0.49497467 0.2474873 0.6062178 0.20206255 0.41832995 0.34999996 0.4949747 0.24748735 0.60621774 0.20206249 0.41833 0.34999996 0.49497467 0.24748737 0.60621774 0.20206249 0.41833 0.34999996 0.49497473 0.24748737 0.60621774 0.20206249 0.41833 0.34999993 0.49497473 0.24748737 0.60621774 0.20206249 0.41832998 0.34999996 0.49497467 0.24748737 0.60621774 0.20206246 0.41833 0.35 0.49497473 0.24748734 0.60621774 0.20206249 0.41833004 0.35 0.49497473 0.2474873 0.60621774 0.20206249 0.41833004 0.3499999 0.49497473 0.2474873 0.6062178 0.20206249 0.41832995 0.3499999 0.49497467 0.2474873 0.6062177 0.20206249 0.41832995 0.3499999 0.49497467 0.2474873 0.6062178 0.20206255 0.41832995 0.34999996 0.4949747 0.24748735 0.60621774 0.20206249 0.41833 0.34999996 0.49497467 0.24748737 0.60621774 0.20206249 0.41833 0.34999996 0.49497473 0.24748737 0.60621774 0.20206249 0.41833 0.34999993 0.49497473 0.24748737 0.60621774 0.20206249 0.41832998 0.34999996 0.49497467 0.24748737 0.60621774 0.20206246 0.41833 0.35 0.49497473 0.24748734 0.60621774 0.20206249 0.41833004 0.35 0.49497473 0.2474873 0.60621774 0.20206249 0.41833004 0.3499999 0.49497473 0.2474873 0.6062178 0.20206249 0.41832995 0.3499999 0.49497467 0.2474873 0.6062177 0.20206249 0.41832995 0.3499999 0.49497467 0.2474873 0.6062178 0.20206255 0.41832995 0.34999996 0.4949747 0.24748735 0.60621774 0.20206249 0.41833 0.34999996 0.49497467 0.24748737 0.60621774 0.20206249 0.41833 0.34999996 0.49497473 0.24748737 0.60621774 0.20206249 0.41833 0.34999993 0.49497473 0.24748737 0.60621774 0.20206249 0.41832998 0.34999996 0.49497467 0.24748737 0.60621774 0.20206246 0.41833 0.35 0.49497473 0.24748734 0.60621774 0.20206249 0.41833004 0.35 0.49497473 0.2474873 0.60621774 0.20206249 0.41833004 0.3499999 0.49497473 0.2474873 0.6062178 0.20206249 0.41832995 0.3499999 0.49497467 0.2474873 0.6062177 0.20206249 0.41832995 0.3499999 0.49497467 0.2474873 0.6062178 0.20206255 0.41832995 0.34999996 0.4949747 0.24748735 0.60621774 0.20206249 0.41833 0.34999996 0.49497467 0.24748737 0.60621774 0.20206249 0.41833 0.34999996 0.49497473 0.24748737 0.60621774 0.20206249 0.41833 0.34999993 0.49497473 0.24748737 0.60621774 0.20206249 0.41832998 0.34999996 0.49497467 0.24748737 0.60621774 0.20206246 0.41833 0.35 0.49497473 0.24748734 0.60621774 0.20206249 0.41833004 0.35 0.49497473 0.2474873 0.60621774 0.20206249 0.41833004 0.3499999 0.49497473 0.2474873 0.6062178 0.20206249 0.41832995 0.3499999 0.49497467 0.2474873 0.6062177 0.20206249 0.41832995 0.3499999 0.49497467 0.2474873 0.6062178 0.20206255 0.41832995 0.49999997 0.7071067 0.35355335 0.8660254 0.2886607 0.57008773 0.5 0.7071067 0.35355335 0.8660253 0.2886607 0.5700878 0.5 0.7071067 0.35355332 0.86602545 0.28866073 0.5700877 0.5 0.70710665 0.3535533 0.86602545 0.28866076 0.5700877 0.49999994 0.7071067 0.3535534 0.8660253 0.28866065 0.5700878 0.49999997 0.7071067 0.35355335 0.8660254 0.2886607 0.57008773 0.5 0.7071067 0.35355335 0.8660253 0.2886607 0.5700878 0.5 0.7071067 0.35355332 0.86602545 0.28866073 0.5700877 0.5 0.70710665 0.3535533 0.86602545 0.28866076 0.5700877 0.49999994 0.7071067 0.3535534 0.8660253 0.28866065 0.5700878 0.49999997 0.7071067 0.35355335 0.8660254 0.2886607 0.57008773 0.5 0.7071067 0.35355335 0.8660253 0.2886607 0.5700878 0.5 0.7071067 0.35355332 0.86602545 0.28866073 0.5700877 0.5 0.70710665 0.3535533 0.86602545 0.28866076 0.5700877 0.49999994 0.7071067 0.3535534 0.8660253 0.28866065 0.5700878 0.49999997 0.7071067 0.35355335 0.8660254 0.2886607 0.57008773 0.5 0.7071067 0.35355335 0.8660253 0.2886607 0.5700878 0.5 0.7071067 0.35355332 0.86602545 0.28866073 0.5700877 0.5 0.70710665 0.3535533 0.86602545 0.28866076 0.5700877 0.49999994 0.7071067 0.3535534 0.8660253 0.28866065 0.5700878 0.49999997 0.7071067 0.35355335 0.8660254 0.2886607 0.57008773 0.5 0.7071067 0.35355335 0.8660253 0.2886607 0.5700878 0.5 0.7071067 0.35355332 0.86602545 0.28866073 0.5700877 0.5 0.70710665 0.3535533 0.86602545 0.28866076 0.5700877 0.49999994 0.7071067 0.3535534 0.8660253 0.28866065 0.5700878 0.6499999 0.9192387 0.45961934 1.1258329 0.3752589 0.7211102 0.64999986 0.9192387 0.4596193 1.125833 0.37525892 0.7211102 0.64999986 0.91923875 0.45961928 1.1258328 0.37525892 0.72111017 0.6499999 0.9192387 0.45961934 1.1258329 0.3752589 0.7211102 0.64999986 0.9192387 0.4596193 1.125833 0.37525892 0.7211102 0.64999986 0.91923875 0.45961928 1.1258328 0.37525892 0.72111017 0.6499999 0.9192387 0.45961934 1.1258329 0.3752589 0.7211102 0.64999986 0.9192387 0.4596193 1.125833 0.37525892 0.7211102 0.64999986 0.91923875 0.45961928 1.1258328 0.37525892 0.72111017 0.79999995 1.1313708 0.5656854 1.3856406 0.46185714 0.8717798 0.79999995 1.1313708 0.56568533 1.3856406 0.46185708 0.87177986 0.79999995 1.1313708 0.5656854 1.3856406 0.46185714 0.8717798 0.79999995 1.1313708 0.56568533 1.3856406 0.46185708 0.87177986 0.9499999 1.3435028 0.6717514 1.6454482 0.54845536 0.97467947 
+
diff --git a/tensorflow/contrib/lite/examples/android/assets/coco_labels_list.txt b/tensorflow/contrib/lite/examples/android/assets/coco_labels_list.txt
new file mode 100644
index 0000000000000000000000000000000000000000..5a70ff82aa7b0fa7315ca591820e4cf7d2f5ad18
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/assets/coco_labels_list.txt
@@ -0,0 +1,91 @@
+???
+person
+bicycle
+car
+motorcycle
+airplane
+bus
+train
+truck
+boat
+traffic light
+fire hydrant
+???
+stop sign
+parking meter
+bench
+bird
+cat
+dog
+horse
+sheep
+cow
+elephant
+bear
+zebra
+giraffe
+???
+backpack
+umbrella
+???
+???
+handbag
+tie
+suitcase
+frisbee
+skis
+snowboard
+sports ball
+kite
+baseball bat
+baseball glove
+skateboard
+surfboard
+tennis racket
+bottle
+???
+wine glass
+cup
+fork
+knife
+spoon
+bowl
+banana
+apple
+sandwich
+orange
+broccoli
+carrot
+hot dog
+pizza
+donut
+cake
+chair
+couch
+potted plant
+bed
+???
+dining table
+???
+???
+toilet
+???
+tv
+laptop
+mouse
+remote
+keyboard
+cell phone
+microwave
+oven
+toaster
+sink
+refrigerator
+???
+book
+clock
+vase
+scissors
+teddy bear
+hair drier
+toothbrush
diff --git a/tensorflow/contrib/lite/examples/android/assets/conv_actions_labels.txt b/tensorflow/contrib/lite/examples/android/assets/conv_actions_labels.txt
new file mode 100644
index 0000000000000000000000000000000000000000..ba416458b011a7f4b96739eb6fcb6275a6ab3bec
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/assets/conv_actions_labels.txt
@@ -0,0 +1,12 @@
+_silence_
+_unknown_
+yes
+no
+up
+down
+left
+right
+on
+off
+stop
+go
\ No newline at end of file
diff --git a/tensorflow/contrib/lite/examples/android/assets/labels_mobilenet_quant_v1_224.txt b/tensorflow/contrib/lite/examples/android/assets/labels_mobilenet_quant_v1_224.txt
new file mode 100644
index 0000000000000000000000000000000000000000..fe811239d8e2989de19fecabb1ebb0c9dddac514
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/assets/labels_mobilenet_quant_v1_224.txt
@@ -0,0 +1,1001 @@
+background
+tench
+goldfish
+great white shark
+tiger shark
+hammerhead
+electric ray
+stingray
+cock
+hen
+ostrich
+brambling
+goldfinch
+house finch
+junco
+indigo bunting
+robin
+bulbul
+jay
+magpie
+chickadee
+water ouzel
+kite
+bald eagle
+vulture
+great grey owl
+European fire salamander
+common newt
+eft
+spotted salamander
+axolotl
+bullfrog
+tree frog
+tailed frog
+loggerhead
+leatherback turtle
+mud turtle
+terrapin
+box turtle
+banded gecko
+common iguana
+American chameleon
+whiptail
+agama
+frilled lizard
+alligator lizard
+Gila monster
+green lizard
+African chameleon
+Komodo dragon
+African crocodile
+American alligator
+triceratops
+thunder snake
+ringneck snake
+hognose snake
+green snake
+king snake
+garter snake
+water snake
+vine snake
+night snake
+boa constrictor
+rock python
+Indian cobra
+green mamba
+sea snake
+horned viper
+diamondback
+sidewinder
+trilobite
+harvestman
+scorpion
+black and gold garden spider
+barn spider
+garden spider
+black widow
+tarantula
+wolf spider
+tick
+centipede
+black grouse
+ptarmigan
+ruffed grouse
+prairie chicken
+peacock
+quail
+partridge
+African grey
+macaw
+sulphur-crested cockatoo
+lorikeet
+coucal
+bee eater
+hornbill
+hummingbird
+jacamar
+toucan
+drake
+red-breasted merganser
+goose
+black swan
+tusker
+echidna
+platypus
+wallaby
+koala
+wombat
+jellyfish
+sea anemone
+brain coral
+flatworm
+nematode
+conch
+snail
+slug
+sea slug
+chiton
+chambered nautilus
+Dungeness crab
+rock crab
+fiddler crab
+king crab
+American lobster
+spiny lobster
+crayfish
+hermit crab
+isopod
+white stork
+black stork
+spoonbill
+flamingo
+little blue heron
+American egret
+bittern
+crane
+limpkin
+European gallinule
+American coot
+bustard
+ruddy turnstone
+red-backed sandpiper
+redshank
+dowitcher
+oystercatcher
+pelican
+king penguin
+albatross
+grey whale
+killer whale
+dugong
+sea lion
+Chihuahua
+Japanese spaniel
+Maltese dog
+Pekinese
+Shih-Tzu
+Blenheim spaniel
+papillon
+toy terrier
+Rhodesian ridgeback
+Afghan hound
+basset
+beagle
+bloodhound
+bluetick
+black-and-tan coonhound
+Walker hound
+English foxhound
+redbone
+borzoi
+Irish wolfhound
+Italian greyhound
+whippet
+Ibizan hound
+Norwegian elkhound
+otterhound
+Saluki
+Scottish deerhound
+Weimaraner
+Staffordshire bullterrier
+American Staffordshire terrier
+Bedlington terrier
+Border terrier
+Kerry blue terrier
+Irish terrier
+Norfolk terrier
+Norwich terrier
+Yorkshire terrier
+wire-haired fox terrier
+Lakeland terrier
+Sealyham terrier
+Airedale
+cairn
+Australian terrier
+Dandie Dinmont
+Boston bull
+miniature schnauzer
+giant schnauzer
+standard schnauzer
+Scotch terrier
+Tibetan terrier
+silky terrier
+soft-coated wheaten terrier
+West Highland white terrier
+Lhasa
+flat-coated retriever
+curly-coated retriever
+golden retriever
+Labrador retriever
+Chesapeake Bay retriever
+German short-haired pointer
+vizsla
+English setter
+Irish setter
+Gordon setter
+Brittany spaniel
+clumber
+English springer
+Welsh springer spaniel
+cocker spaniel
+Sussex spaniel
+Irish water spaniel
+kuvasz
+schipperke
+groenendael
+malinois
+briard
+kelpie
+komondor
+Old English sheepdog
+Shetland sheepdog
+collie
+Border collie
+Bouvier des Flandres
+Rottweiler
+German shepherd
+Doberman
+miniature pinscher
+Greater Swiss Mountain dog
+Bernese mountain dog
+Appenzeller
+EntleBucher
+boxer
+bull mastiff
+Tibetan mastiff
+French bulldog
+Great Dane
+Saint Bernard
+Eskimo dog
+malamute
+Siberian husky
+dalmatian
+affenpinscher
+basenji
+pug
+Leonberg
+Newfoundland
+Great Pyrenees
+Samoyed
+Pomeranian
+chow
+keeshond
+Brabancon griffon
+Pembroke
+Cardigan
+toy poodle
+miniature poodle
+standard poodle
+Mexican hairless
+timber wolf
+white wolf
+red wolf
+coyote
+dingo
+dhole
+African hunting dog
+hyena
+red fox
+kit fox
+Arctic fox
+grey fox
+tabby
+tiger cat
+Persian cat
+Siamese cat
+Egyptian cat
+cougar
+lynx
+leopard
+snow leopard
+jaguar
+lion
+tiger
+cheetah
+brown bear
+American black bear
+ice bear
+sloth bear
+mongoose
+meerkat
+tiger beetle
+ladybug
+ground beetle
+long-horned beetle
+leaf beetle
+dung beetle
+rhinoceros beetle
+weevil
+fly
+bee
+ant
+grasshopper
+cricket
+walking stick
+cockroach
+mantis
+cicada
+leafhopper
+lacewing
+dragonfly
+damselfly
+admiral
+ringlet
+monarch
+cabbage butterfly
+sulphur butterfly
+lycaenid
+starfish
+sea urchin
+sea cucumber
+wood rabbit
+hare
+Angora
+hamster
+porcupine
+fox squirrel
+marmot
+beaver
+guinea pig
+sorrel
+zebra
+hog
+wild boar
+warthog
+hippopotamus
+ox
+water buffalo
+bison
+ram
+bighorn
+ibex
+hartebeest
+impala
+gazelle
+Arabian camel
+llama
+weasel
+mink
+polecat
+black-footed ferret
+otter
+skunk
+badger
+armadillo
+three-toed sloth
+orangutan
+gorilla
+chimpanzee
+gibbon
+siamang
+guenon
+patas
+baboon
+macaque
+langur
+colobus
+proboscis monkey
+marmoset
+capuchin
+howler monkey
+titi
+spider monkey
+squirrel monkey
+Madagascar cat
+indri
+Indian elephant
+African elephant
+lesser panda
+giant panda
+barracouta
+eel
+coho
+rock beauty
+anemone fish
+sturgeon
+gar
+lionfish
+puffer
+abacus
+abaya
+academic gown
+accordion
+acoustic guitar
+aircraft carrier
+airliner
+airship
+altar
+ambulance
+amphibian
+analog clock
+apiary
+apron
+ashcan
+assault rifle
+backpack
+bakery
+balance beam
+balloon
+ballpoint
+Band Aid
+banjo
+bannister
+barbell
+barber chair
+barbershop
+barn
+barometer
+barrel
+barrow
+baseball
+basketball
+bassinet
+bassoon
+bathing cap
+bath towel
+bathtub
+beach wagon
+beacon
+beaker
+bearskin
+beer bottle
+beer glass
+bell cote
+bib
+bicycle-built-for-two
+bikini
+binder
+binoculars
+birdhouse
+boathouse
+bobsled
+bolo tie
+bonnet
+bookcase
+bookshop
+bottlecap
+bow
+bow tie
+brass
+brassiere
+breakwater
+breastplate
+broom
+bucket
+buckle
+bulletproof vest
+bullet train
+butcher shop
+cab
+caldron
+candle
+cannon
+canoe
+can opener
+cardigan
+car mirror
+carousel
+carpenter's kit
+carton
+car wheel
+cash machine
+cassette
+cassette player
+castle
+catamaran
+CD player
+cello
+cellular telephone
+chain
+chainlink fence
+chain mail
+chain saw
+chest
+chiffonier
+chime
+china cabinet
+Christmas stocking
+church
+cinema
+cleaver
+cliff dwelling
+cloak
+clog
+cocktail shaker
+coffee mug
+coffeepot
+coil
+combination lock
+computer keyboard
+confectionery
+container ship
+convertible
+corkscrew
+cornet
+cowboy boot
+cowboy hat
+cradle
+crane
+crash helmet
+crate
+crib
+Crock Pot
+croquet ball
+crutch
+cuirass
+dam
+desk
+desktop computer
+dial telephone
+diaper
+digital clock
+digital watch
+dining table
+dishrag
+dishwasher
+disk brake
+dock
+dogsled
+dome
+doormat
+drilling platform
+drum
+drumstick
+dumbbell
+Dutch oven
+electric fan
+electric guitar
+electric locomotive
+entertainment center
+envelope
+espresso maker
+face powder
+feather boa
+file
+fireboat
+fire engine
+fire screen
+flagpole
+flute
+folding chair
+football helmet
+forklift
+fountain
+fountain pen
+four-poster
+freight car
+French horn
+frying pan
+fur coat
+garbage truck
+gasmask
+gas pump
+goblet
+go-kart
+golf ball
+golfcart
+gondola
+gong
+gown
+grand piano
+greenhouse
+grille
+grocery store
+guillotine
+hair slide
+hair spray
+half track
+hammer
+hamper
+hand blower
+hand-held computer
+handkerchief
+hard disc
+harmonica
+harp
+harvester
+hatchet
+holster
+home theater
+honeycomb
+hook
+hoopskirt
+horizontal bar
+horse cart
+hourglass
+iPod
+iron
+jack-o'-lantern
+jean
+jeep
+jersey
+jigsaw puzzle
+jinrikisha
+joystick
+kimono
+knee pad
+knot
+lab coat
+ladle
+lampshade
+laptop
+lawn mower
+lens cap
+letter opener
+library
+lifeboat
+lighter
+limousine
+liner
+lipstick
+Loafer
+lotion
+loudspeaker
+loupe
+lumbermill
+magnetic compass
+mailbag
+mailbox
+maillot
+maillot
+manhole cover
+maraca
+marimba
+mask
+matchstick
+maypole
+maze
+measuring cup
+medicine chest
+megalith
+microphone
+microwave
+military uniform
+milk can
+minibus
+miniskirt
+minivan
+missile
+mitten
+mixing bowl
+mobile home
+Model T
+modem
+monastery
+monitor
+moped
+mortar
+mortarboard
+mosque
+mosquito net
+motor scooter
+mountain bike
+mountain tent
+mouse
+mousetrap
+moving van
+muzzle
+nail
+neck brace
+necklace
+nipple
+notebook
+obelisk
+oboe
+ocarina
+odometer
+oil filter
+organ
+oscilloscope
+overskirt
+oxcart
+oxygen mask
+packet
+paddle
+paddlewheel
+padlock
+paintbrush
+pajama
+palace
+panpipe
+paper towel
+parachute
+parallel bars
+park bench
+parking meter
+passenger car
+patio
+pay-phone
+pedestal
+pencil box
+pencil sharpener
+perfume
+Petri dish
+photocopier
+pick
+pickelhaube
+picket fence
+pickup
+pier
+piggy bank
+pill bottle
+pillow
+ping-pong ball
+pinwheel
+pirate
+pitcher
+plane
+planetarium
+plastic bag
+plate rack
+plow
+plunger
+Polaroid camera
+pole
+police van
+poncho
+pool table
+pop bottle
+pot
+potter's wheel
+power drill
+prayer rug
+printer
+prison
+projectile
+projector
+puck
+punching bag
+purse
+quill
+quilt
+racer
+racket
+radiator
+radio
+radio telescope
+rain barrel
+recreational vehicle
+reel
+reflex camera
+refrigerator
+remote control
+restaurant
+revolver
+rifle
+rocking chair
+rotisserie
+rubber eraser
+rugby ball
+rule
+running shoe
+safe
+safety pin
+saltshaker
+sandal
+sarong
+sax
+scabbard
+scale
+school bus
+schooner
+scoreboard
+screen
+screw
+screwdriver
+seat belt
+sewing machine
+shield
+shoe shop
+shoji
+shopping basket
+shopping cart
+shovel
+shower cap
+shower curtain
+ski
+ski mask
+sleeping bag
+slide rule
+sliding door
+slot
+snorkel
+snowmobile
+snowplow
+soap dispenser
+soccer ball
+sock
+solar dish
+sombrero
+soup bowl
+space bar
+space heater
+space shuttle
+spatula
+speedboat
+spider web
+spindle
+sports car
+spotlight
+stage
+steam locomotive
+steel arch bridge
+steel drum
+stethoscope
+stole
+stone wall
+stopwatch
+stove
+strainer
+streetcar
+stretcher
+studio couch
+stupa
+submarine
+suit
+sundial
+sunglass
+sunglasses
+sunscreen
+suspension bridge
+swab
+sweatshirt
+swimming trunks
+swing
+switch
+syringe
+table lamp
+tank
+tape player
+teapot
+teddy
+television
+tennis ball
+thatch
+theater curtain
+thimble
+thresher
+throne
+tile roof
+toaster
+tobacco shop
+toilet seat
+torch
+totem pole
+tow truck
+toyshop
+tractor
+trailer truck
+tray
+trench coat
+tricycle
+trimaran
+tripod
+triumphal arch
+trolleybus
+trombone
+tub
+turnstile
+typewriter keyboard
+umbrella
+unicycle
+upright
+vacuum
+vase
+vault
+velvet
+vending machine
+vestment
+viaduct
+violin
+volleyball
+waffle iron
+wall clock
+wallet
+wardrobe
+warplane
+washbasin
+washer
+water bottle
+water jug
+water tower
+whiskey jug
+whistle
+wig
+window screen
+window shade
+Windsor tie
+wine bottle
+wing
+wok
+wooden spoon
+wool
+worm fence
+wreck
+yawl
+yurt
+web site
+comic book
+crossword puzzle
+street sign
+traffic light
+book jacket
+menu
+plate
+guacamole
+consomme
+hot pot
+trifle
+ice cream
+ice lolly
+French loaf
+bagel
+pretzel
+cheeseburger
+hotdog
+mashed potato
+head cabbage
+broccoli
+cauliflower
+zucchini
+spaghetti squash
+acorn squash
+butternut squash
+cucumber
+artichoke
+bell pepper
+cardoon
+mushroom
+Granny Smith
+strawberry
+orange
+lemon
+fig
+pineapple
+banana
+jackfruit
+custard apple
+pomegranate
+hay
+carbonara
+chocolate sauce
+dough
+meat loaf
+pizza
+potpie
+burrito
+red wine
+espresso
+cup
+eggnog
+alp
+bubble
+cliff
+coral reef
+geyser
+lakeside
+promontory
+sandbar
+seashore
+valley
+volcano
+ballplayer
+groom
+scuba diver
+rapeseed
+daisy
+yellow lady's slipper
+corn
+acorn
+hip
+buckeye
+coral fungus
+agaric
+gyromitra
+stinkhorn
+earthstar
+hen-of-the-woods
+bolete
+ear
+toilet tissue
diff --git a/tensorflow/contrib/lite/examples/android/build.gradle b/tensorflow/contrib/lite/examples/android/build.gradle
new file mode 100644
index 0000000000000000000000000000000000000000..0d4de358156a5d139e35cc542b8d36ab24e763b9
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/build.gradle
@@ -0,0 +1,52 @@
+apply plugin: 'com.android.application'
+
+android {
+    compileSdkVersion 26
+    buildToolsVersion "26.0.1"
+    defaultConfig {
+        applicationId "org.tensorflow.lite.demo"
+        minSdkVersion 15
+        targetSdkVersion 26
+        versionCode 1
+        versionName "1.0"
+        testInstrumentationRunner "android.support.test.runner.AndroidJUnitRunner"
+
+        // Remove this block.
+        jackOptions {
+            enabled true
+        }
+    }
+    lintOptions {
+        abortOnError false
+    }
+    buildTypes {
+        release {
+            minifyEnabled false
+            proguardFiles getDefaultProguardFile('proguard-android.txt'), 'proguard-rules.pro'
+        }
+    }
+    aaptOptions {
+        noCompress "tflite"
+    }
+
+    compileOptions {
+        sourceCompatibility JavaVersion.VERSION_1_8
+        targetCompatibility JavaVersion.VERSION_1_8
+    }
+}
+
+repositories {
+    maven {
+        url 'https://google.bintray.com/tensorflow'
+    }
+}
+
+dependencies {
+    compile fileTree(dir: 'libs', include: ['*.jar'])
+    androidTestCompile('com.android.support.test.espresso:espresso-core:2.2.2', {
+        exclude group: 'com.android.support', module: 'support-annotations'
+    })
+    compile 'org.tensorflow:tensorflow-lite:+'
+
+    testCompile 'junit:junit:4.12'
+}
diff --git a/tensorflow/contrib/lite/examples/android/res/animator/color_animation.xml b/tensorflow/contrib/lite/examples/android/res/animator/color_animation.xml
new file mode 100644
index 0000000000000000000000000000000000000000..891d8cc1d4f3e59d0371030fd763c5ad468e7887
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/res/animator/color_animation.xml
@@ -0,0 +1,30 @@
+<?xml version="1.0" encoding="utf-8"?><!--
+ Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<set xmlns:android="http://schemas.android.com/apk/res/android"
+  android:ordering="sequentially">
+  <objectAnimator
+    android:propertyName="backgroundColor"
+    android:duration="375"
+    android:valueFrom="0x00b3ccff"
+    android:valueTo="0xffb3ccff"
+    android:valueType="colorType"/>
+  <objectAnimator
+    android:propertyName="backgroundColor"
+    android:duration="375"
+    android:valueFrom="0xffb3ccff"
+    android:valueTo="0x00b3ccff"
+    android:valueType="colorType"/>
+</set>
diff --git a/tensorflow/contrib/lite/examples/android/res/drawable-hdpi/ic_action_info.png b/tensorflow/contrib/lite/examples/android/res/drawable-hdpi/ic_action_info.png
new file mode 100644
index 0000000000000000000000000000000000000000..32bd1aabcabb85ded957230533c00e735183a323
Binary files /dev/null and b/tensorflow/contrib/lite/examples/android/res/drawable-hdpi/ic_action_info.png differ
diff --git a/tensorflow/contrib/lite/examples/android/res/drawable-hdpi/ic_launcher.png b/tensorflow/contrib/lite/examples/android/res/drawable-hdpi/ic_launcher.png
new file mode 100644
index 0000000000000000000000000000000000000000..b3113cd15c3255405ee34c622a1e83674e6e5487
Binary files /dev/null and b/tensorflow/contrib/lite/examples/android/res/drawable-hdpi/ic_launcher.png differ
diff --git a/tensorflow/contrib/lite/examples/android/res/drawable-hdpi/tile.9.png b/tensorflow/contrib/lite/examples/android/res/drawable-hdpi/tile.9.png
new file mode 100644
index 0000000000000000000000000000000000000000..135862883e26eddce2b19db021adf62e10357ad0
Binary files /dev/null and b/tensorflow/contrib/lite/examples/android/res/drawable-hdpi/tile.9.png differ
diff --git a/tensorflow/contrib/lite/examples/android/res/drawable-mdpi/ic_action_info.png b/tensorflow/contrib/lite/examples/android/res/drawable-mdpi/ic_action_info.png
new file mode 100644
index 0000000000000000000000000000000000000000..8efbbf8b3c44418551699db9388cd77a88362112
Binary files /dev/null and b/tensorflow/contrib/lite/examples/android/res/drawable-mdpi/ic_action_info.png differ
diff --git a/tensorflow/contrib/lite/examples/android/res/drawable-mdpi/ic_launcher.png b/tensorflow/contrib/lite/examples/android/res/drawable-mdpi/ic_launcher.png
new file mode 100644
index 0000000000000000000000000000000000000000..51f87ee6507cebec6bff32b1a03b36ffc711689d
Binary files /dev/null and b/tensorflow/contrib/lite/examples/android/res/drawable-mdpi/ic_launcher.png differ
diff --git a/tensorflow/contrib/lite/examples/android/res/drawable-xhdpi/ic_action_info.png b/tensorflow/contrib/lite/examples/android/res/drawable-xhdpi/ic_action_info.png
new file mode 100644
index 0000000000000000000000000000000000000000..ba143ea7a80f03b0e850775ad672ccb2d6195e4c
Binary files /dev/null and b/tensorflow/contrib/lite/examples/android/res/drawable-xhdpi/ic_action_info.png differ
diff --git a/tensorflow/contrib/lite/examples/android/res/drawable-xhdpi/ic_launcher.png b/tensorflow/contrib/lite/examples/android/res/drawable-xhdpi/ic_launcher.png
new file mode 100644
index 0000000000000000000000000000000000000000..6361d792dacd8ce09a14258878b5ce6db5e0debb
Binary files /dev/null and b/tensorflow/contrib/lite/examples/android/res/drawable-xhdpi/ic_launcher.png differ
diff --git a/tensorflow/contrib/lite/examples/android/res/drawable-xxhdpi/ic_action_info.png b/tensorflow/contrib/lite/examples/android/res/drawable-xxhdpi/ic_action_info.png
new file mode 100644
index 0000000000000000000000000000000000000000..394eb7e534905e36fd24c3defac92c09b403ee39
Binary files /dev/null and b/tensorflow/contrib/lite/examples/android/res/drawable-xxhdpi/ic_action_info.png differ
diff --git a/tensorflow/contrib/lite/examples/android/res/drawable-xxhdpi/ic_launcher.png b/tensorflow/contrib/lite/examples/android/res/drawable-xxhdpi/ic_launcher.png
new file mode 100644
index 0000000000000000000000000000000000000000..2e27bec9785d4d51fe597bced7f04508994aa10c
Binary files /dev/null and b/tensorflow/contrib/lite/examples/android/res/drawable-xxhdpi/ic_launcher.png differ
diff --git a/tensorflow/contrib/lite/examples/android/res/drawable/border.xml b/tensorflow/contrib/lite/examples/android/res/drawable/border.xml
new file mode 100644
index 0000000000000000000000000000000000000000..dd1d64d1d61f359422c79533f726991c78e47d99
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/res/drawable/border.xml
@@ -0,0 +1,19 @@
+<?xml version="1.0" encoding="utf-8"?><!--
+ Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<shape xmlns:android="http://schemas.android.com/apk/res/android" android:shape="rectangle" >
+  <solid android:color="#00000000" />
+  <stroke android:width="1dip" android:color="#cccccc" />
+</shape>
diff --git a/tensorflow/contrib/lite/examples/android/res/layout/activity_camera.xml b/tensorflow/contrib/lite/examples/android/res/layout/activity_camera.xml
new file mode 100644
index 0000000000000000000000000000000000000000..1a22d4b33ebbd755104272863c5cc6c93793b86b
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/res/layout/activity_camera.xml
@@ -0,0 +1,22 @@
+<?xml version="1.0" encoding="utf-8"?><!--
+ Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<FrameLayout xmlns:android="http://schemas.android.com/apk/res/android"
+    xmlns:tools="http://schemas.android.com/tools"
+    android:id="@+id/container"
+    android:layout_width="match_parent"
+    android:layout_height="match_parent"
+    android:background="#000"
+    tools:context="org.tensorflow.demo.CameraActivity" />
diff --git a/tensorflow/contrib/lite/examples/android/res/layout/activity_speech.xml b/tensorflow/contrib/lite/examples/android/res/layout/activity_speech.xml
new file mode 100644
index 0000000000000000000000000000000000000000..2fe1338da57122c7e26c64c653076b6746a25497
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/res/layout/activity_speech.xml
@@ -0,0 +1,55 @@
+<?xml version="1.0" encoding="utf-8"?><!--
+ Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<FrameLayout
+    xmlns:android="http://schemas.android.com/apk/res/android"
+    xmlns:app="http://schemas.android.com/apk/res-auto"
+    xmlns:tools="http://schemas.android.com/tools"
+    android:layout_width="match_parent"
+    android:layout_height="match_parent"
+    tools:context="org.tensorflow.demo.SpeechActivity">
+
+    <TextView
+        android:layout_width="wrap_content"
+        android:layout_height="wrap_content"
+        android:text="Say one of the words below!"
+        android:id="@+id/textView"
+        android:textAlignment="center"
+        android:layout_gravity="top"
+        android:textSize="24dp"
+        android:layout_marginTop="10dp"
+        android:layout_marginLeft="10dp"
+        />
+
+    <ListView
+        android:id="@+id/list_view"
+        android:layout_width="240dp"
+        android:layout_height="wrap_content"
+        android:background="@drawable/border"
+        android:layout_gravity="top|center_horizontal"
+        android:textAlignment="center"
+        android:layout_marginTop="100dp"
+        />
+
+    <Button
+        android:id="@+id/quit"
+        android:layout_width="wrap_content"
+        android:layout_height="wrap_content"
+        android:text="Quit"
+        android:layout_gravity="bottom|center_horizontal"
+        android:layout_marginBottom="10dp"
+        />
+
+</FrameLayout>
diff --git a/tensorflow/contrib/lite/examples/android/res/layout/camera_connection_fragment.xml b/tensorflow/contrib/lite/examples/android/res/layout/camera_connection_fragment.xml
new file mode 100644
index 0000000000000000000000000000000000000000..a1bbdf1702cea79088715d30b8746f7fc8fdac56
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/res/layout/camera_connection_fragment.xml
@@ -0,0 +1,38 @@
+<?xml version="1.0" encoding="utf-8"?><!--
+ Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<RelativeLayout xmlns:android="http://schemas.android.com/apk/res/android"
+    android:layout_width="match_parent"
+    android:layout_height="match_parent">
+
+    <org.tensorflow.demo.AutoFitTextureView
+        android:id="@+id/texture"
+        android:layout_width="wrap_content"
+        android:layout_height="wrap_content"
+        android:layout_alignParentBottom="true" />
+
+    <org.tensorflow.demo.RecognitionScoreView
+        android:id="@+id/results"
+        android:layout_width="match_parent"
+        android:layout_height="112dp"
+        android:layout_alignParentTop="true" />
+
+    <org.tensorflow.demo.OverlayView
+        android:id="@+id/debug_overlay"
+        android:layout_width="match_parent"
+        android:layout_height="match_parent"
+        android:layout_alignParentBottom="true" />
+
+</RelativeLayout>
diff --git a/tensorflow/contrib/lite/examples/android/res/layout/camera_connection_fragment_stylize.xml b/tensorflow/contrib/lite/examples/android/res/layout/camera_connection_fragment_stylize.xml
new file mode 100644
index 0000000000000000000000000000000000000000..1cdb24cab03222934ca2aa326a765150d58aa6a8
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/res/layout/camera_connection_fragment_stylize.xml
@@ -0,0 +1,51 @@
+<?xml version="1.0" encoding="utf-8"?><!--
+ Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<RelativeLayout xmlns:android="http://schemas.android.com/apk/res/android"
+      android:orientation="vertical"
+      android:layout_width="match_parent"
+      android:layout_height="match_parent">
+  <org.tensorflow.demo.AutoFitTextureView
+    android:id="@+id/texture"
+    android:layout_width="wrap_content"
+    android:layout_height="wrap_content"
+    android:layout_alignParentTop="true" />
+
+  <RelativeLayout
+    android:id="@+id/black"
+    android:layout_width="match_parent"
+    android:layout_height="match_parent"
+    android:background="#FF000000" />
+
+  <GridView
+    android:id="@+id/grid_layout"
+    android:numColumns="7"
+    android:stretchMode="columnWidth"
+    android:layout_alignParentBottom="true"
+    android:layout_width="match_parent"
+    android:layout_height="wrap_content" />
+
+  <org.tensorflow.demo.OverlayView
+      android:id="@+id/overlay"
+      android:layout_width="match_parent"
+      android:layout_height="match_parent"
+      android:layout_alignParentTop="true" />
+
+  <org.tensorflow.demo.OverlayView
+    android:id="@+id/debug_overlay"
+    android:layout_width="match_parent"
+    android:layout_height="match_parent"
+    android:layout_alignParentTop="true" />
+</RelativeLayout>
diff --git a/tensorflow/contrib/lite/examples/android/res/layout/camera_connection_fragment_tracking.xml b/tensorflow/contrib/lite/examples/android/res/layout/camera_connection_fragment_tracking.xml
new file mode 100644
index 0000000000000000000000000000000000000000..ca18ea075dbb65d93bc895bc33211a171e52d62e
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/res/layout/camera_connection_fragment_tracking.xml
@@ -0,0 +1,34 @@
+<?xml version="1.0" encoding="utf-8"?><!--
+ Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<FrameLayout xmlns:android="http://schemas.android.com/apk/res/android"
+    android:layout_width="match_parent"
+    android:layout_height="match_parent">
+
+      <org.tensorflow.demo.AutoFitTextureView
+          android:id="@+id/texture"
+          android:layout_width="wrap_content"
+          android:layout_height="wrap_content"/>
+
+      <org.tensorflow.demo.OverlayView
+          android:id="@+id/tracking_overlay"
+          android:layout_width="match_parent"
+          android:layout_height="match_parent"/>
+
+      <org.tensorflow.demo.OverlayView
+          android:id="@+id/debug_overlay"
+          android:layout_width="match_parent"
+          android:layout_height="match_parent"/>
+</FrameLayout>
diff --git a/tensorflow/contrib/lite/examples/android/res/layout/list_text_item.xml b/tensorflow/contrib/lite/examples/android/res/layout/list_text_item.xml
new file mode 100644
index 0000000000000000000000000000000000000000..526017fbb24ecfa6765a21378f1ae0890a97a004
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/res/layout/list_text_item.xml
@@ -0,0 +1,25 @@
+<?xml version="1.0" encoding="utf-8"?><!--
+ Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<TextView
+    xmlns:android="http://schemas.android.com/apk/res/android"
+    android:id="@+id/list_text_item"
+    android:layout_width="match_parent"
+    android:layout_height="wrap_content"
+    android:text="TextView"
+    android:textSize="24dp"
+    android:textAlignment="center"
+    android:gravity="center_horizontal"
+    />
diff --git a/tensorflow/contrib/lite/examples/android/res/values-sw600dp/template-dimens.xml b/tensorflow/contrib/lite/examples/android/res/values-sw600dp/template-dimens.xml
new file mode 100644
index 0000000000000000000000000000000000000000..820eda0e5585284c4b3f2bbaebdfee9d074d4c19
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/res/values-sw600dp/template-dimens.xml
@@ -0,0 +1,24 @@
+<!--
+  Copyright 2013 The TensorFlow Authors. All Rights Reserved.
+
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+  -->
+
+<resources>
+
+    <!-- Semantic definitions -->
+
+    <dimen name="horizontal_page_margin">@dimen/margin_huge</dimen>
+    <dimen name="vertical_page_margin">@dimen/margin_medium</dimen>
+
+</resources>
diff --git a/tensorflow/contrib/lite/examples/android/res/values-sw600dp/template-styles.xml b/tensorflow/contrib/lite/examples/android/res/values-sw600dp/template-styles.xml
new file mode 100644
index 0000000000000000000000000000000000000000..09303314e91eed623b5aca91189372aaea767c9e
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/res/values-sw600dp/template-styles.xml
@@ -0,0 +1,25 @@
+<!--
+  Copyright 2013 The TensorFlow Authors. All Rights Reserved.
+
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+  -->
+
+<resources>
+
+    <style name="Widget.SampleMessage">
+        <item name="android:textAppearance">?android:textAppearanceLarge</item>
+        <item name="android:lineSpacingMultiplier">1.2</item>
+        <item name="android:shadowDy">-6.5</item>
+    </style>
+
+</resources>
diff --git a/tensorflow/contrib/lite/examples/android/res/values-v11/styles.xml b/tensorflow/contrib/lite/examples/android/res/values-v11/styles.xml
new file mode 100644
index 0000000000000000000000000000000000000000..c2d1babc121ec5680f85e9d8b6a8f65f8fefbb6e
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/res/values-v11/styles.xml
@@ -0,0 +1,24 @@
+<?xml version="1.0" encoding="utf-8"?>
+<resources>
+
+  <!--
+        Base application theme for API 11+. This theme completely replaces
+        AppBaseTheme from res/values/styles.xml on API 11+ devices.
+  -->
+  <style name="AppBaseTheme" parent="android:Theme.Holo.Light">
+    <!-- API 11 theme customizations can go here. -->
+  </style>
+
+  <style name="FullscreenTheme" parent="android:Theme.Holo">
+    <item name="android:actionBarStyle">@style/FullscreenActionBarStyle</item>
+    <item name="android:windowActionBarOverlay">true</item>
+    <item name="android:windowBackground">@null</item>
+    <item name="metaButtonBarStyle">?android:attr/buttonBarStyle</item>
+    <item name="metaButtonBarButtonStyle">?android:attr/buttonBarButtonStyle</item>
+  </style>
+
+  <style name="FullscreenActionBarStyle" parent="android:Widget.Holo.ActionBar">
+    <!--  <item name="android:background">@color/black_overlay</item>  -->
+  </style>
+
+</resources>
diff --git a/tensorflow/contrib/lite/examples/android/res/values-v11/template-styles.xml b/tensorflow/contrib/lite/examples/android/res/values-v11/template-styles.xml
new file mode 100644
index 0000000000000000000000000000000000000000..1ad048439cf1c5207a609d4664674e9a4278ee6c
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/res/values-v11/template-styles.xml
@@ -0,0 +1,22 @@
+<!--
+  Copyright 2013 The TensorFlow Authors. All Rights Reserved.
+
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+  -->
+
+<resources>
+
+    <!-- Activity themes -->
+    <style name="Theme.Base" parent="android:Theme.Holo.Light" />
+
+</resources>
diff --git a/tensorflow/contrib/lite/examples/android/res/values-v14/styles.xml b/tensorflow/contrib/lite/examples/android/res/values-v14/styles.xml
new file mode 100644
index 0000000000000000000000000000000000000000..cc370849c0f90627283345bcfa03d0bb0b40e1b2
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/res/values-v14/styles.xml
@@ -0,0 +1,12 @@
+<resources>
+
+  <!--
+        Base application theme for API 14+. This theme completely replaces
+        AppBaseTheme from BOTH res/values/styles.xml and
+        res/values-v11/styles.xml on API 14+ devices.
+  -->
+  <style name="AppBaseTheme" parent="android:Theme.Holo.Light.DarkActionBar">
+    <!-- API 14 theme customizations can go here. -->
+  </style>
+
+</resources>
diff --git a/tensorflow/contrib/lite/examples/android/res/values-v21/base-colors.xml b/tensorflow/contrib/lite/examples/android/res/values-v21/base-colors.xml
new file mode 100644
index 0000000000000000000000000000000000000000..c16da7c51ceeb3e634c349ff098f86eccb53b8f8
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/res/values-v21/base-colors.xml
@@ -0,0 +1,21 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Copyright 2013 The TensorFlow Authors. All Rights Reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<resources>
+
+
+</resources>
diff --git a/tensorflow/contrib/lite/examples/android/res/values-v21/base-template-styles.xml b/tensorflow/contrib/lite/examples/android/res/values-v21/base-template-styles.xml
new file mode 100644
index 0000000000000000000000000000000000000000..8890d2f4a507e30c28457ea9692f03af5834c82f
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/res/values-v21/base-template-styles.xml
@@ -0,0 +1,24 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<resources>
+
+    <!-- Activity themes -->
+    <style name="Theme.Base" parent="android:Theme.Material.Light">
+    </style>
+
+</resources>
diff --git a/tensorflow/contrib/lite/examples/android/res/values/attrs.xml b/tensorflow/contrib/lite/examples/android/res/values/attrs.xml
new file mode 100644
index 0000000000000000000000000000000000000000..56e5beae76f2a148c147d599fe0e02bd78a5f729
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/res/values/attrs.xml
@@ -0,0 +1,14 @@
+<resources>
+
+  <!--
+         Declare custom theme attributes that allow changing which styles are
+         used for button bars depending on the API level.
+         ?android:attr/buttonBarStyle is new as of API 11 so this is
+         necessary to support previous API levels.
+  -->
+  <declare-styleable name="ButtonBarContainerTheme">
+    <attr name="metaButtonBarStyle" format="reference" />
+    <attr name="metaButtonBarButtonStyle" format="reference" />
+  </declare-styleable>
+
+</resources>
diff --git a/tensorflow/contrib/lite/examples/android/res/values/base-strings.xml b/tensorflow/contrib/lite/examples/android/res/values/base-strings.xml
new file mode 100644
index 0000000000000000000000000000000000000000..ebc5dc8423ca6a481bbfcfabcbcd66e4367428eb
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/res/values/base-strings.xml
@@ -0,0 +1,23 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<resources>
+    <string name="app_name">TFLite Demo</string>
+    <string name="activity_name_classification">TFL Classify</string>
+    <string name="activity_name_detection">TFL Detect</string>
+    <string name="activity_name_speech">TFL Speech</string>
+</resources>
diff --git a/tensorflow/contrib/lite/examples/android/res/values/colors.xml b/tensorflow/contrib/lite/examples/android/res/values/colors.xml
new file mode 100644
index 0000000000000000000000000000000000000000..584ed6052d4746bce5b60fef0b25633777262a11
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/res/values/colors.xml
@@ -0,0 +1,19 @@
+<?xml version="1.0" encoding="utf-8"?>
+<!--
+ Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<resources>
+    <color name="control_background">#cc4285f4</color>
+</resources>
diff --git a/tensorflow/contrib/lite/examples/android/res/values/strings.xml b/tensorflow/contrib/lite/examples/android/res/values/strings.xml
new file mode 100644
index 0000000000000000000000000000000000000000..ea20ee78e0b99c0ad7f1c315269a7fd5435cff98
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/res/values/strings.xml
@@ -0,0 +1,20 @@
+<?xml version="1.0" encoding="utf-8"?><!--
+ Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<resources>
+    <string name="description_info">Info</string>
+    <string name="request_permission">This sample needs camera permission.</string>
+    <string name="camera_error">This device doesn\'t support Camera2 API.</string>
+</resources>
diff --git a/tensorflow/contrib/lite/examples/android/res/values/styles.xml b/tensorflow/contrib/lite/examples/android/res/values/styles.xml
new file mode 100644
index 0000000000000000000000000000000000000000..dd1d973e9be8c82b68e39f755650efec71d95005
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/res/values/styles.xml
@@ -0,0 +1,18 @@
+<?xml version="1.0" encoding="utf-8"?><!--
+ Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<resources>
+    <style name="MaterialTheme" parent="android:Theme.Material.Light.NoActionBar.Fullscreen" />
+</resources>
diff --git a/tensorflow/contrib/lite/examples/android/res/values/template-dimens.xml b/tensorflow/contrib/lite/examples/android/res/values/template-dimens.xml
new file mode 100644
index 0000000000000000000000000000000000000000..069977b6a6f4c9d14ed859d4e8dd95d42f7ce74f
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/res/values/template-dimens.xml
@@ -0,0 +1,32 @@
+<!--
+  Copyright 2013 The TensorFlow Authors. All Rights Reserved.
+
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+  -->
+
+<resources>
+
+    <!-- Define standard dimensions to comply with Holo-style grids and rhythm. -->
+
+    <dimen name="margin_tiny">4dp</dimen>
+    <dimen name="margin_small">8dp</dimen>
+    <dimen name="margin_medium">16dp</dimen>
+    <dimen name="margin_large">32dp</dimen>
+    <dimen name="margin_huge">64dp</dimen>
+
+    <!-- Semantic definitions -->
+
+    <dimen name="horizontal_page_margin">@dimen/margin_medium</dimen>
+    <dimen name="vertical_page_margin">@dimen/margin_medium</dimen>
+
+</resources>
diff --git a/tensorflow/contrib/lite/examples/android/res/values/template-styles.xml b/tensorflow/contrib/lite/examples/android/res/values/template-styles.xml
new file mode 100644
index 0000000000000000000000000000000000000000..1b87714a49409a40d3a4649d83f0d0ff0fd57b9d
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/res/values/template-styles.xml
@@ -0,0 +1,42 @@
+<!--
+  Copyright 2013 The TensorFlow Authors. All Rights Reserved.
+
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+  -->
+
+<resources>
+
+    <!-- Activity themes -->
+
+    <style name="Theme.Base" parent="android:Theme.Light" />
+
+    <style name="Theme.Sample" parent="Theme.Base" />
+
+    <style name="AppTheme" parent="Theme.Sample" />
+    <!-- Widget styling -->
+
+    <style name="Widget" />
+
+    <style name="Widget.SampleMessage">
+        <item name="android:textAppearance">?android:textAppearanceMedium</item>
+        <item name="android:lineSpacingMultiplier">1.1</item>
+    </style>
+
+    <style name="Widget.SampleMessageTile">
+        <item name="android:background">@drawable/tile</item>
+        <item name="android:shadowColor">#7F000000</item>
+        <item name="android:shadowDy">-3.5</item>
+        <item name="android:shadowRadius">2</item>
+    </style>
+
+</resources>
diff --git a/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/AutoFitTextureView.java b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/AutoFitTextureView.java
new file mode 100644
index 0000000000000000000000000000000000000000..eff24afdba44e5d56c760c1692df5fc40f5c2f42
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/AutoFitTextureView.java
@@ -0,0 +1,74 @@
+/*
+ * Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *       http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.tensorflow.demo;
+
+import android.content.Context;
+import android.util.AttributeSet;
+import android.view.TextureView;
+
+/**
+ * A {@link TextureView} that can be adjusted to a specified aspect ratio.
+ */
+public class AutoFitTextureView extends TextureView {
+  private int ratioWidth = 0;
+  private int ratioHeight = 0;
+
+  public AutoFitTextureView(final Context context) {
+    this(context, null);
+  }
+
+  public AutoFitTextureView(final Context context, final AttributeSet attrs) {
+    this(context, attrs, 0);
+  }
+
+  public AutoFitTextureView(final Context context, final AttributeSet attrs, final int defStyle) {
+    super(context, attrs, defStyle);
+  }
+
+  /**
+   * Sets the aspect ratio for this view. The size of the view will be measured based on the ratio
+   * calculated from the parameters. Note that the actual sizes of parameters don't matter, that
+   * is, calling setAspectRatio(2, 3) and setAspectRatio(4, 6) make the same result.
+   *
+   * @param width  Relative horizontal size
+   * @param height Relative vertical size
+   */
+  public void setAspectRatio(final int width, final int height) {
+    if (width < 0 || height < 0) {
+      throw new IllegalArgumentException("Size cannot be negative.");
+    }
+    ratioWidth = width;
+    ratioHeight = height;
+    requestLayout();
+  }
+
+  @Override
+  protected void onMeasure(final int widthMeasureSpec, final int heightMeasureSpec) {
+    super.onMeasure(widthMeasureSpec, heightMeasureSpec);
+    final int width = MeasureSpec.getSize(widthMeasureSpec);
+    final int height = MeasureSpec.getSize(heightMeasureSpec);
+    if (0 == ratioWidth || 0 == ratioHeight) {
+      setMeasuredDimension(width, height);
+    } else {
+      if (width < height * ratioWidth / ratioHeight) {
+        setMeasuredDimension(width, width * ratioHeight / ratioWidth);
+      } else {
+        setMeasuredDimension(height * ratioWidth / ratioHeight, height);
+      }
+    }
+  }
+}
diff --git a/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/CameraActivity.java b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/CameraActivity.java
new file mode 100644
index 0000000000000000000000000000000000000000..15d5456f027b27ae3cbb93f736dbb104af0218de
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/CameraActivity.java
@@ -0,0 +1,450 @@
+/*
+ * Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *       http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.tensorflow.demo;
+
+import android.Manifest;
+import android.app.Activity;
+import android.app.Fragment;
+import android.content.Context;
+import android.content.pm.PackageManager;
+import android.hardware.Camera;
+import android.hardware.camera2.CameraAccessException;
+import android.hardware.camera2.CameraCharacteristics;
+import android.hardware.camera2.CameraManager;
+import android.hardware.camera2.params.StreamConfigurationMap;
+import android.media.Image;
+import android.media.Image.Plane;
+import android.media.ImageReader;
+import android.media.ImageReader.OnImageAvailableListener;
+import android.os.Build;
+import android.os.Bundle;
+import android.os.Handler;
+import android.os.HandlerThread;
+import android.os.Trace;
+import android.util.Size;
+import android.view.KeyEvent;
+import android.view.Surface;
+import android.view.WindowManager;
+import android.widget.Toast;
+import java.nio.ByteBuffer;
+import org.tensorflow.demo.env.ImageUtils;
+import org.tensorflow.demo.env.Logger;
+import org.tensorflow.lite.demo.R; // Explicit import needed for internal Google builds.
+
+public abstract class CameraActivity extends Activity
+    implements OnImageAvailableListener, Camera.PreviewCallback {
+  private static final Logger LOGGER = new Logger();
+
+  private static final int PERMISSIONS_REQUEST = 1;
+
+  private static final String PERMISSION_CAMERA = Manifest.permission.CAMERA;
+  private static final String PERMISSION_STORAGE = Manifest.permission.WRITE_EXTERNAL_STORAGE;
+
+  private boolean debug = false;
+
+  private Handler handler;
+  private HandlerThread handlerThread;
+  private boolean useCamera2API;
+  private boolean isProcessingFrame = false;
+  private byte[][] yuvBytes = new byte[3][];
+  private int[] rgbBytes = null;
+  private int yRowStride;
+
+  protected int previewWidth = 0;
+  protected int previewHeight = 0;
+
+  private Runnable postInferenceCallback;
+  private Runnable imageConverter;
+
+  @Override
+  protected void onCreate(final Bundle savedInstanceState) {
+    LOGGER.d("onCreate " + this);
+    super.onCreate(null);
+    getWindow().addFlags(WindowManager.LayoutParams.FLAG_KEEP_SCREEN_ON);
+
+    setContentView(R.layout.activity_camera);
+
+    if (hasPermission()) {
+      setFragment();
+    } else {
+      requestPermission();
+    }
+  }
+
+
+  protected int[] getRgbBytes() {
+    imageConverter.run();
+    return rgbBytes;
+  }
+
+  protected int getLuminanceStride() {
+    return yRowStride;
+  }
+
+  protected byte[] getLuminance() {
+    return yuvBytes[0];
+  }
+
+  /**
+   * Callback for android.hardware.Camera API
+   */
+  @Override
+  public void onPreviewFrame(final byte[] bytes, final Camera camera) {
+    if (isProcessingFrame) {
+      LOGGER.w("Dropping frame!");
+      return;
+    }
+
+    try {
+      // Initialize the storage bitmaps once when the resolution is known.
+      if (rgbBytes == null) {
+        Camera.Size previewSize = camera.getParameters().getPreviewSize();
+        previewHeight = previewSize.height;
+        previewWidth = previewSize.width;
+        rgbBytes = new int[previewWidth * previewHeight];
+        onPreviewSizeChosen(new Size(previewSize.width, previewSize.height), 90);
+      }
+    } catch (final Exception e) {
+      LOGGER.e(e, "Exception!");
+      return;
+    }
+
+    isProcessingFrame = true;
+    yuvBytes[0] = bytes;
+    yRowStride = previewWidth;
+
+    imageConverter =
+        new Runnable() {
+          @Override
+          public void run() {
+            ImageUtils.convertYUV420SPToARGB8888(bytes, previewWidth, previewHeight, rgbBytes);
+          }
+        };
+
+    postInferenceCallback =
+        new Runnable() {
+          @Override
+          public void run() {
+            camera.addCallbackBuffer(bytes);
+            isProcessingFrame = false;
+          }
+        };
+    processImage();
+  }
+
+  /**
+   * Callback for Camera2 API
+   */
+  @Override
+  public void onImageAvailable(final ImageReader reader) {
+    //We need wait until we have some size from onPreviewSizeChosen
+    if (previewWidth == 0 || previewHeight == 0) {
+      return;
+    }
+    if (rgbBytes == null) {
+      rgbBytes = new int[previewWidth * previewHeight];
+    }
+    try {
+      final Image image = reader.acquireLatestImage();
+
+      if (image == null) {
+        return;
+      }
+
+      if (isProcessingFrame) {
+        image.close();
+        return;
+      }
+      isProcessingFrame = true;
+      Trace.beginSection("imageAvailable");
+      final Plane[] planes = image.getPlanes();
+      fillBytes(planes, yuvBytes);
+      yRowStride = planes[0].getRowStride();
+      final int uvRowStride = planes[1].getRowStride();
+      final int uvPixelStride = planes[1].getPixelStride();
+
+      imageConverter =
+          new Runnable() {
+            @Override
+            public void run() {
+              ImageUtils.convertYUV420ToARGB8888(
+                  yuvBytes[0],
+                  yuvBytes[1],
+                  yuvBytes[2],
+                  previewWidth,
+                  previewHeight,
+                  yRowStride,
+                  uvRowStride,
+                  uvPixelStride,
+                  rgbBytes);
+            }
+          };
+
+      postInferenceCallback =
+          new Runnable() {
+            @Override
+            public void run() {
+              image.close();
+              isProcessingFrame = false;
+            }
+          };
+
+      processImage();
+    } catch (final Exception e) {
+      LOGGER.e(e, "Exception!");
+      Trace.endSection();
+      return;
+    }
+    Trace.endSection();
+  }
+
+  @Override
+  public synchronized void onStart() {
+    LOGGER.d("onStart " + this);
+    super.onStart();
+  }
+
+  @Override
+  public synchronized void onResume() {
+    LOGGER.d("onResume " + this);
+    super.onResume();
+
+    handlerThread = new HandlerThread("inference");
+    handlerThread.start();
+    handler = new Handler(handlerThread.getLooper());
+  }
+
+  @Override
+  public synchronized void onPause() {
+    LOGGER.d("onPause " + this);
+
+    if (!isFinishing()) {
+      LOGGER.d("Requesting finish");
+      finish();
+    }
+
+    handlerThread.quitSafely();
+    try {
+      handlerThread.join();
+      handlerThread = null;
+      handler = null;
+    } catch (final InterruptedException e) {
+      LOGGER.e(e, "Exception!");
+    }
+
+    super.onPause();
+  }
+
+  @Override
+  public synchronized void onStop() {
+    LOGGER.d("onStop " + this);
+    super.onStop();
+  }
+
+  @Override
+  public synchronized void onDestroy() {
+    LOGGER.d("onDestroy " + this);
+    super.onDestroy();
+  }
+
+  protected synchronized void runInBackground(final Runnable r) {
+    if (handler != null) {
+      handler.post(r);
+    }
+  }
+
+  @Override
+  public void onRequestPermissionsResult(
+      final int requestCode, final String[] permissions, final int[] grantResults) {
+    if (requestCode == PERMISSIONS_REQUEST) {
+      if (grantResults.length > 0
+          && grantResults[0] == PackageManager.PERMISSION_GRANTED
+          && grantResults[1] == PackageManager.PERMISSION_GRANTED) {
+        setFragment();
+      } else {
+        requestPermission();
+      }
+    }
+  }
+
+  private boolean hasPermission() {
+    if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.M) {
+      return checkSelfPermission(PERMISSION_CAMERA) == PackageManager.PERMISSION_GRANTED &&
+          checkSelfPermission(PERMISSION_STORAGE) == PackageManager.PERMISSION_GRANTED;
+    } else {
+      return true;
+    }
+  }
+
+  private void requestPermission() {
+    if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.M) {
+      if (shouldShowRequestPermissionRationale(PERMISSION_CAMERA) ||
+          shouldShowRequestPermissionRationale(PERMISSION_STORAGE)) {
+        Toast.makeText(CameraActivity.this,
+            "Camera AND storage permission are required for this demo", Toast.LENGTH_LONG).show();
+      }
+      requestPermissions(new String[] {PERMISSION_CAMERA, PERMISSION_STORAGE}, PERMISSIONS_REQUEST);
+    }
+  }
+
+  // Returns true if the device supports the required hardware level, or better.
+  private boolean isHardwareLevelSupported(
+      CameraCharacteristics characteristics, int requiredLevel) {
+    int deviceLevel = characteristics.get(CameraCharacteristics.INFO_SUPPORTED_HARDWARE_LEVEL);
+    if (deviceLevel == CameraCharacteristics.INFO_SUPPORTED_HARDWARE_LEVEL_LEGACY) {
+      return requiredLevel == deviceLevel;
+    }
+    // deviceLevel is not LEGACY, can use numerical sort
+    return requiredLevel <= deviceLevel;
+  }
+
+  private String chooseCamera() {
+    final CameraManager manager = (CameraManager) getSystemService(Context.CAMERA_SERVICE);
+    try {
+      for (final String cameraId : manager.getCameraIdList()) {
+        final CameraCharacteristics characteristics = manager.getCameraCharacteristics(cameraId);
+
+        // We don't use a front facing camera in this sample.
+        final Integer facing = characteristics.get(CameraCharacteristics.LENS_FACING);
+        if (facing != null && facing == CameraCharacteristics.LENS_FACING_FRONT) {
+          continue;
+        }
+
+        final StreamConfigurationMap map =
+            characteristics.get(CameraCharacteristics.SCALER_STREAM_CONFIGURATION_MAP);
+
+        if (map == null) {
+          continue;
+        }
+
+        // Fallback to camera1 API for internal cameras that don't have full support.
+        // This should help with legacy situations where using the camera2 API causes
+        // distorted or otherwise broken previews.
+        useCamera2API = (facing == CameraCharacteristics.LENS_FACING_EXTERNAL)
+            || isHardwareLevelSupported(characteristics, 
+                                        CameraCharacteristics.INFO_SUPPORTED_HARDWARE_LEVEL_FULL);
+        LOGGER.i("Camera API lv2?: %s", useCamera2API);
+        return cameraId;
+      }
+    } catch (CameraAccessException e) {
+      LOGGER.e(e, "Not allowed to access camera");
+    }
+
+    return null;
+  }
+
+  protected void setFragment() {
+    String cameraId = chooseCamera();
+
+    Fragment fragment;
+    if (useCamera2API) {
+      CameraConnectionFragment camera2Fragment =
+          CameraConnectionFragment.newInstance(
+              new CameraConnectionFragment.ConnectionCallback() {
+                @Override
+                public void onPreviewSizeChosen(final Size size, final int rotation) {
+                  previewHeight = size.getHeight();
+                  previewWidth = size.getWidth();
+                  CameraActivity.this.onPreviewSizeChosen(size, rotation);
+                }
+              },
+              this,
+              getLayoutId(),
+              getDesiredPreviewFrameSize());
+
+      camera2Fragment.setCamera(cameraId);
+      fragment = camera2Fragment;
+    } else {
+      fragment =
+          new LegacyCameraConnectionFragment(this, getLayoutId(), getDesiredPreviewFrameSize());
+    }
+
+    getFragmentManager()
+        .beginTransaction()
+        .replace(R.id.container, fragment)
+        .commit();
+  }
+
+  protected void fillBytes(final Plane[] planes, final byte[][] yuvBytes) {
+    // Because of the variable row stride it's not possible to know in
+    // advance the actual necessary dimensions of the yuv planes.
+    for (int i = 0; i < planes.length; ++i) {
+      final ByteBuffer buffer = planes[i].getBuffer();
+      if (yuvBytes[i] == null) {
+        LOGGER.d("Initializing buffer %d at size %d", i, buffer.capacity());
+        yuvBytes[i] = new byte[buffer.capacity()];
+      }
+      buffer.get(yuvBytes[i]);
+    }
+  }
+
+  public boolean isDebug() {
+    return debug;
+  }
+
+  public void requestRender() {
+    final OverlayView overlay = (OverlayView) findViewById(R.id.debug_overlay);
+    if (overlay != null) {
+      overlay.postInvalidate();
+    }
+  }
+
+  public void addCallback(final OverlayView.DrawCallback callback) {
+    final OverlayView overlay = (OverlayView) findViewById(R.id.debug_overlay);
+    if (overlay != null) {
+      overlay.addCallback(callback);
+    }
+  }
+
+  public void onSetDebug(final boolean debug) {}
+
+  @Override
+  public boolean onKeyDown(final int keyCode, final KeyEvent event) {
+    if (keyCode == KeyEvent.KEYCODE_VOLUME_DOWN || keyCode == KeyEvent.KEYCODE_VOLUME_UP) {
+      debug = !debug;
+      requestRender();
+      onSetDebug(debug);
+      return true;
+    }
+    return super.onKeyDown(keyCode, event);
+  }
+
+  protected void readyForNextImage() {
+    if (postInferenceCallback != null) {
+      postInferenceCallback.run();
+    }
+  }
+
+  protected int getScreenOrientation() {
+    switch (getWindowManager().getDefaultDisplay().getRotation()) {
+      case Surface.ROTATION_270:
+        return 270;
+      case Surface.ROTATION_180:
+        return 180;
+      case Surface.ROTATION_90:
+        return 90;
+      default:
+        return 0;
+    }
+  }
+
+  protected abstract void processImage();
+
+  protected abstract void onPreviewSizeChosen(final Size size, final int rotation);
+  protected abstract int getLayoutId();
+  protected abstract Size getDesiredPreviewFrameSize();
+}
diff --git a/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/CameraConnectionFragment.java b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/CameraConnectionFragment.java
new file mode 100644
index 0000000000000000000000000000000000000000..51a1adb538e48cad4807d35f3efc6feefe81309b
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/CameraConnectionFragment.java
@@ -0,0 +1,634 @@
+/*
+ * Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *       http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.tensorflow.demo;
+
+import android.app.Activity;
+import android.app.AlertDialog;
+import android.app.Dialog;
+import android.app.DialogFragment;
+import android.app.Fragment;
+import android.content.Context;
+import android.content.DialogInterface;
+import android.content.res.Configuration;
+import android.graphics.ImageFormat;
+import android.graphics.Matrix;
+import android.graphics.RectF;
+import android.graphics.SurfaceTexture;
+import android.hardware.camera2.CameraAccessException;
+import android.hardware.camera2.CameraCaptureSession;
+import android.hardware.camera2.CameraCharacteristics;
+import android.hardware.camera2.CameraDevice;
+import android.hardware.camera2.CameraManager;
+import android.hardware.camera2.CaptureRequest;
+import android.hardware.camera2.CaptureResult;
+import android.hardware.camera2.TotalCaptureResult;
+import android.hardware.camera2.params.StreamConfigurationMap;
+import android.media.ImageReader;
+import android.media.ImageReader.OnImageAvailableListener;
+import android.os.Bundle;
+import android.os.Handler;
+import android.os.HandlerThread;
+import android.text.TextUtils;
+import android.util.Size;
+import android.util.SparseIntArray;
+import android.view.LayoutInflater;
+import android.view.Surface;
+import android.view.TextureView;
+import android.view.View;
+import android.view.ViewGroup;
+import android.widget.Toast;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.List;
+import java.util.concurrent.Semaphore;
+import java.util.concurrent.TimeUnit;
+import org.tensorflow.demo.env.Logger;
+import org.tensorflow.lite.demo.R; // Explicit import needed for internal Google builds.
+
+public class CameraConnectionFragment extends Fragment {
+  private static final Logger LOGGER = new Logger();
+
+  /**
+   * The camera preview size will be chosen to be the smallest frame by pixel size capable of
+   * containing a DESIRED_SIZE x DESIRED_SIZE square.
+   */
+  private static final int MINIMUM_PREVIEW_SIZE = 320;
+
+  /**
+   * Conversion from screen rotation to JPEG orientation.
+   */
+  private static final SparseIntArray ORIENTATIONS = new SparseIntArray();
+  private static final String FRAGMENT_DIALOG = "dialog";
+
+  static {
+    ORIENTATIONS.append(Surface.ROTATION_0, 90);
+    ORIENTATIONS.append(Surface.ROTATION_90, 0);
+    ORIENTATIONS.append(Surface.ROTATION_180, 270);
+    ORIENTATIONS.append(Surface.ROTATION_270, 180);
+  }
+
+  /**
+   * {@link android.view.TextureView.SurfaceTextureListener} handles several lifecycle events on a
+   * {@link TextureView}.
+   */
+  private final TextureView.SurfaceTextureListener surfaceTextureListener =
+      new TextureView.SurfaceTextureListener() {
+        @Override
+        public void onSurfaceTextureAvailable(
+            final SurfaceTexture texture, final int width, final int height) {
+          openCamera(width, height);
+        }
+
+        @Override
+        public void onSurfaceTextureSizeChanged(
+            final SurfaceTexture texture, final int width, final int height) {
+          configureTransform(width, height);
+        }
+
+        @Override
+        public boolean onSurfaceTextureDestroyed(final SurfaceTexture texture) {
+          return true;
+        }
+
+        @Override
+        public void onSurfaceTextureUpdated(final SurfaceTexture texture) {}
+      };
+
+  /**
+   * Callback for Activities to use to initialize their data once the
+   * selected preview size is known.
+   */
+  public interface ConnectionCallback {
+    void onPreviewSizeChosen(Size size, int cameraRotation);
+  }
+
+  /**
+   * ID of the current {@link CameraDevice}.
+   */
+  private String cameraId;
+
+  /**
+   * An {@link AutoFitTextureView} for camera preview.
+   */
+  private AutoFitTextureView textureView;
+
+  /**
+   * A {@link CameraCaptureSession } for camera preview.
+   */
+  private CameraCaptureSession captureSession;
+
+  /**
+   * A reference to the opened {@link CameraDevice}.
+   */
+  private CameraDevice cameraDevice;
+
+  /**
+   * The rotation in degrees of the camera sensor from the display.
+   */
+  private Integer sensorOrientation;
+
+  /**
+   * The {@link android.util.Size} of camera preview.
+   */
+  private Size previewSize;
+
+  /**
+   * {@link android.hardware.camera2.CameraDevice.StateCallback}
+   * is called when {@link CameraDevice} changes its state.
+   */
+  private final CameraDevice.StateCallback stateCallback =
+      new CameraDevice.StateCallback() {
+        @Override
+        public void onOpened(final CameraDevice cd) {
+          // This method is called when the camera is opened.  We start camera preview here.
+          cameraOpenCloseLock.release();
+          cameraDevice = cd;
+          createCameraPreviewSession();
+        }
+
+        @Override
+        public void onDisconnected(final CameraDevice cd) {
+          cameraOpenCloseLock.release();
+          cd.close();
+          cameraDevice = null;
+        }
+
+        @Override
+        public void onError(final CameraDevice cd, final int error) {
+          cameraOpenCloseLock.release();
+          cd.close();
+          cameraDevice = null;
+          final Activity activity = getActivity();
+          if (null != activity) {
+            activity.finish();
+          }
+        }
+      };
+
+  /**
+   * An additional thread for running tasks that shouldn't block the UI.
+   */
+  private HandlerThread backgroundThread;
+
+  /**
+   * A {@link Handler} for running tasks in the background.
+   */
+  private Handler backgroundHandler;
+
+  /**
+   * An {@link ImageReader} that handles preview frame capture.
+   */
+  private ImageReader previewReader;
+
+  /**
+   * {@link android.hardware.camera2.CaptureRequest.Builder} for the camera preview
+   */
+  private CaptureRequest.Builder previewRequestBuilder;
+
+  /**
+   * {@link CaptureRequest} generated by {@link #previewRequestBuilder}
+   */
+  private CaptureRequest previewRequest;
+
+  /**
+   * A {@link Semaphore} to prevent the app from exiting before closing the camera.
+   */
+  private final Semaphore cameraOpenCloseLock = new Semaphore(1);
+
+  /**
+   * A {@link OnImageAvailableListener} to receive frames as they are available.
+   */
+  private final OnImageAvailableListener imageListener;
+
+  /** The input size in pixels desired by TensorFlow (width and height of a square bitmap). */
+  private final Size inputSize;
+
+  /**
+   * The layout identifier to inflate for this Fragment.
+   */
+  private final int layout;
+
+
+  private final ConnectionCallback cameraConnectionCallback;
+
+  private CameraConnectionFragment(
+      final ConnectionCallback connectionCallback,
+      final OnImageAvailableListener imageListener,
+      final int layout,
+      final Size inputSize) {
+    this.cameraConnectionCallback = connectionCallback;
+    this.imageListener = imageListener;
+    this.layout = layout;
+    this.inputSize = inputSize;
+  }
+
+  /**
+   * Shows a {@link Toast} on the UI thread.
+   *
+   * @param text The message to show
+   */
+  private void showToast(final String text) {
+    final Activity activity = getActivity();
+    if (activity != null) {
+      activity.runOnUiThread(
+          new Runnable() {
+            @Override
+            public void run() {
+              Toast.makeText(activity, text, Toast.LENGTH_SHORT).show();
+            }
+          });
+    }
+  }
+
+  /**
+   * Given {@code choices} of {@code Size}s supported by a camera, chooses the smallest one whose
+   * width and height are at least as large as the minimum of both, or an exact match if possible.
+   *
+   * @param choices The list of sizes that the camera supports for the intended output class
+   * @param width The minimum desired width
+   * @param height The minimum desired height
+   * @return The optimal {@code Size}, or an arbitrary one if none were big enough
+   */
+  protected static Size chooseOptimalSize(final Size[] choices, final int width, final int height) {
+    final int minSize = Math.max(Math.min(width, height), MINIMUM_PREVIEW_SIZE);
+    final Size desiredSize = new Size(width, height);
+
+    // Collect the supported resolutions that are at least as big as the preview Surface
+    boolean exactSizeFound = false;
+    final List<Size> bigEnough = new ArrayList<Size>();
+    final List<Size> tooSmall = new ArrayList<Size>();
+    for (final Size option : choices) {
+      if (option.equals(desiredSize)) {
+        // Set the size but don't return yet so that remaining sizes will still be logged.
+        exactSizeFound = true;
+      }
+
+      if (option.getHeight() >= minSize && option.getWidth() >= minSize) {
+        bigEnough.add(option);
+      } else {
+        tooSmall.add(option);
+      }
+    }
+
+    LOGGER.i("Desired size: " + desiredSize + ", min size: " + minSize + "x" + minSize);
+    LOGGER.i("Valid preview sizes: [" + TextUtils.join(", ", bigEnough) + "]");
+    LOGGER.i("Rejected preview sizes: [" + TextUtils.join(", ", tooSmall) + "]");
+
+    if (exactSizeFound) {
+      LOGGER.i("Exact size match found.");
+      return desiredSize;
+    }
+
+    // Pick the smallest of those, assuming we found any
+    if (bigEnough.size() > 0) {
+      final Size chosenSize = Collections.min(bigEnough, new CompareSizesByArea());
+      LOGGER.i("Chosen size: " + chosenSize.getWidth() + "x" + chosenSize.getHeight());
+      return chosenSize;
+    } else {
+      LOGGER.e("Couldn't find any suitable preview size");
+      return choices[0];
+    }
+  }
+
+  public static CameraConnectionFragment newInstance(
+      final ConnectionCallback callback,
+      final OnImageAvailableListener imageListener,
+      final int layout,
+      final Size inputSize) {
+    return new CameraConnectionFragment(callback, imageListener, layout, inputSize);
+  }
+
+  @Override
+  public View onCreateView(
+      final LayoutInflater inflater, final ViewGroup container, final Bundle savedInstanceState) {
+    return inflater.inflate(layout, container, false);
+  }
+
+  @Override
+  public void onViewCreated(final View view, final Bundle savedInstanceState) {
+    textureView = (AutoFitTextureView) view.findViewById(R.id.texture);
+  }
+
+  @Override
+  public void onActivityCreated(final Bundle savedInstanceState) {
+    super.onActivityCreated(savedInstanceState);
+  }
+
+  @Override
+  public void onResume() {
+    super.onResume();
+    startBackgroundThread();
+
+    // When the screen is turned off and turned back on, the SurfaceTexture is already
+    // available, and "onSurfaceTextureAvailable" will not be called. In that case, we can open
+    // a camera and start preview from here (otherwise, we wait until the surface is ready in
+    // the SurfaceTextureListener).
+    if (textureView.isAvailable()) {
+      openCamera(textureView.getWidth(), textureView.getHeight());
+    } else {
+      textureView.setSurfaceTextureListener(surfaceTextureListener);
+    }
+  }
+
+  @Override
+  public void onPause() {
+    closeCamera();
+    stopBackgroundThread();
+    super.onPause();
+  }
+
+  public void setCamera(String cameraId) {
+    this.cameraId = cameraId;
+  }
+
+  /**
+   * Sets up member variables related to camera.
+   */
+  private void setUpCameraOutputs() {
+    final Activity activity = getActivity();
+    final CameraManager manager = (CameraManager) activity.getSystemService(Context.CAMERA_SERVICE);
+    try {
+      final CameraCharacteristics characteristics = manager.getCameraCharacteristics(cameraId);
+
+      final StreamConfigurationMap map =
+          characteristics.get(CameraCharacteristics.SCALER_STREAM_CONFIGURATION_MAP);
+
+      sensorOrientation = characteristics.get(CameraCharacteristics.SENSOR_ORIENTATION);
+
+      // Danger, W.R.! Attempting to use too large a preview size could  exceed the camera
+      // bus' bandwidth limitation, resulting in gorgeous previews but the storage of
+      // garbage capture data.
+      previewSize =
+          chooseOptimalSize(map.getOutputSizes(SurfaceTexture.class),
+              inputSize.getWidth(),
+              inputSize.getHeight());
+
+      // We fit the aspect ratio of TextureView to the size of preview we picked.
+      final int orientation = getResources().getConfiguration().orientation;
+      if (orientation == Configuration.ORIENTATION_LANDSCAPE) {
+        textureView.setAspectRatio(previewSize.getWidth(), previewSize.getHeight());
+      } else {
+        textureView.setAspectRatio(previewSize.getHeight(), previewSize.getWidth());
+      }
+    } catch (final CameraAccessException e) {
+      LOGGER.e(e, "Exception!");
+    } catch (final NullPointerException e) {
+      // Currently an NPE is thrown when the Camera2API is used but not supported on the
+      // device this code runs.
+      // TODO(andrewharp): abstract ErrorDialog/RuntimeException handling out into new method and
+      // reuse throughout app.
+      ErrorDialog.newInstance(getString(R.string.camera_error))
+          .show(getChildFragmentManager(), FRAGMENT_DIALOG);
+      throw new RuntimeException(getString(R.string.camera_error));
+    }
+
+    cameraConnectionCallback.onPreviewSizeChosen(previewSize, sensorOrientation);
+  }
+
+  /**
+   * Opens the camera specified by {@link CameraConnectionFragment#cameraId}.
+   */
+  private void openCamera(final int width, final int height) {
+    setUpCameraOutputs();
+    configureTransform(width, height);
+    final Activity activity = getActivity();
+    final CameraManager manager = (CameraManager) activity.getSystemService(Context.CAMERA_SERVICE);
+    try {
+      if (!cameraOpenCloseLock.tryAcquire(2500, TimeUnit.MILLISECONDS)) {
+        throw new RuntimeException("Time out waiting to lock camera opening.");
+      }
+      manager.openCamera(cameraId, stateCallback, backgroundHandler);
+    } catch (final CameraAccessException e) {
+      LOGGER.e(e, "Exception!");
+    } catch (final InterruptedException e) {
+      throw new RuntimeException("Interrupted while trying to lock camera opening.", e);
+    }
+  }
+
+  /**
+   * Closes the current {@link CameraDevice}.
+   */
+  private void closeCamera() {
+    try {
+      cameraOpenCloseLock.acquire();
+      if (null != captureSession) {
+        captureSession.close();
+        captureSession = null;
+      }
+      if (null != cameraDevice) {
+        cameraDevice.close();
+        cameraDevice = null;
+      }
+      if (null != previewReader) {
+        previewReader.close();
+        previewReader = null;
+      }
+    } catch (final InterruptedException e) {
+      throw new RuntimeException("Interrupted while trying to lock camera closing.", e);
+    } finally {
+      cameraOpenCloseLock.release();
+    }
+  }
+
+  /**
+   * Starts a background thread and its {@link Handler}.
+   */
+  private void startBackgroundThread() {
+    backgroundThread = new HandlerThread("ImageListener");
+    backgroundThread.start();
+    backgroundHandler = new Handler(backgroundThread.getLooper());
+  }
+
+  /**
+   * Stops the background thread and its {@link Handler}.
+   */
+  private void stopBackgroundThread() {
+    backgroundThread.quitSafely();
+    try {
+      backgroundThread.join();
+      backgroundThread = null;
+      backgroundHandler = null;
+    } catch (final InterruptedException e) {
+      LOGGER.e(e, "Exception!");
+    }
+  }
+
+  private final CameraCaptureSession.CaptureCallback captureCallback =
+      new CameraCaptureSession.CaptureCallback() {
+        @Override
+        public void onCaptureProgressed(
+            final CameraCaptureSession session,
+            final CaptureRequest request,
+            final CaptureResult partialResult) {}
+
+        @Override
+        public void onCaptureCompleted(
+            final CameraCaptureSession session,
+            final CaptureRequest request,
+            final TotalCaptureResult result) {}
+      };
+
+  /**
+   * Creates a new {@link CameraCaptureSession} for camera preview.
+   */
+  private void createCameraPreviewSession() {
+    try {
+      final SurfaceTexture texture = textureView.getSurfaceTexture();
+      assert texture != null;
+
+      // We configure the size of default buffer to be the size of camera preview we want.
+      texture.setDefaultBufferSize(previewSize.getWidth(), previewSize.getHeight());
+
+      // This is the output Surface we need to start preview.
+      final Surface surface = new Surface(texture);
+
+      // We set up a CaptureRequest.Builder with the output Surface.
+      previewRequestBuilder = cameraDevice.createCaptureRequest(CameraDevice.TEMPLATE_PREVIEW);
+      previewRequestBuilder.addTarget(surface);
+
+      LOGGER.i("Opening camera preview: " + previewSize.getWidth() + "x" + previewSize.getHeight());
+
+      // Create the reader for the preview frames.
+      previewReader =
+          ImageReader.newInstance(
+              previewSize.getWidth(), previewSize.getHeight(), ImageFormat.YUV_420_888, 2);
+
+      previewReader.setOnImageAvailableListener(imageListener, backgroundHandler);
+      previewRequestBuilder.addTarget(previewReader.getSurface());
+
+      // Here, we create a CameraCaptureSession for camera preview.
+      cameraDevice.createCaptureSession(
+          Arrays.asList(surface, previewReader.getSurface()),
+          new CameraCaptureSession.StateCallback() {
+
+            @Override
+            public void onConfigured(final CameraCaptureSession cameraCaptureSession) {
+              // The camera is already closed
+              if (null == cameraDevice) {
+                return;
+              }
+
+              // When the session is ready, we start displaying the preview.
+              captureSession = cameraCaptureSession;
+              try {
+                // Auto focus should be continuous for camera preview.
+                previewRequestBuilder.set(
+                    CaptureRequest.CONTROL_AF_MODE,
+                    CaptureRequest.CONTROL_AF_MODE_CONTINUOUS_PICTURE);
+                // Flash is automatically enabled when necessary.
+                previewRequestBuilder.set(
+                    CaptureRequest.CONTROL_AE_MODE, CaptureRequest.CONTROL_AE_MODE_ON_AUTO_FLASH);
+
+                // Finally, we start displaying the camera preview.
+                previewRequest = previewRequestBuilder.build();
+                captureSession.setRepeatingRequest(
+                    previewRequest, captureCallback, backgroundHandler);
+              } catch (final CameraAccessException e) {
+                LOGGER.e(e, "Exception!");
+              }
+            }
+
+            @Override
+            public void onConfigureFailed(final CameraCaptureSession cameraCaptureSession) {
+              showToast("Failed");
+            }
+          },
+          null);
+    } catch (final CameraAccessException e) {
+      LOGGER.e(e, "Exception!");
+    }
+  }
+
+  /**
+   * Configures the necessary {@link android.graphics.Matrix} transformation to `mTextureView`.
+   * This method should be called after the camera preview size is determined in
+   * setUpCameraOutputs and also the size of `mTextureView` is fixed.
+   *
+   * @param viewWidth  The width of `mTextureView`
+   * @param viewHeight The height of `mTextureView`
+   */
+  private void configureTransform(final int viewWidth, final int viewHeight) {
+    final Activity activity = getActivity();
+    if (null == textureView || null == previewSize || null == activity) {
+      return;
+    }
+    final int rotation = activity.getWindowManager().getDefaultDisplay().getRotation();
+    final Matrix matrix = new Matrix();
+    final RectF viewRect = new RectF(0, 0, viewWidth, viewHeight);
+    final RectF bufferRect = new RectF(0, 0, previewSize.getHeight(), previewSize.getWidth());
+    final float centerX = viewRect.centerX();
+    final float centerY = viewRect.centerY();
+    if (Surface.ROTATION_90 == rotation || Surface.ROTATION_270 == rotation) {
+      bufferRect.offset(centerX - bufferRect.centerX(), centerY - bufferRect.centerY());
+      matrix.setRectToRect(viewRect, bufferRect, Matrix.ScaleToFit.FILL);
+      final float scale =
+          Math.max(
+              (float) viewHeight / previewSize.getHeight(),
+              (float) viewWidth / previewSize.getWidth());
+      matrix.postScale(scale, scale, centerX, centerY);
+      matrix.postRotate(90 * (rotation - 2), centerX, centerY);
+    } else if (Surface.ROTATION_180 == rotation) {
+      matrix.postRotate(180, centerX, centerY);
+    }
+    textureView.setTransform(matrix);
+  }
+
+  /**
+   * Compares two {@code Size}s based on their areas.
+   */
+  static class CompareSizesByArea implements Comparator<Size> {
+    @Override
+    public int compare(final Size lhs, final Size rhs) {
+      // We cast here to ensure the multiplications won't overflow
+      return Long.signum(
+          (long) lhs.getWidth() * lhs.getHeight() - (long) rhs.getWidth() * rhs.getHeight());
+    }
+  }
+
+  /**
+   * Shows an error message dialog.
+   */
+  public static class ErrorDialog extends DialogFragment {
+    private static final String ARG_MESSAGE = "message";
+
+    public static ErrorDialog newInstance(final String message) {
+      final ErrorDialog dialog = new ErrorDialog();
+      final Bundle args = new Bundle();
+      args.putString(ARG_MESSAGE, message);
+      dialog.setArguments(args);
+      return dialog;
+    }
+
+    @Override
+    public Dialog onCreateDialog(final Bundle savedInstanceState) {
+      final Activity activity = getActivity();
+      return new AlertDialog.Builder(activity)
+          .setMessage(getArguments().getString(ARG_MESSAGE))
+          .setPositiveButton(
+              android.R.string.ok,
+              new DialogInterface.OnClickListener() {
+                @Override
+                public void onClick(final DialogInterface dialogInterface, final int i) {
+                  activity.finish();
+                }
+              })
+          .create();
+    }
+  }
+}
diff --git a/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/Classifier.java b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/Classifier.java
new file mode 100644
index 0000000000000000000000000000000000000000..07995febaf5caab65dd4dfcc262ccf3750cfa303
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/Classifier.java
@@ -0,0 +1,107 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+package org.tensorflow.demo;
+
+import android.graphics.Bitmap;
+import android.graphics.RectF;
+import java.util.List;
+
+/**
+ * Generic interface for interacting with different recognition engines.
+ */
+public interface Classifier {
+  /**
+   * An immutable result returned by a Classifier describing what was recognized.
+   */
+  public class Recognition {
+    /**
+     * A unique identifier for what has been recognized. Specific to the class, not the instance of
+     * the object.
+     */
+    private final String id;
+
+    /**
+     * Display name for the recognition.
+     */
+    private final String title;
+
+    /**
+     * A sortable score for how good the recognition is relative to others. Higher should be better.
+     */
+    private final Float confidence;
+
+    /** Optional location within the source image for the location of the recognized object. */
+    private RectF location;
+
+    public Recognition(
+        final String id, final String title, final Float confidence, final RectF location) {
+      this.id = id;
+      this.title = title;
+      this.confidence = confidence;
+      this.location = location;
+    }
+
+    public String getId() {
+      return id;
+    }
+
+    public String getTitle() {
+      return title;
+    }
+
+    public Float getConfidence() {
+      return confidence;
+    }
+
+    public RectF getLocation() {
+      return new RectF(location);
+    }
+
+    public void setLocation(RectF location) {
+      this.location = location;
+    }
+
+    @Override
+    public String toString() {
+      String resultString = "";
+      if (id != null) {
+        resultString += "[" + id + "] ";
+      }
+
+      if (title != null) {
+        resultString += title + " ";
+      }
+
+      if (confidence != null) {
+        resultString += String.format("(%.1f%%) ", confidence * 100.0f);
+      }
+
+      if (location != null) {
+        resultString += location + " ";
+      }
+
+      return resultString.trim();
+    }
+  }
+
+  List<Recognition> recognizeImage(Bitmap bitmap);
+
+  void enableStatLogging(final boolean debug);
+
+  String getStatString();
+
+  void close();
+}
diff --git a/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/ClassifierActivity.java b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/ClassifierActivity.java
new file mode 100644
index 0000000000000000000000000000000000000000..dcbbefbeab6627b37579902cd25841c0ae257dda
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/ClassifierActivity.java
@@ -0,0 +1,197 @@
+/*
+ * Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *       http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.tensorflow.demo;
+
+import android.graphics.Bitmap;
+import android.graphics.Bitmap.Config;
+import android.graphics.Canvas;
+import android.graphics.Matrix;
+import android.graphics.Paint;
+import android.graphics.Typeface;
+import android.media.ImageReader.OnImageAvailableListener;
+import android.os.SystemClock;
+import android.util.Size;
+import android.util.TypedValue;
+import java.util.List;
+import java.util.Vector;
+import org.tensorflow.demo.OverlayView.DrawCallback;
+import org.tensorflow.demo.env.BorderedText;
+import org.tensorflow.demo.env.ImageUtils;
+import org.tensorflow.demo.env.Logger;
+import org.tensorflow.lite.demo.R; // Explicit import needed for internal Google builds.
+
+public class ClassifierActivity extends CameraActivity implements OnImageAvailableListener {
+  private static final Logger LOGGER = new Logger();
+
+  protected static final boolean SAVE_PREVIEW_BITMAP = false;
+
+  private ResultsView resultsView;
+
+  private Bitmap rgbFrameBitmap = null;
+  private Bitmap croppedBitmap = null;
+  private Bitmap cropCopyBitmap = null;
+
+  private long lastProcessingTimeMs;
+
+  // These are the settings for the original v1 Inception model. If you want to
+  // use a model that's been produced from the TensorFlow for Poets codelab,
+  // you'll need to set IMAGE_SIZE = 299, IMAGE_MEAN = 128, IMAGE_STD = 128,
+  // INPUT_NAME = "Mul", and OUTPUT_NAME = "final_result".
+  // You'll also need to update the MODEL_FILE and LABEL_FILE paths to point to
+  // the ones you produced.
+  //
+  // To use v3 Inception model, strip the DecodeJpeg Op from your retrained
+  // model first:
+  //
+  // python strip_unused.py \
+  // --input_graph=<retrained-pb-file> \
+  // --output_graph=<your-stripped-pb-file> \
+  // --input_node_names="Mul" \
+  // --output_node_names="final_result" \
+  // --input_binary=true
+  private static final int INPUT_SIZE = 224;
+
+  private static final String MODEL_FILE = "mobilenet_quant_v1_224.tflite";
+  private static final String LABEL_FILE = "labels_mobilenet_quant_v1_224.txt";
+
+  private static final boolean MAINTAIN_ASPECT = true;
+
+  private static final Size DESIRED_PREVIEW_SIZE = new Size(640, 480);
+
+
+  private Integer sensorOrientation;
+  private Classifier classifier;
+  private Matrix frameToCropTransform;
+  private Matrix cropToFrameTransform;
+
+  private BorderedText borderedText;
+
+  @Override
+  protected int getLayoutId() {
+    return R.layout.camera_connection_fragment;
+  }
+
+  @Override
+  protected Size getDesiredPreviewFrameSize() {
+    return DESIRED_PREVIEW_SIZE;
+  }
+
+  private static final float TEXT_SIZE_DIP = 10;
+
+  @Override
+  public void onPreviewSizeChosen(final Size size, final int rotation) {
+    final float textSizePx = TypedValue.applyDimension(
+        TypedValue.COMPLEX_UNIT_DIP, TEXT_SIZE_DIP, getResources().getDisplayMetrics());
+    borderedText = new BorderedText(textSizePx);
+    borderedText.setTypeface(Typeface.MONOSPACE);
+
+    classifier = TFLiteImageClassifier.create(getAssets(), MODEL_FILE, LABEL_FILE, INPUT_SIZE);
+
+    previewWidth = size.getWidth();
+    previewHeight = size.getHeight();
+
+    sensorOrientation = rotation - getScreenOrientation();
+    LOGGER.i("Camera orientation relative to screen canvas: %d", sensorOrientation);
+
+    LOGGER.i("Initializing at size %dx%d", previewWidth, previewHeight);
+    rgbFrameBitmap = Bitmap.createBitmap(previewWidth, previewHeight, Config.ARGB_8888);
+    croppedBitmap = Bitmap.createBitmap(INPUT_SIZE, INPUT_SIZE, Config.ARGB_8888);
+
+    frameToCropTransform = ImageUtils.getTransformationMatrix(
+        previewWidth, previewHeight,
+        INPUT_SIZE, INPUT_SIZE,
+        sensorOrientation, MAINTAIN_ASPECT);
+
+    cropToFrameTransform = new Matrix();
+    frameToCropTransform.invert(cropToFrameTransform);
+
+    addCallback(
+        new DrawCallback() {
+          @Override
+          public void drawCallback(final Canvas canvas) {
+            renderDebug(canvas);
+          }
+        });
+  }
+
+  @Override
+  protected void processImage() {
+    rgbFrameBitmap.setPixels(getRgbBytes(), 0, previewWidth, 0, 0, previewWidth, previewHeight);
+    final Canvas canvas = new Canvas(croppedBitmap);
+    canvas.drawBitmap(rgbFrameBitmap, frameToCropTransform, null);
+
+    // For examining the actual TF input.
+    if (SAVE_PREVIEW_BITMAP) {
+      ImageUtils.saveBitmap(croppedBitmap);
+    }
+    runInBackground(
+        new Runnable() {
+          @Override
+          public void run() {
+            final long startTime = SystemClock.uptimeMillis();
+            final List<Classifier.Recognition> results = classifier.recognizeImage(croppedBitmap);
+            lastProcessingTimeMs = SystemClock.uptimeMillis() - startTime;
+            LOGGER.i("Detect: %s", results);
+            cropCopyBitmap = Bitmap.createBitmap(croppedBitmap);
+            if (resultsView == null) {
+              resultsView = (ResultsView) findViewById(R.id.results);
+            }
+            resultsView.setResults(results);
+            requestRender();
+            readyForNextImage();
+          }
+        });
+  }
+
+  @Override
+  public void onSetDebug(boolean debug) {
+    classifier.enableStatLogging(debug);
+  }
+
+  private void renderDebug(final Canvas canvas) {
+    if (!isDebug()) {
+      return;
+    }
+    final Bitmap copy = cropCopyBitmap;
+    if (copy != null) {
+      final Matrix matrix = new Matrix();
+      final float scaleFactor = 2;
+      matrix.postScale(scaleFactor, scaleFactor);
+      matrix.postTranslate(
+          canvas.getWidth() - copy.getWidth() * scaleFactor,
+          canvas.getHeight() - copy.getHeight() * scaleFactor);
+      canvas.drawBitmap(copy, matrix, new Paint());
+
+      final Vector<String> lines = new Vector<String>();
+      if (classifier != null) {
+        String statString = classifier.getStatString();
+        String[] statLines = statString.split("\n");
+        for (String line : statLines) {
+          lines.add(line);
+        }
+      }
+
+      lines.add("Frame: " + previewWidth + "x" + previewHeight);
+      lines.add("Crop: " + copy.getWidth() + "x" + copy.getHeight());
+      lines.add("View: " + canvas.getWidth() + "x" + canvas.getHeight());
+      lines.add("Rotation: " + sensorOrientation);
+      lines.add("Inference time: " + lastProcessingTimeMs + "ms");
+
+      borderedText.drawLines(canvas, 10, canvas.getHeight() - 10, lines);
+    }
+  }
+}
diff --git a/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/DetectorActivity.java b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/DetectorActivity.java
new file mode 100644
index 0000000000000000000000000000000000000000..de997e454a1e33254cb7c2c932ca79d0072539fa
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/DetectorActivity.java
@@ -0,0 +1,296 @@
+/*
+ * Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *       http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.tensorflow.demo;
+
+import android.graphics.Bitmap;
+import android.graphics.Bitmap.Config;
+import android.graphics.Canvas;
+import android.graphics.Color;
+import android.graphics.Matrix;
+import android.graphics.Paint;
+import android.graphics.Paint.Style;
+import android.graphics.RectF;
+import android.graphics.Typeface;
+import android.media.ImageReader.OnImageAvailableListener;
+import android.os.SystemClock;
+import android.util.Size;
+import android.util.TypedValue;
+import android.widget.Toast;
+import java.io.IOException;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Vector;
+import org.tensorflow.demo.OverlayView.DrawCallback;
+import org.tensorflow.demo.env.BorderedText;
+import org.tensorflow.demo.env.ImageUtils;
+import org.tensorflow.demo.env.Logger;
+import org.tensorflow.demo.tracking.MultiBoxTracker;
+import org.tensorflow.lite.demo.R; // Explicit import needed for internal Google builds.
+
+/**
+ * An activity that uses a TensorFlowMultiBoxDetector and ObjectTracker to detect and then track
+ * objects.
+ */
+public class DetectorActivity extends CameraActivity implements OnImageAvailableListener {
+  private static final Logger LOGGER = new Logger();
+
+  // Configuration values for the prepackaged SSD model.
+  private static final int TF_OD_API_INPUT_SIZE = 300;
+  private static final String TF_OD_API_MODEL_FILE = "mobilenet_ssd.tflite";
+  private static final String TF_OD_API_LABELS_FILE = "file:///android_asset/coco_labels_list.txt";
+
+  // Which detection model to use: by default uses Tensorflow Object Detection API frozen
+  // checkpoints.
+  private enum DetectorMode {
+    TF_OD_API;
+  }
+
+  private static final DetectorMode MODE = DetectorMode.TF_OD_API;
+
+  // Minimum detection confidence to track a detection.
+  private static final float MINIMUM_CONFIDENCE_TF_OD_API = 0.6f;
+
+  private static final boolean MAINTAIN_ASPECT = false;
+
+  private static final Size DESIRED_PREVIEW_SIZE = new Size(640, 480);
+
+  private static final boolean SAVE_PREVIEW_BITMAP = false;
+  private static final float TEXT_SIZE_DIP = 10;
+
+  private Integer sensorOrientation;
+
+  private Classifier detector;
+
+  private long lastProcessingTimeMs;
+  private Bitmap rgbFrameBitmap = null;
+  private Bitmap croppedBitmap = null;
+  private Bitmap cropCopyBitmap = null;
+
+  private boolean computingDetection = false;
+
+  private long timestamp = 0;
+
+  private Matrix frameToCropTransform;
+  private Matrix cropToFrameTransform;
+
+  private MultiBoxTracker tracker;
+
+  private byte[] luminanceCopy;
+
+  private BorderedText borderedText;
+  @Override
+  public void onPreviewSizeChosen(final Size size, final int rotation) {
+    final float textSizePx =
+        TypedValue.applyDimension(
+            TypedValue.COMPLEX_UNIT_DIP, TEXT_SIZE_DIP, getResources().getDisplayMetrics());
+    borderedText = new BorderedText(textSizePx);
+    borderedText.setTypeface(Typeface.MONOSPACE);
+
+    tracker = new MultiBoxTracker(this);
+
+    int cropSize = TF_OD_API_INPUT_SIZE;
+
+    try {
+      detector =
+          TFLiteObjectDetectionAPIModel.create(
+              getAssets(), TF_OD_API_MODEL_FILE, TF_OD_API_LABELS_FILE, TF_OD_API_INPUT_SIZE);
+      cropSize = TF_OD_API_INPUT_SIZE;
+    } catch (final IOException e) {
+      LOGGER.e("Exception initializing classifier!", e);
+      Toast toast =
+          Toast.makeText(
+              getApplicationContext(), "Classifier could not be initialized", Toast.LENGTH_SHORT);
+      toast.show();
+      finish();
+    }
+
+
+    previewWidth = size.getWidth();
+    previewHeight = size.getHeight();
+
+    sensorOrientation = rotation - getScreenOrientation();
+    LOGGER.i("Camera orientation relative to screen canvas: %d", sensorOrientation);
+
+    LOGGER.i("Initializing at size %dx%d", previewWidth, previewHeight);
+    rgbFrameBitmap = Bitmap.createBitmap(previewWidth, previewHeight, Config.ARGB_8888);
+    croppedBitmap = Bitmap.createBitmap(cropSize, cropSize, Config.ARGB_8888);
+
+    frameToCropTransform =
+        ImageUtils.getTransformationMatrix(
+            previewWidth, previewHeight,
+            cropSize, cropSize,
+            sensorOrientation, MAINTAIN_ASPECT);
+
+    cropToFrameTransform = new Matrix();
+    frameToCropTransform.invert(cropToFrameTransform);
+
+    trackingOverlay = (OverlayView) findViewById(R.id.tracking_overlay);
+    trackingOverlay.addCallback(
+        new DrawCallback() {
+          @Override
+          public void drawCallback(final Canvas canvas) {
+            tracker.draw(canvas);
+            if (isDebug()) {
+              tracker.drawDebug(canvas);
+            }
+          }
+        });
+
+    addCallback(
+        new DrawCallback() {
+          @Override
+          public void drawCallback(final Canvas canvas) {
+            if (!isDebug()) {
+              return;
+            }
+            final Bitmap copy = cropCopyBitmap;
+            if (copy == null) {
+              return;
+            }
+
+            final int backgroundColor = Color.argb(100, 0, 0, 0);
+            canvas.drawColor(backgroundColor);
+
+            final Matrix matrix = new Matrix();
+            final float scaleFactor = 2;
+            matrix.postScale(scaleFactor, scaleFactor);
+            matrix.postTranslate(
+                canvas.getWidth() - copy.getWidth() * scaleFactor,
+                canvas.getHeight() - copy.getHeight() * scaleFactor);
+            canvas.drawBitmap(copy, matrix, new Paint());
+
+            final Vector<String> lines = new Vector<String>();
+            if (detector != null) {
+              final String statString = detector.getStatString();
+              final String[] statLines = statString.split("\n");
+              for (final String line : statLines) {
+                lines.add(line);
+              }
+            }
+            lines.add("");
+
+            lines.add("Frame: " + previewWidth + "x" + previewHeight);
+            lines.add("Crop: " + copy.getWidth() + "x" + copy.getHeight());
+            lines.add("View: " + canvas.getWidth() + "x" + canvas.getHeight());
+            lines.add("Rotation: " + sensorOrientation);
+            lines.add("Inference time: " + lastProcessingTimeMs + "ms");
+
+            borderedText.drawLines(canvas, 10, canvas.getHeight() - 10, lines);
+          }
+        });
+  }
+
+  OverlayView trackingOverlay;
+
+  @Override
+  protected void processImage() {
+    ++timestamp;
+    final long currTimestamp = timestamp;
+    byte[] originalLuminance = getLuminance();
+    tracker.onFrame(
+        previewWidth,
+        previewHeight,
+        getLuminanceStride(),
+        sensorOrientation,
+        originalLuminance,
+        timestamp);
+    trackingOverlay.postInvalidate();
+
+    // No mutex needed as this method is not reentrant.
+    if (computingDetection) {
+      readyForNextImage();
+      return;
+    }
+    computingDetection = true;
+    LOGGER.i("Preparing image " + currTimestamp + " for detection in bg thread.");
+
+    rgbFrameBitmap.setPixels(getRgbBytes(), 0, previewWidth, 0, 0, previewWidth, previewHeight);
+
+    if (luminanceCopy == null) {
+      luminanceCopy = new byte[originalLuminance.length];
+    }
+    System.arraycopy(originalLuminance, 0, luminanceCopy, 0, originalLuminance.length);
+    readyForNextImage();
+
+    final Canvas canvas = new Canvas(croppedBitmap);
+    canvas.drawBitmap(rgbFrameBitmap, frameToCropTransform, null);
+    // For examining the actual TF input.
+    if (SAVE_PREVIEW_BITMAP) {
+      ImageUtils.saveBitmap(croppedBitmap);
+    }
+
+    runInBackground(
+        new Runnable() {
+          @Override
+          public void run() {
+            LOGGER.i("Running detection on image " + currTimestamp);
+            final long startTime = SystemClock.uptimeMillis();
+            final List<Classifier.Recognition> results = detector.recognizeImage(croppedBitmap);
+            lastProcessingTimeMs = SystemClock.uptimeMillis() - startTime;
+
+            cropCopyBitmap = Bitmap.createBitmap(croppedBitmap);
+            final Canvas canvas = new Canvas(cropCopyBitmap);
+            final Paint paint = new Paint();
+            paint.setColor(Color.RED);
+            paint.setStyle(Style.STROKE);
+            paint.setStrokeWidth(2.0f);
+
+            float minimumConfidence = MINIMUM_CONFIDENCE_TF_OD_API;
+            switch (MODE) {
+              case TF_OD_API:
+                minimumConfidence = MINIMUM_CONFIDENCE_TF_OD_API;
+                break;
+            }
+
+            final List<Classifier.Recognition> mappedRecognitions =
+                new LinkedList<Classifier.Recognition>();
+
+            for (final Classifier.Recognition result : results) {
+              final RectF location = result.getLocation();
+              if (location != null && result.getConfidence() >= minimumConfidence) {
+                canvas.drawRect(location, paint);
+
+                cropToFrameTransform.mapRect(location);
+                result.setLocation(location);
+                mappedRecognitions.add(result);
+              }
+            }
+
+            tracker.trackResults(mappedRecognitions, luminanceCopy, currTimestamp);
+            trackingOverlay.postInvalidate();
+
+            requestRender();
+            computingDetection = false;
+          }
+        });
+  }
+
+  @Override
+  protected int getLayoutId() {
+    return R.layout.camera_connection_fragment_tracking;
+  }
+
+  @Override
+  protected Size getDesiredPreviewFrameSize() {
+    return DESIRED_PREVIEW_SIZE;
+  }
+
+  @Override
+  public void onSetDebug(final boolean debug) {
+    detector.enableStatLogging(debug);
+  }
+}
diff --git a/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/LegacyCameraConnectionFragment.java b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/LegacyCameraConnectionFragment.java
new file mode 100644
index 0000000000000000000000000000000000000000..fd830297533bb8366e008a44a32255788d5e1ea6
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/LegacyCameraConnectionFragment.java
@@ -0,0 +1,216 @@
+package org.tensorflow.demo;
+
+/*
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *       http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import android.app.Fragment;
+import android.graphics.SurfaceTexture;
+import android.hardware.Camera;
+import android.hardware.Camera.CameraInfo;
+import android.os.Bundle;
+import android.os.Handler;
+import android.os.HandlerThread;
+import android.util.Size;
+import android.util.SparseIntArray;
+import android.view.LayoutInflater;
+import android.view.Surface;
+import android.view.TextureView;
+import android.view.View;
+import android.view.ViewGroup;
+import java.io.IOException;
+import java.util.List;
+import org.tensorflow.demo.env.ImageUtils;
+import org.tensorflow.demo.env.Logger;
+import org.tensorflow.lite.demo.R; // Explicit import needed for internal Google builds.
+
+public class LegacyCameraConnectionFragment extends Fragment {
+  private Camera camera;
+  private static final Logger LOGGER = new Logger();
+  private Camera.PreviewCallback imageListener;
+  private Size desiredSize;
+
+  /**
+   * The layout identifier to inflate for this Fragment.
+   */
+  private int layout;
+
+  public LegacyCameraConnectionFragment(
+      final Camera.PreviewCallback imageListener, final int layout, final Size desiredSize) {
+    this.imageListener = imageListener;
+    this.layout = layout;
+    this.desiredSize = desiredSize;
+  }
+
+  /**
+   * Conversion from screen rotation to JPEG orientation.
+   */
+  private static final SparseIntArray ORIENTATIONS = new SparseIntArray();
+
+  static {
+    ORIENTATIONS.append(Surface.ROTATION_0, 90);
+    ORIENTATIONS.append(Surface.ROTATION_90, 0);
+    ORIENTATIONS.append(Surface.ROTATION_180, 270);
+    ORIENTATIONS.append(Surface.ROTATION_270, 180);
+  }
+
+  /**
+   * {@link android.view.TextureView.SurfaceTextureListener} handles several lifecycle events on a
+   * {@link TextureView}.
+   */
+  private final TextureView.SurfaceTextureListener surfaceTextureListener =
+      new TextureView.SurfaceTextureListener() {
+        @Override
+        public void onSurfaceTextureAvailable(
+            final SurfaceTexture texture, final int width, final int height) {
+
+          int index = getCameraId();
+          camera = Camera.open(index);
+
+          try {
+            Camera.Parameters parameters = camera.getParameters();
+            List<String> focusModes = parameters.getSupportedFocusModes();
+            if (focusModes != null
+                && focusModes.contains(Camera.Parameters.FOCUS_MODE_CONTINUOUS_PICTURE)) {
+              parameters.setFocusMode(Camera.Parameters.FOCUS_MODE_CONTINUOUS_PICTURE);
+            }
+            List<Camera.Size> cameraSizes = parameters.getSupportedPreviewSizes();
+            Size[] sizes = new Size[cameraSizes.size()];
+            int i = 0;
+            for (Camera.Size size : cameraSizes) {
+              sizes[i++] = new Size(size.width, size.height);
+            }
+            Size previewSize =
+                CameraConnectionFragment.chooseOptimalSize(
+                    sizes, desiredSize.getWidth(), desiredSize.getHeight());
+            parameters.setPreviewSize(previewSize.getWidth(), previewSize.getHeight());
+            camera.setDisplayOrientation(90);
+            camera.setParameters(parameters);
+            camera.setPreviewTexture(texture);
+          } catch (IOException exception) {
+            camera.release();
+          }
+
+          camera.setPreviewCallbackWithBuffer(imageListener);
+          Camera.Size s = camera.getParameters().getPreviewSize();
+          camera.addCallbackBuffer(new byte[ImageUtils.getYUVByteSize(s.height, s.width)]);
+
+          textureView.setAspectRatio(s.height, s.width);
+
+          camera.startPreview();
+        }
+
+        @Override
+        public void onSurfaceTextureSizeChanged(
+            final SurfaceTexture texture, final int width, final int height) {}
+
+        @Override
+        public boolean onSurfaceTextureDestroyed(final SurfaceTexture texture) {
+          return true;
+        }
+
+        @Override
+        public void onSurfaceTextureUpdated(final SurfaceTexture texture) {}
+      };
+
+  /**
+   * An {@link AutoFitTextureView} for camera preview.
+   */
+  private AutoFitTextureView textureView;
+
+  /**
+   * An additional thread for running tasks that shouldn't block the UI.
+   */
+  private HandlerThread backgroundThread;
+
+  @Override
+  public View onCreateView(
+      final LayoutInflater inflater, final ViewGroup container, final Bundle savedInstanceState) {
+    return inflater.inflate(layout, container, false);
+  }
+
+  @Override
+  public void onViewCreated(final View view, final Bundle savedInstanceState) {
+    textureView = (AutoFitTextureView) view.findViewById(R.id.texture);
+  }
+
+  @Override
+  public void onActivityCreated(final Bundle savedInstanceState) {
+    super.onActivityCreated(savedInstanceState);
+  }
+
+  @Override
+  public void onResume() {
+    super.onResume();
+    startBackgroundThread();
+    // When the screen is turned off and turned back on, the SurfaceTexture is already
+    // available, and "onSurfaceTextureAvailable" will not be called. In that case, we can open
+    // a camera and start preview from here (otherwise, we wait until the surface is ready in
+    // the SurfaceTextureListener).
+
+    if (textureView.isAvailable()) {
+      camera.startPreview();
+    } else {
+      textureView.setSurfaceTextureListener(surfaceTextureListener);
+    }
+  }
+
+  @Override
+  public void onPause() {
+    stopCamera();
+    stopBackgroundThread();
+    super.onPause();
+  }
+
+  /**
+   * Starts a background thread and its {@link Handler}.
+   */
+  private void startBackgroundThread() {
+    backgroundThread = new HandlerThread("CameraBackground");
+    backgroundThread.start();
+  }
+
+  /**
+   * Stops the background thread and its {@link Handler}.
+   */
+  private void stopBackgroundThread() {
+    backgroundThread.quitSafely();
+    try {
+      backgroundThread.join();
+      backgroundThread = null;
+    } catch (final InterruptedException e) {
+      LOGGER.e(e, "Exception!");
+    }
+  }
+
+  protected void stopCamera() {
+    if (camera != null) {
+      camera.stopPreview();
+      camera.setPreviewCallback(null);
+      camera.release();
+      camera = null;
+    }
+  }
+
+  private int getCameraId() {
+    CameraInfo ci = new CameraInfo();
+    for (int i = 0; i < Camera.getNumberOfCameras(); i++) {
+      Camera.getCameraInfo(i, ci);
+      if (ci.facing == CameraInfo.CAMERA_FACING_BACK)
+        return i;
+    }
+    return -1; // No camera found
+  }
+}
diff --git a/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/OverlayView.java b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/OverlayView.java
new file mode 100644
index 0000000000000000000000000000000000000000..0f8d109fb46d769d0ada9c9daa6292a80470be8a
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/OverlayView.java
@@ -0,0 +1,52 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+package org.tensorflow.demo;
+
+import android.content.Context;
+import android.graphics.Canvas;
+import android.util.AttributeSet;
+import android.view.View;
+import java.util.LinkedList;
+import java.util.List;
+
+/**
+ * A simple View providing a render callback to other classes.
+ */
+public class OverlayView extends View {
+  private final List<DrawCallback> callbacks = new LinkedList<DrawCallback>();
+
+  public OverlayView(final Context context, final AttributeSet attrs) {
+    super(context, attrs);
+  }
+
+  /**
+   * Interface defining the callback for client classes.
+   */
+  public interface DrawCallback {
+    public void drawCallback(final Canvas canvas);
+  }
+
+  public void addCallback(final DrawCallback callback) {
+    callbacks.add(callback);
+  }
+
+  @Override
+  public synchronized void draw(final Canvas canvas) {
+    for (final DrawCallback callback : callbacks) {
+      callback.drawCallback(canvas);
+    }
+  }
+}
diff --git a/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/RecognitionScoreView.java b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/RecognitionScoreView.java
new file mode 100644
index 0000000000000000000000000000000000000000..31a4b07c8387bf0b1da9e967f37628d0ce642dc4
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/RecognitionScoreView.java
@@ -0,0 +1,67 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+package org.tensorflow.demo;
+
+import android.content.Context;
+import android.graphics.Canvas;
+import android.graphics.Paint;
+import android.util.AttributeSet;
+import android.util.TypedValue;
+import android.view.View;
+import java.util.List;
+import org.tensorflow.demo.Classifier.Recognition;
+
+public class RecognitionScoreView extends View implements ResultsView {
+  private static final float TEXT_SIZE_DIP = 24;
+  private List<Recognition> results;
+  private final float textSizePx;
+  private final Paint fgPaint;
+  private final Paint bgPaint;
+
+  public RecognitionScoreView(final Context context, final AttributeSet set) {
+    super(context, set);
+
+    textSizePx =
+        TypedValue.applyDimension(
+            TypedValue.COMPLEX_UNIT_DIP, TEXT_SIZE_DIP, getResources().getDisplayMetrics());
+    fgPaint = new Paint();
+    fgPaint.setTextSize(textSizePx);
+
+    bgPaint = new Paint();
+    bgPaint.setColor(0xcc4285f4);
+  }
+
+  @Override
+  public void setResults(final List<Recognition> results) {
+    this.results = results;
+    postInvalidate();
+  }
+
+  @Override
+  public void onDraw(final Canvas canvas) {
+    final int x = 10;
+    int y = (int) (fgPaint.getTextSize() * 1.5f);
+
+    canvas.drawPaint(bgPaint);
+
+    if (results != null) {
+      for (final Recognition recog : results) {
+        canvas.drawText(recog.getTitle() + ": " + recog.getConfidence(), x, y, fgPaint);
+        y += (int) (fgPaint.getTextSize() * 1.5f);
+      }
+    }
+  }
+}
diff --git a/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/RecognizeCommands.java b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/RecognizeCommands.java
new file mode 100644
index 0000000000000000000000000000000000000000..9e91aea7efc8c1aea00913ba863eb57f0692343a
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/RecognizeCommands.java
@@ -0,0 +1,186 @@
+/*
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *       http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.tensorflow.demo;
+
+import android.util.Log;
+import android.util.Pair;
+import java.util.ArrayDeque;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Deque;
+import java.util.List;
+
+/** Reads in results from an instantaneous audio recognition model and smoothes them over time. */
+public class RecognizeCommands {
+  // Configuration settings.
+  private List<String> labels = new ArrayList<String>();
+  private long averageWindowDurationMs;
+  private float detectionThreshold;
+  private int suppressionMs;
+  private int minimumCount;
+  private long minimumTimeBetweenSamplesMs;
+
+  // Working variables.
+  private Deque<Pair<Long, float[]>> previousResults = new ArrayDeque<Pair<Long, float[]>>();
+  private String previousTopLabel;
+  private int labelsCount;
+  private long previousTopLabelTime;
+  private float previousTopLabelScore;
+
+  private static final String SILENCE_LABEL = "_silence_";
+  private static final long MINIMUM_TIME_FRACTION = 4;
+
+  public RecognizeCommands(
+      List<String> inLabels,
+      long inAverageWindowDurationMs,
+      float inDetectionThreshold,
+      int inSuppressionMS,
+      int inMinimumCount,
+      long inMinimumTimeBetweenSamplesMS) {
+    labels = inLabels;
+    averageWindowDurationMs = inAverageWindowDurationMs;
+    detectionThreshold = inDetectionThreshold;
+    suppressionMs = inSuppressionMS;
+    minimumCount = inMinimumCount;
+    labelsCount = inLabels.size();
+    previousTopLabel = SILENCE_LABEL;
+    previousTopLabelTime = Long.MIN_VALUE;
+    previousTopLabelScore = 0.0f;
+    minimumTimeBetweenSamplesMs = inMinimumTimeBetweenSamplesMS;
+  }
+
+  /** Holds information about what's been recognized. */
+  public static class RecognitionResult {
+    public final String foundCommand;
+    public final float score;
+    public final boolean isNewCommand;
+
+    public RecognitionResult(String inFoundCommand, float inScore, boolean inIsNewCommand) {
+      foundCommand = inFoundCommand;
+      score = inScore;
+      isNewCommand = inIsNewCommand;
+    }
+  }
+
+  private static class ScoreForSorting implements Comparable<ScoreForSorting> {
+    public final float score;
+    public final int index;
+
+    public ScoreForSorting(float inScore, int inIndex) {
+      score = inScore;
+      index = inIndex;
+    }
+
+    @Override
+    public int compareTo(ScoreForSorting other) {
+      if (this.score > other.score) {
+        return -1;
+      } else if (this.score < other.score) {
+        return 1;
+      } else {
+        return 0;
+      }
+    }
+  }
+
+  public RecognitionResult processLatestResults(float[] currentResults, long currentTimeMS) {
+    if (currentResults.length != labelsCount) {
+      throw new RuntimeException(
+          "The results for recognition should contain "
+              + labelsCount
+              + " elements, but there are "
+              + currentResults.length);
+    }
+
+    if ((!previousResults.isEmpty()) && (currentTimeMS < previousResults.getFirst().first)) {
+      throw new RuntimeException(
+          "You must feed results in increasing time order, but received a timestamp of "
+              + currentTimeMS
+              + " that was earlier than the previous one of "
+              + previousResults.getFirst().first);
+    }
+
+    final int howManyResults = previousResults.size();
+    // Ignore any results that are coming in too frequently.
+    if (howManyResults > 1) {
+      final long timeSinceMostRecent = currentTimeMS - previousResults.getLast().first;
+      if (timeSinceMostRecent < minimumTimeBetweenSamplesMs) {
+        return new RecognitionResult(previousTopLabel, previousTopLabelScore, false);
+      }
+    }
+
+    // Add the latest results to the head of the queue.
+    previousResults.addLast(new Pair<Long, float[]>(currentTimeMS, currentResults));
+
+    // Prune any earlier results that are too old for the averaging window.
+    final long timeLimit = currentTimeMS - averageWindowDurationMs;
+    while (previousResults.getFirst().first < timeLimit) {
+      previousResults.removeFirst();
+    }
+
+    // If there are too few results, assume the result will be unreliable and
+    // bail.
+    final long earliestTime = previousResults.getFirst().first;
+    final long samplesDuration = currentTimeMS - earliestTime;
+    if ((howManyResults < minimumCount)
+        || (samplesDuration < (averageWindowDurationMs / MINIMUM_TIME_FRACTION))) {
+      Log.v("RecognizeResult", "Too few results");
+      return new RecognitionResult(previousTopLabel, 0.0f, false);
+    }
+
+    // Calculate the average score across all the results in the window.
+    float[] averageScores = new float[labelsCount];
+    for (Pair<Long, float[]> previousResult : previousResults) {
+      final float[] scoresTensor = previousResult.second;
+      int i = 0;
+      while (i < scoresTensor.length) {
+        averageScores[i] += scoresTensor[i] / howManyResults;
+        ++i;
+      }
+    }
+
+    // Sort the averaged results in descending score order.
+    ScoreForSorting[] sortedAverageScores = new ScoreForSorting[labelsCount];
+    for (int i = 0; i < labelsCount; ++i) {
+      sortedAverageScores[i] = new ScoreForSorting(averageScores[i], i);
+    }
+    Arrays.sort(sortedAverageScores);
+
+    // See if the latest top score is enough to trigger a detection.
+    final int currentTopIndex = sortedAverageScores[0].index;
+    final String currentTopLabel = labels.get(currentTopIndex);
+    final float currentTopScore = sortedAverageScores[0].score;
+    // If we've recently had another label trigger, assume one that occurs too
+    // soon afterwards is a bad result.
+    long timeSinceLastTop;
+    if (previousTopLabel.equals(SILENCE_LABEL) || (previousTopLabelTime == Long.MIN_VALUE)) {
+      timeSinceLastTop = Long.MAX_VALUE;
+    } else {
+      timeSinceLastTop = currentTimeMS - previousTopLabelTime;
+    }
+    boolean isNewCommand;
+    if ((currentTopScore > detectionThreshold) && (timeSinceLastTop > suppressionMs)) {
+      previousTopLabel = currentTopLabel;
+      previousTopLabelTime = currentTimeMS;
+      previousTopLabelScore = currentTopScore;
+      isNewCommand = true;
+    } else {
+      isNewCommand = false;
+    }
+    return new RecognitionResult(currentTopLabel, currentTopScore, isNewCommand);
+  }
+}
diff --git a/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/ResultsView.java b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/ResultsView.java
new file mode 100644
index 0000000000000000000000000000000000000000..211d7e66fb20ce00e4e91ecc9134fbf2852e9f3d
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/ResultsView.java
@@ -0,0 +1,23 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+package org.tensorflow.demo;
+
+import java.util.List;
+import org.tensorflow.demo.Classifier.Recognition;
+
+public interface ResultsView {
+  public void setResults(final List<Recognition> results);
+}
diff --git a/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/SpeechActivity.java b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/SpeechActivity.java
new file mode 100644
index 0000000000000000000000000000000000000000..9c9c30bc0985e529b46c322fd0ff02590967afa2
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/SpeechActivity.java
@@ -0,0 +1,381 @@
+/*
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *       http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Demonstrates how to run an audio recognition model in Android.
+
+This example loads a simple speech recognition model trained by the tutorial at
+https://www.tensorflow.org/tutorials/audio_training
+
+The model files should be downloaded automatically from the TensorFlow website,
+but if you have a custom model you can update the LABEL_FILENAME and
+MODEL_FILENAME constants to point to your own files.
+
+The example application displays a list view with all of the known audio labels,
+and highlights each one when it thinks it has detected one through the
+microphone. The averaging of results to give a more reliable signal happens in
+the RecognizeCommands helper class.
+*/
+
+package org.tensorflow.demo;
+
+import android.animation.ValueAnimator;
+import android.app.Activity;
+import android.content.pm.PackageManager;
+import android.content.res.AssetFileDescriptor;
+import android.content.res.AssetManager;
+import android.media.AudioFormat;
+import android.media.AudioRecord;
+import android.media.MediaRecorder;
+import android.os.Build;
+import android.os.Bundle;
+import android.util.Log;
+import android.view.View;
+import android.widget.ArrayAdapter;
+import android.widget.Button;
+import android.widget.ListView;
+import java.io.BufferedReader;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.nio.MappedByteBuffer;
+import java.nio.channels.FileChannel;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.concurrent.locks.ReentrantLock;
+import org.tensorflow.lite.Interpreter;
+import org.tensorflow.lite.demo.R; // Explicit import needed for internal Google builds.
+
+/**
+ * An activity that listens for audio and then uses a TensorFlow model to detect particular classes,
+ * by default a small set of action words.
+ */
+public class SpeechActivity extends Activity {
+
+  // Constants that control the behavior of the recognition code and model
+  // settings. See the audio recognition tutorial for a detailed explanation of
+  // all these, but you should customize them to match your training settings if
+  // you are running your own model.
+  private static final int SAMPLE_RATE = 16000;
+  private static final int SAMPLE_DURATION_MS = 1000;
+  private static final int RECORDING_LENGTH = (int) (SAMPLE_RATE * SAMPLE_DURATION_MS / 1000);
+  private static final long AVERAGE_WINDOW_DURATION_MS = 500;
+  private static final float DETECTION_THRESHOLD = 0.70f;
+  private static final int SUPPRESSION_MS = 1500;
+  private static final int MINIMUM_COUNT = 3;
+  private static final long MINIMUM_TIME_BETWEEN_SAMPLES_MS = 30;
+  private static final String LABEL_FILENAME = "file:///android_asset/conv_actions_labels.txt";
+  private static final String MODEL_FILENAME = "file:///android_asset/conv_actions_frozen.tflite";
+
+  // UI elements.
+  private static final int REQUEST_RECORD_AUDIO = 13;
+  private Button quitButton;
+  private ListView labelsListView;
+  private static final String LOG_TAG = SpeechActivity.class.getSimpleName();
+
+  // Working variables.
+  short[] recordingBuffer = new short[RECORDING_LENGTH];
+  int recordingOffset = 0;
+  boolean shouldContinue = true;
+  private Thread recordingThread;
+  boolean shouldContinueRecognition = true;
+  private Thread recognitionThread;
+  private final ReentrantLock recordingBufferLock = new ReentrantLock();
+
+  private List<String> labels = new ArrayList<String>();
+  private List<String> displayedLabels = new ArrayList<>();
+  private RecognizeCommands recognizeCommands = null;
+
+  private Interpreter tfLite;
+
+  /** Memory-map the model file in Assets. */
+  private static MappedByteBuffer loadModelFile(AssetManager assets, String modelFilename)
+      throws IOException {
+    AssetFileDescriptor fileDescriptor = assets.openFd(modelFilename);
+    FileInputStream inputStream = new FileInputStream(fileDescriptor.getFileDescriptor());
+    FileChannel fileChannel = inputStream.getChannel();
+    long startOffset = fileDescriptor.getStartOffset();
+    long declaredLength = fileDescriptor.getDeclaredLength();
+    return fileChannel.map(FileChannel.MapMode.READ_ONLY, startOffset, declaredLength);
+  }
+
+  @Override
+  protected void onCreate(Bundle savedInstanceState) {
+    // Set up the UI.
+    super.onCreate(savedInstanceState);
+    setContentView(R.layout.activity_speech);
+    quitButton = (Button) findViewById(R.id.quit);
+    quitButton.setOnClickListener(
+        new View.OnClickListener() {
+          @Override
+          public void onClick(View view) {
+            moveTaskToBack(true);
+            android.os.Process.killProcess(android.os.Process.myPid());
+            System.exit(1);
+          }
+        });
+    labelsListView = (ListView) findViewById(R.id.list_view);
+
+    // Load the labels for the model, but only display those that don't start
+    // with an underscore.
+    String actualLabelFilename = LABEL_FILENAME.split("file:///android_asset/", -1)[1];
+    Log.i(LOG_TAG, "Reading labels from: " + actualLabelFilename);
+    BufferedReader br = null;
+    try {
+      br = new BufferedReader(new InputStreamReader(getAssets().open(actualLabelFilename)));
+      String line;
+      while ((line = br.readLine()) != null) {
+        labels.add(line);
+        if (line.charAt(0) != '_') {
+          displayedLabels.add(line.substring(0, 1).toUpperCase() + line.substring(1));
+        }
+      }
+      br.close();
+    } catch (IOException e) {
+      throw new RuntimeException("Problem reading label file!", e);
+    }
+
+    // Build a list view based on these labels.
+    ArrayAdapter<String> arrayAdapter =
+        new ArrayAdapter<String>(this, R.layout.list_text_item, displayedLabels);
+    labelsListView.setAdapter(arrayAdapter);
+
+    // Set up an object to smooth recognition results to increase accuracy.
+    recognizeCommands =
+        new RecognizeCommands(
+            labels,
+            AVERAGE_WINDOW_DURATION_MS,
+            DETECTION_THRESHOLD,
+            SUPPRESSION_MS,
+            MINIMUM_COUNT,
+            MINIMUM_TIME_BETWEEN_SAMPLES_MS);
+
+    String actualModelFilename = MODEL_FILENAME.split("file:///android_asset/", -1)[1];
+    try {
+      tfLite = new Interpreter(loadModelFile(getAssets(), actualModelFilename));
+    } catch (Exception e) {
+      throw new RuntimeException(e);
+    }
+
+    tfLite.resizeInput(0, new int[] {RECORDING_LENGTH, 1});
+    tfLite.resizeInput(1, new int[] {1});
+
+    // Start the recording and recognition threads.
+    requestMicrophonePermission();
+    startRecording();
+    startRecognition();
+  }
+
+  private void requestMicrophonePermission() {
+    if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.M) {
+      requestPermissions(
+          new String[]{android.Manifest.permission.RECORD_AUDIO}, REQUEST_RECORD_AUDIO);
+    }
+  }
+
+  @Override
+  public void onRequestPermissionsResult(
+      int requestCode, String[] permissions, int[] grantResults) {
+    if (requestCode == REQUEST_RECORD_AUDIO
+        && grantResults.length > 0
+        && grantResults[0] == PackageManager.PERMISSION_GRANTED) {
+      startRecording();
+      startRecognition();
+    }
+  }
+
+  public synchronized void startRecording() {
+    if (recordingThread != null) {
+      return;
+    }
+    shouldContinue = true;
+    recordingThread =
+        new Thread(
+            new Runnable() {
+              @Override
+              public void run() {
+                record();
+              }
+            });
+    recordingThread.start();
+  }
+
+  public synchronized void stopRecording() {
+    if (recordingThread == null) {
+      return;
+    }
+    shouldContinue = false;
+    recordingThread = null;
+  }
+
+  private void record() {
+    android.os.Process.setThreadPriority(android.os.Process.THREAD_PRIORITY_AUDIO);
+
+    // Estimate the buffer size we'll need for this device.
+    int bufferSize =
+        AudioRecord.getMinBufferSize(
+            SAMPLE_RATE, AudioFormat.CHANNEL_IN_MONO, AudioFormat.ENCODING_PCM_16BIT);
+    if (bufferSize == AudioRecord.ERROR || bufferSize == AudioRecord.ERROR_BAD_VALUE) {
+      bufferSize = SAMPLE_RATE * 2;
+    }
+    short[] audioBuffer = new short[bufferSize / 2];
+
+    AudioRecord record =
+        new AudioRecord(
+            MediaRecorder.AudioSource.DEFAULT,
+            SAMPLE_RATE,
+            AudioFormat.CHANNEL_IN_MONO,
+            AudioFormat.ENCODING_PCM_16BIT,
+            bufferSize);
+
+    if (record.getState() != AudioRecord.STATE_INITIALIZED) {
+      Log.e(LOG_TAG, "Audio Record can't initialize!");
+      return;
+    }
+
+    record.startRecording();
+
+    Log.v(LOG_TAG, "Start recording");
+
+    // Loop, gathering audio data and copying it to a round-robin buffer.
+    while (shouldContinue) {
+      int numberRead = record.read(audioBuffer, 0, audioBuffer.length);
+      int maxLength = recordingBuffer.length;
+      int newRecordingOffset = recordingOffset + numberRead;
+      int secondCopyLength = Math.max(0, newRecordingOffset - maxLength);
+      int firstCopyLength = numberRead - secondCopyLength;
+      // We store off all the data for the recognition thread to access. The ML
+      // thread will copy out of this buffer into its own, while holding the
+      // lock, so this should be thread safe.
+      recordingBufferLock.lock();
+      try {
+        System.arraycopy(audioBuffer, 0, recordingBuffer, recordingOffset, firstCopyLength);
+        System.arraycopy(audioBuffer, firstCopyLength, recordingBuffer, 0, secondCopyLength);
+        recordingOffset = newRecordingOffset % maxLength;
+      } finally {
+        recordingBufferLock.unlock();
+      }
+    }
+
+    record.stop();
+    record.release();
+  }
+
+  public synchronized void startRecognition() {
+    if (recognitionThread != null) {
+      return;
+    }
+    shouldContinueRecognition = true;
+    recognitionThread =
+        new Thread(
+            new Runnable() {
+              @Override
+              public void run() {
+                recognize();
+              }
+            });
+    recognitionThread.start();
+  }
+
+  public synchronized void stopRecognition() {
+    if (recognitionThread == null) {
+      return;
+    }
+    shouldContinueRecognition = false;
+    recognitionThread = null;
+  }
+
+  private void recognize() {
+    Log.v(LOG_TAG, "Start recognition");
+
+    short[] inputBuffer = new short[RECORDING_LENGTH];
+    float[][] floatInputBuffer = new float[RECORDING_LENGTH][1];
+    float[][] outputScores = new float[1][labels.size()];
+    int[] sampleRateList = new int[] {SAMPLE_RATE};
+
+    // Loop, grabbing recorded data and running the recognition model on it.
+    while (shouldContinueRecognition) {
+      // The recording thread places data in this round-robin buffer, so lock to
+      // make sure there's no writing happening and then copy it to our own
+      // local version.
+      recordingBufferLock.lock();
+      try {
+        int maxLength = recordingBuffer.length;
+        int firstCopyLength = maxLength - recordingOffset;
+        int secondCopyLength = recordingOffset;
+        System.arraycopy(recordingBuffer, recordingOffset, inputBuffer, 0, firstCopyLength);
+        System.arraycopy(recordingBuffer, 0, inputBuffer, firstCopyLength, secondCopyLength);
+      } finally {
+        recordingBufferLock.unlock();
+      }
+
+      // We need to feed in float values between -1.0f and 1.0f, so divide the
+      // signed 16-bit inputs.
+      for (int i = 0; i < RECORDING_LENGTH; ++i) {
+        floatInputBuffer[i][0] = inputBuffer[i] / 32767.0f;
+      }
+
+      Object[] inputArray = {floatInputBuffer, sampleRateList};
+      Map<Integer, Object> outputMap = new HashMap<>();
+      outputMap.put(0, outputScores);
+
+      // Run the model.
+      tfLite.runForMultipleInputsOutputs(inputArray, outputMap);
+
+      // Use the smoother to figure out if we've had a real recognition event.
+      long currentTime = System.currentTimeMillis();
+      final RecognizeCommands.RecognitionResult result =
+          recognizeCommands.processLatestResults(outputScores[0], currentTime);
+
+      runOnUiThread(
+          new Runnable() {
+            @Override
+            public void run() {
+              // If we do have a new command, highlight the right list entry.
+              if (!result.foundCommand.startsWith("_") && result.isNewCommand) {
+                int labelIndex = -1;
+                for (int i = 0; i < labels.size(); ++i) {
+                  if (labels.get(i).equals(result.foundCommand)) {
+                    labelIndex = i;
+                  }
+                }
+                final View labelView = (View) labelsListView.getChildAt(labelIndex - 2);
+                ValueAnimator colorAnimation =
+                    ValueAnimator.ofArgb(0x00b3ccff, 0xffb3ccff, 0x00b3ccff);
+                colorAnimation.setDuration(750);
+                colorAnimation.addUpdateListener(
+                    new ValueAnimator.AnimatorUpdateListener() {
+                      @Override
+                      public void onAnimationUpdate(ValueAnimator animator) {
+                        labelView.setBackgroundColor((int) animator.getAnimatedValue());
+                      }
+                    });
+                colorAnimation.start();
+              }
+            }
+          });
+      try {
+        // We don't need to run too frequently, so snooze for a bit.
+        Thread.sleep(MINIMUM_TIME_BETWEEN_SAMPLES_MS);
+      } catch (InterruptedException e) {
+        // Ignore
+      }
+    }
+
+    Log.v(LOG_TAG, "End recognition");
+  }
+}
diff --git a/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/TFLiteImageClassifier.java b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/TFLiteImageClassifier.java
new file mode 100644
index 0000000000000000000000000000000000000000..d75c3ceadabd2dad73b1e5feda3ae88181769e74
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/TFLiteImageClassifier.java
@@ -0,0 +1,209 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+package org.tensorflow.demo;
+
+import android.content.res.AssetFileDescriptor;
+import android.content.res.AssetManager;
+import android.graphics.Bitmap;
+import android.os.SystemClock;
+import android.os.Trace;
+import android.util.Log;
+import java.io.BufferedReader;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+import java.nio.MappedByteBuffer;
+import java.nio.channels.FileChannel;
+import java.util.ArrayList;
+import java.util.Comparator;
+import java.util.List;
+import java.util.PriorityQueue;
+import java.util.Vector;
+import org.tensorflow.lite.Interpreter;
+
+/** A classifier specialized to label images using TensorFlow. */
+public class TFLiteImageClassifier implements Classifier {
+  private static final String TAG = "TFLiteImageClassifier";
+
+  // Only return this many results with at least this confidence.
+  private static final int MAX_RESULTS = 3;
+
+  private Interpreter tfLite;
+
+  /** Dimensions of inputs. */
+  private static final int DIM_BATCH_SIZE = 1;
+
+  private static final int DIM_PIXEL_SIZE = 3;
+
+  private static final int DIM_IMG_SIZE_X = 224;
+  private static final int DIM_IMG_SIZE_Y = 224;
+
+  byte[][] labelProb;
+
+  // Pre-allocated buffers.
+  private Vector<String> labels = new Vector<String>();
+  private int[] intValues;
+  private ByteBuffer imgData = null;
+
+  private TFLiteImageClassifier() {}
+
+  /** Memory-map the model file in Assets. */
+  private static MappedByteBuffer loadModelFile(AssetManager assets, String modelFilename)
+      throws IOException {
+    AssetFileDescriptor fileDescriptor = assets.openFd(modelFilename);
+    FileInputStream inputStream = new FileInputStream(fileDescriptor.getFileDescriptor());
+    FileChannel fileChannel = inputStream.getChannel();
+    long startOffset = fileDescriptor.getStartOffset();
+    long declaredLength = fileDescriptor.getDeclaredLength();
+    return fileChannel.map(FileChannel.MapMode.READ_ONLY, startOffset, declaredLength);
+  }
+
+  /**
+   * Initializes a native TensorFlow session for classifying images.
+   *
+   * @param assetManager The asset manager to be used to load assets.
+   * @param modelFilename The filepath of the model GraphDef protocol buffer.
+   * @param labelFilename The filepath of label file for classes.
+   * @param inputSize The input size. A square image of inputSize x inputSize is assumed.
+   * @throws IOException
+   */
+  public static Classifier create(
+      AssetManager assetManager, String modelFilename, String labelFilename, int inputSize) {
+    TFLiteImageClassifier c = new TFLiteImageClassifier();
+
+    // Read the label names into memory.
+    // TODO(andrewharp): make this handle non-assets.
+    Log.i(TAG, "Reading labels from: " + labelFilename);
+    BufferedReader br = null;
+    try {
+      br = new BufferedReader(new InputStreamReader(assetManager.open(labelFilename)));
+      String line;
+      while ((line = br.readLine()) != null) {
+        c.labels.add(line);
+      }
+      br.close();
+    } catch (IOException e) {
+      throw new RuntimeException("Problem reading label file!" , e);
+    }
+
+    c.imgData =
+        ByteBuffer.allocateDirect(
+            DIM_BATCH_SIZE * DIM_IMG_SIZE_X * DIM_IMG_SIZE_Y * DIM_PIXEL_SIZE);
+
+    c.imgData.order(ByteOrder.nativeOrder());
+    try {
+      c.tfLite = new Interpreter(loadModelFile(assetManager, modelFilename));
+    } catch (Exception e) {
+      throw new RuntimeException(e);
+    }
+
+    // The shape of the output is [N, NUM_CLASSES], where N is the batch size.
+    Log.i(TAG, "Read " + c.labels.size() + " labels");
+
+    // Pre-allocate buffers.
+    c.intValues = new int[inputSize * inputSize];
+
+    c.labelProb = new byte[1][c.labels.size()];
+
+    return c;
+  }
+
+  /** Writes Image data into a {@code ByteBuffer}. */
+  private void convertBitmapToByteBuffer(Bitmap bitmap) {
+    if (imgData == null) {
+      return;
+    }
+    imgData.rewind();
+    bitmap.getPixels(intValues, 0, bitmap.getWidth(), 0, 0, bitmap.getWidth(), bitmap.getHeight());
+    // Convert the image to floating point.
+    int pixel = 0;
+    long startTime = SystemClock.uptimeMillis();
+    for (int i = 0; i < DIM_IMG_SIZE_X; ++i) {
+      for (int j = 0; j < DIM_IMG_SIZE_Y; ++j) {
+        final int val = intValues[pixel++];
+        imgData.put((byte) ((val >> 16) & 0xFF));
+        imgData.put((byte) ((val >> 8) & 0xFF));
+        imgData.put((byte) (val & 0xFF));
+      }
+    }
+    long endTime = SystemClock.uptimeMillis();
+    Log.d(TAG, "Timecost to put values into ByteBuffer: " + Long.toString(endTime - startTime));
+  }
+
+  @Override
+  public List<Recognition> recognizeImage(final Bitmap bitmap) {
+    // Log this method so that it can be analyzed with systrace.
+    Trace.beginSection("recognizeImage");
+
+    Trace.beginSection("preprocessBitmap");
+
+    long startTime;
+    long endTime;
+    startTime = SystemClock.uptimeMillis();
+
+    convertBitmapToByteBuffer(bitmap);
+
+    // Run the inference call.
+    Trace.beginSection("run");
+    startTime = SystemClock.uptimeMillis();
+    tfLite.run(imgData, labelProb);
+    endTime = SystemClock.uptimeMillis();
+    Log.i(TAG, "Inf time: " + (endTime - startTime));
+    Trace.endSection();
+
+    // Find the best classifications.
+    PriorityQueue<Recognition> pq =
+        new PriorityQueue<Recognition>(
+            3,
+            new Comparator<Recognition>() {
+              @Override
+              public int compare(Recognition lhs, Recognition rhs) {
+                // Intentionally reversed to put high confidence at the head of the queue.
+                return Float.compare(rhs.getConfidence(), lhs.getConfidence());
+              }
+            });
+    for (int i = 0; i < labels.size(); ++i) {
+      pq.add(
+          new Recognition(
+              "" + i,
+              labels.size() > i ? labels.get(i) : "unknown",
+              (float) labelProb[0][i],
+              null));
+    }
+    final ArrayList<Recognition> recognitions = new ArrayList<Recognition>();
+    int recognitionsSize = Math.min(pq.size(), MAX_RESULTS);
+    for (int i = 0; i < recognitionsSize; ++i) {
+      recognitions.add(pq.poll());
+    }
+    Trace.endSection(); // "recognizeImage"
+    return recognitions;
+  }
+
+  @Override
+  public void enableStatLogging(boolean logStats) {
+  }
+
+  @Override
+  public String getStatString() {
+    return "";
+  }
+
+  @Override
+  public void close() {
+  }
+}
diff --git a/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/TFLiteObjectDetectionAPIModel.java b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/TFLiteObjectDetectionAPIModel.java
new file mode 100644
index 0000000000000000000000000000000000000000..bfb4a0a04bc90566736864bf62340d1032961858
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/TFLiteObjectDetectionAPIModel.java
@@ -0,0 +1,292 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+package org.tensorflow.demo;
+
+import android.content.res.AssetFileDescriptor;
+import android.content.res.AssetManager;
+import android.graphics.Bitmap;
+import android.graphics.RectF;
+import android.os.Trace;
+import java.io.BufferedReader;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.nio.MappedByteBuffer;
+import java.nio.channels.FileChannel;
+import java.util.ArrayList;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.PriorityQueue;
+import java.util.StringTokenizer;
+import java.util.Vector;
+import org.tensorflow.demo.env.Logger;
+import org.tensorflow.lite.Interpreter;
+
+/**
+ * Wrapper for frozen detection models trained using the Tensorflow Object Detection API:
+ * github.com/tensorflow/models/tree/master/research/object_detection
+ */
+public class TFLiteObjectDetectionAPIModel implements Classifier {
+  private static final Logger LOGGER = new Logger();
+
+  // Only return this many results.
+  private static final int NUM_RESULTS = 1917;
+  private static final int NUM_CLASSES = 91;
+
+  private static final float Y_SCALE = 10.0f;
+  private static final float X_SCALE = 10.0f;
+  private static final float H_SCALE = 5.0f;
+  private static final float W_SCALE = 5.0f;
+
+  // Config values.
+  private int inputSize;
+
+  private final float[][] boxPriors = new float[4][NUM_RESULTS];
+
+  // Pre-allocated buffers.
+  private Vector<String> labels = new Vector<String>();
+  private int[] intValues;
+  private float[][][] outputLocations;
+  private float[][][] outputClasses;
+
+  float[][][][] img;
+
+  private Interpreter tfLite;
+
+  private float expit(final float x) {
+    return (float) (1. / (1. + Math.exp(-x)));
+  }
+
+  /** Memory-map the model file in Assets. */
+  private static MappedByteBuffer loadModelFile(AssetManager assets, String modelFilename)
+      throws IOException {
+    AssetFileDescriptor fileDescriptor = assets.openFd(modelFilename);
+    FileInputStream inputStream = new FileInputStream(fileDescriptor.getFileDescriptor());
+    FileChannel fileChannel = inputStream.getChannel();
+    long startOffset = fileDescriptor.getStartOffset();
+    long declaredLength = fileDescriptor.getDeclaredLength();
+    return fileChannel.map(FileChannel.MapMode.READ_ONLY, startOffset, declaredLength);
+  }
+
+  private void loadCoderOptions(
+      final AssetManager assetManager, final String locationFilename, final float[][] boxPriors)
+      throws IOException {
+    // Try to be intelligent about opening from assets or sdcard depending on prefix.
+    final String assetPrefix = "file:///android_asset/";
+    InputStream is;
+    if (locationFilename.startsWith(assetPrefix)) {
+      is = assetManager.open(locationFilename.split(assetPrefix, -1)[1]);
+    } else {
+      is = new FileInputStream(locationFilename);
+    }
+
+    final BufferedReader reader = new BufferedReader(new InputStreamReader(is));
+
+    for (int lineNum = 0; lineNum < 4; ++lineNum) {
+      String line = reader.readLine();
+      final StringTokenizer st = new StringTokenizer(line, ", ");
+      int priorIndex = 0;
+      while (st.hasMoreTokens()) {
+        final String token = st.nextToken();
+        try {
+          final float number = Float.parseFloat(token);
+          boxPriors[lineNum][priorIndex++] = number;
+        } catch (final NumberFormatException e) {
+          // Silently ignore.
+        }
+      }
+      if (priorIndex != NUM_RESULTS) {
+        throw new RuntimeException(
+            "BoxPrior length mismatch: " + priorIndex + " vs " + NUM_RESULTS);
+      }
+    }
+
+    LOGGER.i("Loaded box priors!");
+  }
+
+  void decodeCenterSizeBoxes(float[][][] predictions) {
+    for (int i = 0; i < NUM_RESULTS; ++i) {
+      float ycenter = predictions[0][i][0] / Y_SCALE * boxPriors[2][i] + boxPriors[0][i];
+      float xcenter = predictions[0][i][1] / X_SCALE * boxPriors[3][i] + boxPriors[1][i];
+      float h = (float) Math.exp(predictions[0][i][2] / H_SCALE) * boxPriors[2][i];
+      float w = (float) Math.exp(predictions[0][i][3] / W_SCALE) * boxPriors[3][i];
+
+      float ymin = ycenter - h / 2.f;
+      float xmin = xcenter - w / 2.f;
+      float ymax = ycenter + h / 2.f;
+      float xmax = xcenter + w / 2.f;
+
+      predictions[0][i][0] = ymin;
+      predictions[0][i][1] = xmin;
+      predictions[0][i][2] = ymax;
+      predictions[0][i][3] = xmax;
+    }
+  }
+
+  /**
+   * Initializes a native TensorFlow session for classifying images.
+   *
+   * @param assetManager The asset manager to be used to load assets.
+   * @param modelFilename The filepath of the model GraphDef protocol buffer.
+   * @param labelFilename The filepath of label file for classes.
+   */
+  public static Classifier create(
+      final AssetManager assetManager,
+      final String modelFilename,
+      final String labelFilename,
+      final int inputSize) throws IOException {
+    final TFLiteObjectDetectionAPIModel d = new TFLiteObjectDetectionAPIModel();
+
+    d.loadCoderOptions(assetManager, "file:///android_asset/box_priors.txt", d.boxPriors);
+
+    InputStream labelsInput = null;
+    String actualFilename = labelFilename.split("file:///android_asset/")[1];
+    labelsInput = assetManager.open(actualFilename);
+    BufferedReader br = null;
+    br = new BufferedReader(new InputStreamReader(labelsInput));
+    String line;
+    while ((line = br.readLine()) != null) {
+      LOGGER.w(line);
+      d.labels.add(line);
+    }
+    br.close();
+
+    d.inputSize = inputSize;
+
+    try {
+      d.tfLite = new Interpreter(loadModelFile(assetManager, modelFilename));
+    } catch (Exception e) {
+      throw new RuntimeException(e);
+    }
+
+    // Pre-allocate buffers.
+    d.img = new float[1][inputSize][inputSize][3];
+
+    d.intValues = new int[d.inputSize * d.inputSize];
+    d.outputLocations = new float[1][NUM_RESULTS][4];
+    d.outputClasses = new float[1][NUM_RESULTS][NUM_CLASSES];
+    return d;
+  }
+
+  private TFLiteObjectDetectionAPIModel() {}
+
+  @Override
+  public List<Recognition> recognizeImage(final Bitmap bitmap) {
+    // Log this method so that it can be analyzed with systrace.
+    Trace.beginSection("recognizeImage");
+
+    Trace.beginSection("preprocessBitmap");
+    // Preprocess the image data from 0-255 int to normalized float based
+    // on the provided parameters.
+    bitmap.getPixels(intValues, 0, bitmap.getWidth(), 0, 0, bitmap.getWidth(), bitmap.getHeight());
+
+    for (int i = 0; i < inputSize; ++i) {
+      for (int j = 0; j < inputSize; ++j) {
+        int pixel = intValues[j * inputSize + i];
+        img[0][j][i][2] = (float) (pixel & 0xFF) / 128.0f - 1.0f;
+        img[0][j][i][1] = (float) ((pixel >> 8) & 0xFF) / 128.0f - 1.0f;
+        img[0][j][i][0] = (float) ((pixel >> 16) & 0xFF) / 128.0f - 1.0f;
+      }
+    }
+    Trace.endSection(); // preprocessBitmap
+
+    // Copy the input data into TensorFlow.
+    Trace.beginSection("feed");
+    outputLocations = new float[1][NUM_RESULTS][4];
+    outputClasses = new float[1][NUM_RESULTS][NUM_CLASSES];
+
+    Object[] inputArray = {img};
+    Map<Integer, Object> outputMap = new HashMap<>();
+    outputMap.put(0, outputLocations);
+    outputMap.put(1, outputClasses);
+    Trace.endSection();
+
+    // Run the inference call.
+    Trace.beginSection("run");
+    tfLite.runForMultipleInputsOutputs(inputArray, outputMap);
+    Trace.endSection();
+
+    decodeCenterSizeBoxes(outputLocations);
+
+    // Find the best detections.
+    final PriorityQueue<Recognition> pq =
+        new PriorityQueue<Recognition>(
+            1,
+            new Comparator<Recognition>() {
+              @Override
+              public int compare(final Recognition lhs, final Recognition rhs) {
+                // Intentionally reversed to put high confidence at the head of the queue.
+                return Float.compare(rhs.getConfidence(), lhs.getConfidence());
+              }
+            });
+
+    // Scale them back to the input size.
+    for (int i = 0; i < NUM_RESULTS; ++i) {
+      float topClassScore = -1000f;
+      int topClassScoreIndex = -1;
+
+      // Skip the first catch-all class.
+      for (int j = 1; j < NUM_CLASSES; ++j) {
+        float score = expit(outputClasses[0][i][j]);
+
+        if (score > topClassScore) {
+          topClassScoreIndex = j;
+          topClassScore = score;
+        }
+      }
+
+      if (topClassScore > 0.001f) {
+        final RectF detection =
+            new RectF(
+                outputLocations[0][i][1] * inputSize,
+                outputLocations[0][i][0] * inputSize,
+                outputLocations[0][i][3] * inputSize,
+                outputLocations[0][i][2] * inputSize);
+
+        pq.add(
+            new Recognition(
+                "" + i,
+                labels.get(topClassScoreIndex),
+                outputClasses[0][i][topClassScoreIndex],
+                detection));
+      }
+    }
+
+    final ArrayList<Recognition> recognitions = new ArrayList<Recognition>();
+    for (int i = 0; i < Math.min(pq.size(), 10); ++i) {
+      Recognition recog = pq.poll();
+      recognitions.add(recog);
+    }
+    Trace.endSection(); // "recognizeImage"
+    return recognitions;
+  }
+
+  @Override
+  public void enableStatLogging(final boolean logStats) {
+  }
+
+  @Override
+  public String getStatString() {
+    return "";
+  }
+
+  @Override
+  public void close() {
+  }
+}
diff --git a/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/env/AssetUtils.java b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/env/AssetUtils.java
new file mode 100644
index 0000000000000000000000000000000000000000..c50efdf889145ad717445015fb94a37568939b73
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/env/AssetUtils.java
@@ -0,0 +1,78 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+package org.tensorflow.demo.env;
+
+import android.content.Context;
+import android.content.res.AssetManager;
+import android.util.Log;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+
+/** Utilities for dealing with assets. */
+public class AssetUtils {
+
+  private static final String TAG = AssetUtils.class.getSimpleName();
+
+  private static final int BYTE_BUF_SIZE = 2048;
+
+  /**
+   * Copies a file from assets.
+   *
+   * @param context application context used to discover assets.
+   * @param assetName the relative file name within assets.
+   * @param targetName the target file name, always over write the existing file.
+   * @throws IOException if operation fails.
+   */
+  public static void copy(Context context, String assetName, String targetName) throws IOException {
+
+    Log.d(TAG, "creating file " + targetName + " from " + assetName);
+
+    File targetFile = null;
+    InputStream inputStream = null;
+    FileOutputStream outputStream = null;
+
+    try {
+      AssetManager assets = context.getAssets();
+      targetFile = new File(targetName);
+      inputStream = assets.open(assetName);
+      // TODO(kanlig): refactor log messages to make them more useful.
+      Log.d(TAG, "Creating outputstream");
+      outputStream = new FileOutputStream(targetFile, false /* append */);
+      copy(inputStream, outputStream);
+    } finally {
+      if (outputStream != null) {
+        outputStream.close();
+      }
+      if (inputStream != null) {
+        inputStream.close();
+      }
+    }
+  }
+
+  private static void copy(InputStream from, OutputStream to) throws IOException {
+    byte[] buf = new byte[BYTE_BUF_SIZE];
+    while (true) {
+      int r = from.read(buf);
+      if (r == -1) {
+        break;
+      }
+      to.write(buf, 0, r);
+    }
+  }
+}
diff --git a/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/env/BorderedText.java b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/env/BorderedText.java
new file mode 100644
index 0000000000000000000000000000000000000000..decfc3d8793d127800feb5d58cdaf3f84512d840
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/env/BorderedText.java
@@ -0,0 +1,117 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+package org.tensorflow.demo.env;
+
+import android.graphics.Canvas;
+import android.graphics.Color;
+import android.graphics.Paint;
+import android.graphics.Paint.Align;
+import android.graphics.Paint.Style;
+import android.graphics.Rect;
+import android.graphics.Typeface;
+import java.util.Vector;
+
+/**
+ * A class that encapsulates the tedious bits of rendering legible, bordered text onto a canvas.
+ */
+public class BorderedText {
+  private final Paint interiorPaint;
+  private final Paint exteriorPaint;
+
+  private final float textSize;
+
+  /**
+   * Creates a left-aligned bordered text object with a white interior, and a black exterior with
+   * the specified text size.
+   *
+   * @param textSize text size in pixels
+   */
+  public BorderedText(final float textSize) {
+    this(Color.WHITE, Color.BLACK, textSize);
+  }
+
+  /**
+   * Create a bordered text object with the specified interior and exterior colors, text size and
+   * alignment.
+   *
+   * @param interiorColor the interior text color
+   * @param exteriorColor the exterior text color
+   * @param textSize text size in pixels
+   */
+  public BorderedText(final int interiorColor, final int exteriorColor, final float textSize) {
+    interiorPaint = new Paint();
+    interiorPaint.setTextSize(textSize);
+    interiorPaint.setColor(interiorColor);
+    interiorPaint.setStyle(Style.FILL);
+    interiorPaint.setAntiAlias(false);
+    interiorPaint.setAlpha(255);
+
+    exteriorPaint = new Paint();
+    exteriorPaint.setTextSize(textSize);
+    exteriorPaint.setColor(exteriorColor);
+    exteriorPaint.setStyle(Style.FILL_AND_STROKE);
+    exteriorPaint.setStrokeWidth(textSize / 8);
+    exteriorPaint.setAntiAlias(false);
+    exteriorPaint.setAlpha(255);
+
+    this.textSize = textSize;
+  }
+
+  public void setTypeface(Typeface typeface) {
+    interiorPaint.setTypeface(typeface);
+    exteriorPaint.setTypeface(typeface);
+  }
+
+  public void drawText(final Canvas canvas, final float posX, final float posY, final String text) {
+    canvas.drawText(text, posX, posY, exteriorPaint);
+    canvas.drawText(text, posX, posY, interiorPaint);
+  }
+
+  public void drawLines(Canvas canvas, final float posX, final float posY, Vector<String> lines) {
+    int lineNum = 0;
+    for (final String line : lines) {
+      drawText(canvas, posX, posY - getTextSize() * (lines.size() - lineNum - 1), line);
+      ++lineNum;
+    }
+  }
+
+  public void setInteriorColor(final int color) {
+    interiorPaint.setColor(color);
+  }
+
+  public void setExteriorColor(final int color) {
+    exteriorPaint.setColor(color);
+  }
+
+  public float getTextSize() {
+    return textSize;
+  }
+
+  public void setAlpha(final int alpha) {
+    interiorPaint.setAlpha(alpha);
+    exteriorPaint.setAlpha(alpha);
+  }
+
+  public void getTextBounds(
+      final String line, final int index, final int count, final Rect lineBounds) {
+    interiorPaint.getTextBounds(line, index, count, lineBounds);
+  }
+
+  public void setTextAlign(final Align align) {
+    interiorPaint.setTextAlign(align);
+    exteriorPaint.setTextAlign(align);
+  }
+}
diff --git a/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/env/ImageUtils.java b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/env/ImageUtils.java
new file mode 100644
index 0000000000000000000000000000000000000000..e02c6559176d40d3df42bccc0c374e60f70371b2
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/env/ImageUtils.java
@@ -0,0 +1,344 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+package org.tensorflow.demo.env;
+
+import android.graphics.Bitmap;
+import android.graphics.Matrix;
+import android.os.Environment;
+import java.io.File;
+import java.io.FileOutputStream;
+
+/**
+ * Utility class for manipulating images.
+ **/
+public class ImageUtils {
+  @SuppressWarnings("unused")
+  private static final Logger LOGGER = new Logger();
+
+  static {
+    try {
+      System.loadLibrary("tensorflow_demo");
+    } catch (UnsatisfiedLinkError e) {
+      LOGGER.w("Native library not found, native RGB -> YUV conversion may be unavailable.");
+    }
+  }
+
+  /**
+   * Utility method to compute the allocated size in bytes of a YUV420SP image
+   * of the given dimensions.
+   */
+  public static int getYUVByteSize(final int width, final int height) {
+    // The luminance plane requires 1 byte per pixel.
+    final int ySize = width * height;
+
+    // The UV plane works on 2x2 blocks, so dimensions with odd size must be rounded up.
+    // Each 2x2 block takes 2 bytes to encode, one each for U and V.
+    final int uvSize = ((width + 1) / 2) * ((height + 1) / 2) * 2;
+
+    return ySize + uvSize;
+  }
+
+  /**
+   * Saves a Bitmap object to disk for analysis.
+   *
+   * @param bitmap The bitmap to save.
+   */
+  public static void saveBitmap(final Bitmap bitmap) {
+    saveBitmap(bitmap, "preview.png");
+  }
+
+  /**
+   * Saves a Bitmap object to disk for analysis.
+   *
+   * @param bitmap The bitmap to save.
+   * @param filename The location to save the bitmap to.
+   */
+  public static void saveBitmap(final Bitmap bitmap, final String filename) {
+    final String root =
+        Environment.getExternalStorageDirectory().getAbsolutePath() + File.separator + "tensorflow";
+    LOGGER.i("Saving %dx%d bitmap to %s.", bitmap.getWidth(), bitmap.getHeight(), root);
+    final File myDir = new File(root);
+
+    if (!myDir.mkdirs()) {
+      LOGGER.i("Make dir failed");
+    }
+
+    final String fname = filename;
+    final File file = new File(myDir, fname);
+    if (file.exists()) {
+      file.delete();
+    }
+    try {
+      final FileOutputStream out = new FileOutputStream(file);
+      bitmap.compress(Bitmap.CompressFormat.PNG, 99, out);
+      out.flush();
+      out.close();
+    } catch (final Exception e) {
+      LOGGER.e(e, "Exception!");
+    }
+  }
+
+  // This value is 2 ^ 18 - 1, and is used to clamp the RGB values before their ranges
+  // are normalized to eight bits.
+  static final int kMaxChannelValue = 262143;
+
+  // Always prefer the native implementation if available.
+  private static boolean useNativeConversion = false;
+
+  public static void convertYUV420SPToARGB8888(
+      byte[] input,
+      int width,
+      int height,
+      int[] output) {
+    if (useNativeConversion) {
+      try {
+        ImageUtils.convertYUV420SPToARGB8888(input, output, width, height, false);
+        return;
+      } catch (UnsatisfiedLinkError e) {
+        LOGGER.w(
+            "Native YUV420SP -> RGB implementation not found, falling back to Java implementation");
+        useNativeConversion = false;
+      }
+    }
+
+    // Java implementation of YUV420SP to ARGB8888 converting
+    final int frameSize = width * height;
+    for (int j = 0, yp = 0; j < height; j++) {
+      int uvp = frameSize + (j >> 1) * width;
+      int u = 0;
+      int v = 0;
+
+      for (int i = 0; i < width; i++, yp++) {
+        int y = 0xff & input[yp];
+        if ((i & 1) == 0) {
+          v = 0xff & input[uvp++];
+          u = 0xff & input[uvp++];
+        }
+
+        output[yp] = YUV2RGB(y, u, v);
+      }
+    }
+  }
+
+  private static int YUV2RGB(int y, int u, int v) {
+    // Adjust and check YUV values
+    y = (y - 16) < 0 ? 0 : (y - 16);
+    u -= 128;
+    v -= 128;
+
+    // This is the floating point equivalent. We do the conversion in integer
+    // because some Android devices do not have floating point in hardware.
+    // nR = (int)(1.164 * nY + 2.018 * nU);
+    // nG = (int)(1.164 * nY - 0.813 * nV - 0.391 * nU);
+    // nB = (int)(1.164 * nY + 1.596 * nV);
+    int y1192 = 1192 * y;
+    int r = (y1192 + 1634 * v);
+    int g = (y1192 - 833 * v - 400 * u);
+    int b = (y1192 + 2066 * u);
+
+    // Clipping RGB values to be inside boundaries [ 0 , kMaxChannelValue ]
+    r = r > kMaxChannelValue ? kMaxChannelValue : (r < 0 ? 0 : r);
+    g = g > kMaxChannelValue ? kMaxChannelValue : (g < 0 ? 0 : g);
+    b = b > kMaxChannelValue ? kMaxChannelValue : (b < 0 ? 0 : b);
+
+    return 0xff000000 | ((r << 6) & 0xff0000) | ((g >> 2) & 0xff00) | ((b >> 10) & 0xff);
+  }
+
+
+  public static void convertYUV420ToARGB8888(
+      byte[] yData,
+      byte[] uData,
+      byte[] vData,
+      int width,
+      int height,
+      int yRowStride,
+      int uvRowStride,
+      int uvPixelStride,
+      int[] out) {
+    if (useNativeConversion) {
+      try {
+        convertYUV420ToARGB8888(
+            yData, uData, vData, out, width, height, yRowStride, uvRowStride, uvPixelStride, false);
+        return;
+      } catch (UnsatisfiedLinkError e) {
+        LOGGER.w(
+            "Native YUV420 -> RGB implementation not found, falling back to Java implementation");
+        useNativeConversion = false;
+      }
+    }
+
+    int yp = 0;
+    for (int j = 0; j < height; j++) {
+      int pY = yRowStride * j;
+      int pUV = uvRowStride * (j >> 1);
+
+      for (int i = 0; i < width; i++) {
+        int uv_offset = pUV + (i >> 1) * uvPixelStride;
+
+        out[yp++] = YUV2RGB(
+            0xff & yData[pY + i],
+            0xff & uData[uv_offset],
+            0xff & vData[uv_offset]);
+      }
+    }
+  }
+
+
+  /**
+   * Converts YUV420 semi-planar data to ARGB 8888 data using the supplied width and height. The
+   * input and output must already be allocated and non-null. For efficiency, no error checking is
+   * performed.
+   *
+   * @param input The array of YUV 4:2:0 input data.
+   * @param output A pre-allocated array for the ARGB 8:8:8:8 output data.
+   * @param width The width of the input image.
+   * @param height The height of the input image.
+   * @param halfSize If true, downsample to 50% in each dimension, otherwise not.
+   */
+  private static native void convertYUV420SPToARGB8888(
+      byte[] input, int[] output, int width, int height, boolean halfSize);
+
+  /**
+   * Converts YUV420 semi-planar data to ARGB 8888 data using the supplied width
+   * and height. The input and output must already be allocated and non-null.
+   * For efficiency, no error checking is performed.
+   *
+   * @param y
+   * @param u
+   * @param v
+   * @param uvPixelStride
+   * @param width The width of the input image.
+   * @param height The height of the input image.
+   * @param halfSize If true, downsample to 50% in each dimension, otherwise not.
+   * @param output A pre-allocated array for the ARGB 8:8:8:8 output data.
+   */
+  private static native void convertYUV420ToARGB8888(
+      byte[] y,
+      byte[] u,
+      byte[] v,
+      int[] output,
+      int width,
+      int height,
+      int yRowStride,
+      int uvRowStride,
+      int uvPixelStride,
+      boolean halfSize);
+
+  /**
+   * Converts YUV420 semi-planar data to RGB 565 data using the supplied width
+   * and height. The input and output must already be allocated and non-null.
+   * For efficiency, no error checking is performed.
+   *
+   * @param input The array of YUV 4:2:0 input data.
+   * @param output A pre-allocated array for the RGB 5:6:5 output data.
+   * @param width The width of the input image.
+   * @param height The height of the input image.
+   */
+  private static native void convertYUV420SPToRGB565(
+      byte[] input, byte[] output, int width, int height);
+
+  /**
+   * Converts 32-bit ARGB8888 image data to YUV420SP data.  This is useful, for
+   * instance, in creating data to feed the classes that rely on raw camera
+   * preview frames.
+   *
+   * @param input An array of input pixels in ARGB8888 format.
+   * @param output A pre-allocated array for the YUV420SP output data.
+   * @param width The width of the input image.
+   * @param height The height of the input image.
+   */
+  private static native void convertARGB8888ToYUV420SP(
+      int[] input, byte[] output, int width, int height);
+
+  /**
+   * Converts 16-bit RGB565 image data to YUV420SP data.  This is useful, for
+   * instance, in creating data to feed the classes that rely on raw camera
+   * preview frames.
+   *
+   * @param input An array of input pixels in RGB565 format.
+   * @param output A pre-allocated array for the YUV420SP output data.
+   * @param width The width of the input image.
+   * @param height The height of the input image.
+   */
+  private static native void convertRGB565ToYUV420SP(
+      byte[] input, byte[] output, int width, int height);
+
+  /**
+   * Returns a transformation matrix from one reference frame into another.
+   * Handles cropping (if maintaining aspect ratio is desired) and rotation.
+   *
+   * @param srcWidth Width of source frame.
+   * @param srcHeight Height of source frame.
+   * @param dstWidth Width of destination frame.
+   * @param dstHeight Height of destination frame.
+   * @param applyRotation Amount of rotation to apply from one frame to another.
+   *  Must be a multiple of 90.
+   * @param maintainAspectRatio If true, will ensure that scaling in x and y remains constant,
+   * cropping the image if necessary.
+   * @return The transformation fulfilling the desired requirements.
+   */
+  public static Matrix getTransformationMatrix(
+      final int srcWidth,
+      final int srcHeight,
+      final int dstWidth,
+      final int dstHeight,
+      final int applyRotation,
+      final boolean maintainAspectRatio) {
+    final Matrix matrix = new Matrix();
+
+    if (applyRotation != 0) {
+      if (applyRotation % 90 != 0) {
+        LOGGER.w("Rotation of %d % 90 != 0", applyRotation);
+      }
+
+      // Translate so center of image is at origin.
+      matrix.postTranslate(-srcWidth / 2.0f, -srcHeight / 2.0f);
+
+      // Rotate around origin.
+      matrix.postRotate(applyRotation);
+    }
+
+    // Account for the already applied rotation, if any, and then determine how
+    // much scaling is needed for each axis.
+    final boolean transpose = (Math.abs(applyRotation) + 90) % 180 == 0;
+
+    final int inWidth = transpose ? srcHeight : srcWidth;
+    final int inHeight = transpose ? srcWidth : srcHeight;
+
+    // Apply scaling if necessary.
+    if (inWidth != dstWidth || inHeight != dstHeight) {
+      final float scaleFactorX = dstWidth / (float) inWidth;
+      final float scaleFactorY = dstHeight / (float) inHeight;
+
+      if (maintainAspectRatio) {
+        // Scale by minimum factor so that dst is filled completely while
+        // maintaining the aspect ratio. Some image may fall off the edge.
+        final float scaleFactor = Math.max(scaleFactorX, scaleFactorY);
+        matrix.postScale(scaleFactor, scaleFactor);
+      } else {
+        // Scale exactly to fill dst from src.
+        matrix.postScale(scaleFactorX, scaleFactorY);
+      }
+    }
+
+    if (applyRotation != 0) {
+      // Translate back from origin centered reference to destination frame.
+      matrix.postTranslate(dstWidth / 2.0f, dstHeight / 2.0f);
+    }
+
+    return matrix;
+  }
+}
diff --git a/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/env/Logger.java b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/env/Logger.java
new file mode 100644
index 0000000000000000000000000000000000000000..0d984096a08cff5640a9dad3a33069fd9c77bbd0
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/env/Logger.java
@@ -0,0 +1,190 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+package org.tensorflow.demo.env;
+
+import android.util.Log;
+import java.util.HashSet;
+import java.util.Set;
+
+/**
+ * Wrapper for the platform log function, allows convenient message prefixing and log disabling.
+ */
+public final class Logger {
+  private static final String DEFAULT_TAG = "tensorflow";
+  private static final int DEFAULT_MIN_LOG_LEVEL = Log.DEBUG;
+
+  // Classes to be ignored when examining the stack trace
+  private static final Set<String> IGNORED_CLASS_NAMES;
+
+  static {
+    IGNORED_CLASS_NAMES = new HashSet<String>(3);
+    IGNORED_CLASS_NAMES.add("dalvik.system.VMStack");
+    IGNORED_CLASS_NAMES.add("java.lang.Thread");
+    IGNORED_CLASS_NAMES.add(Logger.class.getCanonicalName());
+  }
+
+  private final String tag;
+  private final String messagePrefix;
+  private int minLogLevel = DEFAULT_MIN_LOG_LEVEL;
+
+  /**
+   * Creates a Logger using the class name as the message prefix.
+   *
+   * @param clazz the simple name of this class is used as the message prefix.
+   */
+  public Logger(final Class<?> clazz) {
+    this(clazz.getSimpleName());
+  }
+
+  /**
+   * Creates a Logger using the specified message prefix.
+   *
+   * @param messagePrefix is prepended to the text of every message.
+   */
+  public Logger(final String messagePrefix) {
+    this(DEFAULT_TAG, messagePrefix);
+  }
+
+  /**
+   * Creates a Logger with a custom tag and a custom message prefix. If the message prefix
+   * is set to <pre>null</pre>, the caller's class name is used as the prefix.
+   *
+   * @param tag identifies the source of a log message.
+   * @param messagePrefix prepended to every message if non-null. If null, the name of the caller is
+   *                      being used
+   */
+  public Logger(final String tag, final String messagePrefix) {
+    this.tag = tag;
+    final String prefix = messagePrefix == null ? getCallerSimpleName() : messagePrefix;
+    this.messagePrefix = (prefix.length() > 0) ? prefix + ": " : prefix;
+  }
+
+  /**
+   * Creates a Logger using the caller's class name as the message prefix.
+   */
+  public Logger() {
+    this(DEFAULT_TAG, null);
+  }
+
+  /**
+   * Creates a Logger using the caller's class name as the message prefix.
+   */
+  public Logger(final int minLogLevel) {
+    this(DEFAULT_TAG, null);
+    this.minLogLevel = minLogLevel;
+  }
+
+  public void setMinLogLevel(final int minLogLevel) {
+    this.minLogLevel = minLogLevel;
+  }
+
+  public boolean isLoggable(final int logLevel) {
+    return logLevel >= minLogLevel || Log.isLoggable(tag, logLevel);
+  }
+
+  /**
+   * Return caller's simple name.
+   *
+   * Android getStackTrace() returns an array that looks like this:
+   *     stackTrace[0]: dalvik.system.VMStack
+   *     stackTrace[1]: java.lang.Thread
+   *     stackTrace[2]: com.google.android.apps.unveil.env.UnveilLogger
+   *     stackTrace[3]: com.google.android.apps.unveil.BaseApplication
+   *
+   * This function returns the simple version of the first non-filtered name.
+   *
+   * @return caller's simple name
+   */
+  private static String getCallerSimpleName() {
+    // Get the current callstack so we can pull the class of the caller off of it.
+    final StackTraceElement[] stackTrace = Thread.currentThread().getStackTrace();
+
+    for (final StackTraceElement elem : stackTrace) {
+      final String className = elem.getClassName();
+      if (!IGNORED_CLASS_NAMES.contains(className)) {
+        // We're only interested in the simple name of the class, not the complete package.
+        final String[] classParts = className.split("\\.");
+        return classParts[classParts.length - 1];
+      }
+    }
+
+    return Logger.class.getSimpleName();
+  }
+
+  private String toMessage(final String format, final Object... args) {
+    return messagePrefix + (args.length > 0 ? String.format(format, args) : format);
+  }
+
+  public void v(final String format, final Object... args) {
+    if (isLoggable(Log.VERBOSE)) {
+      Log.v(tag, toMessage(format, args));
+    }
+  }
+
+  public void v(final Throwable t, final String format, final Object... args) {
+    if (isLoggable(Log.VERBOSE)) {
+      Log.v(tag, toMessage(format, args), t);
+    }
+  }
+
+  public void d(final String format, final Object... args) {
+    if (isLoggable(Log.DEBUG)) {
+      Log.d(tag, toMessage(format, args));
+    }
+  }
+
+  public void d(final Throwable t, final String format, final Object... args) {
+    if (isLoggable(Log.DEBUG)) {
+      Log.d(tag, toMessage(format, args), t);
+    }
+  }
+
+  public void i(final String format, final Object... args) {
+    if (isLoggable(Log.INFO)) {
+      Log.i(tag, toMessage(format, args));
+    }
+  }
+
+  public void i(final Throwable t, final String format, final Object... args) {
+    if (isLoggable(Log.INFO)) {
+      Log.i(tag, toMessage(format, args), t);
+    }
+  }
+
+  public void w(final String format, final Object... args) {
+    if (isLoggable(Log.WARN)) {
+      Log.w(tag, toMessage(format, args));
+    }
+  }
+
+  public void w(final Throwable t, final String format, final Object... args) {
+    if (isLoggable(Log.WARN)) {
+      Log.w(tag, toMessage(format, args), t);
+    }
+  }
+
+  public void e(final String format, final Object... args) {
+    if (isLoggable(Log.ERROR)) {
+      Log.e(tag, toMessage(format, args));
+    }
+  }
+
+  public void e(final Throwable t, final String format, final Object... args) {
+    if (isLoggable(Log.ERROR)) {
+      Log.e(tag, toMessage(format, args), t);
+    }
+  }
+}
diff --git a/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/env/Size.java b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/env/Size.java
new file mode 100644
index 0000000000000000000000000000000000000000..ef15d14daa841bf185d1839393c68c211d1e04d7
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/env/Size.java
@@ -0,0 +1,143 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+package org.tensorflow.demo.env;
+
+import android.graphics.Bitmap;
+import android.text.TextUtils;
+import java.io.Serializable;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Size class independent of a Camera object.
+ */
+public class Size implements Comparable<Size>, Serializable {
+
+  // 1.4 went out with this UID so we'll need to maintain it to preserve pending queries when
+  // upgrading.
+  public static final long serialVersionUID = 7689808733290872361L;
+
+  public final int width;
+  public final int height;
+
+  public Size(final int width, final int height) {
+    this.width = width;
+    this.height = height;
+  }
+
+  public Size(final Bitmap bmp) {
+    this.width = bmp.getWidth();
+    this.height = bmp.getHeight();
+  }
+
+  /**
+   * Rotate a size by the given number of degrees.
+   * @param size Size to rotate.
+   * @param rotation Degrees {0, 90, 180, 270} to rotate the size.
+   * @return Rotated size.
+   */
+  public static Size getRotatedSize(final Size size, final int rotation) {
+    if (rotation % 180 != 0) {
+      // The phone is portrait, therefore the camera is sideways and frame should be rotated.
+      return new Size(size.height, size.width);
+    }
+    return size;
+  }
+
+  public static Size parseFromString(String sizeString) {
+    if (TextUtils.isEmpty(sizeString)) {
+      return null;
+    }
+
+    sizeString = sizeString.trim();
+
+    // The expected format is "<width>x<height>".
+    final String[] components = sizeString.split("x");
+    if (components.length == 2) {
+      try {
+        final int width = Integer.parseInt(components[0]);
+        final int height = Integer.parseInt(components[1]);
+        return new Size(width, height);
+      } catch (final NumberFormatException e) {
+        return null;
+      }
+    } else {
+      return null;
+    }
+  }
+
+  public static List<Size> sizeStringToList(final String sizes) {
+    final List<Size> sizeList = new ArrayList<Size>();
+    if (sizes != null) {
+      final String[] pairs = sizes.split(",");
+      for (final String pair : pairs) {
+        final Size size = Size.parseFromString(pair);
+        if (size != null) {
+          sizeList.add(size);
+        }
+      }
+    }
+    return sizeList;
+  }
+
+  public static String sizeListToString(final List<Size> sizes) {
+    String sizesString = "";
+    if (sizes != null && sizes.size() > 0) {
+      sizesString = sizes.get(0).toString();
+      for (int i = 1; i < sizes.size(); i++) {
+        sizesString += "," + sizes.get(i).toString();
+      }
+    }
+    return sizesString;
+  }
+
+  public final float aspectRatio() {
+    return (float) width / (float) height;
+  }
+
+  @Override
+  public int compareTo(final Size other) {
+    return width * height - other.width * other.height;
+  }
+
+  @Override
+  public boolean equals(final Object other) {
+    if (other == null) {
+      return false;
+    }
+
+    if (!(other instanceof Size)) {
+      return false;
+    }
+
+    final Size otherSize = (Size) other;
+    return (width == otherSize.width && height == otherSize.height);
+  }
+
+  @Override
+  public int hashCode() {
+    return width * 32713 + height;
+  }
+
+  @Override
+  public String toString() {
+    return dimensionsAsString(width, height);
+  }
+
+  public static final String dimensionsAsString(final int width, final int height) {
+    return width + "x" + height;
+  }
+}
diff --git a/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/env/SplitTimer.java b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/env/SplitTimer.java
new file mode 100644
index 0000000000000000000000000000000000000000..459b0a0d4dbae0a9929f1a57d0b1f48b5d96b7ef
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/env/SplitTimer.java
@@ -0,0 +1,50 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+package org.tensorflow.demo.env;
+
+import android.os.SystemClock;
+
+/**
+ * A simple utility timer for measuring CPU time and wall-clock splits.
+ */
+public class SplitTimer {
+  private final Logger logger;
+
+  private long lastWallTime;
+  private long lastCpuTime;
+
+  public SplitTimer(final String name) {
+    logger = new Logger(name);
+    newSplit();
+  }
+
+  public void newSplit() {
+    lastWallTime = SystemClock.uptimeMillis();
+    lastCpuTime = SystemClock.currentThreadTimeMillis();
+  }
+
+  public void endSplit(final String splitName) {
+    final long currWallTime = SystemClock.uptimeMillis();
+    final long currCpuTime = SystemClock.currentThreadTimeMillis();
+
+    logger.i(
+        "%s: cpu=%dms wall=%dms",
+        splitName, currCpuTime - lastCpuTime, currWallTime - lastWallTime);
+
+    lastWallTime = currWallTime;
+    lastCpuTime = currCpuTime;
+  }
+}
diff --git a/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/tracking/MultiBoxTracker.java b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/tracking/MultiBoxTracker.java
new file mode 100644
index 0000000000000000000000000000000000000000..af6af2bc8f508a70aa7e44a7236f0e7ea5e3d71c
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/tracking/MultiBoxTracker.java
@@ -0,0 +1,421 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+package org.tensorflow.demo.tracking;
+
+import android.content.Context;
+import android.graphics.Canvas;
+import android.graphics.Color;
+import android.graphics.Matrix;
+import android.graphics.Paint;
+import android.graphics.Paint.Cap;
+import android.graphics.Paint.Join;
+import android.graphics.Paint.Style;
+import android.graphics.RectF;
+import android.text.TextUtils;
+import android.util.Pair;
+import android.util.TypedValue;
+import android.widget.Toast;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Queue;
+import org.tensorflow.demo.Classifier.Recognition;
+import org.tensorflow.demo.env.BorderedText;
+import org.tensorflow.demo.env.ImageUtils;
+import org.tensorflow.demo.env.Logger;
+
+/**
+ * A tracker wrapping ObjectTracker that also handles non-max suppression and matching existing
+ * objects to new detections.
+ */
+public class MultiBoxTracker {
+  private final Logger logger = new Logger();
+
+  private static final float TEXT_SIZE_DIP = 18;
+
+  // Maximum percentage of a box that can be overlapped by another box at detection time. Otherwise
+  // the lower scored box (new or old) will be removed.
+  private static final float MAX_OVERLAP = 0.2f;
+
+  private static final float MIN_SIZE = 16.0f;
+
+  // Allow replacement of the tracked box with new results if
+  // correlation has dropped below this level.
+  private static final float MARGINAL_CORRELATION = 0.75f;
+
+  // Consider object to be lost if correlation falls below this threshold.
+  private static final float MIN_CORRELATION = 0.3f;
+
+  private static final int[] COLORS = {
+    Color.BLUE, Color.RED, Color.GREEN, Color.YELLOW, Color.CYAN, Color.MAGENTA, Color.WHITE,
+    Color.parseColor("#55FF55"), Color.parseColor("#FFA500"), Color.parseColor("#FF8888"),
+    Color.parseColor("#AAAAFF"), Color.parseColor("#FFFFAA"), Color.parseColor("#55AAAA"),
+    Color.parseColor("#AA33AA"), Color.parseColor("#0D0068")
+  };
+
+  private final Queue<Integer> availableColors = new LinkedList<Integer>();
+
+  public ObjectTracker objectTracker;
+
+  final List<Pair<Float, RectF>> screenRects = new LinkedList<Pair<Float, RectF>>();
+
+  private static class TrackedRecognition {
+    ObjectTracker.TrackedObject trackedObject;
+    RectF location;
+    float detectionConfidence;
+    int color;
+    String title;
+  }
+
+  private final List<TrackedRecognition> trackedObjects = new LinkedList<TrackedRecognition>();
+
+  private final Paint boxPaint = new Paint();
+
+  private final float textSizePx;
+  private final BorderedText borderedText;
+
+  private Matrix frameToCanvasMatrix;
+
+  private int frameWidth;
+  private int frameHeight;
+
+  private int sensorOrientation;
+  private Context context;
+
+  public MultiBoxTracker(final Context context) {
+    this.context = context;
+    for (final int color : COLORS) {
+      availableColors.add(color);
+    }
+
+    boxPaint.setColor(Color.RED);
+    boxPaint.setStyle(Style.STROKE);
+    boxPaint.setStrokeWidth(12.0f);
+    boxPaint.setStrokeCap(Cap.ROUND);
+    boxPaint.setStrokeJoin(Join.ROUND);
+    boxPaint.setStrokeMiter(100);
+
+    textSizePx =
+        TypedValue.applyDimension(
+            TypedValue.COMPLEX_UNIT_DIP, TEXT_SIZE_DIP, context.getResources().getDisplayMetrics());
+    borderedText = new BorderedText(textSizePx);
+  }
+
+  private Matrix getFrameToCanvasMatrix() {
+    return frameToCanvasMatrix;
+  }
+
+  public synchronized void drawDebug(final Canvas canvas) {
+    final Paint textPaint = new Paint();
+    textPaint.setColor(Color.WHITE);
+    textPaint.setTextSize(60.0f);
+
+    final Paint boxPaint = new Paint();
+    boxPaint.setColor(Color.RED);
+    boxPaint.setAlpha(200);
+    boxPaint.setStyle(Style.STROKE);
+
+    for (final Pair<Float, RectF> detection : screenRects) {
+      final RectF rect = detection.second;
+      canvas.drawRect(rect, boxPaint);
+      canvas.drawText("" + detection.first, rect.left, rect.top, textPaint);
+      borderedText.drawText(canvas, rect.centerX(), rect.centerY(), "" + detection.first);
+    }
+
+    if (objectTracker == null) {
+      return;
+    }
+
+    // Draw correlations.
+    for (final TrackedRecognition recognition : trackedObjects) {
+      final ObjectTracker.TrackedObject trackedObject = recognition.trackedObject;
+
+      final RectF trackedPos = trackedObject.getTrackedPositionInPreviewFrame();
+
+      if (getFrameToCanvasMatrix().mapRect(trackedPos)) {
+        final String labelString = String.format("%.2f", trackedObject.getCurrentCorrelation());
+        borderedText.drawText(canvas, trackedPos.right, trackedPos.bottom, labelString);
+      }
+    }
+
+    final Matrix matrix = getFrameToCanvasMatrix();
+    objectTracker.drawDebug(canvas, matrix);
+  }
+
+  public synchronized void trackResults(
+      final List<Recognition> results, final byte[] frame, final long timestamp) {
+    logger.i("Processing %d results from %d", results.size(), timestamp);
+    processResults(timestamp, results, frame);
+  }
+
+  public synchronized void draw(final Canvas canvas) {
+    final boolean rotated = sensorOrientation % 180 == 90;
+    final float multiplier =
+        Math.min(canvas.getHeight() / (float) (rotated ? frameWidth : frameHeight),
+                 canvas.getWidth() / (float) (rotated ? frameHeight : frameWidth));
+    frameToCanvasMatrix =
+        ImageUtils.getTransformationMatrix(
+            frameWidth,
+            frameHeight,
+            (int) (multiplier * (rotated ? frameHeight : frameWidth)),
+            (int) (multiplier * (rotated ? frameWidth : frameHeight)),
+            sensorOrientation,
+            false);
+    for (final TrackedRecognition recognition : trackedObjects) {
+      final RectF trackedPos =
+          (objectTracker != null)
+              ? recognition.trackedObject.getTrackedPositionInPreviewFrame()
+              : new RectF(recognition.location);
+
+      getFrameToCanvasMatrix().mapRect(trackedPos);
+      boxPaint.setColor(recognition.color);
+
+      final float cornerSize = Math.min(trackedPos.width(), trackedPos.height()) / 8.0f;
+      canvas.drawRoundRect(trackedPos, cornerSize, cornerSize, boxPaint);
+
+      final String labelString =
+          !TextUtils.isEmpty(recognition.title)
+              ? String.format("%s %.2f", recognition.title, recognition.detectionConfidence)
+              : String.format("%.2f", recognition.detectionConfidence);
+      borderedText.drawText(canvas, trackedPos.left + cornerSize, trackedPos.bottom, labelString);
+    }
+  }
+
+  private boolean initialized = false;
+
+  public synchronized void onFrame(
+      final int w,
+      final int h,
+      final int rowStride,
+      final int sensorOrientation,
+      final byte[] frame,
+      final long timestamp) {
+    if (objectTracker == null && !initialized) {
+      ObjectTracker.clearInstance();
+
+      logger.i("Initializing ObjectTracker: %dx%d", w, h);
+      objectTracker = ObjectTracker.getInstance(w, h, rowStride, true);
+      frameWidth = w;
+      frameHeight = h;
+      this.sensorOrientation = sensorOrientation;
+      initialized = true;
+
+      if (objectTracker == null) {
+        String message =
+            "Object tracking support not found. "
+                + "See tensorflow/examples/android/README.md for details.";
+        Toast.makeText(context, message, Toast.LENGTH_LONG).show();
+        logger.e(message);
+      }
+    }
+
+    if (objectTracker == null) {
+      return;
+    }
+
+    objectTracker.nextFrame(frame, null, timestamp, null, true);
+
+    // Clean up any objects not worth tracking any more.
+    final LinkedList<TrackedRecognition> copyList =
+        new LinkedList<TrackedRecognition>(trackedObjects);
+    for (final TrackedRecognition recognition : copyList) {
+      final ObjectTracker.TrackedObject trackedObject = recognition.trackedObject;
+      final float correlation = trackedObject.getCurrentCorrelation();
+      if (correlation < MIN_CORRELATION) {
+        logger.v("Removing tracked object %s because NCC is %.2f", trackedObject, correlation);
+        trackedObject.stopTracking();
+        trackedObjects.remove(recognition);
+
+        availableColors.add(recognition.color);
+      }
+    }
+  }
+
+  private void processResults(
+      final long timestamp, final List<Recognition> results, final byte[] originalFrame) {
+    final List<Pair<Float, Recognition>> rectsToTrack = new LinkedList<Pair<Float, Recognition>>();
+
+    screenRects.clear();
+    final Matrix rgbFrameToScreen = new Matrix(getFrameToCanvasMatrix());
+
+    for (final Recognition result : results) {
+      if (result.getLocation() == null) {
+        continue;
+      }
+      final RectF detectionFrameRect = new RectF(result.getLocation());
+
+      final RectF detectionScreenRect = new RectF();
+      rgbFrameToScreen.mapRect(detectionScreenRect, detectionFrameRect);
+
+      logger.v(
+          "Result! Frame: " + result.getLocation() + " mapped to screen:" + detectionScreenRect);
+
+      screenRects.add(new Pair<Float, RectF>(result.getConfidence(), detectionScreenRect));
+
+      if (detectionFrameRect.width() < MIN_SIZE || detectionFrameRect.height() < MIN_SIZE) {
+        logger.w("Degenerate rectangle! " + detectionFrameRect);
+        continue;
+      }
+
+      rectsToTrack.add(new Pair<Float, Recognition>(result.getConfidence(), result));
+    }
+
+    if (rectsToTrack.isEmpty()) {
+      logger.v("Nothing to track, aborting.");
+      return;
+    }
+
+    if (objectTracker == null) {
+      trackedObjects.clear();
+      for (final Pair<Float, Recognition> potential : rectsToTrack) {
+        final TrackedRecognition trackedRecognition = new TrackedRecognition();
+        trackedRecognition.detectionConfidence = potential.first;
+        trackedRecognition.location = new RectF(potential.second.getLocation());
+        trackedRecognition.trackedObject = null;
+        trackedRecognition.title = potential.second.getTitle();
+        trackedRecognition.color = COLORS[trackedObjects.size()];
+        trackedObjects.add(trackedRecognition);
+
+        if (trackedObjects.size() >= COLORS.length) {
+          break;
+        }
+      }
+      return;
+    }
+
+    logger.i("%d rects to track", rectsToTrack.size());
+    for (final Pair<Float, Recognition> potential : rectsToTrack) {
+      handleDetection(originalFrame, timestamp, potential);
+    }
+  }
+
+  private void handleDetection(
+      final byte[] frameCopy, final long timestamp, final Pair<Float, Recognition> potential) {
+    final ObjectTracker.TrackedObject potentialObject =
+        objectTracker.trackObject(potential.second.getLocation(), timestamp, frameCopy);
+
+    final float potentialCorrelation = potentialObject.getCurrentCorrelation();
+    logger.v(
+        "Tracked object went from %s to %s with correlation %.2f",
+        potential.second, potentialObject.getTrackedPositionInPreviewFrame(), potentialCorrelation);
+
+    if (potentialCorrelation < MARGINAL_CORRELATION) {
+      logger.v("Correlation too low to begin tracking %s.", potentialObject);
+      potentialObject.stopTracking();
+      return;
+    }
+
+    final List<TrackedRecognition> removeList = new LinkedList<TrackedRecognition>();
+
+    float maxIntersect = 0.0f;
+
+    // This is the current tracked object whose color we will take. If left null we'll take the
+    // first one from the color queue.
+    TrackedRecognition recogToReplace = null;
+
+    // Look for intersections that will be overridden by this object or an intersection that would
+    // prevent this one from being placed.
+    for (final TrackedRecognition trackedRecognition : trackedObjects) {
+      final RectF a = trackedRecognition.trackedObject.getTrackedPositionInPreviewFrame();
+      final RectF b = potentialObject.getTrackedPositionInPreviewFrame();
+      final RectF intersection = new RectF();
+      final boolean intersects = intersection.setIntersect(a, b);
+
+      final float intersectArea = intersection.width() * intersection.height();
+      final float totalArea = a.width() * a.height() + b.width() * b.height() - intersectArea;
+      final float intersectOverUnion = intersectArea / totalArea;
+
+      // If there is an intersection with this currently tracked box above the maximum overlap
+      // percentage allowed, either the new recognition needs to be dismissed or the old
+      // recognition needs to be removed and possibly replaced with the new one.
+      if (intersects && intersectOverUnion > MAX_OVERLAP) {
+        if (potential.first < trackedRecognition.detectionConfidence
+            && trackedRecognition.trackedObject.getCurrentCorrelation() > MARGINAL_CORRELATION) {
+          // If track for the existing object is still going strong and the detection score was
+          // good, reject this new object.
+          potentialObject.stopTracking();
+          return;
+        } else {
+          removeList.add(trackedRecognition);
+
+          // Let the previously tracked object with max intersection amount donate its color to
+          // the new object.
+          if (intersectOverUnion > maxIntersect) {
+            maxIntersect = intersectOverUnion;
+            recogToReplace = trackedRecognition;
+          }
+        }
+      }
+    }
+
+    // If we're already tracking the max object and no intersections were found to bump off,
+    // pick the worst current tracked object to remove, if it's also worse than this candidate
+    // object.
+    if (availableColors.isEmpty() && removeList.isEmpty()) {
+      for (final TrackedRecognition candidate : trackedObjects) {
+        if (candidate.detectionConfidence < potential.first) {
+          if (recogToReplace == null
+              || candidate.detectionConfidence < recogToReplace.detectionConfidence) {
+            // Save it so that we use this color for the new object.
+            recogToReplace = candidate;
+          }
+        }
+      }
+      if (recogToReplace != null) {
+        logger.v("Found non-intersecting object to remove.");
+        removeList.add(recogToReplace);
+      } else {
+        logger.v("No non-intersecting object found to remove");
+      }
+    }
+
+    // Remove everything that got intersected.
+    for (final TrackedRecognition trackedRecognition : removeList) {
+      logger.v(
+          "Removing tracked object %s with detection confidence %.2f, correlation %.2f",
+          trackedRecognition.trackedObject,
+          trackedRecognition.detectionConfidence,
+          trackedRecognition.trackedObject.getCurrentCorrelation());
+      trackedRecognition.trackedObject.stopTracking();
+      trackedObjects.remove(trackedRecognition);
+      if (trackedRecognition != recogToReplace) {
+        availableColors.add(trackedRecognition.color);
+      }
+    }
+
+    if (recogToReplace == null && availableColors.isEmpty()) {
+      logger.e("No room to track this object, aborting.");
+      potentialObject.stopTracking();
+      return;
+    }
+
+    // Finally safe to say we can track this object.
+    logger.v(
+        "Tracking object %s (%s) with detection confidence %.2f at position %s",
+        potentialObject,
+        potential.second.getTitle(),
+        potential.first,
+        potential.second.getLocation());
+    final TrackedRecognition trackedRecognition = new TrackedRecognition();
+    trackedRecognition.detectionConfidence = potential.first;
+    trackedRecognition.trackedObject = potentialObject;
+    trackedRecognition.title = potential.second.getTitle();
+
+    // Use the color from a replaced object before taking one from the color queue.
+    trackedRecognition.color =
+        recogToReplace != null ? recogToReplace.color : availableColors.poll();
+    trackedObjects.add(trackedRecognition);
+  }
+}
diff --git a/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/tracking/ObjectTracker.java b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/tracking/ObjectTracker.java
new file mode 100644
index 0000000000000000000000000000000000000000..8b4248d8fbcfa2d58621fb429edbc9498956d273
--- /dev/null
+++ b/tensorflow/contrib/lite/examples/android/src/org/tensorflow/demo/tracking/ObjectTracker.java
@@ -0,0 +1,661 @@
+/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+package org.tensorflow.demo.tracking;
+
+import android.graphics.Canvas;
+import android.graphics.Color;
+import android.graphics.Matrix;
+import android.graphics.Paint;
+import android.graphics.PointF;
+import android.graphics.RectF;
+import android.graphics.Typeface;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+import java.util.Vector;
+import javax.microedition.khronos.opengles.GL10;
+import org.tensorflow.demo.env.Logger;
+import org.tensorflow.demo.env.Size;
+
+/**
+ * True object detector/tracker class that tracks objects across consecutive preview frames.
+ * It provides a simplified Java interface to the analogous native object defined by
+ * jni/client_vision/tracking/object_tracker.*.
+ *
+ * Currently, the ObjectTracker is a singleton due to native code restrictions, and so must
+ * be allocated by ObjectTracker.getInstance(). In addition, release() should be called
+ * as soon as the ObjectTracker is no longer needed, and before a new one is created.
+ *
+ * nextFrame() should be called as new frames become available, preferably as often as possible.
+ *
+ * After allocation, new TrackedObjects may be instantiated via trackObject(). TrackedObjects
+ * are associated with the ObjectTracker that created them, and are only valid while that
+ * ObjectTracker still exists.
+ */
+public class ObjectTracker {
+  private static final Logger LOGGER = new Logger();
+
+  private static boolean libraryFound = false;
+
+  static {
+    try {
+      System.loadLibrary("tensorflow_demo");
+      libraryFound = true;
+    } catch (UnsatisfiedLinkError e) {
+      LOGGER.e("libtensorflow_demo.so not found, tracking unavailable");
+    }
+  }
+
+  private static final boolean DRAW_TEXT = false;
+
+  /**
+   * How many history points to keep track of and draw in the red history line.
+   */
+  private static final int MAX_DEBUG_HISTORY_SIZE = 30;
+
+  /**
+   * How many frames of optical flow deltas to record.
+   * TODO(andrewharp): Push this down to the native level so it can be polled
+   * efficiently into a an array for upload, instead of keeping a duplicate
+   * copy in Java.
+   */
+  private static final int MAX_FRAME_HISTORY_SIZE = 200;
+
+  private static final int DOWNSAMPLE_FACTOR = 2;
+
+  private final byte[] downsampledFrame;
+
+  protected static ObjectTracker instance;
+
+  private final Map<String, TrackedObject> trackedObjects;
+
+  private long lastTimestamp;
+
+  private FrameChange lastKeypoints;
+
+  private final Vector<PointF> debugHistory;
+
+  private final LinkedList<TimestampedDeltas> timestampedDeltas;
+
+  protected final int frameWidth;
+  protected final int frameHeight;
+  private final int rowStride;
+  protected final boolean alwaysTrack;
+
+  private static class TimestampedDeltas {
+    final long timestamp;
+    final byte[] deltas;
+
+    public TimestampedDeltas(final long timestamp, final byte[] deltas) {
+      this.timestamp = timestamp;
+      this.deltas = deltas;
+    }
+  }
+
+  /**
+   * A simple class that records keypoint information, which includes
+   * local location, score and type. This will be used in calculating
+   * FrameChange.
+   */
+  public static class Keypoint {
+    public final float x;
+    public final float y;
+    public final float score;
+    public final int type;
+
+    public Keypoint(final float x, final float y) {
+      this.x = x;
+      this.y = y;
+      this.score = 0;
+      this.type = -1;
+    }
+
+    public Keypoint(final float x, final float y, final float score, final int type) {
+      this.x = x;
+      this.y = y;
+      this.score = score;
+      this.type = type;
+    }
+
+    Keypoint delta(final Keypoint other) {
+      return new Keypoint(this.x - other.x, this.y - other.y);
+    }
+  }
+
+  /**
+   * A simple class that could calculate Keypoint delta.
+   * This class will be used in calculating frame translation delta
+   * for optical flow.
+   */
+  public static class PointChange {
+    public final Keypoint keypointA;
+    public final Keypoint keypointB;
+    Keypoint pointDelta;
+    private final boolean wasFound;
+
+    public PointChange(final float x1, final float y1,
+                       final float x2, final float y2,
+                       final float score, final int type,
+                       final boolean wasFound) {
+      this.wasFound = wasFound;
+
+      keypointA = new Keypoint(x1, y1, score, type);
+      keypointB = new Keypoint(x2, y2);
+    }
+
+    public Keypoint getDelta() {
+      if (pointDelta == null) {
+        pointDelta = keypointB.delta(keypointA);
+      }
+      return pointDelta;
+    }
+  }
+
+  /** A class that records a timestamped frame translation delta for optical flow. */
+  public static class FrameChange {
+    public static final int KEYPOINT_STEP = 7;
+
+    public final Vector<PointChange> pointDeltas;
+
+    private final float minScore;
+    private final float maxScore;
+
+    public FrameChange(final float[] framePoints) {
+      float minScore = 100.0f;
+      float maxScore = -100.0f;
+
+      pointDeltas = new Vector<PointChange>(framePoints.length / KEYPOINT_STEP);
+
+      for (int i = 0; i < framePoints.length; i += KEYPOINT_STEP) {
+        final float x1 = framePoints[i + 0] * DOWNSAMPLE_FACTOR;
+        final float y1 = framePoints[i + 1] * DOWNSAMPLE_FACTOR;
+
+        final boolean wasFound = framePoints[i + 2] > 0.0f;
+
+        final float x2 = framePoints[i + 3] * DOWNSAMPLE_FACTOR;
+        final float y2 = framePoints[i + 4] * DOWNSAMPLE_FACTOR;
+        final float score = framePoints[i + 5];
+        final int type = (int) framePoints[i + 6];
+
+        minScore = Math.min(minScore, score);
+        maxScore = Math.max(maxScore, score);
+
+        pointDeltas.add(new PointChange(x1, y1, x2, y2, score, type, wasFound));
+      }
+
+      this.minScore = minScore;
+      this.maxScore = maxScore;
+    }
+  }
+
+  public static synchronized ObjectTracker getInstance(
+      final int frameWidth, final int frameHeight, final int rowStride, final boolean alwaysTrack) {
+    if (!libraryFound) {
+      LOGGER.e(
+          "Native object tracking support not found. "
+              + "See tensorflow/examples/android/README.md for details.");
+      return null;
+    }
+
+    if (instance == null) {
+      instance = new ObjectTracker(frameWidth, frameHeight, rowStride, alwaysTrack);
+      instance.init();
+    } else {
+      throw new RuntimeException(
+          "Tried to create a new objectracker before releasing the old one!");
+    }
+    return instance;
+  }
+
+  public static synchronized void clearInstance() {
+    if (instance != null) {
+      instance.release();
+    }
+  }
+
+  protected ObjectTracker(
+      final int frameWidth, final int frameHeight, final int rowStride, final boolean alwaysTrack) {
+    this.frameWidth = frameWidth;
+    this.frameHeight = frameHeight;
+    this.rowStride = rowStride;
+    this.alwaysTrack = alwaysTrack;
+    this.timestampedDeltas = new LinkedList<TimestampedDeltas>();
+
+    trackedObjects = new HashMap<String, TrackedObject>();
+
+    debugHistory = new Vector<PointF>(MAX_DEBUG_HISTORY_SIZE);
+
+    downsampledFrame =
+        new byte
+            [(frameWidth + DOWNSAMPLE_FACTOR - 1)
+                / DOWNSAMPLE_FACTOR
+                * (frameWidth + DOWNSAMPLE_FACTOR - 1)
+                / DOWNSAMPLE_FACTOR];
+  }
+
+  protected void init() {
+    // The native tracker never sees the full frame, so pre-scale dimensions
+    // by the downsample factor.
+    initNative(frameWidth / DOWNSAMPLE_FACTOR, frameHeight / DOWNSAMPLE_FACTOR, alwaysTrack);
+  }
+
+  private final float[] matrixValues = new float[9];
+
+  private long downsampledTimestamp;
+
+  @SuppressWarnings("unused")
+  public synchronized void drawOverlay(final GL10 gl,
+      final Size cameraViewSize, final Matrix matrix) {
+    final Matrix tempMatrix = new Matrix(matrix);
+    tempMatrix.preScale(DOWNSAMPLE_FACTOR, DOWNSAMPLE_FACTOR);
+    tempMatrix.getValues(matrixValues);
+    drawNative(cameraViewSize.width, cameraViewSize.height, matrixValues);
+  }
+
+  public synchronized void nextFrame(
+      final byte[] frameData, final byte[] uvData,
+      final long timestamp, final float[] transformationMatrix,
+      final boolean updateDebugInfo) {
+    if (downsampledTimestamp != timestamp) {
+      ObjectTracker.downsampleImageNative(
+          frameWidth, frameHeight, rowStride, frameData, DOWNSAMPLE_FACTOR, downsampledFrame);
+      downsampledTimestamp = timestamp;
+    }
+
+    // Do Lucas Kanade using the fullframe initializer.
+    nextFrameNative(downsampledFrame, uvData, timestamp, transformationMatrix);
+
+    timestampedDeltas.add(new TimestampedDeltas(timestamp, getKeypointsPacked(DOWNSAMPLE_FACTOR)));
+    while (timestampedDeltas.size() > MAX_FRAME_HISTORY_SIZE) {
+      timestampedDeltas.removeFirst();
+    }
+
+    for (final TrackedObject trackedObject : trackedObjects.values()) {
+      trackedObject.updateTrackedPosition();
+    }
+
+    if (updateDebugInfo) {
+      updateDebugHistory();
+    }
+
+    lastTimestamp = timestamp;
+  }
+
+  public synchronized void release() {
+    releaseMemoryNative();
+    synchronized (ObjectTracker.class) {
+      instance = null;
+    }
+  }
+
+  private void drawHistoryDebug(final Canvas canvas) {
+    drawHistoryPoint(
+        canvas, frameWidth * DOWNSAMPLE_FACTOR / 2, frameHeight * DOWNSAMPLE_FACTOR / 2);
+  }
+
+  private void drawHistoryPoint(final Canvas canvas, final float startX, final float startY) {
+    final Paint p = new Paint();
+    p.setAntiAlias(false);
+    p.setTypeface(Typeface.SERIF);
+
+    p.setColor(Color.RED);
+    p.setStrokeWidth(2.0f);
+
+    // Draw the center circle.
+    p.setColor(Color.GREEN);
+    canvas.drawCircle(startX, startY, 3.0f, p);
+
+    p.setColor(Color.RED);
+
+    // Iterate through in backwards order.
+    synchronized (debugHistory) {
+      final int numPoints = debugHistory.size();
+      float lastX = startX;
+      float lastY = startY;
+      for (int keypointNum = 0; keypointNum < numPoints; ++keypointNum) {
+        final PointF delta = debugHistory.get(numPoints - keypointNum - 1);
+        final float newX = lastX + delta.x;
+        final float newY = lastY + delta.y;
+        canvas.drawLine(lastX, lastY, newX, newY, p);
+        lastX = newX;
+        lastY = newY;
+      }
+    }
+  }
+
+  private static int floatToChar(final float value) {
+    return Math.max(0, Math.min((int) (value * 255.999f), 255));
+  }
+
+  private void drawKeypointsDebug(final Canvas canvas) {
+    final Paint p = new Paint();
+    if (lastKeypoints == null) {
+      return;
+    }
+    final int keypointSize = 3;
+
+    final float minScore = lastKeypoints.minScore;
+    final float maxScore = lastKeypoints.maxScore;
+
+    for (final PointChange keypoint : lastKeypoints.pointDeltas) {
+      if (keypoint.wasFound) {
+        final int r =
+            floatToChar((keypoint.keypointA.score - minScore) / (maxScore - minScore));
+        final int b =
+            floatToChar(1.0f - (keypoint.keypointA.score - minScore) / (maxScore - minScore));
+
+        final int color = 0xFF000000 | (r << 16) | b;
+        p.setColor(color);
+
+        final float[] screenPoints = {keypoint.keypointA.x, keypoint.keypointA.y,
+                                      keypoint.keypointB.x, keypoint.keypointB.y};
+        canvas.drawRect(screenPoints[2] - keypointSize,
+                        screenPoints[3] - keypointSize,
+                        screenPoints[2] + keypointSize,
+                        screenPoints[3] + keypointSize, p);
+        p.setColor(Color.CYAN);
+        canvas.drawLine(screenPoints[2], screenPoints[3],
+                        screenPoints[0], screenPoints[1], p);
+
+        if (DRAW_TEXT) {
+          p.setColor(Color.WHITE);
+          canvas.drawText(keypoint.keypointA.type + ": " + keypoint.keypointA.score,
+              keypoint.keypointA.x, keypoint.keypointA.y, p);
+        }
+      } else {
+        p.setColor(Color.YELLOW);
+        final float[] screenPoint = {keypoint.keypointA.x, keypoint.keypointA.y};
+        canvas.drawCircle(screenPoint[0], screenPoint[1], 5.0f, p);
+      }
+    }
+  }
+
+  private synchronized PointF getAccumulatedDelta(final long timestamp, final float positionX,
+      final float positionY, final float radius) {
+    final RectF currPosition = getCurrentPosition(timestamp,
+        new RectF(positionX - radius, positionY - radius, positionX + radius, positionY + radius));
+    return new PointF(currPosition.centerX() - positionX, currPosition.centerY() - positionY);
+  }
+
+  private synchronized RectF getCurrentPosition(final long timestamp, final RectF
+      oldPosition) {
+    final RectF downscaledFrameRect = downscaleRect(oldPosition);
+
+    final float[] delta = new float[4];
+    getCurrentPositionNative(timestamp, downscaledFrameRect.left, downscaledFrameRect.top,
+        downscaledFrameRect.right, downscaledFrameRect.bottom, delta);
+
+    final RectF newPosition = new RectF(delta[0], delta[1], delta[2], delta[3]);
+
+    return upscaleRect(newPosition);
+  }
+
+  private void updateDebugHistory() {
+    lastKeypoints = new FrameChange(getKeypointsNative(false));
+
+    if (lastTimestamp == 0) {
+      return;
+    }
+
+    final PointF delta =
+        getAccumulatedDelta(
+            lastTimestamp, frameWidth / DOWNSAMPLE_FACTOR, frameHeight / DOWNSAMPLE_FACTOR, 100);
+
+    synchronized (debugHistory) {
+      debugHistory.add(delta);
+
+      while (debugHistory.size() > MAX_DEBUG_HISTORY_SIZE) {
+        debugHistory.remove(0);
+      }
+    }
+  }
+
+  public synchronized void drawDebug(final Canvas canvas, final Matrix frameToCanvas) {
+    canvas.save();
+    canvas.setMatrix(frameToCanvas);
+
+    drawHistoryDebug(canvas);
+    drawKeypointsDebug(canvas);
+
+    canvas.restore();
+  }
+
+  public Vector<String> getDebugText() {
+    final Vector<String> lines = new Vector<String>();
+
+    if (lastKeypoints != null) {
+      lines.add("Num keypoints " + lastKeypoints.pointDeltas.size());
+      lines.add("Min score: " + lastKeypoints.minScore);
+      lines.add("Max score: " + lastKeypoints.maxScore);
+    }
+
+    return lines;
+  }
+
+  public synchronized List<byte[]> pollAccumulatedFlowData(final long endFrameTime) {
+    final List<byte[]> frameDeltas = new ArrayList<byte[]>();
+    while (timestampedDeltas.size() > 0) {
+      final TimestampedDeltas currentDeltas = timestampedDeltas.peek();
+      if (currentDeltas.timestamp <= endFrameTime) {
+        frameDeltas.add(currentDeltas.deltas);
+        timestampedDeltas.removeFirst();
+      } else {
+        break;
+      }
+    }
+
+    return frameDeltas;
+  }
+
+  private RectF downscaleRect(final RectF fullFrameRect) {
+    return new RectF(
+        fullFrameRect.left / DOWNSAMPLE_FACTOR,
+        fullFrameRect.top / DOWNSAMPLE_FACTOR,
+        fullFrameRect.right / DOWNSAMPLE_FACTOR,
+        fullFrameRect.bottom / DOWNSAMPLE_FACTOR);
+  }
+
+  private RectF upscaleRect(final RectF downsampledFrameRect) {
+    return new RectF(
+        downsampledFrameRect.left * DOWNSAMPLE_FACTOR,
+        downsampledFrameRect.top * DOWNSAMPLE_FACTOR,
+        downsampledFrameRect.right * DOWNSAMPLE_FACTOR,
+        downsampledFrameRect.bottom * DOWNSAMPLE_FACTOR);
+  }
+
+  /**
+   * A TrackedObject represents a native TrackedObject, and provides access to the
+   * relevant native tracking information available after every frame update. They may
+   * be safely passed around and accessed externally, but will become invalid after
+   * stopTracking() is called or the related creating ObjectTracker is deactivated.
+   *
+   * @author andrewharp@google.com (Andrew Harp)
+   */
+  public class TrackedObject {
+    private final String id;
+
+    private long lastExternalPositionTime;
+
+    private RectF lastTrackedPosition;
+    private boolean visibleInLastFrame;
+
+    private boolean isDead;
+
+    TrackedObject(final RectF position, final long timestamp, final byte[] data) {
+      isDead = false;
+
+      id = Integer.toString(this.hashCode());
+
+      lastExternalPositionTime = timestamp;
+
+      synchronized (ObjectTracker.this) {
+        registerInitialAppearance(position, data);
+        setPreviousPosition(position, timestamp);
+        trackedObjects.put(id, this);
+      }
+    }
+
+    public void stopTracking() {
+      checkValidObject();
+
+      synchronized (ObjectTracker.this) {
+        isDead = true;
+        forgetNative(id);
+        trackedObjects.remove(id);
+      }
+    }
+
+    public float getCurrentCorrelation() {
+      checkValidObject();
+      return ObjectTracker.this.getCurrentCorrelation(id);
+    }
+
+    void registerInitialAppearance(final RectF position, final byte[] data) {
+      final RectF externalPosition = downscaleRect(position);
+      registerNewObjectWithAppearanceNative(id,
+            externalPosition.left, externalPosition.top,
+            externalPosition.right, externalPosition.bottom,
+            data);
+    }
+
+    synchronized void setPreviousPosition(final RectF position, final long timestamp) {
+      checkValidObject();
+      synchronized (ObjectTracker.this) {
+        if (lastExternalPositionTime > timestamp) {
+          LOGGER.w("Tried to use older position time!");
+          return;
+        }
+        final RectF externalPosition = downscaleRect(position);
+        lastExternalPositionTime = timestamp;
+
+        setPreviousPositionNative(id,
+            externalPosition.left, externalPosition.top,
+            externalPosition.right, externalPosition.bottom,
+            lastExternalPositionTime);
+
+        updateTrackedPosition();
+      }
+    }
+
+    void setCurrentPosition(final RectF position) {
+      checkValidObject();
+      final RectF downsampledPosition = downscaleRect(position);
+      synchronized (ObjectTracker.this) {
+        setCurrentPositionNative(id,
+            downsampledPosition.left, downsampledPosition.top,
+            downsampledPosition.right, downsampledPosition.bottom);
+      }
+    }
+
+    private synchronized void updateTrackedPosition() {
+      checkValidObject();
+
+      final float[] delta = new float[4];
+      getTrackedPositionNative(id, delta);
+      lastTrackedPosition = new RectF(delta[0], delta[1], delta[2], delta[3]);
+
+      visibleInLastFrame = isObjectVisible(id);
+    }
+
+    public synchronized RectF getTrackedPositionInPreviewFrame() {
+      checkValidObject();
+
+      if (lastTrackedPosition == null) {
+        return null;
+      }
+      return upscaleRect(lastTrackedPosition);
+    }
+
+    synchronized long getLastExternalPositionTime() {
+      return lastExternalPositionTime;
+    }
+
+    public synchronized boolean visibleInLastPreviewFrame() {
+      return visibleInLastFrame;
+    }
+
+    private void checkValidObject() {
+      if (isDead) {
+        throw new RuntimeException("TrackedObject already removed from tracking!");
+      } else if (ObjectTracker.this != instance) {
+        throw new RuntimeException("TrackedObject created with another ObjectTracker!");
+      }
+    }
+  }
+
+  public synchronized TrackedObject trackObject(
+      final RectF position, final long timestamp, final byte[] frameData) {
+    if (downsampledTimestamp != timestamp) {
+      ObjectTracker.downsampleImageNative(
+          frameWidth, frameHeight, rowStride, frameData, DOWNSAMPLE_FACTOR, downsampledFrame);
+      downsampledTimestamp = timestamp;
+    }
+    return new TrackedObject(position, timestamp, downsampledFrame);
+  }
+
+  public synchronized TrackedObject trackObject(final RectF position, final byte[] frameData) {
+    return new TrackedObject(position, lastTimestamp, frameData);
+  }
+
+  /** ********************* NATIVE CODE ************************************ */
+
+  /** This will contain an opaque pointer to the native ObjectTracker */
+  private long nativeObjectTracker;
+
+  private native void initNative(int imageWidth, int imageHeight, boolean alwaysTrack);
+
+  protected native void registerNewObjectWithAppearanceNative(
+      String objectId, float x1, float y1, float x2, float y2, byte[] data);
+
+  protected native void setPreviousPositionNative(
+      String objectId, float x1, float y1, float x2, float y2, long timestamp);
+
+  protected native void setCurrentPositionNative(
+      String objectId, float x1, float y1, float x2, float y2);
+
+  protected native void forgetNative(String key);
+
+  protected native String getModelIdNative(String key);
+
+  protected native boolean haveObject(String key);
+  protected native boolean isObjectVisible(String key);
+  protected native float getCurrentCorrelation(String key);
+
+  protected native float getMatchScore(String key);
+
+  protected native void getTrackedPositionNative(String key, float[] points);
+
+  protected native void nextFrameNative(
+      byte[] frameData, byte[] uvData, long timestamp, float[] frameAlignMatrix);
+
+  protected native void releaseMemoryNative();
+
+  protected native void getCurrentPositionNative(long timestamp,
+      final float positionX1, final float positionY1,
+      final float positionX2, final float positionY2,
+      final float[] delta);
+
+  protected native byte[] getKeypointsPacked(float scaleFactor);
+
+  protected native float[] getKeypointsNative(boolean onlyReturnCorrespondingKeypoints);
+
+  protected native void drawNative(int viewWidth, int viewHeight, float[] frameToCanvas);
+
+  protected static native void downsampleImageNative(
+      int width, int height, int rowStride, byte[] input, int factor, byte[] output);
+}
diff --git a/tensorflow/contrib/lite/examples/label_image/BUILD b/tensorflow/contrib/lite/examples/label_image/BUILD
index 959347b5491514ddc13af57ea6f7385a0d39e418..9322e186a280e932a2441ab16ac8579d9ab67ee2 100644
--- a/tensorflow/contrib/lite/examples/label_image/BUILD
+++ b/tensorflow/contrib/lite/examples/label_image/BUILD
@@ -69,15 +69,3 @@ cc_library(
 #         "//testing/base/public:gunit",
 #     ],
 # )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/lite/g3doc/TFLite-Architecture.jpg b/tensorflow/contrib/lite/g3doc/TFLite-Architecture.jpg
deleted file mode 100644
index bc83946647c6a923a8a0bd3a041b42e4febe6a31..0000000000000000000000000000000000000000
Binary files a/tensorflow/contrib/lite/g3doc/TFLite-Architecture.jpg and /dev/null differ
diff --git a/tensorflow/contrib/lite/g3doc/models.md b/tensorflow/contrib/lite/g3doc/models.md
index 48f43d4fc460a3a5307c5ee1f5e096a409a46af5..d8134d5a00097b3eef24d5583d7f114c34e3bef2 100644
--- a/tensorflow/contrib/lite/g3doc/models.md
+++ b/tensorflow/contrib/lite/g3doc/models.md
@@ -1,7 +1,13 @@
 # List of Hosted Models
 
-*   [Inception V3 2015](https://storage.googleapis.com/download.tensorflow.org/models/tflite/inception_v3_2015_2017_11_10.zip)
-*   [Inception V3 Slim 2016](https://storage.googleapis.com/download.tensorflow.org/models/tflite/inception_v3_slim_2016_android_2017_11_10.zip)
+*   [NASNet large](https://storage.googleapis.com/download.tensorflow.org/models/tflite/nasnet_large_2018_03_27.zip)
+*   [NASNet mobile](https://storage.googleapis.com/download.tensorflow.org/models/tflite/nasnet_mobile_2018_03_27.zip)
+*   [ResNet v2 101](https://storage.googleapis.com/download.tensorflow.org/models/tflite/resnet_v2_101_2018_03_27.zip)
+*   [ResNet v2 50](https://storage.googleapis.com/download.tensorflow.org/models/tflite/resnet_v2_50_2018_03_27.zip)
+*   [Inception ResNet v2](https://storage.googleapis.com/download.tensorflow.org/models/tflite/inception_resnet_v2_2018_03_27.zip)
+*   [Inception v4](https://storage.googleapis.com/download.tensorflow.org/models/tflite/inception_v4_2018_03_27.zip)
+*   [Inception v3 2015](https://storage.googleapis.com/download.tensorflow.org/models/tflite/inception_v3_2015_2017_11_10.zip)
+*   [Inception v3 Slim 2016](https://storage.googleapis.com/download.tensorflow.org/models/tflite/inception_v3_slim_2016_android_2017_11_10.zip)
 *   [Mobilenet 0.25 128 Float](https://storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_v1_0.25_128_float_2017_11_08.zip)
 *   [Mobilenet 0.25 160 Float](https://storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_v1_0.25_160_float_2017_11_08.zip)
 *   [Mobilenet 0.25 192 Float](https://storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_v1_0.25_192_float_2017_11_08.zip)
diff --git a/tensorflow/contrib/lite/interpreter.h b/tensorflow/contrib/lite/interpreter.h
index 77db17878318276c6cf5067274a3af3be262c8e1..a6d582a813e0dcaacb2f196cc71dc111638171b2 100644
--- a/tensorflow/contrib/lite/interpreter.h
+++ b/tensorflow/contrib/lite/interpreter.h
@@ -208,7 +208,7 @@ class Interpreter {
   // TODO(aselle): Create a safe ArrayHandle interface to avoid exposing this
   // read/write access to structure
   const std::pair<TfLiteNode, TfLiteRegistration>* node_and_registration(
-      int node_index) {
+      int node_index) const {
     if (node_index >= nodes_and_registration_.size() || node_index < 0)
       return nullptr;
     return &nodes_and_registration_[node_index];
diff --git a/tensorflow/contrib/lite/java/BUILD b/tensorflow/contrib/lite/java/BUILD
index f52d6ba6c5390e631d29e75f833aa4dd5bba1a68..1dda55b8edf8f85293c473b51b8a19066bac5f73 100644
--- a/tensorflow/contrib/lite/java/BUILD
+++ b/tensorflow/contrib/lite/java/BUILD
@@ -46,6 +46,23 @@ android_library(
     ],
 )
 
+java_library(
+    name = "ovicbenchmarkerlib",
+    srcs = [
+        "ovic/src/main/java/org/tensorflow/ovic/OvicClassifier.java",
+        "ovic/src/main/java/org/tensorflow/ovic/OvicSingleImageResult.java",
+    ],
+    javacopts = JAVACOPTS,
+    visibility = ["//visibility:public"],
+    deps = [
+        ":libtensorflowlite_jni.so",
+        ":tensorflowlite_java",
+        "//tensorflow/contrib/lite/java/src/main/native",
+        "//tensorflow/contrib/lite/java/src/testhelper/java/org/tensorflow/lite:testhelper",
+        "@org_checkerframework_qual",
+    ],
+)
+
 java_library(
     name = "tensorflowlitelib",
     srcs = glob(
@@ -100,6 +117,7 @@ java_test(
         "src/testdata/int64.bin",
         "src/testdata/invalid_model.bin",
         "src/testdata/uint8.bin",
+        "src/testdata/with_custom_op.lite",
     ],
     javacopts = JAVACOPTS,
     test_class = "org.tensorflow.lite.NativeInterpreterWrapperTest",
@@ -147,6 +165,28 @@ java_test(
     ],
 )
 
+java_test(
+    name = "OvicClassifierTest",
+    size = "medium",
+    srcs = ["ovic/src/test/java/org/tensorflow/ovic/OvicClassifierTest.java"],
+    data = [
+        "ovic/src/testdata/float_model.lite",
+        "ovic/src/testdata/labels.txt",
+        "ovic/src/testdata/low_res_model.lite",
+        "ovic/src/testdata/quantized_model.lite",
+        "ovic/src/testdata/test_image_128.jpg",
+        "ovic/src/testdata/test_image_224.jpg",
+    ],
+    javacopts = JAVACOPTS,
+    test_class = "org.tensorflow.ovic.OvicClassifierTest",
+    visibility = ["//visibility:public"],
+    deps = [
+        ":ovicbenchmarkerlib",
+        "@com_google_truth",
+        "@junit",
+    ],
+)
+
 filegroup(
     name = "libtensorflowlite_jni",
     srcs = select({
@@ -167,15 +207,3 @@ tflite_jni_binary(
         "//tensorflow/contrib/lite/java/src/main/native",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/BUILD b/tensorflow/contrib/lite/java/demo/app/src/main/BUILD
index 5eb749aae6e224bec64b66832f116ebc3372c1ef..d6fbef9cc938993b283103984307ab51e609dd6e 100644
--- a/tensorflow/contrib/lite/java/demo/app/src/main/BUILD
+++ b/tensorflow/contrib/lite/java/demo/app/src/main/BUILD
@@ -27,15 +27,3 @@ android_binary(
         "@androidsdk//com.android.support:support-v4-25.2.0",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/assets/BUILD b/tensorflow/contrib/lite/java/demo/app/src/main/assets/BUILD
index dd0cd6c98ff878e9c41875cab74c12191cadb173..ce68160b68efd446c1dfa4c70c37aaa4048e4f2f 100644
--- a/tensorflow/contrib/lite/java/demo/app/src/main/assets/BUILD
+++ b/tensorflow/contrib/lite/java/demo/app/src/main/assets/BUILD
@@ -10,15 +10,3 @@ exports_files(
         ],
     ),
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/lite/java/ovic/README.md b/tensorflow/contrib/lite/java/ovic/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..76c33838bfe5b8596d78cae7d022c51d2a379e76
--- /dev/null
+++ b/tensorflow/contrib/lite/java/ovic/README.md
@@ -0,0 +1,83 @@
+# Benchmarker for LPIRC Workshop at CVPR 2018
+
+This folder contains building code for track one of the [Low Power ImageNet Recognition Challenge workshop at CVPR 2018.](https://rebootingcomputing.ieee.org/home/sitemap/14-lpirc/80-low-power-image-recognition-challenge-lpirc-2018)
+
+## Pre-requesits
+
+Follow the steps [here](https://www.tensorflow.org/mobile/tflite/demo_android) to install Tensorflow, Bazel, and the Android NDK and SDK.
+
+## To test the benchmarker:
+
+The testing utilities helps the developers (you) to make sure that your submissions in TfLite format will be processed as expected in the competition's benchmarking system.
+
+Note: for now the tests only provides correctness checks, i.e. classifier predicts the correct category on the test image, but no on-device latency measurements. To test the latency measurement functionality, the tests will print the latency running on a desktop computer, which is not indicative of the on-device run-time.
+We are releasing an benchmarker Apk that would allow developers to measure latency on their own devices.
+
+### Obtain the sample models
+
+The test data (models and images) should be downloaded automatically for you by Bazel. In case they are not, you can manually install them as below.
+
+Note: all commands should be called from your tensorflow installation folder (under this folder you should find `tensorflow/contrib/lite`).
+
+
+* Download the [testdata package](https://storage.googleapis.com/download.tensorflow.org/data/ovic.zip):
+
+```sh
+curl -L https://storage.googleapis.com/download.tensorflow.org/data/ovic.zip -o /tmp/ovic.zip
+```
+
+* Unzip the package into the testdata folder:
+
+```sh
+unzip -j /tmp/ovic.zip -d tensorflow/contrib/lite/java/ovic/src/testdata/
+```
+
+### Run tests
+
+You can run test with Bazel as below. This helps to ensure that the installation is correct.
+
+```sh
+bazel test --cxxopt=--std=c++11 //tensorflow/contrib/lite/java:OvicClassifierTest --test_output=all
+```
+
+### Test your submissions
+
+Once you have a submission that follows the instructions from the [competition site](https://rebootingcomputing.ieee.org/home/sitemap/14-lpirc/80-low-power-image-recognition-challenge-lpirc-2018), you can verify it as below.
+
+* Move your submission to the testdata folder:
+
+Let say the submission file is located at `/tmp/my_model.lite`, then
+
+```sh
+cp /tmp/my_model.lite tensorflow/contrib/lite/java/ovic/src/testdata/
+```
+
+* Resize the test image to the resolutions that are expected by your submission:
+
+The test images can be found at `tensorflow/contrib/lite/java/ovic/src/testdata/test_image_*.jpg`. You may reuse these images if your image resolutions are 128x128 or 224x224.
+
+* Add your model and test image to the BUILD rule:
+
+```JSON
+java_test(
+  name = "OvicClassifierTest",
+  size = "medium",
+  srcs = ["ovic/src/test/java/org/tensorflow/ovic/OvicClassifierTest.java"],
+  data = [
+      "ovic/src/testdata/float_model.lite",
+      "ovic/src/testdata/labels.txt",
+      "ovic/src/testdata/low_res_model.lite",
+      "ovic/src/testdata/quantized_model.lite",
+      "ovic/src/testdata/test_image_128.jpg",
+      "ovic/src/testdata/test_image_224.jpg",
+      "ovic/src/testdata/my_model.lite",        # <--- Your submission.
+      "ovic/src/testdata/my_test_image.jpg",    # <--- Your test image.
+  ],
+      ...
+```
+
+* Modify `OvicClassifierTest.java` to test your model.
+
+Change `TEST_IMAGE_PATH` to `testdata/my_test_image.jpg`. If your model runs inference in floating point, change `FLOAT_MODEL_PATH` to `testdata/my_model.lite`. If your model runs [quantized inference](https://www.tensorflow.org/performance/quantization), change `QUANTIZED_MODEL_PATH` to `testdata/my_model.lite`.
+
+Now you can run the bazel tests to catch any runtime issues with the submission.
diff --git a/tensorflow/contrib/lite/java/ovic/src/test/java/org/tensorflow/ovic/OvicClassifierTest.java b/tensorflow/contrib/lite/java/ovic/src/test/java/org/tensorflow/ovic/OvicClassifierTest.java
index 4fd23a99d25d715530cf36f398d949f7e70598de..098ed8ceba52b6d44868353b80cab0862e334d4d 100644
--- a/tensorflow/contrib/lite/java/ovic/src/test/java/org/tensorflow/ovic/OvicClassifierTest.java
+++ b/tensorflow/contrib/lite/java/ovic/src/test/java/org/tensorflow/ovic/OvicClassifierTest.java
@@ -26,7 +26,6 @@ import java.nio.ByteBuffer;
 import java.nio.ByteOrder;
 import java.nio.MappedByteBuffer;
 import java.nio.channels.FileChannel;
-import java.nio.file.Paths;
 import javax.imageio.ImageIO;
 import org.junit.Before;
 import org.junit.Test;
@@ -45,27 +44,33 @@ public final class OvicClassifierTest {
   private ByteBuffer testImage = null;
   private ByteBuffer lowResTestImage = null;
   private OvicSingleImageResult testResult = null;
-  private static final String LABELS_PATH = "testdata/labels.txt";
-  private static final String QUANTIZED_MODEL_PATH = "testdata/quantized_model.lite";
-  private static final String LOW_RES_MODEL_PATH = "testdata/low_res_model.lite";
-  private static final String FLOAT_MODEL_PATH = "testdata/float_model.lite";
-  private static final String TEST_IMAGE_PATH = "testdata/test_image_224.jpg";
-  private static final String TEST_LOW_RES_IMAGE_PATH = "testdata/test_image_128.jpg";
+  private static final String LABELS_PATH =
+      "third_party/tensorflow/contrib/lite/java/ovic/src/testdata/labels.txt";
+  private static final String QUANTIZED_MODEL_PATH =
+      "third_party/tensorflow/contrib/lite/java/ovic/src/testdata/quantized_model.lite";
+  private static final String LOW_RES_MODEL_PATH =
+      "third_party/tensorflow/contrib/lite/java/ovic/src/testdata/low_res_model.lite";
+  private static final String FLOAT_MODEL_PATH =
+      "third_party/tensorflow/contrib/lite/java/ovic/src/testdata/float_model.lite";
+  private static final String TEST_IMAGE_PATH =
+      "third_party/tensorflow/contrib/lite/java/ovic/src/testdata/test_image_224.jpg";
+  private static final String TEST_LOW_RES_IMAGE_PATH =
+      "third_party/tensorflow/contrib/lite/java/ovic/src/testdata/test_image_128.jpg";
   private static final int TEST_IMAGE_GROUNDTRUTH = 653; // "military uniform"
 
   @Before
   public void setUp() {
     try {
-      File labelsfile = new File(getTestDir(LABELS_PATH));
+      File labelsfile = new File(LABELS_PATH);
       labelsInputStream = new FileInputStream(labelsfile);
-      quantizedModel = loadModelFile(getTestDir(QUANTIZED_MODEL_PATH));
-      floatModel = loadModelFile(getTestDir(FLOAT_MODEL_PATH));
-      lowResModel = loadModelFile(getTestDir(LOW_RES_MODEL_PATH));
-      File imageFile = new File(getTestDir(TEST_IMAGE_PATH));
+      quantizedModel = loadModelFile(QUANTIZED_MODEL_PATH);
+      floatModel = loadModelFile(FLOAT_MODEL_PATH);
+      lowResModel = loadModelFile(LOW_RES_MODEL_PATH);
+      File imageFile = new File(TEST_IMAGE_PATH);
       BufferedImage img = ImageIO.read(imageFile);
       testImage = toByteBuffer(img);
       // Low res image and models.
-      imageFile = new File(getTestDir(TEST_LOW_RES_IMAGE_PATH));
+      imageFile = new File(TEST_LOW_RES_IMAGE_PATH);
       img = ImageIO.read(imageFile);
       lowResTestImage = toByteBuffer(img);
     } catch (IOException e) {
@@ -74,10 +79,6 @@ public final class OvicClassifierTest {
     System.out.println("Successful setup");
   }
 
-  private static String getTestDir(String testfile) throws IOException {
-    return Paths.get("third_party/tensorflow/contrib/lite/java/ovic/src/", testfile).toString();
-  }
-
   @Test
   public void ovicClassifier_quantizedModelCreateSuccess() throws Exception {
     classifier = new OvicClassifier(labelsInputStream, quantizedModel);
diff --git a/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java b/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java
index 14f461f5f9ba8c0755d2a1968533a79cce10750a..a33959dca4954e3c2aaed987839bdec1ba079b5e 100644
--- a/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java
+++ b/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java
@@ -67,6 +67,19 @@ public final class Interpreter implements AutoCloseable {
     wrapper = new NativeInterpreterWrapper(modelFile.getAbsolutePath());
   }
 
+  /**
+   * Initializes a {@code Interpreter} and specifies the number of threads used for inference.
+   *
+   * @param modelFile: a file of a pre-trained TF Lite model
+   * @param numThreads: number of threads to use for inference
+   */
+  public Interpreter(@NonNull File modelFile, int numThreads) {
+    if (modelFile == null) {
+      return;
+    }
+    wrapper = new NativeInterpreterWrapper(modelFile.getAbsolutePath(), numThreads);
+  }
+
   /**
    * Initializes a {@code Interpreter} with a {@code MappedByteBuffer} to the model file.
    *
diff --git a/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/NativeInterpreterWrapper.java b/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/NativeInterpreterWrapper.java
index dbf8f8f7cc2815a46130e342d7e45d4e471696de..fc8187acfebf272a72ceb7844333bd589359cc2e 100644
--- a/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/NativeInterpreterWrapper.java
+++ b/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/NativeInterpreterWrapper.java
@@ -32,9 +32,13 @@ import java.util.Map;
 final class NativeInterpreterWrapper implements AutoCloseable {
 
   NativeInterpreterWrapper(String modelPath) {
+    this(modelPath, /* numThreads= */ -1);
+  }
+
+  NativeInterpreterWrapper(String modelPath, int numThreads) {
     errorHandle = createErrorReporter(ERROR_BUFFER_SIZE);
     modelHandle = createModel(modelPath, errorHandle);
-    interpreterHandle = createInterpreter(modelHandle, errorHandle, /* numThreads= */ -1);
+    interpreterHandle = createInterpreter(modelHandle, errorHandle, numThreads);
     isMemoryAllocated = true;
   }
 
@@ -44,11 +48,7 @@ final class NativeInterpreterWrapper implements AutoCloseable {
    * NativeInterpreterWrapper}.
    */
   NativeInterpreterWrapper(MappedByteBuffer mappedByteBuffer) {
-    modelByteBuffer = mappedByteBuffer;
-    errorHandle = createErrorReporter(ERROR_BUFFER_SIZE);
-    modelHandle = createModelWithBuffer(modelByteBuffer, errorHandle);
-    interpreterHandle = createInterpreter(modelHandle, errorHandle, /* numThreads= */ -1);
-    isMemoryAllocated = true;
+    this(mappedByteBuffer, /* numThreads= */ -1);
   }
 
   /**
diff --git a/tensorflow/contrib/lite/java/src/main/native/BUILD b/tensorflow/contrib/lite/java/src/main/native/BUILD
index 3571182ca92e959d54935cfdc76679ab69a8cfa9..4399ed202597082fba36c04a744bf6378e4539a2 100644
--- a/tensorflow/contrib/lite/java/src/main/native/BUILD
+++ b/tensorflow/contrib/lite/java/src/main/native/BUILD
@@ -95,15 +95,3 @@ exports_files(
         "version_script.lds",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/lite/java/src/testhelper/java/org/tensorflow/lite/BUILD b/tensorflow/contrib/lite/java/src/testhelper/java/org/tensorflow/lite/BUILD
index 2b4f37bc6cfe1dbc0c178a56b892f545e8ad4f3b..b524246d436858bbf506809a38cead2897f78d93 100644
--- a/tensorflow/contrib/lite/java/src/testhelper/java/org/tensorflow/lite/BUILD
+++ b/tensorflow/contrib/lite/java/src/testhelper/java/org/tensorflow/lite/BUILD
@@ -16,15 +16,3 @@ android_library(
         "//tensorflow/contrib/lite/java:tensorflowlite_java",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/lite/kernels/BUILD b/tensorflow/contrib/lite/kernels/BUILD
index 48021aea47573b1b24bae78a9532200dc222020e..f07eca0ba90b941ddd1710913c38c0c9e5817aad 100644
--- a/tensorflow/contrib/lite/kernels/BUILD
+++ b/tensorflow/contrib/lite/kernels/BUILD
@@ -135,6 +135,7 @@ cc_library(
     srcs = [
         "activations.cc",
         "add.cc",
+        "arg_max.cc",
         "audio_spectrogram.cc",
         "basic_rnn.cc",
         "batch_to_space_nd.cc",
@@ -156,7 +157,7 @@ cc_library(
         "local_response_norm.cc",
         "lsh_projection.cc",
         "lstm.cc",
-        "maximum.cc",
+        "maximum_minimum.cc",
         "mean.cc",
         "mfcc.cc",
         "mul.cc",
@@ -270,6 +271,22 @@ tf_cc_test(
     ],
 )
 
+tf_cc_test(
+    name = "arg_max_test",
+    size = "small",
+    srcs = ["arg_max_test.cc"],
+    tags = [
+        "tflite_not_portable_ios_arm64",
+        "tflite_not_portable_ios_x86_64",
+    ],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
 tf_cc_test(
     name = "div_test",
     size = "small",
@@ -538,9 +555,9 @@ tf_cc_test(
 )
 
 tf_cc_test(
-    name = "maximum_test",
+    name = "maximum_minimum_test",
     size = "small",
-    srcs = ["maximum_test.cc"],
+    srcs = ["maximum_minimum_test.cc"],
     deps = [
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
@@ -723,6 +740,7 @@ tf_cc_test(
         ":builtin_ops",
         "//tensorflow/contrib/lite:framework",
         "//tensorflow/contrib/lite/kernels:test_util",
+        "@com_google_absl//absl/memory",
         "@com_google_googletest//:gtest",
     ],
 )
diff --git a/tensorflow/contrib/lite/kernels/arg_max.cc b/tensorflow/contrib/lite/kernels/arg_max.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a2c5e4ceadbc905d22eb02b450c88745a351f58f
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/arg_max.cc
@@ -0,0 +1,178 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
+#include "tensorflow/contrib/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
+#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+#include "tensorflow/contrib/lite/kernels/op_macros.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace arg_max {
+
+constexpr int kInputTensor = 0;
+constexpr int kAxis = 1;
+constexpr int kOutputTensor = 0;
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+  TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  TfLiteTensor* axis = GetInput(context, node, kAxis);
+  // Make sure the axis is only 1 dimension.
+  TF_LITE_ENSURE_EQ(context, NumElements(axis), 1);
+
+  // Make sure the axis is only either int32 or int64.
+  TF_LITE_ENSURE(context,
+                 axis->type == kTfLiteInt32 || axis->type == kTfLiteInt64);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+  auto* params = reinterpret_cast<TfLiteArgMaxParams*>(node->builtin_data);
+  switch (params->output_type) {
+    case kTfLiteInt32:
+      output->type = kTfLiteInt32;
+      break;
+    case kTfLiteInt64:
+      output->type = kTfLiteInt64;
+      break;
+    default:
+      context->ReportError(context, "Unknown index output data type");
+      return kTfLiteError;
+  }
+
+  // Check conditions for different types.
+  switch (input->type) {
+    case kTfLiteFloat32:
+    case kTfLiteUInt8:
+    case kTfLiteInt32:
+      break;
+
+    default:
+      context->ReportError(context, "Only float32 and int types are supported");
+      return kTfLiteError;
+  }
+
+  // Copy the input dimensions to output except make the last dimension 1.
+  TF_LITE_ENSURE(context, NumDimensions(input) >= 1);
+  TfLiteIntArray* output_size = TfLiteIntArrayCopy(input->dims);
+  output_size->data[NumDimensions(input) - 1] = 1;
+
+  return context->ResizeTensor(context, output, output_size);
+}
+
+// The current impl actually ignores the axis argument.
+// Only determine the index of the maximum value in the last dimension.
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  TfLiteTensor* axis = GetInput(context, node, kAxis);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+#define TF_LITE_ARG_MAX(data_type, axis_type, output_type)                     \
+  TF_LITE_ENSURE_EQ(context, GetTensorData<axis_type>(axis)[0], 3);            \
+  optimized_ops::ArgMax(GetTensorData<axis_type>(axis),                        \
+                        GetTensorData<data_type>(input), GetTensorDims(input), \
+                        GetTensorData<output_type>(output),                    \
+                        GetTensorDims(output))
+  if (axis->type == kTfLiteInt32) {
+    switch (output->type) {
+      case kTfLiteInt32: {
+        switch (input->type) {
+          case kTfLiteFloat32:
+            TF_LITE_ARG_MAX(float, int32_t, int32_t);
+            break;
+          case kTfLiteUInt8:
+            TF_LITE_ARG_MAX(uint8_t, int32_t, int32_t);
+            break;
+          case kTfLiteInt32:
+            TF_LITE_ARG_MAX(int32_t, int32_t, int32_t);
+            break;
+          default:
+            return kTfLiteError;
+        }
+      } break;
+      case kTfLiteInt64: {
+        switch (input->type) {
+          case kTfLiteFloat32:
+            TF_LITE_ARG_MAX(float, int32_t, int64_t);
+            break;
+          case kTfLiteUInt8:
+            TF_LITE_ARG_MAX(uint8_t, int32_t, int64_t);
+            break;
+          case kTfLiteInt32:
+            TF_LITE_ARG_MAX(int32_t, int32_t, int64_t);
+            break;
+          default:
+            return kTfLiteError;
+        }
+      } break;
+      default:
+        return kTfLiteError;
+    }
+  } else {
+    switch (output->type) {
+      case kTfLiteInt32: {
+        switch (input->type) {
+          case kTfLiteFloat32:
+            TF_LITE_ARG_MAX(float, int64_t, int32_t);
+            break;
+          case kTfLiteUInt8:
+            TF_LITE_ARG_MAX(uint8_t, int64_t, int32_t);
+            break;
+          case kTfLiteInt32:
+            TF_LITE_ARG_MAX(int32_t, int64_t, int32_t);
+            break;
+          default:
+            return kTfLiteError;
+        }
+      } break;
+      case kTfLiteInt64: {
+        switch (input->type) {
+          case kTfLiteFloat32:
+            TF_LITE_ARG_MAX(float, int64_t, int64_t);
+            break;
+          case kTfLiteUInt8:
+            TF_LITE_ARG_MAX(uint8_t, int64_t, int64_t);
+            break;
+          case kTfLiteInt32:
+            TF_LITE_ARG_MAX(int32_t, int64_t, int64_t);
+            break;
+          default:
+            return kTfLiteError;
+        }
+      } break;
+      default:
+        return kTfLiteError;
+    }
+  }
+#undef TF_LITE_ARG_MAX
+
+  return kTfLiteOk;
+}
+
+}  // namespace arg_max
+
+TfLiteRegistration* Register_ARG_MAX() {
+  static TfLiteRegistration r = {nullptr, nullptr, arg_max::Prepare,
+                                 arg_max::Eval};
+  return &r;
+}
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/arg_max_test.cc b/tensorflow/contrib/lite/kernels/arg_max_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..31b15fe19ab87027c28bde9eaff7d88d03b2c213
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/arg_max_test.cc
@@ -0,0 +1,106 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/contrib/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+template <typename T>
+class ArgMaxOpModel : public SingleOpModel {
+ public:
+  ArgMaxOpModel(std::initializer_list<int> input_shape, TensorType input_type,
+                TensorType output_type, TensorType index_output_type) {
+    input_ = AddInput(input_type);
+    axis_ = AddInput(TensorType_INT32);
+    output_ = AddOutput(output_type);
+    SetBuiltinOp(BuiltinOperator_ARG_MAX, BuiltinOptions_ArgMaxOptions,
+                 CreateArgMaxOptions(builder_, index_output_type).Union());
+    BuildInterpreter({input_shape, {1, 1, 1, 1}});
+  }
+
+  int input() { return input_; }
+  int axis() { return axis_; }
+
+  std::vector<T> GetOutput() { return ExtractVector<T>(output_); }
+  std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
+
+ private:
+  int input_;
+  int axis_;
+  int output_;
+};
+
+TEST(ArgMaxOpTest, GetMaxArgFloat) {
+  ArgMaxOpModel<int32_t> model({1, 1, 1, 4}, TensorType_FLOAT32,
+                               TensorType_INT32, TensorType_INT32);
+  model.PopulateTensor<float>(model.input(), {0.1, 0.9, 0.7, 0.3});
+  // Currently only support the last dimension.
+  model.PopulateTensor<int>(model.axis(), {3});
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({1}));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 1, 1}));
+}
+
+TEST(ArgMaxOpTest, GetMaxArgInt) {
+  ArgMaxOpModel<int32_t> model({1, 1, 1, 4}, TensorType_INT32, TensorType_INT32,
+                               TensorType_INT32);
+  model.PopulateTensor<int>(model.input(), {1, 9, 7, 3});
+  // Currently only support the last dimension.
+  model.PopulateTensor<int>(model.axis(), {3});
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({1}));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 1, 1}));
+}
+
+TEST(ArgMaxOpTest, GetMaxArgMulDimensions) {
+  ArgMaxOpModel<int32_t> model({1, 1, 2, 4}, TensorType_INT32, TensorType_INT32,
+                               TensorType_INT32);
+  model.PopulateTensor<int>(model.input(), {1, 2, 7, 8, 1, 9, 7, 3});
+  // Currently only support the last dimension.
+  model.PopulateTensor<int>(model.axis(), {3});
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({3, 1}));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 2, 1}));
+}
+
+TEST(ArgMaxOpTest, GetMaxArgOutput64) {
+  ArgMaxOpModel<int64_t> model({1, 1, 2, 4}, TensorType_INT32, TensorType_INT64,
+                               TensorType_INT64);
+  model.PopulateTensor<int>(model.input(), {10, 2, 7, 8, 1, 9, 7, 3});
+  // Currently only support the last dimension.
+  model.PopulateTensor<int>(model.axis(), {3});
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({0, 1}));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 2, 1}));
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/cast.cc b/tensorflow/contrib/lite/kernels/cast.cc
index 19942de7bc0c083f192a4b337b224b778d991140..17ef2c572ebbfa54ba6856f7eebbcd6fd9e63868 100644
--- a/tensorflow/contrib/lite/kernels/cast.cc
+++ b/tensorflow/contrib/lite/kernels/cast.cc
@@ -34,6 +34,15 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
   TfLiteTensor* input = GetInput(context, node, kInputTensor);
   TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+  // TODO(ahentz): these two checks would make the new implementation
+  // incompatible with some existing models, where params is not specified. It
+  // is OK not to have them because toco would have set input and output types
+  // to match the parameters.
+  // auto* params = reinterpret_cast<TfLiteCastParams*>(node->builtin_data);
+  // TF_LITE_ENSURE_EQ(context, input->type, params->in_data_type);
+  // TF_LITE_ENSURE_EQ(context, output->type, params->out_data_type);
+
   return context->ResizeTensor(context, output,
                                TfLiteIntArrayCopy(input->dims));
 }
diff --git a/tensorflow/contrib/lite/kernels/concatenation.cc b/tensorflow/contrib/lite/kernels/concatenation.cc
index a619ada86af64c299f8e518a7493db20f1011a50..45ea8d00498455be98467f2f1addc8ad7dcf35fa 100644
--- a/tensorflow/contrib/lite/kernels/concatenation.cc
+++ b/tensorflow/contrib/lite/kernels/concatenation.cc
@@ -67,10 +67,6 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
     TfLiteTensor* t = &context->tensors[node->inputs->data[i]];
     TF_LITE_ENSURE_EQ(context, t->dims->size, t0->dims->size);
     TF_LITE_ENSURE_EQ(context, t->type, input_type);
-    if (input_type == kTfLiteUInt8) {
-      TF_LITE_ENSURE_EQ(context, t->params.zero_point, t0->params.zero_point);
-      TF_LITE_ENSURE_EQ(context, t->params.scale, t0->params.scale);
-    }
     for (int d = 0; d < t0->dims->size; ++d) {
       if (d == axis) {
         sum_axis += t->dims->data[axis];
@@ -87,11 +83,6 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
 
   TfLiteTensor* output = &context->tensors[node->outputs->data[0]];
   TF_LITE_ENSURE_EQ(context, output->type, input_type);
-  if (input_type == kTfLiteUInt8) {
-    TF_LITE_ENSURE_EQ(context, output->params.zero_point,
-                      t0->params.zero_point);
-    TF_LITE_ENSURE_EQ(context, output->params.scale, t0->params.scale);
-  }
 
   return context->ResizeTensor(context, output, output_size);
 }
@@ -115,6 +106,14 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
       all_inputs.dims(), node->inputs->size, GetTensorData<scalar>(output), \
       GetTensorDims(output))
 
+#define TF_LITE_CONCATENATION_QUANTIZED(type)                                  \
+  VectorOfQuantizedTensors all_inputs(*context, *node->inputs);                \
+  type::Concatenation(                                                         \
+      RemapDim(NumDimensions(output), axis), all_inputs.data(),                \
+      all_inputs.dims(), all_inputs.zero_point(), all_inputs.scale(),          \
+      node->inputs->size, GetTensorData<uint8>(output), GetTensorDims(output), \
+      output->params.zero_point, output->params.scale)
+
   switch (output->type) {  // Already know in/outtypes are same.
     case kTfLiteFloat32:
       if (kernel_type == kReference) {
@@ -125,9 +124,9 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
       break;
     case kTfLiteUInt8:
       if (kernel_type == kReference) {
-        TF_LITE_CONCATENATION(reference_ops, uint8_t);
+        TF_LITE_CONCATENATION_QUANTIZED(reference_ops);
       } else {
-        TF_LITE_CONCATENATION(optimized_ops, uint8_t);
+        TF_LITE_CONCATENATION_QUANTIZED(optimized_ops);
       }
       break;
     default:
@@ -136,6 +135,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
       return kTfLiteError;
   }
 
+#undef TF_LITE_CONCATENATION_QUANTIZED
 #undef TF_LITE_CONCATENATION
 
   return kTfLiteOk;
diff --git a/tensorflow/contrib/lite/kernels/concatenation_test.cc b/tensorflow/contrib/lite/kernels/concatenation_test.cc
index ba1ffc5f8423b9626c9c8e2a1086ea0dcca43f50..467ff6f7e149e35ae1fd11031c10d7087c4b398c 100644
--- a/tensorflow/contrib/lite/kernels/concatenation_test.cc
+++ b/tensorflow/contrib/lite/kernels/concatenation_test.cc
@@ -28,6 +28,7 @@ class BaseConcatenationOpModel : public SingleOpModel {
  public:
   // TODO(ahentz): Also test different activation types, axis, input
   // dimensions.
+  BaseConcatenationOpModel() {}
   BaseConcatenationOpModel(const TensorData& input_template, int axis,
                            int num_inputs) {
     std::vector<std::vector<int>> all_input_shapes;
@@ -60,6 +61,23 @@ class ConcatenationOpModel : public BaseConcatenationOpModel {
 class QuantizedConcatenationOpModel : public BaseConcatenationOpModel {
  public:
   using BaseConcatenationOpModel::BaseConcatenationOpModel;
+  QuantizedConcatenationOpModel(const std::vector<TensorData>& input_template,
+                                int axis, int num_inputs,
+                                const TensorData& output_template) {
+    std::vector<std::vector<int>> all_input_shapes;
+    CHECK_EQ(input_template.size(), num_inputs);
+    for (int i = 0; i < num_inputs; ++i) {
+      all_input_shapes.push_back(input_template[i].shape);
+      AddInput(input_template[i]);
+    }
+    output_ = AddOutput({output_template.type, /*shape=*/{},
+                         output_template.min, output_template.max});
+    SetBuiltinOp(
+        BuiltinOperator_CONCATENATION, BuiltinOptions_ConcatenationOptions,
+        CreateConcatenationOptions(builder_, axis, ActivationFunctionType_NONE)
+            .Union());
+    BuildInterpreter(all_input_shapes);
+  }
   void SetInput(int index, std::initializer_list<float> data) {
     QuantizeAndPopulate<uint8_t>(index, data);
   }
@@ -168,6 +186,56 @@ TEST(ConcatenationOpTest, FourInputsQuantized) {
                               }));
 }
 
+TEST(ConcatenationOpTest, FourInputsQuantizedMixedRange) {
+  QuantizedConcatenationOpModel m0({{TensorType_UINT8, {2, 1, 2}, -10.7, 10.8},
+                                    {TensorType_UINT8, {2, 1, 2}, 0, 12.8},
+                                    {TensorType_UINT8, {2, 1, 2}, -11, 11.8},
+                                    {TensorType_UINT8, {2, 1, 2}, 0, 7.4}},
+                                   /*axis=*/2, /*num_inputs=*/4,
+                                   {TensorType_UINT8, {2, 1, 2}, -12.7, 12.8});
+
+  m0.SetInput(0, {1.0f, 3.0f, 4.0f, 7.0f});
+  m0.SetInput(1, {1.1f, 3.1f, 4.1f, 7.1f});
+  m0.SetInput(2, {1.2f, 3.2f, 4.2f, 7.2f});
+  m0.SetInput(3, {1.3f, 3.3f, 4.3f, 7.3f});
+  m0.Invoke();
+  EXPECT_THAT(m0.GetDequantizedOutput(),
+              ElementsAreArray(ArrayFloatNear({
+                  1.0f, 3.0f, 1.1f, 3.1f, 1.2f, 3.2f, 1.3f, 3.3f,  //
+                  4.0f, 7.0f, 4.1f, 7.1f, 4.2f, 7.2f, 4.3f, 7.3f,  //
+              })));
+  EXPECT_THAT(m0.GetOutput(), ElementsAreArray({
+                                  137, 157, 138, 158, 139, 159, 140, 160,  //
+                                  167, 197, 168, 198, 169, 199, 170, 200,  //
+                              }));
+}
+
+TEST(ConcatenationOpTest, FourInputsQuantizedMixedRangeClampingLogic) {
+  QuantizedConcatenationOpModel m0({{TensorType_UINT8, {2, 1, 2}, -10.7, 10.8},
+                                    {TensorType_UINT8, {2, 1, 2}, 0, 12.8},
+                                    {TensorType_UINT8, {2, 1, 2}, -11, 11.8},
+                                    {TensorType_UINT8, {2, 1, 2}, 0, 7.4}},
+                                   /*axis=*/2, /*num_inputs=*/4,
+                                   {TensorType_UINT8, {2, 1, 2}, -1., 1.});
+
+  m0.SetInput(0, {1.0f, -3.0f, -4.0f, -7.0f});
+  m0.SetInput(1, {1.1f, 3.1f, 4.1f, 7.1f});
+  m0.SetInput(2, {1.2f, -3.2f, -4.2f, 7.2f});
+  m0.SetInput(3, {1.3f, 3.3f, 4.3f, 7.3f});
+  m0.Invoke();
+  EXPECT_THAT(m0.GetDequantizedOutput(),
+              ElementsAreArray(ArrayFloatNear(
+                  {
+                      1.0f, -1.0f, 1.0f, 1.0f, 1.0f, -1.0f, 1.0f, 1.0f,   //
+                      -1.0f, -1.0f, 1.0f, 1.0f, -1.0f, 1.0f, 1.0f, 1.0f,  //
+                  },
+                  4e-3)));
+  EXPECT_THAT(m0.GetOutput(), ElementsAreArray({
+                                  255, 0, 255, 255, 255, 0, 255, 255,  //
+                                  0, 0, 255, 255, 0, 255, 255, 255,    //
+                              }));
+}
+
 }  // namespace
 }  // namespace tflite
 
diff --git a/tensorflow/contrib/lite/kernels/fully_connected_test.cc b/tensorflow/contrib/lite/kernels/fully_connected_test.cc
index a0f766c4f4580d7679275c0b63aa200410fcb5ad..87413000a93a0a361d81b1f0eb46550b5b90f9ac 100644
--- a/tensorflow/contrib/lite/kernels/fully_connected_test.cc
+++ b/tensorflow/contrib/lite/kernels/fully_connected_test.cc
@@ -19,12 +19,25 @@ limitations under the License.
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
+#include "absl/memory/memory.h"
 #include "tensorflow/contrib/lite/interpreter.h"
 #include "tensorflow/contrib/lite/kernels/register.h"
 #include "tensorflow/contrib/lite/kernels/test_util.h"
 #include "tensorflow/contrib/lite/model.h"
 
 namespace tflite {
+
+namespace ops {
+namespace builtin {
+
+TfLiteRegistration* Register_FULLY_CONNECTED_REF();
+TfLiteRegistration* Register_FULLY_CONNECTED_NEON_OPT();
+TfLiteRegistration* Register_FULLY_CONNECTED_GENERIC_OPT();
+TfLiteRegistration* Register_FULLY_CONNECTED_PIE();
+
+}  // namespace builtin
+}  // namespace ops
+
 namespace {
 
 using ::testing::ElementsAre;
@@ -119,7 +132,8 @@ static float fully_connected_golden_output[] = {
 class BaseFullyConnectedOpModel : public SingleOpModel {
  public:
   // TODO(ahentz): test different activation types too.
-  BaseFullyConnectedOpModel(int units, int batches, const TensorData& input,
+  BaseFullyConnectedOpModel(TfLiteRegistration* registration, int units,
+                            int batches, const TensorData& input,
                             const TensorData& output = {TensorType_FLOAT32})
       : batches_(batches), units_(units) {
     int total_input_size = 1;
@@ -149,6 +163,8 @@ class BaseFullyConnectedOpModel : public SingleOpModel {
         BuiltinOperator_FULLY_CONNECTED, BuiltinOptions_FullyConnectedOptions,
         CreateFullyConnectedOptions(builder_, ActivationFunctionType_RELU)
             .Union());
+    resolver_ = absl::make_unique<SingleOpResolver>(
+        BuiltinOperator_FULLY_CONNECTED, registration);
     BuildInterpreter({GetShape(input_), GetShape(weights_), GetShape(bias_)});
   }
 
@@ -208,10 +224,25 @@ class QuantizedFullyConnectedOpModel : public BaseFullyConnectedOpModel {
   }
 };
 
+const auto kKernelMap = new std::map<string, TfLiteRegistration*>({
+    {"Reference", ops::builtin::Register_FULLY_CONNECTED_REF()},
+    {"NeonOptimized", ops::builtin::Register_FULLY_CONNECTED_NEON_OPT()},
+    {"GenericOptimized", ops::builtin::Register_FULLY_CONNECTED_GENERIC_OPT()},
+    {"Pie", ops::builtin::Register_FULLY_CONNECTED_PIE()},
+});
+
+class FullyConnectedOpTest : public SingleOpTest {
+ protected:
+  const std::map<string, TfLiteRegistration*>& GetKernelMap() override {
+    return *kKernelMap;
+  }
+};
+
 // TODO(ahentz): add more small tests like this one, focused on making sure the
 // calculations are correct.
-TEST(FullyConnectedOpTest, SimpleTest) {
-  FloatFullyConnectedOpModel m(3, 2, {TensorType_FLOAT32, {2, 10}});
+TEST_P(FullyConnectedOpTest, SimpleTest) {
+  FloatFullyConnectedOpModel m(GetRegistration(), 3, 2,
+                               {TensorType_FLOAT32, {2, 10}});
   m.SetWeights({
       1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 0
       1, 2, 3, 4, 5, 6, 7, 8, 9, 10,  // u = 1
@@ -229,9 +260,9 @@ TEST(FullyConnectedOpTest, SimpleTest) {
   EXPECT_THAT(m.GetOutput(), ElementsAre(24, 25, 26, 58, 59, 60));
 }
 
-TEST(FullyConnectedOpTest, SimpleTestQuantized) {
+TEST_P(FullyConnectedOpTest, SimpleTestQuantized) {
   QuantizedFullyConnectedOpModel m(
-      3, 2,
+      GetRegistration(), 3, 2,
       /*input=*/{TensorType_UINT8, {2, 10}, -63.5, 64},
       /*output=*/{TensorType_UINT8, {}, -127, 128});
 
@@ -261,7 +292,8 @@ TEST(FullyConnectedOpTest, SimpleTest4DInput) {
   // Note that it is not required that the first dimension be the number of
   // batches. All we care is that the input can be evenly distributed in
   // batches. In this case, we need the input to have multiples of '2'.
-  FloatFullyConnectedOpModel m(/*units=*/3,
+  FloatFullyConnectedOpModel m(ops::builtin::Register_FULLY_CONNECTED_PIE(),
+                               /*units=*/3,
                                /*batches=*/2,
                                /*input=*/{TensorType_FLOAT32, {4, 1, 5, 1}});
   m.SetWeights({
@@ -284,9 +316,9 @@ TEST(FullyConnectedOpTest, SimpleTest4DInput) {
                              }));
 }
 
-TEST(FullyConnectedOpTest, SimpleTest4dInputQuantized) {
+TEST_P(FullyConnectedOpTest, SimpleTest4dInputQuantized) {
   QuantizedFullyConnectedOpModel m(
-      3, 2,
+      GetRegistration(), 3, 2,
       /*input=*/{TensorType_UINT8, {4, 1, 5, 1}, -63.5, 64},
       /*output=*/{TensorType_UINT8, {}, -127, 128});
 
@@ -312,10 +344,15 @@ TEST(FullyConnectedOpTest, SimpleTest4dInputQuantized) {
   EXPECT_THAT(m.GetOutput(), ElementsAre(151, 152, 153, 185, 186, 187));
 }
 
+INSTANTIATE_TEST_CASE_P(
+    FullyConnectedOpTest, FullyConnectedOpTest,
+    ::testing::ValuesIn(SingleOpTest::GetKernelTags(*kKernelMap)));
+
 // TODO(ahentz): Reconsider this test. Having arbitrary weights makes it hard
 // to debug errors and doesn't necessarily test all the important details.
-TEST(FullyConnectedOpTest, BlackBoxTest) {
-  FloatFullyConnectedOpModel m(16, 2, {TensorType_FLOAT32, {2, 8}});
+TEST_P(FullyConnectedOpTest, BlackBoxTest) {
+  FloatFullyConnectedOpModel m(GetRegistration(), 16, 2,
+                               {TensorType_FLOAT32, {2, 8}});
   m.SetWeights(
       {0.091327,  0.103366,  -0.316505, -0.083120, 0.149366,  -0.196636,
        -0.123672, 0.062800,  0.063031,  0.191670,  -0.062001, -0.061504,
diff --git a/tensorflow/contrib/lite/kernels/internal/BUILD b/tensorflow/contrib/lite/kernels/internal/BUILD
index aa3957bee133c8b51a82e9c62884ce365e086d2e..32a0acf8883bc227e9de983d4942aec0f7f755ed 100644
--- a/tensorflow/contrib/lite/kernels/internal/BUILD
+++ b/tensorflow/contrib/lite/kernels/internal/BUILD
@@ -154,6 +154,7 @@ cc_library(
     ],
     copts = tflite_copts(),
     deps = [
+        ":quantization_util",
         ":types",
         ":round",
         "//third_party/eigen3",
@@ -238,6 +239,7 @@ cc_library(
         "reference/reference_ops.h",
     ],
     deps = [
+        ":quantization_util",
         ":round",
         ":types",
         "//third_party/eigen3",
@@ -431,15 +433,3 @@ cc_library(
 )
 
 exports_files(["optimized/eigen_tensor_reduced_instantiations_oss.h"])
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8.h b/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8.h
index c71b070680ead77769dd8b04d0d7a133ad694abc..0f78e0f728585ab27a8116a4707ac9614a6ea060 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8.h
@@ -1694,12 +1694,11 @@ inline void DepthwiseConv(const uint8* input_data, const Dims<4>& input_dims,
   TFLITE_DCHECK(output_depth == input_depth * depth_multiplier);
 
 #ifdef __aarch64__
-  // Call kernel optimized for depthwise convolutions using 3x3 filters,
-  // stride = 1, no padding, depth_multiplier = 1 and depth a multiple of 16.
-  if (filter_width == 3 && filter_height == 3 && depth_multiplier == 1 &&
-      (stride_width == 1 || stride_width == 2) &&
-      (stride_height == 1 || stride_height == 2) && pad_width == 0 &&
-      pad_height == 0 && (input_depth % 16) == 0) {
+  // Call kernel optimized for depthwise convolutions using 3x3 filters if
+  // parameters are supported.
+  if (Fast3by3FilterKernelSupported(input_dims, filter_dims, stride_width,
+                                    stride_height, pad_width, pad_height,
+                                    depth_multiplier, output_dims)) {
     DepthwiseConv3by3FilterDepth16(
         input_data, input_dims, input_offset, filter_data, filter_dims,
         filter_offset, bias_data, bias_dims, stride_width, stride_height,
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h b/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h
index 9dc76e7608f170fcf21bb188226bf30995df8cda..a349892076fcc4989e2f4cad188b383d2b31d470 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h
@@ -440,6 +440,47 @@ struct ConvKernel3x3FilterDepth16<1, 1> {
   }
 };
 
+inline bool Fast3by3FilterKernelSupported(const Dims<4>& input_dims,
+                                          const Dims<4>& filter_dims,
+                                          int stride_width, int stride_height,
+                                          int pad_width, int pad_height,
+                                          int depth_multiplier,
+                                          const Dims<4>& output_dims) {
+  const int input_height = ArraySize(input_dims, 2);
+  const int input_width = ArraySize(input_dims, 1);
+  const int input_depth = ArraySize(input_dims, 0);
+  const int filter_height = ArraySize(filter_dims, 2);
+  const int filter_width = ArraySize(filter_dims, 1);
+  const int output_height = ArraySize(output_dims, 2);
+  const int output_width = ArraySize(output_dims, 1);
+
+  bool supported = filter_width == 3 && filter_height == 3 &&
+                   depth_multiplier == 1 &&
+                   (stride_width == 1 || stride_width == 2) &&
+                   (stride_height == 1 || stride_height == 2) &&
+                   pad_width == 0 && pad_height == 0 && (input_depth % 16) == 0;
+
+  if (!supported) {
+    return false;
+  }
+
+  // Handle case where padding is zero but type is not kValid. This would
+  // require special boundary case handling that is not supported yet.
+
+  const int out_x = output_width - 1;
+  const int out_y = output_height - 1;
+
+  const int in_x_origin = (out_x * stride_width) - pad_width;
+  const int in_y_origin = (out_y * stride_height) - pad_height;
+
+  const int in_x_end = in_x_origin + filter_width;
+  const int in_y_end = in_y_origin + filter_height;
+
+  // Supported only if filter on the right and bottom boundary lies completely
+  // within the input.
+  return in_x_end <= input_width && in_y_end <= input_height;
+}
+
 inline void DepthwiseConv3by3FilterDepth16(
     const uint8* input_data, const Dims<4>& input_dims, int32 input_offset,
     const uint8* filter_data, const Dims<4>& filter_dims, int32 filter_offset,
@@ -634,7 +675,7 @@ inline void DepthwiseConv3by3FilterDepth16(
         // Handle the rest of the right side.
         for (; out_x < output_width; out_x++) {
           // This code path can only be reached if we're handling >1 x outputs
-          // at a time or support padding.
+          // at a time or support kSame padding.
         }
       }
 
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
index aff47d6e4877542cec9a4ba6eb0071421a9fcc07..fc58c192f8283b26166e3b8db9d42b6b4e60eeca 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
@@ -30,6 +30,7 @@ limitations under the License.
 #include "fixedpoint/fixedpoint.h"
 #include "public/gemmlowp.h"
 #include "tensorflow/contrib/lite/kernels/internal/common.h"
+#include "tensorflow/contrib/lite/kernels/internal/quantization_util.h"
 #include "tensorflow/contrib/lite/kernels/internal/round.h"
 #include "tensorflow/contrib/lite/kernels/internal/types.h"
 
@@ -324,6 +325,22 @@ void Gemm(const Eigen::MatrixBase<Lhs>& lhs, const Eigen::MatrixBase<Rhs>& rhs,
   }
 }
 
+inline void optimized_ops_preload_l1_stream(const uint8* ptr) {
+#ifdef GEMMLOWP_ARM_64
+  asm volatile("prfm pldl1strm, [%[ptr]]\n" ::[ptr] "r"(ptr) :);
+#else
+  gemmlowp::Prefetch(ptr);
+#endif
+}
+
+inline void optimized_ops_preload_l1_keep(const uint8* ptr) {
+#ifdef GEMMLOWP_ARM_64
+  asm volatile("prfm pldl1keep, [%[ptr]]\n" ::[ptr] "r"(ptr) :);
+#else
+  gemmlowp::Prefetch(ptr);
+#endif
+}
+
 #ifdef GEMMLOWP_NEON
 // In the common case of batch size 1, a fully-connected node degenerates
 // to a matrix*vector product. LSTM cells contain a fully-connected node;
@@ -516,6 +533,297 @@ inline void GEMVForLstmCell(const uint8* input_data, const Dims<4>& input_dims,
 }
 #endif
 
+#ifdef GEMMLOWP_NEON
+inline void GEMVForLstmCellWithSymmetricRange(
+    const uint8* input_data, const Dims<4>& input_dims,
+    const uint8* weights_data, const Dims<4>& weights_dims,
+    const int32* bias_data, const Dims<4>& bias_dims, int32 accum_multiplier,
+    int accum_shift, int16* output_data, const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("GEMVForLstmCellWithSymmetricRange");
+  TFLITE_DCHECK(IsPackedWithoutStrides(input_dims));
+  TFLITE_DCHECK(IsPackedWithoutStrides(weights_dims));
+  TFLITE_DCHECK(IsPackedWithoutStrides(bias_dims));
+  TFLITE_DCHECK(IsPackedWithoutStrides(output_dims));
+  TFLITE_DCHECK_EQ(ArraySize(output_dims, 1) * ArraySize(output_dims, 2) *
+                       ArraySize(output_dims, 3),
+                   1);
+  const int input_size = input_dims.strides[3];
+  const int output_size = MatchingArraySize(weights_dims, 1, output_dims, 0);
+  // This special fast path for quantized LSTM cells does not try to support
+  // odd sizes that we haven't encountered in any LSTM cell, that would
+  // require special code (that would go untested until any LSTM cell
+  // exercises it). We just guard our assumptions about size evenness with
+  // the following assertions.
+  TFLITE_DCHECK(!(output_size % 4));
+  TFLITE_DCHECK(!(input_size % 64));
+  const int32* bias_ptr = bias_data;
+  int16* output_ptr = output_data;
+  const uint8x16_t signbit = vdupq_n_u8(0x80);
+  for (int in = 0; in < input_size; in += 32) {
+    optimized_ops_preload_l1_keep(input_data + in);
+  }
+  const int left_shift = accum_shift > 0 ? accum_shift : 0;
+  const int right_shift = accum_shift > 0 ? 0 : -accum_shift;
+  for (int out = 0; out < output_size; out += 4) {
+    // Load the bias values
+    int32x4_t bias_vec = vld1q_s32(bias_ptr);
+    bias_ptr += 4;
+
+    // Clear accumulators. We use 2 accumulator registers per row,
+    // for 4 rows. row_accumRN is the N-th accumulator for row R.
+    int32x4_t row_accum00 = vdupq_n_s32(0);
+    int32x4_t row_accum01 = vdupq_n_s32(0);
+    int32x4_t row_accum10 = vdupq_n_s32(0);
+    int32x4_t row_accum11 = vdupq_n_s32(0);
+    int32x4_t row_accum20 = vdupq_n_s32(0);
+    int32x4_t row_accum21 = vdupq_n_s32(0);
+    int32x4_t row_accum30 = vdupq_n_s32(0);
+    int32x4_t row_accum31 = vdupq_n_s32(0);
+
+    // kReadAhead parametrizes how far ahead we prefetch weights into L1 cache.
+    const int kReadAhead = 512;
+    // Prefetch the first weights values.
+    for (int k = 0; k < kReadAhead; k += 64) {
+      optimized_ops_preload_l1_stream(weights_data + (out + 0) * input_size +
+                                      k);
+      optimized_ops_preload_l1_stream(weights_data + (out + 1) * input_size +
+                                      k);
+      optimized_ops_preload_l1_stream(weights_data + (out + 2) * input_size +
+                                      k);
+      optimized_ops_preload_l1_stream(weights_data + (out + 3) * input_size +
+                                      k);
+    }
+    // Loop along the rows, handling 64 bytes per iteration because that's
+    // cache line size on most current ARM-architecture CPUs.
+    for (int in = 0; in < input_size; in += 64) {
+      // Prefetch some future weights values.
+      optimized_ops_preload_l1_stream(weights_data + (out + 0) * input_size +
+                                      in + kReadAhead);
+      optimized_ops_preload_l1_stream(weights_data + (out + 1) * input_size +
+                                      in + kReadAhead);
+      optimized_ops_preload_l1_stream(weights_data + (out + 2) * input_size +
+                                      in + kReadAhead);
+      optimized_ops_preload_l1_stream(weights_data + (out + 3) * input_size +
+                                      in + kReadAhead);
+
+      // We will use 2 local 16-bit accumulators per row, for 2 rows.
+      // See below (*) for the rationale of processing only 2 rows at a time.
+      // local_accumRN is the N-th local accumulator for row R.
+      int16x8_t local_accum00;
+      int16x8_t local_accum01;
+      int16x8_t local_accum10;
+      int16x8_t local_accum11;
+
+      // Load 64 bytes of input activations values. Convert to signed int8
+      // by flipping the sign bit (i.e. subtracting 128, the required
+      // zero_point value).
+      int8x16_t input0 = vreinterpretq_s8_u8(
+          veorq_u8(signbit, vld1q_u8(input_data + in + 16 * 0)));
+      int8x16_t input1 = vreinterpretq_s8_u8(
+          veorq_u8(signbit, vld1q_u8(input_data + in + 16 * 1)));
+      int8x16_t input2 = vreinterpretq_s8_u8(
+          veorq_u8(signbit, vld1q_u8(input_data + in + 16 * 2)));
+      int8x16_t input3 = vreinterpretq_s8_u8(
+          veorq_u8(signbit, vld1q_u8(input_data + in + 16 * 3)));
+
+      // Beginning of the core accumulation. Notice how while we have 4
+      // rows to process, this code is taking care of only 2 rows at a time,
+      // thus being divided into two parts looking similar ("Rows 0 and 1" and
+      // "Rows 2 and 3").
+      //
+      // (*) The rationale for handling only 2 rows at a time is to avoid
+      // cache aliasing issues on 4-way set-associative L1-cache CPUs, such
+      // as Cortex-A53. With sufficiently large, power-of-two matrix dimensions,
+      // we may find ourselves in a situation where rows alias each other in
+      // the L1 cache, and moreover may also mutually alias with the input
+      // activations. If we try to load 4 rows at a time, together with the
+      // input activations, that may be 5 mutually-aliasing vectors, resulting
+      // in constant mutual eviction from L1 cache. Handling 2 rows at a time
+      // here largely mitigates these issues, and seems at least to be very
+      // effective on Cortex-A53:
+      //                          Before       After
+      // big (Cortex-A73)         2.85 ms      2.85 ms
+      // little (Cortex-A53)      11.0 ms      5.16 ms
+
+      // Rows 0 and 1:
+      // Load 64 bytes of weights values from each row. Convert to signed int8
+      // by flipping the sign bit (i.e. subtracting 128, the required
+      // zero_point value).
+      int8x16_t weights00 = vreinterpretq_s8_u8(veorq_u8(
+          signbit,
+          vld1q_u8(weights_data + (out + 0) * input_size + in + 16 * 0)));
+      int8x16_t weights01 = vreinterpretq_s8_u8(veorq_u8(
+          signbit,
+          vld1q_u8(weights_data + (out + 0) * input_size + in + 16 * 1)));
+      int8x16_t weights02 = vreinterpretq_s8_u8(veorq_u8(
+          signbit,
+          vld1q_u8(weights_data + (out + 0) * input_size + in + 16 * 2)));
+      int8x16_t weights03 = vreinterpretq_s8_u8(veorq_u8(
+          signbit,
+          vld1q_u8(weights_data + (out + 0) * input_size + in + 16 * 3)));
+      int8x16_t weights10 = vreinterpretq_s8_u8(veorq_u8(
+          signbit,
+          vld1q_u8(weights_data + (out + 1) * input_size + in + 16 * 0)));
+      int8x16_t weights11 = vreinterpretq_s8_u8(veorq_u8(
+          signbit,
+          vld1q_u8(weights_data + (out + 1) * input_size + in + 16 * 1)));
+      int8x16_t weights12 = vreinterpretq_s8_u8(veorq_u8(
+          signbit,
+          vld1q_u8(weights_data + (out + 1) * input_size + in + 16 * 2)));
+      int8x16_t weights13 = vreinterpretq_s8_u8(veorq_u8(
+          signbit,
+          vld1q_u8(weights_data + (out + 1) * input_size + in + 16 * 3)));
+      // Multiply-accumulate into local 16-bit accumulators.
+      // We can accumulate two products without overflow because weights are
+      // required to never be -128, so each product is at most 127^2 in absolute
+      // value.
+      local_accum00 = vmull_s8(vget_low_s8(weights00), vget_low_s8(input0));
+      local_accum01 = vmull_s8(vget_low_s8(weights01), vget_low_s8(input1));
+      local_accum10 = vmull_s8(vget_low_s8(weights10), vget_low_s8(input0));
+      local_accum11 = vmull_s8(vget_low_s8(weights11), vget_low_s8(input1));
+      local_accum00 = vmlal_s8(local_accum00, vget_high_s8(weights00),
+                               vget_high_s8(input0));
+      local_accum01 = vmlal_s8(local_accum01, vget_high_s8(weights01),
+                               vget_high_s8(input1));
+      local_accum10 = vmlal_s8(local_accum10, vget_high_s8(weights10),
+                               vget_high_s8(input0));
+      local_accum11 = vmlal_s8(local_accum11, vget_high_s8(weights11),
+                               vget_high_s8(input1));
+      // Pairwise add and accumulate into 32-bit accumulators
+      row_accum00 = vpadalq_s16(row_accum00, local_accum00);
+      row_accum01 = vpadalq_s16(row_accum01, local_accum01);
+      row_accum10 = vpadalq_s16(row_accum10, local_accum10);
+      row_accum11 = vpadalq_s16(row_accum11, local_accum11);
+      // Multiply-accumulate into local 16-bit accumulators.
+      // We can accumulate two products without overflow because weights are
+      // required to never be -128, so each product is at most 127^2 in absolute
+      // value.
+      local_accum00 = vmull_s8(vget_low_s8(weights02), vget_low_s8(input2));
+      local_accum01 = vmull_s8(vget_low_s8(weights03), vget_low_s8(input3));
+      local_accum10 = vmull_s8(vget_low_s8(weights12), vget_low_s8(input2));
+      local_accum11 = vmull_s8(vget_low_s8(weights13), vget_low_s8(input3));
+      local_accum00 = vmlal_s8(local_accum00, vget_high_s8(weights02),
+                               vget_high_s8(input2));
+      local_accum01 = vmlal_s8(local_accum01, vget_high_s8(weights03),
+                               vget_high_s8(input3));
+      local_accum10 = vmlal_s8(local_accum10, vget_high_s8(weights12),
+                               vget_high_s8(input2));
+      local_accum11 = vmlal_s8(local_accum11, vget_high_s8(weights13),
+                               vget_high_s8(input3));
+      // Pairwise add and accumulate into 32-bit accumulators
+      row_accum00 = vpadalq_s16(row_accum00, local_accum00);
+      row_accum01 = vpadalq_s16(row_accum01, local_accum01);
+      row_accum10 = vpadalq_s16(row_accum10, local_accum10);
+      row_accum11 = vpadalq_s16(row_accum11, local_accum11);
+
+      // Rows 2 and 3:
+      // Load 64 bytes of weights values from each row. Convert to signed int8
+      // by flipping the sign bit (i.e. subtracting 128, the required
+      // zero_point value).
+      weights00 = vreinterpretq_s8_u8(veorq_u8(
+          signbit,
+          vld1q_u8(weights_data + (out + 2) * input_size + in + 16 * 0)));
+      weights01 = vreinterpretq_s8_u8(veorq_u8(
+          signbit,
+          vld1q_u8(weights_data + (out + 2) * input_size + in + 16 * 1)));
+      weights02 = vreinterpretq_s8_u8(veorq_u8(
+          signbit,
+          vld1q_u8(weights_data + (out + 2) * input_size + in + 16 * 2)));
+      weights03 = vreinterpretq_s8_u8(veorq_u8(
+          signbit,
+          vld1q_u8(weights_data + (out + 2) * input_size + in + 16 * 3)));
+      weights10 = vreinterpretq_s8_u8(veorq_u8(
+          signbit,
+          vld1q_u8(weights_data + (out + 3) * input_size + in + 16 * 0)));
+      weights11 = vreinterpretq_s8_u8(veorq_u8(
+          signbit,
+          vld1q_u8(weights_data + (out + 3) * input_size + in + 16 * 1)));
+      weights12 = vreinterpretq_s8_u8(veorq_u8(
+          signbit,
+          vld1q_u8(weights_data + (out + 3) * input_size + in + 16 * 2)));
+      weights13 = vreinterpretq_s8_u8(veorq_u8(
+          signbit,
+          vld1q_u8(weights_data + (out + 3) * input_size + in + 16 * 3)));
+      // Multiply-accumulate into local 16-bit accumulators.
+      // We can accumulate two products without overflow because weights are
+      // required to never be -128, so each product is at most 127^2 in absolute
+      // value.
+      local_accum00 = vmull_s8(vget_low_s8(weights00), vget_low_s8(input0));
+      local_accum01 = vmull_s8(vget_low_s8(weights01), vget_low_s8(input1));
+      local_accum10 = vmull_s8(vget_low_s8(weights10), vget_low_s8(input0));
+      local_accum11 = vmull_s8(vget_low_s8(weights11), vget_low_s8(input1));
+      local_accum00 = vmlal_s8(local_accum00, vget_high_s8(weights00),
+                               vget_high_s8(input0));
+      local_accum01 = vmlal_s8(local_accum01, vget_high_s8(weights01),
+                               vget_high_s8(input1));
+      local_accum10 = vmlal_s8(local_accum10, vget_high_s8(weights10),
+                               vget_high_s8(input0));
+      local_accum11 = vmlal_s8(local_accum11, vget_high_s8(weights11),
+                               vget_high_s8(input1));
+      // Pairwise add and accumulate into 32-bit accumulators
+      row_accum20 = vpadalq_s16(row_accum20, local_accum00);
+      row_accum21 = vpadalq_s16(row_accum21, local_accum01);
+      row_accum30 = vpadalq_s16(row_accum30, local_accum10);
+      row_accum31 = vpadalq_s16(row_accum31, local_accum11);
+      // Multiply-accumulate into local 16-bit accumulators.
+      // We can accumulate two products without overflow because weights are
+      // required to never be -128, so each product is at most 127^2 in absolute
+      // value.
+      local_accum00 = vmull_s8(vget_low_s8(weights02), vget_low_s8(input2));
+      local_accum01 = vmull_s8(vget_low_s8(weights03), vget_low_s8(input3));
+      local_accum10 = vmull_s8(vget_low_s8(weights12), vget_low_s8(input2));
+      local_accum11 = vmull_s8(vget_low_s8(weights13), vget_low_s8(input3));
+      local_accum00 = vmlal_s8(local_accum00, vget_high_s8(weights02),
+                               vget_high_s8(input2));
+      local_accum01 = vmlal_s8(local_accum01, vget_high_s8(weights03),
+                               vget_high_s8(input3));
+      local_accum10 = vmlal_s8(local_accum10, vget_high_s8(weights12),
+                               vget_high_s8(input2));
+      local_accum11 = vmlal_s8(local_accum11, vget_high_s8(weights13),
+                               vget_high_s8(input3));
+      // Pairwise add and accumulate into 32-bit accumulators
+      row_accum20 = vpadalq_s16(row_accum20, local_accum00);
+      row_accum21 = vpadalq_s16(row_accum21, local_accum01);
+      row_accum30 = vpadalq_s16(row_accum30, local_accum10);
+      row_accum31 = vpadalq_s16(row_accum31, local_accum11);
+    }
+
+    row_accum00 = vaddq_s32(row_accum00, row_accum01);
+    row_accum10 = vaddq_s32(row_accum10, row_accum11);
+    row_accum20 = vaddq_s32(row_accum20, row_accum21);
+    row_accum30 = vaddq_s32(row_accum30, row_accum31);
+    // Horizontally reduce accumulators
+    int32x2_t pairwise_reduced_acc_0, pairwise_reduced_acc_1,
+        pairwise_reduced_acc_2, pairwise_reduced_acc_3;
+    pairwise_reduced_acc_0 =
+        vpadd_s32(vget_low_s32(row_accum00), vget_high_s32(row_accum00));
+    pairwise_reduced_acc_1 =
+        vpadd_s32(vget_low_s32(row_accum10), vget_high_s32(row_accum10));
+    pairwise_reduced_acc_2 =
+        vpadd_s32(vget_low_s32(row_accum20), vget_high_s32(row_accum20));
+    pairwise_reduced_acc_3 =
+        vpadd_s32(vget_low_s32(row_accum30), vget_high_s32(row_accum30));
+    const int32x2_t reduced_lo =
+        vpadd_s32(pairwise_reduced_acc_0, pairwise_reduced_acc_1);
+    const int32x2_t reduced_hi =
+        vpadd_s32(pairwise_reduced_acc_2, pairwise_reduced_acc_3);
+    int32x4_t reduced = vcombine_s32(reduced_lo, reduced_hi);
+    // Add bias values.
+    reduced = vaddq_s32(reduced, bias_vec);
+    reduced = vshlq_s32(reduced, vdupq_n_s32(left_shift));
+    // Multiply by the fixed-point multiplier.
+    reduced = vqrdmulhq_n_s32(reduced, accum_multiplier);
+    // Rounding-shift-right.
+    using gemmlowp::RoundingDivideByPOT;
+    reduced = RoundingDivideByPOT(reduced, right_shift);
+    // Narrow values down to 16 bit signed.
+    const int16x4_t res16 = vqmovn_s32(reduced);
+    vst1_s16(output_ptr, res16);
+    output_ptr += 4;
+  }
+}
+#endif
+
 inline void FullyConnected(const float* input_data, const Dims<4>& input_dims,
                            const float* weights_data,
                            const Dims<4>& weights_dims, const float* bias_data,
@@ -559,14 +867,6 @@ void FullyConnected(const float* input_data, const Dims<4>& input_dims,
                  output_data, output_dims);
 }
 
-inline void preload_l1_stream(const uint8* ptr) {
-#ifdef GEMMLOWP_ARM_64
-  asm volatile("prfm pldl1strm, [%[ptr]]\n" ::[ptr] "r"(ptr) :);
-#else
-  gemmlowp::Prefetch(ptr);
-#endif
-}
-
 #ifdef USE_NEON
 inline void FullyConnectedAsGEMV(
     const uint8* input_data, const Dims<4>& input_dims, int32 input_offset,
@@ -587,10 +887,10 @@ inline void FullyConnectedAsGEMV(
   const int output_size = MatchingArraySize(filter_dims, 1, output_dims, 0);
   static constexpr int kPeel = 4;
   for (int k = 0; k < input_size; k += 64) {
-    preload_l1_stream(input_data + k);
+    optimized_ops_preload_l1_stream(input_data + k);
   }
   for (int k = 0; k < kPeel * input_size; k += 64) {
-    preload_l1_stream(filter_data + k);
+    optimized_ops_preload_l1_stream(filter_data + k);
   }
   TFLITE_DCHECK(!(output_size % kPeel));
   const int32* bias_ptr = bias_data;
@@ -609,7 +909,7 @@ inline void FullyConnectedAsGEMV(
       for (int k = 0; k < kPeel; k++) {
         const uint8* filter_ptr = filter_data + in + (out + k) * input_size;
         filter_val_u8[k] = vld1q_u8(filter_ptr);
-        preload_l1_stream(filter_ptr + 64);
+        optimized_ops_preload_l1_stream(filter_ptr + 64);
       }
       int16x8_t input_val[2];
       const uint8x8_t low = vget_low_u8(input_val_u8);
@@ -834,13 +1134,22 @@ inline void FullyConnected(
   // the output range is [-2^3, 2^3] == [-8, 8]. The rationale for that
   // is explained in the function comment above.
 #ifdef GEMMLOWP_NEON
-  if (batches == 1 && !(output_depth % 4) && !(accum_depth % 8) &&
-      input_offset == -128 && output_activation_min == -32768 &&
+  if (batches == 1 && input_offset == -128 && output_activation_min == -32768 &&
       output_activation_max == 32767) {
-    GEMVForLstmCell(input_data, input_dims, filter_data, filter_dims,
-                    filter_offset, bias_data_int32, bias_dims,
-                    output_multiplier, -output_shift, output_data, output_dims);
-    return;
+    if (filter_offset == -128 && !(output_depth % 4) && !(accum_depth % 64)) {
+      GEMVForLstmCellWithSymmetricRange(input_data, input_dims, filter_data,
+                                        filter_dims, bias_data_int32, bias_dims,
+                                        output_multiplier, -output_shift,
+                                        output_data, output_dims);
+      return;
+    }
+    if (!(output_depth % 4) && !(accum_depth % 8)) {
+      GEMVForLstmCell(input_data, input_dims, filter_data, filter_dims,
+                      filter_offset, bias_data_int32, bias_dims,
+                      output_multiplier, -output_shift, output_data,
+                      output_dims);
+      return;
+    }
   }
 #endif
   gemmlowp::MatrixMap<const uint8, gemmlowp::MapOrder::RowMajor> weights_matrix(
@@ -2597,6 +2906,62 @@ void Concatenation(int concat_dim, const Scalar* const* input_data,
   }
 }
 
+// TODO(prabhumk): This is the same as the reference implementation.
+// TODO(prabhumk): The quantized implementation of concatentation isn't fully
+// quantized as it takes scale as a floating point value. This should be fixed
+// when optimizng this routine further.
+inline void Concatenation(int concat_dim, const uint8* const* input_data,
+                          const Dims<4>* const* input_dims,
+                          const int32* input_zeropoint,
+                          const float* input_scale, int inputs_count,
+                          uint8* output_data, const Dims<4>& output_dims,
+                          const int32 output_zeropoint,
+                          const float output_scale) {
+  // The arguments input_zeropoint and input_scale are expected to be an array
+  // that have the quantization paramaters for all the inputs to the concat
+  // operator.
+  gemmlowp::ScopedProfilingLabel label("Concatenation");
+  TFLITE_DCHECK_GT(inputs_count, 1);
+  int concat_size = 0;
+  for (int i = 0; i < inputs_count; i++) {
+    for (int j = 0; j < 4; j++) {
+      if (j != concat_dim) {
+        MatchingArraySize(*input_dims[i], j, output_dims, j);
+      }
+    }
+    concat_size += ArraySize(*input_dims[i], concat_dim);
+  }
+  TFLITE_DCHECK_EQ(concat_size, ArraySize(output_dims, concat_dim));
+  int outer_size = 1;
+  for (int i = concat_dim + 1; i < 4; i++) {
+    outer_size *= output_dims.sizes[i];
+  }
+  const float inverse_output_scale = 1.f / output_scale;
+  uint8* output_ptr = output_data;
+  for (int k = 0; k < outer_size; k++) {
+    for (int i = 0; i < inputs_count; ++i) {
+      const int copy_size =
+          input_dims[i]->sizes[concat_dim] * input_dims[i]->strides[concat_dim];
+      const uint8* input_ptr = input_data[i] + k * copy_size;
+      if (input_zeropoint[i] == output_zeropoint &&
+          input_scale[i] == output_scale) {
+        memcpy(output_ptr, input_ptr, copy_size);
+      } else {
+        const float scale = input_scale[i] * inverse_output_scale;
+        const float bias = -input_zeropoint[i] * scale;
+        for (int j = 0; j < copy_size; ++j) {
+          const int32_t value =
+              static_cast<int32_t>(round(input_ptr[j] * scale + bias)) +
+              output_zeropoint;
+          output_ptr[j] =
+              static_cast<uint8_t>(std::max(std::min(255, value), 0));
+        }
+      }
+      output_ptr += copy_size;
+    }
+  }
+}
+
 template <FusedActivationFunctionType Ac, typename Scalar>
 void DepthConcatenation(const Scalar* const* input_data,
                         const Dims<4>* const* input_dims, int inputs_count,
@@ -3771,6 +4136,7 @@ gemmlowp::ScopedProfilingLabel label("Softmax/8bit");
 // optimized yet.
 inline void LogSoftmax(const float* input_data, const Dims<4>& input_dims,
                        float* output_data, const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("LogSoftmax");
   const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
   const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
   const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
@@ -3804,6 +4170,94 @@ inline void LogSoftmax(const float* input_data, const Dims<4>& input_dims,
   }
 }
 
+// Currently just a copy of the reference code.
+inline void LogSoftmax(const uint8* input_data, const Dims<4>& input_dims,
+                       int32 input_multiplier, int32 input_left_shift,
+                       int32 reverse_scaling_divisor,
+                       int32 reverse_scaling_right_shift, int diff_min,
+                       uint8* output_data, const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("LogSoftmax/Uint8");
+  // The representation chosen for the input to the exp() function is Q5.26.
+  // We need to leave extra space since values that we skip might be as large as
+  // -32 before multiplying by input_beta_multiplier, and therefore as large as
+  // -16 afterwards.  Note that exp(-8) is definitely not insignificant to
+  // accumulation, but exp(-16) definitely is.
+  static constexpr int kScaledDiffIntegerBits = 5;
+  static constexpr int kAccumulationIntegerBits = 12;
+  static constexpr int kOutputIntegerBits = 4;
+  using FixedPointScaledDiff =
+      gemmlowp::FixedPoint<int32, kScaledDiffIntegerBits>;
+  using FixedPointAccum = gemmlowp::FixedPoint<int32, kAccumulationIntegerBits>;
+  using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>;
+
+  const int outer_size = MatchingFlatSizeSkipDim(input_dims, 0, output_dims);
+  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
+
+  for (int i = 0; i < outer_size; ++i) {
+    uint8 max_in_row = 0;
+    for (int c = 0; c < depth; ++c) {
+      max_in_row = std::max(max_in_row, input_data[i * depth + c]);
+    }
+
+    FixedPointAccum sum_of_exps = FixedPointAccum::Zero();
+    for (int c = 0; c < depth; ++c) {
+      int32 input_diff =
+          static_cast<int32>(input_data[i * depth + c]) - max_in_row;
+      if (input_diff >= diff_min) {
+        const int32 input_diff_rescaled =
+            MultiplyByQuantizedMultiplierGreaterThanOne(
+                input_diff, input_multiplier, input_left_shift);
+        const FixedPointScaledDiff scaled_diff_f8 =
+            FixedPointScaledDiff::FromRaw(input_diff_rescaled);
+        sum_of_exps = sum_of_exps + gemmlowp::Rescale<kAccumulationIntegerBits>(
+                                        exp_on_negative_values(scaled_diff_f8));
+      }
+    }
+
+    // TODO(b/77858996): Implement fixed-point log().
+    // Not a fully-quantized implementation: floating-point log().
+    const float float_log_sum_of_exps =
+        std::log(static_cast<float>(sum_of_exps.raw()) /
+                 (1 << (31 - kAccumulationIntegerBits)));
+    const int32 fixed_log_sum_of_exps = static_cast<int32>(TfLiteRound(
+        float_log_sum_of_exps * (1 << (31 - kScaledDiffIntegerBits))));
+
+    // rescaled_diff_min is smallest representable in
+    // Q(kScaledDiffIntegerBits).(31-kScaledDiffIntegerBits) plus the
+    // log-sub-exps that will be subtracted in the loop.
+    //
+    // The thresholds diff_min, etc are negative.
+    const int rescaled_diff_min =
+        fixed_log_sum_of_exps + std::numeric_limits<int32>::lowest();
+    const int adjusted_diff_min =
+        std::max(diff_min - 1,  // Note use of > below instead of >= above.
+                 MultiplyByQuantizedMultiplierSmallerThanOne(
+                     rescaled_diff_min, reverse_scaling_divisor,
+                     reverse_scaling_right_shift));
+
+    for (int c = 0; c < depth; ++c) {
+      int32 input_diff =
+          static_cast<int32>(input_data[i * depth + c]) - max_in_row;
+      if (input_diff > adjusted_diff_min) {
+        const int32 input_diff_rescaled =
+            MultiplyByQuantizedMultiplierGreaterThanOne(
+                input_diff, input_multiplier, input_left_shift);
+        int32 unsat_output =
+            gemmlowp::RoundingDivideByPOT(
+                (input_diff_rescaled - fixed_log_sum_of_exps),
+                31 - kScaledDiffIntegerBits - kOutputIntegerBits) +
+            255;
+
+        output_data[i * depth + c] = static_cast<uint8>(
+            std::max(std::min(unsat_output, static_cast<int32>(255)), 0));
+      } else {
+        // Set output to smallest value.
+        output_data[i * depth + c] = 0;
+      }
+    }
+  }
+}
+
 inline void Logistic(const float* input_data, const Dims<4>& input_dims,
                      float* output_data, const Dims<4>& output_dims) {
   gemmlowp::ScopedProfilingLabel label("Logistic");
@@ -3817,7 +4271,7 @@ inline void Logistic(const uint8* input_data, const Dims<4>& input_dims,
                      int32 input_zero_point, int32 input_range_radius,
                      int32 input_multiplier, int input_left_shift,
                      uint8* output_data, const Dims<4>& output_dims) {
-  gemmlowp::ScopedProfilingLabel label("Logistic");
+  gemmlowp::ScopedProfilingLabel label("Logistic/Uint8");
   /* batches */ MatchingArraySize(input_dims, 3, output_dims, 3);
   /* height */ MatchingArraySize(input_dims, 2, output_dims, 2);
   /* width */ MatchingArraySize(input_dims, 1, output_dims, 1);
@@ -3957,12 +4411,46 @@ inline void Logistic(const uint8* input_data, const Dims<4>& input_dims,
 inline void Logistic(const int16* input_data, const Dims<4>& input_dims,
                      int16* output_data, const Dims<4>& output_dims) {
   gemmlowp::ScopedProfilingLabel label("Logistic/Int16");
-  // This is a copy of the reference implementation. We do not currently have a
-  // properly optimized version.
   const int flat_size = RequiredBufferSizeForDims(output_dims);
   TFLITE_DCHECK_EQ(RequiredBufferSizeForDims(input_dims), flat_size);
 
   for (int i = 0; i < flat_size; i++) {
+  }
+
+  int c = 0;
+  const int16* input_data_ptr = input_data;
+  int16* output_data_ptr = output_data;
+#ifdef GEMMLOWP_NEON
+  {
+    // F0 uses 0 integer bits, range [-1, 1].
+    // This is the return type of math functions such as tanh, logistic,
+    // whose range is in [-1, 1].
+    using F0 = gemmlowp::FixedPoint<int16x8_t, 0>;
+    // F3 uses 3 integer bits, range [-8, 8], the input range expected here.
+    using F3 = gemmlowp::FixedPoint<int16x8_t, 3>;
+
+    for (; c <= flat_size - 16; c += 16) {
+      F3 input0 = F3::FromRaw(vld1q_s16(input_data_ptr));
+      F3 input1 = F3::FromRaw(vld1q_s16(input_data_ptr + 8));
+      F0 output0 = gemmlowp::logistic(input0);
+      F0 output1 = gemmlowp::logistic(input1);
+      vst1q_s16(output_data_ptr, output0.raw());
+      vst1q_s16(output_data_ptr + 8, output1.raw());
+
+      input_data_ptr += 16;
+      output_data_ptr += 16;
+    }
+    for (; c <= flat_size - 8; c += 8) {
+      F3 input = F3::FromRaw(vld1q_s16(input_data_ptr));
+      F0 output = gemmlowp::logistic(input);
+      vst1q_s16(output_data_ptr, output.raw());
+
+      input_data_ptr += 8;
+      output_data_ptr += 8;
+    }
+  }
+#endif
+  {
     // F0 uses 0 integer bits, range [-1, 1].
     // This is the return type of math functions such as tanh, logistic,
     // whose range is in [-1, 1].
@@ -3970,9 +4458,14 @@ inline void Logistic(const int16* input_data, const Dims<4>& input_dims,
     // F3 uses 3 integer bits, range [-8, 8], the input range expected here.
     using F3 = gemmlowp::FixedPoint<std::int16_t, 3>;
 
-    const F3 input = F3::FromRaw(input_data[i]);
-    F0 output = gemmlowp::logistic(input);
-    output_data[i] = output.raw();
+    for (; c < flat_size; ++c) {
+      F3 input = F3::FromRaw(*input_data_ptr);
+      F0 output = gemmlowp::logistic(input);
+      *output_data_ptr = output.raw();
+
+      ++input_data_ptr;
+      ++output_data_ptr;
+    }
   }
 }
 
@@ -4139,9 +4632,6 @@ inline void Tanh(const int16* input_data, const Dims<4>& input_dims,
                  int input_left_shift, int16* output_data,
                  const Dims<4>& output_dims) {
   gemmlowp::ScopedProfilingLabel label("Tanh/Int16");
-  // This is a copy of the reference implementation. We do not currently have a
-  // properly optimized version.
-
   // Support for shifts is limited until we have a parameterized version of
   // SaturatingRoundingMultiplyByPOT().
   TFLITE_DCHECK_GE(input_left_shift, 0);
@@ -4150,25 +4640,91 @@ inline void Tanh(const int16* input_data, const Dims<4>& input_dims,
   const int flat_size = RequiredBufferSizeForDims(output_dims);
   TFLITE_DCHECK_EQ(RequiredBufferSizeForDims(input_dims), flat_size);
 
-  // F0 uses 0 integer bits, range [-1, 1].
-  // This is the return type of math functions such as tanh, logistic,
-  // whose range is in [-1, 1].
-  using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
-  // F3 uses 3 integer bits, range [-8, 8], the input range expected here.
-  using F3 = gemmlowp::FixedPoint<std::int16_t, 3>;
-
-  if (input_left_shift == 0) {
-    for (int i = 0; i < flat_size; i++) {
-      F3 input = F3::FromRaw(input_data[i]);
-      F0 output = gemmlowp::tanh(input);
-      output_data[i] = output.raw();
+  int c = 0;
+  const int16* input_data_ptr = input_data;
+  int16* output_data_ptr = output_data;
+#ifdef GEMMLOWP_NEON
+  {
+    // F0 uses 0 integer bits, range [-1, 1].
+    // This is the return type of math functions such as tanh, logistic,
+    // whose range is in [-1, 1].
+    using F0 = gemmlowp::FixedPoint<int16x8_t, 0>;
+    // F3 uses 3 integer bits, range [-8, 8], the input range expected here.
+    using F3 = gemmlowp::FixedPoint<int16x8_t, 3>;
+
+    if (input_left_shift == 0) {
+      for (; c <= flat_size - 16; c += 16) {
+        F3 input0 = F3::FromRaw(vld1q_s16(input_data_ptr));
+        F3 input1 = F3::FromRaw(vld1q_s16(input_data_ptr + 8));
+        F0 output0 = gemmlowp::tanh(input0);
+        F0 output1 = gemmlowp::tanh(input1);
+        vst1q_s16(output_data_ptr, output0.raw());
+        vst1q_s16(output_data_ptr + 8, output1.raw());
+
+        input_data_ptr += 16;
+        output_data_ptr += 16;
+      }
+      for (; c <= flat_size - 8; c += 8) {
+        F3 input = F3::FromRaw(vld1q_s16(input_data_ptr));
+        F0 output = gemmlowp::tanh(input);
+        vst1q_s16(output_data_ptr, output.raw());
+
+        input_data_ptr += 8;
+        output_data_ptr += 8;
+      }
+    } else {
+      for (; c <= flat_size - 16; c += 16) {
+        F3 input0 = F3::FromRaw(gemmlowp::SaturatingRoundingMultiplyByPOT<1>(
+            vld1q_s16(input_data_ptr)));
+        F3 input1 = F3::FromRaw(gemmlowp::SaturatingRoundingMultiplyByPOT<1>(
+            vld1q_s16(input_data_ptr + 8)));
+        F0 output0 = gemmlowp::tanh(input0);
+        F0 output1 = gemmlowp::tanh(input1);
+        vst1q_s16(output_data_ptr, output0.raw());
+        vst1q_s16(output_data_ptr + 8, output1.raw());
+
+        input_data_ptr += 16;
+        output_data_ptr += 16;
+      }
+      for (; c <= flat_size - 8; c += 8) {
+        F3 input = F3::FromRaw(gemmlowp::SaturatingRoundingMultiplyByPOT<1>(
+            vld1q_s16(input_data_ptr)));
+        F0 output = gemmlowp::tanh(input);
+        vst1q_s16(output_data_ptr, output.raw());
+
+        input_data_ptr += 8;
+        output_data_ptr += 8;
+      }
     }
-  } else {
-    for (int i = 0; i < flat_size; i++) {
-      F3 input = F3::FromRaw(
-          gemmlowp::SaturatingRoundingMultiplyByPOT<1>(input_data[i]));
-      F0 output = gemmlowp::tanh(input);
-      output_data[i] = output.raw();
+  }
+#endif
+  {
+    // F0 uses 0 integer bits, range [-1, 1].
+    // This is the return type of math functions such as tanh, logistic,
+    // whose range is in [-1, 1].
+    using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
+    // F3 uses 3 integer bits, range [-8, 8], the input range expected here.
+    using F3 = gemmlowp::FixedPoint<std::int16_t, 3>;
+
+    if (input_left_shift == 0) {
+      for (; c < flat_size; ++c) {
+        F3 input = F3::FromRaw(*input_data_ptr);
+        F0 output = gemmlowp::tanh(input);
+        *output_data_ptr = output.raw();
+
+        ++input_data_ptr;
+        ++output_data_ptr;
+      }
+    } else {
+      for (; c < flat_size; ++c) {
+        F3 input = F3::FromRaw(
+            gemmlowp::SaturatingRoundingMultiplyByPOT<1>(*input_data_ptr));
+        F0 output = gemmlowp::tanh(input);
+        *output_data_ptr = output.raw();
+
+        ++input_data_ptr;
+        ++output_data_ptr;
+      }
     }
   }
 }
@@ -4195,66 +4751,23 @@ inline void Dequantize(const uint8* input_data, const Dims<4>& input_dims,
 }
 
 inline void FakeQuant(const float* input_data, const Dims<4>& input_dims,
-                      float rmin, float rmax, float* output_data,
+                      float rmin, float rmax, int num_bits, float* output_data,
                       const Dims<4>& output_dims) {
   gemmlowp::ScopedProfilingLabel label("FakeQuant");
 
   // 0 should always be a representable value. Let's assume that the initial
   // min,max range contains 0.
-  TFLITE_DCHECK_LE(rmin, 0.);
-  TFLITE_DCHECK_GE(rmax, 0.);
-
-  // Determine quantization parameters: zero_point, scale.
-  using Integer = uint8;
-  const Integer qmin = std::numeric_limits<Integer>::min();
-  const Integer qmax = std::numeric_limits<Integer>::max();
-  const float qmin_float = qmin;
-  const float qmax_float = qmax;
-  int32 zero_point = 0;
-  float scale = 0.f;
-  // If rmin==rmax, both must be zero per the above assertion,
-  // so we are done.
-  if (rmin != rmax) {
-    // First determine the scale.
-    scale = (rmax - rmin) / (qmax_float - qmin_float);
-
-    // Zero-point computation.
-    // First the initial floating-point computation. The zero-point can be
-    // determined from solving an affine equation for any known pair
-    // (real value, corresponding quantized value).
-    // We know two such pairs: (rmin, qmin) and (rmax, qmax).
-    // The arithmetic error on the zero point computed from either pair
-    // will be roughly machine_epsilon * (sum of absolute values of terms)
-    // so we want to use the variant that adds the smaller terms.
-    const float zero_point_from_min = qmin_float - rmin / scale;
-    const float zero_point_from_max = qmax_float - rmax / scale;
-    const float zero_point_from_min_error =
-        std::abs(qmin_float) + std::abs(rmin / scale);
-    const float zero_point_from_max_error =
-        std::abs(qmax_float) + std::abs(rmax / scale);
-
-    const float zero_point_float =
-        zero_point_from_min_error < zero_point_from_max_error
-            ? zero_point_from_min
-            : zero_point_from_max;
-
-    // Now we need to nudge the zero point to be an integer
-    // (our zero points are integer, and this is motivated by the requirement
-    // to be able to represent the real value "0" exactly as a quantized value,
-    // which is required in multiple places, for example in Im2col with SAME
-    // padding).
-    if (zero_point_float < qmin_float) {
-      zero_point = qmin;
-    } else if (zero_point_float > qmax_float) {
-      zero_point = qmax;
-    } else {
-      zero_point = static_cast<int32>(TfLiteRound(zero_point_float));
-    }
-    // The zero point should always be in the range of quantized value,
-    // [qmin, qmax].
-    TFLITE_DCHECK_GE(zero_point, qmin);
-    TFLITE_DCHECK_LE(zero_point, qmax);
-  }
+  TFLITE_DCHECK_LE(rmin, 0.0f);
+  TFLITE_DCHECK_GE(rmax, 0.0f);
+  TFLITE_DCHECK_LT(rmin, rmax);
+
+  // Code matches tensorflow's FakeQuantWithMinMaxArgsFunctor.
+  int quant_min = 0;
+  int quant_max = (1 << num_bits) - 1;
+  float nudged_min, nudged_max, nudged_scale;
+  NudgeQuantizationRange(rmin, rmax, quant_min, quant_max, &nudged_min,
+                         &nudged_max, &nudged_scale);
+  const float inv_nudged_scale = 1.0f / nudged_scale;
 
   const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
   const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
@@ -4265,11 +4778,12 @@ inline void FakeQuant(const float* input_data, const Dims<4>& input_dims,
       for (int x = 0; x < width; ++x) {
         for (int c = 0; c < depth; ++c) {
           const float src_val = input_data[Offset(input_dims, c, x, y, b)];
-          const float unclamped_quantized_val =
-              TfLiteRound(zero_point + src_val / scale);
-          const float quantized_val = std::min(
-              qmax_float, std::max(qmin_float, unclamped_quantized_val));
-          const float dst_val = scale * (quantized_val - zero_point);
+          const float clamped =
+              std::min(nudged_max, std::max(nudged_min, src_val));
+          const float clamped_shifted = clamped - nudged_min;
+          const float dst_val =
+              TfLiteRound(clamped_shifted * inv_nudged_scale) * nudged_scale +
+              nudged_min;
           output_data[Offset(output_dims, c, x, y, b)] = dst_val;
         }
       }
@@ -4774,7 +5288,7 @@ template <typename T>
 inline void Pad(const T* input_data, const Dims<4>& input_dims,
                 const std::vector<int>& left_paddings,
                 const std::vector<int>& right_paddings, T* output_data,
-                const Dims<4>& output_dims) {
+                const Dims<4>& output_dims, const int32_t pad_value) {
   gemmlowp::ScopedProfilingLabel label("Pad");
   const int output_batch = ArraySize(output_dims, 3);
   const int output_height = ArraySize(output_dims, 2);
@@ -4794,27 +5308,27 @@ inline void Pad(const T* input_data, const Dims<4>& input_dims,
   const int input_depth = ArraySize(input_dims, 0);
 
   if (left_b_padding != 0) {
-    memset(output_data, 0,
+    memset(output_data, pad_value,
            left_b_padding * output_height * output_width * output_depth *
                sizeof(T));
   }
   for (int out_b = left_b_padding; out_b < output_batch - right_b_padding;
        ++out_b) {
     if (left_h_padding != 0) {
-      memset(output_data + Offset(output_dims, 0, 0, 0, out_b), 0,
+      memset(output_data + Offset(output_dims, 0, 0, 0, out_b), pad_value,
              left_h_padding * output_width * output_depth * sizeof(T));
     }
     for (int out_h = left_h_padding; out_h < output_height - right_h_padding;
          ++out_h) {
       if (left_w_padding != 0) {
-        memset(output_data + Offset(output_dims, 0, 0, out_h, out_b), 0,
+        memset(output_data + Offset(output_dims, 0, 0, out_h, out_b), pad_value,
                left_w_padding * output_depth * sizeof(T));
       }
       for (int out_w = left_w_padding; out_w < output_width - right_w_padding;
            ++out_w) {
         if (left_d_padding != 0) {
-          memset(output_data + Offset(output_dims, 0, out_w, out_h, out_b), 0,
-                 left_d_padding * sizeof(T));
+          memset(output_data + Offset(output_dims, 0, out_w, out_h, out_b),
+                 pad_value, left_d_padding * sizeof(T));
         }
 
         T* out = output_data +
@@ -4828,20 +5342,21 @@ inline void Pad(const T* input_data, const Dims<4>& input_dims,
           memset(
               output_data + Offset(output_dims, output_depth - right_d_padding,
                                    out_w, out_h, out_b),
-              0, right_d_padding * sizeof(T));
+              pad_value, right_d_padding * sizeof(T));
         }
       }
       if (right_w_padding != 0) {
         memset(
             output_data + Offset(output_dims, 0, output_width - right_w_padding,
                                  out_h, out_b),
-            0, right_w_padding * output_depth * sizeof(T));
+            pad_value, right_w_padding * output_depth * sizeof(T));
       }
     }
     if (right_h_padding != 0) {
       memset(output_data + Offset(output_dims, 0, 0,
                                   output_height - right_h_padding, out_b),
-             0, right_h_padding * output_width * output_depth * sizeof(T));
+             pad_value,
+             right_h_padding * output_width * output_depth * sizeof(T));
     }
   }
   if (right_b_padding != 0) {
@@ -4853,6 +5368,15 @@ inline void Pad(const T* input_data, const Dims<4>& input_dims,
   }
 }
 
+template <typename T>
+inline void Pad(const T* input_data, const Dims<4>& input_dims,
+                const std::vector<int>& left_paddings,
+                const std::vector<int>& right_paddings, T* output_data,
+                const Dims<4>& output_dims) {
+  Pad(input_data, input_dims, left_paddings, right_paddings, output_data,
+      output_dims, 0);
+}
+
 template <typename T>
 inline void StridedSlice(const T* input_data, const Dims<4>& input_dims,
                          int begin_mask, int end_mask,
diff --git a/tensorflow/contrib/lite/kernels/internal/quantization_util.cc b/tensorflow/contrib/lite/kernels/internal/quantization_util.cc
index 18be6777a5caeb45a4ffabd8b7f1793de7b053f8..b0951aac8cbb98a181d9dcaef88770fadfc74f62 100644
--- a/tensorflow/contrib/lite/kernels/internal/quantization_util.cc
+++ b/tensorflow/contrib/lite/kernels/internal/quantization_util.cc
@@ -78,6 +78,22 @@ void PreprocessSoftmaxScaling(double beta, double input_scale,
                                    quantized_multiplier, left_shift);
 }
 
+void PreprocessLogSoftmaxScaling(double beta, double input_scale,
+                                 int input_integer_bits,
+                                 int32_t* quantized_multiplier, int* left_shift,
+                                 int32_t* reverse_scaling_divisor,
+                                 int* reverse_scaling_right_shift) {
+  PreprocessSoftmaxScaling(beta, input_scale, input_integer_bits,
+                           quantized_multiplier, left_shift);
+
+  // Also calculate what amounts to the inverse scaling factor for the input.
+  const double real_reverse_scaling_divisor =
+      (1 << (31 - *left_shift)) / static_cast<double>(*quantized_multiplier);
+  tflite::QuantizeMultiplierSmallerThanOne(real_reverse_scaling_divisor,
+                                           reverse_scaling_divisor,
+                                           reverse_scaling_right_shift);
+}
+
 int CalculateInputRadius(int input_integer_bits, int input_left_shift) {
   const double max_input_rescaled = 1.0 * ((1 << input_integer_bits) - 1) *
                                     (1ll << (31 - input_integer_bits)) /
@@ -88,4 +104,25 @@ int CalculateInputRadius(int input_integer_bits, int input_left_shift) {
   return static_cast<int>(std::floor(max_input_rescaled));
 }
 
+void NudgeQuantizationRange(const float min, const float max,
+                            const int quant_min, const int quant_max,
+                            float* nudged_min, float* nudged_max,
+                            float* scale) {
+  // This code originates from tensorflow/core/kernels/fake_quant_ops_functor.h.
+  const float quant_min_float = static_cast<float>(quant_min);
+  const float quant_max_float = static_cast<float>(quant_max);
+  *scale = (max - min) / (quant_max_float - quant_min_float);
+  const float zero_point_from_min = quant_min_float - min / *scale;
+  uint16 nudged_zero_point;
+  if (zero_point_from_min < quant_min_float) {
+    nudged_zero_point = static_cast<uint16>(quant_min);
+  } else if (zero_point_from_min > quant_max_float) {
+    nudged_zero_point = static_cast<uint16>(quant_max);
+  } else {
+    nudged_zero_point = static_cast<uint16>(TfLiteRound(zero_point_from_min));
+  }
+  *nudged_min = (quant_min_float - nudged_zero_point) * (*scale);
+  *nudged_max = (quant_max_float - nudged_zero_point) * (*scale);
+}
+
 }  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/internal/quantization_util.h b/tensorflow/contrib/lite/kernels/internal/quantization_util.h
index 9a04b76e56b2527b06f5b0ec1e75e991fd1cbdea..4a217515f142b2451ebd61e423871b95cdc09748 100644
--- a/tensorflow/contrib/lite/kernels/internal/quantization_util.h
+++ b/tensorflow/contrib/lite/kernels/internal/quantization_util.h
@@ -196,7 +196,12 @@ void QuantizeMultiplier(double double_multiplier, int32_t* quantized_multiplier,
 void PreprocessSoftmaxScaling(double beta, double input_scale,
                               int input_integer_bits,
                               int32_t* quantized_multiplier, int* left_shift);
-
+// Like PreprocessSoftmaxScaling, but inverse scaling factors also calculated.
+void PreprocessLogSoftmaxScaling(double beta, double input_scale,
+                                 int input_integer_bits,
+                                 int32_t* quantized_multiplier, int* left_shift,
+                                 int32_t* reverse_scaling_divisor,
+                                 int* reverse_scaling_right_shift);
 // Calculate the largest input that will result in a within-bounds intermediate
 // result within MultiplyByQuantizedMultiplierGreaterThanOne.  In other words,
 // it must not overflow before we reduce the value by multiplication by the
@@ -204,6 +209,14 @@ void PreprocessSoftmaxScaling(double beta, double input_scale,
 // Softmax.
 int CalculateInputRadius(int input_integer_bits, int input_left_shift);
 
+// Nudges a min/max quantization range to ensure zero is zero.
+// Gymnastics with nudged zero point is to ensure that real zero maps to
+// an integer, which is required for e.g. zero-padding in convolutional layers.
+// Outputs nudged_min, nudged_max, nudged_scale.
+void NudgeQuantizationRange(const float min, const float max,
+                            const int quant_min, const int quant_max,
+                            float* nudged_min, float* nudged_max, float* scale);
+
 }  // namespace tflite
 
 #endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_QUANTIZATION_UTIL_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
index 750737a730bbb2a385412146c854ced44f12c6eb..e8d7da73a255d1556616355420e763a3d5a5d923 100644
--- a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
@@ -27,6 +27,7 @@ limitations under the License.
 #include "fixedpoint/fixedpoint.h"
 #include "public/gemmlowp.h"
 #include "tensorflow/contrib/lite/kernels/internal/common.h"
+#include "tensorflow/contrib/lite/kernels/internal/quantization_util.h"
 #include "tensorflow/contrib/lite/kernels/internal/round.h"
 #include "tensorflow/contrib/lite/kernels/internal/types.h"
 
@@ -635,27 +636,14 @@ void NonGlobalBatchNormalization(
     const Dims<4>& offset_dims, float* output_data,
     const Dims<4>& output_dims) {
   const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
-  const int height =
-      MatchingArraySize(input_dims, 2, mean_dims, 2, multiplier_dims, 2,
-                        offset_dims, 2, output_dims, 2);
-  const int width =
-      MatchingArraySize(input_dims, 1, mean_dims, 1, multiplier_dims, 1,
-                        offset_dims, 1, output_dims, 1);
-  const int depth =
-      MatchingArraySize(input_dims, 0, mean_dims, 0, multiplier_dims, 0,
-                        offset_dims, 0, output_dims, 0);
+  const int inner_size = MatchingFlatSizeSkipDim(
+      input_dims, 3, mean_dims, multiplier_dims, offset_dims, output_dims);
 
   for (int b = 0; b < batches; ++b) {
-    for (int y = 0; y < height; ++y) {
-      for (int x = 0; x < width; ++x) {
-        for (int c = 0; c < depth; ++c) {
-          output_data[Offset(output_dims, c, x, y, b)] = ActivationFunction<Ac>(
-              (input_data[Offset(input_dims, c, x, y, b)] -
-               mean_data[Offset(mean_dims, c, x, y, 0)]) *
-                  multiplier_data[Offset(multiplier_dims, c, x, y, 0)] +
-              offset_data[Offset(offset_dims, c, x, y, 0)]);
-        }
-      }
+    for (int i = 0; i < inner_size; ++i) {
+      output_data[b * inner_size + i] = ActivationFunction<Ac>(
+          (input_data[b * inner_size + i] - mean_data[i]) * multiplier_data[i] +
+          offset_data[i]);
     }
   }
 }
@@ -669,87 +657,52 @@ void GlobalBatchNormalization(const float* input_data,
                               const float* offset_data,
                               const Dims<4>& offset_dims, float* output_data,
                               const Dims<4>& output_dims) {
-  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
-  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
-  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
+  const int outer_size = MatchingFlatSizeSkipDim(input_dims, 0, output_dims);
   const int depth =
       MatchingArraySize(input_dims, 0, mean_dims, 0, multiplier_dims, 0,
                         offset_dims, 0, output_dims, 0);
 
-  for (int b = 0; b < batches; ++b) {
-    for (int y = 0; y < height; ++y) {
-      for (int x = 0; x < width; ++x) {
-        for (int c = 0; c < depth; ++c) {
-          output_data[Offset(output_dims, c, x, y, b)] = ActivationFunction<Ac>(
-              (input_data[Offset(input_dims, c, x, y, b)] -
-               mean_data[Offset(mean_dims, c, 0, 0, 0)]) *
-                  multiplier_data[Offset(multiplier_dims, c, 0, 0, 0)] +
-              offset_data[Offset(offset_dims, c, 0, 0, 0)]);
-        }
-      }
+  for (int i = 0; i < outer_size; ++i) {
+    for (int c = 0; c < depth; ++c) {
+      output_data[depth * i + c] = ActivationFunction<Ac>(
+          (input_data[depth * i + c] - mean_data[c]) * multiplier_data[c] +
+          offset_data[c]);
     }
   }
 }
 
 inline void Relu(const float* input_data, const Dims<4>& input_dims,
                  float* output_data, const Dims<4>& output_dims) {
-  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
-  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
-  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
-  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
-  for (int b = 0; b < batches; ++b) {
-    for (int y = 0; y < height; ++y) {
-      for (int x = 0; x < width; ++x) {
-        for (int c = 0; c < depth; ++c) {
-          float val = input_data[Offset(input_dims, c, x, y, b)];
-          const float lower = 0;
-          float clamped = val < lower ? lower : val;
-          output_data[Offset(output_dims, c, x, y, b)] = clamped;
-        }
-      }
-    }
+  const int flat_size = MatchingFlatSize(input_dims, output_dims);
+  for (int i = 0; i < flat_size; ++i) {
+    const float val = input_data[i];
+    const float lower = 0;
+    const float clamped = val < lower ? lower : val;
+    output_data[i] = clamped;
   }
 }
 
 inline void Relu1(const float* input_data, const Dims<4>& input_dims,
                   float* output_data, const Dims<4>& output_dims) {
-  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
-  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
-  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
-  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
-  for (int b = 0; b < batches; ++b) {
-    for (int y = 0; y < height; ++y) {
-      for (int x = 0; x < width; ++x) {
-        for (int c = 0; c < depth; ++c) {
-          float val = input_data[Offset(input_dims, c, x, y, b)];
-          const float upper = 1;
-          const float lower = -1;
-          float clamped = val > upper ? upper : val < lower ? lower : val;
-          output_data[Offset(output_dims, c, x, y, b)] = clamped;
-        }
-      }
-    }
+  const int flat_size = MatchingFlatSize(input_dims, output_dims);
+  for (int i = 0; i < flat_size; ++i) {
+    const float val = input_data[i];
+    const float upper = 1;
+    const float lower = -1;
+    const float clamped = val > upper ? upper : val < lower ? lower : val;
+    output_data[i] = clamped;
   }
 }
 
 inline void Relu6(const float* input_data, const Dims<4>& input_dims,
                   float* output_data, const Dims<4>& output_dims) {
-  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
-  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
-  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
-  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
-  for (int b = 0; b < batches; ++b) {
-    for (int y = 0; y < height; ++y) {
-      for (int x = 0; x < width; ++x) {
-        for (int c = 0; c < depth; ++c) {
-          float val = input_data[Offset(input_dims, c, x, y, b)];
-          const float upper = 6;
-          const float lower = 0;
-          float clamped = val > upper ? upper : val < lower ? lower : val;
-          output_data[Offset(output_dims, c, x, y, b)] = clamped;
-        }
-      }
-    }
+  const int flat_size = MatchingFlatSize(input_dims, output_dims);
+  for (int i = 0; i < flat_size; ++i) {
+    const float val = input_data[i];
+    const float upper = 6;
+    const float lower = 0;
+    const float clamped = val > upper ? upper : val < lower ? lower : val;
+    output_data[i] = clamped;
   }
 }
 
@@ -757,24 +710,17 @@ template <FusedActivationFunctionType Ac>
 void L2Normalization(const float* input_data, const Dims<4>& input_dims,
                      float* output_data, const Dims<4>& output_dims) {
   static_assert(Ac == FusedActivationFunctionType::kNone, "");
-  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
-  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
-  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
+  const int outer_size = MatchingFlatSizeSkipDim(input_dims, 0, output_dims);
   const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
-  for (int b = 0; b < batches; ++b) {
-    for (int y = 0; y < height; ++y) {
-      for (int x = 0; x < width; ++x) {
-        float squared_l2_norm = 0;
-        for (int c = 0; c < depth; ++c) {
-          float val = input_data[Offset(input_dims, c, x, y, b)];
-          squared_l2_norm += val * val;
-        }
-        float l2_norm = std::sqrt(squared_l2_norm);
-        for (int c = 0; c < depth; ++c) {
-          output_data[Offset(output_dims, c, x, y, b)] =
-              input_data[Offset(input_dims, c, x, y, b)] / l2_norm;
-        }
-      }
+  for (int i = 0; i < outer_size; ++i) {
+    float squared_l2_norm = 0;
+    for (int c = 0; c < depth; ++c) {
+      const float val = input_data[depth * i + c];
+      squared_l2_norm += val * val;
+    }
+    const float l2_norm = std::sqrt(squared_l2_norm);
+    for (int c = 0; c < depth; ++c) {
+      output_data[depth * i + c] = input_data[depth * i + c] / l2_norm;
     }
   }
 }
@@ -859,26 +805,11 @@ inline void Add(const float* input1_data, const Dims<4>& input1_dims,
                 const float* input2_data, const Dims<4>& input2_dims,
                 float output_activation_min, float output_activation_max,
                 float* output_data, const Dims<4>& output_dims) {
-  const int batches =
-      MatchingArraySize(input1_dims, 3, input2_dims, 3, output_dims, 3);
-  const int height =
-      MatchingArraySize(input1_dims, 2, input2_dims, 2, output_dims, 2);
-  const int width =
-      MatchingArraySize(input1_dims, 1, input2_dims, 1, output_dims, 1);
-  const int depth =
-      MatchingArraySize(input1_dims, 0, input2_dims, 0, output_dims, 0);
-  for (int b = 0; b < batches; ++b) {
-    for (int y = 0; y < height; ++y) {
-      for (int x = 0; x < width; ++x) {
-        for (int c = 0; c < depth; ++c) {
-          output_data[Offset(output_dims, c, x, y, b)] =
-              ActivationFunctionWithMinMax(
-                  input1_data[Offset(input1_dims, c, x, y, b)] +
-                      input2_data[Offset(input2_dims, c, x, y, b)],
-                  output_activation_min, output_activation_max);
-        }
-      }
-    }
+  const int flat_size = MatchingFlatSize(input1_dims, input2_dims, output_dims);
+  for (int i = 0; i < flat_size; ++i) {
+    output_data[i] = ActivationFunctionWithMinMax(
+        input1_data[i] + input2_data[i], output_activation_min,
+        output_activation_max);
   }
 }
 
@@ -1141,26 +1072,11 @@ inline void Mul(const float* input1_data, const Dims<4>& input1_dims,
                 const float* input2_data, const Dims<4>& input2_dims,
                 float output_activation_min, float output_activation_max,
                 float* output_data, const Dims<4>& output_dims) {
-  const int batches =
-      MatchingArraySize(input1_dims, 3, input2_dims, 3, output_dims, 3);
-  const int height =
-      MatchingArraySize(input1_dims, 2, input2_dims, 2, output_dims, 2);
-  const int width =
-      MatchingArraySize(input1_dims, 1, input2_dims, 1, output_dims, 1);
-  const int depth =
-      MatchingArraySize(input1_dims, 0, input2_dims, 0, output_dims, 0);
-  for (int b = 0; b < batches; ++b) {
-    for (int y = 0; y < height; ++y) {
-      for (int x = 0; x < width; ++x) {
-        for (int c = 0; c < depth; ++c) {
-          output_data[Offset(output_dims, c, x, y, b)] =
-              ActivationFunctionWithMinMax(
-                  input1_data[Offset(input1_dims, c, x, y, b)] *
-                      input2_data[Offset(input2_dims, c, x, y, b)],
-                  output_activation_min, output_activation_max);
-        }
-      }
-    }
+  const int flat_size = MatchingFlatSize(input1_dims, input2_dims, output_dims);
+  for (int i = 0; i < flat_size; ++i) {
+    output_data[i] = ActivationFunctionWithMinMax(
+        input1_data[i] * input2_data[i], output_activation_min,
+        output_activation_max);
   }
 }
 
@@ -1407,50 +1323,15 @@ void BroadcastDiv(const T* input1_data, const Dims<4>& input1_dims,
   }
 }
 
-inline void BroadcastDiv(const uint8* input1_data, const Dims<4>& input1_dims,
-                         int32 input1_offset, const uint8* input2_data,
-                         const Dims<4>& input2_dims, int32 input2_offset,
-                         int32 output_offset, int32 output_multiplier,
-                         int output_shift, int32 output_activation_min,
-                         int32 output_activation_max, uint8* output_data,
-                         const Dims<4>& output_dims) {
-  gemmlowp::ScopedProfilingLabel label("BroadcastDiv/8bit");
-
-  NdArrayDesc<4> desc1;
-  NdArrayDesc<4> desc2;
-  NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);
-
-  // In Tensorflow, the dimensions are canonically named (batch_number, row,
-  // col, channel), with extents (batches, height, width, depth), with the
-  // trailing dimension changing most rapidly (channels has the smallest
-  // stride, typically 1 element).
-  //
-  // In generated C code, we store arrays with the dimensions reversed. The
-  // first dimension has smallest stride.
-  //
-  // We name our variables by their Tensorflow convention, but generate C code
-  // nesting loops such that the innermost loop has the smallest stride for
-  // the best cache behavior.
-  for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
-    for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
-      for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
-        for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
-          const int32 input1_val =
-              input1_offset + input1_data[SubscriptToIndex(desc1, c, x, y, b)];
-          const int32 input2_val =
-              input2_offset + input2_data[SubscriptToIndex(desc2, c, x, y, b)];
-          const int32 unclamped_result =
-              output_offset +
-              MultiplyByQuantizedMultiplierSmallerThanOne(
-                  input1_val / input2_val, output_multiplier, output_shift);
-          const int32 clamped_output =
-              std::min(output_activation_max,
-                       std::max(output_activation_min, unclamped_result));
-          output_data[Offset(output_dims, c, x, y, b)] =
-              static_cast<uint8>(clamped_output);
-        }
-      }
-    }
+inline void Div(const float* input1_data, const Dims<4>& input1_dims,
+                const float* input2_data, const Dims<4>& input2_dims,
+                float output_activation_min, float output_activation_max,
+                float* output_data, const Dims<4>& output_dims) {
+  const int flat_size = MatchingFlatSize(input1_dims, input2_dims, output_dims);
+  for (int i = 0; i < flat_size; ++i) {
+    output_data[i] = ActivationFunctionWithMinMax(
+        input1_data[i] / input2_data[i], output_activation_min,
+        output_activation_max);
   }
 }
 
@@ -1458,26 +1339,11 @@ inline void Sub(const float* input1_data, const Dims<4>& input1_dims,
                 const float* input2_data, const Dims<4>& input2_dims,
                 float output_activation_min, float output_activation_max,
                 float* output_data, const Dims<4>& output_dims) {
-  const int batches =
-      MatchingArraySize(input1_dims, 3, input2_dims, 3, output_dims, 3);
-  const int height =
-      MatchingArraySize(input1_dims, 2, input2_dims, 2, output_dims, 2);
-  const int width =
-      MatchingArraySize(input1_dims, 1, input2_dims, 1, output_dims, 1);
-  const int depth =
-      MatchingArraySize(input1_dims, 0, input2_dims, 0, output_dims, 0);
-  for (int b = 0; b < batches; ++b) {
-    for (int y = 0; y < height; ++y) {
-      for (int x = 0; x < width; ++x) {
-        for (int c = 0; c < depth; ++c) {
-          output_data[Offset(output_dims, c, x, y, b)] =
-              ActivationFunctionWithMinMax(
-                  input1_data[Offset(input1_dims, c, x, y, b)] -
-                      input2_data[Offset(input2_dims, c, x, y, b)],
-                  output_activation_min, output_activation_max);
-        }
-      }
-    }
+  const int flat_size = MatchingFlatSize(input1_dims, input2_dims, output_dims);
+  for (int i = 0; i < flat_size; ++i) {
+    output_data[i] = ActivationFunctionWithMinMax(
+        input1_data[i] - input2_data[i], output_activation_min,
+        output_activation_max);
   }
 }
 
@@ -1613,6 +1479,61 @@ void Concatenation(int concat_dim, const Scalar* const* input_data,
   }
 }
 
+// TODO(prabhumk): This is the same as the optimized implementation.
+// TODO(prabhumk): The quantized implementation of concatentation isn't fully
+// quantized as it takes scale as a floating point value. This should be fixed
+// when optimizng this routine further.
+inline void Concatenation(int concat_dim, const uint8* const* input_data,
+                          const Dims<4>* const* input_dims,
+                          const int32* input_zeropoint,
+                          const float* input_scale, int inputs_count,
+                          uint8* output_data, const Dims<4>& output_dims,
+                          const int32 output_zeropoint,
+                          const float output_scale) {
+  // The arguments input_zeropoint and input_scale are expected to be an array
+  // that have the quantization paramaters for all the inputs to the concat
+  // operator.
+  TFLITE_DCHECK_GT(inputs_count, 1);
+  int64_t concat_size = 0;
+  for (int i = 0; i < inputs_count; i++) {
+    for (int j = 0; j < 4; j++) {
+      if (j != concat_dim) {
+        MatchingArraySize(*input_dims[i], j, output_dims, j);
+      }
+    }
+    concat_size += ArraySize(*input_dims[i], concat_dim);
+  }
+  TFLITE_DCHECK_EQ(concat_size, ArraySize(output_dims, concat_dim));
+  int64_t outer_size = 1;
+  for (int i = concat_dim + 1; i < 4; i++) {
+    outer_size *= output_dims.sizes[i];
+  }
+  const float inverse_output_scale = 1.f / output_scale;
+  uint8* output_ptr = output_data;
+  for (int k = 0; k < outer_size; k++) {
+    for (int i = 0; i < inputs_count; ++i) {
+      const int copy_size =
+          input_dims[i]->sizes[concat_dim] * input_dims[i]->strides[concat_dim];
+      const uint8* input_ptr = input_data[i] + k * copy_size;
+      if (input_zeropoint[i] == output_zeropoint &&
+          input_scale[i] == output_scale) {
+        memcpy(output_ptr, input_ptr, copy_size);
+      } else {
+        const float scale = input_scale[i] * inverse_output_scale;
+        const float bias = -input_zeropoint[i] * scale;
+        for (int j = 0; j < copy_size; ++j) {
+          const int32_t value =
+              static_cast<int32_t>(round(input_ptr[j] * scale + bias)) +
+              output_zeropoint;
+          output_ptr[j] =
+              static_cast<uint8_t>(std::max(std::min(255, value), 0));
+        }
+      }
+      output_ptr += copy_size;
+    }
+  }
+}
+
 template <FusedActivationFunctionType Ac, typename Scalar>
 void DepthConcatenation(const Scalar* const* input_data,
                         const Dims<4>* const* input_dims, int inputs_count,
@@ -1804,15 +1725,9 @@ void LstmCell(const uint8* input_data_uint8, const Dims<4>& input_dims,
   (void)gemm_context;  // only used in optimized code.
 
   // Gather dimensions information, and perform consistency checks.
-  const int batches =
-      MatchingArraySize(input_dims, 3, prev_activ_dims, 3, prev_state_dims, 3,
-                        output_state_dims, 3, output_activ_dims, 3);
-  const int height =
-      MatchingArraySize(input_dims, 2, prev_activ_dims, 2, prev_state_dims, 2,
-                        output_state_dims, 2, output_activ_dims, 2);
-  const int width =
-      MatchingArraySize(input_dims, 1, prev_activ_dims, 1, prev_state_dims, 1,
-                        output_state_dims, 1, output_activ_dims, 1);
+  const int outer_size =
+      MatchingFlatSizeSkipDim(input_dims, 0, prev_activ_dims, prev_state_dims,
+                              output_state_dims, output_activ_dims);
   TFLITE_CHECK_EQ(ArraySize(weights_dims, 2), 1);
   TFLITE_CHECK_EQ(ArraySize(weights_dims, 3), 1);
   const int input_depth = ArraySize(input_dims, 0);
@@ -1828,9 +1743,7 @@ void LstmCell(const uint8* input_data_uint8, const Dims<4>& input_dims,
       MatchingArraySize(prev_state_dims, 0, prev_activ_dims, 0,
                         output_state_dims, 0, output_activ_dims, 0);
   TFLITE_CHECK_EQ(output_depth, intern_activ_depth / 4);
-  const int fc_batches = ArraySize(activ_temp_dims, 1) *
-                         ArraySize(activ_temp_dims, 2) *
-                         ArraySize(activ_temp_dims, 3);
+  const int fc_batches = FlatSizeSkipDim(activ_temp_dims, 0);
   const int fc_output_depth =
       MatchingArraySize(weights_dims, 1, activ_temp_dims, 0);
   const int fc_accum_depth = ArraySize(weights_dims, 0);
@@ -1875,7 +1788,6 @@ void LstmCell(const uint8* input_data_uint8, const Dims<4>& input_dims,
 
   // Rest of the LSTM cell: tanh and logistic math functions, and some adds
   // and muls, all done in 16-bit fixed-point.
-  const int outer_size = batches * width * height;
   for (int b = 0; b < outer_size; ++b) {
     for (int c = 0; c < output_depth; ++c) {
       // Define the fixed-point data types that we will use here. All use
@@ -2410,28 +2322,20 @@ inline void LocalResponseNormalization(const float* input_data,
                                        float bias, float alpha, float beta,
                                        float* output_data,
                                        const Dims<4>& output_dims) {
-  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
-  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
-  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
+  const int outer_size = MatchingFlatSizeSkipDim(input_dims, 0, output_dims);
   const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
 
-  for (int b = 0; b < batches; ++b) {
-    for (int y = 0; y < height; ++y) {
-      for (int x = 0; x < width; ++x) {
-        for (int c = 0; c < depth; ++c) {
-          const int begin_input_c = std::max(0, c - range);
-          const int end_input_c = std::min(depth, c + range);
-          float accum = 0.f;
-          for (int input_c = begin_input_c; input_c < end_input_c; ++input_c) {
-            const float input_val =
-                input_data[Offset(input_dims, input_c, x, y, b)];
-            accum += input_val * input_val;
-          }
-          const float multiplier = std::pow(bias + alpha * accum, -beta);
-          output_data[Offset(output_dims, c, x, y, b)] =
-              input_data[Offset(input_dims, c, x, y, b)] * multiplier;
-        }
+  for (int i = 0; i < outer_size; ++i) {
+    for (int c = 0; c < depth; ++c) {
+      const int begin_input_c = std::max(0, c - range);
+      const int end_input_c = std::min(depth, c + range);
+      float accum = 0.f;
+      for (int input_c = begin_input_c; input_c < end_input_c; ++input_c) {
+        const float input_val = input_data[i * depth + input_c];
+        accum += input_val * input_val;
       }
+      const float multiplier = std::pow(bias + alpha * accum, -beta);
+      output_data[i * depth + c] = input_data[i * depth + c] * multiplier;
     }
   }
 }
@@ -2439,37 +2343,28 @@ inline void LocalResponseNormalization(const float* input_data,
 inline void Softmax(const float* input_data, const Dims<4>& input_dims,
                     float beta, float* output_data,
                     const Dims<4>& output_dims) {
-  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
-  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
-  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
+  const int outer_size = MatchingFlatSizeSkipDim(input_dims, 0, output_dims);
   const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
 
-  for (int b = 0; b < batches; ++b) {
-    for (int y = 0; y < height; ++y) {
-      for (int x = 0; x < width; ++x) {
-        // Find max element value which we'll use to ensure numerical stability
-        // taking advantage of the following equality:
-        // exp(x[i])/sum(exp(x[i])) == exp(x[i]+C)/sum(exp(x[i]+C))
-        float max = std::numeric_limits<float>::lowest();
-        for (int c = 0; c < depth; ++c) {
-          max = std::max(max, input_data[Offset(input_dims, c, x, y, b)]);
-        }
+  for (int i = 0; i < outer_size; ++i) {
+    // Find max element value which we'll use to ensure numerical stability
+    // taking advantage of the following equality:
+    // exp(x[i])/sum(exp(x[i])) == exp(x[i]+C)/sum(exp(x[i]+C))
+    float max = std::numeric_limits<float>::lowest();
+    for (int c = 0; c < depth; ++c) {
+      max = std::max(max, input_data[i * depth + c]);
+    }
 
-        // Compute sum.
-        float sum = 0.f;
-        for (int c = 0; c < depth; ++c) {
-          sum += std::exp((input_data[Offset(input_dims, c, x, y, b)] - max) *
-                          beta);
-        }
+    // Compute sum.
+    float sum = 0.f;
+    for (int c = 0; c < depth; ++c) {
+      sum += std::exp((input_data[i * depth + c] - max) * beta);
+    }
 
-        // Compute result.
-        for (int c = 0; c < depth; ++c) {
-          output_data[Offset(output_dims, c, x, y, b)] =
-              std::exp((input_data[Offset(input_dims, c, x, y, b)] - max) *
-                       beta) /
-              sum;
-        }
-      }
+    // Compute result.
+    for (int c = 0; c < depth; ++c) {
+      output_data[i * depth + c] =
+          std::exp((input_data[i * depth + c] - max) * beta) / sum;
     }
   }
 }
@@ -2490,73 +2385,63 @@ inline void Softmax(const uint8* input_data, const Dims<4>& input_dims,
   using FixedPointAccum = gemmlowp::FixedPoint<int32, kAccumulationIntegerBits>;
   using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>;
 
-  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
-  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
-  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
+  const int outer_size = MatchingFlatSizeSkipDim(input_dims, 0, output_dims);
   const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
 
-  for (int b = 0; b < batches; ++b) {
-    for (int x = 0; x < width; ++x) {
-      for (int y = 0; y < height; ++y) {
-        uint8 max_in_row = 0;
-        for (int c = 0; c < depth; ++c) {
-          max_in_row =
-              std::max(max_in_row, input_data[Offset(input_dims, c, x, y, b)]);
-        }
-
-        FixedPointAccum sum_of_exps = FixedPointAccum::Zero();
-        for (int c = 0; c < depth; ++c) {
-          int32 input_diff =
-              static_cast<int32>(input_data[Offset(input_dims, c, x, y, b)]) -
-              max_in_row;
-          if (input_diff >= diff_min) {
-            const int32 input_diff_rescaled =
-                MultiplyByQuantizedMultiplierGreaterThanOne(
-                    input_diff, input_beta_multiplier, input_beta_left_shift);
-            const FixedPointScaledDiff scaled_diff_f8 =
-                FixedPointScaledDiff::FromRaw(input_diff_rescaled);
-            sum_of_exps =
-                sum_of_exps + gemmlowp::Rescale<kAccumulationIntegerBits>(
-                                  exp_on_negative_values(scaled_diff_f8));
-          }
-        }
+  for (int i = 0; i < outer_size; ++i) {
+    uint8 max_in_row = 0;
+    for (int c = 0; c < depth; ++c) {
+      max_in_row = std::max(max_in_row, input_data[i * depth + c]);
+    }
+
+    FixedPointAccum sum_of_exps = FixedPointAccum::Zero();
+    for (int c = 0; c < depth; ++c) {
+      int32 input_diff =
+          static_cast<int32>(input_data[i * depth + c]) - max_in_row;
+      if (input_diff >= diff_min) {
+        const int32 input_diff_rescaled =
+            MultiplyByQuantizedMultiplierGreaterThanOne(
+                input_diff, input_beta_multiplier, input_beta_left_shift);
+        const FixedPointScaledDiff scaled_diff_f8 =
+            FixedPointScaledDiff::FromRaw(input_diff_rescaled);
+        sum_of_exps = sum_of_exps + gemmlowp::Rescale<kAccumulationIntegerBits>(
+                                        exp_on_negative_values(scaled_diff_f8));
+      }
+    }
+
+    int32 fixed_sum_of_exps = sum_of_exps.raw();
+    int headroom_plus_one =
+        CountLeadingZeros(static_cast<uint32>(fixed_sum_of_exps));
+    // This is the number of bits to the left of the binary point above 1.0.
+    // Consider fixed_sum_of_exps=1.25.  In that case shifted_scale=0.8 and
+    // no later adjustment will be needed.
+    int num_bits_over_unit = kAccumulationIntegerBits - headroom_plus_one;
+    int32 shifted_sum_minus_one = static_cast<int32>(
+        (static_cast<uint32>(fixed_sum_of_exps) << headroom_plus_one) -
+        (static_cast<uint32>(1) << 31));
+
+    FixedPoint0 shifted_scale = gemmlowp::one_over_one_plus_x_for_x_in_0_1(
+        FixedPoint0::FromRaw(shifted_sum_minus_one));
+
+    for (int c = 0; c < depth; ++c) {
+      int32 input_diff =
+          static_cast<int32>(input_data[i * depth + c]) - max_in_row;
+      if (input_diff >= diff_min) {
+        const int32 input_diff_rescaled =
+            MultiplyByQuantizedMultiplierGreaterThanOne(
+                input_diff, input_beta_multiplier, input_beta_left_shift);
+        const FixedPointScaledDiff scaled_diff_f8 =
+            FixedPointScaledDiff::FromRaw(input_diff_rescaled);
+
+        FixedPoint0 exp_in_0 = exp_on_negative_values(scaled_diff_f8);
+        int32 unsat_output = gemmlowp::RoundingDivideByPOT(
+            (shifted_scale * exp_in_0).raw(), num_bits_over_unit + 31 - 8);
+
+        output_data[i * depth + c] = static_cast<uint8>(
+            std::max(std::min(unsat_output, static_cast<int32>(255)), 0));
 
-        int32 fixed_sum_of_exps = sum_of_exps.raw();
-        int headroom_plus_one =
-            CountLeadingZeros(static_cast<uint32>(fixed_sum_of_exps));
-        // This is the number of bits to the left of the binary point above 1.0.
-        // Consider fixed_sum_of_exps=1.25.  In that case shifted_scale=0.8 and
-        // no later adjustment will be needed.
-        int num_bits_over_unit = kAccumulationIntegerBits - headroom_plus_one;
-        int32 shifted_sum_minus_one = static_cast<int32>(
-            (static_cast<uint32>(fixed_sum_of_exps) << headroom_plus_one) -
-            (static_cast<uint32>(1) << 31));
-
-        FixedPoint0 shifted_scale = gemmlowp::one_over_one_plus_x_for_x_in_0_1(
-            FixedPoint0::FromRaw(shifted_sum_minus_one));
-
-        for (int c = 0; c < depth; ++c) {
-          int32 input_diff =
-              static_cast<int32>(input_data[Offset(input_dims, c, x, y, b)]) -
-              max_in_row;
-          if (input_diff >= diff_min) {
-            const int32 input_diff_rescaled =
-                MultiplyByQuantizedMultiplierGreaterThanOne(
-                    input_diff, input_beta_multiplier, input_beta_left_shift);
-            const FixedPointScaledDiff scaled_diff_f8 =
-                FixedPointScaledDiff::FromRaw(input_diff_rescaled);
-
-            FixedPoint0 exp_in_0 = exp_on_negative_values(scaled_diff_f8);
-            int32 unsat_output = gemmlowp::RoundingDivideByPOT(
-                (shifted_scale * exp_in_0).raw(), num_bits_over_unit + 31 - 8);
-
-            output_data[Offset(output_dims, c, x, y, b)] = static_cast<uint8>(
-                std::max(std::min(unsat_output, static_cast<int32>(255)), 0));
-
-          } else {
-            output_data[Offset(output_dims, c, x, y, b)] = 0;
-          }
-        }
+      } else {
+        output_data[i * depth + c] = 0;
       }
     }
   }
@@ -2564,109 +2449,170 @@ inline void Softmax(const uint8* input_data, const Dims<4>& input_dims,
 
 inline void LogSoftmax(const float* input_data, const Dims<4>& input_dims,
                        float* output_data, const Dims<4>& output_dims) {
-  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
-  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
-  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
+  const int outer_size = MatchingFlatSizeSkipDim(input_dims, 0, output_dims);
   const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
 
-  for (int b = 0; b < batches; ++b) {
-    for (int y = 0; y < height; ++y) {
-      for (int x = 0; x < width; ++x) {
-        // Find max element value which we'll use to ensure numerical stability
-        // taking advantage of the following equality:
-        // log(exp(x[i])/sum(exp(x[i]))) == log(exp(x[i]+C)/sum(exp(x[i]+C)))
-        float max = std::numeric_limits<float>::lowest();
-        for (int c = 0; c < depth; ++c) {
-          max = std::max(max, input_data[Offset(input_dims, c, x, y, b)]);
-        }
+  for (int i = 0; i < outer_size; ++i) {
+    // Find max element value which we'll use to ensure numerical stability
+    // taking advantage of the following equality:
+    // log(exp(x[i])/sum(exp(x[i]))) == log(exp(x[i]+C)/sum(exp(x[i]+C)))
+    float max = std::numeric_limits<float>::lowest();
+    for (int c = 0; c < depth; ++c) {
+      max = std::max(max, input_data[i * depth + c]);
+    }
 
-        // Compute sum.
-        float sum = 0.f;
-        for (int c = 0; c < depth; ++c) {
-          sum += std::exp(input_data[Offset(input_dims, c, x, y, b)] - max);
-        }
+    // Compute sum.
+    float sum = 0.f;
+    for (int c = 0; c < depth; ++c) {
+      sum += std::exp(input_data[i * depth + c] - max);
+    }
 
-        // Compute result.
-        const float log_sum = std::log(sum);
-        for (int c = 0; c < depth; ++c) {
-          output_data[Offset(output_dims, c, x, y, b)] =
-              input_data[Offset(input_dims, c, x, y, b)] - max - log_sum;
-        }
-      }
+    // Compute result.
+    const float log_sum = std::log(sum);
+    for (int c = 0; c < depth; ++c) {
+      output_data[i * depth + c] = input_data[i * depth + c] - max - log_sum;
     }
   }
 }
 
-inline void Logistic(const float* input_data, const Dims<4>& input_dims,
-                     float* output_data, const Dims<4>& output_dims) {
-  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
-  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
-  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
+inline void LogSoftmax(const uint8* input_data, const Dims<4>& input_dims,
+                       int32 input_multiplier, int32 input_left_shift,
+                       int32 reverse_scaling_divisor,
+                       int32 reverse_scaling_right_shift, int diff_min,
+                       uint8* output_data, const Dims<4>& output_dims) {
+  // The representation chosen for the input to the exp() function is Q5.26.
+  // We need to leave extra space since values that we skip might be as large as
+  // -32 before multiplying by input_beta_multiplier, and therefore as large as
+  // -16 afterwards.  Note that exp(-8) is definitely not insignificant to
+  // accumulation, but exp(-16) definitely is.
+  static constexpr int kScaledDiffIntegerBits = 5;
+  static constexpr int kAccumulationIntegerBits = 12;
+  static constexpr int kOutputIntegerBits = 4;
+  using FixedPointScaledDiff =
+      gemmlowp::FixedPoint<int32, kScaledDiffIntegerBits>;
+  using FixedPointAccum = gemmlowp::FixedPoint<int32, kAccumulationIntegerBits>;
+  using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>;
+
+  const int outer_size = MatchingFlatSizeSkipDim(input_dims, 0, output_dims);
   const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
-  for (int b = 0; b < batches; ++b) {
-    for (int y = 0; y < height; ++y) {
-      for (int x = 0; x < width; ++x) {
-        for (int c = 0; c < depth; ++c) {
-          float val = input_data[Offset(input_dims, c, x, y, b)];
-          float result = 1.f / (1.f + std::exp(-val));
-          output_data[Offset(output_dims, c, x, y, b)] = result;
-        }
+
+  for (int i = 0; i < outer_size; ++i) {
+    uint8 max_in_row = 0;
+    for (int c = 0; c < depth; ++c) {
+      max_in_row = std::max(max_in_row, input_data[i * depth + c]);
+    }
+
+    FixedPointAccum sum_of_exps = FixedPointAccum::Zero();
+    for (int c = 0; c < depth; ++c) {
+      int32 input_diff =
+          static_cast<int32>(input_data[i * depth + c]) - max_in_row;
+      if (input_diff >= diff_min) {
+        const int32 input_diff_rescaled =
+            MultiplyByQuantizedMultiplierGreaterThanOne(
+                input_diff, input_multiplier, input_left_shift);
+        const FixedPointScaledDiff scaled_diff_f8 =
+            FixedPointScaledDiff::FromRaw(input_diff_rescaled);
+        sum_of_exps = sum_of_exps + gemmlowp::Rescale<kAccumulationIntegerBits>(
+                                        exp_on_negative_values(scaled_diff_f8));
+      }
+    }
+
+    // TODO(b/77858996): Implement fixed-point log().
+    // Not a fully-quantized implementation: floating-point log().
+    const float float_log_sum_of_exps =
+        std::log(static_cast<float>(sum_of_exps.raw()) /
+                 (1 << (31 - kAccumulationIntegerBits)));
+    const int32 fixed_log_sum_of_exps = static_cast<int32>(TfLiteRound(
+        float_log_sum_of_exps * (1 << (31 - kScaledDiffIntegerBits))));
+
+    // rescaled_diff_min is smallest representable in
+    // Q(kScaledDiffIntegerBits).(31-kScaledDiffIntegerBits) plus the
+    // log-sub-exps that will be subtracted in the loop.
+    //
+    // The thresholds diff_min, etc are negative.
+    const int rescaled_diff_min =
+        fixed_log_sum_of_exps + std::numeric_limits<int32>::lowest();
+    const int adjusted_diff_min =
+        std::max(diff_min - 1,  // Note use of > below instead of >= above.
+                 MultiplyByQuantizedMultiplierSmallerThanOne(
+                     rescaled_diff_min, reverse_scaling_divisor,
+                     reverse_scaling_right_shift));
+
+    for (int c = 0; c < depth; ++c) {
+      int32 input_diff =
+          static_cast<int32>(input_data[i * depth + c]) - max_in_row;
+      if (input_diff > adjusted_diff_min) {
+        const int32 input_diff_rescaled =
+            MultiplyByQuantizedMultiplierGreaterThanOne(
+                input_diff, input_multiplier, input_left_shift);
+        int32 unsat_output =
+            gemmlowp::RoundingDivideByPOT(
+                (input_diff_rescaled - fixed_log_sum_of_exps),
+                31 - kScaledDiffIntegerBits - kOutputIntegerBits) +
+            255;
+
+        output_data[i * depth + c] = static_cast<uint8>(
+            std::max(std::min(unsat_output, static_cast<int32>(255)), 0));
+      } else {
+        // Set output to smallest value.
+        output_data[i * depth + c] = 0;
       }
     }
   }
 }
 
+inline void Logistic(const float* input_data, const Dims<4>& input_dims,
+                     float* output_data, const Dims<4>& output_dims) {
+  const int flat_size = MatchingFlatSize(output_dims, input_dims);
+
+  for (int i = 0; i < flat_size; i++) {
+    float val = input_data[i];
+    float result = 1.f / (1.f + std::exp(-val));
+    output_data[i] = result;
+  }
+}
+
 inline void Logistic(const uint8* input_data, const Dims<4>& input_dims,
                      int32 input_zero_point, int32 input_range_radius,
                      int32 input_multiplier, int input_left_shift,
                      uint8* output_data, const Dims<4>& output_dims) {
-  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
-  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
-  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
-  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
-  for (int b = 0; b < batches; ++b) {
-    for (int y = 0; y < height; ++y) {
-      for (int x = 0; x < width; ++x) {
-        for (int c = 0; c < depth; ++c) {
-          const uint8 input_val_u8 = input_data[Offset(input_dims, c, x, y, b)];
-          const int32 input_val_centered =
-              static_cast<int32>(input_val_u8) - input_zero_point;
-          uint8 output_val;
-          if (input_val_centered <= -input_range_radius) {
-            output_val = 0;
-          } else if (input_val_centered >= input_range_radius) {
-            output_val = 255;
-          } else {
-            const int32 input_val_rescaled =
-                MultiplyByQuantizedMultiplierGreaterThanOne(
-                    input_val_centered, input_multiplier, input_left_shift);
-            using FixedPoint4 = gemmlowp::FixedPoint<int32, 4>;
-            using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>;
-            const FixedPoint4 input_val_f4 =
-                FixedPoint4::FromRaw(input_val_rescaled);
-            const FixedPoint0 output_val_f0 = gemmlowp::logistic(input_val_f4);
-            // Convert from Q0.31 to Q23.8.
-            using gemmlowp::RoundingDivideByPOT;
-            int32 output_val_s32 = RoundingDivideByPOT(output_val_f0.raw(), 23);
-            if (output_val_s32 == 256) {
-              output_val_s32 = 255;
-            }
-            // Reinterpret as U0.8.
-            TFLITE_DCHECK_GE(output_val_s32, 0);
-            TFLITE_DCHECK_LE(output_val_s32, 255);
-            output_val = static_cast<uint8>(output_val_s32);
-          }
-          output_data[Offset(output_dims, c, x, y, b)] = output_val;
-        }
+  const int flat_size = MatchingFlatSize(output_dims, input_dims);
+
+  for (int i = 0; i < flat_size; i++) {
+    const uint8 input_val_u8 = input_data[i];
+    const int32 input_val_centered =
+        static_cast<int32>(input_val_u8) - input_zero_point;
+    uint8 output_val;
+    if (input_val_centered <= -input_range_radius) {
+      output_val = 0;
+    } else if (input_val_centered >= input_range_radius) {
+      output_val = 255;
+    } else {
+      const int32 input_val_rescaled =
+          MultiplyByQuantizedMultiplierGreaterThanOne(
+              input_val_centered, input_multiplier, input_left_shift);
+      using FixedPoint4 = gemmlowp::FixedPoint<int32, 4>;
+      using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>;
+      const FixedPoint4 input_val_f4 = FixedPoint4::FromRaw(input_val_rescaled);
+      const FixedPoint0 output_val_f0 = gemmlowp::logistic(input_val_f4);
+      // Convert from Q0.31 to Q23.8.
+      using gemmlowp::RoundingDivideByPOT;
+      int32 output_val_s32 = RoundingDivideByPOT(output_val_f0.raw(), 23);
+      if (output_val_s32 == 256) {
+        output_val_s32 = 255;
       }
+      // Reinterpret as U0.8.
+      TFLITE_DCHECK_GE(output_val_s32, 0);
+      TFLITE_DCHECK_LE(output_val_s32, 255);
+      output_val = static_cast<uint8>(output_val_s32);
     }
+    output_data[i] = output_val;
   }
 }
 
 inline void Logistic(const int16* input_data, const Dims<4>& input_dims,
                      int16* output_data, const Dims<4>& output_dims) {
-  const int flat_size = RequiredBufferSizeForDims(output_dims);
-  TFLITE_DCHECK_EQ(RequiredBufferSizeForDims(input_dims), flat_size);
+  const int flat_size = MatchingFlatSize(output_dims, input_dims);
 
   for (int i = 0; i < flat_size; i++) {
     // F0 uses 0 integer bits, range [-1, 1].
@@ -2684,20 +2630,12 @@ inline void Logistic(const int16* input_data, const Dims<4>& input_dims,
 
 inline void Tanh(const float* input_data, const Dims<4>& input_dims,
                  float* output_data, const Dims<4>& output_dims) {
-  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
-  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
-  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
-  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
-  for (int b = 0; b < batches; ++b) {
-    for (int y = 0; y < height; ++y) {
-      for (int x = 0; x < width; ++x) {
-        for (int c = 0; c < depth; ++c) {
-          float val = input_data[Offset(input_dims, c, x, y, b)];
-          float result = std::tanh(val);
-          output_data[Offset(output_dims, c, x, y, b)] = result;
-        }
-      }
-    }
+  const int flat_size = MatchingFlatSize(output_dims, input_dims);
+
+  for (int i = 0; i < flat_size; i++) {
+    float val = input_data[i];
+    float result = std::tanh(val);
+    output_data[i] = result;
   }
 }
 
@@ -2706,47 +2644,38 @@ inline void Tanh(const uint8* input_data, const Dims<4>& input_dims,
                  int32 input_multiplier, int input_left_shift,
                  uint8* output_data, const Dims<4>& output_dims) {
   const int32 output_zero_point = 128;
-  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
-  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
-  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
-  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
-  for (int b = 0; b < batches; ++b) {
-    for (int y = 0; y < height; ++y) {
-      for (int x = 0; x < width; ++x) {
-        for (int c = 0; c < depth; ++c) {
-          const uint8 input_val_u8 = input_data[Offset(input_dims, c, x, y, b)];
-          const int32 input_val_centered =
-              static_cast<int32>(input_val_u8) - input_zero_point;
-          uint8 output_val;
-          if (input_val_centered <= -input_range_radius) {
-            output_val = 0;
-          } else if (input_val_centered >= input_range_radius) {
-            output_val = 255;
-          } else {
-            const int32 input_val_rescaled =
-                MultiplyByQuantizedMultiplierGreaterThanOne(
-                    input_val_centered, input_multiplier, input_left_shift);
-            using FixedPoint4 = gemmlowp::FixedPoint<int32, 4>;
-            using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>;
-            const FixedPoint4 input_val_f4 =
-                FixedPoint4::FromRaw(input_val_rescaled);
-            const FixedPoint0 output_val_f0 = gemmlowp::tanh(input_val_f4);
-            // Convert from Q0.31 to Q24.7.
-            using gemmlowp::RoundingDivideByPOT;
-            int32 output_val_s32 = RoundingDivideByPOT(output_val_f0.raw(), 24);
-            output_val_s32 += output_zero_point;
-            if (output_val_s32 == 256) {
-              output_val_s32 = 255;
-            }
-            // Reinterpret as Q0.7, encoded in uint8.
-            TFLITE_DCHECK_GE(output_val_s32, 0);
-            TFLITE_DCHECK_LE(output_val_s32, 255);
-            output_val = static_cast<uint8>(output_val_s32);
-          }
-          output_data[Offset(output_dims, c, x, y, b)] = output_val;
-        }
+  const int flat_size = MatchingFlatSize(output_dims, input_dims);
+
+  for (int i = 0; i < flat_size; i++) {
+    const uint8 input_val_u8 = input_data[i];
+    const int32 input_val_centered =
+        static_cast<int32>(input_val_u8) - input_zero_point;
+    uint8 output_val;
+    if (input_val_centered <= -input_range_radius) {
+      output_val = 0;
+    } else if (input_val_centered >= input_range_radius) {
+      output_val = 255;
+    } else {
+      const int32 input_val_rescaled =
+          MultiplyByQuantizedMultiplierGreaterThanOne(
+              input_val_centered, input_multiplier, input_left_shift);
+      using FixedPoint4 = gemmlowp::FixedPoint<int32, 4>;
+      using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>;
+      const FixedPoint4 input_val_f4 = FixedPoint4::FromRaw(input_val_rescaled);
+      const FixedPoint0 output_val_f0 = gemmlowp::tanh(input_val_f4);
+      // Convert from Q0.31 to Q24.7.
+      using gemmlowp::RoundingDivideByPOT;
+      int32 output_val_s32 = RoundingDivideByPOT(output_val_f0.raw(), 24);
+      output_val_s32 += output_zero_point;
+      if (output_val_s32 == 256) {
+        output_val_s32 = 255;
       }
+      // Reinterpret as Q0.7, encoded in uint8.
+      TFLITE_DCHECK_GE(output_val_s32, 0);
+      TFLITE_DCHECK_LE(output_val_s32, 255);
+      output_val = static_cast<uint8>(output_val_s32);
     }
+    output_data[i] = output_val;
   }
 }
 
@@ -2758,8 +2687,7 @@ inline void Tanh(const int16* input_data, const Dims<4>& input_dims,
   TFLITE_DCHECK_GE(input_left_shift, 0);
   TFLITE_DCHECK_LE(input_left_shift, 1);
 
-  const int flat_size = RequiredBufferSizeForDims(output_dims);
-  TFLITE_DCHECK_EQ(RequiredBufferSizeForDims(input_dims), flat_size);
+  const int flat_size = MatchingFlatSize(output_dims, input_dims);
 
   // F0 uses 0 integer bits, range [-1, 1].
   // This is the return type of math functions such as tanh, logistic,
@@ -2787,138 +2715,62 @@ inline void Tanh(const int16* input_data, const Dims<4>& input_dims,
 inline void Dequantize(const uint8* input_data, const Dims<4>& input_dims,
                        int32 zero_point, double scale, float* output_data,
                        const Dims<4>& output_dims) {
-  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
-  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
-  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
-  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
-  for (int b = 0; b < batches; ++b) {
-    for (int y = 0; y < height; ++y) {
-      for (int x = 0; x < width; ++x) {
-        for (int c = 0; c < depth; ++c) {
-          int32 val = input_data[Offset(input_dims, c, x, y, b)];
-          float result = static_cast<float>(scale * (val - zero_point));
-          output_data[Offset(output_dims, c, x, y, b)] = result;
-        }
-      }
-    }
+  const int flat_size = MatchingFlatSize(output_dims, input_dims);
+
+  for (int i = 0; i < flat_size; i++) {
+    int32 val = input_data[i];
+    float result = static_cast<float>(scale * (val - zero_point));
+    output_data[i] = result;
   }
 }
 
 inline void FakeQuant(const float* input_data, const Dims<4>& input_dims,
-                      float rmin, float rmax, float* output_data,
+                      float rmin, float rmax, int num_bits, float* output_data,
                       const Dims<4>& output_dims) {
   // 0 should always be a representable value. Let's assume that the initial
   // min,max range contains 0.
-  TFLITE_DCHECK_LE(rmin, 0.);
-  TFLITE_DCHECK_GE(rmax, 0.);
-
-  // Determine quantization parameters: zero_point, scale.
-  using Integer = uint8;
-  const Integer qmin = std::numeric_limits<Integer>::min();
-  const Integer qmax = std::numeric_limits<Integer>::max();
-  const float qmin_float = qmin;
-  const float qmax_float = qmax;
-  int32 zero_point = 0;
-  float scale = 0.f;
-  // If rmin==rmax, both must be zero per the above assertion,
-  // so we are done.
-  if (rmin != rmax) {
-    // First determine the scale.
-    scale = (rmax - rmin) / (qmax_float - qmin_float);
-
-    // Zero-point computation.
-    // First the initial floating-point computation. The zero-point can be
-    // determined from solving an affine equation for any known pair
-    // (real value, corresponding quantized value).
-    // We know two such pairs: (rmin, qmin) and (rmax, qmax).
-    // The arithmetic error on the zero point computed from either pair
-    // will be roughly machine_epsilon * (sum of absolute values of terms)
-    // so we want to use the variant that adds the smaller terms.
-    const float zero_point_from_min = qmin_float - rmin / scale;
-    const float zero_point_from_max = qmax_float - rmax / scale;
-    const float zero_point_from_min_error =
-        std::abs(qmin_float) + std::abs(rmin / scale);
-    const float zero_point_from_max_error =
-        std::abs(qmax_float) + std::abs(rmax / scale);
-
-    const float zero_point_float =
-        zero_point_from_min_error < zero_point_from_max_error
-            ? zero_point_from_min
-            : zero_point_from_max;
-
-    // Now we need to nudge the zero point to be an integer
-    // (our zero points are integer, and this is motivated by the requirement
-    // to be able to represent the real value "0" exactly as a quantized value,
-    // which is required in multiple places, for example in Im2col with SAME
-    // padding).
-    if (zero_point_float < qmin_float) {
-      zero_point = qmin;
-    } else if (zero_point_float > qmax_float) {
-      zero_point = qmax;
-    } else {
-      zero_point = static_cast<int32>(TfLiteRound(zero_point_float));
-    }
-    // The zero point should always be in the range of quantized value,
-    // [qmin, qmax].
-    TFLITE_DCHECK_GE(zero_point, qmin);
-    TFLITE_DCHECK_LE(zero_point, qmax);
-  }
-
-  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
-  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
-  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
-  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
-  for (int b = 0; b < batches; ++b) {
-    for (int y = 0; y < height; ++y) {
-      for (int x = 0; x < width; ++x) {
-        for (int c = 0; c < depth; ++c) {
-          const float src_val = input_data[Offset(input_dims, c, x, y, b)];
-          const float unclamped_quantized_val =
-              TfLiteRound(zero_point + src_val / scale);
-          const float quantized_val = std::min(
-              qmax_float, std::max(qmin_float, unclamped_quantized_val));
-          const float dst_val = scale * (quantized_val - zero_point);
-          output_data[Offset(output_dims, c, x, y, b)] = dst_val;
-        }
-      }
-    }
+  TFLITE_DCHECK_LE(rmin, 0.0f);
+  TFLITE_DCHECK_GE(rmax, 0.0f);
+  TFLITE_DCHECK_LT(rmin, rmax);
+
+  // Code matches tensorflow's FakeQuantWithMinMaxArgsFunctor.
+  int quant_min = 0;
+  int quant_max = (1 << num_bits) - 1;
+  float nudged_min, nudged_max, nudged_scale;
+  NudgeQuantizationRange(rmin, rmax, quant_min, quant_max, &nudged_min,
+                         &nudged_max, &nudged_scale);
+  const float inv_nudged_scale = 1.0f / nudged_scale;
+
+  const int flat_size = MatchingFlatSize(output_dims, input_dims);
+  for (int i = 0; i < flat_size; i++) {
+    const float src_val = input_data[i];
+    const float clamped = std::min(nudged_max, std::max(nudged_min, src_val));
+    const float clamped_shifted = clamped - nudged_min;
+    const float dst_val =
+        TfLiteRound(clamped_shifted * inv_nudged_scale) * nudged_scale +
+        nudged_min;
+    output_data[i] = dst_val;
   }
 }
 
 template <typename SrcT, typename DstT>
 inline void Cast(const SrcT* input_data, const Dims<4>& input_dims,
                  DstT* output_data, const Dims<4>& output_dims) {
-  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
-  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
-  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
-  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
-  for (int b = 0; b < batches; ++b) {
-    for (int y = 0; y < height; ++y) {
-      for (int x = 0; x < width; ++x) {
-        for (int c = 0; c < depth; ++c) {
-          int offset = Offset(input_dims, c, x, y, b);
-          output_data[offset] = static_cast<DstT>(input_data[offset]);
-        }
-      }
-    }
+  const int flat_size = MatchingFlatSize(output_dims, input_dims);
+
+  for (int i = 0; i < flat_size; i++) {
+    int offset = i;
+    output_data[offset] = static_cast<DstT>(input_data[offset]);
   }
 }
 
 inline void Floor(const float* input_data, const Dims<4>& input_dims,
                   float* output_data, const Dims<4>& output_dims) {
-  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
-  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
-  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
-  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
-  for (int b = 0; b < batches; ++b) {
-    for (int y = 0; y < height; ++y) {
-      for (int x = 0; x < width; ++x) {
-        for (int c = 0; c < depth; ++c) {
-          int offset = Offset(input_dims, c, x, y, b);
-          output_data[offset] = std::floor(input_data[offset]);
-        }
-      }
-    }
+  const int flat_size = MatchingFlatSize(output_dims, input_dims);
+
+  for (int i = 0; i < flat_size; i++) {
+    int offset = i;
+    output_data[offset] = std::floor(input_data[offset]);
   }
 }
 
@@ -3078,7 +2930,7 @@ template <typename T>
 inline void Pad(const T* input_data, const Dims<4>& input_dims,
                 const std::vector<int>& left_paddings,
                 const std::vector<int>& right_paddings, T* output_data,
-                const Dims<4>& output_dims) {
+                const Dims<4>& output_dims, const int32_t pad_value) {
   const int output_batch = ArraySize(output_dims, 3);
   const int output_height = ArraySize(output_dims, 2);
   const int output_width = ArraySize(output_dims, 1);
@@ -3108,7 +2960,7 @@ inline void Pad(const T* input_data, const Dims<4>& input_dims,
               out_w >= output_width - right_w_padding ||
               out_d < left_d_padding ||
               out_d >= output_depth - right_d_padding) {
-            *out_ptr++ = 0;
+            *out_ptr++ = static_cast<T>(pad_value);
           } else {
             *out_ptr++ = *in_ptr++;
           }
@@ -3118,6 +2970,15 @@ inline void Pad(const T* input_data, const Dims<4>& input_dims,
   }
 }
 
+template <typename T>
+inline void Pad(const T* input_data, const Dims<4>& input_dims,
+                const std::vector<int>& left_paddings,
+                const std::vector<int>& right_paddings, T* output_data,
+                const Dims<4>& output_dims) {
+  Pad(input_data, input_dims, left_paddings, right_paddings, output_data,
+      output_dims, 0);
+}
+
 inline bool LoopCondition(int index, int stop, int stride) {
   return stride > 0 ? index < stop : index > stop;
 }
@@ -3231,19 +3092,20 @@ inline void Exp(const T* input_data, const size_t num_elements,
   }
 }
 
-template <typename T>
-inline void Mean(T* input_data, const int* input_dims, const int input_num_dims,
+template <typename T, typename U>
+inline bool Mean(T* input_data, const int* input_dims, const int input_num_dims,
                  T* output_data, const int* output_dims,
                  const int output_num_dims, const int* axis,
                  const int num_axis_dimensions, bool keep_dims, int* temp_index,
-                 int* resolved_axis) {
+                 int* resolved_axis, U* temp_sum) {
   // resets output data.
   size_t num_outputs = 1;
   for (int idx = 0; idx < output_num_dims; ++idx) {
     num_outputs *= static_cast<size_t>(output_dims[idx]);
   }
   for (size_t idx = 0; idx < num_outputs; ++idx) {
-    output_data[idx] = 0;
+    output_data[idx] = T();
+    temp_sum[idx] = U();
   }
   // resets temp index.
   for (int idx = 0; idx < input_num_dims; ++idx) {
@@ -3276,19 +3138,24 @@ inline void Mean(T* input_data, const int* input_dims, const int input_num_dims,
     size_t output_offset =
         ReducedOutputOffset(input_num_dims, input_dims, temp_index,
                             num_resolved_axis, resolved_axis);
-    output_data[output_offset] += input_data[input_offset];
+    temp_sum[output_offset] += static_cast<U>(input_data[input_offset]);
   }
   // takes average by num of elements added to get mean.
   size_t num_elements_in_axis = 1;
   for (int idx = 0; idx < num_resolved_axis; ++idx) {
-    num_elements_in_axis *= static_cast<size_t>(input_dims[resolved_axis[idx]]);
+    size_t current = static_cast<size_t>(input_dims[resolved_axis[idx]]);
+    if (current > (std::numeric_limits<U>::max() / num_elements_in_axis)) {
+      return false;
+    }
+    num_elements_in_axis *= current;
   }
   if (num_elements_in_axis > 0) {
     for (size_t idx = 0; idx < num_outputs; ++idx) {
-      output_data[idx] = static_cast<T>(static_cast<float>(output_data[idx]) /
-                                        num_elements_in_axis);
+      output_data[idx] =
+          static_cast<T>(temp_sum[idx] / static_cast<U>(num_elements_in_axis));
     }
   }
+  return true;
 }
 
 template <typename T>
@@ -3361,23 +3228,11 @@ template <typename T>
 void TensorFlowMinimum(const T* input1_data, const Dims<4>& input1_dims,
                        const T* input2_data, T* output_data,
                        const Dims<4>& output_dims) {
-  int batches = MatchingArraySize(input1_dims, 3, output_dims, 3);
-  int input_height = MatchingArraySize(input1_dims, 2, output_dims, 2);
-  int input_width = MatchingArraySize(input1_dims, 1, output_dims, 1);
-  int depth = MatchingArraySize(input1_dims, 0, output_dims, 0);
+  const int flat_size = MatchingFlatSize(output_dims, input1_dims);
 
   auto min_value = input2_data[0];
-
-  for (int b = 0; b < batches; b++) {
-    for (int y = 0; y < input_height; y++) {
-      for (int x = 0; x < input_width; x++) {
-        for (int c = 0; c < depth; c++) {
-          int offset = Offset(input1_dims, c, x, y, b);
-          output_data[offset] =
-              input1_data[offset] > min_value ? min_value : input1_data[offset];
-        }
-      }
-    }
+  for (int i = 0; i < flat_size; i++) {
+    output_data[i] = input1_data[i] > min_value ? min_value : input1_data[i];
   }
 }
 
@@ -3385,30 +3240,19 @@ template <typename T>
 void TensorFlowMaximum(const T* input1_data, const Dims<4>& input1_dims,
                        const T* input2_data, T* output_data,
                        const Dims<4>& output_dims) {
-  int batches = MatchingArraySize(input1_dims, 3, output_dims, 3);
-  int input_height = MatchingArraySize(input1_dims, 2, output_dims, 2);
-  int input_width = MatchingArraySize(input1_dims, 1, output_dims, 1);
-  int depth = MatchingArraySize(input1_dims, 0, output_dims, 0);
+  const int flat_size = MatchingFlatSize(output_dims, input1_dims);
 
   auto max_value = input2_data[0];
-
-  for (int b = 0; b < batches; b++) {
-    for (int y = 0; y < input_height; y++) {
-      for (int x = 0; x < input_width; x++) {
-        for (int c = 0; c < depth; c++) {
-          int offset = Offset(input1_dims, c, x, y, b);
-          output_data[offset] =
-              input1_data[offset] < max_value ? max_value : input1_data[offset];
-        }
-      }
-    }
+  for (int i = 0; i < flat_size; i++) {
+    output_data[i] = input1_data[i] < max_value ? max_value : input1_data[i];
   }
 }
 
-template <typename T>
-void TensorFlowMaximum(const T* input1_data, const Dims<4>& input1_dims,
-                       const T* input2_data, const Dims<4>& input2_dims,
-                       T* output_data, const Dims<4>& output_dims) {
+template <typename T, typename Op>
+void TensorFlowMaximumMinimum(const T* input1_data, const Dims<4>& input1_dims,
+                              const T* input2_data, const Dims<4>& input2_dims,
+                              T* output_data, const Dims<4>& output_dims,
+                              Op op) {
   NdArrayDesc<4> desc1;
   NdArrayDesc<4> desc2;
   NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);
@@ -3422,7 +3266,7 @@ void TensorFlowMaximum(const T* input1_data, const Dims<4>& input1_dims,
           auto in2_idx = SubscriptToIndex(desc2, c, x, y, b);
           auto in1_val = input1_data[in1_idx];
           auto in2_val = input2_data[in2_idx];
-          output_data[out_idx] = in1_val > in2_val ? in1_val : in2_val;
+          output_data[out_idx] = op(in1_val, in2_val);
         }
       }
     }
@@ -3441,25 +3285,20 @@ void ArgMax(const T3* axis, const T1* input_data, const Dims<4>& input_dims,
   // input dimensions here. We enforce the constraint that the last dimension
   // must always be 1.
   TFLITE_DCHECK_EQ(ArraySize(output_dims, 0), 1);
-  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
-  const int height = MatchingArraySize(input_dims, 2, output_dims, 2);
-  const int width = MatchingArraySize(input_dims, 1, output_dims, 1);
+  const int outer_size = MatchingFlatSizeSkipDim(input_dims, 0, output_dims);
   const int depth = ArraySize(input_dims, 0);
-  for (int b = 0; b < batches; ++b) {
-    for (int y = 0; y < height; ++y) {
-      for (int x = 0; x < width; ++x) {
-        auto max_value = input_data[Offset(input_dims, 0, x, y, b)];
-        int max_index = 0;
-        for (int d = 1; d < depth; ++d) {
-          const auto& curr_value = input_data[Offset(input_dims, d, x, y, b)];
-          if (curr_value > max_value) {
-            max_value = curr_value;
-            max_index = d;
-          }
-        }
-        output_data[Offset(output_dims, 0, x, y, b)] = max_index;
+
+  for (int i = 0; i < outer_size; ++i) {
+    auto max_value = input_data[i * depth];
+    int max_index = 0;
+    for (int d = 1; d < depth; ++d) {
+      const auto& curr_value = input_data[i * depth + d];
+      if (curr_value > max_value) {
+        max_value = curr_value;
+        max_index = d;
       }
     }
+    output_data[i] = max_index;
   }
 }
 
@@ -3509,11 +3348,11 @@ inline void TransposeConv(const float* input_data, const Dims<4>& input_dims,
 
   // Although transpose convolution simplifies to convolution with transposed
   // weights for strides of 1, non-unitary striding complicates matters. To
-  // keep this reference implementation as clear as possible, we use a "scatter"
-  // access pattern, where we loop through all the input elements, computing
-  // their influence on the output, rather than looping through the output
-  // elements in the typical "gather" access pattern of a conv. We therefore
-  // must initialize the output array to zero.
+  // keep this reference implementation as clear as possible, we use a
+  // "scatter" access pattern, where we loop through all the input elements,
+  // computing their influence on the output, rather than looping through the
+  // output elements in the typical "gather" access pattern of a conv. We
+  // therefore must initialize the output array to zero.
   for (int i = 0; i < RequiredBufferSizeForDims(output_dims); i++) {
     output_data[i] = 0.0f;
   }
diff --git a/tensorflow/contrib/lite/kernels/internal/tensor.h b/tensorflow/contrib/lite/kernels/internal/tensor.h
index 62e38e0d4c3e023d0ed2242fc9438b096b86dc59..4bce2ffaaf326cf083a76c76adb093f3ac2e8850 100644
--- a/tensorflow/contrib/lite/kernels/internal/tensor.h
+++ b/tensorflow/contrib/lite/kernels/internal/tensor.h
@@ -126,6 +126,29 @@ class VectorOfTensors {
   std::vector<Dims<4>*> all_dims_ptr_;
 };
 
+// A list of quantized tensors in a format that can be used by kernels like
+// split and concatenation.
+class VectorOfQuantizedTensors : public VectorOfTensors<uint8> {
+ public:
+  // Build with the tensors in 'tensor_list'.
+  VectorOfQuantizedTensors(const TfLiteContext& context,
+                           const TfLiteIntArray& tensor_list)
+      : VectorOfTensors<uint8>(context, tensor_list) {
+    for (int i = 0; i < tensor_list.size; ++i) {
+      TfLiteTensor* t = &context.tensors[tensor_list.data[i]];
+      zero_point_.push_back(t->params.zero_point);
+      scale_.push_back(t->params.scale);
+    }
+  }
+
+  const float* scale() const { return scale_.data(); }
+  const int32* zero_point() const { return zero_point_.data(); }
+
+ private:
+  std::vector<int32> zero_point_;
+  std::vector<float> scale_;
+};
+
 }  // namespace tflite
 
 #endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_TENSOR_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/types.h b/tensorflow/contrib/lite/kernels/internal/types.h
index 293538fcbb6406d6065d8efd25adb3b163638c92..3290c364c18224edb733c177ad72bf86b6892434 100644
--- a/tensorflow/contrib/lite/kernels/internal/types.h
+++ b/tensorflow/contrib/lite/kernels/internal/types.h
@@ -130,14 +130,125 @@ int MatchingArraySize(const ArrayType1& array1, int index1,
   return MatchingArraySize(array1, index1, args...);
 }
 
-inline int RequiredBufferSizeForDims(const Dims<4>& dims) {
+template <int N>
+inline int FlatSize(const Dims<N>& dims) {
   int max_offset = 0;
-  for (int i = 0; i < 4; i++) {
+  for (int i = 0; i < N; i++) {
     max_offset += (dims.sizes[i] - 1) * dims.strides[i];
   }
   return max_offset + 1;
 }
 
+// Deprecated. Prefer FlatSize.
+inline int RequiredBufferSizeForDims(const Dims<4>& dims) {
+  return FlatSize(dims);
+}
+
+// Flat size calculation, checking that dimensions match with one or more other
+// arrays.
+template <int N>
+inline int MatchingFlatSize(const Dims<N>& dims, const Dims<N>& check_dims_0) {
+  for (int i = 0; i < N; i++) {
+    TFLITE_DCHECK_EQ(ArraySize(dims, i), ArraySize(check_dims_0, i));
+  }
+  return FlatSize(dims);
+}
+
+template <int N>
+inline int MatchingFlatSize(const Dims<N>& dims, const Dims<N>& check_dims_0,
+                            const Dims<N>& check_dims_1) {
+  for (int i = 0; i < N; i++) {
+    TFLITE_DCHECK_EQ(ArraySize(dims, i), ArraySize(check_dims_0, i));
+  }
+  return MatchingFlatSize(dims, check_dims_1);
+}
+
+template <int N>
+inline int MatchingFlatSize(const Dims<N>& dims, const Dims<N>& check_dims_0,
+                            const Dims<N>& check_dims_1,
+                            const Dims<N>& check_dims_2) {
+  for (int i = 0; i < N; i++) {
+    TFLITE_DCHECK_EQ(ArraySize(dims, i), ArraySize(check_dims_0, i));
+  }
+  return FlatSize(dims, check_dims_1, check_dims_2);
+}
+
+template <int N>
+inline int MatchingFlatSize(const Dims<N>& dims, const Dims<N>& check_dims_0,
+                            const Dims<N>& check_dims_1,
+                            const Dims<N>& check_dims_2,
+                            const Dims<N>& check_dims_3) {
+  for (int i = 0; i < N; i++) {
+    TFLITE_DCHECK_EQ(ArraySize(dims, i), ArraySize(check_dims_0, i));
+  }
+  return FlatSize(dims, check_dims_1, check_dims_2, check_dims_3);
+}
+
+// Data is required to be contiguous, and so many operators can use either the
+// full array flat size or the flat size with one dimension skipped (commonly
+// the depth).
+template <int N>
+inline int FlatSizeSkipDim(const Dims<N>& dims, int skip_dim) {
+  TFLITE_DCHECK(skip_dim >= 0 && skip_dim < N);
+  int flat_size = 1;
+  for (int i = 0; i < N; i++) {
+    flat_size *= (i == skip_dim) ? 1 : dims.sizes[i];
+  }
+  return flat_size;
+}
+
+// A combination of MatchingFlatSize() and FlatSizeSkipDim().
+template <int N>
+inline int MatchingFlatSizeSkipDim(const Dims<N>& dims, int skip_dim,
+                                   const Dims<N>& check_dims_0) {
+  for (int i = 0; i < N; i++) {
+    if (i != skip_dim) {
+      TFLITE_DCHECK_EQ(ArraySize(dims, i), ArraySize(check_dims_0, i));
+    }
+  }
+  return FlatSizeSkipDim(dims, skip_dim);
+}
+
+template <int N>
+inline int MatchingFlatSizeSkipDim(const Dims<N>& dims, int skip_dim,
+                                   const Dims<N>& check_dims_0,
+                                   const Dims<N>& check_dims_1) {
+  for (int i = 0; i < N; i++) {
+    if (i != skip_dim) {
+      TFLITE_DCHECK_EQ(ArraySize(dims, i), ArraySize(check_dims_0, i));
+    }
+  }
+  return MatchingFlatSizeSkipDim(dims, skip_dim, check_dims_1);
+}
+
+template <int N>
+inline int MatchingFlatSizeSkipDim(const Dims<N>& dims, int skip_dim,
+                                   const Dims<N>& check_dims_0,
+                                   const Dims<N>& check_dims_1,
+                                   const Dims<N>& check_dims_2) {
+  for (int i = 0; i < N; i++) {
+    if (i != skip_dim) {
+      TFLITE_DCHECK_EQ(ArraySize(dims, i), ArraySize(check_dims_0, i));
+    }
+  }
+  return MatchingFlatSizeSkipDim(dims, skip_dim, check_dims_1, check_dims_2);
+}
+
+template <int N>
+inline int MatchingFlatSizeSkipDim(const Dims<N>& dims, int skip_dim,
+                                   const Dims<N>& check_dims_0,
+                                   const Dims<N>& check_dims_1,
+                                   const Dims<N>& check_dims_2,
+                                   const Dims<N>& check_dims_3) {
+  for (int i = 0; i < N; i++) {
+    if (i != skip_dim) {
+      TFLITE_DCHECK_EQ(ArraySize(dims, i), ArraySize(check_dims_0, i));
+    }
+  }
+  return MatchingFlatSizeSkipDim(dims, skip_dim, check_dims_1, check_dims_2,
+                                 check_dims_3);
+}
+
 template <int N>
 bool IsPackedWithoutStrides(const Dims<N>& dims) {
   int expected_stride = 1;
diff --git a/tensorflow/contrib/lite/kernels/l2norm.cc b/tensorflow/contrib/lite/kernels/l2norm.cc
index ee8bfe56d95e9f383ef49b40b8f58b63d61da3e1..e67f4e06f3680f8c9447a9e831b63415994ea176 100644
--- a/tensorflow/contrib/lite/kernels/l2norm.cc
+++ b/tensorflow/contrib/lite/kernels/l2norm.cc
@@ -45,10 +45,15 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
 
   TF_LITE_ENSURE(context, NumDimensions(input) <= 4);
 
-  // TODO(ahentz): Our current implementations only support float32.
-  TF_LITE_ENSURE_EQ(context, output->type, kTfLiteFloat32);
+  TF_LITE_ENSURE(
+      context, output->type == kTfLiteFloat32 || output->type == kTfLiteUInt8);
   TF_LITE_ENSURE_EQ(context, input->type, output->type);
 
+  if (output->type == kTfLiteUInt8) {
+    TF_LITE_ENSURE_EQ(context, output->params.scale, (1. / 128.));
+    TF_LITE_ENSURE_EQ(context, output->params.zero_point, 128);
+  }
+
   // TODO(ahentz): For some reason our implementations don't support
   // activations.
   TF_LITE_ENSURE_EQ(context, params->activation, kTfLiteActNone);
@@ -75,6 +80,19 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
       TF_LITE_L2NORM(optimized_ops);
     }
 #undef TF_LITE_L2NORM
+  } else if (output->type == kTfLiteUInt8) {
+#define TF_LITE_L2NORM(type)                                               \
+  type::L2Normalization(GetTensorData<uint8>(input), GetTensorDims(input), \
+                        input->params.zero_point,                          \
+                        GetTensorData<uint8>(output), GetTensorDims(output))
+
+    if (kernel_type == kReference) {
+      TF_LITE_L2NORM(reference_ops);
+    }
+    if (kernel_type == kGenericOptimized) {
+      TF_LITE_L2NORM(optimized_ops);
+    }
+#undef TF_LITE_L2NORM
   } else {
     context->ReportError(context, "Inputs and outputs not all float types.");
     return kTfLiteError;
diff --git a/tensorflow/contrib/lite/kernels/l2norm_test.cc b/tensorflow/contrib/lite/kernels/l2norm_test.cc
index 30e103f3303484c339ef98e6a68e0438291c102f..042314ccf55cb6de12c743448fbe040f35e7baab 100644
--- a/tensorflow/contrib/lite/kernels/l2norm_test.cc
+++ b/tensorflow/contrib/lite/kernels/l2norm_test.cc
@@ -25,10 +25,22 @@ using ::testing::ElementsAreArray;
 
 class L2NormOpModel : public SingleOpModel {
  public:
-  L2NormOpModel(std::initializer_list<int> input_shape,
-                ActivationFunctionType activation_type) {
-    input_ = AddInput(TensorType_FLOAT32);
-    output_ = AddOutput(TensorType_FLOAT32);
+  L2NormOpModel(const std::initializer_list<int> input_shape,
+                const TensorType tensor_type,
+                const ActivationFunctionType activation_type) {
+    TensorData data = TensorData{tensor_type};
+    if (tensor_type != TensorType_FLOAT32) {
+      data.min = -2.0;
+      data.max = 2.0;
+      data.scale = 2.0;
+      data.zero_point = 128;
+    }
+    input_ = AddInput(data);
+    if (tensor_type != TensorType_FLOAT32) {
+      data.min = -1.0;
+      data.max = 127.0 / 128.0;
+    }
+    output_ = AddOutput(data);
     SetBuiltinOp(BuiltinOperator_L2_NORMALIZATION, BuiltinOptions_L2NormOptions,
                  CreateL2NormOptions(builder_, activation_type).Union());
     BuildInterpreter({input_shape});
@@ -38,7 +50,17 @@ class L2NormOpModel : public SingleOpModel {
     PopulateTensor(input_, data);
   }
 
-  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+  template <typename T>
+  std::vector<T> GetOutput() {
+    return ExtractVector<T>(output_);
+  }
+
+  std::vector<float> GetDequantizedOutput() {
+    return Dequantize<uint8_t>(ExtractVector<uint8_t>(output_),
+                               GetScale(output_), GetZeroPoint(output_));
+  }
+
+  int input() const { return input_; }
 
  private:
   int input_;
@@ -46,13 +68,26 @@ class L2NormOpModel : public SingleOpModel {
 };
 
 TEST(L2NormOpTest, SimpleTest) {
-  L2NormOpModel m({1, 1, 1, 6}, ActivationFunctionType_NONE);
+  L2NormOpModel m({1, 1, 1, 6}, TensorType_FLOAT32,
+                  ActivationFunctionType_NONE);
   m.SetInput({-1.1, 0.6, 0.7, 1.2, -0.7, 0.1});
   m.Invoke();
-  EXPECT_THAT(m.GetOutput(),
+  EXPECT_THAT(m.GetOutput<float>(),
               ElementsAreArray({-0.55, 0.3, 0.35, 0.6, -0.35, 0.05}));
 }
 
+TEST(L2NormOpTest, SimpleUint8Test) {
+  L2NormOpModel m({1, 1, 1, 6}, TensorType_UINT8, ActivationFunctionType_NONE);
+
+  m.QuantizeAndPopulate<uint8_t>(m.input(), {-1.1, 0.6, 0.7, 1.2, -0.7, 0.1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput<uint8_t>(),
+              ElementsAreArray({58, 166, 173, 205, 83, 134}));
+  EXPECT_THAT(m.GetDequantizedOutput(),
+              ElementsAreArray(
+                  ArrayFloatNear({-0.55, 0.3, 0.35, 0.6, -0.35, 0.05}, 0.1)));
+}
+
 }  // namespace
 }  // namespace tflite
 
diff --git a/tensorflow/contrib/lite/kernels/maximum.cc b/tensorflow/contrib/lite/kernels/maximum_minimum.cc
similarity index 50%
rename from tensorflow/contrib/lite/kernels/maximum.cc
rename to tensorflow/contrib/lite/kernels/maximum_minimum.cc
index 9fdf2b47eaf421bda11e7474ad819692106a90ac..5a28d663c9e756040746f0a98b356afba76cceab 100644
--- a/tensorflow/contrib/lite/kernels/maximum.cc
+++ b/tensorflow/contrib/lite/kernels/maximum_minimum.cc
@@ -24,9 +24,9 @@ limitations under the License.
 namespace tflite {
 namespace ops {
 namespace builtin {
-namespace maximum {
+namespace maximum_minimum {
 
-// This file has a reference implemenation of TFMaximum.
+// This file has a reference implemenation of TFMaximum/TFMinimum.
 enum KernelType {
   kReference,
 };
@@ -35,8 +35,8 @@ constexpr int kInputTensor1 = 0;
 constexpr int kInputTensor2 = 1;
 constexpr int kOutputTensor = 0;
 
-struct MaximumContext {
-  MaximumContext(TfLiteContext* context, TfLiteNode* node) {
+struct OpContext {
+  OpContext(TfLiteContext* context, TfLiteNode* node) {
     input1 = GetInput(context, node, kInputTensor1);
     input2 = GetInput(context, node, kInputTensor2);
     output = GetOutput(context, node, kOutputTensor);
@@ -50,30 +50,68 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
   TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
 
-  MaximumContext op_context(context, node);
+  OpContext op_context(context, node);
   TF_LITE_ENSURE_EQ(context, op_context.input1->type, op_context.input2->type);
-  TfLiteIntArray* output_dims = TfLiteIntArrayCopy(op_context.input2->dims);
-  op_context.output->type = op_context.input2->type;
-  return context->ResizeTensor(context, op_context.output, output_dims);
+  op_context.output->type = op_context.input1->type;
+
+  bool requires_broadcast =
+      !HaveSameShapes(op_context.input1, op_context.input2);
+
+  TfLiteIntArray* output_size = nullptr;
+  if (requires_broadcast) {
+    TF_LITE_ENSURE_OK(
+        context, CalculateShapeForBroadcast(context, op_context.input1,
+                                            op_context.input2, &output_size));
+  } else {
+    output_size = TfLiteIntArrayCopy(op_context.input1->dims);
+  }
+
+  return context->ResizeTensor(context, op_context.output, output_size);
 }
 
-template <KernelType kernel_type>
-TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
-  MaximumContext op_context(context, node);
+struct MaximumOp {
+  template <typename data_type>
+  static data_type op(data_type el1, data_type el2) {
+    return el1 > el2 ? el1 : el2;
+  }
+};
 
-#define TF_LITE_MAXIMUM(kernel_type, data_type)    \
-  kernel_type::TensorFlowMaximum<data_type>(       \
-      GetTensorData<data_type>(op_context.input1), \
-      GetTensorDims(op_context.input1),            \
-      GetTensorData<data_type>(op_context.input2), \
-      GetTensorDims(op_context.input2),            \
-      GetTensorData<data_type>(op_context.output), \
-      GetTensorDims(op_context.output))
+struct MinimumOp {
+  template <typename data_type>
+  static data_type op(data_type el1, data_type el2) {
+    return el1 < el2 ? el1 : el2;
+  }
+};
+
+template <typename data_type, typename op_type>
+void TFLiteOperation(TfLiteContext* context, TfLiteNode* node,
+                      const OpContext& op_context) {
+  reference_ops::TensorFlowMaximumMinimum<data_type>(
+      GetTensorData<data_type>(op_context.input1),
+      GetTensorDims(op_context.input1),
+      GetTensorData<data_type>(op_context.input2),
+      GetTensorDims(op_context.input2),
+      GetTensorData<data_type>(op_context.output),
+      GetTensorDims(op_context.output), op_type::template op<data_type>);
+}
+
+template <KernelType kernel_type, typename OpType>
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  OpContext op_context(context, node);
 
   if (kernel_type == kReference) {
     switch (op_context.output->type) {
       case kTfLiteFloat32:
-        TF_LITE_MAXIMUM(reference_ops, float);
+        TFLiteOperation<float, OpType>(context, node, op_context);
+        break;
+      case kTfLiteUInt8:
+        TFLiteOperation<uint8_t, OpType>(context, node, op_context);
+        break;
+      case kTfLiteInt32:
+       TFLiteOperation<int32_t, OpType>(context, node, op_context);
+        break;
+      case kTfLiteInt64:
+        TFLiteOperation<int64_t, OpType>(context, node, op_context);
         break;
       default:
         context->ReportError(context,
@@ -87,19 +125,28 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
                          op_context.output->type);
     return kTfLiteError;
   }
-#undef TF_LITE_MAXIMUM
   return kTfLiteOk;
 }
 
-}  // namespace maximum
+}  // namespace maximum_minimum
 
 TfLiteRegistration* Register_MAXIMUM_REF() {
-  static TfLiteRegistration r = {nullptr, nullptr, maximum::Prepare,
-                                 maximum::Eval<maximum::kReference>};
+  static TfLiteRegistration r = {
+      nullptr, nullptr, maximum_minimum::Prepare,
+      maximum_minimum::Eval<maximum_minimum::kReference,
+                            maximum_minimum::MaximumOp>};
   return &r;
 }
 
+TfLiteRegistration* Register_MINIMUM_REF() {
+  static TfLiteRegistration r = {
+      nullptr, nullptr, maximum_minimum::Prepare,
+      maximum_minimum::Eval<maximum_minimum::kReference,
+                            maximum_minimum::MinimumOp>};
+  return &r;
+}
 TfLiteRegistration* Register_MAXIMUM() { return Register_MAXIMUM_REF(); }
+TfLiteRegistration* Register_MINIMUM() { return Register_MINIMUM_REF(); }
 
 }  // namespace builtin
 }  // namespace ops
diff --git a/tensorflow/contrib/lite/kernels/maximum_minimum_test.cc b/tensorflow/contrib/lite/kernels/maximum_minimum_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0752aa1804722accb1f88910fe013ffd632a4503
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/maximum_minimum_test.cc
@@ -0,0 +1,143 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/contrib/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+class MaxMinOpModel : public SingleOpModel {
+ public:
+  MaxMinOpModel(tflite::BuiltinOperator op, const TensorData& input1,
+                const TensorData& input2, const TensorType& output) {
+    input1_ = AddInput(input1);
+    input2_ = AddInput(input2);
+    output_ = AddOutput(output);
+    SetBuiltinOp(op, BuiltinOptions_MaximumMinimumOptions,
+                 CreateMaximumMinimumOptions(builder_).Union());
+    BuildInterpreter({GetShape(input1_), GetShape(input2_)});
+  }
+
+  template <class T>
+  void SetInput1(std::initializer_list<T> data) {
+    PopulateTensor(input1_, data);
+  }
+
+  template <class T>
+  void SetInput2(std::initializer_list<T> data) {
+    PopulateTensor(input2_, data);
+  }
+
+  template <class T>
+  std::vector<T> GetOutput() {
+    return ExtractVector<T>(output_);
+  }
+  std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
+
+ protected:
+  int input1_;
+  int input2_;
+  int output_;
+};
+
+template <typename data_type>
+void TestModel(tflite::BuiltinOperator op, const TensorData& input1,
+               const TensorData& input2, const TensorData& output,
+               std::initializer_list<data_type> input1_values,
+               std::initializer_list<data_type> input2_values,
+               std::initializer_list<data_type> output_values) {
+  MaxMinOpModel m(op, input1, input2, output.type);
+  m.SetInput1<data_type>(input1_values);
+  m.SetInput2<data_type>(input2_values);
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray(output.shape));
+  EXPECT_THAT(m.GetOutput<data_type>(), ElementsAreArray(output_values));
+}
+
+template <>
+void TestModel(tflite::BuiltinOperator op, const TensorData& input1,
+               const TensorData& input2, const TensorData& output,
+               std::initializer_list<float> input1_values,
+               std::initializer_list<float> input2_values,
+               std::initializer_list<float> output_values) {
+  MaxMinOpModel m(op, input1, input2, output.type);
+  m.SetInput1<float>(input1_values);
+  m.SetInput2<float>(input2_values);
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray(output.shape));
+  EXPECT_THAT(m.GetOutput<float>(),
+              ElementsAreArray(ArrayFloatNear(output_values)));
+}
+
+TEST(MaximumOpTest, FloatTest) {
+  std::initializer_list<float> data1 = {1.0, 0.0, -1.0, 11.0, -2.0, -1.44};
+  std::initializer_list<float> data2 = {-1.0, 0.0, 1.0, 12.0, -3.0, -1.43};
+  TestModel<float>(BuiltinOperator_MAXIMUM, {TensorType_FLOAT32, {3, 1, 2}},
+                   {TensorType_FLOAT32, {3, 1, 2}},
+                   {TensorType_FLOAT32, {3, 1, 2}}, data1, data2,
+                   {1.0, 0.0, 1.0, 12.0, -2.0, -1.43});
+  TestModel<float>(BuiltinOperator_MINIMUM, {TensorType_FLOAT32, {3, 1, 2}},
+                   {TensorType_FLOAT32, {3, 1, 2}},
+                   {TensorType_FLOAT32, {3, 1, 2}}, data1, data2,
+                   {-1.0, 0.0, -1.0, 11.0, -3.0, -1.44});
+}
+
+TEST(MaxMinOpTest, Uint8Test) {
+  std::initializer_list<uint8_t> data1 = {1, 0, 2, 11, 2, 23};
+  std::initializer_list<uint8_t> data2 = {0, 0, 1, 12, 255, 1};
+  TestModel<uint8_t>(BuiltinOperator_MAXIMUM, {TensorType_UINT8, {3, 1, 2}},
+                     {TensorType_UINT8, {3, 1, 2}},
+                     {TensorType_UINT8, {3, 1, 2}}, data1, data2,
+                     {1, 0, 2, 12, 255, 23});
+  TestModel<uint8_t>(BuiltinOperator_MINIMUM, {TensorType_UINT8, {3, 1, 2}},
+                     {TensorType_UINT8, {3, 1, 2}},
+                     {TensorType_UINT8, {3, 1, 2}}, data1, data2,
+                     {0, 0, 1, 11, 2, 1});
+}
+
+TEST(MaximumOpTest, FloatWithBroadcastTest) {
+  std::initializer_list<float> data1 = {1.0, 0.0, -1.0, -2.0, -1.44, 11.0};
+  std::initializer_list<float> data2 = {0.5, 2.0};
+  TestModel<float>(BuiltinOperator_MAXIMUM, {TensorType_FLOAT32, {3, 1, 2}},
+                   {TensorType_FLOAT32, {2}}, {TensorType_FLOAT32, {3, 1, 2}},
+                   data1, data2, {1.0, 2.0, 0.5, 2.0, 0.5, 11.0});
+  TestModel<float>(BuiltinOperator_MINIMUM, {TensorType_FLOAT32, {3, 1, 2}},
+                   {TensorType_FLOAT32, {2}}, {TensorType_FLOAT32, {3, 1, 2}},
+                   data1, data2, {0.5, 0.0, -1.0, -2.0, -1.44, 2.0});
+}
+
+TEST(MaximumOpTest, Int32WithBroadcastTest) {
+  std::initializer_list<int32_t> data1 = {1, 0, -1, -2, 3, 11};
+  std::initializer_list<int32_t> data2 = {2};
+  TestModel<int32>(BuiltinOperator_MAXIMUM, {TensorType_INT32, {3, 1, 2}},
+                   {TensorType_INT32, {1}}, {TensorType_INT32, {3, 1, 2}},
+                   data1, data2, {2, 2, 2, 2, 3, 11});
+  TestModel<int32>(BuiltinOperator_MINIMUM, {TensorType_INT32, {3, 1, 2}},
+                   {TensorType_INT32, {1}}, {TensorType_INT32, {3, 1, 2}},
+                   data1, data2, {1, 0, -1, -2, 2, 2});
+}
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/maximum_test.cc b/tensorflow/contrib/lite/kernels/maximum_test.cc
deleted file mode 100644
index b3fd7d4e6f40e53db51edf2e7594662629302add..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/kernels/maximum_test.cc
+++ /dev/null
@@ -1,81 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include <gtest/gtest.h>
-#include "tensorflow/contrib/lite/interpreter.h"
-#include "tensorflow/contrib/lite/kernels/register.h"
-#include "tensorflow/contrib/lite/kernels/test_util.h"
-#include "tensorflow/contrib/lite/model.h"
-
-namespace tflite {
-namespace {
-
-using ::testing::ElementsAreArray;
-
-class MaximumOpModel : public SingleOpModel {
- public:
-  MaximumOpModel(const TensorData& input1, const TensorData& input2,
-                 const TensorType& output) {
-    input1_ = AddInput(input1);
-    input2_ = AddInput(input2);
-    output_ = AddOutput(output);
-    SetBuiltinOp(BuiltinOperator_MAXIMUM, BuiltinOptions_MaximumOptions,
-                 CreateMaximumOptions(builder_).Union());
-    BuildInterpreter({GetShape(input1_), GetShape(input2_)});
-  }
-
-  template <class T>
-  void SetInput1(std::initializer_list<T> data) {
-    PopulateTensor(input1_, data);
-  }
-
-  template <class T>
-  void SetInput2(std::initializer_list<T> data) {
-    PopulateTensor(input2_, data);
-  }
-
-  template <class T>
-  std::vector<T> GetOutput() {
-    return ExtractVector<T>(output_);
-  }
-  std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
-
- protected:
-  int input1_;
-  int input2_;
-  int output_;
-};
-
-TEST(MaximumOpTest, FloatTest) {
-  std::initializer_list<float> data1 = {1.0, 0.0, -1.0, 11.0, -2.0, -1.44};
-  std::initializer_list<float> data2 = {-1.0, 0.0, 1.0, 12.0, -3.0, -1.43};
-  MaximumOpModel m({TensorType_FLOAT32, {3, 1, 2}},
-                   {TensorType_FLOAT32, {3, 1, 2}}, TensorType_FLOAT32);
-  m.SetInput1<float>(data1);
-  m.SetInput2<float>(data2);
-  m.Invoke();
-  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3, 1, 2}));
-  EXPECT_THAT(
-      m.GetOutput<float>(),
-      ElementsAreArray(ArrayFloatNear({1.0, 0.0, 1.0, 12.0, -2.0, -1.43})));
-}
-
-}  // namespace
-}  // namespace tflite
-
-int main(int argc, char** argv) {
-  ::tflite::LogToStderr();
-  ::testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/tensorflow/contrib/lite/kernels/mean.cc b/tensorflow/contrib/lite/kernels/mean.cc
index aff19581ea56f94c08638b7b388ae181f566cf4f..047bdd1039b993783ce8f2d69d83864e609c13fd 100644
--- a/tensorflow/contrib/lite/kernels/mean.cc
+++ b/tensorflow/contrib/lite/kernels/mean.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include <vector>
 #include "tensorflow/contrib/lite/builtin_op_data.h"
 #include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/kernels/internal/quantization_util.h"
 #include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
 #include "tensorflow/contrib/lite/kernels/internal/tensor.h"
 #include "tensorflow/contrib/lite/kernels/kernel_util.h"
@@ -48,7 +49,7 @@ void* Init(TfLiteContext* context, const char* buffer, size_t length) {
   // Creates two temp tensors to store index and axis for internal
   // implementation only.
   auto* scratch_tensor_index = new int;
-  context->AddTensors(context, 2, scratch_tensor_index);
+  context->AddTensors(context, 3, scratch_tensor_index);
   return scratch_tensor_index;
 }
 
@@ -64,6 +65,14 @@ TfLiteStatus ResizeTempAxis(TfLiteContext* context, MeanContext* op_context,
   return context->ResizeTensor(context, resolved_axis, axis_size);
 }
 
+// Resizes the temp tensor that stores temp sum of reduced elements.
+TfLiteStatus ResizeTempSum(TfLiteContext* context, MeanContext* op_context,
+                           TfLiteTensor* temp_sum) {
+  TfLiteIntArray* size = TfLiteIntArrayCreate(1);
+  size->data[0] = static_cast<int>(NumElements(op_context->output));
+  return context->ResizeTensor(context, temp_sum, size);
+}
+
 // Resizes output array based on the input size and resolved axis.
 TfLiteStatus ResizeOutputTensor(TfLiteContext* context,
                                 MeanContext* op_context) {
@@ -135,7 +144,7 @@ TfLiteStatus InitializeTemporaries(TfLiteContext* context, TfLiteNode* node,
   // Creates a temp index to iterate through input data.
   int* scratch_tensor_index = reinterpret_cast<int*>(node->user_data);
   TfLiteIntArrayFree(node->temporaries);
-  node->temporaries = TfLiteIntArrayCreate(2);
+  node->temporaries = TfLiteIntArrayCreate(3);
   node->temporaries->data[0] = *scratch_tensor_index;
   TfLiteTensor* scratch_tensor = &context->tensors[node->temporaries->data[0]];
   scratch_tensor->type = kTfLiteInt32;
@@ -149,6 +158,25 @@ TfLiteStatus InitializeTemporaries(TfLiteContext* context, TfLiteNode* node,
   node->temporaries->data[1] = *scratch_tensor_index + 1;
   TfLiteTensor* resolved_axis = &context->tensors[node->temporaries->data[1]];
   resolved_axis->type = kTfLiteInt32;
+  // Creates a temp tensor to store temp sums when calculating mean.
+  node->temporaries->data[2] = *scratch_tensor_index + 2;
+  TfLiteTensor* temp_sum = &context->tensors[node->temporaries->data[2]];
+  switch (op_context->input->type) {
+    case kTfLiteFloat32:
+      temp_sum->type = kTfLiteFloat32;
+      break;
+    case kTfLiteInt32:
+      temp_sum->type = kTfLiteInt64;
+      break;
+    case kTfLiteInt64:
+      temp_sum->type = kTfLiteInt64;
+      break;
+    case kTfLiteUInt8:
+      temp_sum->type = kTfLiteInt32;
+      break;
+    default:
+      return kTfLiteError;
+  }
   return kTfLiteOk;
 }
 
@@ -160,16 +188,20 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   TF_LITE_ENSURE_OK(context, InitializeTemporaries(context, node, &op_context));
 
   TfLiteTensor* resolved_axis = &context->tensors[node->temporaries->data[1]];
+  TfLiteTensor* temp_sum = &context->tensors[node->temporaries->data[2]];
   // Leaves work to Eval if axis is not constant; else resizes output.
   if (!IsConstantTensor(op_context.axis)) {
     SetTensorToDynamic(op_context.output);
     SetTensorToDynamic(resolved_axis);
+    SetTensorToDynamic(temp_sum);
     return kTfLiteOk;
   }
   resolved_axis->allocation_type = kTfLiteArenaRw;
   TF_LITE_ENSURE_OK(context,
                     ResizeTempAxis(context, &op_context, resolved_axis));
-  return ResizeOutputTensor(context, &op_context);
+  TF_LITE_ENSURE_OK(context, ResizeOutputTensor(context, &op_context));
+  temp_sum->allocation_type = kTfLiteArenaRw;
+  return ResizeTempSum(context, &op_context, temp_sum);
 }
 
 template <KernelType kernel_type>
@@ -178,14 +210,16 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   int num_axis = static_cast<int>(NumElements(op_context.axis));
   TfLiteTensor* temp_index = &context->tensors[node->temporaries->data[0]];
   TfLiteTensor* resolved_axis = &context->tensors[node->temporaries->data[1]];
+  TfLiteTensor* temp_sum = &context->tensors[node->temporaries->data[2]];
   // Resize the output tensor if the output tensor is dynamic.
   if (IsDynamicTensor(op_context.output)) {
     TF_LITE_ENSURE_OK(context,
                       ResizeTempAxis(context, &op_context, resolved_axis));
     TF_LITE_ENSURE_OK(context, ResizeOutputTensor(context, &op_context));
+    TF_LITE_ENSURE_OK(context, ResizeTempSum(context, &op_context, temp_sum));
   }
 
-#define TF_LITE_MEAN(kernel_type, data_type)                        \
+#define TF_LITE_MEAN(kernel_type, data_type, temp_data_type)        \
   kernel_type::Mean<>(                                              \
       GetTensorData<data_type>(op_context.input),                   \
       op_context.input->dims->data, op_context.input->dims->size,   \
@@ -193,21 +227,26 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
       op_context.output->dims->data, op_context.output->dims->size, \
       GetTensorData<int>(op_context.axis), num_axis,                \
       op_context.params->keep_dims, GetTensorData<int>(temp_index), \
-      GetTensorData<int>(resolved_axis))
+      GetTensorData<int>(resolved_axis),                            \
+      GetTensorData<temp_data_type>(temp_sum))
 
   if (kernel_type == kReference) {
     switch (op_context.input->type) {
       case kTfLiteFloat32:
-        TF_LITE_MEAN(reference_ops, float);
+        TF_LITE_ENSURE(context, TF_LITE_MEAN(reference_ops, float, float));
         break;
       case kTfLiteInt32:
-        TF_LITE_MEAN(reference_ops, int);
-        break;
-      case kTfLiteUInt8:
-        TF_LITE_MEAN(reference_ops, uint8_t);
+        TF_LITE_ENSURE(context, TF_LITE_MEAN(reference_ops, int, int64_t));
         break;
       case kTfLiteInt64:
-        TF_LITE_MEAN(reference_ops, int64_t);
+        TF_LITE_ENSURE(context, TF_LITE_MEAN(reference_ops, int64_t, int64_t));
+        break;
+      case kTfLiteUInt8:
+        TF_LITE_ENSURE_EQ(context, op_context.input->params.scale,
+                          op_context.output->params.scale);
+        TF_LITE_ENSURE_EQ(context, op_context.input->params.zero_point,
+                          op_context.output->params.zero_point);
+        TF_LITE_ENSURE(context, TF_LITE_MEAN(reference_ops, uint8_t, int));
         break;
       default:
         return kTfLiteError;
@@ -216,7 +255,6 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
 #undef TF_LITE_MEAN
   return kTfLiteOk;
 }
-
 }  // namespace mean
 
 TfLiteRegistration* Register_MEAN_REF() {
diff --git a/tensorflow/contrib/lite/kernels/mean_test.cc b/tensorflow/contrib/lite/kernels/mean_test.cc
index 2d6d4bc2da4b75289ee27c3f2a12787216716d44..79c9957f76fdb994be0a71f2e90b883435de4815 100644
--- a/tensorflow/contrib/lite/kernels/mean_test.cc
+++ b/tensorflow/contrib/lite/kernels/mean_test.cc
@@ -37,8 +37,15 @@ class BaseMeanOpModel : public SingleOpModel {
     return ExtractVector<T>(output_);
   }
 
+  std::vector<float> GetDequantizedOutput() {
+    return Dequantize<uint8_t>(ExtractVector<uint8_t>(output_),
+                               GetScale(output_), GetZeroPoint(output_));
+  }
+
   std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
 
+  int Input() { return input_; }
+
  protected:
   int input_;
   int axis_;
@@ -142,56 +149,64 @@ TEST(DynamicFloatMeanOpTest, Scale) {
   EXPECT_THAT(m.GetOutput<float>(), ElementsAreArray(ArrayFloatNear({9.527})));
 }
 
+// for quantized Add, the error shouldn't exceed step
+float GetTolerance(int min, int max) { return (max - min) / 255.0; }
+
 TEST(ConstUint8MeanOpTest, NotKeepDims) {
-  std::initializer_list<uint8_t> data = {1,  2,  3,  4,  5,  6,  7,  8,
-                                         9,  10, 11, 12, 13, 14, 15, 16,
-                                         17, 18, 19, 20, 21, 22, 23, 24};
-  MeanOpConstModel m({TensorType_UINT8, {4, 3, 2}}, {TensorType_UINT8, {2}},
-                     {4}, {1, 0, -3, -3}, false);
-  m.SetInput(data);
+  float kQuantizedTolerance = GetTolerance(-1.0, 1.0);
+  std::initializer_list<float> data = {0.4, 0.2, 0.3, 0.4, 0.5, 0.6};
+  MeanOpConstModel m({TensorType_UINT8, {1, 3, 2}, -1.0, 1.0},
+                     {TensorType_UINT8, {2}, -1.0, 1.0}, {1}, {1}, false);
+  m.QuantizeAndPopulate<uint8_t>(m.Input(), data);
   m.Invoke();
-  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2}));
-  EXPECT_THAT(m.GetOutput<uint8_t>(), ElementsAreArray({12, 13}));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 2}));
+  EXPECT_THAT(m.GetDequantizedOutput(), ElementsAreArray(ArrayFloatNear(
+                                            {0.4, 0.4}, kQuantizedTolerance)));
 }
 
 TEST(ConstUint8MeanOpTest, KeepDims) {
-  std::initializer_list<uint8_t> data = {1,  2,  3,  4,  5,  6,  7,  8,
-                                         9,  10, 11, 12, 13, 14, 15, 16,
-                                         17, 18, 19, 20, 21, 22, 23, 24};
-  MeanOpConstModel m({TensorType_UINT8, {4, 3, 2}}, {TensorType_UINT8, {3}},
-                     {2}, {0, 2}, true);
-  m.SetInput(data);
+  float kQuantizedTolerance = GetTolerance(-1.0, 1.0);
+  std::initializer_list<float> data = {0.4, 0.2, 0.3, 0.4, 0.5, 0.6};
+  MeanOpConstModel m({TensorType_UINT8, {3, 2}, -1.0, 1.0},
+                     {TensorType_UINT8, {3}, -1.0, 1.0}, {1}, {1}, true);
+  m.QuantizeAndPopulate<uint8_t>(m.Input(), data);
   m.Invoke();
-  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 3, 1}));
-  EXPECT_THAT(m.GetOutput<uint8_t>(), ElementsAreArray({10, 12, 14}));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3, 1}));
+  EXPECT_THAT(
+      m.GetDequantizedOutput(),
+      ElementsAreArray(ArrayFloatNear({0.3, 0.35, 0.55}, kQuantizedTolerance)));
 }
 
 TEST(DynamicUint8MeanOpTest, NotKeepDims) {
-  std::initializer_list<uint8_t> data = {1,  2,  3,  4,  5,  6,  7,  8,
-                                         9,  10, 11, 12, 13, 14, 15, 16,
-                                         17, 18, 19, 20, 21, 22, 23, 24};
-  MeanOpDynamicModel m({TensorType_UINT8, {4, 3, 2}}, {TensorType_UINT8, {2}},
-                       {TensorType_INT32, {4}}, false);
-  std::initializer_list<int> axis = {1, 0, -3, -3};
+  float kQuantizedTolerance = GetTolerance(-5.0, 2.0);
+  std::initializer_list<float> data = {1.3, -4.8, -3.6, 0.24};
+  MeanOpDynamicModel m({TensorType_UINT8, {2, 2}, -5.0, 2.0},
+                       {TensorType_UINT8, {2}, -5.0, 2.0},
+                       {TensorType_INT32, {1}}, false);
+  std::initializer_list<int> axis = {1};
   m.SetAxis(axis);
-  m.SetInput(data);
+  m.QuantizeAndPopulate<uint8_t>(m.Input(), data);
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2}));
-  EXPECT_THAT(m.GetOutput<uint8_t>(), ElementsAreArray({12, 13}));
+  EXPECT_THAT(
+      m.GetDequantizedOutput(),
+      ElementsAreArray(ArrayFloatNear({-1.75, -1.68}, kQuantizedTolerance)));
 }
 
 TEST(DynamicUint8MeanOpTest, KeepDims) {
-  std::initializer_list<uint8_t> data = {1,  2,  3,  4,  5,  6,  7,  8,
-                                         9,  10, 11, 12, 13, 14, 15, 16,
-                                         17, 18, 19, 20, 21, 22, 23, 24};
-  MeanOpDynamicModel m({TensorType_UINT8, {4, 3, 2}}, {TensorType_UINT8, {3}},
-                       {TensorType_INT32, {2}}, true);
-  std::initializer_list<int> axis = {0, 2};
+  float kQuantizedTolerance = GetTolerance(-10.0, 12.0);
+  std::initializer_list<float> data = {11.14, -0.14, 7.423, 0.879};
+  MeanOpDynamicModel m({TensorType_UINT8, {2, 2}, -10.0, 12.0},
+                       {TensorType_UINT8, {2}, -10.0, 12.0},
+                       {TensorType_INT32, {1}}, true);
+  std::initializer_list<int> axis = {0};
   m.SetAxis(axis);
-  m.SetInput(data);
+  m.QuantizeAndPopulate<uint8_t>(m.Input(), data);
   m.Invoke();
-  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 3, 1}));
-  EXPECT_THAT(m.GetOutput<uint8_t>(), ElementsAreArray({10, 12, 14}));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 2}));
+  EXPECT_THAT(
+      m.GetDequantizedOutput(),
+      ElementsAreArray(ArrayFloatNear({9.2815, 0.3695}, kQuantizedTolerance)));
 }
 
 }  // namespace
diff --git a/tensorflow/contrib/lite/kernels/pad.cc b/tensorflow/contrib/lite/kernels/pad.cc
index c29da3862e84d6756bf5ef34b2ca06307b0a065d..4f9449a225c66a0fb2a9285e6aff3a1f7147f5dd 100644
--- a/tensorflow/contrib/lite/kernels/pad.cc
+++ b/tensorflow/contrib/lite/kernels/pad.cc
@@ -119,39 +119,46 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
     after_padding.push_back(paddings_data[idx * 2 + 1]);
   }
 
-#define TF_LITE_PAD(type, scalar)                                           \
+#define TF_LITE_PAD(type, scalar, pad_value)                                \
   type::Pad(GetTensorData<scalar>(op_context.input),                        \
             GetTensorDims(op_context.input), before_padding, after_padding, \
             GetTensorData<scalar>(op_context.output),                       \
-            GetTensorDims(op_context.output))
+            GetTensorDims(op_context.output), pad_value)
 
   switch (op_context.input->type) {
     case kTfLiteFloat32:
       if (kernel_type == kReference) {
-        TF_LITE_PAD(reference_ops, float);
+        TF_LITE_PAD(reference_ops, float, 0);
       } else if (kernel_type == kGenericOptimized) {
-        TF_LITE_PAD(optimized_ops, float);
+        TF_LITE_PAD(optimized_ops, float, 0);
       }
       break;
     case kTfLiteUInt8:
+      // Quantized Pad requires that 0 is represented in the quantized range.
+      TF_LITE_ENSURE(context, op_context.output->params.zero_point >=
+                                  std::numeric_limits<uint8_t>::min());
+      TF_LITE_ENSURE(context, op_context.output->params.zero_point <=
+                                  std::numeric_limits<uint8_t>::max());
       if (kernel_type == kReference) {
-        TF_LITE_PAD(reference_ops, uint8_t);
+        TF_LITE_PAD(reference_ops, uint8_t,
+                    op_context.output->params.zero_point);
       } else if (kernel_type == kGenericOptimized) {
-        TF_LITE_PAD(optimized_ops, uint8_t);
+        TF_LITE_PAD(optimized_ops, uint8_t,
+                    op_context.output->params.zero_point);
       }
       break;
     case kTfLiteInt32:
       if (kernel_type == kReference) {
-        TF_LITE_PAD(reference_ops, int32_t);
+        TF_LITE_PAD(reference_ops, int32_t, 0);
       } else if (kernel_type == kGenericOptimized) {
-        TF_LITE_PAD(optimized_ops, int32_t);
+        TF_LITE_PAD(optimized_ops, int32_t, 0);
       }
       break;
     case kTfLiteInt64:
       if (kernel_type == kReference) {
-        TF_LITE_PAD(reference_ops, int64_t);
+        TF_LITE_PAD(reference_ops, int64_t, 0);
       } else if (kernel_type == kGenericOptimized) {
-        TF_LITE_PAD(optimized_ops, int64_t);
+        TF_LITE_PAD(optimized_ops, int64_t, 0);
       }
       break;
     default:
diff --git a/tensorflow/contrib/lite/kernels/pad_test.cc b/tensorflow/contrib/lite/kernels/pad_test.cc
index 28834ad0719291b2e868bca2d86a6685e6eb9962..c06237e5720874e66c5953edab2d3749cc88af28 100644
--- a/tensorflow/contrib/lite/kernels/pad_test.cc
+++ b/tensorflow/contrib/lite/kernels/pad_test.cc
@@ -22,6 +22,7 @@ namespace tflite {
 namespace {
 
 using ::testing::ElementsAreArray;
+using ::testing::Matcher;
 
 class PadOpModel : public SingleOpModel {
  public:
@@ -29,6 +30,10 @@ class PadOpModel : public SingleOpModel {
     PopulateTensor<float>(input_, data);
   }
 
+  void SetQuantizedInput(std::initializer_list<float> data) {
+    QuantizeAndPopulate<uint8_t>(input_, data);
+  }
+
   void SetPaddings(std::initializer_list<int> paddings) {
     PopulateTensor<int>(paddings_, paddings);
   }
@@ -36,6 +41,11 @@ class PadOpModel : public SingleOpModel {
   std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
   std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
 
+  std::vector<float> GetDequantizedOutput() {
+    return Dequantize<uint8_t>(ExtractVector<uint8_t>(output_),
+                               GetScale(output_), GetZeroPoint(output_));
+  }
+
  protected:
   int input_;
   int output_;
@@ -50,16 +60,17 @@ class PadOpModel : public SingleOpModel {
 //    m.Invoke();
 class PadOpConstModel : public PadOpModel {
  public:
-  PadOpConstModel(std::initializer_list<int> input_shape,
+  PadOpConstModel(const TensorData& input,
                   std::initializer_list<int> paddings_shape,
-                  std::initializer_list<int> paddings) {
-    input_ = AddInput(TensorType_FLOAT32);
+                  std::initializer_list<int> paddings,
+                  const TensorData& output) {
+    input_ = AddInput(input);
     paddings_ = AddConstInput(TensorType_INT32, paddings, paddings_shape);
-    output_ = AddOutput(TensorType_FLOAT32);
+    output_ = AddOutput(output);
 
     SetBuiltinOp(BuiltinOperator_PAD, BuiltinOptions_PadOptions,
                  CreatePadOptions(builder_).Union());
-    BuildInterpreter({input_shape});
+    BuildInterpreter({input.shape});
   }
 };
 
@@ -72,40 +83,45 @@ class PadOpConstModel : public PadOpModel {
 //    m.Invoke();
 class PadOpDynamicModel : public PadOpModel {
  public:
-  PadOpDynamicModel(std::initializer_list<int> input_shape,
-                    std::initializer_list<int> paddings_shape) {
-    input_ = AddInput(TensorType_FLOAT32);
+  PadOpDynamicModel(const TensorData& input,
+                    std::initializer_list<int> paddings_shape,
+                    const TensorData& output) {
+    input_ = AddInput(input);
     paddings_ = AddInput(TensorType_INT32);
-    output_ = AddOutput(TensorType_FLOAT32);
+    output_ = AddOutput(output);
 
     SetBuiltinOp(BuiltinOperator_PAD, BuiltinOptions_PadOptions,
                  CreatePadOptions(builder_).Union());
-    BuildInterpreter({input_shape, paddings_shape});
+    BuildInterpreter({input.shape, paddings_shape});
   }
 };
 
 TEST(PadOpTest, TooManyDimensions) {
   EXPECT_DEATH(
-      PadOpConstModel({1, 2, 3, 4, 5, 6, 7, 8, 9}, {9, 2},
-                      {1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9}),
+      PadOpConstModel({TensorType_FLOAT32, {1, 2, 3, 4, 5, 6, 7, 8, 9}}, {9, 2},
+                      {1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9},
+                      {TensorType_FLOAT32}),
       "dims != 4");
 }
 
 TEST(PadOpTest, UnequalDimensions) {
-  EXPECT_DEATH(PadOpConstModel({1, 1, 2, 1}, {3, 2}, {1, 1, 2, 2, 3, 3}),
+  EXPECT_DEATH(PadOpConstModel({TensorType_FLOAT32, {1, 1, 2, 1}}, {3, 2},
+                               {1, 1, 2, 2, 3, 3}, {TensorType_FLOAT32}),
                "3 != 4");
 }
 
 TEST(PadOpTest, InvalidPadValue) {
   EXPECT_DEATH(
-      PadOpConstModel({1, 1, 2, 1}, {4, 2}, {0, 0, 1, -1, 2, -1, 0, 0}),
+      PadOpConstModel({TensorType_FLOAT32, {1, 1, 2, 1}}, {4, 2},
+                      {0, 0, 1, -1, 2, -1, 0, 0}, {TensorType_FLOAT32}),
       "Pad value has to be greater than equal to 0.");
 }
 
 TEST(PadOpTest, SimpleConstTest) {
   // Padding is represented as four 2-D lists representing above padding and
   // below padding (i.e. {{0, 0}, {1, 1}, {1, 1}, {0, 0}}).
-  PadOpConstModel m({1, 2, 2, 1}, {4, 2}, {0, 0, 1, 1, 1, 1, 0, 0});
+  PadOpConstModel m({TensorType_FLOAT32, {1, 2, 2, 1}}, {4, 2},
+                    {0, 0, 1, 1, 1, 1, 0, 0}, {TensorType_FLOAT32});
   m.SetInput({1, 2, 3, 4});
   m.Invoke();
   EXPECT_THAT(m.GetOutput(), ElementsAreArray({0, 0, 0, 0, 0, 1, 2, 0, 0, 3, 4,
@@ -114,7 +130,8 @@ TEST(PadOpTest, SimpleConstTest) {
 }
 
 TEST(PadOpTest, SimpleDynamicTest) {
-  PadOpDynamicModel m({1, 2, 2, 1}, {4, 2});
+  PadOpDynamicModel m({TensorType_FLOAT32, {1, 2, 2, 1}}, {4, 2},
+                      {TensorType_FLOAT32});
   m.SetInput({1, 2, 3, 4});
   m.SetPaddings({0, 0, 1, 1, 1, 1, 0, 0});
   m.Invoke();
@@ -124,7 +141,8 @@ TEST(PadOpTest, SimpleDynamicTest) {
 }
 
 TEST(PadOpTest, AdvancedConstTest) {
-  PadOpConstModel m({1, 2, 3, 1}, {4, 2}, {0, 0, 0, 2, 1, 3, 0, 0});
+  PadOpConstModel m({TensorType_FLOAT32, {1, 2, 3, 1}}, {4, 2},
+                    {0, 0, 0, 2, 1, 3, 0, 0}, {TensorType_FLOAT32});
   m.SetInput({1, 2, 3, 4, 5, 6});
   m.Invoke();
   EXPECT_THAT(m.GetOutput(),
@@ -134,7 +152,8 @@ TEST(PadOpTest, AdvancedConstTest) {
 }
 
 TEST(PadOpTest, AdvancedDynamicTest) {
-  PadOpDynamicModel m({1, 2, 3, 1}, {4, 2});
+  PadOpDynamicModel m({TensorType_FLOAT32, {1, 2, 3, 1}}, {4, 2},
+                      {TensorType_FLOAT32});
   m.SetInput({1, 2, 3, 4, 5, 6});
   m.SetPaddings({0, 0, 0, 2, 1, 3, 0, 0});
   m.Invoke();
@@ -144,6 +163,80 @@ TEST(PadOpTest, AdvancedDynamicTest) {
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 7, 1}));
 }
 
+class QuantizedPadOpTest : public ::testing::Test {
+ protected:
+  std::vector<Matcher<float>> DequantizedArrayNear(
+      const std::vector<float>& values, const float min, const float max) {
+    const float quantization_tolerance = (max - min) / 255.0;
+    return ArrayFloatNear(values, quantization_tolerance);
+  }
+};
+
+TEST_F(QuantizedPadOpTest, ZeroNotInQuantizationRange) {
+  // The test_util and actual quantization code currently ensure that the range
+  // must include zero, but if that ever changes, this test will catch it.
+  EXPECT_DEATH(PadOpConstModel m({TensorType_UINT8, {1, 2, 2, 1}, 1.0, 2.0},
+                                 {4, 2}, {0, 0, 1, 1, 1, 1, 0, 0},
+                                 {TensorType_UINT8, {}, 1.0, 2.0}),
+               ".*Check failed: f_min <= 0.*");
+}
+
+TEST_F(QuantizedPadOpTest, SimpleConstTest) {
+  // Padding is represented as four 2-D lists representing above padding and
+  // below padding (i.e. {{0, 0}, {1, 1}, {1, 1}, {0, 0}}).
+  PadOpConstModel m({TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0}, {4, 2},
+                    {0, 0, 1, 1, 1, 1, 0, 0},
+                    {TensorType_UINT8, {}, -1.0, 1.0});
+  m.SetQuantizedInput({-0.8, 0.2, 0.9, 0.7});
+  m.Invoke();
+  EXPECT_THAT(m.GetDequantizedOutput(),
+              ElementsAreArray(DequantizedArrayNear(
+                  {0, 0, 0, 0, 0, -0.8, 0.2, 0, 0, 0.9, 0.7, 0, 0, 0, 0, 0},
+                  -1.0, 1.0)));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 4, 1}));
+}
+
+TEST_F(QuantizedPadOpTest, SimpleDynamicTest) {
+  PadOpDynamicModel m({TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0}, {4, 2},
+                      {TensorType_UINT8, {}, -1.0, 1.0});
+  m.SetQuantizedInput({-0.8, 0.2, 0.9, 0.7});
+  m.SetPaddings({0, 0, 1, 1, 1, 1, 0, 0});
+  m.Invoke();
+  EXPECT_THAT(m.GetDequantizedOutput(),
+              ElementsAreArray(DequantizedArrayNear(
+                  {0, 0, 0, 0, 0, -0.8, 0.2, 0, 0, 0.9, 0.7, 0, 0, 0, 0, 0},
+                  -1.0, 1.0)));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 4, 1}));
+}
+
+TEST_F(QuantizedPadOpTest, AdvancedConstTest) {
+  PadOpConstModel m({TensorType_UINT8, {1, 2, 3, 1}, -1.0, 1.0}, {4, 2},
+                    {0, 0, 0, 2, 1, 3, 0, 0},
+                    {TensorType_UINT8, {}, -1.0, 1.0});
+  m.SetQuantizedInput({-0.8, 0.2, 0.9, 0.7, 0.1, -0.3});
+  m.Invoke();
+  EXPECT_THAT(m.GetDequantizedOutput(),
+              ElementsAreArray(DequantizedArrayNear(
+                  {0, -0.8, 0.2, 0.9, 0, 0, 0, 0, 0.7, 0.1, -0.3, 0, 0, 0,
+                   0, 0,    0,   0,   0, 0, 0, 0, 0,   0,   0,    0, 0, 0},
+                  -1.0, 1.0)));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 7, 1}));
+}
+
+TEST_F(QuantizedPadOpTest, AdvancedDynamicTest) {
+  PadOpDynamicModel m({TensorType_UINT8, {1, 2, 3, 1}, -1.0, 1.0}, {4, 2},
+                      {TensorType_UINT8, {}, -1.0, 1.0});
+  m.SetQuantizedInput({-0.8, 0.2, 0.9, 0.7, 0.1, -0.3});
+  m.SetPaddings({0, 0, 0, 2, 1, 3, 0, 0});
+  m.Invoke();
+  EXPECT_THAT(m.GetDequantizedOutput(),
+              ElementsAreArray(DequantizedArrayNear(
+                  {0, -0.8, 0.2, 0.9, 0, 0, 0, 0, 0.7, 0.1, -0.3, 0, 0, 0,
+                   0, 0,    0,   0,   0, 0, 0, 0, 0,   0,   0,    0, 0, 0},
+                  -1.0, 1.0)));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 7, 1}));
+}
+
 }  // namespace
 }  // namespace tflite
 
diff --git a/tensorflow/contrib/lite/kernels/register.cc b/tensorflow/contrib/lite/kernels/register.cc
index 0f98154b904b1f776016e6bbee3263027f815244..67ba8d0f394770100b52f42e14d8e1472f303661 100644
--- a/tensorflow/contrib/lite/kernels/register.cc
+++ b/tensorflow/contrib/lite/kernels/register.cc
@@ -77,6 +77,8 @@ TfLiteRegistration* Register_CAST();
 TfLiteRegistration* Register_DEQUANTIZE();
 TfLiteRegistration* Register_PRELU();
 TfLiteRegistration* Register_MAXIMUM();
+TfLiteRegistration* Register_MINIMUM();
+TfLiteRegistration* Register_ARG_MAX();
 
 BuiltinOpResolver::BuiltinOpResolver() {
   AddBuiltin(BuiltinOperator_RELU, Register_RELU());
@@ -135,6 +137,8 @@ BuiltinOpResolver::BuiltinOpResolver() {
   AddBuiltin(BuiltinOperator_DEQUANTIZE, Register_DEQUANTIZE());
   AddBuiltin(BuiltinOperator_PRELU, Register_PRELU());
   AddBuiltin(BuiltinOperator_MAXIMUM, Register_MAXIMUM());
+  AddBuiltin(BuiltinOperator_MINIMUM, Register_MINIMUM());
+  AddBuiltin(BuiltinOperator_ARG_MAX, Register_ARG_MAX());
 
   // TODO(andrewharp, ahentz): Move these somewhere more appropriate so that
   // custom ops aren't always included by default.
diff --git a/tensorflow/contrib/lite/kernels/strided_slice.cc b/tensorflow/contrib/lite/kernels/strided_slice.cc
index eb374d903182f46b40f5c80bfd769a19a5594742..e6d5c300dcd47821b0572e3239b36f14bd6ea3d0 100644
--- a/tensorflow/contrib/lite/kernels/strided_slice.cc
+++ b/tensorflow/contrib/lite/kernels/strided_slice.cc
@@ -228,6 +228,11 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
         TF_LITE_STRIDED_SLICE(reference_ops, int64_t);
       }
       break;
+    case kTfLiteUInt8:
+      if (kernel_type == kReference) {
+        TF_LITE_STRIDED_SLICE(reference_ops, uint8_t);
+      }
+      break;
     default:
       context->ReportError(context,
                            "Type is currently not supported "
diff --git a/tensorflow/contrib/lite/kernels/strided_slice_test.cc b/tensorflow/contrib/lite/kernels/strided_slice_test.cc
index 5c98c5f43181fe75f35716dae5682113bde883ec..22d7b097cbd4e1349516eae9fd378aa186e06de7 100644
--- a/tensorflow/contrib/lite/kernels/strided_slice_test.cc
+++ b/tensorflow/contrib/lite/kernels/strided_slice_test.cc
@@ -24,6 +24,8 @@ namespace {
 using ::int32;
 using ::testing::ElementsAreArray;
 
+template <typename input_type = float,
+          TensorType tensor_input_type = TensorType_FLOAT32>
 class StridedSliceOpModel : public SingleOpModel {
  public:
   StridedSliceOpModel(std::initializer_list<int> input_shape,
@@ -32,11 +34,11 @@ class StridedSliceOpModel : public SingleOpModel {
                       std::initializer_list<int> strides_shape, int begin_mask,
                       int end_mask, int ellipsis_mask, int new_axis_mask,
                       int shrink_axis_mask) {
-    input_ = AddInput(TensorType_FLOAT32);
+    input_ = AddInput(tensor_input_type);
     begin_ = AddInput(TensorType_INT32);
     end_ = AddInput(TensorType_INT32);
     strides_ = AddInput(TensorType_INT32);
-    output_ = AddOutput(TensorType_FLOAT32);
+    output_ = AddOutput(tensor_input_type);
     SetBuiltinOp(
         BuiltinOperator_STRIDED_SLICE, BuiltinOptions_StridedSliceOptions,
         CreateStridedSliceOptions(builder_, begin_mask, end_mask, ellipsis_mask,
@@ -45,8 +47,8 @@ class StridedSliceOpModel : public SingleOpModel {
     BuildInterpreter({input_shape, begin_shape, end_shape, strides_shape});
   }
 
-  void SetInput(std::initializer_list<float> data) {
-    PopulateTensor<float>(input_, data);
+  void SetInput(std::initializer_list<input_type> data) {
+    PopulateTensor<input_type>(input_, data);
   }
   void SetBegin(std::initializer_list<int32> data) {
     PopulateTensor<int32>(begin_, data);
@@ -58,7 +60,9 @@ class StridedSliceOpModel : public SingleOpModel {
     PopulateTensor<int32>(strides_, data);
   }
 
-  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+  std::vector<input_type> GetOutput() {
+    return ExtractVector<input_type>(output_);
+  }
   std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
 
  private:
@@ -71,19 +75,19 @@ class StridedSliceOpModel : public SingleOpModel {
 
 TEST(StridedSliceOpTest, UnsupportedInputSize) {
   EXPECT_DEATH(
-      StridedSliceOpModel({2, 2, 2, 2, 2}, {5}, {5}, {5}, 0, 0, 0, 0, 0),
+      StridedSliceOpModel<>({2, 2, 2, 2, 2}, {5}, {5}, {5}, 0, 0, 0, 0, 0),
       "StridedSlice op only supports 1D-4D input arrays.");
 }
 
 TEST(StridedSliceOpTest, UnssupportedArgs) {
-  EXPECT_DEATH(StridedSliceOpModel({3, 2}, {2}, {2}, {2}, 0, 0, 1, 0, 0),
+  EXPECT_DEATH(StridedSliceOpModel<>({3, 2}, {2}, {2}, {2}, 0, 0, 1, 0, 0),
                "ellipsis_mask is not implemented yet.");
-  EXPECT_DEATH(StridedSliceOpModel({3, 2}, {2}, {2}, {2}, 0, 0, 0, 1, 0),
+  EXPECT_DEATH(StridedSliceOpModel<>({3, 2}, {2}, {2}, {2}, 0, 0, 0, 1, 0),
                "new_axis_mask is not implemented yet.");
 }
 
 TEST(StridedSliceOpTest, In1D) {
-  StridedSliceOpModel m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
+  StridedSliceOpModel<> m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
   m.SetInput({1, 2, 3, 4});
   m.SetBegin({1});
   m.SetEnd({3});
@@ -94,7 +98,7 @@ TEST(StridedSliceOpTest, In1D) {
 }
 
 TEST(StridedSliceOpTest, In1D_EmptyOutput) {
-  StridedSliceOpModel m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
+  StridedSliceOpModel<> m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
   m.SetInput({1, 2, 3, 4});
   m.SetBegin({10});
   m.SetEnd({3});
@@ -104,7 +108,7 @@ TEST(StridedSliceOpTest, In1D_EmptyOutput) {
 }
 
 TEST(StridedSliceOpTest, In1D_NegativeBegin) {
-  StridedSliceOpModel m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
+  StridedSliceOpModel<> m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
   m.SetInput({1, 2, 3, 4});
   m.SetBegin({-3});
   m.SetEnd({3});
@@ -115,7 +119,7 @@ TEST(StridedSliceOpTest, In1D_NegativeBegin) {
 }
 
 TEST(StridedSliceOpTest, In1D_OutOfRangeBegin) {
-  StridedSliceOpModel m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
+  StridedSliceOpModel<> m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
   m.SetInput({1, 2, 3, 4});
   m.SetBegin({-5});
   m.SetEnd({3});
@@ -126,7 +130,7 @@ TEST(StridedSliceOpTest, In1D_OutOfRangeBegin) {
 }
 
 TEST(StridedSliceOpTest, In1D_NegativeEnd) {
-  StridedSliceOpModel m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
+  StridedSliceOpModel<> m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
   m.SetInput({1, 2, 3, 4});
   m.SetBegin({1});
   m.SetEnd({-2});
@@ -137,7 +141,7 @@ TEST(StridedSliceOpTest, In1D_NegativeEnd) {
 }
 
 TEST(StridedSliceOpTest, In1D_OutOfRangeEnd) {
-  StridedSliceOpModel m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
+  StridedSliceOpModel<> m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
   m.SetInput({1, 2, 3, 4});
   m.SetBegin({-3});
   m.SetEnd({5});
@@ -148,7 +152,7 @@ TEST(StridedSliceOpTest, In1D_OutOfRangeEnd) {
 }
 
 TEST(StridedSliceOpTest, In1D_BeginMask) {
-  StridedSliceOpModel m({4}, {1}, {1}, {1}, 1, 0, 0, 0, 0);
+  StridedSliceOpModel<> m({4}, {1}, {1}, {1}, 1, 0, 0, 0, 0);
   m.SetInput({1, 2, 3, 4});
   m.SetBegin({1});
   m.SetEnd({3});
@@ -159,7 +163,7 @@ TEST(StridedSliceOpTest, In1D_BeginMask) {
 }
 
 TEST(StridedSliceOpTest, In1D_NegativeBeginNegativeStride) {
-  StridedSliceOpModel m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
+  StridedSliceOpModel<> m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
   m.SetInput({1, 2, 3, 4});
   m.SetBegin({-2});
   m.SetEnd({-3});
@@ -170,7 +174,7 @@ TEST(StridedSliceOpTest, In1D_NegativeBeginNegativeStride) {
 }
 
 TEST(StridedSliceOpTest, In1D_OutOfRangeBeginNegativeStride) {
-  StridedSliceOpModel m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
+  StridedSliceOpModel<> m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
   m.SetInput({1, 2, 3, 4});
   m.SetBegin({5});
   m.SetEnd({2});
@@ -181,7 +185,7 @@ TEST(StridedSliceOpTest, In1D_OutOfRangeBeginNegativeStride) {
 }
 
 TEST(StridedSliceOpTest, In1D_NegativeEndNegativeStride) {
-  StridedSliceOpModel m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
+  StridedSliceOpModel<> m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
   m.SetInput({1, 2, 3, 4});
   m.SetBegin({2});
   m.SetEnd({-4});
@@ -192,7 +196,7 @@ TEST(StridedSliceOpTest, In1D_NegativeEndNegativeStride) {
 }
 
 TEST(StridedSliceOpTest, In1D_OutOfRangeEndNegativeStride) {
-  StridedSliceOpModel m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
+  StridedSliceOpModel<> m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
   m.SetInput({1, 2, 3, 4});
   m.SetBegin({-3});
   m.SetEnd({-5});
@@ -203,7 +207,7 @@ TEST(StridedSliceOpTest, In1D_OutOfRangeEndNegativeStride) {
 }
 
 TEST(StridedSliceOpTest, In1D_EndMask) {
-  StridedSliceOpModel m({4}, {1}, {1}, {1}, 0, 1, 0, 0, 0);
+  StridedSliceOpModel<> m({4}, {1}, {1}, {1}, 0, 1, 0, 0, 0);
   m.SetInput({1, 2, 3, 4});
   m.SetBegin({1});
   m.SetEnd({3});
@@ -214,7 +218,7 @@ TEST(StridedSliceOpTest, In1D_EndMask) {
 }
 
 TEST(StridedSliceOpTest, In1D_NegStride) {
-  StridedSliceOpModel m({3}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
+  StridedSliceOpModel<> m({3}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
   m.SetInput({1, 2, 3});
   m.SetBegin({-1});
   m.SetEnd({-4});
@@ -225,7 +229,7 @@ TEST(StridedSliceOpTest, In1D_NegStride) {
 }
 
 TEST(StridedSliceOpTest, In1D_EvenLenStride2) {
-  StridedSliceOpModel m({2}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
+  StridedSliceOpModel<> m({2}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
   m.SetInput({1, 2});
   m.SetBegin({0});
   m.SetEnd({2});
@@ -236,7 +240,7 @@ TEST(StridedSliceOpTest, In1D_EvenLenStride2) {
 }
 
 TEST(StridedSliceOpTest, In1D_OddLenStride2) {
-  StridedSliceOpModel m({3}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
+  StridedSliceOpModel<> m({3}, {1}, {1}, {1}, 0, 0, 0, 0, 0);
   m.SetInput({1, 2, 3});
   m.SetBegin({0});
   m.SetEnd({3});
@@ -247,7 +251,7 @@ TEST(StridedSliceOpTest, In1D_OddLenStride2) {
 }
 
 TEST(StridedSliceOpTest, In2D_Identity) {
-  StridedSliceOpModel m({2, 3}, {2}, {2}, {2}, 0, 0, 0, 0, 0);
+  StridedSliceOpModel<> m({2, 3}, {2}, {2}, {2}, 0, 0, 0, 0, 0);
   m.SetInput({1, 2, 3, 4, 5, 6});
   m.SetBegin({0, 0});
   m.SetEnd({2, 3});
@@ -258,7 +262,7 @@ TEST(StridedSliceOpTest, In2D_Identity) {
 }
 
 TEST(StridedSliceOpTest, In2D) {
-  StridedSliceOpModel m({2, 3}, {2}, {2}, {2}, 0, 0, 0, 0, 0);
+  StridedSliceOpModel<> m({2, 3}, {2}, {2}, {2}, 0, 0, 0, 0, 0);
   m.SetInput({1, 2, 3, 4, 5, 6});
   m.SetBegin({1, 0});
   m.SetEnd({2, 2});
@@ -269,7 +273,7 @@ TEST(StridedSliceOpTest, In2D) {
 }
 
 TEST(StridedSliceOpTest, In2D_Stride2) {
-  StridedSliceOpModel m({2, 3}, {2}, {2}, {2}, 0, 0, 0, 0, 0);
+  StridedSliceOpModel<> m({2, 3}, {2}, {2}, {2}, 0, 0, 0, 0, 0);
   m.SetInput({1, 2, 3, 4, 5, 6});
   m.SetBegin({0, 0});
   m.SetEnd({2, 3});
@@ -280,7 +284,7 @@ TEST(StridedSliceOpTest, In2D_Stride2) {
 }
 
 TEST(StridedSliceOpTest, In2D_NegStride) {
-  StridedSliceOpModel m({2, 3}, {2}, {2}, {2}, 0, 0, 0, 0, 0);
+  StridedSliceOpModel<> m({2, 3}, {2}, {2}, {2}, 0, 0, 0, 0, 0);
   m.SetInput({1, 2, 3, 4, 5, 6});
   m.SetBegin({1, -1});
   m.SetEnd({2, -4});
@@ -291,7 +295,7 @@ TEST(StridedSliceOpTest, In2D_NegStride) {
 }
 
 TEST(StridedSliceOpTest, In2D_BeginMask) {
-  StridedSliceOpModel m({2, 3}, {2}, {2}, {2}, 1, 0, 0, 0, 0);
+  StridedSliceOpModel<> m({2, 3}, {2}, {2}, {2}, 1, 0, 0, 0, 0);
   m.SetInput({1, 2, 3, 4, 5, 6});
   m.SetBegin({1, 0});
   m.SetEnd({2, 2});
@@ -302,7 +306,7 @@ TEST(StridedSliceOpTest, In2D_BeginMask) {
 }
 
 TEST(StridedSliceOpTest, In2D_EndMask) {
-  StridedSliceOpModel m({2, 3}, {2}, {2}, {2}, 0, 2, 0, 0, 0);
+  StridedSliceOpModel<> m({2, 3}, {2}, {2}, {2}, 0, 2, 0, 0, 0);
   m.SetInput({1, 2, 3, 4, 5, 6});
   m.SetBegin({1, 0});
   m.SetEnd({2, 2});
@@ -313,7 +317,7 @@ TEST(StridedSliceOpTest, In2D_EndMask) {
 }
 
 TEST(StridedSliceOpTest, In2D_NegStrideBeginMask) {
-  StridedSliceOpModel m({2, 3}, {2}, {2}, {2}, 2, 0, 0, 0, 0);
+  StridedSliceOpModel<> m({2, 3}, {2}, {2}, {2}, 2, 0, 0, 0, 0);
   m.SetInput({1, 2, 3, 4, 5, 6});
   m.SetBegin({1, -2});
   m.SetEnd({2, -4});
@@ -324,7 +328,7 @@ TEST(StridedSliceOpTest, In2D_NegStrideBeginMask) {
 }
 
 TEST(StridedSliceOpTest, In2D_NegStrideEndMask) {
-  StridedSliceOpModel m({2, 3}, {2}, {2}, {2}, 0, 2, 0, 0, 0);
+  StridedSliceOpModel<> m({2, 3}, {2}, {2}, {2}, 0, 2, 0, 0, 0);
   m.SetInput({1, 2, 3, 4, 5, 6});
   m.SetBegin({1, -2});
   m.SetEnd({2, -3});
@@ -335,7 +339,7 @@ TEST(StridedSliceOpTest, In2D_NegStrideEndMask) {
 }
 
 TEST(StridedSliceOpTest, In3D_Identity) {
-  StridedSliceOpModel m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 0);
+  StridedSliceOpModel<> m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 0);
   m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
   m.SetBegin({0, 0, 0});
   m.SetEnd({2, 3, 2});
@@ -347,7 +351,7 @@ TEST(StridedSliceOpTest, In3D_Identity) {
 }
 
 TEST(StridedSliceOpTest, In3D_NegStride) {
-  StridedSliceOpModel m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 0);
+  StridedSliceOpModel<> m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 0);
   m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
   m.SetBegin({-1, -1, -1});
   m.SetEnd({-3, -4, -3});
@@ -359,7 +363,7 @@ TEST(StridedSliceOpTest, In3D_NegStride) {
 }
 
 TEST(StridedSliceOpTest, In3D_Strided2) {
-  StridedSliceOpModel m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 0);
+  StridedSliceOpModel<> m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 0);
   m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
   m.SetBegin({0, 0, 0});
   m.SetEnd({2, 3, 2});
@@ -370,7 +374,7 @@ TEST(StridedSliceOpTest, In3D_Strided2) {
 }
 
 TEST(StridedSliceOpTest, In1D_ShrinkAxisMask1) {
-  StridedSliceOpModel m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 1);
+  StridedSliceOpModel<> m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 1);
   m.SetInput({1, 2, 3, 4});
   m.SetBegin({1});
   m.SetEnd({3});
@@ -381,7 +385,7 @@ TEST(StridedSliceOpTest, In1D_ShrinkAxisMask1) {
 }
 
 TEST(StridedSliceOpTest, In1D_EmptyOutputShrinkAxisMask1) {
-  StridedSliceOpModel m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 1);
+  StridedSliceOpModel<> m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 1);
   m.SetInput({1, 2, 3, 4});
   m.SetBegin({2});
   m.SetEnd({1});
@@ -392,7 +396,7 @@ TEST(StridedSliceOpTest, In1D_EmptyOutputShrinkAxisMask1) {
 }
 
 TEST(StridedSliceOpTest, In1D_BeginMaskShrinkAxisMask1) {
-  StridedSliceOpModel m({4}, {1}, {1}, {1}, 1, 0, 0, 0, 1);
+  StridedSliceOpModel<> m({4}, {1}, {1}, {1}, 1, 0, 0, 0, 1);
   m.SetInput({1, 2, 3, 4});
   m.SetBegin({1});
   m.SetEnd({3});
@@ -403,7 +407,7 @@ TEST(StridedSliceOpTest, In1D_BeginMaskShrinkAxisMask1) {
 }
 
 TEST(StridedSliceOpTest, In1D_NegativeBeginNegativeStrideShrinkAxisMask1) {
-  StridedSliceOpModel m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 1);
+  StridedSliceOpModel<> m({4}, {1}, {1}, {1}, 0, 0, 0, 0, 1);
   m.SetInput({1, 2, 3, 4});
   m.SetBegin({-2});
   m.SetEnd({-3});
@@ -414,7 +418,7 @@ TEST(StridedSliceOpTest, In1D_NegativeBeginNegativeStrideShrinkAxisMask1) {
 }
 
 TEST(StridedSliceOpTest, In2D_ShrinkAxisMask1) {
-  StridedSliceOpModel m({2, 3}, {2}, {2}, {2}, 0, 0, 0, 0, 1);
+  StridedSliceOpModel<> m({2, 3}, {2}, {2}, {2}, 0, 0, 0, 0, 1);
   m.SetInput({1, 2, 3, 4, 5, 6});
   m.SetBegin({0, 0});
   m.SetEnd({2, 3});
@@ -425,7 +429,7 @@ TEST(StridedSliceOpTest, In2D_ShrinkAxisMask1) {
 }
 
 TEST(StridedSliceOpTest, In2D_ShrinkAxisMask2) {
-  StridedSliceOpModel m({2, 3}, {2}, {2}, {2}, 0, 0, 0, 0, 2);
+  StridedSliceOpModel<> m({2, 3}, {2}, {2}, {2}, 0, 0, 0, 0, 2);
   m.SetInput({1, 2, 3, 4, 5, 6});
   m.SetBegin({0, 0});
   m.SetEnd({2, 3});
@@ -436,7 +440,7 @@ TEST(StridedSliceOpTest, In2D_ShrinkAxisMask2) {
 }
 
 TEST(StridedSliceOpTest, In2D_ShrinkAxisMask3) {
-  StridedSliceOpModel m({2, 3}, {2}, {2}, {2}, 0, 0, 0, 0, 3);
+  StridedSliceOpModel<> m({2, 3}, {2}, {2}, {2}, 0, 0, 0, 0, 3);
   m.SetInput({1, 2, 3, 4, 5, 6});
   m.SetBegin({0, 0});
   m.SetEnd({2, 3});
@@ -447,7 +451,7 @@ TEST(StridedSliceOpTest, In2D_ShrinkAxisMask3) {
 }
 
 TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis1) {
-  StridedSliceOpModel m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 1);
+  StridedSliceOpModel<> m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 1);
   m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
   m.SetBegin({0, 0, 0});
   m.SetEnd({2, 3, 2});
@@ -458,7 +462,7 @@ TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis1) {
 }
 
 TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis2) {
-  StridedSliceOpModel m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 2);
+  StridedSliceOpModel<> m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 2);
   m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
   m.SetBegin({0, 0, 0});
   m.SetEnd({2, 3, 2});
@@ -469,7 +473,7 @@ TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis2) {
 }
 
 TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis3) {
-  StridedSliceOpModel m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 3);
+  StridedSliceOpModel<> m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 3);
   m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
   m.SetBegin({0, 0, 0});
   m.SetEnd({2, 3, 2});
@@ -480,7 +484,7 @@ TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis3) {
 }
 
 TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis4) {
-  StridedSliceOpModel m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 4);
+  StridedSliceOpModel<> m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 4);
   m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
   m.SetBegin({0, 0, 0});
   m.SetEnd({2, 3, 2});
@@ -491,7 +495,7 @@ TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis4) {
 }
 
 TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis5) {
-  StridedSliceOpModel m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 5);
+  StridedSliceOpModel<> m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 5);
   m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
   m.SetBegin({0, 0, 0});
   m.SetEnd({2, 3, 2});
@@ -502,7 +506,7 @@ TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis5) {
 }
 
 TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis6) {
-  StridedSliceOpModel m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 6);
+  StridedSliceOpModel<> m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 6);
   m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
   m.SetBegin({0, 0, 0});
   m.SetEnd({2, 3, 2});
@@ -513,7 +517,7 @@ TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis6) {
 }
 
 TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis7) {
-  StridedSliceOpModel m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 7);
+  StridedSliceOpModel<> m({2, 3, 2}, {3}, {3}, {3}, 0, 0, 0, 0, 7);
   m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
   m.SetBegin({0, 0, 0});
   m.SetEnd({2, 3, 2});
@@ -525,7 +529,7 @@ TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis7) {
 
 // This tests catches a very subtle bug that was fixed by cl/188403234.
 TEST(StridedSliceOpTest, RunTwice) {
-  StridedSliceOpModel m({2, 3}, {2}, {2}, {2}, 1, 0, 0, 0, 0);
+  StridedSliceOpModel<> m({2, 3}, {2}, {2}, {2}, 1, 0, 0, 0, 0);
 
   auto setup_inputs = [&m]() {
     m.SetInput({1, 2, 3, 4, 5, 6});
@@ -544,6 +548,17 @@ TEST(StridedSliceOpTest, RunTwice) {
   EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 2, 4, 5}));
 }
 
+TEST(StridedSliceOpTest, In3D_IdentityShrinkAxis1Uint8) {
+  StridedSliceOpModel<uint8, TensorType_UINT8> m({2, 3, 2}, {3}, {3}, {3}, 0, 0,
+                                                 0, 0, 1);
+  m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
+  m.SetBegin({0, 0, 0});
+  m.SetEnd({2, 3, 2});
+  m.SetStrides({1, 1, 1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3, 2}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 2, 3, 4, 5, 6}));
+}
 }  // namespace
 }  // namespace tflite
 
diff --git a/tensorflow/contrib/lite/model.cc b/tensorflow/contrib/lite/model.cc
index 791d1378f393594ceb6f1fcec7cc5aadaa81dab3..87af9530614c5ac1bdc30e6919d527632be7a8a3 100644
--- a/tensorflow/contrib/lite/model.cc
+++ b/tensorflow/contrib/lite/model.cc
@@ -30,8 +30,41 @@ limitations under the License.
 
 namespace tflite {
 
+namespace {
+// Ensure that ErrorReporter is non-null.
+ErrorReporter* ValidateErrorReporter(ErrorReporter* e) {
+  return e ? e : DefaultErrorReporter();
+}
+}  // namespace
+
 const char* kEmptyTensorName = "";
 
+TfLiteStatus ConvertTensorType(TensorType tensor_type, TfLiteType* type,
+                               ErrorReporter* error_reporter) {
+  switch (tensor_type) {
+    case TensorType_FLOAT32:
+      *type = kTfLiteFloat32;
+      break;
+    case TensorType_INT32:
+      *type = kTfLiteInt32;
+      break;
+    case TensorType_UINT8:
+      *type = kTfLiteUInt8;
+      break;
+    case TensorType_INT64:
+      *type = kTfLiteInt64;
+      break;
+    case TensorType_STRING:
+      *type = kTfLiteString;
+      break;
+    default:
+      error_reporter->Report("Unimplemented data type %s (%d) in tensor\n",
+                             EnumNameTensorType(tensor_type), tensor_type);
+      return kTfLiteError;
+  }
+  return kTfLiteOk;
+}
+
 // Loads a model from `filename`. If `mmap_file` is true then use mmap,
 // otherwise make a copy of the model in a buffer.
 std::unique_ptr<Allocation> GetAllocationFromFile(const char* filename,
@@ -52,6 +85,8 @@ std::unique_ptr<Allocation> GetAllocationFromFile(const char* filename,
 
 std::unique_ptr<FlatBufferModel> FlatBufferModel::BuildFromFile(
     const char* filename, ErrorReporter* error_reporter) {
+  error_reporter = ValidateErrorReporter(error_reporter);
+
   std::unique_ptr<FlatBufferModel> model;
   auto allocation = GetAllocationFromFile(filename, /*mmap_file=*/true,
                                           error_reporter, /*use_nnapi=*/true);
@@ -63,6 +98,8 @@ std::unique_ptr<FlatBufferModel> FlatBufferModel::BuildFromFile(
 std::unique_ptr<FlatBufferModel> FlatBufferModel::VerifyAndBuildFromFile(
     const char* filename, TfLiteVerifier* verifier,
     ErrorReporter* error_reporter) {
+  error_reporter = ValidateErrorReporter(error_reporter);
+
   std::unique_ptr<FlatBufferModel> model;
   auto allocation = GetAllocationFromFile(filename, /*mmap_file=*/true,
                                           error_reporter, /*use_nnapi=*/true);
@@ -78,6 +115,8 @@ std::unique_ptr<FlatBufferModel> FlatBufferModel::VerifyAndBuildFromFile(
 
 std::unique_ptr<FlatBufferModel> FlatBufferModel::BuildFromBuffer(
     const char* buffer, size_t buffer_size, ErrorReporter* error_reporter) {
+  error_reporter = ValidateErrorReporter(error_reporter);
+
   std::unique_ptr<FlatBufferModel> model;
   Allocation* allocation =
       new MemoryAllocation(buffer, buffer_size, error_reporter);
@@ -88,6 +127,8 @@ std::unique_ptr<FlatBufferModel> FlatBufferModel::BuildFromBuffer(
 
 std::unique_ptr<FlatBufferModel> FlatBufferModel::BuildFromModel(
     const tflite::Model* model_spec, ErrorReporter* error_reporter) {
+  error_reporter = ValidateErrorReporter(error_reporter);
+
   std::unique_ptr<FlatBufferModel> model;
   model.reset(new FlatBufferModel(model_spec, error_reporter));
   if (!model->initialized()) model.reset();
@@ -107,15 +148,13 @@ bool FlatBufferModel::CheckModelIdentifier() const {
 
 FlatBufferModel::FlatBufferModel(const Model* model,
                                  ErrorReporter* error_reporter)
-    : error_reporter_(error_reporter ? error_reporter
-                                     : DefaultErrorReporter()) {
+    : error_reporter_(ValidateErrorReporter(error_reporter)) {
   model_ = model;
 }
 
 FlatBufferModel::FlatBufferModel(Allocation* allocation,
                                  ErrorReporter* error_reporter)
-    : error_reporter_(error_reporter ? error_reporter
-                                     : DefaultErrorReporter()) {
+    : error_reporter_(ValidateErrorReporter(error_reporter)) {
   allocation_ = allocation;
   if (!allocation_->valid() || !CheckModelIdentifier()) return;
 
@@ -128,7 +167,7 @@ InterpreterBuilder::InterpreterBuilder(const FlatBufferModel& model,
                                        const OpResolver& op_resolver)
     : model_(model.GetModel()),
       op_resolver_(op_resolver),
-      error_reporter_(model.error_reporter()),
+      error_reporter_(ValidateErrorReporter(model.error_reporter())),
       allocation_(model.allocation()) {}
 
 InterpreterBuilder::InterpreterBuilder(const ::tflite::Model* model,
@@ -136,8 +175,7 @@ InterpreterBuilder::InterpreterBuilder(const ::tflite::Model* model,
                                        ErrorReporter* error_reporter)
     : model_(model),
       op_resolver_(op_resolver),
-      error_reporter_(error_reporter ? error_reporter
-                                     : DefaultErrorReporter()) {}
+      error_reporter_(ValidateErrorReporter(error_reporter)) {}
 
 TfLiteStatus InterpreterBuilder::BuildLocalIndexToRegistrationMapping() {
   TfLiteStatus status = kTfLiteOk;
@@ -223,13 +261,11 @@ T* MallocPOD() {
 // Parse the appropriate data out of the op.
 //
 // This handles builtin data explicitly as there are flatbuffer schemas.
-//
-// Returns memory that must be feed.
-//
-// TODO(nupurgarg): Pass in void ** and return TfLiteStatus to ensure program
-// crashes if error reporter is called.
-void* ParseOpData(const Operator* op, BuiltinOperator op_type,
-                  ErrorReporter* error_reporter) {
+// If it returns kTfLiteOk, it passes the data out with `builtin_data`, which
+// need to be released by calling `free`.`
+// If it returns kTfLiteError, `builtin_data` will be `nullptr`.
+TfLiteStatus ParseOpData(const Operator* op, BuiltinOperator op_type,
+                         ErrorReporter* error_reporter, void** builtin_data) {
   auto parse_padding = [](Padding padding) {
     switch (padding) {
       case Padding_SAME:
@@ -278,7 +314,7 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
     }
   };
 
-  void* builtin_data = nullptr;
+  *builtin_data = nullptr;
   switch (op_type) {
     case BuiltinOperator_CALL:
       // TODO(aselle): Implement call in BuiltinOptions, but nullptrs are
@@ -295,7 +331,7 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
         params->activation =
             parse_activation(conv_params->fused_activation_function());
       }
-      builtin_data = reinterpret_cast<void*>(params);
+      *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_TANH:
@@ -307,17 +343,33 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
     case BuiltinOperator_EXP:
     case BuiltinOperator_TOPK_V2:
     case BuiltinOperator_LOG_SOFTMAX:
-    case BuiltinOperator_CAST:
     case BuiltinOperator_DEQUANTIZE:
     case BuiltinOperator_PRELU:
       break;
+    case BuiltinOperator_CAST: {
+      TfLiteCastParams* params = MallocPOD<TfLiteCastParams>();
+      if (auto* schema_params = op->builtin_options_as_CastOptions()) {
+        auto in_status =
+            ConvertTensorType(schema_params->in_data_type(),
+                              &params->in_data_type, error_reporter);
+        auto out_status =
+            ConvertTensorType(schema_params->out_data_type(),
+                              &params->out_data_type, error_reporter);
+        if (in_status != kTfLiteOk || out_status != kTfLiteOk) {
+          free(params);
+          return kTfLiteError;
+        }
+      }
+      *builtin_data = reinterpret_cast<void*>(params);
+      break;
+    }
     case BuiltinOperator_LSH_PROJECTION: {
       TfLiteLSHProjectionParams* params =
           MallocPOD<TfLiteLSHProjectionParams>();
       if (auto* lshParams = op->builtin_options_as_LSHProjectionOptions()) {
         params->type = parseLSHProjectionType(lshParams->type());
       }
-      builtin_data = reinterpret_cast<void*>(params);
+      *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_AVERAGE_POOL_2D:
@@ -333,7 +385,7 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
         params->activation =
             parse_activation(pool_params->fused_activation_function());
       }
-      builtin_data = reinterpret_cast<void*>(params);
+      *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_DEPTHWISE_CONV_2D: {
@@ -347,7 +399,7 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
         params->activation =
             parse_activation(conv_params->fused_activation_function());
       }
-      builtin_data = reinterpret_cast<void*>(params);
+      *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_SVDF: {
@@ -357,7 +409,7 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
         params->activation =
             parse_activation(svdf_params->fused_activation_function());
       }
-      builtin_data = reinterpret_cast<void*>(params);
+      *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_BIDIRECTIONAL_SEQUENCE_RNN:
@@ -369,7 +421,7 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
             parse_activation(sequence_rnn_params->fused_activation_function());
         params->time_major = sequence_rnn_params->time_major();
       }
-      builtin_data = reinterpret_cast<void*>(params);
+      *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_RNN: {
@@ -378,7 +430,7 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
         params->activation =
             parse_activation(rnn_params->fused_activation_function());
       }
-      builtin_data = reinterpret_cast<void*>(params);
+      *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_EMBEDDING_LOOKUP:
@@ -391,7 +443,7 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
               op->builtin_options_as_EmbeddingLookupSparseOptions()) {
         params->combiner = parseCombinerType(embedding_params->combiner());
       }
-      builtin_data = reinterpret_cast<void*>(params);
+      *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_FULLY_CONNECTED: {
@@ -402,7 +454,7 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
         params->activation = parse_activation(
             fully_connected_params->fused_activation_function());
       }
-      builtin_data = reinterpret_cast<void*>(params);
+      *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_HASHTABLE_LOOKUP:
@@ -413,7 +465,7 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
       if (auto* softmax_params = op->builtin_options_as_SoftmaxOptions()) {
         params->beta = softmax_params->beta();
       }
-      builtin_data = reinterpret_cast<void*>(params);
+      *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_CONCATENATION: {
@@ -425,7 +477,7 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
             parse_activation(concatenation_params->fused_activation_function());
         params->axis = concatenation_params->axis();
       }
-      builtin_data = reinterpret_cast<void*>(params);
+      *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_MUL: {
@@ -434,7 +486,7 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
         params->activation =
             parse_activation(schema_params->fused_activation_function());
       }
-      builtin_data = reinterpret_cast<void*>(params);
+      *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_ADD: {
@@ -443,7 +495,7 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
         params->activation =
             parse_activation(schema_params->fused_activation_function());
       }
-      builtin_data = reinterpret_cast<void*>(params);
+      *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_DIV: {
@@ -452,7 +504,7 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
         params->activation =
             parse_activation(schema_params->fused_activation_function());
       }
-      builtin_data = reinterpret_cast<void*>(params);
+      *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_SUB: {
@@ -461,7 +513,7 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
         params->activation =
             parse_activation(schema_params->fused_activation_function());
       }
-      builtin_data = reinterpret_cast<void*>(params);
+      *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_L2_NORMALIZATION: {
@@ -470,7 +522,7 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
         params->activation =
             parse_activation(schema_params->fused_activation_function());
       }
-      builtin_data = reinterpret_cast<void*>(params);
+      *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_LOCAL_RESPONSE_NORMALIZATION: {
@@ -482,7 +534,7 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
         params->alpha = schema_params->alpha();
         params->beta = schema_params->beta();
       }
-      builtin_data = reinterpret_cast<void*>(params);
+      *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_BIDIRECTIONAL_SEQUENCE_LSTM:
@@ -495,7 +547,7 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
         params->cell_clip = lstm_params->cell_clip();
         params->proj_clip = lstm_params->proj_clip();
       }
-      builtin_data = reinterpret_cast<void*>(params);
+      *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_RESIZE_BILINEAR: {
@@ -504,7 +556,7 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
               op->builtin_options_as_ResizeBilinearOptions()) {
         params->align_corners = schema_params->align_corners();
       }
-      builtin_data = reinterpret_cast<void*>(params);
+      *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_PAD: {
@@ -518,7 +570,7 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
                                    params->shape, error_reporter);
         params->num_dimensions = new_shape->Length();
       }
-      builtin_data = reinterpret_cast<void*>(params);
+      *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_SKIP_GRAM: {
@@ -528,7 +580,7 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
         params->max_skip_size = skip_gram_params->max_skip_size();
         params->include_all_ngrams = skip_gram_params->include_all_ngrams();
       }
-      builtin_data = reinterpret_cast<void*>(params);
+      *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_SPACE_TO_DEPTH: {
@@ -536,7 +588,7 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
       if (auto* schema_params = op->builtin_options_as_SpaceToDepthOptions()) {
         params->block_size = schema_params->block_size();
       }
-      builtin_data = reinterpret_cast<void*>(params);
+      *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_GATHER: {
@@ -546,7 +598,7 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
         params->axis = gather_params->axis();
       }
 
-      builtin_data = reinterpret_cast<void*>(params);
+      *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_SPACE_TO_BATCH_ND: {
@@ -563,7 +615,7 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
       if (auto* schema_params = op->builtin_options_as_MeanOptions()) {
         params->keep_dims = schema_params->keep_dims();
       }
-      builtin_data = reinterpret_cast<void*>(params);
+      *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_SPLIT: {
@@ -571,7 +623,7 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
       if (auto* schema_params = op->builtin_options_as_SplitOptions()) {
         params->num_splits = schema_params->num_splits();
       }
-      builtin_data = reinterpret_cast<void*>(params);
+      *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_SQUEEZE: {
@@ -582,7 +634,7 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
                                    params->squeeze_dims, error_reporter);
         params->num_squeeze_dims = squeeze_dims->Length();
       }
-      builtin_data = reinterpret_cast<void*>(params);
+      *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_STRIDED_SLICE: {
@@ -594,19 +646,29 @@ void* ParseOpData(const Operator* op, BuiltinOperator op_type,
         params->new_axis_mask = schema_params->new_axis_mask();
         params->shrink_axis_mask = schema_params->shrink_axis_mask();
       }
-      builtin_data = reinterpret_cast<void*>(params);
+      *builtin_data = reinterpret_cast<void*>(params);
+      break;
+    }
+    case BuiltinOperator_MAXIMUM:
+    case BuiltinOperator_MINIMUM: {
       break;
     }
-    case BuiltinOperator_MAXIMUM: {
+    case BuiltinOperator_ARG_MAX: {
+      auto* params = MallocPOD<TfLiteArgMaxParams>();
+      if (auto* schema_params = op->builtin_options_as_ArgMaxOptions()) {
+        ConvertTensorType(schema_params->output_type(), &params->output_type,
+                          error_reporter);
+      }
+      *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
     case BuiltinOperator_DELEGATE: {
       // TODO(ycling): Revisit when supporting saving delegated models.
       error_reporter->Report("DELEGATE op shouldn't exist in model.");
-      break;
+      return kTfLiteError;
     }
   }
-  return builtin_data;
+  return kTfLiteOk;
 }
 
 }  // namespace
@@ -646,10 +708,13 @@ TfLiteStatus InterpreterBuilder::ParseNodes(
           reinterpret_cast<const char*>(op->custom_options()->data()),
           op->custom_options()->size(), nullptr, reg);
     } else {
+      void* builtin_data = nullptr;
+      TF_LITE_ENSURE_STATUS(
+          ParseOpData(op, op_type, error_reporter_, &builtin_data));
       interpreter->AddNodeWithParameters(
           FlatBufferIntArrayToVector(op->inputs()),
-          FlatBufferIntArrayToVector(op->outputs()), nullptr, 0,
-          ParseOpData(op, op_type, error_reporter_), reg);
+          FlatBufferIntArrayToVector(op->outputs()), nullptr, 0, builtin_data,
+          reg);
     }
   }
 
@@ -707,29 +772,10 @@ TfLiteStatus InterpreterBuilder::ParseTensors(
     }
 
     TfLiteType type;
-    switch (tensor->type()) {
-      case TensorType_FLOAT32:
-        type = kTfLiteFloat32;
-        break;
-      case TensorType_INT32:
-        type = kTfLiteInt32;
-        break;
-      case TensorType_UINT8:
-        type = kTfLiteUInt8;
-        break;
-      case TensorType_INT64:
-        type = kTfLiteInt64;
-        break;
-      case TensorType_STRING:
-        type = kTfLiteString;
-        break;
-      default:
-        // tensorType = ArrayType::NONE;
-        error_reporter_->Report("Unimplemented data type %s (%d) in tensor\n",
-                                EnumNameTensorType(tensor->type()),
-                                tensor->type());
-        status = kTfLiteError;
-        continue;
+    if (ConvertTensorType(tensor->type(), &type, error_reporter_) !=
+        kTfLiteOk) {
+      status = kTfLiteError;
+      continue;
     }
     auto get_readonly_data = [&](const char** buffer_data,
                                  size_t* buffer_size) {
diff --git a/tensorflow/contrib/lite/model.h b/tensorflow/contrib/lite/model.h
index 036dc46e03f565c40791aee55d4158cef5c832e0..5a55b031a8c28085e02782608eb820a3cfe78dde 100644
--- a/tensorflow/contrib/lite/model.h
+++ b/tensorflow/contrib/lite/model.h
@@ -56,27 +56,37 @@ class TfLiteVerifier {
 // or mmapped. This uses flatbuffers as the serialization format.
 class FlatBufferModel {
  public:
-  // Builds a model based on a file. Returns a nullptr in case of failure.
+  // Builds a model based on a file.
+  // Caller retains ownership of `error_reporter` and must ensure its lifetime
+  // is longer than the FlatBufferModel instance.
+  // Returns a nullptr in case of failure.
   static std::unique_ptr<FlatBufferModel> BuildFromFile(
       const char* filename,
       ErrorReporter* error_reporter = DefaultErrorReporter());
 
   // Verifies whether the content of the file is legit, then builds a model
-  // based on the file. Returns a nullptr in case of failure.
+  // based on the file.
+  // Caller retains ownership of `error_reporter` and must ensure its lifetime
+  // is longer than the FlatBufferModel instance.
+  // Returns a nullptr in case of failure.
   static std::unique_ptr<FlatBufferModel> VerifyAndBuildFromFile(
       const char* filename, TfLiteVerifier* verifier = nullptr,
       ErrorReporter* error_reporter = DefaultErrorReporter());
 
   // Builds a model based on a pre-loaded flatbuffer. The caller retains
   // ownership of the buffer and should keep it alive until the returned object
-  // is destroyed. Returns a nullptr in case of failure.
+  // is destroyed. Caller retains ownership of `error_reporter` and must ensure
+  // its lifetime is longer than the FlatBufferModel instance.
+  // Returns a nullptr in case of failure.
   static std::unique_ptr<FlatBufferModel> BuildFromBuffer(
       const char* buffer, size_t buffer_size,
       ErrorReporter* error_reporter = DefaultErrorReporter());
 
   // Builds a model directly from a flatbuffer pointer. The caller retains
   // ownership of the buffer and should keep it alive until the returned object
-  // is destroyed. Returns a nullptr in case of failure.
+  // is destroyed. Caller retains ownership of `error_reporter` and must ensure
+  // its lifetime is longer than the FlatBufferModel instance.
+  // Returns a nullptr in case of failure.
   static std::unique_ptr<FlatBufferModel> BuildFromModel(
       const tflite::Model* model_spec,
       ErrorReporter* error_reporter = DefaultErrorReporter());
@@ -100,7 +110,10 @@ class FlatBufferModel {
 
  private:
   // Loads a model from a given allocation. FlatBufferModel will take over the
-  // ownership of `allocation`, and delete it in desctructor.
+  // ownership of `allocation`, and delete it in destructor. The ownership of
+  // `error_reporter`remains with the caller and must have lifetime at least
+  // as much as FlatBufferModel. This is to allow multiple models to use the
+  // same ErrorReporter instance.
   FlatBufferModel(Allocation* allocation,
                   ErrorReporter* error_reporter = DefaultErrorReporter());
 
@@ -111,7 +124,10 @@ class FlatBufferModel {
   // Flatbuffer traverser pointer. (Model* is a pointer that is within the
   // allocated memory of the data allocated by allocation's internals.
   const tflite::Model* model_ = nullptr;
+  // The error reporter to use for model errors and subsequent errors when
+  // the interpreter is created
   ErrorReporter* error_reporter_;
+  // The allocator used for holding memory of the model.
   Allocation* allocation_ = nullptr;
 };
 
diff --git a/tensorflow/contrib/lite/models/BUILD b/tensorflow/contrib/lite/models/BUILD
index 6a1255b586ef04b80159156a78f0c4569a4661c5..efa47b06fa7f06cc6312535713ec582af4705d85 100644
--- a/tensorflow/contrib/lite/models/BUILD
+++ b/tensorflow/contrib/lite/models/BUILD
@@ -12,15 +12,3 @@ load("//tensorflow/contrib/lite:build_def.bzl", "tflite_copts")
 exports_files(glob([
     "testdata/*",
 ]))
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/lite/models/smartreply/BUILD b/tensorflow/contrib/lite/models/smartreply/BUILD
index 733c3f4c7fa0605f24a1e6b4c458e34310c079c4..a82d1f2eb673b9b7211581f5a9f9febc140d4d1e 100644
--- a/tensorflow/contrib/lite/models/smartreply/BUILD
+++ b/tensorflow/contrib/lite/models/smartreply/BUILD
@@ -86,15 +86,3 @@ cc_test(
         "@com_google_googletest//:gtest",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/lite/nnapi/BUILD b/tensorflow/contrib/lite/nnapi/BUILD
index 402f1e949b7bb576de4970a8ebb41541fcee1cb2..467a2b7a7bc9a40135428240585cd2c2a133cf9f 100644
--- a/tensorflow/contrib/lite/nnapi/BUILD
+++ b/tensorflow/contrib/lite/nnapi/BUILD
@@ -11,15 +11,3 @@ cc_library(
     ],
     linkopts = ["-ldl"],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h b/tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h
index bd49d327c995ef53dc6cf9f8301ab749c925b2c7..85aca3687402a89b557d76ab5ace80dea8f8b23d 100644
--- a/tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h
+++ b/tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h
@@ -22,7 +22,7 @@ limitations under the License.
 
 // helpers
 
-#define NNAPI_LOG(format, ...) printf(format "\n", __VA_ARGS__);
+#define NNAPI_LOG(format, ...) fprintf(stderr, format "\n", __VA_ARGS__);
 #define LOAD_FUNCTION(name) \
   static name##_fn fn = reinterpret_cast<name##_fn>(loadFunction(#name));
 #define EXECUTE_FUNCTION(...) \
diff --git a/tensorflow/contrib/lite/nnapi_delegate.cc b/tensorflow/contrib/lite/nnapi_delegate.cc
index decaf9f160ad35b66f0ed56d0840634c610e4246..08fb82076739d58b77cbe83f89a202e81022eeea 100644
--- a/tensorflow/contrib/lite/nnapi_delegate.cc
+++ b/tensorflow/contrib/lite/nnapi_delegate.cc
@@ -162,7 +162,7 @@ void AddOpsAndParams(tflite::Interpreter* interpreter,
     };
 
     auto duplicate_state_tensor_float32 =
-        [interpreter, &nn_model, &augmented_inputs, &next_id](int tensor_id) {
+        [interpreter, &nn_model, &augmented_inputs](int tensor_id) {
           const TfLiteTensor* tensor = interpreter->tensor(tensor_id);
           CHECK_NN(ANeuralNetworksModel_setOperandValue(
               nn_model, tensor_id, tensor->data.raw, tensor->bytes));
@@ -351,6 +351,8 @@ void AddOpsAndParams(tflite::Interpreter* interpreter,
       case tflite::BuiltinOperator_CAST:
       case tflite::BuiltinOperator_PRELU:
       case tflite::BuiltinOperator_MAXIMUM:
+      case tflite::BuiltinOperator_MINIMUM:
+      case tflite::BuiltinOperator_ARG_MAX:
         FATAL("Op code %d is currently not delegated to NNAPI", builtin);
         nn_op_type = -1;  // set to invalid
         break;
diff --git a/tensorflow/contrib/lite/python/BUILD b/tensorflow/contrib/lite/python/BUILD
index 411d5c0d272c07b710fe987d25a79f2614bbab4e..6fafaf07273c58cfac264964e807e5ec466608ff 100644
--- a/tensorflow/contrib/lite/python/BUILD
+++ b/tensorflow/contrib/lite/python/BUILD
@@ -101,12 +101,18 @@ py_test(
     name = "convert_saved_model_test",
     srcs = ["convert_saved_model_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["no_windows"],
     visibility = ["//visibility:public"],
     deps = [
         ":convert_saved_model",
         "//tensorflow/python:client_testlib",
+        "//tensorflow/python:layers",
+        "//tensorflow/python:nn",
         "//tensorflow/python:platform_test",
         "//tensorflow/python:session",
+        "//tensorflow/python/estimator:estimator_py",
+        "//tensorflow/python/keras",
+        "//tensorflow/python/ops/losses",
         "//tensorflow/python/saved_model",
     ],
 )
@@ -118,15 +124,3 @@ py_library(
         ":convert_saved_model",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/lite/python/convert_saved_model_test.py b/tensorflow/contrib/lite/python/convert_saved_model_test.py
index d87fbeb91cc3d2779c0ae01aff488f88bd340c1c..734e42d619bdb79de0306a94e304ce46065d14d4 100644
--- a/tensorflow/contrib/lite/python/convert_saved_model_test.py
+++ b/tensorflow/contrib/lite/python/convert_saved_model_test.py
@@ -25,21 +25,21 @@ from __future__ import print_function
 
 import os
 from tensorflow.contrib.lite.python import convert_saved_model
-from tensorflow.python import estimator
 from tensorflow.python import keras
-from tensorflow.python import layers
-from tensorflow.python import losses
-from tensorflow.python import nn
-from tensorflow.python import saved_model
-from tensorflow.python import train
 from tensorflow.python.client import session
+from tensorflow.python.estimator import estimator_lib as estimator
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
+from tensorflow.python.layers import layers
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn
 from tensorflow.python.ops import random_ops
+from tensorflow.python.ops.losses import losses
 from tensorflow.python.platform import test
+from tensorflow.python.saved_model import saved_model
+from tensorflow.python.training import training as train
 
 
 class ConvertSavedModelTestBasicGraph(test_util.TensorFlowTestCase):
diff --git a/tensorflow/contrib/lite/python/interpreter.py b/tensorflow/contrib/lite/python/interpreter.py
index b8638007f7e49737726d9939a00e8cb1d6a41281..cb9c0d3121895595ffce91e254bea3f527714809 100644
--- a/tensorflow/contrib/lite/python/interpreter.py
+++ b/tensorflow/contrib/lite/python/interpreter.py
@@ -121,8 +121,8 @@ class Interpreter(object):
     Raises:
       ValueError: If the interpreter could not resize the input tensor.
     """
-    if not self.ResizeInputTensor.SetTensor(input_index, tensor_size):
-      raise ValueError('Failed to set input')
+    if not self._interpreter.ResizeInputTensor(input_index, tensor_size):
+      raise ValueError('Failed to resize input')
 
   def get_output_details(self):
     """Gets model output details.
diff --git a/tensorflow/contrib/lite/python/interpreter_test.py b/tensorflow/contrib/lite/python/interpreter_test.py
index cd2386f5263f24e1e034015ec6880e71f0608c7c..f802edf020db8a9d4e7bb890aadaae7e34e983a8 100644
--- a/tensorflow/contrib/lite/python/interpreter_test.py
+++ b/tensorflow/contrib/lite/python/interpreter_test.py
@@ -81,6 +81,9 @@ class InterpreterTest(test_util.TensorFlowTestCase):
 
     test_input = np.array([[1, 2, 3, 4]], dtype=np.uint8)
     expected_output = np.array([[4, 3, 2, 1]], dtype=np.uint8)
+    interpreter.resize_tensor_input(input_details[0]['index'],
+                                    np.array(test_input.shape, dtype=np.int32))
+    interpreter.allocate_tensors()
     interpreter.set_tensor(input_details[0]['index'], test_input)
     interpreter.invoke()
 
diff --git a/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.cc b/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.cc
index 35ad226b78c906f0819afd5b029a1a0d438d69af..4b349693563e2bc8e73062ed1d8fad9cbb56e9c4 100644
--- a/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.cc
+++ b/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.cc
@@ -186,7 +186,7 @@ bool InterpreterWrapper::ResizeInputTensor(int i, PyObject* value) {
   std::vector<int> dims(PyArray_SHAPE(array)[0]);
   memcpy(dims.data(), PyArray_BYTES(array), dims.size() * sizeof(int));
 
-  return interpreter_->ResizeInputTensor(i, dims);
+  return (interpreter_->ResizeInputTensor(i, dims) == kTfLiteOk);
 }
 
 std::string InterpreterWrapper::TensorName(int i) const {
diff --git a/tensorflow/contrib/lite/python/lite.py b/tensorflow/contrib/lite/python/lite.py
index ed6dd036f9fd9f39b74e902498d815793943924b..cf50f9d4d65cb7a36af8f82e2d29babbc9884d23 100644
--- a/tensorflow/contrib/lite/python/lite.py
+++ b/tensorflow/contrib/lite/python/lite.py
@@ -145,7 +145,8 @@ def toco_convert(input_data,
                  input_format=TENSORFLOW_GRAPHDEF,
                  output_format=TFLITE,
                  quantized_input_stats=None,
-                 drop_control_dependency=True):
+                 drop_control_dependency=True,
+                 allow_custom_ops=None):
   """Convert a model using TOCO from `input_format` to `output_format`.
 
   Typically this is to convert from TensorFlow GraphDef to TFLite, in which
@@ -178,9 +179,12 @@ def toco_convert(input_data,
   toco = _toco_flags_pb2.TocoFlags()
   toco.input_format = input_format
   toco.output_format = output_format
+  toco.inference_type = inference_type
   toco.drop_control_dependency = drop_control_dependency
+  if allow_custom_ops is not None:
+    toco.allow_custom_ops = allow_custom_ops
+
   model = _model_flags_pb2.ModelFlags()
-  toco.inference_type = inference_type
   for idx, input_tensor in enumerate(input_tensors):
     if input_tensor.dtype == _dtypes.float32:
       tflite_input_type = FLOAT
diff --git a/tensorflow/contrib/lite/schema/BUILD b/tensorflow/contrib/lite/schema/BUILD
index da65ec659c7ab39348d2b7911aceaa9dbdd2654b..246ec85fe47e496e157a91ab4ff84f6b1eeab4a4 100644
--- a/tensorflow/contrib/lite/schema/BUILD
+++ b/tensorflow/contrib/lite/schema/BUILD
@@ -70,16 +70,4 @@ cc_test(
     ],
 )
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 tflite_portable_test_suite()
diff --git a/tensorflow/contrib/lite/schema/schema.fbs b/tensorflow/contrib/lite/schema/schema.fbs
index 7d2e00fe329a5da77af7bf091eaa99badbd1022a..357493755d882b423811e684435b99f81b3f166f 100644
--- a/tensorflow/contrib/lite/schema/schema.fbs
+++ b/tensorflow/contrib/lite/schema/schema.fbs
@@ -132,6 +132,8 @@ enum BuiltinOperator : byte {
   CAST = 53,
   PRELU = 54,
   MAXIMUM = 55,
+  ARG_MAX = 56,
+  MINIMUM = 57,
 }
 
 // Options for the builtin operators.
@@ -174,7 +176,8 @@ union BuiltinOptions {
   LogSoftmaxOptions,
   CastOptions,
   DequantizeOptions,
-  MaximumOptions,
+  MaximumMinimumOptions,
+  ArgMaxOptions,
 }
 
 enum Padding : byte { SAME, VALID }
@@ -381,12 +384,18 @@ table LogSoftmaxOptions {
 }
 
 table CastOptions {
+  in_data_type: TensorType;
+  out_data_type: TensorType;
 }
 
 table DequantizeOptions {
 }
 
-table MaximumOptions {
+table MaximumMinimumOptions {
+}
+
+table ArgMaxOptions {
+  output_type : TensorType;
 }
 
 // An OperatorCode can be an enum value (BuiltinOperator) if the operator is a
diff --git a/tensorflow/contrib/lite/schema/schema_generated.h b/tensorflow/contrib/lite/schema/schema_generated.h
index 66a97a1460d12b48102f53f975cb1e25e7735111..c638daf66ef51268040d5c4f4cebffc272d3b70f 100755
--- a/tensorflow/contrib/lite/schema/schema_generated.h
+++ b/tensorflow/contrib/lite/schema/schema_generated.h
@@ -145,8 +145,11 @@ struct CastOptionsT;
 struct DequantizeOptions;
 struct DequantizeOptionsT;
 
-struct MaximumOptions;
-struct MaximumOptionsT;
+struct MaximumMinimumOptions;
+struct MaximumMinimumOptionsT;
+
+struct ArgMaxOptions;
+struct ArgMaxOptionsT;
 
 struct OperatorCode;
 struct OperatorCodeT;
@@ -259,11 +262,13 @@ enum BuiltinOperator {
   BuiltinOperator_CAST = 53,
   BuiltinOperator_PRELU = 54,
   BuiltinOperator_MAXIMUM = 55,
+  BuiltinOperator_ARG_MAX = 56,
+  BuiltinOperator_MINIMUM = 57,
   BuiltinOperator_MIN = BuiltinOperator_ADD,
-  BuiltinOperator_MAX = BuiltinOperator_MAXIMUM
+  BuiltinOperator_MAX = BuiltinOperator_MINIMUM
 };
 
-inline BuiltinOperator (&EnumValuesBuiltinOperator())[54] {
+inline BuiltinOperator (&EnumValuesBuiltinOperator())[56] {
   static BuiltinOperator values[] = {
     BuiltinOperator_ADD,
     BuiltinOperator_AVERAGE_POOL_2D,
@@ -318,7 +323,9 @@ inline BuiltinOperator (&EnumValuesBuiltinOperator())[54] {
     BuiltinOperator_BIDIRECTIONAL_SEQUENCE_LSTM,
     BuiltinOperator_CAST,
     BuiltinOperator_PRELU,
-    BuiltinOperator_MAXIMUM
+    BuiltinOperator_MAXIMUM,
+    BuiltinOperator_ARG_MAX,
+    BuiltinOperator_MINIMUM
   };
   return values;
 }
@@ -381,6 +388,8 @@ inline const char **EnumNamesBuiltinOperator() {
     "CAST",
     "PRELU",
     "MAXIMUM",
+    "ARG_MAX",
+    "MINIMUM",
     nullptr
   };
   return names;
@@ -431,12 +440,13 @@ enum BuiltinOptions {
   BuiltinOptions_LogSoftmaxOptions = 36,
   BuiltinOptions_CastOptions = 37,
   BuiltinOptions_DequantizeOptions = 38,
-  BuiltinOptions_MaximumOptions = 39,
+  BuiltinOptions_MaximumMinimumOptions = 39,
+  BuiltinOptions_ArgMaxOptions = 40,
   BuiltinOptions_MIN = BuiltinOptions_NONE,
-  BuiltinOptions_MAX = BuiltinOptions_MaximumOptions
+  BuiltinOptions_MAX = BuiltinOptions_ArgMaxOptions
 };
 
-inline BuiltinOptions (&EnumValuesBuiltinOptions())[40] {
+inline BuiltinOptions (&EnumValuesBuiltinOptions())[41] {
   static BuiltinOptions values[] = {
     BuiltinOptions_NONE,
     BuiltinOptions_Conv2DOptions,
@@ -477,7 +487,8 @@ inline BuiltinOptions (&EnumValuesBuiltinOptions())[40] {
     BuiltinOptions_LogSoftmaxOptions,
     BuiltinOptions_CastOptions,
     BuiltinOptions_DequantizeOptions,
-    BuiltinOptions_MaximumOptions
+    BuiltinOptions_MaximumMinimumOptions,
+    BuiltinOptions_ArgMaxOptions
   };
   return values;
 }
@@ -523,7 +534,8 @@ inline const char **EnumNamesBuiltinOptions() {
     "LogSoftmaxOptions",
     "CastOptions",
     "DequantizeOptions",
-    "MaximumOptions",
+    "MaximumMinimumOptions",
+    "ArgMaxOptions",
     nullptr
   };
   return names;
@@ -690,8 +702,12 @@ template<> struct BuiltinOptionsTraits<DequantizeOptions> {
   static const BuiltinOptions enum_value = BuiltinOptions_DequantizeOptions;
 };
 
-template<> struct BuiltinOptionsTraits<MaximumOptions> {
-  static const BuiltinOptions enum_value = BuiltinOptions_MaximumOptions;
+template<> struct BuiltinOptionsTraits<MaximumMinimumOptions> {
+  static const BuiltinOptions enum_value = BuiltinOptions_MaximumMinimumOptions;
+};
+
+template<> struct BuiltinOptionsTraits<ArgMaxOptions> {
+  static const BuiltinOptions enum_value = BuiltinOptions_ArgMaxOptions;
 };
 
 struct BuiltinOptionsUnion {
@@ -1029,13 +1045,21 @@ struct BuiltinOptionsUnion {
     return type == BuiltinOptions_DequantizeOptions ?
       reinterpret_cast<const DequantizeOptionsT *>(value) : nullptr;
   }
-  MaximumOptionsT *AsMaximumOptions() {
-    return type == BuiltinOptions_MaximumOptions ?
-      reinterpret_cast<MaximumOptionsT *>(value) : nullptr;
+  MaximumMinimumOptionsT *AsMaximumMinimumOptions() {
+    return type == BuiltinOptions_MaximumMinimumOptions ?
+      reinterpret_cast<MaximumMinimumOptionsT *>(value) : nullptr;
+  }
+  const MaximumMinimumOptionsT *AsMaximumMinimumOptions() const {
+    return type == BuiltinOptions_MaximumMinimumOptions ?
+      reinterpret_cast<const MaximumMinimumOptionsT *>(value) : nullptr;
+  }
+  ArgMaxOptionsT *AsArgMaxOptions() {
+    return type == BuiltinOptions_ArgMaxOptions ?
+      reinterpret_cast<ArgMaxOptionsT *>(value) : nullptr;
   }
-  const MaximumOptionsT *AsMaximumOptions() const {
-    return type == BuiltinOptions_MaximumOptions ?
-      reinterpret_cast<const MaximumOptionsT *>(value) : nullptr;
+  const ArgMaxOptionsT *AsArgMaxOptions() const {
+    return type == BuiltinOptions_ArgMaxOptions ?
+      reinterpret_cast<const ArgMaxOptionsT *>(value) : nullptr;
   }
 };
 
@@ -3702,14 +3726,30 @@ flatbuffers::Offset<LogSoftmaxOptions> CreateLogSoftmaxOptions(flatbuffers::Flat
 
 struct CastOptionsT : public flatbuffers::NativeTable {
   typedef CastOptions TableType;
-  CastOptionsT() {
+  TensorType in_data_type;
+  TensorType out_data_type;
+  CastOptionsT()
+      : in_data_type(TensorType_FLOAT32),
+        out_data_type(TensorType_FLOAT32) {
   }
 };
 
 struct CastOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
   typedef CastOptionsT NativeTableType;
+  enum {
+    VT_IN_DATA_TYPE = 4,
+    VT_OUT_DATA_TYPE = 6
+  };
+  TensorType in_data_type() const {
+    return static_cast<TensorType>(GetField<int8_t>(VT_IN_DATA_TYPE, 0));
+  }
+  TensorType out_data_type() const {
+    return static_cast<TensorType>(GetField<int8_t>(VT_OUT_DATA_TYPE, 0));
+  }
   bool Verify(flatbuffers::Verifier &verifier) const {
     return VerifyTableStart(verifier) &&
+           VerifyField<int8_t>(verifier, VT_IN_DATA_TYPE) &&
+           VerifyField<int8_t>(verifier, VT_OUT_DATA_TYPE) &&
            verifier.EndTable();
   }
   CastOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
@@ -3720,6 +3760,12 @@ struct CastOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
 struct CastOptionsBuilder {
   flatbuffers::FlatBufferBuilder &fbb_;
   flatbuffers::uoffset_t start_;
+  void add_in_data_type(TensorType in_data_type) {
+    fbb_.AddElement<int8_t>(CastOptions::VT_IN_DATA_TYPE, static_cast<int8_t>(in_data_type), 0);
+  }
+  void add_out_data_type(TensorType out_data_type) {
+    fbb_.AddElement<int8_t>(CastOptions::VT_OUT_DATA_TYPE, static_cast<int8_t>(out_data_type), 0);
+  }
   explicit CastOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
         : fbb_(_fbb) {
     start_ = fbb_.StartTable();
@@ -3733,8 +3779,12 @@ struct CastOptionsBuilder {
 };
 
 inline flatbuffers::Offset<CastOptions> CreateCastOptions(
-    flatbuffers::FlatBufferBuilder &_fbb) {
+    flatbuffers::FlatBufferBuilder &_fbb,
+    TensorType in_data_type = TensorType_FLOAT32,
+    TensorType out_data_type = TensorType_FLOAT32) {
   CastOptionsBuilder builder_(_fbb);
+  builder_.add_out_data_type(out_data_type);
+  builder_.add_in_data_type(in_data_type);
   return builder_.Finish();
 }
 
@@ -3780,45 +3830,99 @@ inline flatbuffers::Offset<DequantizeOptions> CreateDequantizeOptions(
 
 flatbuffers::Offset<DequantizeOptions> CreateDequantizeOptions(flatbuffers::FlatBufferBuilder &_fbb, const DequantizeOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 
-struct MaximumOptionsT : public flatbuffers::NativeTable {
-  typedef MaximumOptions TableType;
-  MaximumOptionsT() {
+struct MaximumMinimumOptionsT : public flatbuffers::NativeTable {
+  typedef MaximumMinimumOptions TableType;
+  MaximumMinimumOptionsT() {
   }
 };
 
-struct MaximumOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
-  typedef MaximumOptionsT NativeTableType;
+struct MaximumMinimumOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+  typedef MaximumMinimumOptionsT NativeTableType;
   bool Verify(flatbuffers::Verifier &verifier) const {
     return VerifyTableStart(verifier) &&
            verifier.EndTable();
   }
-  MaximumOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  void UnPackTo(MaximumOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
-  static flatbuffers::Offset<MaximumOptions> Pack(flatbuffers::FlatBufferBuilder &_fbb, const MaximumOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+  MaximumMinimumOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(MaximumMinimumOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<MaximumMinimumOptions> Pack(flatbuffers::FlatBufferBuilder &_fbb, const MaximumMinimumOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 };
 
-struct MaximumOptionsBuilder {
+struct MaximumMinimumOptionsBuilder {
   flatbuffers::FlatBufferBuilder &fbb_;
   flatbuffers::uoffset_t start_;
-  explicit MaximumOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+  explicit MaximumMinimumOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
         : fbb_(_fbb) {
     start_ = fbb_.StartTable();
   }
-  MaximumOptionsBuilder &operator=(const MaximumOptionsBuilder &);
-  flatbuffers::Offset<MaximumOptions> Finish() {
+  MaximumMinimumOptionsBuilder &operator=(const MaximumMinimumOptionsBuilder &);
+  flatbuffers::Offset<MaximumMinimumOptions> Finish() {
     const auto end = fbb_.EndTable(start_);
-    auto o = flatbuffers::Offset<MaximumOptions>(end);
+    auto o = flatbuffers::Offset<MaximumMinimumOptions>(end);
     return o;
   }
 };
 
-inline flatbuffers::Offset<MaximumOptions> CreateMaximumOptions(
+inline flatbuffers::Offset<MaximumMinimumOptions> CreateMaximumMinimumOptions(
     flatbuffers::FlatBufferBuilder &_fbb) {
-  MaximumOptionsBuilder builder_(_fbb);
+  MaximumMinimumOptionsBuilder builder_(_fbb);
   return builder_.Finish();
 }
 
-flatbuffers::Offset<MaximumOptions> CreateMaximumOptions(flatbuffers::FlatBufferBuilder &_fbb, const MaximumOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+flatbuffers::Offset<MaximumMinimumOptions> CreateMaximumMinimumOptions(flatbuffers::FlatBufferBuilder &_fbb, const MaximumMinimumOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
+struct ArgMaxOptionsT : public flatbuffers::NativeTable {
+  typedef ArgMaxOptions TableType;
+  TensorType output_type;
+  ArgMaxOptionsT()
+      : output_type(TensorType_FLOAT32) {
+  }
+};
+
+struct ArgMaxOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+  typedef ArgMaxOptionsT NativeTableType;
+  enum {
+    VT_OUTPUT_TYPE = 4
+  };
+  TensorType output_type() const {
+    return static_cast<TensorType>(GetField<int8_t>(VT_OUTPUT_TYPE, 0));
+  }
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           VerifyField<int8_t>(verifier, VT_OUTPUT_TYPE) &&
+           verifier.EndTable();
+  }
+  ArgMaxOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(ArgMaxOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<ArgMaxOptions> Pack(flatbuffers::FlatBufferBuilder &_fbb, const ArgMaxOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct ArgMaxOptionsBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  void add_output_type(TensorType output_type) {
+    fbb_.AddElement<int8_t>(ArgMaxOptions::VT_OUTPUT_TYPE, static_cast<int8_t>(output_type), 0);
+  }
+  explicit ArgMaxOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+        : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  ArgMaxOptionsBuilder &operator=(const ArgMaxOptionsBuilder &);
+  flatbuffers::Offset<ArgMaxOptions> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<ArgMaxOptions>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<ArgMaxOptions> CreateArgMaxOptions(
+    flatbuffers::FlatBufferBuilder &_fbb,
+    TensorType output_type = TensorType_FLOAT32) {
+  ArgMaxOptionsBuilder builder_(_fbb);
+  builder_.add_output_type(output_type);
+  return builder_.Finish();
+}
+
+flatbuffers::Offset<ArgMaxOptions> CreateArgMaxOptions(flatbuffers::FlatBufferBuilder &_fbb, const ArgMaxOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 
 struct OperatorCodeT : public flatbuffers::NativeTable {
   typedef OperatorCode TableType;
@@ -4051,8 +4155,11 @@ struct Operator FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
   const DequantizeOptions *builtin_options_as_DequantizeOptions() const {
     return builtin_options_type() == BuiltinOptions_DequantizeOptions ? static_cast<const DequantizeOptions *>(builtin_options()) : nullptr;
   }
-  const MaximumOptions *builtin_options_as_MaximumOptions() const {
-    return builtin_options_type() == BuiltinOptions_MaximumOptions ? static_cast<const MaximumOptions *>(builtin_options()) : nullptr;
+  const MaximumMinimumOptions *builtin_options_as_MaximumMinimumOptions() const {
+    return builtin_options_type() == BuiltinOptions_MaximumMinimumOptions ? static_cast<const MaximumMinimumOptions *>(builtin_options()) : nullptr;
+  }
+  const ArgMaxOptions *builtin_options_as_ArgMaxOptions() const {
+    return builtin_options_type() == BuiltinOptions_ArgMaxOptions ? static_cast<const ArgMaxOptions *>(builtin_options()) : nullptr;
   }
   const flatbuffers::Vector<uint8_t> *custom_options() const {
     return GetPointer<const flatbuffers::Vector<uint8_t> *>(VT_CUSTOM_OPTIONS);
@@ -4232,8 +4339,12 @@ template<> inline const DequantizeOptions *Operator::builtin_options_as<Dequanti
   return builtin_options_as_DequantizeOptions();
 }
 
-template<> inline const MaximumOptions *Operator::builtin_options_as<MaximumOptions>() const {
-  return builtin_options_as_MaximumOptions();
+template<> inline const MaximumMinimumOptions *Operator::builtin_options_as<MaximumMinimumOptions>() const {
+  return builtin_options_as_MaximumMinimumOptions();
+}
+
+template<> inline const ArgMaxOptions *Operator::builtin_options_as<ArgMaxOptions>() const {
+  return builtin_options_as_ArgMaxOptions();
 }
 
 struct OperatorBuilder {
@@ -5727,6 +5838,8 @@ inline CastOptionsT *CastOptions::UnPack(const flatbuffers::resolver_function_t
 inline void CastOptions::UnPackTo(CastOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
   (void)_o;
   (void)_resolver;
+  { auto _e = in_data_type(); _o->in_data_type = _e; };
+  { auto _e = out_data_type(); _o->out_data_type = _e; };
 }
 
 inline flatbuffers::Offset<CastOptions> CastOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const CastOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
@@ -5737,8 +5850,12 @@ inline flatbuffers::Offset<CastOptions> CreateCastOptions(flatbuffers::FlatBuffe
   (void)_rehasher;
   (void)_o;
   struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const CastOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  auto _in_data_type = _o->in_data_type;
+  auto _out_data_type = _o->out_data_type;
   return tflite::CreateCastOptions(
-      _fbb);
+      _fbb,
+      _in_data_type,
+      _out_data_type);
 }
 
 inline DequantizeOptionsT *DequantizeOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
@@ -5764,29 +5881,55 @@ inline flatbuffers::Offset<DequantizeOptions> CreateDequantizeOptions(flatbuffer
       _fbb);
 }
 
-inline MaximumOptionsT *MaximumOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
-  auto _o = new MaximumOptionsT();
+inline MaximumMinimumOptionsT *MaximumMinimumOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new MaximumMinimumOptionsT();
   UnPackTo(_o, _resolver);
   return _o;
 }
 
-inline void MaximumOptions::UnPackTo(MaximumOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+inline void MaximumMinimumOptions::UnPackTo(MaximumMinimumOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
   (void)_o;
   (void)_resolver;
 }
 
-inline flatbuffers::Offset<MaximumOptions> MaximumOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const MaximumOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
-  return CreateMaximumOptions(_fbb, _o, _rehasher);
+inline flatbuffers::Offset<MaximumMinimumOptions> MaximumMinimumOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const MaximumMinimumOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateMaximumMinimumOptions(_fbb, _o, _rehasher);
 }
 
-inline flatbuffers::Offset<MaximumOptions> CreateMaximumOptions(flatbuffers::FlatBufferBuilder &_fbb, const MaximumOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+inline flatbuffers::Offset<MaximumMinimumOptions> CreateMaximumMinimumOptions(flatbuffers::FlatBufferBuilder &_fbb, const MaximumMinimumOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
   (void)_rehasher;
   (void)_o;
-  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const MaximumOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
-  return tflite::CreateMaximumOptions(
+  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const MaximumMinimumOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  return tflite::CreateMaximumMinimumOptions(
       _fbb);
 }
 
+inline ArgMaxOptionsT *ArgMaxOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new ArgMaxOptionsT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void ArgMaxOptions::UnPackTo(ArgMaxOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+  { auto _e = output_type(); _o->output_type = _e; };
+}
+
+inline flatbuffers::Offset<ArgMaxOptions> ArgMaxOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const ArgMaxOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateArgMaxOptions(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<ArgMaxOptions> CreateArgMaxOptions(flatbuffers::FlatBufferBuilder &_fbb, const ArgMaxOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const ArgMaxOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  auto _output_type = _o->output_type;
+  return tflite::CreateArgMaxOptions(
+      _fbb,
+      _output_type);
+}
+
 inline OperatorCodeT *OperatorCode::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
   auto _o = new OperatorCodeT();
   UnPackTo(_o, _resolver);
@@ -6119,8 +6262,12 @@ inline bool VerifyBuiltinOptions(flatbuffers::Verifier &verifier, const void *ob
       auto ptr = reinterpret_cast<const DequantizeOptions *>(obj);
       return verifier.VerifyTable(ptr);
     }
-    case BuiltinOptions_MaximumOptions: {
-      auto ptr = reinterpret_cast<const MaximumOptions *>(obj);
+    case BuiltinOptions_MaximumMinimumOptions: {
+      auto ptr = reinterpret_cast<const MaximumMinimumOptions *>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    case BuiltinOptions_ArgMaxOptions: {
+      auto ptr = reinterpret_cast<const ArgMaxOptions *>(obj);
       return verifier.VerifyTable(ptr);
     }
     default: return false;
@@ -6293,8 +6440,12 @@ inline void *BuiltinOptionsUnion::UnPack(const void *obj, BuiltinOptions type, c
       auto ptr = reinterpret_cast<const DequantizeOptions *>(obj);
       return ptr->UnPack(resolver);
     }
-    case BuiltinOptions_MaximumOptions: {
-      auto ptr = reinterpret_cast<const MaximumOptions *>(obj);
+    case BuiltinOptions_MaximumMinimumOptions: {
+      auto ptr = reinterpret_cast<const MaximumMinimumOptions *>(obj);
+      return ptr->UnPack(resolver);
+    }
+    case BuiltinOptions_ArgMaxOptions: {
+      auto ptr = reinterpret_cast<const ArgMaxOptions *>(obj);
       return ptr->UnPack(resolver);
     }
     default: return nullptr;
@@ -6455,9 +6606,13 @@ inline flatbuffers::Offset<void> BuiltinOptionsUnion::Pack(flatbuffers::FlatBuff
       auto ptr = reinterpret_cast<const DequantizeOptionsT *>(value);
       return CreateDequantizeOptions(_fbb, ptr, _rehasher).Union();
     }
-    case BuiltinOptions_MaximumOptions: {
-      auto ptr = reinterpret_cast<const MaximumOptionsT *>(value);
-      return CreateMaximumOptions(_fbb, ptr, _rehasher).Union();
+    case BuiltinOptions_MaximumMinimumOptions: {
+      auto ptr = reinterpret_cast<const MaximumMinimumOptionsT *>(value);
+      return CreateMaximumMinimumOptions(_fbb, ptr, _rehasher).Union();
+    }
+    case BuiltinOptions_ArgMaxOptions: {
+      auto ptr = reinterpret_cast<const ArgMaxOptionsT *>(value);
+      return CreateArgMaxOptions(_fbb, ptr, _rehasher).Union();
     }
     default: return 0;
   }
@@ -6617,8 +6772,12 @@ inline BuiltinOptionsUnion::BuiltinOptionsUnion(const BuiltinOptionsUnion &u) FL
       value = new DequantizeOptionsT(*reinterpret_cast<DequantizeOptionsT *>(u.value));
       break;
     }
-    case BuiltinOptions_MaximumOptions: {
-      value = new MaximumOptionsT(*reinterpret_cast<MaximumOptionsT *>(u.value));
+    case BuiltinOptions_MaximumMinimumOptions: {
+      value = new MaximumMinimumOptionsT(*reinterpret_cast<MaximumMinimumOptionsT *>(u.value));
+      break;
+    }
+    case BuiltinOptions_ArgMaxOptions: {
+      value = new ArgMaxOptionsT(*reinterpret_cast<ArgMaxOptionsT *>(u.value));
       break;
     }
     default:
@@ -6818,8 +6977,13 @@ inline void BuiltinOptionsUnion::Reset() {
       delete ptr;
       break;
     }
-    case BuiltinOptions_MaximumOptions: {
-      auto ptr = reinterpret_cast<MaximumOptionsT *>(value);
+    case BuiltinOptions_MaximumMinimumOptions: {
+      auto ptr = reinterpret_cast<MaximumMinimumOptionsT *>(value);
+      delete ptr;
+      break;
+    }
+    case BuiltinOptions_ArgMaxOptions: {
+      auto ptr = reinterpret_cast<ArgMaxOptionsT *>(value);
       delete ptr;
       break;
     }
diff --git a/tensorflow/contrib/lite/testing/BUILD b/tensorflow/contrib/lite/testing/BUILD
index 12b7b3c35088a0560213e2e1431f23427d4fe640..1ce89a25fd254c324d0318309c87c61b32dee23d 100644
--- a/tensorflow/contrib/lite/testing/BUILD
+++ b/tensorflow/contrib/lite/testing/BUILD
@@ -18,6 +18,7 @@ gen_zipped_test_files(
     name = "optest",
     files = [
         "add.zip",
+        "arg_max.zip",
         "avg_pool.zip",
         "batch_to_space_nd.zip",
         "concat.zip",
@@ -38,9 +39,9 @@ gen_zipped_test_files(
         "max_pool.zip",
         "maximum.zip",
         "mean.zip",
+        "minimum.zip",
         "mul.zip",
         "pad.zip",
-        "prelu.zip",
         "relu.zip",
         "relu1.zip",
         "relu6.zip",
@@ -194,7 +195,6 @@ cc_library(
 
 cc_library(
     name = "util",
-    testonly = 1,
     hdrs = ["util.h"],
 )
 
@@ -249,7 +249,6 @@ cc_test(
 
 cc_library(
     name = "generate_testspec",
-    testonly = 1,
     srcs = ["generate_testspec.cc"],
     hdrs = ["generate_testspec.h"],
     deps = [
@@ -275,7 +274,6 @@ cc_test(
 
 cc_library(
     name = "tflite_diff_util",
-    testonly = 1,
     srcs = ["tflite_diff_util.cc"],
     hdrs = ["tflite_diff_util.h"],
     deps = [
@@ -293,7 +291,6 @@ cc_library(
 
 cc_library(
     name = "tflite_diff_flags",
-    testonly = 1,
     hdrs = ["tflite_diff_flags.h"],
     deps = [
         ":split",
@@ -336,6 +333,15 @@ tf_cc_test(
     ],
 )
 
+cc_binary(
+    name = "tflite_diff",
+    srcs = ["tflite_diff_example_test.cc"],
+    deps = [
+        ":tflite_diff_flags",
+        ":tflite_diff_util",
+    ],
+)
+
 tf_cc_test(
     name = "generated_examples_zip_test",
     size = "large",
@@ -374,16 +380,4 @@ tf_cc_test(
     }),
 )
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 tflite_portable_test_suite()
diff --git a/tensorflow/contrib/lite/testing/generate_examples.py b/tensorflow/contrib/lite/testing/generate_examples.py
index 68bce19aa372280219fb2be9ebe3bef2ad03ec05..0e6aceeb86afa7a5f8dfa12ed66fc9a459be9b6a 100644
--- a/tensorflow/contrib/lite/testing/generate_examples.py
+++ b/tensorflow/contrib/lite/testing/generate_examples.py
@@ -17,10 +17,9 @@
 
 Usage:
 
-generate_examples <output directory> zipped
+generate_examples <output directory>
 
 bazel run //tensorflow/contrib/lite/testing:generate_examples
-    third_party/tensorflow/contrib/lite/testing/generated_examples zipped
 """
 from __future__ import absolute_import
 from __future__ import division
@@ -52,8 +51,6 @@ from tensorflow.python.ops import rnn
 parser = argparse.ArgumentParser(description="Script to generate TFLite tests.")
 parser.add_argument("output_path",
                     help="Directory where the outputs will be go.")
-# TODO(ahentz): remove this flag
-parser.add_argument("type", help="zipped")
 parser.add_argument("--zip_to_output",
                     type=str,
                     help="Particular zip to output.",
@@ -107,6 +104,10 @@ KNOWN_BUGS = {
     r"strided_slice.*begin=\[0\].*end=\[1\].*": "73170889",
     # No support for SplitV
     r"split.*num_or_size_splits=\[2,2\]": "73377559",
+    # Needs support for dimensions other than the last one in argmax.
+    r"arg_max.*axis=0.*": "77546240",
+    r"arg_max.*axis=1.*": "77546240",
+    r"arg_max.*axis=2.*": "77546240",
 }
 
 
@@ -543,6 +544,18 @@ def make_pool_tests(pool_op_in):
   return f
 
 
+def make_l2_pool_tests(zip_path):
+  make_pool_tests(make_l2_pool)(zip_path)
+
+
+def make_avg_pool_tests(zip_path):
+  make_pool_tests(tf.nn.avg_pool)(zip_path)
+
+
+def make_max_pool_tests(zip_path):
+  make_pool_tests(tf.nn.max_pool)(zip_path)
+
+
 def make_relu_tests(zip_path):
   """Make a set of tests to do relu."""
 
@@ -617,54 +630,6 @@ def make_relu6_tests(zip_path):
   make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
 
 
-def make_prelu_tests(zip_path):
-  """Make a set of tests to do PReLU."""
-
-  test_parameters = [{
-      # The canonical case for image processing is having a 4D `input` (NHWC)
-      # and `shared_axes`=[1, 2], so the alpha parameter is per channel.
-      "input_shape": [[1, 10, 10, 3], [3, 3, 3, 3]],
-      "shared_axes": [[1, 2], [1]],
-  }]
-
-  def build_graph(parameters):
-    """Build the graph for the test case."""
-
-    input_tensor = tf.placeholder(
-        dtype=tf.float32, name="input", shape=parameters["input_shape"])
-    prelu = tf.keras.layers.PReLU(shared_axes=parameters["shared_axes"])
-    out = prelu(input_tensor)
-    return [input_tensor], [out]
-
-  def build_inputs(parameters, sess, inputs, outputs):
-    """Build the inputs for the test case."""
-
-    input_shape = parameters["input_shape"]
-    input_values = create_tensor_data(
-        np.float32, input_shape, min_value=-10, max_value=10)
-    shared_axes = parameters["shared_axes"]
-
-    alpha_shape = []
-    for dim in range(1, len(input_shape)):
-      alpha_shape.append(1 if dim in shared_axes else input_shape[dim])
-
-    alpha_values = create_tensor_data(np.float32, alpha_shape)
-
-    with tf.variable_scope("", reuse=True):
-      alpha = tf.get_variable("p_re_lu/alpha")
-      sess.run(alpha.assign(alpha_values))
-
-    return [input_values], sess.run(
-        outputs, feed_dict=dict(zip(inputs, [input_values])))
-
-  make_zip_of_tests(
-      zip_path,
-      test_parameters,
-      build_graph,
-      build_inputs,
-      use_frozen_graph=True)
-
-
 # This function tests various TensorFLow functions that generates Const op,
 # including `tf.ones`, `tf.zeros` and random functions.
 def make_constant_tests(zip_path):
@@ -897,11 +862,62 @@ def make_maximum_tests(zip_path):
   make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
 
 
+def make_minimum_tests(zip_path):
+  """Make a set of tests to do minimum."""
+
+  test_parameters = [{
+      "input_dtype": [tf.float32],
+      "input_shape_1": [[3], [1, 100], [4, 2, 3], [5, 224, 224, 3]],
+      "input_shape_2": [[3], [1, 100], [4, 2, 3], [5, 224, 224, 3]],
+  }]
+
+  def build_graph(parameters):
+    """Build the minimum op testing graph."""
+    input_tensor_1 = tf.placeholder(
+        dtype=parameters["input_dtype"],
+        name="input_1",
+        shape=parameters["input_shape_1"])
+    input_tensor_2 = tf.placeholder(
+        dtype=parameters["input_dtype"],
+        name="input_2",
+        shape=parameters["input_shape_2"])
+
+    out = tf.minimum(input_tensor_1, input_tensor_2)
+    return [input_tensor_1, input_tensor_2], [out]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    values = [
+        create_tensor_data(parameters["input_dtype"],
+                           parameters["input_shape_1"]),
+        create_tensor_data(parameters["input_dtype"],
+                           parameters["input_shape_2"])
+    ]
+    return values, sess.run(outputs, feed_dict=dict(zip(inputs, values)))
+
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+
+
 def make_binary_op_tests_func(binary_operator):
   """Return a function that does a test on a binary operator."""
   return lambda zip_path: make_binary_op_tests(zip_path, binary_operator)
 
 
+def make_add_tests(zip_path):
+  make_binary_op_tests(zip_path, tf.add)
+
+
+def make_div_tests(zip_path):
+  make_binary_op_tests(zip_path, tf.div)
+
+
+def make_sub_tests(zip_path):
+  make_binary_op_tests(zip_path, tf.subtract)
+
+
+def make_mul_tests(zip_path):
+  make_binary_op_tests(zip_path, tf.multiply)
+
+
 def make_gather_tests(zip_path):
   """Make a set of tests to do gather."""
 
@@ -909,12 +925,11 @@ def make_gather_tests(zip_path):
       # TODO(mgubin): add string tests when they are supported by Toco.
       # TODO(mgubin): add tests for Nd indices when they are supported by
       # TfLite.
-      # TODO(mgubin): add tests for axis != 0 when it is supported by TfLite.
       "params_dtype": [tf.float32, tf.int32],
       "params_shape": [[10], [1, 2, 20]],
       "indices_dtype": [tf.int32],
       "indices_shape": [[3], [5]],
-      "axis": [0],  # axis!=0 is GatherV2
+      "axis": [0, 1],
   }]
 
   def build_graph(parameters):
@@ -1170,7 +1185,7 @@ def make_split_tests(zip_path):
   make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
 
 
-def make_concatenation_tests(zip_path):
+def make_concat_tests(zip_path):
   """Make a set of tests to do concatenation."""
 
   test_parameters = [{
@@ -1930,7 +1945,7 @@ def make_l2_pool(input_tensor, ksize, strides, padding, data_format):
 
 
 def make_topk_tests(zip_path):
-  """Make a set of tests to do gather."""
+  """Make a set of tests to do topk."""
 
   test_parameters = [{
       "input_dtype": [tf.float32, tf.int32],
@@ -1938,7 +1953,7 @@ def make_topk_tests(zip_path):
   }]
 
   def build_graph(parameters):
-    """Build the gather op testing graph."""
+    """Build the topk op testing graph."""
     input_value = tf.placeholder(
         dtype=parameters["input_dtype"],
         name="input",
@@ -1955,6 +1970,36 @@ def make_topk_tests(zip_path):
 
   make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
 
+
+def make_arg_max_tests(zip_path):
+  """Make a set of tests to do arg_max."""
+
+  test_parameters = [{
+      "input_dtype": [tf.float32, tf.int32],
+      "input_shape": [[1, 1, 1, 3], [2, 3, 4, 5], [2, 3, 3], [5, 5], [10]],
+      "axis": [0, 1, 2, 3],
+      "output_type": [tf.int32, tf.int64],
+  }]
+
+  def build_graph(parameters):
+    """Build the topk op testing graph."""
+    input_value = tf.placeholder(
+        dtype=parameters["input_dtype"],
+        name="input",
+        shape=parameters["input_shape"])
+    axis = tf.constant(parameters["axis"], name="axis")
+    out = tf.arg_max(input_value, axis, output_type=parameters["output_type"])
+    return [input_value], [out]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    input_value = create_tensor_data(parameters["input_dtype"],
+                                     parameters["input_shape"])
+    return [input_value], sess.run(
+        outputs, feed_dict=dict(zip(inputs, [input_value])))
+
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+
+
 # Toco binary path provided by the generate rule.
 bin_path = None
 
@@ -1967,69 +2012,26 @@ def main(unused_args):
       if not os.path.isdir(x):
         raise RuntimeError("Failed to create dir %r" % x)
 
-  if FLAGS.type == "zipped":
-    opstest_path = os.path.join(FLAGS.output_path)
-    mkdir_if_not_exist(opstest_path)
-    def _path(filename):
-      return os.path.join(opstest_path, filename)
-
-    dispatch = {
-        "control_dep.zip": make_control_dep_tests,
-        "add.zip": make_binary_op_tests_func(tf.add),
-        "space_to_batch_nd.zip": make_space_to_batch_nd_tests,
-        "div.zip": make_binary_op_tests_func(tf.div),
-        "sub.zip": make_binary_op_tests_func(tf.subtract),
-        "batch_to_space_nd.zip": make_batch_to_space_nd_tests,
-        "conv.zip": make_conv_tests,
-        "constant.zip": make_constant_tests,
-        "depthwiseconv.zip": make_depthwiseconv_tests,
-        "concat.zip": make_concatenation_tests,
-        "fully_connected.zip": make_fully_connected_tests,
-        "global_batch_norm.zip": make_global_batch_norm_tests,
-        "gather.zip": make_gather_tests,
-        "fused_batch_norm.zip": make_fused_batch_norm_tests,
-        "l2norm.zip": make_l2norm_tests,
-        "local_response_norm.zip": make_local_response_norm_tests,
-        "mul.zip": make_binary_op_tests_func(tf.multiply),
-        "relu.zip": make_relu_tests,
-        "relu1.zip": make_relu1_tests,
-        "relu6.zip": make_relu6_tests,
-        "prelu.zip": make_prelu_tests,
-        "l2_pool.zip": make_pool_tests(make_l2_pool),
-        "avg_pool.zip": make_pool_tests(tf.nn.avg_pool),
-        "max_pool.zip": make_pool_tests(tf.nn.max_pool),
-        "pad.zip": make_pad_tests,
-        "reshape.zip": make_reshape_tests,
-        "resize_bilinear.zip": make_resize_bilinear_tests,
-        "sigmoid.zip": make_sigmoid_tests,
-        "softmax.zip": make_softmax_tests,
-        "space_to_depth.zip": make_space_to_depth_tests,
-        "topk.zip": make_topk_tests,
-        "split.zip": make_split_tests,
-        "transpose.zip": make_transpose_tests,
-        "mean.zip": make_mean_tests,
-        "squeeze.zip": make_squeeze_tests,
-        "strided_slice.zip": make_strided_slice_tests,
-        "exp.zip": make_exp_tests,
-        "log_softmax.zip": make_log_softmax_tests,
-        "lstm.zip": make_lstm_tests,
-        "maximum.zip": make_maximum_tests,
-    }
-    out = FLAGS.zip_to_output
-    bin_path = FLAGS.toco
-    if out in dispatch:
-      dispatch[out](_path(out))
-    else:
-      raise RuntimeError("Invalid zip to output %r" % out)
+  opstest_path = os.path.join(FLAGS.output_path)
+  mkdir_if_not_exist(opstest_path)
 
-  else:
-    raise RuntimeError("Invalid argument for type of generation.")
+  out = FLAGS.zip_to_output
+  bin_path = FLAGS.toco
+  test_function = ("make_%s_tests" % out.replace(".zip", ""))
+  if test_function not in globals():
+    raise RuntimeError("Can't find a test function to create %r. Tried %r" %
+                       (out, test_function))
+
+  # TODO(ahentz): accessing globals() is not very elegant. We should either
+  # break this file into multiple tests or use decorator-based registration to
+  # avoid using globals().
+  globals()[test_function](os.path.join(opstest_path, out))
 
 
 if __name__ == "__main__":
   FLAGS, unparsed = parser.parse_known_args()
 
   if unparsed:
-    print("Usage: %s <path out> zipped <zip file to generate>")
+    print("Usage: %s <path out> <zip file to generate>")
   else:
     tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
diff --git a/tensorflow/contrib/lite/testing/generate_testspec.cc b/tensorflow/contrib/lite/testing/generate_testspec.cc
index eb3deafb6986e877f0a553a8b6f712102af4caca..6580845af42b3cdded19b578b41c682089aaf9ef 100644
--- a/tensorflow/contrib/lite/testing/generate_testspec.cc
+++ b/tensorflow/contrib/lite/testing/generate_testspec.cc
@@ -22,7 +22,22 @@ limitations under the License.
 namespace tflite {
 namespace testing {
 
-void GenerateTestSpecFromTensorflowModel(
+template <typename T>
+void GenerateCsv(const std::vector<int>& shape, float min, float max,
+                 string* out) {
+  auto random_float = [](int min, int max) {
+    static unsigned int seed;
+    return min + (max - min) * static_cast<float>(rand_r(&seed)) / RAND_MAX;
+  };
+
+  std::function<T(int)> random_t = [&](int) {
+    return static_cast<T>(random_float(min, max));
+  };
+  std::vector<T> data = GenerateRandomTensor(shape, random_t);
+  *out = Join(data.data(), data.size(), ",");
+}
+
+bool GenerateTestSpecFromTensorflowModel(
     std::iostream& stream, const string& tensorflow_model_path,
     const string& tflite_model_path, const std::vector<string>& input_layer,
     const std::vector<string>& input_layer_type,
@@ -31,12 +46,6 @@ void GenerateTestSpecFromTensorflowModel(
   CHECK_EQ(input_layer.size(), input_layer_type.size());
   CHECK_EQ(input_layer.size(), input_layer_shape.size());
 
-  // Initialize random functions.
-  static unsigned int seed = 0;
-  std::function<float(int)> float_rand = [](int idx) {
-    return static_cast<float>(rand_r(&seed)) / RAND_MAX - 0.5f;
-  };
-
   // Generate inputs.
   std::vector<string> input_values;
   input_values.resize(input_layer.size());
@@ -46,15 +55,25 @@ void GenerateTestSpecFromTensorflowModel(
     auto shape = Split<int>(input_layer_shape[i], ",");
 
     switch (type) {
-      case tensorflow::DT_FLOAT: {
-        const auto& data = GenerateRandomTensor<float>(shape, float_rand);
-        input_values[i] = Join(data.data(), data.size(), ",");
+      case tensorflow::DT_FLOAT:
+        GenerateCsv<float>(shape, -0.5, 0.5, &input_values[i]);
+        break;
+      case tensorflow::DT_UINT8:
+        GenerateCsv<uint8_t>(shape, 0, 255, &input_values[i]);
+        break;
+      case tensorflow::DT_INT32:
+        GenerateCsv<int32_t>(shape, -100, 100, &input_values[i]);
+        break;
+      case tensorflow::DT_INT64:
+        GenerateCsv<int64_t>(shape, -100, 100, &input_values[i]);
+        break;
+      case tensorflow::DT_BOOL:
+        GenerateCsv<int>(shape, 0.01, 1.99, &input_values[i]);
         break;
-      }
       default:
-
-        fprintf(stderr, "Unsupported type %d when generating testspec\n", type);
-        return;
+        fprintf(stderr, "Unsupported type %d (%s) when generating testspec.\n",
+                type, input_layer_type[i].c_str());
+        return false;
     }
   }
 
@@ -82,6 +101,8 @@ void GenerateTestSpecFromTensorflowModel(
     stream << "  output: \"" << runner.ReadOutput(i) << "\"\n";
   }
   stream << "}\n";
+
+  return true;
 }
 
 }  // namespace testing
diff --git a/tensorflow/contrib/lite/testing/generate_testspec.h b/tensorflow/contrib/lite/testing/generate_testspec.h
index 3529ee709b66625fff6e2a35b78e47f3778f0fe7..6e31a853c3f7f82a89126ff83af784ffd418741a 100644
--- a/tensorflow/contrib/lite/testing/generate_testspec.h
+++ b/tensorflow/contrib/lite/testing/generate_testspec.h
@@ -34,7 +34,7 @@ namespace testing {
 //   input_layer_type: datatypes of input tensors. Example: float
 //   input_layer_shape: shapes of input tensors, separated by comma. example:
 //   1,3,4 output_layer: names of output tensors. Example: output
-void GenerateTestSpecFromTensorflowModel(
+bool GenerateTestSpecFromTensorflowModel(
     std::iostream& stream, const string& tensorflow_model_path,
     const string& tflite_model_path, const std::vector<string>& input_layer,
     const std::vector<string>& input_layer_type,
diff --git a/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc b/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc
index e9d505a76d15c8eaf1d3b6ba55bffe512532585e..7426ab56af3344eb9ca66aa23476c84b3b391a34 100644
--- a/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc
+++ b/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc
@@ -91,6 +91,14 @@ std::map<string, string> kBrokenTests = {
 
     // PRelu only supports 4D input with (1, 1, channels) 3D alpha now.
     {R"(^\/prelu.*shared_axes=\[1\])", "75975192"},
+
+    // No support for axis!=0 in GatherV2.
+    {R"(^\/gather.*axis=1)", "76910444"},
+
+    // No support for arbitrary dimensions in ArgMax.
+    {R"(^\/arg_max.*axis=0)", "77546240"},
+    {R"(^\/arg_max.*axis=1)", "77546240"},
+    {R"(^\/arg_max.*axis=2)", "77546240"},
 };
 
 // Allows test data to be unzipped into a temporary directory and makes
@@ -233,6 +241,7 @@ TEST_P(OpsTest, RunStuff) {
       ::testing::ValuesIn(UnarchiveZipAndFindTestNames(#zip_base ".zip")));
 
 INSTANTIATE_TESTS(add)
+INSTANTIATE_TESTS(arg_max)
 INSTANTIATE_TESTS(avg_pool)
 INSTANTIATE_TESTS(batch_to_space_nd)
 INSTANTIATE_TESTS(concat)
@@ -253,6 +262,7 @@ INSTANTIATE_TESTS(log_softmax)
 INSTANTIATE_TESTS(maximum)
 INSTANTIATE_TESTS(max_pool)
 INSTANTIATE_TESTS(mean)
+INSTANTIATE_TESTS(minimum)
 INSTANTIATE_TESTS(mul)
 INSTANTIATE_TESTS(pad)
 INSTANTIATE_TESTS(relu)
diff --git a/tensorflow/contrib/lite/testing/tf_driver.cc b/tensorflow/contrib/lite/testing/tf_driver.cc
index 2c253bb1983e5ddc5bc12858c929585d1bcee710..7b295875aab12bf48da2341ce05dd53442464cf0 100644
--- a/tensorflow/contrib/lite/testing/tf_driver.cc
+++ b/tensorflow/contrib/lite/testing/tf_driver.cc
@@ -87,10 +87,9 @@ TfDriver::TfDriver(const std::vector<string>& input_layer,
 
 void TfDriver::LoadModel(const string& bin_file_path) {
   if (!IsValid()) return;
-  std::cout << std::endl << "Loading model: " << bin_file_path << std::endl;
   std::ifstream model(bin_file_path);
   if (model.fail()) {
-    Invalidate("Failed to find the model");
+    Invalidate("Failed to find the model " + bin_file_path);
     return;
   }
 
@@ -121,6 +120,10 @@ void TfDriver::SetInput(int id, const string& csv_values) {
       FillTensorWithData<int32_t>(&tensor, csv_values);
       break;
     }
+    case tensorflow::DT_UINT8: {
+      FillTensorWithData<uint8_t>(&tensor, csv_values);
+      break;
+    }
     default:
       fprintf(stderr, "Unsupported type %d in SetInput\n", input_types_[id]);
       Invalidate("Unsupported tensor data type");
@@ -162,6 +165,8 @@ string TfDriver::ReadOutput(int id) {
       return TensorDataToCsvString<float>(output_tensors_[id]);
     case tensorflow::DT_INT32:
       return TensorDataToCsvString<int32_t>(output_tensors_[id]);
+    case tensorflow::DT_UINT8:
+      return TensorDataToCsvString<uint8_t>(output_tensors_[id]);
     default:
       fprintf(stderr, "Unsupported type %d in ResetTensor\n", input_types_[id]);
       Invalidate("Unsupported tensor data type");
diff --git a/tensorflow/contrib/lite/testing/tflite_diff_example_test.cc b/tensorflow/contrib/lite/testing/tflite_diff_example_test.cc
index 3817e68111dbaaf2a38ceff9fbc38f30f303cb5f..5afa0f800cdaa8bf70a11cb6e2ac64ace8138e79 100644
--- a/tensorflow/contrib/lite/testing/tflite_diff_example_test.cc
+++ b/tensorflow/contrib/lite/testing/tflite_diff_example_test.cc
@@ -19,10 +19,13 @@ limitations under the License.
 int main(int argc, char** argv) {
   ::tflite::testing::DiffOptions options =
       ::tflite::testing::ParseTfliteDiffFlags(&argc, argv);
+  if (options.tensorflow_model.empty()) return 1;
+  int failure_count = 0;
   for (int i = 0; i < 100; i++) {
     if (!tflite::testing::RunDiffTest(options)) {
-      return 1;
+      ++failure_count;
     }
   }
-  return 0;
+  fprintf(stderr, "Num errors: %d\n", failure_count);
+  return failure_count != 0 ? 1 : 0;
 }
diff --git a/tensorflow/contrib/lite/testing/tflite_diff_flags.h b/tensorflow/contrib/lite/testing/tflite_diff_flags.h
index 5f1129d501b7235f1202b704cf36904e07b8720e..706108ed73bb3fd9bd784cffffe322d6981433e6 100644
--- a/tensorflow/contrib/lite/testing/tflite_diff_flags.h
+++ b/tensorflow/contrib/lite/testing/tflite_diff_flags.h
@@ -51,9 +51,11 @@ DiffOptions ParseTfliteDiffFlags(int* argc, char** argv) {
                        "output_1,output_2"),
   };
 
+  bool no_inputs = *argc == 1;
   bool success = tensorflow::Flags::Parse(argc, argv, flags);
-  if (!success || (*argc == 2 && !strcmp(argv[1], "--helpfull"))) {
+  if (!success || no_inputs || (*argc == 2 && !strcmp(argv[1], "--helpfull"))) {
     fprintf(stderr, "%s", tensorflow::Flags::Usage(argv[0], flags).c_str());
+    return {};
   }
 
   return {values.tensorflow_model,
diff --git a/tensorflow/contrib/lite/testing/tflite_diff_util.cc b/tensorflow/contrib/lite/testing/tflite_diff_util.cc
index 9ef4e1f66c7d31c746c18d63495e760585d4af9e..f601d3752ddb5df9f2b5ac73d9bc303efaade4a5 100644
--- a/tensorflow/contrib/lite/testing/tflite_diff_util.cc
+++ b/tensorflow/contrib/lite/testing/tflite_diff_util.cc
@@ -27,13 +27,13 @@ namespace testing {
 
 bool RunDiffTest(const DiffOptions& options) {
   std::stringstream tflite_stream;
-  GenerateTestSpecFromTensorflowModel(
-      tflite_stream, options.tensorflow_model, options.tflite_model,
-      options.input_layer, options.input_layer_type, options.input_layer_shape,
-      options.output_layer);
+  if (!GenerateTestSpecFromTensorflowModel(
+          tflite_stream, options.tensorflow_model, options.tflite_model,
+          options.input_layer, options.input_layer_type,
+          options.input_layer_shape, options.output_layer))
+    return false;
   TfLiteDriver tflite_driver(/*use_nnapi=*/true);
   tflite_driver.LoadModel(options.tflite_model);
-  std::cout << tflite_stream.str();
   return tflite::testing::ParseAndRunTests(&tflite_stream, &tflite_driver);
 }
 }  // namespace testing
diff --git a/tensorflow/contrib/lite/testing/tflite_driver.cc b/tensorflow/contrib/lite/testing/tflite_driver.cc
index 613223f3d4ff212cb8672494243b2d7a1d06b3db..3764bab03575f41760e8c81bf36e11f1fd984933 100644
--- a/tensorflow/contrib/lite/testing/tflite_driver.cc
+++ b/tensorflow/contrib/lite/testing/tflite_driver.cc
@@ -56,12 +56,16 @@ void SetTensorData(const std::vector<T>& values, TfLitePtrUnion* data) {
 
 class TfLiteDriver::Expectation {
  public:
-  Expectation() { data_.raw = nullptr; }
+  Expectation() {
+    data_.raw = nullptr;
+    num_elements_ = 0;
+  }
   ~Expectation() { delete[] data_.raw; }
   template <typename T>
   void SetData(const string& csv_values) {
     const auto& values = testing::Split<T>(csv_values, ",");
-    data_.raw = new char[values.size() * sizeof(T)];
+    num_elements_ = values.size();
+    data_.raw = new char[num_elements_ * sizeof(T)];
     SetTensorData(values, &data_);
   }
 
@@ -88,7 +92,13 @@ class TfLiteDriver::Expectation {
     constexpr double kRelativeThreshold = 1e-2f;
     constexpr double kAbsoluteThreshold = 1e-4f;
 
-    int tensor_size = tensor.bytes / sizeof(T);
+    size_t tensor_size = tensor.bytes / sizeof(T);
+
+    if (tensor_size != num_elements_) {
+      std::cerr << "Expected a tensor with " << num_elements_
+                << " elements, got " << tensor_size << std::endl;
+      return false;
+    }
 
     bool good_output = true;
     for (int i = 0; i < tensor_size; ++i) {
@@ -115,6 +125,7 @@ class TfLiteDriver::Expectation {
   }
 
   TfLitePtrUnion data_;
+  size_t num_elements_;
 };
 
 TfLiteDriver::TfLiteDriver(bool use_nnapi) : use_nnapi_(use_nnapi) {}
@@ -132,7 +143,6 @@ void TfLiteDriver::AllocateTensors() {
 
 void TfLiteDriver::LoadModel(const string& bin_file_path) {
   if (!IsValid()) return;
-  std::cout << std::endl << "Loading model: " << bin_file_path << std::endl;
 
   model_ = FlatBufferModel::BuildFromFile(GetFullPath(bin_file_path).c_str());
   if (!model_) {
diff --git a/tensorflow/contrib/lite/toco/BUILD b/tensorflow/contrib/lite/toco/BUILD
index 102740ee4725904918ce551d1a3e233ee6f8cc57..8a35fb9034ca9cd1b9eb87956aed1eb96485dc9b 100644
--- a/tensorflow/contrib/lite/toco/BUILD
+++ b/tensorflow/contrib/lite/toco/BUILD
@@ -143,6 +143,7 @@ cc_library(
         ":toco_graphviz_dump_options",
         ":toco_port",
         ":types_proto_cc",
+        "//tensorflow/cc/saved_model:tag_constants",
         "//tensorflow/core:framework_internal",
         "//tensorflow/core:lib",
         "@com_google_absl//absl/strings",
@@ -233,6 +234,7 @@ cc_library(
         "graph_transformations/identify_relu1.cc",
         "graph_transformations/lstm_utils.cc",
         "graph_transformations/make_initial_dequantize_operator.cc",
+        "graph_transformations/merge_reshape_into_preceding_transpose.cc",
         "graph_transformations/propagate_activation_function_into_constants.cc",
         "graph_transformations/propagate_array_data_types.cc",
         "graph_transformations/propagate_fixed_sizes.cc",
@@ -250,7 +252,8 @@ cc_library(
         "graph_transformations/remove_trivial_reshape.cc",
         "graph_transformations/remove_trivial_slice.cc",
         "graph_transformations/remove_unused_op.cc",
-        "graph_transformations/reorder_activation_functions.cc",
+        "graph_transformations/reorder_elementwise_unary.cc",
+        "graph_transformations/reorder_reshape_transpose.cc",
         "graph_transformations/resolve_batch_normalization.cc",
         "graph_transformations/resolve_batch_to_space_nd_attributes.cc",
         "graph_transformations/resolve_constant_binary.cc",
@@ -258,6 +261,7 @@ cc_library(
         "graph_transformations/resolve_constant_fake_quant.cc",
         "graph_transformations/resolve_constant_fill.cc",
         "graph_transformations/resolve_constant_gather.cc",
+        "graph_transformations/resolve_constant_random_uniform.cc",
         "graph_transformations/resolve_constant_range.cc",
         "graph_transformations/resolve_constant_shape_or_rank.cc",
         "graph_transformations/resolve_constant_stack.cc",
@@ -418,15 +422,3 @@ tf_cc_test(
         "@com_google_googletest//:gtest_main",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/lite/toco/README.md b/tensorflow/contrib/lite/toco/README.md
index 281b2ea5e4c5553ff7aa240cdef3cb9819f19b49..522e260ad2a14c5f8e080c0a0f538f4192b7ed2d 100644
--- a/tensorflow/contrib/lite/toco/README.md
+++ b/tensorflow/contrib/lite/toco/README.md
@@ -1,26 +1,27 @@
-# The TensorFlow Lite Optimizing Converter
+# TOCO: TensorFlow Lite Optimizing Converter
 
-The TensorFlow Lite Optimizing Converter's most typical use is converting from the TensorFlow GraphDef to the TensorFlow Lite
-format, but it supports much more than that.
+The TensorFlow Lite Optimizing Converter converts TensorFlow graphs into
+TensorFlow Lite graphs. There are additional usages that are also detailed in
+the usage documentation.
 
 ## Usage documentation
 
 Usage information is given in these documents:
 
+*   [Command-line glossary](g3doc/cmdline_reference.md)
 *   [Command-line examples](g3doc/cmdline_examples.md)
-*   [Command-line reference](g3doc/cmdline_reference.md)
-*   [Python API](g3doc/python_api.md)
-
-## Design documentation
-
-Coming soon!
+*   [Python API examples](g3doc/python_api.md)
 
 ## Where the converter fits in the TensorFlow landscape
 
-In the typical case, an application developer is using TensorFlow to design and
-train models, then uses TensorFlow's freeze_graph.py to generate a frozen
-inference graph, then uses the converter to convert that into a TensorFlow Lite flatbuffer file,
-then ships that file to client devices where the TensorFlow Lite interpreter handles them
-on-device. This is represented in the following diagram:
-
-![drawing](https://storage.googleapis.com/download.tensorflow.org/example_images/tensorflow_landscape.svg)
+Once an application developer has a trained TensorFlow model, TOCO will accept
+that model and generate a TensorFlow Lite
+[FlatBuffer](https://google.github.io/flatbuffers/) file. TOCO currently supports
+[SavedModels](https://www.tensorflow.org/programmers_guide/saved_model#using_savedmodel_with_estimators)
+and frozen graphs (models generated via
+[freeze_graph.py](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/tools/freeze_graph.py)).
+The TensorFlow Lite FlatBuffer file can be shipped to client devices, generally
+mobile devices, where the TensorFlow Lite interpreter handles them on-device.
+This flow is represented in the diagram below.
+
+![drawing](g3doc/toco_landscape.svg)
diff --git a/tensorflow/contrib/lite/toco/allocate_transient_arrays.cc b/tensorflow/contrib/lite/toco/allocate_transient_arrays.cc
index 621fbcb98db049f819ebbbda8816ad4e30538530..1f3ea2e1c71e7de7e9ede2224796b489d7518d18 100644
--- a/tensorflow/contrib/lite/toco/allocate_transient_arrays.cc
+++ b/tensorflow/contrib/lite/toco/allocate_transient_arrays.cc
@@ -200,6 +200,12 @@ void DeallocateTransientArray(const Model& model, const string& array_name,
   allocator->Deallocate(*array->alloc);
 }
 
+void PushBackIfNotFound(const string& s, std::vector<string>* v) {
+  if (std::find(v->begin(), v->end(), s) == v->end()) {
+    v->push_back(s);
+  }
+}
+
 }  // namespace
 
 void AllocateTransientArrays(Model* model,
@@ -251,18 +257,12 @@ void AllocateTransientArrays(Model* model,
     std::vector<string> arrays_to_allocate;
     for (const auto& input : op->inputs) {
       if (StartsAt(array_lifespans[input], op_index)) {
-        if (std::find(arrays_to_allocate.begin(), arrays_to_allocate.end(),
-                      input) == arrays_to_allocate.end()) {
-          arrays_to_allocate.push_back(input);
-        }
+        PushBackIfNotFound(input, &arrays_to_allocate);
       }
     }
     for (const auto& output : op->outputs) {
       if (StartsAt(array_lifespans[output], op_index)) {
-        if (std::find(arrays_to_allocate.begin(), arrays_to_allocate.end(),
-                      output) == arrays_to_allocate.end()) {
-          arrays_to_allocate.push_back(output);
-        }
+        PushBackIfNotFound(output, &arrays_to_allocate);
       }
     }
     for (const string& array : arrays_to_allocate) {
@@ -274,18 +274,12 @@ void AllocateTransientArrays(Model* model,
     std::vector<string> arrays_to_deallocate;
     for (const auto& input : op->inputs) {
       if (EndsAt(array_lifespans[input], op_index)) {
-        if (std::find(arrays_to_deallocate.begin(), arrays_to_deallocate.end(),
-                      input) == arrays_to_deallocate.end()) {
-          arrays_to_deallocate.push_back(input);
-        }
+        PushBackIfNotFound(input, &arrays_to_deallocate);
       }
     }
     for (const auto& output : op->outputs) {
       if (EndsAt(array_lifespans[output], op_index)) {
-        if (std::find(arrays_to_deallocate.begin(), arrays_to_deallocate.end(),
-                      output) == arrays_to_deallocate.end()) {
-          arrays_to_deallocate.push_back(output);
-        }
+        PushBackIfNotFound(output, &arrays_to_deallocate);
       }
     }
     for (const string& array : arrays_to_deallocate) {
@@ -310,17 +304,21 @@ void AllocateTransientArrays(Model* model,
     // for each operator, compute the sum of the sizes of the array that must
     // be live during the execution of this operator, plus the size of
     // persistent arrays that must be live at all times.
-    std::size_t size = persistent_alloc_size;
+    std::vector<string> non_persistent_edges;
     for (const auto& input : op->inputs) {
       if (!array_lifespans[input].persistent) {
-        size += TransientArraySize(*model, input, transient_data_alignment);
+        PushBackIfNotFound(input, &non_persistent_edges);
       }
     }
     for (const auto& output : op->outputs) {
       if (!array_lifespans[output].persistent) {
-        size += TransientArraySize(*model, output, transient_data_alignment);
+        PushBackIfNotFound(output, &non_persistent_edges);
       }
     }
+    std::size_t size = persistent_alloc_size;
+    for (const string& edge : non_persistent_edges) {
+      size += TransientArraySize(*model, edge, transient_data_alignment);
+    }
     // The optimal total size is the maximum of all operator-specific sizes.
     optimal_transient_alloc_size = std::max(optimal_transient_alloc_size, size);
   }
diff --git a/tensorflow/contrib/lite/toco/args.h b/tensorflow/contrib/lite/toco/args.h
index 7b71792ff79604a61e0693415815bc86c8d6d1bc..7a7059e3572fbd8ca83fa149cbc0ee5d07883c45 100644
--- a/tensorflow/contrib/lite/toco/args.h
+++ b/tensorflow/contrib/lite/toco/args.h
@@ -26,6 +26,7 @@ limitations under the License.
 #endif
 #include "absl/strings/numbers.h"
 #include "absl/strings/str_split.h"
+#include "tensorflow/cc/saved_model/tag_constants.h"
 #include "tensorflow/contrib/lite/toco/toco_port.h"
 #include "tensorflow/contrib/lite/toco/toco_types.h"
 
@@ -201,6 +202,7 @@ struct ParsedModelFlags {
   Arg<toco::IntList> input_shape;
   Arg<toco::StringMapList> rnn_states;
   Arg<toco::StringMapList> model_checks;
+  Arg<bool> change_concat_input_ranges = Arg<bool>(true);
   // Debugging output options.
   // TODO(benoitjacob): these shouldn't be ModelFlags.
   Arg<string> graphviz_first_array;
@@ -210,6 +212,7 @@ struct ParsedModelFlags {
   Arg<bool> allow_nonexistent_arrays = Arg<bool>(false);
   Arg<bool> allow_nonascii_arrays = Arg<bool>(false);
   Arg<string> arrays_extra_info_file;
+  Arg<string> model_flags_file;
 };
 
 // Flags that describe the operation you would like to do (what conversion
@@ -220,7 +223,7 @@ struct ParsedTocoFlags {
   Arg<string> output_file;
   Arg<string> input_format = Arg<string>("TENSORFLOW_GRAPHDEF");
   Arg<string> output_format = Arg<string>("TFLITE");
-  Arg<string> savedmodel_tagset;
+  Arg<string> savedmodel_tagset = Arg<string>(tensorflow::kSavedModelTagServe);
   // TODO(aselle): command_line_flags  doesn't support doubles
   Arg<float> default_ranges_min = Arg<float>(0.);
   Arg<float> default_ranges_max = Arg<float>(0.);
diff --git a/tensorflow/contrib/lite/toco/export_tensorflow.cc b/tensorflow/contrib/lite/toco/export_tensorflow.cc
index 22a23357b36c16ea937e726f1e49aa95d7f964e3..99ccfaea648077b7b72af30b32dd53b42b85d3a2 100644
--- a/tensorflow/contrib/lite/toco/export_tensorflow.cc
+++ b/tensorflow/contrib/lite/toco/export_tensorflow.cc
@@ -37,6 +37,7 @@ limitations under the License.
 
 using tensorflow::DT_BOOL;
 using tensorflow::DT_FLOAT;
+using tensorflow::DT_INT16;
 using tensorflow::DT_INT32;
 using tensorflow::DT_INT64;
 using tensorflow::DT_UINT8;
@@ -357,6 +358,14 @@ void ConvertConvOperator(const Model& model, const ConvOperator& src_op,
   strides.mutable_list()->add_i(src_op.stride_height);
   strides.mutable_list()->add_i(src_op.stride_width);
   strides.mutable_list()->add_i(1);
+  if ((src_op.dilation_width_factor != 1) ||
+      (src_op.dilation_height_factor != 1)) {
+    auto& dilations = (*conv2d_op->mutable_attr())["dilations"];
+    dilations.mutable_list()->add_i(1);
+    dilations.mutable_list()->add_i(src_op.dilation_height_factor);
+    dilations.mutable_list()->add_i(src_op.dilation_width_factor);
+    dilations.mutable_list()->add_i(1);
+  }
   string padding;
   if (src_op.padding.type == PaddingType::kSame) {
     padding = "SAME";
@@ -391,84 +400,6 @@ void ConvertConvOperator(const Model& model, const ConvOperator& src_op,
   }
 }
 
-void ConvertDilatedConvOperator(const Model& model, const ConvOperator& src_op,
-                                GraphDef* tensorflow_graph) {
-  CHECK((src_op.dilation_width_factor > 1) ||
-        (src_op.dilation_height_factor > 1))
-      << "Conv operator must have height or width dilation factor > 1. "
-         "Otherwise, use regular conv op.";
-  CHECK_EQ(src_op.stride_width, 1)
-      << "Dilated AND strided convolution is unsupported";
-  CHECK_EQ(src_op.stride_height, 1)
-      << "Dilated AND strided convolution is unsupported";
-
-  // Emulate dilated convolution with a chain of SpaceToBatchND -> Conv ->
-  // BatchToSpaceND ops.
-
-  // Compute padding
-  const auto& input_array = model.GetArray(src_op.inputs[0]);
-  const auto& input_shape = input_array.shape();
-  CHECK_EQ(input_shape.dimensions_count(), 4);
-  int height_mod_dilation = input_shape.dims(1) % src_op.dilation_height_factor;
-  int pad_height;
-  if (height_mod_dilation) {
-    pad_height = src_op.dilation_height_factor - height_mod_dilation;
-  } else {
-    pad_height = 0;
-  }
-  int pad_width;
-  int width_mod_dilation = input_shape.dims(2) % src_op.dilation_width_factor;
-  if (width_mod_dilation) {
-    pad_width = src_op.dilation_width_factor - width_mod_dilation;
-  } else {
-    pad_width = 0;
-  }
-
-  // SpaceToBatchND op "collapses" the spatially separated elements together
-  string stb_output = src_op.outputs[0] + "/dilated_conv_SpaceToBatch";
-  auto* stb_op = tensorflow_graph->add_node();
-  stb_op->set_op("SpaceToBatchND");
-  stb_op->set_name(stb_output);
-  *stb_op->add_input() = src_op.inputs[0];
-  (*stb_op->mutable_attr())["T"].set_type(DT_FLOAT);
-  string block_shape = src_op.outputs[0] + "/dilated_conv_block_shape";
-  CreateIntTensorConst(
-      block_shape,
-      {src_op.dilation_height_factor, src_op.dilation_width_factor}, {2},
-      tensorflow_graph);
-  *stb_op->add_input() = block_shape;
-  (*stb_op->mutable_attr())["Tblock_shape"].set_type(DT_INT32);
-  string stb_paddings = src_op.outputs[0] + "/dilated_conv_paddings";
-  CreateIntTensorConst(stb_paddings, {0, pad_height, pad_width, 0}, {2, 2},
-                       tensorflow_graph);
-  *stb_op->add_input() = stb_paddings;
-  (*stb_op->mutable_attr())["Tpaddings"].set_type(DT_INT32);
-
-  // Perform a regular conv on the "collapsed" elements
-  ConvOperator conv_op;
-  string conv_output = src_op.outputs[0] + "/dilated_conv_Conv2D";
-  conv_op.inputs = src_op.inputs;
-  conv_op.inputs[0] = stb_output;
-  conv_op.outputs = {conv_output};
-  conv_op.padding.type = src_op.padding.type;
-  conv_op.stride_width = src_op.stride_width;
-  conv_op.stride_height = src_op.stride_height;
-  conv_op.dilation_width_factor = 1;
-  conv_op.dilation_height_factor = 1;
-  ConvertConvOperator(model, conv_op, tensorflow_graph);
-
-  // BatchToSpaceND op restores elements to their original layout
-  auto* bts_op = tensorflow_graph->add_node();
-  bts_op->set_op("BatchToSpaceND");
-  bts_op->set_name(src_op.outputs[0]);
-  *bts_op->add_input() = conv_output;
-  (*bts_op->mutable_attr())["T"].set_type(DT_FLOAT);
-  *bts_op->add_input() = block_shape;
-  (*bts_op->mutable_attr())["Tblock_shape"].set_type(DT_INT32);
-  *bts_op->add_input() = stb_paddings;
-  (*bts_op->mutable_attr())["Tcrops"].set_type(DT_INT32);
-}
-
 void ConvertDepthwiseConvOperator(const Model& model,
                                   const DepthwiseConvOperator& src_op,
                                   GraphDef* tensorflow_graph) {
@@ -773,6 +704,15 @@ void ConvertRelu6Operator(const Relu6Operator& src_op,
   (*relu_op->mutable_attr())["T"].set_type(DT_FLOAT);
 }
 
+void ConvertLogOperator(const LogOperator& src_op, GraphDef* tensorflow_graph) {
+  auto* op = tensorflow_graph->add_node();
+  op->set_op("Log");
+  op->set_name(src_op.outputs[0]);
+  CHECK_EQ(src_op.inputs.size(), 1);
+  *op->add_input() = src_op.inputs[0];
+  (*op->mutable_attr())["T"].set_type(DT_FLOAT);
+}
+
 void ConvertLogisticOperator(const LogisticOperator& src_op,
                              GraphDef* tensorflow_graph) {
   auto* relu_op = tensorflow_graph->add_node();
@@ -943,6 +883,9 @@ void ConvertFakeQuantOperator(const FakeQuantOperator& src_op,
   CHECK(src_op.minmax);
   (*fakequant_op->mutable_attr())["min"].set_f(src_op.minmax->min);
   (*fakequant_op->mutable_attr())["max"].set_f(src_op.minmax->max);
+  if (src_op.num_bits) {
+    (*fakequant_op->mutable_attr())["num_bits"].set_i(src_op.num_bits);
+  }
 }
 
 void ConvertMaxPoolOperator(const MaxPoolOperator& src_op,
@@ -1711,6 +1654,23 @@ void ConvertTopKV2Operator(const Model& model, const TopKV2Operator& src_op,
   (*topk_op->mutable_attr())["sorted"].set_b(true);
 }
 
+void ConvertRandomUniformOperator(const Model& model,
+                                  const RandomUniformOperator& src_op,
+                                  GraphDef* tensorflow_graph) {
+  CHECK(tensorflow_graph != nullptr);
+  auto* new_op = tensorflow_graph->add_node();
+  new_op->set_op("RandomUniform");
+  CHECK_EQ(src_op.inputs.size(), 1);
+  new_op->set_name(src_op.outputs[0]);
+  *new_op->add_input() = src_op.inputs[0];
+  const auto shape_type = GetTensorFlowDataType(model, src_op.inputs[0]);
+  (*new_op->mutable_attr())["T"].set_type(shape_type);
+  (*new_op->mutable_attr())["dtype"].set_type(
+      GetTensorFlowDataType(src_op.dtype));
+  (*new_op->mutable_attr())["seed"].set_i(src_op.seed);
+  (*new_op->mutable_attr())["seed2"].set_i(src_op.seed2);
+}
+
 void ConvertOperator(const Model& model, const Operator& src_op,
                      GraphDef* tensorflow_graph) {
   if (src_op.fused_activation_function != FusedActivationFunctionType::kNone) {
@@ -1719,13 +1679,8 @@ void ConvertOperator(const Model& model, const Operator& src_op,
   }
 
   if (src_op.type == OperatorType::kConv) {
-    const ConvOperator& conv_op = static_cast<const ConvOperator&>(src_op);
-    if ((conv_op.dilation_width_factor != 1) ||
-        (conv_op.dilation_height_factor != 1)) {
-      return ConvertDilatedConvOperator(model, conv_op, tensorflow_graph);
-    } else {
-      ConvertConvOperator(model, conv_op, tensorflow_graph);
-    }
+    ConvertConvOperator(model, static_cast<const ConvOperator&>(src_op),
+                        tensorflow_graph);
   } else if (src_op.type == OperatorType::kDepthwiseConv) {
     ConvertDepthwiseConvOperator(
         model, static_cast<const DepthwiseConvOperator&>(src_op),
@@ -1760,6 +1715,9 @@ void ConvertOperator(const Model& model, const Operator& src_op,
   } else if (src_op.type == OperatorType::kRelu6) {
     ConvertRelu6Operator(static_cast<const Relu6Operator&>(src_op),
                          tensorflow_graph);
+  } else if (src_op.type == OperatorType::kLog) {
+    ConvertLogOperator(static_cast<const LogOperator&>(src_op),
+                       tensorflow_graph);
   } else if (src_op.type == OperatorType::kLogistic) {
     ConvertLogisticOperator(static_cast<const LogisticOperator&>(src_op),
                             tensorflow_graph);
@@ -1897,6 +1855,10 @@ void ConvertOperator(const Model& model, const Operator& src_op,
     ConvertTransposeConvOperator(
         model, static_cast<const TransposeConvOperator&>(src_op),
         tensorflow_graph);
+  } else if (src_op.type == OperatorType::kRandomUniform) {
+    ConvertRandomUniformOperator(
+        model, static_cast<const RandomUniformOperator&>(src_op),
+        tensorflow_graph);
   } else {
     LOG(FATAL) << "Unhandled operator type " << OperatorTypeName(src_op.type);
   }
@@ -1922,6 +1884,9 @@ void AddPlaceholder(const string& name, ArrayDataType type,
     case ArrayDataType::kInt64:
       (*placeholder->mutable_attr())["dtype"].set_type(DT_INT64);
       break;
+    case ArrayDataType::kInt16:
+      (*placeholder->mutable_attr())["dtype"].set_type(DT_INT16);
+      break;
     default:
       LOG(FATAL) << "Unexpected data type in array \"" << name << "\"";
   }
diff --git a/tensorflow/contrib/lite/toco/g3doc/cmdline_examples.md b/tensorflow/contrib/lite/toco/g3doc/cmdline_examples.md
index 372c52558973f4aacc180ac44b9e95a5e9b199ef..495014c6fc67ab0ad7c975d0570034545e90f9bc 100644
--- a/tensorflow/contrib/lite/toco/g3doc/cmdline_examples.md
+++ b/tensorflow/contrib/lite/toco/g3doc/cmdline_examples.md
@@ -1,73 +1,72 @@
 # TensorFlow Lite Optimizing Converter command-line examples
 
-This page is a guide to using the TensorFlow Lite Optimizing Converter by
-looking at some example command lines. It is complemented by the following other
-documents:
+This page provides examples on how to use TOCO via command line. It is
+complemented by the following documents:
 
 *   [README](../README.md)
-*   [Command-line reference](cmdline_reference.md)
+*   [Command-line glossary](cmdline_reference.md)
+*   [Python API examples](python_api.md)
 
 Table of contents:
 
-[TOC]
-
-## Convert a TensorFlow GraphDef to TensorFlow Lite for float inference
-
-In this example, we look at the most common task: we have an ordinary TensorFlow
-GraphDef and want to convert it to a TensorFlow Lite flatbuffer to perform
-floating-point inference.
+*   [Convert a TensorFlow SavedModel to TensorFlow Lite](#savedmodel)
+*   [Convert a TensorFlow GraphDef to TensorFlow Lite for float
+    inference](#graphdef-float)
+*   [Quantization](#quantization)
+    *   [Convert a TensorFlow GraphDef to TensorFlow Lite for quantized
+        inference](#graphdef-quant)
+    *   [Use "dummy-quantization" to try out quantized inference on a float
+        graph](#dummy-quant)
+*   [Specifying input and output arrays](#specifying-input-and-output-arrays)
+    *   [Multiple output arrays](#multiple-output-arrays)
+    *   [Multiple input arrays](#multiple-input-arrays)
+    *   [Specifying subgraphs](#specifying-subgraphs)
+*   [Other conversions supported by TOCO](#other-conversions)
+    *   [Optimize a TensorFlow GraphDef](#optimize-graphdef)
+    *   [Convert a TensorFlow Lite FlatBuffer back into TensorFlow GraphDef
+        format](#to-graphdef)
+*   [Logging](#logging)
+    *   [Standard logging](#standard-logging)
+    *   [Verbose logging](#verbose-logging)
+    *   [Graph "video" logging](#graph-video-logging)
+*   [Graph visualizations](#graph-visualizations)
+    *   [Using --output_format=GRAPHVIZ_DOT](#using-output-formatgraphviz-dot)
+    *   [Using --dump_graphviz](#using-dump-graphviz)
+    *   [Legend for the graph visualizations](#graphviz-legend)
+
+## Convert a TensorFlow SavedModel to TensorFlow Lite <a name="savedmodel"></a>
+
+The follow example converts a basic TensorFlow SavedModel into a Tensorflow Lite
+FlatBuffer to perform floating-point inference.
 
 ```
-curl https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_0.50_128_frozen.tgz \
-  | tar xzv -C /tmp
 bazel run --config=opt \
-  //tensorflow/contrib/lite/toco:toco -- \
-  --input_file=/tmp/mobilenet_v1_0.50_128/frozen_graph.pb \
-  --output_file=/tmp/foo.tflite \
-  --input_format=TENSORFLOW_GRAPHDEF \
-  --output_format=TFLITE \
-  --inference_type=FLOAT \
-  --input_shape=1,128,128,3 \
-  --input_array=input \
-  --output_array=MobilenetV1/Predictions/Reshape_1
+  third_party/tensorflow/contrib/lite/toco:toco -- \
+  --savedmodel_directory=/tmp/saved_model \
+  --output_file=/tmp/foo.tflite
 ```
 
-To explain each of these flags:
-
-*   `--input_format` and `--output_format` determine the formats of the input
-    and output files: here we are converting from `TENSORFLOW_GRAPHDEF` to
-    `TFLITE`.
-*   `--input_file` specifies the path of the input file, to be converted. When
-    `--input_format=TENSORFLOW_GRAPHDEF`, this file should be a
-    *[frozen](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/tools/freeze_graph.py)*
-    *inference* graph. Being frozen means in particular that the input file is
-    self-contained, and does not reference any external "checkpoint" file. An
-    *inference* graph is a version of a graph meant to be used for inference,
-    typically not the same graph file as was used for training a given model.
-*   `--output_file` specifies the destination to write the converted file to.
-*   `--input_array` specifies the input activations, that is, the input "tensor"
-    in the input TensorFlow GraphDef file. The array designated by
-    `--input_array` is the one that the user will have to provide the contents
-    of as input to the runtime inference code.
-*   `--output_array` specifies the output activations, that is, the output
-    "tensor" in the input TensorFlow GraphDef file. The runtime inference code
-    will store its results in the array designated by `--output_array`.
-*   `--input_shape` specifies the shape of the input array. It is currently
-    required, but the plan is for a future version to no longer require it,
-    allowing to defer the specification of the input shape until runtime. The
-    format of `input_shape` is always a comma-separated list of dimensions,
-    always in TensorFlow convention.
-*   `--inference_type` specifies what type of arithmetic the output file should
-    be relying on. It implies in particular the choice of type of the output
-    arrays in the output file.
-
-## Just optimize a TensorFlow GraphDef
+[SavedModel](https://www.tensorflow.org/programmers_guide/saved_model#using_savedmodel_with_estimators)
+has fewer required flags than frozen graphs (described [below](#graphdef-float))
+due to access to additional data contained within the SavedModel. The values for
+`--input_arrays` and `--output_arrays` are an aggregated, alphabetized list of
+the inputs and outputs in the
+[SignatureDefs](https://www.tensorflow.org/serving/signature_defs) within the
+[MetaGraphDef](https://www.tensorflow.org/programmers_guide/saved_model#apis_to_build_and_load_a_savedmodel)
+specified by `--savedmodel_tagset`. The value for `input_shapes` is
+automatically determined from the MetaGraphDef whenever possible. The default
+value for `--inference_type` for SavedModels is `FLOAT`.
 
-The converter accepts both TENSORFLOW_GRAPHDEF and TFLITE file formats as both
-`--input_format` and `--output_format`. This means that conversion from and to
-any supported format is possible, and in particular, same-format "conversions"
-are possible, and effectively ask the converter to optimize and simplify a
-graph. Example:
+There is currently no support for MetaGraphDefs without a SignatureDef or for
+MetaGraphDefs that use the [`assets/`
+directory](https://www.tensorflow.org/programmers_guide/saved_model#structure_of_a_savedmodel_directory).
+
+## Convert a TensorFlow GraphDef to TensorFlow Lite for float inference <a name="graphdef-float"></a>
+
+The follow example converts a basic TensorFlow GraphDef (frozen by
+[freeze_graph.py](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/tools/freeze_graph.py))
+into a TensorFlow Lite FlatBuffer to perform floating-point inference. Frozen
+graphs contain the variables stored in Checkpoint files as Const ops.
 
 ```
 curl https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_0.50_128_frozen.tgz \
@@ -75,56 +74,27 @@ curl https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_
 bazel run --config=opt \
   //tensorflow/contrib/lite/toco:toco -- \
   --input_file=/tmp/mobilenet_v1_0.50_128/frozen_graph.pb \
-  --output_file=/tmp/foo.pb \
-  --input_format=TENSORFLOW_GRAPHDEF \
-  --output_format=TENSORFLOW_GRAPHDEF \
+  --output_file=/tmp/foo.tflite \
+  --inference_type=FLOAT \
   --input_shape=1,128,128,3 \
   --input_array=input \
   --output_array=MobilenetV1/Predictions/Reshape_1
 ```
 
-Here we did not pass `--inference_type` because it is not considered applicable
-to the TensorFlow GraphDef format (as far as we are concerned, TensorFlow
-GraphDefs are technically always float, and the only flavor of "quantized"
-GraphDef that the converter deals with is "FakeQuantized" graphs that are still
-technically float graphs).
+## Quantization
 
-Below in the section about passing arbitrary input/output arrays we give another
-example, using the converter to extract just a sub-graph from a TensorFlow
-GraphDef.
+### Convert a TensorFlow GraphDef to TensorFlow Lite for quantized inference <a name="graphdef-quant"></a>
 
-## Convert a TensorFlow Lite flatbuffer back into TensorFlow GraphDef format
+TOCO is compatible with fixed point quantization models described
+[here](https://www.tensorflow.org/performance/quantization). These are float
+models with
+[`FakeQuant*`](https://www.tensorflow.org/api_guides/python/array_ops#Fake_quantization)
+ops inserted at the boundaries of fused layers to record min-max range
+information. This generates a quantized inference workload that reproduces the
+quantization behavior that was used during training.
 
-As we mentioned that the converter supports file format conversions in any
-direction, let us just give an example of that:
-
-```
-bazel run --config=opt \
-  //tensorflow/contrib/lite/toco:toco -- \
-  --input_file=/tmp/foo.tflite \
-  --output_file=/tmp/foo.pb \
-  --input_format=TFLITE \
-  --output_format=TENSORFLOW_GRAPHDEF \
-  --input_shape=1,128,128,3 \
-  --input_array=input \
-  --output_array=MobilenetV1/Predictions/Reshape_1
-```
-
-## Convert a TensorFlow GraphDef to TensorFlow Lite for quantized inference
-
-Let us now look at a quantized model. As mentioned above, the only flavor of
-quantized TensorFlow GraphDefs that the converter is concerned with, is
-"FakeQuantized" models. These are technically float models, but with special
-`FakeQuant*` ops inserted at the boundaries of fused layers to record min-max
-range information allowing to generate a quantized inference workload that is
-able to reproduce exactly the specific quantization behavior that was used
-during training. Indeed, the whole point of quantized training is to allow for
-both training and inference to perform exactly the same arithmetic, so that the
-way that the training process about around quantization inaccuracy is
-effectively helping the quantized inference process to be more accurate.
-
-Given a quantized TensorFlow GraphDef, generating a quantized TensorFlow Lite
-flatbuffer is done like this:
+The following command generates a quantized TensorFlow Lite FlatBuffer from a
+"quantized" TensorFlow GraphDef.
 
 ```
 bazel run --config=opt \
@@ -141,36 +111,17 @@ bazel run --config=opt \
   --std_value=127
 ```
 
-Here, besides changing `--input_file` to point to a (fake-)quantized GraphDef,
-the only other changes are:
-
-*   To change `--inference_type` to `QUANTIZED_UINT8`. This effectively tells
-    the converter to generate an output file that performs quantized inference
-    on a quantized input.
-*   To pass `--mean_value` and `--std_value` flags to describe how the quantized
-    uint8 input array values are to be interpreted as the mathematical real
-    numbers that the graph is concerned with (keep in mind that even a
-    "fake-quantized" TensorFlow GraphDef is still technically a float graph).
-    The meaning of `--mean_value` and `--std_value` is explained in the
-    command-line reference; it suffices for now to say that they are a property
-    of each model.
+### Use \"dummy-quantization\" to try out quantized inference on a float graph <a name="dummy-quant"></a>
 
-## Use dummy-quantization to try out quantized inference on a float graph
+In order to evaluate the possible benefit of generating a quantized graph, TOCO
+allows "dummy-quantization" on float graphs. The flags `--default_ranges_min`
+and `--default_ranges_max` accept plausable values for the min-max ranges of the
+values in all arrays that do not have min-max information. "Dummy-quantization"
+will produce lower accuracy but will emulate the performance of a correctly
+quantized model.
 
-Sometimes, one only has a plain float graph, and one is curious as to how much
-faster inference might run if one could perform quantized inference instead of
-float inference. Rather than requiring users to first invest in quantizing their
-graphs before they can evaluate a possible benefit, the converter allows to
-simply experiment with what we call "dummy quantization": provide some vaguely
-plausible values for the min-max ranges of values in all arrays that do not have
-min-max information, so that quantization can carry on, certainly producing
-inaccurate results (do not use that in production!) but with performance
-characteristics that should be identical to those of an actually quantized
-flavor of the model.
-
-In the present example, we have a model using Relu6 activation functions almost
-everywhere, so a reasonable guess is that most activation ranges should be
-contained in [0, 6] and roughly comparable to it.
+The example below contains a model using Relu6 activation functions. Therefore,
+a reasonable guess is that most activation ranges should be contained in [0, 6].
 
 ```
 curl https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_0.50_128_frozen.tgz \
@@ -191,15 +142,13 @@ bazel run --config=opt \
   --std_value=127.5
 ```
 
-## Multiple output arrays
+## Specifying input and output arrays
 
-Some models have multiple outputs. Even in a model with only one output, you may
-want for the inference code to return the contents of other arrays as well, or
-to perform inference on a subgraph with multiple outputs (see the section below
-on specifying arbitrary arrays as input/output arrays).
+### Multiple output arrays
 
-Either way, using `--output_arrays` instead of `--output_array` allows to
-specify a comma-separated list of output arrays.
+The flag `output_arrays` takes in a comma-separated list of output arrays as
+seen in the example below. This is useful for models or subgraphs with multiple
+outputs.
 
 ```
 curl https://storage.googleapis.com/download.tensorflow.org/models/inception_v1_2016_08_28_frozen.pb.tar.gz \
@@ -216,18 +165,11 @@ bazel run --config=opt \
   --output_arrays=InceptionV1/InceptionV1/Mixed_3b/Branch_1/Conv2d_0a_1x1/Relu,InceptionV1/InceptionV1/Mixed_3b/Branch_2/Conv2d_0a_1x1/Relu
 ```
 
-## Multiple input arrays
-
-Some models have multiple inputs; even in a model with a single input, you may
-want for the inference code to implement only a subgraph with multiple inputs
-(see the section below on specifying arbitrary arrays as input/output arrays).
+### Multiple input arrays
 
-Either way, multiple input arrays are specified by using `--input_arrays`
-instead of `--input_array` to specify a comma-separated list of input arrays. In
-that case, one also needs to use `--input_shapes` instead of `--input_shape`.
-The syntax for `--input_shapes` is a bit trickier, since already the singular
-`--input_shape` was a comma-separated list of integers! Multiple input shapes
-are delimited by a colon (`:`) in `--input_shapes`.
+The flag `input_arrays` takes in a comma-separated list of input arrays as seen
+in the example below. This is useful for models or subgraphs with multiple
+inputs.
 
 ```
 curl https://storage.googleapis.com/download.tensorflow.org/models/inception_v1_2016_08_28_frozen.pb.tar.gz \
@@ -244,54 +186,93 @@ bazel run --config=opt \
   --output_array=InceptionV1/Logits/Predictions/Reshape_1
 ```
 
-## Specifying arbitrary arrays in a graph as input or output arrays
+Note that `input_shapes` is provided as a colon-separated list. Each input shape
+corresponds to the input array at the same position in the respective list.
 
-Any array in the input file can be specified as an input or output array. This
-allows to use the converter to extract a sub-graph out of the input graph file.
-The converter then automatically discards any part of the graph that is not
-needed for the subgraph identified by the specified input and output arrays.
-Another use case for specifying multiple output arrays is to get inference code
-to return the contents of some specified intermediate activations array, not
-just the output activations.
+### Specifying subgraphs
 
-In order to know which array you want to pass as `--input_arrays` /
-`--output_arrays`, it helps to have a visualization of the graph. See the
-section below on graph visualization. When using graph visualization for that
-purpose, make sure to use `--dump_graphviz=` to visualize exactly the graph as
-it is in the actual final form being exported to the output file.
+Any array in the input file can be specified as an input or output array in
+order to extract subgraphs out of an input graph file. TOCO discards the parts
+of the graph outside of the specific subgraph. Use [graph
+visualizations](#graph-visualizations) to identify the input and output arrays
+that make up the desired subgraph.
+
+The follow command shows how to extract a single fused layer out of a TensorFlow
+GraphDef.
+
+```
+curl https://storage.googleapis.com/download.tensorflow.org/models/inception_v1_2016_08_28_frozen.pb.tar.gz \
+  | tar xzv -C /tmp
+bazel run --config=opt \
+  //tensorflow/contrib/lite/toco:toco -- \
+  --input_file=/tmp/inception_v1_2016_08_28_frozen.pb \
+  --output_file=/tmp/foo.pb \
+  --input_format=TENSORFLOW_GRAPHDEF \
+  --output_format=TENSORFLOW_GRAPHDEF \
+  --input_shapes=1,28,28,96:1,28,28,16:1,28,28,192:1,28,28,64 \
+  --input_arrays=InceptionV1/InceptionV1/Mixed_3b/Branch_1/Conv2d_0a_1x1/Relu,InceptionV1/InceptionV1/Mixed_3b/Branch_2/Conv2d_0a_1x1/Relu,InceptionV1/InceptionV1/Mixed_3b/Branch_3/MaxPool_0a_3x3/MaxPool,InceptionV1/InceptionV1/Mixed_3b/Branch_0/Conv2d_0a_1x1/Relu \
+  --output_array=InceptionV1/InceptionV1/Mixed_3b/concat_v2
+```
 
 Note that the final representation of an on-device inference workload (say, in
-TensorFlow Lite flatbuffers format) tends to have coarser granularity than the
+TensorFlow Lite FlatBuffers format) tends to have coarser granularity than the
 very fine granularity of the TensorFlow GraphDef representation. For example,
 while a fully-connected layer is typically represented as at least four separate
 ops in TensorFlow GraphDef (Reshape, MatMul, BiasAdd, Relu...), it is typically
 represented as a single "fused" op (FullyConnected) in the converter's optimized
 representation and in the final on-device representation (e.g. in TensorFlow
-Lite flatbuffer format). As the level of granularity gets coarser, some
+Lite FlatBuffer format). As the level of granularity gets coarser, some
 intermediate arrays (say, the array between the MatMul and the BiasAdd in the
 TensorFlow GraphDef) are dropped. When specifying intermediate arrays as
-`--input_arrays` / `--output_arrays`, it is generally at least desirable (and
-often required) to specify arrays that are meant to survive in the final form of
-the graph, after fusing. These are typically the outputs of activation functions
-(since everything in each layer until the activation function tends to get
-fused).
+`--input_arrays` / `--output_arrays`, it is desirable (and often required) to
+specify arrays that are meant to survive in the final form of the graph, after
+fusing. These are typically the outputs of activation functions (since
+everything in each layer until the activation function tends to get fused).
+
+## Other conversions supported by TOCO <a name="other-conversions"></a>
+
+The converter accepts both TENSORFLOW_GRAPHDEF and TFLITE file formats as both
+`--input_format` and `--output_format`. This means that conversion to and from
+any supported format is possible.
 
-Here is an example of extracting just a sub-graph, namely just a single fused
-layer, out of a TensorFlow GraphDef, and exporting a TensorFlow GraphDef
-containing just that subgraph:
+### Optimize a TensorFlow GraphDef <a name="optimize-graphdef"></a>
+
+Same-format "conversions" can be used to optimize and simplify a graph or be
+used to [get a subgraph](#specifying-subgraphs) of a graph. The flag
+`--inference_type` is not required because TensorFlow graphs, including those
+containing the
+[`FakeQuant*`](https://www.tensorflow.org/api_guides/python/array_ops#Fake_quantization)
+ops are always float graphs.
 
 ```
-curl https://storage.googleapis.com/download.tensorflow.org/models/inception_v1_2016_08_28_frozen.pb.tar.gz \
+curl https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_0.50_128_frozen.tgz \
   | tar xzv -C /tmp
 bazel run --config=opt \
   //tensorflow/contrib/lite/toco:toco -- \
-  --input_file=/tmp/inception_v1_2016_08_28_frozen.pb \
+  --input_file=/tmp/mobilenet_v1_0.50_128/frozen_graph.pb \
   --output_file=/tmp/foo.pb \
   --input_format=TENSORFLOW_GRAPHDEF \
   --output_format=TENSORFLOW_GRAPHDEF \
-  --input_shapes=1,28,28,96:1,28,28,16:1,28,28,192:1,28,28,64 \
-  --input_arrays=InceptionV1/InceptionV1/Mixed_3b/Branch_1/Conv2d_0a_1x1/Relu,InceptionV1/InceptionV1/Mixed_3b/Branch_2/Conv2d_0a_1x1/Relu,InceptionV1/InceptionV1/Mixed_3b/Branch_3/MaxPool_0a_3x3/MaxPool,InceptionV1/InceptionV1/Mixed_3b/Branch_0/Conv2d_0a_1x1/Relu \
-  --output_array=InceptionV1/InceptionV1/Mixed_3b/concat_v2
+  --input_shape=1,128,128,3 \
+  --input_array=input \
+  --output_array=MobilenetV1/Predictions/Reshape_1
+```
+
+### Convert a TensorFlow Lite FlatBuffer back into TensorFlow GraphDef format <a name="to-graphdef"></a>
+
+The converter supports file format conversions from TensorFlow Lite, back into
+TensorFlow GraphDef format.
+
+```
+bazel run --config=opt \
+  //tensorflow/contrib/lite/toco:toco -- \
+  --input_file=/tmp/foo.tflite \
+  --output_file=/tmp/foo.pb \
+  --input_format=TFLITE \
+  --output_format=TENSORFLOW_GRAPHDEF \
+  --input_shape=1,128,128,3 \
+  --input_array=input \
+  --output_array=MobilenetV1/Predictions/Reshape_1
 ```
 
 ## Logging
@@ -299,8 +280,8 @@ bazel run --config=opt \
 ### Standard logging
 
 The converter generates some informative log messages during processing. The
-easiest way to view them is to add `--logtostderr` to command lines. For the
-previous example, that gives:
+easiest way to view them is to add `--logtostderr` to command lines as seen in
+the following example.
 
 ```
 curl https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_0.50_128_frozen.tgz \
@@ -333,42 +314,34 @@ I1101 21:51:33.309484    5339 toco_tooling.cc:249] Estimated count of arithmetic
 For debugging purposes, the converter supports two levels of verbose logging,
 which can be set by passing a `--v=` flag:
 
-*   At `--v=1`, the converter generates text dumps of the graph at various
-    points during processing, as well as log messages about every graph
-    transformation that did take place, typically answering questions of the
-    form "why was my graph transformed in this way"?
-*   At `--v=2`, the converter additionally generates log messages about graph
-    transformations that were considered but not actually performed, typically
-    answering questions of the form "why was my graph NOT transformed when I
-    expected it would be?".
+*   For `--v=1`, the converter generates text dumps of the graph at various
+    points during processing as well as log messages about every graph
+    transformation that took place.
+*   For `--v=2`, the converter additionally generates log messages about graph
+    transformations that were considered but not performed.
 
 ### Graph "video" logging
 
-When `--dump_graphviz=` is used (see the section on Graph visualizations), one
-may additionally pass `--dump_graphviz_video`, which causes a graph
-visualization to be dumped after each individual graph transformations, often
-resulting in thousands of files. Typically, one would then bisect into these
-files to understand when a given change was introduced in the graph.
+When `--dump_graphviz=` is used (see the section on [graph
+visualizations](#graph-visualizations)), one may additionally pass
+`--dump_graphviz_video`, which causes a graph visualization to be dumped after
+each individual graph transformation. This results in thousands of files.
+Typically, one would then bisect into these files to understand when a given
+change was introduced in the graph.
 
 ## Graph visualizations
 
-The converter is able to export a graph to the GraphViz Dot format, for easy
-visualization. Combined with the converter's ability to transform the graph into
-a simpler, coarser-granularity representation, that makes it a very powerful
-visualization tool.
-
-There are two ways to get the converter to export a GraphViz Dot file,
-corresponding to two separate use cases. Understanding the difference between
-them is key to getting useful graph visualizations.
+TOCO can export a graph to the GraphViz Dot format for easy visualization via
+either the `--output_format` flag or the `--dump_graphviz` flag. The subsections
+below outline the use cases for each.
 
 ### Using `--output_format=GRAPHVIZ_DOT`
 
-The first way to get a graphviz rendering is to pass
-`--output_format=GRAPHVIZ_DOT`, instead of the `--output_format` that you would
-otherwise use. This says: "I just want to get a plausible visualization of that
-graph". The upside is that it makes for very simple command lines, and makes the
-converter very lax about aspects of the graph or the command line that it would
-otherwise complain about. Example:
+The first way to get a graphviz rendering is to pass `GRAPHVIZ_DOT` into
+`--output_format`. This results in a plausable visualization of the graph. This
+reduces the requirements that normally exist during conversion between other
+input and output formats. For example, this may be useful if conversion from
+TENSORFLOW_GRAPHDEF to TFLITE is failing.
 
 ```
 curl https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_0.50_128_frozen.tgz \
@@ -391,7 +364,7 @@ dot -Tpdf -O /tmp/foo.dot
 ```
 
 And the resulting `.dot.pdf` can be viewed in any PDF viewer, but we suggest one
-with a good ability to pan and zoom across a very large page; Google Chrome does
+with a good ability to pan and zoom across a very large page. Google Chrome does
 well in that respect.
 
 ```
@@ -400,14 +373,14 @@ google-chrome /tmp/foo.dot.pdf
 
 Example PDF files are viewable online in the next section.
 
-### Using `--dump_graphviz=`
+### Using `--dump_graphviz`
 
-The second way to get a graphviz rendering is to pass a `--dump_graphviz=` flag
-specifying a destination directory to dump GraphViz rendering to. Unlike the
-previous approach, this one allows you to keep your real command-line (with your
-real `--output_format` and other flags) unchanged, just appending a
-`--dump_graphviz=` flag to it. This says: "I want visualizations of the actual
-graph during this specific conversion process". Example:
+The second way to get a graphviz rendering is to pass the `--dump_graphviz=`
+flag, specifying a destination directory to dump GraphViz rendering to. Unlike
+the previous approach, this one allows you to keep your real command-line (with
+your real `--output_format` and other flags) unchanged, just appending a
+`--dump_graphviz=` flag to it. This provides a visualization of the actual graph
+during a specific conversion process.
 
 ```
 curl https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_0.50_128_frozen.tgz \
@@ -425,8 +398,8 @@ bazel run --config=opt \
   --dump_graphviz=/tmp
 ```
 
-This generates a few files in the destination directory, here `/tmp`. Most
-important are these two files:
+This generates a few files in the destination directory, here `/tmp`. The two
+most important files are:
 
 ```
 /tmp/toco_AT_IMPORT.dot
@@ -442,8 +415,7 @@ conversion subsequently fails).
 
 `toco_AFTER_TRANSFORMATIONS.dot` represents the graph after all transformations
 were applied to it, just before it was exported to the `--output_file`.
-Typically, this is a much smaller graph, and it conveys much more information
-about each node.
+Typically, this is a much smaller graph with more information about each node.
 
 Again, these can be rendered to PDFs:
 
@@ -451,12 +423,12 @@ Again, these can be rendered to PDFs:
 dot -Tpdf -O /tmp/toco_*.dot
 ```
 
-The resulting files can be seen here:
+Sample output files can be seen here:
 
 *   [toco_AT_IMPORT.dot.pdf](https://storage.googleapis.com/download.tensorflow.org/example_images/toco_AT_IMPORT.dot.pdf)
 *   [toco_AFTER_TRANSFORMATIONS.dot.pdf](https://storage.googleapis.com/download.tensorflow.org/example_images/toco_AFTER_TRANSFORMATIONS.dot.pdf).
 
-### Legend for the graph visualizations
+### Legend for the graph visualizations <a name="graphviz-legend"></a>
 
 *   Operators are red square boxes with the following hues of red:
     *   Most operators are
diff --git a/tensorflow/contrib/lite/toco/g3doc/cmdline_reference.md b/tensorflow/contrib/lite/toco/g3doc/cmdline_reference.md
index 5e077952235fa1aac1e12403d3d83633a617ccb7..9e99287f828c22aa81eb216c087f3261e378fc14 100644
--- a/tensorflow/contrib/lite/toco/g3doc/cmdline_reference.md
+++ b/tensorflow/contrib/lite/toco/g3doc/cmdline_reference.md
@@ -1,84 +1,47 @@
-# TensorFlow Lite Optimizing Converter command-line reference
+# TensorFlow Lite Optimizing Converter command-line glossary
 
 This page is complete reference of command-line flags. It is complemented by the
 following other documents:
 
 *   [README](../README.md)
 *   [Command-line examples](cmdline_examples.md)
+*   [Python API examples](python_api.md)
 
 Table of contents:
 
-[TOC]
-
-## High-level overview
-
-A full list and detailed specification of all flags is given in the next
-section. For now we focus on a higher-level description of command lines:
-
-```
-toco \
-  --input_format=... \
-  --output_format=... \
-  --input_file=... \
-  --output_file=... \
-  [model flags...] \
-  [transformation flags...] \
-  [logging flags...]
-```
-
-In other words, the converter requires at least the following mandatory flags:
-`--input_format`, `--output_format`, `--input_file`, `--output_file`. Depending
-on the input and output formats, additional flags may be allowed or mandatory:
-
-*   *Model flags* provide additional information about the model stored in the
-    input file.
-    *   `--output_array` or `--output_arrays` specify which arrays in the input
-        file are to be considered the output activations.
-    *   `--input_array` or `--input_arrays` specify which arrays in the input
-        file are to be considered the input activations.
-    *   `--input_shape` or `--input_shapes` specify the shapes of the input
-        arrays.
-    *   `--input_data_type` or `--input_data_types` specify the data types of
-        input arrays, which can be used if the input file does not already
-        specify them.
-    *   `--mean_value` or `--mean_values`, and `--std_value` or `--std_values`,
-        give the dequantization parameters of the input arrays, for the case
-        when the output file will accept quantized input arrays.
-*   *Transformation flags* specify options of the transformations to be applied
-    to the graph, i.e. they specify requested properties that the output file
-    should have.
-    *   `--inference_type` specifies the type of real-numbers arrays in the
-        output file. This only affects arrays of real numbers and allows to
-        control their quantization or dequantization, effectively switching
-        between floating-point and quantized arithmetic for the inference
-        workload, as far as real numbers are concerned. Other data types are
-        unaffected (e.g. plain integers, and strings).
-    *   `--inference_input_type` is like `--inference_type` but specifically
-        controlling input arrays, separately from other arrays. If not
-        specified, then `--inference_type` is used. The use case for specifying
-        `--inference_input_type` is when one wants to perform floating-point
-        inference on a quantized input, as is common in image models operating
-        on bitmap image inputs.
-    *   Some transformation flags allow to carry on with quantization when the
-        input graph is not properly quantized: `--default_ranges_min`,
-        `--default_ranges_max`, `--drop_fake_quant`,
-        `--reorder_across_fake_quant`.
-*   *Logging flags* described below.
-
-## Command-line flags complete reference
-
-### Mandatory flags
-
-*   `--input_format`. Type: string. Specifies the format of the input file.
-    Allowed values:
+*   [High-level flags](#high-level-flags)
+*   [Model flags](#model-flags)
+*   [Transformation flags](#transformation-flags)
+*   [Logging flags](#logging-flags)
+
+## High-level flags
+
+The following high level flags specify the location of the input and output
+files. The flag `--output_file` is always required. Additionally, either
+`--input_file` or `--savedmodel_directory` is required.
+
+*   `--savedmodel_directory`. Type: string. Specifies the full path to the
+    directory containing the SavedModel.
+*   `--savedmodel_tagset`. Type: string. Default:
+    [kSavedModelTagServe](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/cc/saved_model/tag_constants.h).
+    Specifies a comma-separated set of tags identifying the MetaGraphDef within
+    the SavedModel to analyze. All tags in the tag set must be specified.
+*   `--input_file`. Type: string. Specifies the path of the input file. This may
+    be either an absolute or a relative path.
+*   `--output_file`. Type: string. Specifies the path of the output file.
+
+The following high level flags specify the types of the input and output files:
+
+*   `--input_format`. Type: string. Default: `TENSORFLOW_GRAPHDEF`. Specifies
+    the format of the input file. Allowed values:
     *   `TENSORFLOW_GRAPHDEF` &mdash; The TensorFlow GraphDef format. Both
         binary and text proto formats are allowed.
-    *   `TFLITE` &mdash; The TensorFlow Lite flatbuffers format.
-*   `--output_format`. Type: string. Specifies the format of the output file.
-    Allowed values:
+    *   `TFLITE` &mdash; The TensorFlow Lite FlatBuffers format.
+*   `--output_format`. Type: string. Default: `TFLITE`. Specifies the format of
+    the output file. Allowed values:
     *   `TENSORFLOW_GRAPHDEF` &mdash; The TensorFlow GraphDef format. Always
         produces a file in binary (not text) proto format.
-    *   `TFLITE` &mdash; The TensorFlow Lite flatbuffers format.
+    *   `TFLITE` &mdash; The TensorFlow Lite FlatBuffers format.
         *   Whether a float or quantized TensorFlow Lite file will be produced
             depends on the `--inference_type` flag.
     *   `GRAPHVIZ_DOT` &mdash; The GraphViz `.dot` format. This asks the
@@ -95,11 +58,11 @@ on the input and output formats, additional flags may be allowed or mandatory:
             you get in your actual output format as opposed to just a merely
             plausible visualization of a model, consider using `--dump_graphviz`
             instead and keeping your true `--output_format`.
-*   `--input_file`. Type: string. Specifies the path of the input file. This may
-    be either an absolute or a relative path.
-*   `--output_file`. Type: string. Specifies the path of the output file.
 
-### Model flags
+## Model flags
+
+*Model flags* provide additional information about the model stored in the input
+file.
 
 *   `--output_array`. Type: string. Specifies a single array as the output
     activations. Incompatible with `--output_arrays`.
@@ -111,6 +74,10 @@ on the input and output formats, additional flags may be allowed or mandatory:
 *   `--input_arrays`. Type: comma-separated list of strings. Specifies a list of
     arrays as the input activations, for models with multiple inputs.
     Incompatible with `--input_array`.
+*   `--batch_size`. Type: integer. Default: 1. Specifies the batch size for the
+    model. Replaces the first dimension of an input size array if undefined. Use
+    only with SavedModels when neither `--input_shape` nor `input_shapes` flags
+    are specified. Incompatible with GraphDefs.
 
 When `--input_array` is used, the following flags are available to provide
 additional information about the single input array:
@@ -160,7 +127,11 @@ additional information about the multiple input arrays:
     the input arrays specified in `--input_arrays`, in the same order. See
     `--mean_value`, `--std_value` for details.
 
-### Transformation flags
+## Transformation flags
+
+*Transformation flags* specify options of the transformations to be applied to
+the graph, i.e. they specify requested properties that the output file should
+have.
 
 *   `--inference_type`. Type: string. Sets the type of real-number arrays in the
     output file, that is, controls the representation (quantization) of real
@@ -232,7 +203,7 @@ additional information about the multiple input arrays:
     graph transformations on them, at the cost of no longer faithfully matching
     inference and training arithmetic.
 
-### Logging flags
+## Logging flags
 
 The following are standard Google logging flags:
 
diff --git a/tensorflow/contrib/lite/toco/g3doc/python_api.md b/tensorflow/contrib/lite/toco/g3doc/python_api.md
index 36e2d9c37238bb6184ec99c567810b1bcb9a68ce..f0fd638a618c75c75d336a746f9b1d8dccaea470 100644
--- a/tensorflow/contrib/lite/toco/g3doc/python_api.md
+++ b/tensorflow/contrib/lite/toco/g3doc/python_api.md
@@ -1,5 +1,12 @@
 # TensorFlow Lite Optimizing Converter (TOCO) Python API reference
 
+This page provides examples on how to use TOCO via the Python API. It is
+complemented by the following documents:
+
+*   [README](../README.md)
+*   [Command-line examples](cmdline_examples.md)
+*   [Command-line glossary](cmdline_reference.md)
+
 ## High-level overview
 
 While the TensorFlow Lite Optimizing Converter can be used from the command
diff --git a/tensorflow/contrib/lite/toco/g3doc/toco_landscape.svg b/tensorflow/contrib/lite/toco/g3doc/toco_landscape.svg
new file mode 100644
index 0000000000000000000000000000000000000000..a47c088991299159be39bc490149720dae43eb53
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/g3doc/toco_landscape.svg
@@ -0,0 +1 @@
+<svg version="1.1" viewBox="0.0 0.0 720.0 540.0" fill="none" stroke="none" stroke-linecap="square" stroke-miterlimit="10" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns="http://www.w3.org/2000/svg"><clipPath id="p.0"><path d="m0 0l720.0 0l0 540.0l-720.0 0l0 -540.0z" clip-rule="nonzero"/></clipPath><g clip-path="url(#p.0)"><path fill="#000000" fill-opacity="0.0" d="m0 0l720.0 0l0 540.0l-720.0 0z" fill-rule="evenodd"/><path fill="#f3f3f3" d="m19.375328 28.750656l361.6378 0l0 358.01575l-361.6378 0z" fill-rule="evenodd"/><path stroke="#cccccc" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m19.375328 28.750656l361.6378 0l0 358.01575l-361.6378 0z" fill-rule="evenodd"/><path fill="#434343" d="m338.49512 374.66016q-0.609375 0 -1.171875 -0.140625q-0.546875 -0.15625 -0.96875 -0.421875q-0.25 -0.15625 -0.359375 -0.296875q-0.09375 -0.140625 -0.09375 -0.34375q0 -0.171875 0.09375 -0.28125q0.109375 -0.109375 0.265625 -0.109375q0.171875 0 0.46875 0.1875q0.40625 0.25 0.796875 0.390625q0.390625 0.140625 0.984375 0.140625q0.71875 0 1.109375 -0.25q0.40625 -0.265625 0.40625 -0.734375q0 -0.296875 -0.15625 -0.46875q-0.140625 -0.1875 -0.5 -0.328125q-0.359375 -0.140625 -1.046875 -0.296875q-1.171875 -0.25 -1.6875 -0.671875q-0.5 -0.421875 -0.5 -1.15625q0 -0.578125 0.3125 -1.015625q0.328125 -0.4375 0.890625 -0.6875q0.5625 -0.265625 1.28125 -0.265625q0.53125 0 1.015625 0.140625q0.484375 0.140625 0.859375 0.390625q0.453125 0.328125 0.453125 0.671875q0 0.171875 -0.109375 0.296875q-0.109375 0.125 -0.25 0.125q-0.15625 0 -0.484375 -0.234375q-0.375 -0.234375 -0.703125 -0.359375q-0.328125 -0.140625 -0.828125 -0.140625q-0.625 0 -1.015625 0.28125q-0.375 0.265625 -0.375 0.734375q0 0.296875 0.140625 0.484375q0.140625 0.171875 0.46875 0.3125q0.328125 0.140625 0.9375 0.28125q0.90625 0.1875 1.40625 0.4375q0.5 0.234375 0.703125 0.578125q0.21875 0.34375 0.21875 0.890625q0 0.828125 -0.703125 1.34375q-0.703125 0.515625 -1.859375 0.515625zm9.241241 -1.59375q0.140625 0 0.25 0.125q0.109375 0.109375 0.109375 0.296875q0 0.328125 -0.46875 0.609375q-0.484375 0.28125 -1.015625 0.421875q-0.53125 0.140625 -1.046875 0.140625q-1.5 0 -2.375 -0.890625q-0.875 -0.890625 -0.875 -2.46875q0 -1.0 0.390625 -1.765625q0.390625 -0.765625 1.078125 -1.1875q0.703125 -0.4375 1.59375 -0.4375q1.265625 0 2.015625 0.828125q0.75 0.828125 0.75 2.25q0 0.265625 -0.109375 0.390625q-0.109375 0.109375 -0.34375 0.109375l-4.296875 0q0.125 2.296875 2.171875 2.296875q0.53125 0 0.890625 -0.140625q0.375 -0.140625 0.8125 -0.390625q0.34375 -0.1875 0.46875 -0.1875zm-2.34375 -4.3125q-0.84375 0 -1.359375 0.53125q-0.515625 0.53125 -0.609375 1.515625l3.765625 0q-0.015625 -1.0 -0.484375 -1.515625q-0.46875 -0.53125 -1.3125 -0.53125zm7.5551147 -0.8125q0.546875 -0.03125 0.546875 0.453125q0 0.21875 -0.125 0.34375q-0.109375 0.125 -0.40625 0.15625l-0.390625 0.03125q-0.890625 0.078125 -1.328125 0.640625q-0.4375 0.546875 -0.4375 1.296875l0 3.234375q0 0.265625 -0.15625 0.40625q-0.140625 0.125 -0.375 0.125q-0.234375 0 -0.390625 -0.140625q-0.15625 -0.140625 -0.15625 -0.390625l0 -5.625q0 -0.25 0.15625 -0.390625q0.15625 -0.140625 0.390625 -0.140625q0.21875 0 0.359375 0.140625q0.140625 0.140625 0.140625 0.375l0 0.75q0.28125 -0.578125 0.796875 -0.890625q0.515625 -0.3125 1.1875 -0.359375l0.1875 -0.015625zm6.157959 0.328125q0.15625 -0.3125 0.46875 -0.3125q0.203125 0 0.359375 0.140625q0.15625 0.125 0.15625 0.328125q0 0.109375 -0.046875 0.203125l-2.59375 5.609375q-0.078125 0.171875 -0.25 0.28125q-0.15625 0.09375 -0.34375 0.09375q-0.171875 0 -0.328125 -0.09375q-0.15625 -0.109375 -0.25 -0.28125l-2.59375 -5.609375q-0.046875 -0.09375 -0.046875 -0.1875q0 -0.203125 0.171875 -0.34375q0.1875 -0.15625 0.390625 -0.15625q0.140625 0 0.265625 0.078125q0.125 0.078125 0.1875 0.234375l2.234375 5.0l2.21875 -4.984375zm7.2099915 4.796875q0.140625 0 0.25 0.125q0.109375 0.109375 0.109375 0.296875q0 0.328125 -0.46875 0.609375q-0.484375 0.28125 -1.015625 0.421875q-0.53125 0.140625 -1.046875 0.140625q-1.5 0 -2.375 -0.890625q-0.875 -0.890625 -0.875 -2.46875q0 -1.0 0.390625 -1.765625q0.390625 -0.765625 1.078125 -1.1875q0.703125 -0.4375 1.59375 -0.4375q1.265625 0 2.015625 0.828125q0.75 0.828125 0.75 2.25q0 0.265625 -0.109375 0.390625q-0.109375 0.109375 -0.34375 0.109375l-4.296875 0q0.125 2.296875 2.171875 2.296875q0.53125 0 0.890625 -0.140625q0.375 -0.140625 0.8125 -0.390625q0.34375 -0.1875 0.46875 -0.1875zm-2.34375 -4.3125q-0.84375 0 -1.359375 0.53125q-0.515625 0.53125 -0.609375 1.515625l3.765625 0q-0.015625 -1.0 -0.484375 -1.515625q-0.46875 -0.53125 -1.3125 -0.53125zm7.5551453 -0.8125q0.546875 -0.03125 0.546875 0.453125q0 0.21875 -0.125 0.34375q-0.109375 0.125 -0.40625 0.15625l-0.390625 0.03125q-0.890625 0.078125 -1.328125 0.640625q-0.4375 0.546875 -0.4375 1.296875l0 3.234375q0 0.265625 -0.15625 0.40625q-0.140625 0.125 -0.375 0.125q-0.234375 0 -0.390625 -0.140625q-0.15625 -0.140625 -0.15625 -0.390625l0 -5.625q0 -0.25 0.15625 -0.390625q0.15625 -0.140625 0.390625 -0.140625q0.21875 0 0.359375 0.140625q0.140625 0.140625 0.140625 0.375l0 0.75q0.28125 -0.578125 0.796875 -0.890625q0.515625 -0.3125 1.1875 -0.359375l0.1875 -0.015625z" fill-rule="nonzero"/><path fill="#d9d9d9" d="m25.624672 36.249344l301.88977 0l0 69.98425l-301.88977 0z" fill-rule="evenodd"/><path stroke="#cccccc" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" stroke-dasharray="4.0,3.0" d="m25.624672 36.249344l301.88977 0l0 69.98425l-301.88977 0z" fill-rule="evenodd"/><path fill="#434343" d="m134.36497 56.831844q-0.234375 0 -0.375 -0.140625q-0.140625 -0.140625 -0.140625 -0.359375l0 -7.1875l-2.578125 0q-0.21875 0 -0.34375 -0.109375q-0.109375 -0.109375 -0.109375 -0.3125q0 -0.203125 0.109375 -0.296875q0.125 -0.109375 0.34375 -0.109375l6.15625 0q0.21875 0 0.328125 0.109375q0.125 0.09375 0.125 0.296875q0 0.203125 -0.125 0.3125q-0.109375 0.109375 -0.328125 0.109375l-2.578125 0l0 7.1875q0 0.21875 -0.140625 0.359375q-0.125 0.140625 -0.34375 0.140625zm9.004181 -1.421875q0.140625 0 0.234375 0.109375q0.09375 0.109375 0.09375 0.28125q0 0.296875 -0.421875 0.546875q-0.4375 0.25 -0.921875 0.375q-0.46875 0.125 -0.921875 0.125q-1.359375 0 -2.15625 -0.796875q-0.78125 -0.8125 -0.78125 -2.21875q0 -0.90625 0.34375 -1.59375q0.359375 -0.6875 0.984375 -1.0625q0.640625 -0.390625 1.4375 -0.390625q1.140625 0 1.8125 0.75q0.671875 0.734375 0.671875 2.0q0 0.25 -0.09375 0.359375q-0.09375 0.109375 -0.3125 0.109375l-3.859375 0q0.09375 2.0625 1.953125 2.0625q0.46875 0 0.796875 -0.125q0.34375 -0.125 0.71875 -0.34375q0.3125 -0.1875 0.421875 -0.1875zm-2.09375 -3.875q-0.765625 0 -1.234375 0.484375q-0.46875 0.484375 -0.546875 1.359375l3.390625 0q-0.015625 -0.890625 -0.4375 -1.359375q-0.421875 -0.484375 -1.171875 -0.484375zm6.839676 -0.75q2.09375 0 2.09375 2.3125l0 3.25q0 0.234375 -0.125 0.359375q-0.125 0.125 -0.359375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -3.1875q0 -0.8125 -0.328125 -1.1875q-0.3125 -0.375 -1.0 -0.375q-0.8125 0 -1.296875 0.5q-0.46875 0.484375 -0.46875 1.328125l0 2.921875q0 0.234375 -0.125 0.359375q-0.125 0.125 -0.359375 0.125q-0.234375 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -5.0625q0 -0.21875 0.125 -0.34375q0.125 -0.140625 0.359375 -0.140625q0.21875 0 0.34375 0.140625q0.125 0.125 0.125 0.328125l0 0.609375q0.28125 -0.53125 0.796875 -0.8125q0.53125 -0.28125 1.1875 -0.28125zm5.84729 6.0625q-0.56248474 0 -1.0624847 -0.125q-0.5 -0.140625 -0.875 -0.375q-0.21875 -0.140625 -0.3125 -0.265625q-0.078125 -0.125 -0.078125 -0.3125q0 -0.15625 0.078125 -0.25q0.09375 -0.109375 0.234375 -0.109375q0.15625 0 0.421875 0.1875q0.359375 0.21875 0.71875 0.34375q0.359375 0.125 0.87498474 0.125q0.65625 0 1.015625 -0.21875q0.359375 -0.234375 0.359375 -0.671875q0 -0.265625 -0.140625 -0.421875q-0.125 -0.171875 -0.453125 -0.296875q-0.3125 -0.125 -0.9375 -0.25q-1.0624847 -0.234375 -1.5156097 -0.609375q-0.453125 -0.390625 -0.453125 -1.046875q0 -0.515625 0.28125 -0.90625q0.28125 -0.40625 0.796875 -0.625q0.515625 -0.234375 1.1562347 -0.234375q0.46875 0 0.90625 0.125q0.4375 0.125 0.78125 0.34375q0.40625 0.296875 0.40625 0.609375q0 0.15625 -0.09375 0.265625q-0.09375 0.109375 -0.234375 0.109375q-0.140625 0 -0.4375 -0.203125q-0.328125 -0.21875 -0.625 -0.34375q-0.296875 -0.125 -0.75 -0.125q-0.56248474 0 -0.90623474 0.265625q-0.34375 0.25 -0.34375 0.671875q0 0.25 0.125 0.421875q0.125 0.15625 0.421875 0.28125q0.296875 0.125 0.84373474 0.25q0.828125 0.1875 1.265625 0.40625q0.453125 0.203125 0.640625 0.515625q0.203125 0.3125 0.203125 0.796875q0 0.75 -0.640625 1.21875q-0.640625 0.453125 -1.671875 0.453125zm6.2131653 0q-0.828125 0 -1.46875 -0.359375q-0.625 -0.375 -0.96875 -1.0625q-0.34375 -0.703125 -0.34375 -1.609375q0 -0.90625 0.34375 -1.59375q0.34375 -0.703125 0.96875 -1.0625q0.640625 -0.375 1.46875 -0.375q0.828125 0 1.453125 0.375q0.640625 0.359375 0.984375 1.0625q0.34375 0.6875 0.34375 1.59375q0 0.90625 -0.34375 1.609375q-0.34375 0.6875 -0.984375 1.0625q-0.625 0.359375 -1.453125 0.359375zm0 -0.796875q0.859375 0 1.3125 -0.5625q0.46875 -0.578125 0.46875 -1.671875q0 -1.0625 -0.46875 -1.640625q-0.46875 -0.59375 -1.3125 -0.59375q-0.859375 0 -1.328125 0.59375q-0.46875 0.578125 -0.46875 1.640625q0 1.078125 0.453125 1.65625q0.46875 0.578125 1.34375 0.578125zm7.1288147 -5.25q0.5 -0.03125 0.5 0.40625q0 0.203125 -0.109375 0.3125q-0.109375 0.109375 -0.375 0.140625l-0.359375 0.03125q-0.796875 0.078125 -1.1875 0.578125q-0.390625 0.484375 -0.390625 1.15625l0 2.921875q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -5.0625q0 -0.234375 0.140625 -0.359375q0.140625 -0.125 0.34375 -0.125q0.1875 0 0.3125 0.125q0.140625 0.125 0.140625 0.34375l0 0.671875q0.25 -0.53125 0.71875 -0.796875q0.46875 -0.28125 1.0625 -0.328125l0.171875 -0.015625zm1.970398 6.03125q-0.21875 0 -0.359375 -0.140625q-0.125 -0.140625 -0.125 -0.359375l0 -7.546875q0 -0.21875 0.125 -0.34375q0.140625 -0.125 0.375 -0.125l4.375 0q0.203125 0 0.328125 0.109375q0.125 0.09375 0.125 0.296875q0 0.203125 -0.125 0.3125q-0.125 0.109375 -0.328125 0.109375l-3.90625 0l0 2.90625l3.65625 0q0.21875 0 0.328125 0.109375q0.125 0.109375 0.125 0.3125q0 0.1875 -0.125 0.296875q-0.109375 0.109375 -0.328125 0.109375l-3.65625 0l0 3.453125q0 0.21875 -0.125 0.359375q-0.125 0.140625 -0.359375 0.140625zm6.5434265 0q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -7.625q0 -0.21875 0.125 -0.34375q0.140625 -0.125 0.359375 -0.125q0.203125 0 0.34375 0.125q0.140625 0.125 0.140625 0.34375l0 7.625q0 0.234375 -0.140625 0.359375q-0.140625 0.125 -0.34375 0.125zm4.721527 0.015625q-0.828125 0 -1.46875 -0.359375q-0.625 -0.375 -0.96875 -1.0625q-0.34375 -0.703125 -0.34375 -1.609375q0 -0.90625 0.34375 -1.59375q0.34375 -0.703125 0.96875 -1.0625q0.640625 -0.375 1.46875 -0.375q0.828125 0 1.453125 0.375q0.640625 0.359375 0.984375 1.0625q0.34375 0.6875 0.34375 1.59375q0 0.90625 -0.34375 1.609375q-0.34375 0.6875 -0.984375 1.0625q-0.625 0.359375 -1.453125 0.359375zm0 -0.796875q0.859375 0 1.3125 -0.5625q0.46875 -0.578125 0.46875 -1.671875q0 -1.0625 -0.46875 -1.640625q-0.46875 -0.59375 -1.3125 -0.59375q-0.859375 0 -1.328125 0.59375q-0.46875 0.578125 -0.46875 1.640625q0 1.078125 0.453125 1.65625q0.46875 0.578125 1.34375 0.578125zm12.222534 -4.9375q0.125 -0.28125 0.390625 -0.28125q0.1875 0 0.328125 0.125q0.140625 0.109375 0.140625 0.296875q0 0.078125 -0.03125 0.171875l-1.984375 5.046875q-0.078125 0.15625 -0.21875 0.25q-0.140625 0.078125 -0.296875 0.078125q-0.15625 0 -0.296875 -0.078125q-0.140625 -0.09375 -0.21875 -0.25l-1.65625 -4.21875l-1.640625 4.21875q-0.0625 0.15625 -0.203125 0.25q-0.140625 0.078125 -0.3125 0.078125q-0.15625 0 -0.296875 -0.078125q-0.140625 -0.09375 -0.21875 -0.25l-1.984375 -5.03125q-0.046875 -0.09375 -0.046875 -0.171875q0 -0.1875 0.15625 -0.3125q0.171875 -0.140625 0.359375 -0.140625q0.296875 0 0.40625 0.296875l1.65625 4.421875l1.6875 -4.390625q0.078125 -0.15625 0.203125 -0.234375q0.125 -0.09375 0.265625 -0.09375q0.15625 0 0.28125 0.09375q0.125 0.078125 0.1875 0.234375l1.6875 4.375l1.65625 -4.40625zm12.637604 5.09375q0.046875 0.09375 0.046875 0.203125q0 0.171875 -0.140625 0.296875q-0.140625 0.125 -0.328125 0.125q-0.296875 0 -0.421875 -0.296875l-0.84375 -1.9375l-4.53125 0l-0.859375 1.9375q-0.125 0.296875 -0.421875 0.296875q-0.1875 0 -0.34375 -0.125q-0.140625 -0.125 -0.140625 -0.3125q0 -0.09375 0.046875 -0.1875l3.4375 -7.640625q0.078125 -0.15625 0.21875 -0.234375q0.140625 -0.09375 0.3125 -0.09375q0.171875 0 0.3125 0.09375q0.15625 0.078125 0.21875 0.234375l3.4375 7.640625zm-5.859375 -2.421875l3.8125 0l-1.90625 -4.3125l-1.90625 4.3125zm7.78656 3.046875q-0.21875 0 -0.359375 -0.140625q-0.125 -0.140625 -0.125 -0.359375l0 -7.546875q0 -0.21875 0.125 -0.34375q0.140625 -0.125 0.375 -0.125l2.84375 0q1.328125 0 2.0625 0.65625q0.75 0.640625 0.75 1.828125q0 1.1875 -0.75 1.84375q-0.734375 0.65625 -2.0625 0.65625l-2.359375 0l0 3.03125q0 0.21875 -0.140625 0.359375q-0.125 0.140625 -0.359375 0.140625zm2.765625 -4.34375q1.9375 0 1.9375 -1.6875q0 -1.671875 -1.9375 -1.671875l-2.265625 0l0 3.359375l2.265625 0zm4.9744263 4.34375q-0.21875 0 -0.359375 -0.140625q-0.125 -0.140625 -0.125 -0.359375l0 -7.578125q0 -0.234375 0.125 -0.359375q0.140625 -0.140625 0.359375 -0.140625q0.234375 0 0.359375 0.140625q0.140625 0.125 0.140625 0.359375l0 7.578125q0 0.21875 -0.140625 0.359375q-0.125 0.140625 -0.359375 0.140625zm4.4157715 0.015625q-0.5625 0 -1.0625 -0.125q-0.5 -0.140625 -0.875 -0.375q-0.21875 -0.140625 -0.3125 -0.265625q-0.078125 -0.125 -0.078125 -0.3125q0 -0.15625 0.078125 -0.25q0.09375 -0.109375 0.234375 -0.109375q0.15625 0 0.421875 0.1875q0.359375 0.21875 0.71875 0.34375q0.359375 0.125 0.875 0.125q0.65625 0 1.015625 -0.21875q0.359375 -0.234375 0.359375 -0.671875q0 -0.265625 -0.140625 -0.421875q-0.125 -0.171875 -0.453125 -0.296875q-0.3125 -0.125 -0.9375 -0.25q-1.0625 -0.234375 -1.515625 -0.609375q-0.453125 -0.390625 -0.453125 -1.046875q0 -0.515625 0.28125 -0.90625q0.28125 -0.40625 0.796875 -0.625q0.515625 -0.234375 1.15625 -0.234375q0.46875 0 0.90625 0.125q0.4375 0.125 0.78125 0.34375q0.40625 0.296875 0.40625 0.609375q0 0.15625 -0.09375 0.265625q-0.09375 0.109375 -0.234375 0.109375q-0.140625 0 -0.4375 -0.203125q-0.328125 -0.21875 -0.625 -0.34375q-0.296875 -0.125 -0.75 -0.125q-0.5625 0 -0.90625 0.265625q-0.34375 0.25 -0.34375 0.671875q0 0.25 0.125 0.421875q0.125 0.15625 0.421875 0.28125q0.296875 0.125 0.84375 0.25q0.828125 0.1875 1.265625 0.40625q0.453125 0.203125 0.640625 0.515625q0.203125 0.3125 0.203125 0.796875q0 0.75 -0.640625 1.21875q-0.640625 0.453125 -1.671875 0.453125z" fill-rule="nonzero"/><path fill="#f3f3f3" d="m396.75067 183.75066l249.00787 0l0 203.02364l-249.00787 0z" fill-rule="evenodd"/><path stroke="#cccccc" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m396.75067 183.75066l249.00787 0l0 203.02364l-249.00787 0z" fill-rule="evenodd"/><path fill="#434343" d="m409.42255 374.66803q-0.90625 0 -1.609375 -0.40625q-0.6875 -0.421875 -1.078125 -1.171875q-0.375 -0.765625 -0.375 -1.765625q0 -1.0 0.390625 -1.765625q0.40625 -0.78125 1.109375 -1.203125q0.703125 -0.4375 1.625 -0.4375q0.5 0 1.0 0.140625q0.5 0.140625 0.875 0.40625q0.234375 0.171875 0.328125 0.328125q0.109375 0.140625 0.109375 0.328125q0 0.1875 -0.109375 0.3125q-0.09375 0.109375 -0.25 0.109375q-0.09375 0 -0.203125 -0.046875q-0.09375 -0.046875 -0.171875 -0.09375q-0.078125 -0.0625 -0.09375 -0.078125q-0.359375 -0.234375 -0.671875 -0.359375q-0.3125 -0.140625 -0.765625 -0.140625q-0.96875 0 -1.515625 0.671875q-0.53125 0.65625 -0.53125 1.828125q0 1.171875 0.53125 1.8125q0.546875 0.640625 1.515625 0.640625q0.453125 0 0.78125 -0.125q0.328125 -0.140625 0.65625 -0.375q0.15625 -0.09375 0.28125 -0.15625q0.140625 -0.0625 0.234375 -0.0625q0.140625 0 0.234375 0.125q0.109375 0.109375 0.109375 0.296875q0 0.171875 -0.09375 0.3125q-0.09375 0.140625 -0.34375 0.3125q-0.375 0.25 -0.90625 0.40625q-0.515625 0.15625 -1.0625 0.15625zm4.2591553 -0.03125q-0.234375 0 -0.390625 -0.140625q-0.15625 -0.140625 -0.15625 -0.390625l0 -8.46875q0 -0.25 0.15625 -0.390625q0.15625 -0.140625 0.390625 -0.140625q0.21875 0 0.375 0.140625q0.15625 0.140625 0.15625 0.390625l0 8.46875q0 0.25 -0.15625 0.390625q-0.15625 0.140625 -0.375 0.140625zm3.092102 0q-0.234375 0 -0.390625 -0.140625q-0.15625 -0.140625 -0.15625 -0.390625l0 -5.625q0 -0.25 0.15625 -0.390625q0.15625 -0.140625 0.390625 -0.140625q0.234375 0 0.375 0.140625q0.15625 0.140625 0.15625 0.390625l0 5.625q0 0.265625 -0.15625 0.40625q-0.140625 0.125 -0.375 0.125zm0 -8.09375q-0.3125 0 -0.515625 -0.171875q-0.203125 -0.1875 -0.203125 -0.5q0 -0.296875 0.203125 -0.484375q0.203125 -0.1875 0.515625 -0.1875q0.328125 0 0.515625 0.1875q0.203125 0.1875 0.203125 0.484375q0 0.3125 -0.203125 0.5q-0.1875 0.171875 -0.515625 0.171875zm7.5765076 6.53125q0.140625 0 0.25 0.125q0.109375 0.109375 0.109375 0.296875q0 0.328125 -0.46875 0.609375q-0.484375 0.28125 -1.015625 0.421875q-0.53125 0.140625 -1.046875 0.140625q-1.5 0 -2.375 -0.890625q-0.875 -0.890625 -0.875 -2.46875q0 -1.0 0.390625 -1.765625q0.390625 -0.765625 1.078125 -1.1875q0.703125 -0.4375 1.59375 -0.4375q1.265625 0 2.015625 0.828125q0.75 0.828125 0.75 2.25q0 0.265625 -0.109375 0.390625q-0.109375 0.109375 -0.34375 0.109375l-4.296875 0q0.125 2.296875 2.171875 2.296875q0.53125 0 0.890625 -0.140625q0.375 -0.140625 0.8125 -0.390625q0.34375 -0.1875 0.46875 -0.1875zm-2.34375 -4.3125q-0.84375 0 -1.359375 0.53125q-0.515625 0.53125 -0.609375 1.515625l3.765625 0q-0.015625 -1.0 -0.484375 -1.515625q-0.46875 -0.53125 -1.3125 -0.53125zm7.6020203 -0.84375q2.328125 0 2.328125 2.578125l0 3.609375q0 0.25 -0.140625 0.390625q-0.140625 0.140625 -0.390625 0.140625q-0.25 0 -0.40625 -0.140625q-0.140625 -0.140625 -0.140625 -0.390625l0 -3.546875q0 -0.90625 -0.359375 -1.3125q-0.34375 -0.421875 -1.125 -0.421875q-0.890625 0 -1.421875 0.546875q-0.53125 0.546875 -0.53125 1.484375l0 3.25q0 0.25 -0.140625 0.390625q-0.140625 0.140625 -0.390625 0.140625q-0.25 0 -0.40625 -0.140625q-0.140625 -0.140625 -0.140625 -0.390625l0 -5.625q0 -0.234375 0.140625 -0.375q0.15625 -0.15625 0.40625 -0.15625q0.234375 0 0.375 0.15625q0.140625 0.140625 0.140625 0.359375l0 0.6875q0.328125 -0.609375 0.890625 -0.921875q0.578125 -0.3125 1.3125 -0.3125zm7.304718 5.875q0.46875 0.03125 0.46875 0.421875q0 0.21875 -0.171875 0.34375q-0.171875 0.109375 -0.5 0.078125l-0.359375 -0.015625q-1.0625 -0.09375 -1.578125 -0.640625q-0.5 -0.5625 -0.5 -1.703125l0 -3.34375l-0.890625 0q-0.234375 0 -0.359375 -0.109375q-0.125 -0.109375 -0.125 -0.296875q0 -0.203125 0.125 -0.3125q0.125 -0.125 0.359375 -0.125l0.890625 0l0 -1.515625q0 -0.25 0.140625 -0.390625q0.15625 -0.140625 0.40625 -0.140625q0.234375 0 0.375 0.140625q0.15625 0.140625 0.15625 0.390625l0 1.515625l1.484375 0q0.203125 0 0.328125 0.125q0.140625 0.109375 0.140625 0.3125q0 0.1875 -0.140625 0.296875q-0.125 0.109375 -0.328125 0.109375l-1.484375 0l0 3.40625q0 0.734375 0.296875 1.0625q0.296875 0.3125 0.90625 0.359375l0.359375 0.03125z" fill-rule="nonzero"/><path fill="#f4cccc" d="m206.61942 201.17455l140.47244 0l0 30.992126l-140.47244 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m206.61942 201.17455l140.47244 0l0 30.992126l-140.47244 0z" fill-rule="evenodd"/><path fill="#000000" d="m237.0857 213.5031q-0.640625 0.046875 -0.96875 0.40625q-0.3125 0.34375 -0.3125 1.046875l0 0.390625l1.328125 0q0.203125 0 0.3125 0.109375q0.109375 0.109375 0.109375 0.28125q0 0.1875 -0.109375 0.28125q-0.109375 0.09375 -0.3125 0.09375l-1.328125 0l0 4.65625q0 0.234375 -0.140625 0.359375q-0.140625 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.140625 -0.125 -0.140625 -0.359375l0 -4.65625l-0.796875 0q-0.203125 0 -0.328125 -0.09375q-0.109375 -0.109375 -0.109375 -0.28125q0 -0.171875 0.109375 -0.28125q0.125 -0.109375 0.328125 -0.109375l0.796875 0l0 -0.21875q0 -1.078125 0.53125 -1.6875q0.546875 -0.625 1.5625 -0.703125l0.3125 -0.015625q0.3125 -0.03125 0.453125 0.0625q0.140625 0.078125 0.140625 0.296875q0 0.34375 -0.421875 0.390625l-0.3125 0.03125zm4.248535 1.71875q0.5 -0.03125 0.5 0.40625q0 0.203125 -0.109375 0.3125q-0.109375 0.109375 -0.375 0.140625l-0.359375 0.03125q-0.796875 0.078125 -1.1875 0.578125q-0.390625 0.484375 -0.390625 1.15625l0 2.921875q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -5.0625q0 -0.234375 0.140625 -0.359375q0.140625 -0.125 0.34375 -0.125q0.1875 0 0.3125 0.125q0.140625 0.125 0.140625 0.34375l0 0.671875q0.25 -0.53125 0.71875 -0.796875q0.46875 -0.28125 1.0625 -0.328125l0.171875 -0.015625zm5.861023 4.609375q0.140625 0 0.234375 0.109375q0.09375 0.109375 0.09375 0.28125q0 0.296875 -0.421875 0.546875q-0.4375 0.25 -0.921875 0.375q-0.46875 0.125 -0.921875 0.125q-1.359375 0 -2.15625 -0.796875q-0.78125 -0.8125 -0.78125 -2.21875q0 -0.90625 0.34375 -1.59375q0.359375 -0.6875 0.984375 -1.0625q0.640625 -0.390625 1.4375 -0.390625q1.140625 0 1.8125 0.75q0.671875 0.734375 0.671875 2.0q0 0.25 -0.09375 0.359375q-0.09375 0.109375 -0.3125 0.109375l-3.859375 0q0.09375 2.0625 1.953125 2.0625q0.46875 0 0.796875 -0.125q0.34375 -0.125 0.71875 -0.34375q0.3125 -0.1875 0.421875 -0.1875zm-2.09375 -3.875q-0.765625 0 -1.234375 0.484375q-0.46875 0.484375 -0.546875 1.359375l3.390625 0q-0.015625 -0.890625 -0.4375 -1.359375q-0.421875 -0.484375 -1.171875 -0.484375zm8.417801 3.875q0.140625 0 0.234375 0.109375q0.09375 0.109375 0.09375 0.28125q0 0.296875 -0.421875 0.546875q-0.4375 0.25 -0.921875 0.375q-0.46875 0.125 -0.921875 0.125q-1.359375 0 -2.15625 -0.796875q-0.78125 -0.8125 -0.78125 -2.21875q0 -0.90625 0.34375 -1.59375q0.359375 -0.6875 0.984375 -1.0625q0.640625 -0.390625 1.4375 -0.390625q1.140625 0 1.8125 0.75q0.671875 0.734375 0.671875 2.0q0 0.25 -0.09375 0.359375q-0.09375 0.109375 -0.3125 0.109375l-3.859375 0q0.09375 2.0625 1.953125 2.0625q0.46875 0 0.796875 -0.125q0.34375 -0.125 0.71875 -0.34375q0.3125 -0.1875 0.421875 -0.1875zm-2.09375 -3.875q-0.765625 0 -1.234375 0.484375q-0.46875 0.484375 -0.546875 1.359375l3.390625 0q-0.015625 -0.890625 -0.4375 -1.359375q-0.421875 -0.484375 -1.171875 -0.484375zm8.199051 4.46875q0.203125 0 0.296875 0.109375q0.109375 0.09375 0.109375 0.265625q0 0.1875 -0.109375 0.296875q-0.09375 0.09375 -0.296875 0.09375l-4.203125 0q-0.203125 0 -0.34375 -0.125q-0.125 -0.125 -0.125 -0.3125q0 -0.1875 0.140625 -0.359375l3.546875 -4.28125l-3.28125 0q-0.203125 0 -0.328125 -0.09375q-0.109375 -0.109375 -0.109375 -0.28125q0 -0.171875 0.109375 -0.28125q0.125 -0.109375 0.328125 -0.109375l4.0625 0q0.21875 0 0.34375 0.125q0.140625 0.125 0.140625 0.3125q0 0.1875 -0.140625 0.359375l-3.5625 4.28125l3.421875 0zm6.2547913 -0.59375q0.140625 0 0.234375 0.109375q0.09375 0.109375 0.09375 0.28125q0 0.296875 -0.421875 0.546875q-0.4375 0.25 -0.921875 0.375q-0.46875 0.125 -0.921875 0.125q-1.359375 0 -2.15625 -0.796875q-0.78125 -0.8125 -0.78125 -2.21875q0 -0.90625 0.34375 -1.59375q0.359375 -0.6875 0.984375 -1.0625q0.640625 -0.390625 1.4375 -0.390625q1.140625 0 1.8125 0.75q0.671875 0.734375 0.671875 2.0q0 0.25 -0.09375 0.359375q-0.09375 0.109375 -0.3125 0.109375l-3.859375 0q0.09375 2.0625 1.953125 2.0625q0.46875 0 0.796875 -0.125q0.34375 -0.125 0.71875 -0.34375q0.3125 -0.1875 0.421875 -0.1875zm-2.09375 -3.875q-0.765625 0 -1.234375 0.484375q-0.46875 0.484375 -0.546875 1.359375l3.390625 0q-0.015625 -0.890625 -0.4375 -1.359375q-0.421875 -0.484375 -1.171875 -0.484375zm3.3865662 5.875q-0.171875 0 -0.28125 -0.09375q-0.109375 -0.09375 -0.109375 -0.21875q0 -0.140625 0.109375 -0.234375q0.109375 -0.09375 0.28125 -0.09375l5.21875 0q0.171875 0 0.28125 0.09375q0.109375 0.09375 0.109375 0.234375q0 0.125 -0.109375 0.21875q-0.109375 0.09375 -0.28125 0.09375l-5.21875 0zm11.2500305 -6.609375q0.234375 0 0.359375 0.140625q0.125 0.125 0.125 0.34375l0 5.09375q0 1.296875 -0.671875 1.96875q-0.671875 0.671875 -1.984375 0.671875q-1.28125 0 -2.140625 -0.515625q-0.421875 -0.234375 -0.421875 -0.546875q0 -0.171875 0.078125 -0.28125q0.09375 -0.109375 0.234375 -0.109375q0.125 0 0.4375 0.171875q0.421875 0.21875 0.828125 0.34375q0.40625 0.140625 0.96875 0.140625q0.859375 0 1.28125 -0.453125q0.4375 -0.453125 0.4375 -1.3125l0 -1.03125q-0.25 0.5625 -0.78125 0.859375q-0.515625 0.296875 -1.21875 0.296875q-0.765625 0 -1.359375 -0.359375q-0.59375 -0.359375 -0.9375 -1.015625q-0.328125 -0.65625 -0.328125 -1.515625q0 -0.875 0.328125 -1.53125q0.34375 -0.65625 0.9375 -1.015625q0.59375 -0.359375 1.359375 -0.359375q0.6875 0 1.203125 0.296875q0.515625 0.296875 0.78125 0.84375l0 -0.640625q0 -0.21875 0.125 -0.34375q0.125 -0.140625 0.359375 -0.140625zm-2.28125 4.984375q0.84375 0 1.3125 -0.546875q0.484375 -0.5625 0.484375 -1.546875q0 -0.984375 -0.46875 -1.53125q-0.46875 -0.5625 -1.328125 -0.5625q-0.84375 0 -1.34375 0.5625q-0.484375 0.546875 -0.484375 1.53125q0 0.984375 0.484375 1.546875q0.5 0.546875 1.34375 0.546875zm7.4695435 -4.984375q0.5 -0.03125 0.5 0.40625q0 0.203125 -0.109375 0.3125q-0.109375 0.109375 -0.375 0.140625l-0.359375 0.03125q-0.796875 0.078125 -1.1875 0.578125q-0.390625 0.484375 -0.390625 1.15625l0 2.921875q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -5.0625q0 -0.234375 0.140625 -0.359375q0.140625 -0.125 0.34375 -0.125q0.1875 0 0.3125 0.125q0.140625 0.125 0.140625 0.34375l0 0.671875q0.25 -0.53125 0.71875 -0.796875q0.46875 -0.28125 1.0625 -0.328125l0.171875 -0.015625zm3.720398 -0.015625q2.203125 0 2.203125 2.296875l0 3.265625q0 0.21875 -0.125 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.140625 -0.125 -0.359375l0 -0.578125q-0.21875 0.515625 -0.6875 0.796875q-0.46875 0.28125 -1.078125 0.28125q-0.5625 0 -1.046875 -0.21875q-0.46875 -0.234375 -0.75 -0.640625q-0.265625 -0.40625 -0.265625 -0.90625q0 -0.65625 0.328125 -1.015625q0.34375 -0.375 1.109375 -0.53125q0.765625 -0.15625 2.125 -0.15625l0.265625 0l0 -0.40625q0 -0.71875 -0.296875 -1.046875q-0.28125 -0.34375 -0.953125 -0.34375q-0.8125 0 -1.65625 0.453125q-0.3125 0.203125 -0.453125 0.203125q-0.140625 0 -0.234375 -0.109375q-0.09375 -0.109375 -0.09375 -0.28125q0 -0.171875 0.09375 -0.296875q0.109375 -0.125 0.328125 -0.25q0.421875 -0.25 0.953125 -0.375q0.546875 -0.140625 1.0625 -0.140625zm-0.390625 5.296875q0.71875 0 1.171875 -0.484375q0.46875 -0.484375 0.46875 -1.25l0 -0.34375l-0.21875 0q-1.046875 0 -1.609375 0.09375q-0.546875 0.078125 -0.78125 0.296875q-0.234375 0.203125 -0.234375 0.609375q0 0.46875 0.34375 0.78125q0.34375 0.296875 0.859375 0.296875zm7.3131714 -5.296875q0.765625 0 1.34375 0.390625q0.59375 0.375 0.921875 1.0625q0.328125 0.6875 0.328125 1.609375q0 0.90625 -0.328125 1.59375q-0.328125 0.671875 -0.90625 1.046875q-0.578125 0.359375 -1.359375 0.359375q-0.6875 0 -1.203125 -0.296875q-0.5 -0.296875 -0.765625 -0.84375l0 2.8125q0 0.21875 -0.125 0.34375q-0.125 0.125 -0.359375 0.125q-0.234375 0 -0.359375 -0.140625q-0.125 -0.125 -0.125 -0.328125l0 -7.234375q0 -0.21875 0.125 -0.34375q0.125 -0.140625 0.359375 -0.140625q0.234375 0 0.359375 0.140625q0.125 0.125 0.125 0.34375l0 0.640625q0.265625 -0.546875 0.765625 -0.84375q0.515625 -0.296875 1.203125 -0.296875zm-0.203125 5.265625q0.859375 0 1.328125 -0.578125q0.46875 -0.578125 0.46875 -1.625q0 -1.0625 -0.46875 -1.65625q-0.46875 -0.59375 -1.328125 -0.59375q-0.84375 0 -1.3125 0.578125q-0.453125 0.578125 -0.453125 1.640625q0 1.0625 0.453125 1.65625q0.46875 0.578125 1.3125 0.578125zm7.20282 -5.265625q1.03125 0 1.546875 0.578125q0.53125 0.578125 0.53125 1.734375l0 3.25q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -3.21875q0 -0.78125 -0.328125 -1.15625q-0.3125 -0.375 -1.0 -0.375q-0.8125 0 -1.296875 0.5q-0.46875 0.484375 -0.46875 1.328125l0 2.921875q0 0.234375 -0.125 0.359375q-0.125 0.125 -0.359375 0.125q-0.234375 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -7.625q0 -0.203125 0.125 -0.328125q0.140625 -0.140625 0.359375 -0.140625q0.234375 0 0.359375 0.125q0.125 0.125 0.125 0.34375l0 3.140625q0.28125 -0.53125 0.796875 -0.796875q0.515625 -0.28125 1.1875 -0.28125zm4.331665 6.046875q-0.28125 0 -0.484375 -0.1875q-0.1875 -0.1875 -0.1875 -0.484375q0 -0.296875 0.1875 -0.484375q0.203125 -0.203125 0.484375 -0.203125q0.28125 0 0.46875 0.203125q0.1875 0.1875 0.1875 0.484375q0 0.296875 -0.1875 0.484375q-0.1875 0.1875 -0.46875 0.1875zm5.2167664 -6.046875q0.765625 0 1.34375 0.390625q0.59375 0.375 0.921875 1.0625q0.328125 0.6875 0.328125 1.609375q0 0.90625 -0.328125 1.59375q-0.328125 0.671875 -0.90625 1.046875q-0.578125 0.359375 -1.359375 0.359375q-0.6875 0 -1.203125 -0.296875q-0.5 -0.296875 -0.765625 -0.84375l0 2.8125q0 0.21875 -0.125 0.34375q-0.125 0.125 -0.359375 0.125q-0.234375 0 -0.359375 -0.140625q-0.125 -0.125 -0.125 -0.328125l0 -7.234375q0 -0.21875 0.125 -0.34375q0.125 -0.140625 0.359375 -0.140625q0.234375 0 0.359375 0.140625q0.125 0.125 0.125 0.34375l0 0.640625q0.265625 -0.546875 0.765625 -0.84375q0.515625 -0.296875 1.203125 -0.296875zm-0.203125 5.265625q0.859375 0 1.328125 -0.578125q0.46875 -0.578125 0.46875 -1.625q0 -1.0625 -0.46875 -1.65625q-0.46875 -0.59375 -1.328125 -0.59375q-0.84375 0 -1.3125 0.578125q-0.453125 0.578125 -0.453125 1.640625q0 1.0625 0.453125 1.65625q0.46875 0.578125 1.3125 0.578125zm8.45282 -4.9375q0.140625 -0.296875 0.421875 -0.296875q0.1875 0 0.328125 0.125q0.140625 0.109375 0.140625 0.296875q0 0.109375 -0.046875 0.1875l-3.375 7.28125q-0.0625 0.125 -0.171875 0.1875q-0.109375 0.078125 -0.234375 0.078125q-0.1875 0 -0.328125 -0.109375q-0.125 -0.109375 -0.125 -0.296875q0 -0.09375 0.046875 -0.1875l0.84375 -1.8125l-2.375 -5.140625q-0.046875 -0.078125 -0.046875 -0.171875q0 -0.1875 0.15625 -0.3125q0.15625 -0.140625 0.359375 -0.140625q0.109375 0 0.21875 0.078125q0.125 0.078125 0.1875 0.203125l2.0 4.5l2.0 -4.46875z" fill-rule="nonzero"/><path fill="#f4cccc" d="m154.36745 319.3983l87.49606 0l0 30.992126l-87.49606 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m154.36745 319.3983l87.49606 0l0 30.992126l-87.49606 0z" fill-rule="evenodd"/><path fill="#000000" d="m184.89111 339.47687q-0.234375 0 -0.375 -0.140625q-0.140625 -0.140625 -0.140625 -0.359375l0 -7.1875l-2.578125 0q-0.21875 0 -0.34375 -0.109375q-0.109375 -0.109375 -0.109375 -0.3125q0 -0.203125 0.109375 -0.296875q0.125 -0.109375 0.34375 -0.109375l6.15625 0q0.21875 0 0.328125 0.109375q0.125 0.09375 0.125 0.296875q0 0.203125 -0.125 0.3125q-0.109375 0.109375 -0.328125 0.109375l-2.578125 0l0 7.1875q0 0.21875 -0.140625 0.359375q-0.125 0.140625 -0.34375 0.140625zm8.160431 0.03125q-1.171875 0 -2.046875 -0.515625q-0.859375 -0.53125 -1.328125 -1.5q-0.46875 -0.984375 -0.46875 -2.296875q0 -1.34375 0.453125 -2.3125q0.46875 -0.984375 1.328125 -1.5q0.875 -0.53125 2.0625 -0.53125q1.1875 0 2.0625 0.53125q0.875 0.515625 1.328125 1.5q0.46875 0.96875 0.46875 2.296875q0 1.3125 -0.46875 2.296875q-0.46875 0.984375 -1.34375 1.515625q-0.859375 0.515625 -2.046875 0.515625zm0 -0.84375q1.34375 0 2.09375 -0.90625q0.75 -0.90625 0.75 -2.578125q0 -1.6875 -0.75 -2.578125q-0.734375 -0.90625 -2.09375 -0.90625q-1.34375 0 -2.09375 0.90625q-0.75 0.90625 -0.75 2.578125q0 1.671875 0.75 2.578125q0.75 0.90625 2.09375 0.90625zm9.214935 0.84375q-1.1875 0 -2.0625 -0.515625q-0.875 -0.53125 -1.359375 -1.5q-0.46875 -0.984375 -0.46875 -2.3125q0 -1.328125 0.46875 -2.296875q0.484375 -0.984375 1.359375 -1.5q0.875 -0.53125 2.0625 -0.53125q0.8125 0 1.515625 0.265625q0.71875 0.25 1.25 0.734375q0.1875 0.1875 0.1875 0.421875q0 0.171875 -0.09375 0.296875q-0.09375 0.125 -0.21875 0.125q-0.15625 0 -0.359375 -0.140625q-0.609375 -0.46875 -1.109375 -0.65625q-0.5 -0.203125 -1.140625 -0.203125q-1.390625 0 -2.140625 0.90625q-0.75 0.90625 -0.75 2.578125q0 1.671875 0.75 2.578125q0.75 0.90625 2.140625 0.90625q0.640625 0 1.140625 -0.1875q0.5 -0.1875 1.109375 -0.671875q0.203125 -0.125 0.359375 -0.125q0.125 0 0.21875 0.125q0.09375 0.109375 0.09375 0.296875q0 0.234375 -0.1875 0.40625q-0.53125 0.484375 -1.25 0.75q-0.703125 0.25 -1.515625 0.25zm8.077179 0q-1.171875 0 -2.046875 -0.515625q-0.859375 -0.53125 -1.328125 -1.5q-0.46875 -0.984375 -0.46875 -2.296875q0 -1.34375 0.453125 -2.3125q0.46875 -0.984375 1.328125 -1.5q0.875 -0.53125 2.0625 -0.53125q1.1875 0 2.0625 0.53125q0.875 0.515625 1.328125 1.5q0.46875 0.96875 0.46875 2.296875q0 1.3125 -0.46875 2.296875q-0.46875 0.984375 -1.34375 1.515625q-0.859375 0.515625 -2.046875 0.515625zm0 -0.84375q1.34375 0 2.09375 -0.90625q0.75 -0.90625 0.75 -2.578125q0 -1.6875 -0.75 -2.578125q-0.734375 -0.90625 -2.09375 -0.90625q-1.34375 0 -2.09375 0.90625q-0.75 0.90625 -0.75 2.578125q0 1.671875 0.75 2.578125q0.75 0.90625 2.09375 0.90625z" fill-rule="nonzero"/><path fill="#d9ead3" d="m284.12296 319.3983l87.49606 0l0 30.992126l-87.49606 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m284.12296 319.3983l87.49606 0l0 30.992126l-87.49606 0z" fill-rule="evenodd"/><path fill="#000000" d="m314.7006 332.47687q-0.234375 0 -0.375 -0.140625q-0.140625 -0.140625 -0.140625 -0.359375l0 -7.1875l-2.578125 0q-0.21875 0 -0.34375 -0.109375q-0.109375 -0.109375 -0.109375 -0.3125q0 -0.203125 0.109375 -0.296875q0.125 -0.109375 0.34375 -0.109375l6.15625 0q0.21875 0 0.328125 0.109375q0.125 0.09375 0.125 0.296875q0 0.203125 -0.125 0.3125q-0.109375 0.109375 -0.328125 0.109375l-2.578125 0l0 7.1875q0 0.21875 -0.140625 0.359375q-0.125 0.140625 -0.34375 0.140625zm5.113556 0q-0.21875 0 -0.359375 -0.140625q-0.125 -0.140625 -0.125 -0.359375l0 -7.546875q0 -0.21875 0.125 -0.34375q0.140625 -0.125 0.375 -0.125l4.375 0q0.203125 0 0.328125 0.109375q0.125 0.09375 0.125 0.296875q0 0.203125 -0.125 0.3125q-0.125 0.109375 -0.328125 0.109375l-3.90625 0l0 2.90625l3.65625 0q0.21875 0 0.328125 0.109375q0.125 0.109375 0.125 0.3125q0 0.1875 -0.125 0.296875q-0.109375 0.109375 -0.328125 0.109375l-3.65625 0l0 3.453125q0 0.21875 -0.125 0.359375q-0.125 0.140625 -0.359375 0.140625zm6.6840515 -0.0625q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.328125l0 -7.5625q0 -0.234375 0.125 -0.359375q0.140625 -0.140625 0.359375 -0.140625q0.234375 0 0.359375 0.140625q0.140625 0.125 0.140625 0.359375l0 7.171875l3.875 0q0.21875 0 0.328125 0.109375q0.125 0.109375 0.125 0.3125q0 0.203125 -0.125 0.3125q-0.109375 0.109375 -0.328125 0.109375l-4.375 0zm6.3394165 0.0625q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -5.0625q0 -0.234375 0.125 -0.359375q0.140625 -0.125 0.359375 -0.125q0.21875 0 0.34375 0.125q0.140625 0.125 0.140625 0.359375l0 5.0625q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125zm0 -7.28125q-0.296875 0 -0.484375 -0.171875q-0.171875 -0.171875 -0.171875 -0.453125q0 -0.25 0.171875 -0.421875q0.1875 -0.171875 0.484375 -0.171875q0.28125 0 0.453125 0.171875q0.1875 0.171875 0.1875 0.421875q0 0.28125 -0.1875 0.453125q-0.171875 0.171875 -0.453125 0.171875zm4.987152 6.515625q0.421875 0.03125 0.421875 0.375q0 0.203125 -0.15625 0.3125q-0.140625 0.09375 -0.4375 0.078125l-0.328125 -0.03125q-0.953125 -0.0625 -1.421875 -0.5625q-0.453125 -0.515625 -0.453125 -1.53125l0 -3.015625l-0.796875 0q-0.203125 0 -0.328125 -0.09375q-0.109375 -0.109375 -0.109375 -0.28125q0 -0.171875 0.109375 -0.28125q0.125 -0.109375 0.328125 -0.109375l0.796875 0l0 -1.359375q0 -0.21875 0.125 -0.34375q0.140625 -0.140625 0.375 -0.140625q0.21875 0 0.34375 0.140625q0.140625 0.125 0.140625 0.34375l0 1.359375l1.328125 0q0.1875 0 0.296875 0.109375q0.125 0.109375 0.125 0.28125q0 0.171875 -0.125 0.28125q-0.109375 0.09375 -0.296875 0.09375l-1.328125 0l0 3.0625q0 0.65625 0.265625 0.953125q0.265625 0.296875 0.8125 0.328125l0.3125 0.03125zm5.9081726 -0.65625q0.140625 0 0.234375 0.109375q0.09375 0.109375 0.09375 0.28125q0 0.296875 -0.421875 0.546875q-0.4375 0.25 -0.921875 0.375q-0.46875 0.125 -0.921875 0.125q-1.359375 0 -2.15625 -0.796875q-0.78125 -0.8125 -0.78125 -2.21875q0 -0.90625 0.34375 -1.59375q0.359375 -0.6875 0.984375 -1.0625q0.640625 -0.390625 1.4375 -0.390625q1.140625 0 1.8125 0.75q0.671875 0.734375 0.671875 2.0q0 0.25 -0.09375 0.359375q-0.09375 0.109375 -0.3125 0.109375l-3.859375 0q0.09375 2.0625 1.953125 2.0625q0.46875 0 0.796875 -0.125q0.34375 -0.125 0.71875 -0.34375q0.3125 -0.1875 0.421875 -0.1875zm-2.09375 -3.875q-0.765625 0 -1.234375 0.484375q-0.46875 0.484375 -0.546875 1.359375l3.390625 0q-0.015625 -0.890625 -0.4375 -1.359375q-0.421875 -0.484375 -1.171875 -0.484375z" fill-rule="nonzero"/><path fill="#000000" d="m303.37402 346.47687q-0.21875 0 -0.359375 -0.140625q-0.125 -0.140625 -0.125 -0.359375l0 -7.546875q0 -0.21875 0.125 -0.34375q0.140625 -0.125 0.375 -0.125l4.375 0q0.203125 0 0.328125 0.109375q0.125 0.09375 0.125 0.296875q0 0.203125 -0.125 0.3125q-0.125 0.109375 -0.328125 0.109375l-3.90625 0l0 2.90625l3.65625 0q0.21875 0 0.328125 0.109375q0.125 0.109375 0.125 0.3125q0 0.1875 -0.125 0.296875q-0.109375 0.109375 -0.328125 0.109375l-3.65625 0l0 3.453125q0 0.21875 -0.125 0.359375q-0.125 0.140625 -0.359375 0.140625zm6.5434265 0q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -7.625q0 -0.21875 0.125 -0.34375q0.140625 -0.125 0.359375 -0.125q0.203125 0 0.34375 0.125q0.140625 0.125 0.140625 0.34375l0 7.625q0 0.234375 -0.140625 0.359375q-0.140625 0.125 -0.34375 0.125zm4.674652 -6.046875q2.203125 0 2.203125 2.296875l0 3.265625q0 0.21875 -0.125 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.140625 -0.125 -0.359375l0 -0.578125q-0.21875 0.515625 -0.6875 0.796875q-0.46875 0.28125 -1.078125 0.28125q-0.5625 0 -1.046875 -0.21875q-0.46875 -0.234375 -0.75 -0.640625q-0.265625 -0.40625 -0.265625 -0.90625q0 -0.65625 0.328125 -1.015625q0.34375 -0.375 1.109375 -0.53125q0.765625 -0.15625 2.125 -0.15625l0.265625 0l0 -0.40625q0 -0.71875 -0.296875 -1.046875q-0.28125 -0.34375 -0.953125 -0.34375q-0.8125 0 -1.65625 0.453125q-0.3125 0.203125 -0.453125 0.203125q-0.140625 0 -0.234375 -0.109375q-0.09375 -0.109375 -0.09375 -0.28125q0 -0.171875 0.09375 -0.296875q0.109375 -0.125 0.328125 -0.25q0.421875 -0.25 0.953125 -0.375q0.546875 -0.140625 1.0625 -0.140625zm-0.390625 5.296875q0.71875 0 1.171875 -0.484375q0.46875 -0.484375 0.46875 -1.25l0 -0.34375l-0.21875 0q-1.046875 0 -1.609375 0.09375q-0.546875 0.078125 -0.78125 0.296875q-0.234375 0.203125 -0.234375 0.609375q0 0.46875 0.34375 0.78125q0.34375 0.296875 0.859375 0.296875zm7.0631714 -0.015625q0.421875 0.03125 0.421875 0.375q0 0.203125 -0.15625 0.3125q-0.140625 0.09375 -0.4375 0.078125l-0.328125 -0.03125q-0.953125 -0.0625 -1.421875 -0.5625q-0.453125 -0.515625 -0.453125 -1.53125l0 -3.015625l-0.796875 0q-0.203125 0 -0.328125 -0.09375q-0.109375 -0.109375 -0.109375 -0.28125q0 -0.171875 0.109375 -0.28125q0.125 -0.109375 0.328125 -0.109375l0.796875 0l0 -1.359375q0 -0.21875 0.125 -0.34375q0.140625 -0.140625 0.375 -0.140625q0.21875 0 0.34375 0.140625q0.140625 0.125 0.140625 0.34375l0 1.359375l1.328125 0q0.1875 0 0.296875 0.109375q0.125 0.109375 0.125 0.28125q0 0.171875 -0.125 0.28125q-0.109375 0.09375 -0.296875 0.09375l-1.328125 0l0 3.0625q0 0.65625 0.265625 0.953125q0.265625 0.296875 0.8125 0.328125l0.3125 0.03125zm4.3300476 -5.28125q0.765625 0 1.34375 0.375q0.59375 0.359375 0.921875 1.046875q0.328125 0.6875 0.328125 1.59375q0 0.90625 -0.328125 1.59375q-0.328125 0.6875 -0.921875 1.078125q-0.578125 0.375 -1.34375 0.375q-0.6875 0 -1.203125 -0.296875q-0.5 -0.296875 -0.765625 -0.84375l0 0.640625q0 0.21875 -0.125 0.359375q-0.125 0.125 -0.359375 0.125q-0.234375 0 -0.359375 -0.125q-0.125 -0.140625 -0.125 -0.359375l0 -7.625q0 -0.203125 0.125 -0.328125q0.125 -0.140625 0.359375 -0.140625q0.234375 0 0.359375 0.125q0.125 0.125 0.125 0.34375l0 3.203125q0.265625 -0.546875 0.765625 -0.84375q0.515625 -0.296875 1.203125 -0.296875zm-0.203125 5.265625q0.859375 0 1.328125 -0.59375q0.46875 -0.59375 0.46875 -1.65625q0 -1.046875 -0.46875 -1.625q-0.46875 -0.578125 -1.328125 -0.578125q-0.84375 0 -1.3125 0.578125q-0.453125 0.578125 -0.453125 1.640625q0 1.0625 0.453125 1.65625q0.46875 0.578125 1.3125 0.578125zm8.687164 -5.25q0.21875 0 0.34375 0.140625q0.125 0.125 0.125 0.34375l0 5.078125q0 0.203125 -0.125 0.34375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.34375 -0.125q-0.125 -0.125 -0.125 -0.328125l0 -0.609375q-0.28125 0.53125 -0.78125 0.8125q-0.5 0.265625 -1.125 0.265625q-1.03125 0 -1.5625 -0.578125q-0.53125 -0.578125 -0.53125 -1.71875l0 -3.265625q0 -0.21875 0.125 -0.34375q0.140625 -0.140625 0.375 -0.140625q0.21875 0 0.34375 0.140625q0.125 0.125 0.125 0.34375l0 3.234375q0 0.78125 0.3125 1.15625q0.3125 0.359375 0.984375 0.359375q0.765625 0 1.234375 -0.5q0.46875 -0.5 0.46875 -1.3125l0 -2.9375q0 -0.21875 0.125 -0.34375q0.140625 -0.140625 0.375 -0.140625zm4.8726807 -1.71875q-0.640625 0.046875 -0.96875 0.40625q-0.3125 0.34375 -0.3125 1.046875l0 0.390625l1.328125 0q0.203125 0 0.3125 0.109375q0.109375 0.109375 0.109375 0.28125q0 0.1875 -0.109375 0.28125q-0.109375 0.09375 -0.3125 0.09375l-1.328125 0l0 4.65625q0 0.234375 -0.140625 0.359375q-0.140625 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.140625 -0.125 -0.140625 -0.359375l0 -4.65625l-0.796875 0q-0.203125 0 -0.328125 -0.09375q-0.109375 -0.109375 -0.109375 -0.28125q0 -0.171875 0.109375 -0.28125q0.125 -0.109375 0.328125 -0.109375l0.796875 0l0 -0.21875q0 -1.078125 0.53125 -1.6875q0.546875 -0.625 1.5625 -0.703125l0.3125 -0.015625q0.3125 -0.03125 0.453125 0.0625q0.140625 0.078125 0.140625 0.296875q0 0.34375 -0.421875 0.390625l-0.3125 0.03125zm3.9360352 0q-0.640625 0.046875 -0.96875 0.40625q-0.3125 0.34375 -0.3125 1.046875l0 0.390625l1.328125 0q0.203125 0 0.3125 0.109375q0.109375 0.109375 0.109375 0.28125q0 0.1875 -0.109375 0.28125q-0.109375 0.09375 -0.3125 0.09375l-1.328125 0l0 4.65625q0 0.234375 -0.140625 0.359375q-0.140625 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.140625 -0.125 -0.140625 -0.359375l0 -4.65625l-0.796875 0q-0.203125 0 -0.328125 -0.09375q-0.109375 -0.109375 -0.109375 -0.28125q0 -0.171875 0.109375 -0.28125q0.125 -0.109375 0.328125 -0.109375l0.796875 0l0 -0.21875q0 -1.078125 0.53125 -1.6875q0.546875 -0.625 1.5625 -0.703125l0.3125 -0.015625q0.3125 -0.03125 0.453125 0.0625q0.140625 0.078125 0.140625 0.296875q0 0.34375 -0.421875 0.390625l-0.3125 0.03125zm5.873535 6.328125q0.140625 0 0.234375 0.109375q0.09375 0.109375 0.09375 0.28125q0 0.296875 -0.421875 0.546875q-0.4375 0.25 -0.921875 0.375q-0.46875 0.125 -0.921875 0.125q-1.359375 0 -2.15625 -0.796875q-0.78125 -0.8125 -0.78125 -2.21875q0 -0.90625 0.34375 -1.59375q0.359375 -0.6875 0.984375 -1.0625q0.640625 -0.390625 1.4375 -0.390625q1.140625 0 1.8125 0.75q0.671875 0.734375 0.671875 2.0q0 0.25 -0.09375 0.359375q-0.09375 0.109375 -0.3125 0.109375l-3.859375 0q0.09375 2.0625 1.953125 2.0625q0.46875 0 0.796875 -0.125q0.34375 -0.125 0.71875 -0.34375q0.3125 -0.1875 0.421875 -0.1875zm-2.09375 -3.875q-0.765625 0 -1.234375 0.484375q-0.46875 0.484375 -0.546875 1.359375l3.390625 0q-0.015625 -0.890625 -0.4375 -1.359375q-0.421875 -0.484375 -1.171875 -0.484375zm6.7927856 -0.734375q0.5 -0.03125 0.5 0.40625q0 0.203125 -0.109375 0.3125q-0.109375 0.109375 -0.375 0.140625l-0.359375 0.03125q-0.796875 0.078125 -1.1875 0.578125q-0.390625 0.484375 -0.390625 1.15625l0 2.921875q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -5.0625q0 -0.234375 0.140625 -0.359375q0.140625 -0.125 0.34375 -0.125q0.1875 0 0.3125 0.125q0.140625 0.125 0.140625 0.34375l0 0.671875q0.25 -0.53125 0.71875 -0.796875q0.46875 -0.28125 1.0625 -0.328125l0.171875 -0.015625z" fill-rule="nonzero"/><path fill="#f4cccc" d="m413.02625 319.3983l87.49606 0l0 30.992126l-87.49606 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m413.02625 319.3983l87.49606 0l0 30.992126l-87.49606 0z" fill-rule="evenodd"/><path fill="#000000" d="m443.6039 332.47687q-0.234375 0 -0.375 -0.140625q-0.140625 -0.140625 -0.140625 -0.359375l0 -7.1875l-2.578125 0q-0.21875 0 -0.34375 -0.109375q-0.109375 -0.109375 -0.109375 -0.3125q0 -0.203125 0.109375 -0.296875q0.125 -0.109375 0.34375 -0.109375l6.15625 0q0.21875 0 0.328125 0.109375q0.125 0.09375 0.125 0.296875q0 0.203125 -0.125 0.3125q-0.109375 0.109375 -0.328125 0.109375l-2.578125 0l0 7.1875q0 0.21875 -0.140625 0.359375q-0.125 0.140625 -0.34375 0.140625zm5.113556 0q-0.21875 0 -0.359375 -0.140625q-0.125 -0.140625 -0.125 -0.359375l0 -7.546875q0 -0.21875 0.125 -0.34375q0.140625 -0.125 0.375 -0.125l4.375 0q0.203125 0 0.328125 0.109375q0.125 0.09375 0.125 0.296875q0 0.203125 -0.125 0.3125q-0.125 0.109375 -0.328125 0.109375l-3.90625 0l0 2.90625l3.65625 0q0.21875 0 0.328125 0.109375q0.125 0.109375 0.125 0.3125q0 0.1875 -0.125 0.296875q-0.109375 0.109375 -0.328125 0.109375l-3.65625 0l0 3.453125q0 0.21875 -0.125 0.359375q-0.125 0.140625 -0.359375 0.140625zm6.6840515 -0.0625q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.328125l0 -7.5625q0 -0.234375 0.125 -0.359375q0.140625 -0.140625 0.359375 -0.140625q0.234375 0 0.359375 0.140625q0.140625 0.125 0.140625 0.359375l0 7.171875l3.875 0q0.21875 0 0.328125 0.109375q0.125 0.109375 0.125 0.3125q0 0.203125 -0.125 0.3125q-0.109375 0.109375 -0.328125 0.109375l-4.375 0zm6.3394165 0.0625q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -5.0625q0 -0.234375 0.125 -0.359375q0.140625 -0.125 0.359375 -0.125q0.21875 0 0.34375 0.125q0.140625 0.125 0.140625 0.359375l0 5.0625q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125zm0 -7.28125q-0.296875 0 -0.484375 -0.171875q-0.171875 -0.171875 -0.171875 -0.453125q0 -0.25 0.171875 -0.421875q0.1875 -0.171875 0.484375 -0.171875q0.28125 0 0.453125 0.171875q0.1875 0.171875 0.1875 0.421875q0 0.28125 -0.1875 0.453125q-0.171875 0.171875 -0.453125 0.171875zm4.987152 6.515625q0.421875 0.03125 0.421875 0.375q0 0.203125 -0.15625 0.3125q-0.140625 0.09375 -0.4375 0.078125l-0.328125 -0.03125q-0.953125 -0.0625 -1.421875 -0.5625q-0.453125 -0.515625 -0.453125 -1.53125l0 -3.015625l-0.796875 0q-0.203125 0 -0.328125 -0.09375q-0.109375 -0.109375 -0.109375 -0.28125q0 -0.171875 0.109375 -0.28125q0.125 -0.109375 0.328125 -0.109375l0.796875 0l0 -1.359375q0 -0.21875 0.125 -0.34375q0.140625 -0.140625 0.375 -0.140625q0.21875 0 0.34375 0.140625q0.140625 0.125 0.140625 0.34375l0 1.359375l1.328125 0q0.1875 0 0.296875 0.109375q0.125 0.109375 0.125 0.28125q0 0.171875 -0.125 0.28125q-0.109375 0.09375 -0.296875 0.09375l-1.328125 0l0 3.0625q0 0.65625 0.265625 0.953125q0.265625 0.296875 0.8125 0.328125l0.3125 0.03125zm5.908142 -0.65625q0.140625 0 0.234375 0.109375q0.09375 0.109375 0.09375 0.28125q0 0.296875 -0.421875 0.546875q-0.4375 0.25 -0.921875 0.375q-0.46875 0.125 -0.921875 0.125q-1.359375 0 -2.15625 -0.796875q-0.78125 -0.8125 -0.78125 -2.21875q0 -0.90625 0.34375 -1.59375q0.359375 -0.6875 0.984375 -1.0625q0.640625 -0.390625 1.4375 -0.390625q1.140625 0 1.8125 0.75q0.671875 0.734375 0.671875 2.0q0 0.25 -0.09375 0.359375q-0.09375 0.109375 -0.3125 0.109375l-3.859375 0q0.09375 2.0625 1.953125 2.0625q0.46875 0 0.796875 -0.125q0.34375 -0.125 0.71875 -0.34375q0.3125 -0.1875 0.421875 -0.1875zm-2.09375 -3.875q-0.765625 0 -1.234375 0.484375q-0.46875 0.484375 -0.546875 1.359375l3.390625 0q-0.015625 -0.890625 -0.4375 -1.359375q-0.421875 -0.484375 -1.171875 -0.484375z" fill-rule="nonzero"/><path fill="#000000" d="m429.9527 346.47687q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -5.0625q0 -0.234375 0.125 -0.359375q0.140625 -0.125 0.359375 -0.125q0.21875 0 0.34375 0.125q0.140625 0.125 0.140625 0.359375l0 5.0625q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125zm0 -7.28125q-0.296875 0 -0.484375 -0.171875q-0.171875 -0.171875 -0.171875 -0.453125q0 -0.25 0.171875 -0.421875q0.1875 -0.171875 0.484375 -0.171875q0.28125 0 0.453125 0.171875q0.1875 0.171875 0.1875 0.421875q0 0.28125 -0.1875 0.453125q-0.171875 0.171875 -0.453125 0.171875zm5.237152 1.234375q2.09375 0 2.09375 2.3125l0 3.25q0 0.234375 -0.125 0.359375q-0.125 0.125 -0.359375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -3.1875q0 -0.8125 -0.328125 -1.1875q-0.3125 -0.375 -1.0 -0.375q-0.8125 0 -1.296875 0.5q-0.46875 0.484375 -0.46875 1.328125l0 2.921875q0 0.234375 -0.125 0.359375q-0.125 0.125 -0.359375 0.125q-0.234375 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -5.0625q0 -0.21875 0.125 -0.34375q0.125 -0.140625 0.359375 -0.140625q0.21875 0 0.34375 0.140625q0.125 0.125 0.125 0.328125l0 0.609375q0.28125 -0.53125 0.796875 -0.8125q0.53125 -0.28125 1.1875 -0.28125zm6.56604 5.28125q0.421875 0.03125 0.421875 0.375q0 0.203125 -0.15625 0.3125q-0.140625 0.09375 -0.4375 0.078125l-0.328125 -0.03125q-0.953125 -0.0625 -1.421875 -0.5625q-0.453125 -0.515625 -0.453125 -1.53125l0 -3.015625l-0.796875 0q-0.203125 0 -0.328125 -0.09375q-0.109375 -0.109375 -0.109375 -0.28125q0 -0.171875 0.109375 -0.28125q0.125 -0.109375 0.328125 -0.109375l0.796875 0l0 -1.359375q0 -0.21875 0.125 -0.34375q0.140625 -0.140625 0.375 -0.140625q0.21875 0 0.34375 0.140625q0.140625 0.125 0.140625 0.34375l0 1.359375l1.328125 0q0.1875 0 0.296875 0.109375q0.125 0.109375 0.125 0.28125q0 0.171875 -0.125 0.28125q-0.109375 0.09375 -0.296875 0.09375l-1.328125 0l0 3.0625q0 0.65625 0.265625 0.953125q0.265625 0.296875 0.8125 0.328125l0.3125 0.03125zm5.9081726 -0.65625q0.140625 0 0.234375 0.109375q0.09375 0.109375 0.09375 0.28125q0 0.296875 -0.421875 0.546875q-0.4375 0.25 -0.921875 0.375q-0.46875 0.125 -0.921875 0.125q-1.359375 0 -2.15625 -0.796875q-0.78125 -0.8125 -0.78125 -2.21875q0 -0.90625 0.34375 -1.59375q0.359375 -0.6875 0.984375 -1.0625q0.640625 -0.390625 1.4375 -0.390625q1.140625 0 1.8125 0.75q0.671875 0.734375 0.671875 2.0q0 0.25 -0.09375 0.359375q-0.09375 0.109375 -0.3125 0.109375l-3.859375 0q0.09375 2.0625 1.953125 2.0625q0.46875 0 0.796875 -0.125q0.34375 -0.125 0.71875 -0.34375q0.3125 -0.1875 0.421875 -0.1875zm-2.09375 -3.875q-0.765625 0 -1.234375 0.484375q-0.46875 0.484375 -0.546875 1.359375l3.390625 0q-0.015625 -0.890625 -0.4375 -1.359375q-0.421875 -0.484375 -1.171875 -0.484375zm6.7927856 -0.734375q0.5 -0.03125 0.5 0.40625q0 0.203125 -0.109375 0.3125q-0.109375 0.109375 -0.375 0.140625l-0.359375 0.03125q-0.796875 0.078125 -1.1875 0.578125q-0.390625 0.484375 -0.390625 1.15625l0 2.921875q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -5.0625q0 -0.234375 0.140625 -0.359375q0.140625 -0.125 0.34375 -0.125q0.1875 0 0.3125 0.125q0.140625 0.125 0.140625 0.34375l0 0.671875q0.25 -0.53125 0.71875 -0.796875q0.46875 -0.28125 1.0625 -0.328125l0.171875 -0.015625zm4.282898 -0.015625q0.765625 0 1.34375 0.390625q0.59375 0.375 0.921875 1.0625q0.328125 0.6875 0.328125 1.609375q0 0.90625 -0.328125 1.59375q-0.328125 0.671875 -0.90625 1.046875q-0.578125 0.359375 -1.359375 0.359375q-0.6875 0 -1.203125 -0.296875q-0.5 -0.296875 -0.765625 -0.84375l0 2.8125q0 0.21875 -0.125 0.34375q-0.125 0.125 -0.359375 0.125q-0.234375 0 -0.359375 -0.140625q-0.125 -0.125 -0.125 -0.328125l0 -7.234375q0 -0.21875 0.125 -0.34375q0.125 -0.140625 0.359375 -0.140625q0.234375 0 0.359375 0.140625q0.125 0.125 0.125 0.34375l0 0.640625q0.265625 -0.546875 0.765625 -0.84375q0.515625 -0.296875 1.203125 -0.296875zm-0.203125 5.265625q0.859375 0 1.328125 -0.578125q0.46875 -0.578125 0.46875 -1.625q0 -1.0625 -0.46875 -1.65625q-0.46875 -0.59375 -1.328125 -0.59375q-0.84375 0 -1.3125 0.578125q-0.453125 0.578125 -0.453125 1.640625q0 1.0625 0.453125 1.65625q0.46875 0.578125 1.3125 0.578125zm7.14032 -5.25q0.5 -0.03125 0.5 0.40625q0 0.203125 -0.109375 0.3125q-0.109375 0.109375 -0.375 0.140625l-0.359375 0.03125q-0.796875 0.078125 -1.1875 0.578125q-0.390625 0.484375 -0.390625 1.15625l0 2.921875q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -5.0625q0 -0.234375 0.140625 -0.359375q0.140625 -0.125 0.34375 -0.125q0.1875 0 0.3125 0.125q0.140625 0.125 0.140625 0.34375l0 0.671875q0.25 -0.53125 0.71875 -0.796875q0.46875 -0.28125 1.0625 -0.328125l0.171875 -0.015625zm5.861023 4.609375q0.140625 0 0.234375 0.109375q0.09375 0.109375 0.09375 0.28125q0 0.296875 -0.421875 0.546875q-0.4375 0.25 -0.921875 0.375q-0.46875 0.125 -0.921875 0.125q-1.359375 0 -2.15625 -0.796875q-0.78125 -0.8125 -0.78125 -2.21875q0 -0.90625 0.34375 -1.59375q0.359375 -0.6875 0.984375 -1.0625q0.640625 -0.390625 1.4375 -0.390625q1.140625 0 1.8125 0.75q0.671875 0.734375 0.671875 2.0q0 0.25 -0.09375 0.359375q-0.09375 0.109375 -0.3125 0.109375l-3.859375 0q0.09375 2.0625 1.953125 2.0625q0.46875 0 0.796875 -0.125q0.34375 -0.125 0.71875 -0.34375q0.3125 -0.1875 0.421875 -0.1875zm-2.09375 -3.875q-0.765625 0 -1.234375 0.484375q-0.46875 0.484375 -0.546875 1.359375l3.390625 0q-0.015625 -0.890625 -0.4375 -1.359375q-0.421875 -0.484375 -1.171875 -0.484375zm6.5896606 4.53125q0.421875 0.03125 0.421875 0.375q0 0.203125 -0.15625 0.3125q-0.140625 0.09375 -0.4375 0.078125l-0.328125 -0.03125q-0.953125 -0.0625 -1.421875 -0.5625q-0.453125 -0.515625 -0.453125 -1.53125l0 -3.015625l-0.796875 0q-0.203125 0 -0.328125 -0.09375q-0.109375 -0.109375 -0.109375 -0.28125q0 -0.171875 0.109375 -0.28125q0.125 -0.109375 0.328125 -0.109375l0.796875 0l0 -1.359375q0 -0.21875 0.125 -0.34375q0.140625 -0.140625 0.375 -0.140625q0.21875 0 0.34375 0.140625q0.140625 0.125 0.140625 0.34375l0 1.359375l1.328125 0q0.1875 0 0.296875 0.109375q0.125 0.109375 0.125 0.28125q0 0.171875 -0.125 0.28125q-0.109375 0.09375 -0.296875 0.09375l-1.328125 0l0 3.0625q0 0.65625 0.265625 0.953125q0.265625 0.296875 0.8125 0.328125l0.3125 0.03125zm5.9081726 -0.65625q0.140625 0 0.234375 0.109375q0.09375 0.109375 0.09375 0.28125q0 0.296875 -0.421875 0.546875q-0.4375 0.25 -0.921875 0.375q-0.46875 0.125 -0.921875 0.125q-1.359375 0 -2.15625 -0.796875q-0.78125 -0.8125 -0.78125 -2.21875q0 -0.90625 0.34375 -1.59375q0.359375 -0.6875 0.984375 -1.0625q0.640625 -0.390625 1.4375 -0.390625q1.140625 0 1.8125 0.75q0.671875 0.734375 0.671875 2.0q0 0.25 -0.09375 0.359375q-0.09375 0.109375 -0.3125 0.109375l-3.859375 0q0.09375 2.0625 1.953125 2.0625q0.46875 0 0.796875 -0.125q0.34375 -0.125 0.71875 -0.34375q0.3125 -0.1875 0.421875 -0.1875zm-2.09375 -3.875q-0.765625 0 -1.234375 0.484375q-0.46875 0.484375 -0.546875 1.359375l3.390625 0q-0.015625 -0.890625 -0.4375 -1.359375q-0.421875 -0.484375 -1.171875 -0.484375zm6.7927856 -0.734375q0.5 -0.03125 0.5 0.40625q0 0.203125 -0.109375 0.3125q-0.109375 0.109375 -0.375 0.140625l-0.359375 0.03125q-0.796875 0.078125 -1.1875 0.578125q-0.390625 0.484375 -0.390625 1.15625l0 2.921875q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -5.0625q0 -0.234375 0.140625 -0.359375q0.140625 -0.125 0.34375 -0.125q0.1875 0 0.3125 0.125q0.140625 0.125 0.140625 0.34375l0 0.671875q0.25 -0.53125 0.71875 -0.796875q0.46875 -0.28125 1.0625 -0.328125l0.171875 -0.015625z" fill-rule="nonzero"/><path fill="#000000" fill-opacity="0.0" d="m371.61902 334.89435l41.417297 0" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m371.61902 334.89435l37.990234 0" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m409.60925 334.89435l-1.1245728 1.1246033l3.0897522 -1.1246033l-3.0897522 -1.1245728z" fill-rule="evenodd"/><path fill="#c9daf8" d="m548.5407 277.52954l87.49603 0l0 30.992126l-87.49603 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m548.5407 277.52954l87.49603 0l0 30.992126l-87.49603 0z" fill-rule="evenodd"/><path fill="#000000" d="m587.0588 293.13934q0.1875 0 0.296875 0.109375q0.109375 0.109375 0.109375 0.296875l0 2.984375q0 0.296875 -0.09375 0.4375q-0.078125 0.140625 -0.328125 0.234375q-0.46875 0.203125 -1.15625 0.328125q-0.6875 0.109375 -1.375 0.109375q-1.25 0 -2.171875 -0.515625q-0.90625 -0.515625 -1.390625 -1.484375q-0.484375 -0.96875 -0.484375 -2.328125q0 -1.328125 0.46875 -2.296875q0.484375 -0.984375 1.375 -1.5q0.90625 -0.53125 2.125 -0.53125q0.84375 0 1.5625 0.265625q0.71875 0.25 1.203125 0.734375q0.21875 0.203125 0.21875 0.421875q0 0.171875 -0.109375 0.296875q-0.09375 0.125 -0.234375 0.125q-0.140625 0 -0.328125 -0.140625q-0.625 -0.484375 -1.140625 -0.671875q-0.5 -0.1875 -1.15625 -0.1875q-1.4375 0 -2.203125 0.90625q-0.75 0.890625 -0.75 2.578125q0 1.71875 0.765625 2.609375q0.78125 0.890625 2.28125 0.890625q1.109375 0 2.03125 -0.328125l0 -2.578125l-1.75 0q-0.203125 0 -0.328125 -0.109375q-0.125 -0.109375 -0.125 -0.265625q0 -0.1875 0.125 -0.28125q0.125 -0.109375 0.328125 -0.109375l2.234375 0zm2.8911743 4.46875q-0.21875 0 -0.359375 -0.140625q-0.125 -0.140625 -0.125 -0.359375l0 -7.546875q0 -0.21875 0.125 -0.34375q0.140625 -0.125 0.375 -0.125l2.84375 0q1.328125 0 2.0625 0.65625q0.75 0.640625 0.75 1.828125q0 1.1875 -0.75 1.84375q-0.734375 0.65625 -2.0625 0.65625l-2.359375 0l0 3.03125q0 0.21875 -0.140625 0.359375q-0.125 0.140625 -0.359375 0.140625zm2.765625 -4.34375q1.9375 0 1.9375 -1.6875q0 -1.671875 -1.9375 -1.671875l-2.265625 0l0 3.359375l2.265625 0zm7.7869263 4.375q-1.65625 0 -2.515625 -0.859375q-0.84375 -0.859375 -0.84375 -2.546875l0 -4.703125q0 -0.234375 0.125 -0.359375q0.140625 -0.140625 0.359375 -0.140625q0.21875 0 0.34375 0.140625q0.140625 0.125 0.140625 0.359375l0 4.78125q0 1.25 0.609375 1.875q0.609375 0.609375 1.78125 0.609375q1.171875 0 1.765625 -0.609375q0.609375 -0.625 0.609375 -1.875l0 -4.78125q0 -0.234375 0.140625 -0.359375q0.140625 -0.140625 0.359375 -0.140625q0.21875 0 0.34375 0.140625q0.140625 0.125 0.140625 0.359375l0 4.703125q0 1.671875 -0.859375 2.546875q-0.859375 0.859375 -2.5 0.859375z" fill-rule="nonzero"/><path fill="#c9daf8" d="m548.5407 319.3983l87.49603 0l0 30.992126l-87.49603 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m548.5407 319.3983l87.49603 0l0 30.992126l-87.49603 0z" fill-rule="evenodd"/><path fill="#000000" d="m584.63763 339.50812q-1.1875 0 -2.0625 -0.515625q-0.875 -0.53125 -1.359375 -1.5q-0.46875 -0.984375 -0.46875 -2.3125q0 -1.328125 0.46875 -2.296875q0.484375 -0.984375 1.359375 -1.5q0.875 -0.53125 2.0625 -0.53125q0.8125 0 1.515625 0.265625q0.71875 0.25 1.25 0.734375q0.1875 0.1875 0.1875 0.421875q0 0.171875 -0.09375 0.296875q-0.09375 0.125 -0.21875 0.125q-0.15625 0 -0.359375 -0.140625q-0.609375 -0.46875 -1.109375 -0.65625q-0.5 -0.203125 -1.140625 -0.203125q-1.390625 0 -2.140625 0.90625q-0.75 0.90625 -0.75 2.578125q0 1.671875 0.75 2.578125q0.75 0.90625 2.140625 0.90625q0.640625 0 1.140625 -0.1875q0.5 -0.1875 1.109375 -0.671875q0.203125 -0.125 0.359375 -0.125q0.125 0 0.21875 0.125q0.09375 0.109375 0.09375 0.296875q0 0.234375 -0.1875 0.40625q-0.53125 0.484375 -1.25 0.75q-0.703125 0.25 -1.515625 0.25zm5.0302734 -0.03125q-0.21875 0 -0.359375 -0.140625q-0.125 -0.140625 -0.125 -0.359375l0 -7.546875q0 -0.21875 0.125 -0.34375q0.140625 -0.125 0.375 -0.125l2.84375 0q1.328125 0 2.0625 0.65625q0.75 0.640625 0.75 1.828125q0 1.1875 -0.75 1.84375q-0.734375 0.65625 -2.0625 0.65625l-2.359375 0l0 3.03125q0 0.21875 -0.140625 0.359375q-0.125 0.140625 -0.359375 0.140625zm2.765625 -4.34375q1.9375 0 1.9375 -1.6875q0 -1.671875 -1.9375 -1.671875l-2.265625 0l0 3.359375l2.265625 0zm7.7869263 4.375q-1.65625 0 -2.515625 -0.859375q-0.84375 -0.859375 -0.84375 -2.546875l0 -4.703125q0 -0.234375 0.125 -0.359375q0.140625 -0.140625 0.359375 -0.140625q0.21875 0 0.34375 0.140625q0.140625 0.125 0.140625 0.359375l0 4.78125q0 1.25 0.609375 1.875q0.609375 0.609375 1.78125 0.609375q1.171875 0 1.765625 -0.609375q0.609375 -0.625 0.609375 -1.875l0 -4.78125q0 -0.234375 0.140625 -0.359375q0.140625 -0.140625 0.359375 -0.140625q0.21875 0 0.34375 0.140625q0.140625 0.125 0.140625 0.359375l0 4.703125q0 1.671875 -0.859375 2.546875q-0.859375 0.859375 -2.5 0.859375z" fill-rule="nonzero"/><path fill="#000000" fill-opacity="0.0" d="m241.86351 334.89435l42.267715 0" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m241.86351 334.89435l38.840652 0" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m280.70413 334.89435l-1.1245728 1.1246033l3.0897827 -1.1246033l-3.0897827 -1.1245728z" fill-rule="evenodd"/><path fill="#d9ead3" d="m413.02625 141.28871l20.53543 0l0 20.53543l-20.53543 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m413.02625 141.28871l20.53543 0l0 20.53543l-20.53543 0z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m437.52493 135.68242l73.763794 0l0 31.748032l-73.763794 0z" fill-rule="evenodd"/><path fill="#000000" d="m448.0718 156.20241q-0.234375 0 -0.375 -0.125q-0.125 -0.140625 -0.125 -0.359375l0 -7.5q0 -0.21875 0.125 -0.34375q0.140625 -0.125 0.375 -0.125l2.34375 0q2.03125 0 3.140625 1.09375q1.109375 1.09375 1.109375 3.125q0 2.03125 -1.125 3.140625q-1.109375 1.09375 -3.125 1.09375l-2.34375 0zm2.28125 -0.84375q3.28125 0 3.28125 -3.390625q0 -3.390625 -3.28125 -3.390625l-1.796875 0l0 6.78125l1.796875 0zm8.3211975 -5.140625q2.203125 0 2.203125 2.296875l0 3.265625q0 0.21875 -0.125 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.140625 -0.125 -0.359375l0 -0.578125q-0.21875 0.515625 -0.6875 0.796875q-0.46875 0.28125 -1.078125 0.28125q-0.5625 0 -1.046875 -0.21875q-0.46875 -0.234375 -0.75 -0.640625q-0.265625 -0.40625 -0.265625 -0.90625q0 -0.65625 0.328125 -1.015625q0.34375 -0.375 1.109375 -0.53125q0.765625 -0.15625 2.125 -0.15625l0.265625 0l0 -0.40625q0 -0.71875 -0.296875 -1.046875q-0.28125 -0.34375 -0.953125 -0.34375q-0.8125 0 -1.65625 0.453125q-0.3125 0.203125 -0.453125 0.203125q-0.140625 0 -0.234375 -0.109375q-0.09375 -0.109375 -0.09375 -0.28125q0 -0.171875 0.09375 -0.296875q0.109375 -0.125 0.328125 -0.25q0.421875 -0.25 0.953125 -0.375q0.546875 -0.140625 1.0625 -0.140625zm-0.390625 5.296875q0.71875 0 1.171875 -0.484375q0.46875 -0.484375 0.46875 -1.25l0 -0.34375l-0.21875 0q-1.046875 0 -1.609375 0.09375q-0.546875 0.078125 -0.78125 0.296875q-0.234375 0.203125 -0.234375 0.609375q0 0.46875 0.34375 0.78125q0.34375 0.296875 0.859375 0.296875zm7.0631714 -0.015625q0.421875 0.03125 0.421875 0.375q0 0.203125 -0.15625 0.3125q-0.140625 0.09375 -0.4375 0.078125l-0.328125 -0.03125q-0.953125 -0.0625 -1.421875 -0.5625q-0.453125 -0.515625 -0.453125 -1.53125l0 -3.015625l-0.796875 0q-0.203125 0 -0.328125 -0.09375q-0.109375 -0.109375 -0.109375 -0.28125q0 -0.171875 0.109375 -0.28125q0.125 -0.109375 0.328125 -0.109375l0.796875 0l0 -1.359375q0 -0.21875 0.125 -0.34375q0.140625 -0.140625 0.375 -0.140625q0.21875 0 0.34375 0.140625q0.140625 0.125 0.140625 0.34375l0 1.359375l1.328125 0q0.1875 0 0.296875 0.109375q0.125 0.109375 0.125 0.28125q0 0.171875 -0.125 0.28125q-0.109375 0.09375 -0.296875 0.09375l-1.328125 0l0 3.0625q0 0.65625 0.265625 0.953125q0.265625 0.296875 0.8125 0.328125l0.3125 0.03125zm3.767517 -5.28125q2.203125 0 2.203125 2.296875l0 3.265625q0 0.21875 -0.125 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.140625 -0.125 -0.359375l0 -0.578125q-0.21875 0.515625 -0.6875 0.796875q-0.46875 0.28125 -1.078125 0.28125q-0.5625 0 -1.046875 -0.21875q-0.46875 -0.234375 -0.75 -0.640625q-0.265625 -0.40625 -0.265625 -0.90625q0 -0.65625 0.328125 -1.015625q0.34375 -0.375 1.109375 -0.53125q0.765625 -0.15625 2.125 -0.15625l0.265625 0l0 -0.40625q0 -0.71875 -0.296875 -1.046875q-0.28125 -0.34375 -0.953125 -0.34375q-0.8125 0 -1.65625 0.453125q-0.3125 0.203125 -0.453125 0.203125q-0.140625 0 -0.234375 -0.109375q-0.09375 -0.109375 -0.09375 -0.28125q0 -0.171875 0.09375 -0.296875q0.109375 -0.125 0.328125 -0.25q0.421875 -0.25 0.953125 -0.375q0.546875 -0.140625 1.0625 -0.140625zm-0.390625 5.296875q0.71875 0 1.171875 -0.484375q0.46875 -0.484375 0.46875 -1.25l0 -0.34375l-0.21875 0q-1.046875 0 -1.609375 0.09375q-0.546875 0.078125 -0.78125 0.296875q-0.234375 0.203125 -0.234375 0.609375q0 0.46875 0.34375 0.78125q0.34375 0.296875 0.859375 0.296875zm10.15921 0.75q-0.234375 0 -0.375 -0.140625q-0.140625 -0.140625 -0.140625 -0.359375l0 -7.1875l-2.578125 0q-0.21875 0 -0.34375 -0.109375q-0.109375 -0.109375 -0.109375 -0.3125q0 -0.203125 0.109375 -0.296875q0.125 -0.109375 0.34375 -0.109375l6.15625 0q0.21875 0 0.328125 0.109375q0.125 0.09375 0.125 0.296875q0 0.203125 -0.125 0.3125q-0.109375 0.109375 -0.328125 0.109375l-2.578125 0l0 7.1875q0 0.21875 -0.140625 0.359375q-0.125 0.140625 -0.34375 0.140625zm8.691681 -5.71875q0.140625 -0.296875 0.421875 -0.296875q0.1875 0 0.328125 0.125q0.140625 0.109375 0.140625 0.296875q0 0.109375 -0.046875 0.1875l-3.375 7.28125q-0.0625 0.125 -0.171875 0.1875q-0.109375 0.078125 -0.234375 0.078125q-0.1875 0 -0.328125 -0.109375q-0.125 -0.109375 -0.125 -0.296875q0 -0.09375 0.046875 -0.1875l0.84375 -1.8125l-2.375 -5.140625q-0.046875 -0.078125 -0.046875 -0.171875q0 -0.1875 0.15625 -0.3125q0.15625 -0.140625 0.359375 -0.140625q0.109375 0 0.21875 0.078125q0.125 0.078125 0.1875 0.203125l2.0 4.5l2.0 -4.46875zm4.902405 -0.328125q0.765625 0 1.34375 0.390625q0.59375 0.375 0.921875 1.0625q0.328125 0.6875 0.328125 1.609375q0 0.90625 -0.328125 1.59375q-0.328125 0.671875 -0.90625 1.046875q-0.578125 0.359375 -1.359375 0.359375q-0.6875 0 -1.203125 -0.296875q-0.5 -0.296875 -0.765625 -0.84375l0 2.8125q0 0.21875 -0.125 0.34375q-0.125 0.125 -0.359375 0.125q-0.234375 0 -0.359375 -0.140625q-0.125 -0.125 -0.125 -0.328125l0 -7.234375q0 -0.21875 0.125 -0.34375q0.125 -0.140625 0.359375 -0.140625q0.234375 0 0.359375 0.140625q0.125 0.125 0.125 0.34375l0 0.640625q0.265625 -0.546875 0.765625 -0.84375q0.515625 -0.296875 1.203125 -0.296875zm-0.203125 5.265625q0.859375 0 1.328125 -0.578125q0.46875 -0.578125 0.46875 -1.625q0 -1.0625 -0.46875 -1.65625q-0.46875 -0.59375 -1.328125 -0.59375q-0.84375 0 -1.3125 0.578125q-0.453125 0.578125 -0.453125 1.640625q0 1.0625 0.453125 1.65625q0.46875 0.578125 1.3125 0.578125zm8.76532 -0.640625q0.140625 0 0.234375 0.109375q0.09375 0.109375 0.09375 0.28125q0 0.296875 -0.421875 0.546875q-0.4375 0.25 -0.921875 0.375q-0.46875 0.125 -0.921875 0.125q-1.359375 0 -2.15625 -0.796875q-0.78125 -0.8125 -0.78125 -2.21875q0 -0.90625 0.34375 -1.59375q0.359375 -0.6875 0.984375 -1.0625q0.640625 -0.390625 1.4375 -0.390625q1.140625 0 1.8125 0.75q0.671875 0.734375 0.671875 2.0q0 0.25 -0.09375 0.359375q-0.09375 0.109375 -0.3125 0.109375l-3.859375 0q0.09375 2.0625 1.953125 2.0625q0.46875 0 0.796875 -0.125q0.34375 -0.125 0.71875 -0.34375q0.3125 -0.1875 0.421875 -0.1875zm-2.09375 -3.875q-0.765625 0 -1.234375 0.484375q-0.46875 0.484375 -0.546875 1.359375l3.390625 0q-0.015625 -0.890625 -0.4375 -1.359375q-0.421875 -0.484375 -1.171875 -0.484375z" fill-rule="nonzero"/><path fill="#f4cccc" d="m519.9029 141.28871l20.5354 0l0 20.53543l-20.5354 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m519.9029 141.28871l20.5354 0l0 20.53543l-20.5354 0z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m544.40155 135.68242l100.0 0l0 31.748032l-100.0 0z" fill-rule="evenodd"/><path fill="#000000" d="m554.9328 156.26491q-0.21875 0 -0.359375 -0.140625q-0.125 -0.140625 -0.125 -0.359375l0 -7.578125q0 -0.234375 0.125 -0.359375q0.140625 -0.140625 0.359375 -0.140625q0.234375 0 0.359375 0.140625q0.140625 0.125 0.140625 0.359375l0 7.578125q0 0.21875 -0.140625 0.359375q-0.125 0.140625 -0.359375 0.140625zm5.3845215 -6.046875q2.09375 0 2.09375 2.3125l0 3.25q0 0.234375 -0.125 0.359375q-0.125 0.125 -0.359375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -3.1875q0 -0.8125 -0.328125 -1.1875q-0.3125 -0.375 -1.0 -0.375q-0.8125 0 -1.296875 0.5q-0.46875 0.484375 -0.46875 1.328125l0 2.921875q0 0.234375 -0.125 0.359375q-0.125 0.125 -0.359375 0.125q-0.234375 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -5.0625q0 -0.21875 0.125 -0.34375q0.125 -0.140625 0.359375 -0.140625q0.21875 0 0.34375 0.140625q0.125 0.125 0.125 0.328125l0 0.609375q0.28125 -0.53125 0.796875 -0.8125q0.53125 -0.28125 1.1875 -0.28125zm6.456726 -1.703125q-0.640625 0.046875 -0.96875 0.40625q-0.3125 0.34375 -0.3125 1.046875l0 0.390625l1.328125 0q0.203125 0 0.3125 0.109375q0.109375 0.109375 0.109375 0.28125q0 0.1875 -0.109375 0.28125q-0.109375 0.09375 -0.3125 0.09375l-1.328125 0l0 4.65625q0 0.234375 -0.140625 0.359375q-0.140625 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.140625 -0.125 -0.140625 -0.359375l0 -4.65625l-0.796875 0q-0.203125 0 -0.328125 -0.09375q-0.109375 -0.109375 -0.109375 -0.28125q0 -0.171875 0.109375 -0.28125q0.125 -0.109375 0.328125 -0.109375l0.796875 0l0 -0.21875q0 -1.078125 0.53125 -1.6875q0.546875 -0.625 1.5625 -0.703125l0.3125 -0.015625q0.3125 -0.03125 0.453125 0.0625q0.140625 0.078125 0.140625 0.296875q0 0.34375 -0.421875 0.390625l-0.3125 0.03125zm4.248535 1.71875q0.5 -0.03125 0.5 0.40625q0 0.203125 -0.109375 0.3125q-0.109375 0.109375 -0.375 0.140625l-0.359375 0.03125q-0.796875 0.078125 -1.1875 0.578125q-0.390625 0.484375 -0.390625 1.15625l0 2.921875q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -5.0625q0 -0.234375 0.140625 -0.359375q0.140625 -0.125 0.34375 -0.125q0.1875 0 0.3125 0.125q0.140625 0.125 0.140625 0.34375l0 0.671875q0.25 -0.53125 0.71875 -0.796875q0.46875 -0.28125 1.0625 -0.328125l0.171875 -0.015625zm3.720398 -0.015625q2.203125 0 2.203125 2.296875l0 3.265625q0 0.21875 -0.125 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.140625 -0.125 -0.359375l0 -0.578125q-0.21875 0.515625 -0.6875 0.796875q-0.46875 0.28125 -1.078125 0.28125q-0.5625 0 -1.046875 -0.21875q-0.46875 -0.234375 -0.75 -0.640625q-0.265625 -0.40625 -0.265625 -0.90625q0 -0.65625 0.328125 -1.015625q0.34375 -0.375 1.109375 -0.53125q0.765625 -0.15625 2.125 -0.15625l0.265625 0l0 -0.40625q0 -0.71875 -0.296875 -1.046875q-0.28125 -0.34375 -0.953125 -0.34375q-0.8125 0 -1.65625 0.453125q-0.3125 0.203125 -0.453125 0.203125q-0.140625 0 -0.234375 -0.109375q-0.09375 -0.109375 -0.09375 -0.28125q0 -0.171875 0.09375 -0.296875q0.109375 -0.125 0.328125 -0.25q0.421875 -0.25 0.953125 -0.375q0.546875 -0.140625 1.0625 -0.140625zm-0.390625 5.296875q0.71875 0 1.171875 -0.484375q0.46875 -0.484375 0.46875 -1.25l0 -0.34375l-0.21875 0q-1.046875 0 -1.609375 0.09375q-0.546875 0.078125 -0.78125 0.296875q-0.234375 0.203125 -0.234375 0.609375q0 0.46875 0.34375 0.78125q0.34375 0.296875 0.859375 0.296875zm6.3444214 0.765625q-0.5625 0 -1.0625 -0.125q-0.5 -0.140625 -0.875 -0.375q-0.21875 -0.140625 -0.3125 -0.265625q-0.078125 -0.125 -0.078125 -0.3125q0 -0.15625 0.078125 -0.25q0.09375 -0.109375 0.234375 -0.109375q0.15625 0 0.421875 0.1875q0.359375 0.21875 0.71875 0.34375q0.359375 0.125 0.875 0.125q0.65625 0 1.015625 -0.21875q0.359375 -0.234375 0.359375 -0.671875q0 -0.265625 -0.140625 -0.421875q-0.125 -0.171875 -0.453125 -0.296875q-0.3125 -0.125 -0.9375 -0.25q-1.0625 -0.234375 -1.515625 -0.609375q-0.453125 -0.390625 -0.453125 -1.046875q0 -0.515625 0.28125 -0.90625q0.28125 -0.40625 0.796875 -0.625q0.515625 -0.234375 1.15625 -0.234375q0.46875 0 0.90625 0.125q0.4375 0.125 0.78125 0.34375q0.40625 0.296875 0.40625 0.609375q0 0.15625 -0.09375 0.265625q-0.09375 0.109375 -0.234375 0.109375q-0.140625 0 -0.4375 -0.203125q-0.328125 -0.21875 -0.625 -0.34375q-0.296875 -0.125 -0.75 -0.125q-0.5625 0 -0.90625 0.265625q-0.34375 0.25 -0.34375 0.671875q0 0.25 0.125 0.421875q0.125 0.15625 0.421875 0.28125q0.296875 0.125 0.84375 0.25q0.828125 0.1875 1.265625 0.40625q0.453125 0.203125 0.640625 0.515625q0.203125 0.3125 0.203125 0.796875q0 0.75 -0.640625 1.21875q-0.640625 0.453125 -1.671875 0.453125zm6.47876 -0.78125q0.421875 0.03125 0.421875 0.375q0 0.203125 -0.15625 0.3125q-0.140625 0.09375 -0.4375 0.078125l-0.328125 -0.03125q-0.953125 -0.0625 -1.421875 -0.5625q-0.453125 -0.515625 -0.453125 -1.53125l0 -3.015625l-0.796875 0q-0.203125 0 -0.328125 -0.09375q-0.109375 -0.109375 -0.109375 -0.28125q0 -0.171875 0.109375 -0.28125q0.125 -0.109375 0.328125 -0.109375l0.796875 0l0 -1.359375q0 -0.21875 0.125 -0.34375q0.140625 -0.140625 0.375 -0.140625q0.21875 0 0.34375 0.140625q0.140625 0.125 0.140625 0.34375l0 1.359375l1.328125 0q0.1875 0 0.296875 0.109375q0.125 0.109375 0.125 0.28125q0 0.171875 -0.125 0.28125q-0.109375 0.09375 -0.296875 0.09375l-1.328125 0l0 3.0625q0 0.65625 0.265625 0.953125q0.265625 0.296875 0.8125 0.328125l0.3125 0.03125zm4.283142 -5.265625q0.5 -0.03125 0.5 0.40625q0 0.203125 -0.109375 0.3125q-0.109375 0.109375 -0.375 0.140625l-0.359375 0.03125q-0.796875 0.078125 -1.1875 0.578125q-0.390625 0.484375 -0.390625 1.15625l0 2.921875q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -5.0625q0 -0.234375 0.140625 -0.359375q0.140625 -0.125 0.34375 -0.125q0.1875 0 0.3125 0.125q0.140625 0.125 0.140625 0.34375l0 0.671875q0.25 -0.53125 0.71875 -0.796875q0.46875 -0.28125 1.0625 -0.328125l0.171875 -0.015625zm5.782898 0q0.21875 0 0.34375 0.140625q0.125 0.125 0.125 0.34375l0 5.078125q0 0.203125 -0.125 0.34375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.34375 -0.125q-0.125 -0.125 -0.125 -0.328125l0 -0.609375q-0.28125 0.53125 -0.78125 0.8125q-0.5 0.265625 -1.125 0.265625q-1.03125 0 -1.5625 -0.578125q-0.53125 -0.578125 -0.53125 -1.71875l0 -3.265625q0 -0.21875 0.125 -0.34375q0.140625 -0.140625 0.375 -0.140625q0.21875 0 0.34375 0.140625q0.125 0.125 0.125 0.34375l0 3.234375q0 0.78125 0.3125 1.15625q0.3125 0.359375 0.984375 0.359375q0.765625 0 1.234375 -0.5q0.46875 -0.5 0.46875 -1.3125l0 -2.9375q0 -0.21875 0.125 -0.34375q0.140625 -0.140625 0.375 -0.140625zm4.7008057 6.046875q-0.8125 0 -1.453125 -0.359375q-0.625 -0.375 -0.96875 -1.0625q-0.34375 -0.6875 -0.34375 -1.578125q0 -0.90625 0.359375 -1.59375q0.359375 -0.703125 0.984375 -1.078125q0.640625 -0.390625 1.46875 -0.390625q0.453125 0 0.90625 0.125q0.453125 0.125 0.78125 0.359375q0.21875 0.140625 0.3125 0.28125q0.09375 0.140625 0.09375 0.3125q0 0.171875 -0.09375 0.28125q-0.09375 0.09375 -0.234375 0.09375q-0.078125 0 -0.1875 -0.046875q-0.09375 -0.046875 -0.15625 -0.09375q-0.0625 -0.046875 -0.09375 -0.0625q-0.3125 -0.203125 -0.59375 -0.3125q-0.28125 -0.125 -0.6875 -0.125q-0.875 0 -1.359375 0.59375q-0.484375 0.59375 -0.484375 1.65625q0 1.046875 0.484375 1.625q0.484375 0.578125 1.359375 0.578125q0.40625 0 0.703125 -0.109375q0.296875 -0.125 0.59375 -0.328125q0.140625 -0.09375 0.25 -0.15625q0.125 -0.0625 0.203125 -0.0625q0.140625 0 0.21875 0.109375q0.09375 0.109375 0.09375 0.28125q0 0.15625 -0.09375 0.28125q-0.078125 0.125 -0.296875 0.28125q-0.34375 0.234375 -0.8125 0.375q-0.46875 0.125 -0.953125 0.125zm6.029297 -0.78125q0.421875 0.03125 0.421875 0.375q0 0.203125 -0.15625 0.3125q-0.140625 0.09375 -0.4375 0.078125l-0.328125 -0.03125q-0.953125 -0.0625 -1.421875 -0.5625q-0.453125 -0.515625 -0.453125 -1.53125l0 -3.015625l-0.796875 0q-0.203125 0 -0.328125 -0.09375q-0.109375 -0.109375 -0.109375 -0.28125q0 -0.171875 0.109375 -0.28125q0.125 -0.109375 0.328125 -0.109375l0.796875 0l0 -1.359375q0 -0.21875 0.125 -0.34375q0.140625 -0.140625 0.375 -0.140625q0.21875 0 0.34375 0.140625q0.140625 0.125 0.140625 0.34375l0 1.359375l1.328125 0q0.1875 0 0.296875 0.109375q0.125 0.109375 0.125 0.28125q0 0.171875 -0.125 0.28125q-0.109375 0.09375 -0.296875 0.09375l-1.328125 0l0 3.0625q0 0.65625 0.265625 0.953125q0.265625 0.296875 0.8125 0.328125l0.3125 0.03125zm5.830017 -5.265625q0.21875 0 0.34375 0.140625q0.125 0.125 0.125 0.34375l0 5.078125q0 0.203125 -0.125 0.34375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.34375 -0.125q-0.125 -0.125 -0.125 -0.328125l0 -0.609375q-0.28125 0.53125 -0.78125 0.8125q-0.5 0.265625 -1.125 0.265625q-1.03125 0 -1.5625 -0.578125q-0.53125 -0.578125 -0.53125 -1.71875l0 -3.265625q0 -0.21875 0.125 -0.34375q0.140625 -0.140625 0.375 -0.140625q0.21875 0 0.34375 0.140625q0.125 0.125 0.125 0.34375l0 3.234375q0 0.78125 0.3125 1.15625q0.3125 0.359375 0.984375 0.359375q0.765625 0 1.234375 -0.5q0.46875 -0.5 0.46875 -1.3125l0 -2.9375q0 -0.21875 0.125 -0.34375q0.140625 -0.140625 0.375 -0.140625zm5.1851807 0q0.5 -0.03125 0.5 0.40625q0 0.203125 -0.109375 0.3125q-0.109375 0.109375 -0.375 0.140625l-0.359375 0.03125q-0.796875 0.078125 -1.1875 0.578125q-0.390625 0.484375 -0.390625 1.15625l0 2.921875q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -5.0625q0 -0.234375 0.140625 -0.359375q0.140625 -0.125 0.34375 -0.125q0.1875 0 0.3125 0.125q0.140625 0.125 0.140625 0.34375l0 0.671875q0.25 -0.53125 0.71875 -0.796875q0.46875 -0.28125 1.0625 -0.328125l0.171875 -0.015625zm5.861023 4.609375q0.140625 0 0.234375 0.109375q0.09375 0.109375 0.09375 0.28125q0 0.296875 -0.421875 0.546875q-0.4375 0.25 -0.921875 0.375q-0.46875 0.125 -0.921875 0.125q-1.359375 0 -2.15625 -0.796875q-0.78125 -0.8125 -0.78125 -2.21875q0 -0.90625 0.34375 -1.59375q0.359375 -0.6875 0.984375 -1.0625q0.640625 -0.390625 1.4375 -0.390625q1.140625 0 1.8125 0.75q0.671875 0.734375 0.671875 2.0q0 0.25 -0.09375 0.359375q-0.09375 0.109375 -0.3125 0.109375l-3.859375 0q0.09375 2.0625 1.953125 2.0625q0.46875 0 0.796875 -0.125q0.34375 -0.125 0.71875 -0.34375q0.3125 -0.1875 0.421875 -0.1875zm-2.09375 -3.875q-0.765625 0 -1.234375 0.484375q-0.46875 0.484375 -0.546875 1.359375l3.390625 0q-0.015625 -0.890625 -0.4375 -1.359375q-0.421875 -0.484375 -1.171875 -0.484375z" fill-rule="nonzero"/><path fill="#d9ead3" d="m78.872284 252.53609l87.49606 0l0 30.992142l-87.49606 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m78.872284 252.53609l87.49606 0l0 30.992142l-87.49606 0z" fill-rule="evenodd"/><path fill="#000000" d="m93.328064 272.6459q-0.90625 0 -1.734375 -0.265625q-0.8125 -0.265625 -1.3125 -0.734375q-0.171875 -0.15625 -0.171875 -0.40625q0 -0.171875 0.09375 -0.296875q0.09375 -0.125 0.234375 -0.125q0.15625 0 0.328125 0.125q1.109375 0.859375 2.546875 0.859375q1.03125 0 1.578125 -0.390625q0.5625 -0.390625 0.5625 -1.125q0 -0.421875 -0.265625 -0.671875q-0.265625 -0.265625 -0.703125 -0.421875q-0.4375 -0.15625 -1.15625 -0.328125q-0.984375 -0.21875 -1.625 -0.46875q-0.625 -0.265625 -1.015625 -0.734375q-0.390625 -0.46875 -0.390625 -1.21875q0 -0.71875 0.390625 -1.265625q0.390625 -0.5625 1.09375 -0.875q0.703125 -0.3125 1.59375 -0.3125q0.84375 0 1.5625 0.265625q0.734375 0.25 1.234375 0.734375q0.1875 0.1875 0.1875 0.421875q0 0.171875 -0.09375 0.296875q-0.09375 0.125 -0.234375 0.125q-0.125 0 -0.34375 -0.140625q-0.59375 -0.46875 -1.09375 -0.65625q-0.5 -0.203125 -1.21875 -0.203125q-0.984375 0 -1.546875 0.421875q-0.546875 0.40625 -0.546875 1.15625q0 0.625 0.484375 0.953125q0.484375 0.3125 1.5 0.5625q1.09375 0.25 1.71875 0.484375q0.625 0.21875 1.03125 0.671875q0.421875 0.4375 0.421875 1.171875q0 0.71875 -0.390625 1.265625q-0.390625 0.53125 -1.109375 0.828125q-0.703125 0.296875 -1.609375 0.296875zm6.9353027 -6.078125q2.203125 0 2.203125 2.296875l0 3.265625q0 0.21875 -0.125 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.140625 -0.125 -0.359375l0 -0.578125q-0.21875 0.515625 -0.6875 0.796875q-0.46875 0.28125 -1.078125 0.28125q-0.5625 0 -1.046875 -0.21875q-0.46875 -0.234375 -0.75 -0.640625q-0.265625 -0.40625 -0.265625 -0.90625q0 -0.65625 0.328125 -1.015625q0.34375 -0.375 1.109375 -0.53125q0.765625 -0.15625 2.125 -0.15625l0.265625 0l0 -0.40625q0 -0.71875 -0.296875 -1.046875q-0.28125 -0.34375 -0.953125 -0.34375q-0.8125 0 -1.65625 0.453125q-0.3125 0.203125 -0.453125 0.203125q-0.140625 0 -0.234375 -0.109375q-0.09375 -0.109375 -0.09375 -0.28125q0 -0.171875 0.09375 -0.296875q0.109375 -0.125 0.328125 -0.25q0.421875 -0.25 0.953125 -0.375q0.546875 -0.140625 1.0625 -0.140625zm-0.390625 5.296875q0.71875 0 1.171875 -0.484375q0.46875 -0.484375 0.46875 -1.25l0 -0.34375l-0.21875 0q-1.046875 0 -1.609375 0.09375q-0.546875 0.078125 -0.78125 0.296875q-0.234375 0.203125 -0.234375 0.609375q0 0.46875 0.34375 0.78125q0.34375 0.296875 0.859375 0.296875zm8.578796 -4.96875q0.140625 -0.296875 0.421875 -0.296875q0.1875 0 0.328125 0.125q0.140625 0.109375 0.140625 0.296875q0 0.109375 -0.046875 0.1875l-2.34375 5.046875q-0.0625 0.15625 -0.21875 0.25q-0.140625 0.078125 -0.3125 0.078125q-0.15625 0 -0.296875 -0.078125q-0.140625 -0.09375 -0.21875 -0.25l-2.328125 -5.046875q-0.046875 -0.078125 -0.046875 -0.171875q0 -0.1875 0.15625 -0.3125q0.15625 -0.140625 0.359375 -0.140625q0.109375 0 0.21875 0.078125q0.125 0.078125 0.1875 0.203125l2.0 4.5l2.0 -4.46875zm6.480545 4.296875q0.140625 0 0.234375 0.109375q0.09375 0.109375 0.09375 0.28125q0 0.296875 -0.421875 0.546875q-0.4375 0.25 -0.921875 0.375q-0.46875 0.125 -0.921875 0.125q-1.359375 0 -2.15625 -0.796875q-0.78125 -0.8125 -0.78125 -2.21875q0 -0.90625 0.34375 -1.59375q0.359375 -0.6875 0.984375 -1.0625q0.640625 -0.390625 1.4375 -0.390625q1.140625 0 1.8125 0.75q0.671875 0.734375 0.671875 2.0q0 0.25 -0.09375 0.359375q-0.09375 0.109375 -0.3125 0.109375l-3.859375 0q0.09375 2.0625 1.953125 2.0625q0.46875 0 0.796875 -0.125q0.34375 -0.125 0.71875 -0.34375q0.3125 -0.1875 0.421875 -0.1875zm-2.09375 -3.875q-0.765625 0 -1.234375 0.484375q-0.46875 0.484375 -0.546875 1.359375l3.390625 0q-0.015625 -0.890625 -0.4375 -1.359375q-0.421875 -0.484375 -1.171875 -0.484375zm8.589676 -3.28125q0.21875 0 0.34375 0.140625q0.140625 0.125 0.140625 0.328125l0 7.625q0 0.21875 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125q-0.234375 0 -0.359375 -0.125q-0.125 -0.140625 -0.125 -0.359375l0 -0.640625q-0.265625 0.546875 -0.78125 0.84375q-0.5 0.296875 -1.1875 0.296875q-0.765625 0 -1.359375 -0.375q-0.578125 -0.390625 -0.90625 -1.078125q-0.328125 -0.6875 -0.328125 -1.59375q0 -0.90625 0.328125 -1.59375q0.328125 -0.6875 0.90625 -1.046875q0.59375 -0.375 1.359375 -0.375q0.6875 0 1.1875 0.296875q0.515625 0.296875 0.78125 0.84375l0 -3.203125q0 -0.21875 0.125 -0.34375q0.125 -0.125 0.359375 -0.125zm-2.25 7.796875q0.84375 0 1.296875 -0.578125q0.46875 -0.59375 0.46875 -1.65625q0 -1.0625 -0.46875 -1.640625q-0.453125 -0.578125 -1.296875 -0.578125q-0.859375 0 -1.34375 0.578125q-0.46875 0.578125 -0.46875 1.625q0 1.0625 0.46875 1.65625q0.484375 0.59375 1.34375 0.59375zm12.202805 -7.796875q0.21875 0 0.34375 0.140625q0.125 0.125 0.125 0.359375l0 7.59375q0 0.21875 -0.125 0.359375q-0.109375 0.125 -0.328125 0.125q-0.21875 0 -0.328125 -0.125q-0.109375 -0.140625 -0.109375 -0.359375l0 -6.125l-2.59375 4.984375q-0.171875 0.34375 -0.5 0.34375q-0.3125 0 -0.484375 -0.34375l-2.625 -4.921875l0 6.0625q0 0.21875 -0.109375 0.359375q-0.109375 0.125 -0.328125 0.125q-0.21875 0 -0.34375 -0.125q-0.109375 -0.140625 -0.109375 -0.359375l0 -7.59375q0 -0.234375 0.125 -0.359375q0.140625 -0.140625 0.359375 -0.140625q0.3125 0 0.484375 0.34375l3.046875 5.84375l3.015625 -5.84375q0.09375 -0.1875 0.203125 -0.265625q0.125 -0.078125 0.28125 -0.078125zm4.8576965 8.59375q-0.828125 0 -1.46875 -0.359375q-0.625 -0.375 -0.96875 -1.0625q-0.34375 -0.703125 -0.34375 -1.609375q0 -0.90625 0.34375 -1.59375q0.34375 -0.703125 0.96875 -1.0625q0.640625 -0.375 1.46875 -0.375q0.828125 0 1.453125 0.375q0.640625 0.359375 0.984375 1.0625q0.34375 0.6875 0.34375 1.59375q0 0.90625 -0.34375 1.609375q-0.34375 0.6875 -0.984375 1.0625q-0.625 0.359375 -1.453125 0.359375zm0 -0.796875q0.859375 0 1.3125 -0.5625q0.46875 -0.578125 0.46875 -1.671875q0 -1.0625 -0.46875 -1.640625q-0.46875 -0.59375 -1.3125 -0.59375q-0.859375 0 -1.328125 0.59375q-0.46875 0.578125 -0.46875 1.640625q0 1.078125 0.453125 1.65625q0.46875 0.578125 1.34375 0.578125zm8.925674 -7.796875q0.21875 0 0.34375 0.140625q0.140625 0.125 0.140625 0.328125l0 7.625q0 0.21875 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125q-0.234375 0 -0.359375 -0.125q-0.125 -0.140625 -0.125 -0.359375l0 -0.640625q-0.265625 0.546875 -0.78125 0.84375q-0.5 0.296875 -1.1875 0.296875q-0.765625 0 -1.359375 -0.375q-0.578125 -0.390625 -0.90625 -1.078125q-0.328125 -0.6875 -0.328125 -1.59375q0 -0.90625 0.328125 -1.59375q0.328125 -0.6875 0.90625 -1.046875q0.59375 -0.375 1.359375 -0.375q0.6875 0 1.1875 0.296875q0.515625 0.296875 0.78125 0.84375l0 -3.203125q0 -0.21875 0.125 -0.34375q0.125 -0.125 0.359375 -0.125zm-2.25 7.796875q0.84375 0 1.296875 -0.578125q0.46875 -0.59375 0.46875 -1.65625q0 -1.0625 -0.46875 -1.640625q-0.453125 -0.578125 -1.296875 -0.578125q-0.859375 0 -1.34375 0.578125q-0.46875 0.578125 -0.46875 1.625q0 1.0625 0.46875 1.65625q0.484375 0.59375 1.34375 0.59375zm9.06218 -0.640625q0.140625 0 0.234375 0.109375q0.09375 0.109375 0.09375 0.28125q0 0.296875 -0.421875 0.546875q-0.4375 0.25 -0.921875 0.375q-0.46875 0.125 -0.921875 0.125q-1.359375 0 -2.15625 -0.796875q-0.78125 -0.8125 -0.78125 -2.21875q0 -0.90625 0.34375 -1.59375q0.359375 -0.6875 0.984375 -1.0625q0.640625 -0.390625 1.4375 -0.390625q1.140625 0 1.8125 0.75q0.671875 0.734375 0.671875 2.0q0 0.25 -0.09375 0.359375q-0.09375 0.109375 -0.3125 0.109375l-3.859375 0q0.09375 2.0625 1.953125 2.0625q0.46875 0 0.796875 -0.125q0.34375 -0.125 0.71875 -0.34375q0.3125 -0.1875 0.421875 -0.1875zm-2.09375 -3.875q-0.765625 0 -1.234375 0.484375q-0.46875 0.484375 -0.546875 1.359375l3.390625 0q-0.015625 -0.890625 -0.4375 -1.359375q-0.421875 -0.484375 -1.171875 -0.484375zm4.386551 5.296875q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -7.625q0 -0.21875 0.125 -0.34375q0.140625 -0.125 0.359375 -0.125q0.203125 0 0.34375 0.125q0.140625 0.125 0.140625 0.34375l0 7.625q0 0.234375 -0.140625 0.359375q-0.140625 0.125 -0.34375 0.125z" fill-rule="nonzero"/><path fill="#d9ead3" d="m190.14 134.76706l87.49608 0l0 30.992126l-87.49608 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m190.14 134.76706l87.49608 0l0 30.992126l-87.49608 0z" fill-rule="evenodd"/><path fill="#000000" d="m215.10997 150.37688q0.1875 0 0.296875 0.109375q0.109375 0.109375 0.109375 0.296875l0 2.984375q0 0.296875 -0.09375 0.4375q-0.078125 0.140625 -0.328125 0.234375q-0.46875 0.203125 -1.15625 0.328125q-0.6875 0.109375 -1.375 0.109375q-1.25 0 -2.171875 -0.515625q-0.90625 -0.515625 -1.390625 -1.484375q-0.484375 -0.96875 -0.484375 -2.328125q0 -1.328125 0.46875 -2.296875q0.484375 -0.984375 1.375 -1.5q0.90625 -0.53125 2.125 -0.53125q0.84375 0 1.5625 0.265625q0.71875 0.25 1.203125 0.734375q0.21875 0.203125 0.21875 0.421875q0 0.171875 -0.109375 0.296875q-0.09375 0.125 -0.234375 0.125q-0.140625 0 -0.328125 -0.140625q-0.625 -0.484375 -1.140625 -0.671875q-0.5 -0.1875 -1.15625 -0.1875q-1.4375 0 -2.203125 0.90625q-0.75 0.890625 -0.75 2.578125q0 1.71875 0.765625 2.609375q0.78125 0.890625 2.28125 0.890625q1.109375 0 2.03125 -0.328125l0 -2.578125l-1.75 0q-0.203125 0 -0.328125 -0.109375q-0.125 -0.109375 -0.125 -0.265625q0 -0.1875 0.125 -0.28125q0.125 -0.109375 0.328125 -0.109375l2.234375 0zm5.1568146 -1.5625q0.5 -0.03125 0.5 0.40625q0 0.203125 -0.109375 0.3125q-0.109375 0.109375 -0.375 0.140625l-0.359375 0.03125q-0.796875 0.078125 -1.1875 0.578125q-0.390625 0.484375 -0.390625 1.15625l0 2.921875q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -5.0625q0 -0.234375 0.140625 -0.359375q0.140625 -0.125 0.34375 -0.125q0.1875 0 0.3125 0.125q0.140625 0.125 0.140625 0.34375l0 0.671875q0.25 -0.53125 0.71875 -0.796875q0.46875 -0.28125 1.0625 -0.328125l0.171875 -0.015625zm3.720398 -0.015625q2.203125 0 2.203125 2.296875l0 3.265625q0 0.21875 -0.125 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.140625 -0.125 -0.359375l0 -0.578125q-0.21875 0.515625 -0.6875 0.796875q-0.46875 0.28125 -1.078125 0.28125q-0.5625 0 -1.046875 -0.21875q-0.46875 -0.234375 -0.75 -0.640625q-0.265625 -0.40625 -0.265625 -0.90625q0 -0.65625 0.328125 -1.015625q0.34375 -0.375 1.109375 -0.53125q0.765625 -0.15625 2.125 -0.15625l0.265625 0l0 -0.40625q0 -0.71875 -0.296875 -1.046875q-0.28125 -0.34375 -0.953125 -0.34375q-0.8125 0 -1.65625 0.453125q-0.3125 0.203125 -0.453125 0.203125q-0.140625 0 -0.234375 -0.109375q-0.09375 -0.109375 -0.09375 -0.28125q0 -0.171875 0.09375 -0.296875q0.109375 -0.125 0.328125 -0.25q0.421875 -0.25 0.953125 -0.375q0.546875 -0.140625 1.0625 -0.140625zm-0.390625 5.296875q0.71875 0 1.171875 -0.484375q0.46875 -0.484375 0.46875 -1.25l0 -0.34375l-0.21875 0q-1.046875 0 -1.609375 0.09375q-0.546875 0.078125 -0.78125 0.296875q-0.234375 0.203125 -0.234375 0.609375q0 0.46875 0.34375 0.78125q0.34375 0.296875 0.859375 0.296875zm7.3131714 -5.296875q0.765625 0 1.34375 0.390625q0.59375 0.375 0.921875 1.0625q0.328125 0.6875 0.328125 1.609375q0 0.90625 -0.328125 1.59375q-0.328125 0.671875 -0.90625 1.046875q-0.578125 0.359375 -1.359375 0.359375q-0.6875 0 -1.203125 -0.296875q-0.5 -0.296875 -0.765625 -0.84375l0 2.8125q0 0.21875 -0.125 0.34375q-0.125 0.125 -0.359375 0.125q-0.234375 0 -0.359375 -0.140625q-0.125 -0.125 -0.125 -0.328125l0 -7.234375q0 -0.21875 0.125 -0.34375q0.125 -0.140625 0.359375 -0.140625q0.234375 0 0.359375 0.140625q0.125 0.125 0.125 0.34375l0 0.640625q0.265625 -0.546875 0.765625 -0.84375q0.515625 -0.296875 1.203125 -0.296875zm-0.203125 5.265625q0.859375 0 1.328125 -0.578125q0.46875 -0.578125 0.46875 -1.625q0 -1.0625 -0.46875 -1.65625q-0.46875 -0.59375 -1.328125 -0.59375q-0.84375 0 -1.3125 0.578125q-0.453125 0.578125 -0.453125 1.640625q0 1.0625 0.453125 1.65625q0.46875 0.578125 1.3125 0.578125zm7.2028046 -5.265625q1.03125 0 1.546875 0.578125q0.53125 0.578125 0.53125 1.734375l0 3.25q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -3.21875q0 -0.78125 -0.328125 -1.15625q-0.3125 -0.375 -1.0 -0.375q-0.8125 0 -1.296875 0.5q-0.46875 0.484375 -0.46875 1.328125l0 2.921875q0 0.234375 -0.125 0.359375q-0.125 0.125 -0.359375 0.125q-0.234375 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -7.625q0 -0.203125 0.125 -0.328125q0.140625 -0.140625 0.359375 -0.140625q0.234375 0 0.359375 0.125q0.125 0.125 0.125 0.34375l0 3.140625q0.28125 -0.53125 0.796875 -0.796875q0.515625 -0.28125 1.1875 -0.28125zm4.5035553 5.984375q-0.234375 0 -0.375 -0.125q-0.125 -0.140625 -0.125 -0.359375l0 -7.5q0 -0.21875 0.125 -0.34375q0.140625 -0.125 0.375 -0.125l2.34375 0q2.03125 0 3.140625 1.09375q1.109375 1.09375 1.109375 3.125q0 2.03125 -1.125 3.140625q-1.109375 1.09375 -3.125 1.09375l-2.34375 0zm2.28125 -0.84375q3.28125 0 3.28125 -3.390625q0 -3.390625 -3.28125 -3.390625l-1.796875 0l0 6.78125l1.796875 0zm10.461807 -0.515625q0.140625 0 0.234375 0.109375q0.09375 0.109375 0.09375 0.28125q0 0.296875 -0.421875 0.546875q-0.4375 0.25 -0.921875 0.375q-0.46875 0.125 -0.921875 0.125q-1.359375 0 -2.15625 -0.796875q-0.78125 -0.8125 -0.78125 -2.21875q0 -0.90625 0.34375 -1.59375q0.359375 -0.6875 0.984375 -1.0625q0.640625 -0.390625 1.4375 -0.390625q1.140625 0 1.8125 0.75q0.671875 0.734375 0.671875 2.0q0 0.25 -0.09375 0.359375q-0.09375 0.109375 -0.3125 0.109375l-3.859375 0q0.09375 2.0625 1.953125 2.0625q0.46875 0 0.796875 -0.125q0.34375 -0.125 0.71875 -0.34375q0.3125 -0.1875 0.421875 -0.1875zm-2.09375 -3.875q-0.765625 0 -1.234375 0.484375q-0.46875 0.484375 -0.546875 1.359375l3.390625 0q-0.015625 -0.890625 -0.4375 -1.359375q-0.421875 -0.484375 -1.171875 -0.484375zm6.480301 -2.453125q-0.640625 0.046875 -0.96875 0.40625q-0.3125 0.34375 -0.3125 1.046875l0 0.390625l1.328125 0q0.203125 0 0.3125 0.109375q0.109375 0.109375 0.109375 0.28125q0 0.1875 -0.109375 0.28125q-0.109375 0.09375 -0.3125 0.09375l-1.328125 0l0 4.65625q0 0.234375 -0.140625 0.359375q-0.140625 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.140625 -0.125 -0.140625 -0.359375l0 -4.65625l-0.796875 0q-0.203125 0 -0.328125 -0.09375q-0.109375 -0.109375 -0.109375 -0.28125q0 -0.171875 0.109375 -0.28125q0.125 -0.109375 0.328125 -0.109375l0.796875 0l0 -0.21875q0 -1.078125 0.53125 -1.6875q0.546875 -0.625 1.5625 -0.703125l0.3125 -0.015625q0.3125 -0.03125 0.453125 0.0625q0.140625 0.078125 0.140625 0.296875q0 0.34375 -0.421875 0.390625l-0.3125 0.03125z" fill-rule="nonzero"/><path fill="#d9ead3" d="m233.1085 252.53609l87.49608 0l0 30.992142l-87.49608 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m233.1085 252.53609l87.49608 0l0 30.992142l-87.49608 0z" fill-rule="evenodd"/><path fill="#000000" d="m260.00964 265.61465q-0.21875 0 -0.359375 -0.140625q-0.125 -0.140625 -0.125 -0.359375l0 -7.546875q0 -0.21875 0.125 -0.34375q0.140625 -0.125 0.375 -0.125l4.375 0q0.203125 0 0.328125 0.109375q0.125 0.09375 0.125 0.296875q0 0.203125 -0.125 0.3125q-0.125 0.109375 -0.328125 0.109375l-3.90625 0l0 2.90625l3.65625 0q0.21875 0 0.328125 0.109375q0.125 0.109375 0.125 0.3125q0 0.1875 -0.125 0.296875q-0.109375 0.109375 -0.328125 0.109375l-3.65625 0l0 3.453125q0 0.21875 -0.125 0.359375q-0.125 0.140625 -0.359375 0.140625zm8.9496765 -6.03125q0.5 -0.03125 0.5 0.40625q0 0.203125 -0.109375 0.3125q-0.109375 0.109375 -0.375 0.140625l-0.359375 0.03125q-0.796875 0.078125 -1.1875 0.578125q-0.390625 0.484375 -0.390625 1.15625l0 2.921875q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -5.0625q0 -0.234375 0.140625 -0.359375q0.140625 -0.125 0.34375 -0.125q0.1875 0 0.3125 0.125q0.140625 0.125 0.140625 0.34375l0 0.671875q0.25 -0.53125 0.71875 -0.796875q0.46875 -0.28125 1.0625 -0.328125l0.171875 -0.015625zm3.767273 6.046875q-0.828125 0 -1.46875 -0.359375q-0.625 -0.375 -0.96875 -1.0625q-0.34375 -0.703125 -0.34375 -1.609375q0 -0.90625 0.34375 -1.59375q0.34375 -0.703125 0.96875 -1.0625q0.640625 -0.375 1.46875 -0.375q0.828125 0 1.453125 0.375q0.640625 0.359375 0.984375 1.0625q0.34375 0.6875 0.34375 1.59375q0 0.90625 -0.34375 1.609375q-0.34375 0.6875 -0.984375 1.0625q-0.625 0.359375 -1.453125 0.359375zm0 -0.796875q0.859375 0 1.3125 -0.5625q0.46875 -0.578125 0.46875 -1.671875q0 -1.0625 -0.46875 -1.640625q-0.46875 -0.59375 -1.3125 -0.59375q-0.859375 0 -1.328125 0.59375q-0.46875 0.578125 -0.46875 1.640625q0 1.078125 0.453125 1.65625q0.46875 0.578125 1.34375 0.578125zm8.535065 -0.046875q0.203125 0 0.296875 0.109375q0.109375 0.09375 0.109375 0.265625q0 0.1875 -0.109375 0.296875q-0.09375 0.09375 -0.296875 0.09375l-4.203125 0q-0.203125 0 -0.34375 -0.125q-0.125 -0.125 -0.125 -0.3125q0 -0.1875 0.140625 -0.359375l3.546875 -4.28125l-3.28125 0q-0.203125 0 -0.328125 -0.09375q-0.109375 -0.109375 -0.109375 -0.28125q0 -0.171875 0.109375 -0.28125q0.125 -0.109375 0.328125 -0.109375l4.0625 0q0.21875 0 0.34375 0.125q0.140625 0.125 0.140625 0.3125q0 0.1875 -0.140625 0.359375l-3.5625 4.28125l3.421875 0zm6.2547913 -0.59375q0.140625 0 0.234375 0.109375q0.09375 0.109375 0.09375 0.28125q0 0.296875 -0.421875 0.546875q-0.4375 0.25 -0.921875 0.375q-0.46875 0.125 -0.921875 0.125q-1.359375 0 -2.15625 -0.796875q-0.78125 -0.8125 -0.78125 -2.21875q0 -0.90625 0.34375 -1.59375q0.359375 -0.6875 0.984375 -1.0625q0.640625 -0.390625 1.4375 -0.390625q1.140625 0 1.8125 0.75q0.671875 0.734375 0.671875 2.0q0 0.25 -0.09375 0.359375q-0.09375 0.109375 -0.3125 0.109375l-3.859375 0q0.09375 2.0625 1.953125 2.0625q0.46875 0 0.796875 -0.125q0.34375 -0.125 0.71875 -0.34375q0.3125 -0.1875 0.421875 -0.1875zm-2.09375 -3.875q-0.765625 0 -1.234375 0.484375q-0.46875 0.484375 -0.546875 1.359375l3.390625 0q-0.015625 -0.890625 -0.4375 -1.359375q-0.421875 -0.484375 -1.171875 -0.484375zm6.8396606 -0.75q2.09375 0 2.09375 2.3125l0 3.25q0 0.234375 -0.125 0.359375q-0.125 0.125 -0.359375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -3.1875q0 -0.8125 -0.328125 -1.1875q-0.3125 -0.375 -1.0 -0.375q-0.8125 0 -1.296875 0.5q-0.46875 0.484375 -0.46875 1.328125l0 2.921875q0 0.234375 -0.125 0.359375q-0.125 0.125 -0.359375 0.125q-0.234375 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -5.0625q0 -0.21875 0.125 -0.34375q0.125 -0.140625 0.359375 -0.140625q0.21875 0 0.34375 0.140625q0.125 0.125 0.125 0.328125l0 0.609375q0.28125 -0.53125 0.796875 -0.8125q0.53125 -0.28125 1.1875 -0.28125z" fill-rule="nonzero"/><path fill="#000000" d="m258.07846 275.1459q0.1875 0 0.296875 0.109375q0.109375 0.109375 0.109375 0.296875l0 2.984375q0 0.296875 -0.09375 0.4375q-0.078125 0.140625 -0.328125 0.234375q-0.46875 0.203125 -1.15625 0.328125q-0.6875 0.109375 -1.3749847 0.109375q-1.25 0 -2.171875 -0.515625q-0.90625 -0.515625 -1.390625 -1.484375q-0.484375 -0.96875 -0.484375 -2.328125q0 -1.328125 0.46875 -2.296875q0.484375 -0.984375 1.375 -1.5q0.90625 -0.53125 2.125 -0.53125q0.84373474 0 1.5624847 0.265625q0.71875 0.25 1.203125 0.734375q0.21875 0.203125 0.21875 0.421875q0 0.171875 -0.109375 0.296875q-0.09375 0.125 -0.234375 0.125q-0.140625 0 -0.328125 -0.140625q-0.625 -0.484375 -1.140625 -0.671875q-0.5 -0.1875 -1.1562347 -0.1875q-1.4375 0 -2.203125 0.90625q-0.75 0.890625 -0.75 2.578125q0 1.71875 0.765625 2.609375q0.78125 0.890625 2.28125 0.890625q1.1093597 0 2.0312347 -0.328125l0 -2.578125l-1.7499847 0q-0.203125 0 -0.328125 -0.109375q-0.125 -0.109375 -0.125 -0.265625q0 -0.1875 0.125 -0.28125q0.125 -0.109375 0.328125 -0.109375l2.2343597 0zm5.15683 -1.5625q0.5 -0.03125 0.5 0.40625q0 0.203125 -0.109375 0.3125q-0.109375 0.109375 -0.375 0.140625l-0.359375 0.03125q-0.796875 0.078125 -1.1875 0.578125q-0.390625 0.484375 -0.390625 1.15625l0 2.921875q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -5.0625q0 -0.234375 0.140625 -0.359375q0.140625 -0.125 0.34375 -0.125q0.1875 0 0.3125 0.125q0.140625 0.125 0.140625 0.34375l0 0.671875q0.25 -0.53125 0.71875 -0.796875q0.46875 -0.28125 1.0625 -0.328125l0.171875 -0.015625zm3.720398 -0.015625q2.203125 0 2.203125 2.296875l0 3.265625q0 0.21875 -0.125 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.140625 -0.125 -0.359375l0 -0.578125q-0.21875 0.515625 -0.6875 0.796875q-0.46875 0.28125 -1.078125 0.28125q-0.5625 0 -1.046875 -0.21875q-0.46875 -0.234375 -0.75 -0.640625q-0.265625 -0.40625 -0.265625 -0.90625q0 -0.65625 0.328125 -1.015625q0.34375 -0.375 1.109375 -0.53125q0.765625 -0.15625 2.125 -0.15625l0.265625 0l0 -0.40625q0 -0.71875 -0.296875 -1.046875q-0.28125 -0.34375 -0.953125 -0.34375q-0.8125 0 -1.65625 0.453125q-0.3125 0.203125 -0.453125 0.203125q-0.140625 0 -0.234375 -0.109375q-0.09375 -0.109375 -0.09375 -0.28125q0 -0.171875 0.09375 -0.296875q0.109375 -0.125 0.328125 -0.25q0.421875 -0.25 0.953125 -0.375q0.546875 -0.140625 1.0625 -0.140625zm-0.390625 5.296875q0.71875 0 1.171875 -0.484375q0.46875 -0.484375 0.46875 -1.25l0 -0.34375l-0.21875 0q-1.046875 0 -1.609375 0.09375q-0.546875 0.078125 -0.78125 0.296875q-0.234375 0.203125 -0.234375 0.609375q0 0.46875 0.34375 0.78125q0.34375 0.296875 0.859375 0.296875zm7.3131714 -5.296875q0.765625 0 1.34375 0.390625q0.59375 0.375 0.921875 1.0625q0.328125 0.6875 0.328125 1.609375q0 0.90625 -0.328125 1.59375q-0.328125 0.671875 -0.90625 1.046875q-0.578125 0.359375 -1.359375 0.359375q-0.6875 0 -1.203125 -0.296875q-0.5 -0.296875 -0.765625 -0.84375l0 2.8125q0 0.21875 -0.125 0.34375q-0.125 0.125 -0.359375 0.125q-0.234375 0 -0.359375 -0.140625q-0.125 -0.125 -0.125 -0.328125l0 -7.234375q0 -0.21875 0.125 -0.34375q0.125 -0.140625 0.359375 -0.140625q0.234375 0 0.359375 0.140625q0.125 0.125 0.125 0.34375l0 0.640625q0.265625 -0.546875 0.765625 -0.84375q0.515625 -0.296875 1.203125 -0.296875zm-0.203125 5.265625q0.859375 0 1.328125 -0.578125q0.46875 -0.578125 0.46875 -1.625q0 -1.0625 -0.46875 -1.65625q-0.46875 -0.59375 -1.328125 -0.59375q-0.84375 0 -1.3125 0.578125q-0.453125 0.578125 -0.453125 1.640625q0 1.0625 0.453125 1.65625q0.46875 0.578125 1.3125 0.578125zm7.2027893 -5.265625q1.03125 0 1.546875 0.578125q0.53125 0.578125 0.53125 1.734375l0 3.25q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -3.21875q0 -0.78125 -0.328125 -1.15625q-0.3125 -0.375 -1.0 -0.375q-0.8125 0 -1.296875 0.5q-0.46875 0.484375 -0.46875 1.328125l0 2.921875q0 0.234375 -0.125 0.359375q-0.125 0.125 -0.359375 0.125q-0.234375 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -7.625q0 -0.203125 0.125 -0.328125q0.140625 -0.140625 0.359375 -0.140625q0.234375 0 0.359375 0.125q0.125 0.125 0.125 0.34375l0 3.140625q0.28125 -0.53125 0.796875 -0.796875q0.515625 -0.28125 1.1875 -0.28125zm4.5035706 5.984375q-0.234375 0 -0.375 -0.125q-0.125 -0.140625 -0.125 -0.359375l0 -7.5q0 -0.21875 0.125 -0.34375q0.140625 -0.125 0.375 -0.125l2.34375 0q2.03125 0 3.140625 1.09375q1.109375 1.09375 1.109375 3.125q0 2.03125 -1.125 3.140625q-1.109375 1.09375 -3.125 1.09375l-2.34375 0zm2.28125 -0.84375q3.28125 0 3.28125 -3.390625q0 -3.390625 -3.28125 -3.390625l-1.796875 0l0 6.78125l1.796875 0zm10.461792 -0.515625q0.140625 0 0.234375 0.109375q0.09375 0.109375 0.09375 0.28125q0 0.296875 -0.421875 0.546875q-0.4375 0.25 -0.921875 0.375q-0.46875 0.125 -0.921875 0.125q-1.359375 0 -2.15625 -0.796875q-0.78125 -0.8125 -0.78125 -2.21875q0 -0.90625 0.34375 -1.59375q0.359375 -0.6875 0.984375 -1.0625q0.640625 -0.390625 1.4375 -0.390625q1.140625 0 1.8125 0.75q0.671875 0.734375 0.671875 2.0q0 0.25 -0.09375 0.359375q-0.09375 0.109375 -0.3125 0.109375l-3.859375 0q0.09375 2.0625 1.953125 2.0625q0.46875 0 0.796875 -0.125q0.34375 -0.125 0.71875 -0.34375q0.3125 -0.1875 0.421875 -0.1875zm-2.09375 -3.875q-0.765625 0 -1.234375 0.484375q-0.46875 0.484375 -0.546875 1.359375l3.390625 0q-0.015625 -0.890625 -0.4375 -1.359375q-0.421875 -0.484375 -1.171875 -0.484375zm6.480316 -2.453125q-0.640625 0.046875 -0.96875 0.40625q-0.3125 0.34375 -0.3125 1.046875l0 0.390625l1.328125 0q0.203125 0 0.3125 0.109375q0.109375 0.109375 0.109375 0.28125q0 0.1875 -0.109375 0.28125q-0.109375 0.09375 -0.3125 0.09375l-1.328125 0l0 4.65625q0 0.234375 -0.140625 0.359375q-0.140625 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.140625 -0.125 -0.140625 -0.359375l0 -4.65625l-0.796875 0q-0.203125 0 -0.328125 -0.09375q-0.109375 -0.109375 -0.109375 -0.28125q0 -0.171875 0.109375 -0.28125q0.125 -0.109375 0.328125 -0.109375l0.796875 0l0 -0.21875q0 -1.078125 0.53125 -1.6875q0.546875 -0.625 1.5625 -0.703125l0.3125 -0.015625q0.3125 -0.03125 0.453125 0.0625q0.140625 0.078125 0.140625 0.296875q0 0.34375 -0.421875 0.390625l-0.3125 0.03125z" fill-rule="nonzero"/><path fill="#000000" fill-opacity="0.0" d="m276.85565 232.16667l0 20.377945" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m276.85565 232.16667l0 16.950867" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m276.85565 249.11754l-1.1246033 -1.124588l1.1246033 3.0897675l1.1245728 -3.0897675z" fill-rule="evenodd"/><path fill="#f4cccc" d="m31.874016 68.3563l87.49606 0l0 30.992126l-87.49606 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m31.874016 68.3563l87.49606 0l0 30.992126l-87.49606 0z" fill-rule="evenodd"/><path fill="#000000" d="m67.63894 87.62236q0.171875 0.15625 0.171875 0.359375q0 0.15625 -0.140625 0.296875q-0.140625 0.140625 -0.3125 0.140625q-0.15625 0 -0.328125 -0.140625l-4.484375 -3.921875l0 3.578125q0 0.21875 -0.140625 0.359375q-0.125 0.140625 -0.359375 0.140625q-0.21875 0 -0.359375 -0.140625q-0.125 -0.140625 -0.125 -0.359375l0 -7.578125q0 -0.234375 0.125 -0.359375q0.140625 -0.140625 0.359375 -0.140625q0.234375 0 0.359375 0.140625q0.140625 0.125 0.140625 0.359375l0 3.4375l4.28125 -3.796875q0.125 -0.140625 0.3125 -0.140625q0.171875 0 0.296875 0.140625q0.140625 0.140625 0.140625 0.3125q0 0.171875 -0.15625 0.328125l-3.875 3.421875l4.09375 3.5625zm5.8329315 -0.609375q0.140625 0 0.234375 0.109375q0.09375 0.109375 0.09375 0.28125q0 0.296875 -0.421875 0.546875q-0.4375 0.25 -0.921875 0.375q-0.46875 0.125 -0.921875 0.125q-1.359375 0 -2.15625 -0.796875q-0.78125 -0.8125 -0.78125 -2.21875q0 -0.90625 0.34375 -1.59375q0.359375 -0.6875 0.984375 -1.0625q0.640625 -0.390625 1.4375 -0.390625q1.140625 0 1.8125 0.75q0.671875 0.734375 0.671875 2.0q0 0.25 -0.09375 0.359375q-0.09375 0.109375 -0.3125 0.109375l-3.859375 0q0.09375 2.0625 1.953125 2.0625q0.46875 0 0.796875 -0.125q0.34375 -0.125 0.71875 -0.34375q0.3125 -0.1875 0.421875 -0.1875zm-2.09375 -3.875q-0.765625 0 -1.234375 0.484375q-0.46875 0.484375 -0.546875 1.359375l3.390625 0q-0.015625 -0.890625 -0.4375 -1.359375q-0.421875 -0.484375 -1.171875 -0.484375zm6.792801 -0.734375q0.5 -0.03125 0.5 0.40625q0 0.203125 -0.109375 0.3125q-0.109375 0.109375 -0.375 0.140625l-0.359375 0.03125q-0.796875 0.078125 -1.1875 0.578125q-0.390625 0.484375 -0.390625 1.15625l0 2.921875q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -5.0625q0 -0.234375 0.140625 -0.359375q0.140625 -0.125 0.34375 -0.125q0.1875 0 0.3125 0.125q0.140625 0.125 0.140625 0.34375l0 0.671875q0.25 -0.53125 0.71875 -0.796875q0.46875 -0.28125 1.0625 -0.328125l0.171875 -0.015625zm3.720398 -0.015625q2.203125 0 2.203125 2.296875l0 3.265625q0 0.21875 -0.125 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.140625 -0.125 -0.359375l0 -0.578125q-0.21875 0.515625 -0.6875 0.796875q-0.46875 0.28125 -1.078125 0.28125q-0.5625 0 -1.046875 -0.21875q-0.46875 -0.234375 -0.75 -0.640625q-0.265625 -0.40625 -0.265625 -0.90625q0 -0.65625 0.328125 -1.015625q0.34375 -0.375 1.109375 -0.53125q0.765625 -0.15625 2.125 -0.15625l0.265625 0l0 -0.40625q0 -0.71875 -0.296875 -1.046875q-0.28125 -0.34375 -0.953125 -0.34375q-0.8125 0 -1.65625 0.453125q-0.3125 0.203125 -0.453125 0.203125q-0.140625 0 -0.234375 -0.109375q-0.09375 -0.109375 -0.09375 -0.28125q0 -0.171875 0.09375 -0.296875q0.109375 -0.125 0.328125 -0.25q0.421875 -0.25 0.953125 -0.375q0.546875 -0.140625 1.0625 -0.140625zm-0.390625 5.296875q0.71875 0 1.171875 -0.484375q0.46875 -0.484375 0.46875 -1.25l0 -0.34375l-0.21875 0q-1.046875 0 -1.609375 0.09375q-0.546875 0.078125 -0.78125 0.296875q-0.234375 0.203125 -0.234375 0.609375q0 0.46875 0.34375 0.78125q0.34375 0.296875 0.859375 0.296875zm6.3444214 0.765625q-0.5625 0 -1.0625 -0.125q-0.5 -0.140625 -0.875 -0.375q-0.21875 -0.140625 -0.3125 -0.265625q-0.078125 -0.125 -0.078125 -0.3125q0 -0.15625 0.078125 -0.25q0.09375 -0.109375 0.234375 -0.109375q0.15625 0 0.421875 0.1875q0.359375 0.21875 0.71875 0.34375q0.359375 0.125 0.875 0.125q0.65625 0 1.015625 -0.21875q0.359375 -0.234375 0.359375 -0.671875q0 -0.265625 -0.140625 -0.421875q-0.125 -0.171875 -0.453125 -0.296875q-0.3125 -0.125 -0.9375 -0.25q-1.0625 -0.234375 -1.515625 -0.609375q-0.453125 -0.390625 -0.453125 -1.046875q0 -0.515625 0.28125 -0.90625q0.28125 -0.40625 0.796875 -0.625q0.515625 -0.234375 1.15625 -0.234375q0.46875 0 0.90625 0.125q0.4375 0.125 0.78125 0.34375q0.40625 0.296875 0.40625 0.609375q0 0.15625 -0.09375 0.265625q-0.09375 0.109375 -0.234375 0.109375q-0.140625 0 -0.4375 -0.203125q-0.328125 -0.21875 -0.625 -0.34375q-0.296875 -0.125 -0.75 -0.125q-0.5625 0 -0.90625 0.265625q-0.34375 0.25 -0.34375 0.671875q0 0.25 0.125 0.421875q0.125 0.15625 0.421875 0.28125q0.296875 0.125 0.84375 0.25q0.828125 0.1875 1.265625 0.40625q0.453125 0.203125 0.640625 0.515625q0.203125 0.3125 0.203125 0.796875q0 0.75 -0.640625 1.21875q-0.640625 0.453125 -1.671875 0.453125z" fill-rule="nonzero"/><path fill="#f4cccc" d="m127.74803 68.35761l87.49606 0l0 30.992126l-87.49606 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m127.74803 68.35761l87.49606 0l0 30.992126l-87.49606 0z" fill-rule="evenodd"/><path fill="#000000" d="m147.45874 88.37367q-0.234375 0 -0.375 -0.125q-0.125 -0.140625 -0.125 -0.359375l0 -7.5q0 -0.21875 0.125 -0.34375q0.140625 -0.125 0.375 -0.125l4.484375 0q0.21875 0 0.328125 0.109375q0.125 0.09375 0.125 0.296875q0 0.1875 -0.125 0.296875q-0.109375 0.109375 -0.328125 0.109375l-4.015625 0l0 2.9375l3.78125 0q0.21875 0 0.328125 0.109375q0.125 0.109375 0.125 0.296875q0 0.1875 -0.125 0.296875q-0.109375 0.109375 -0.328125 0.109375l-3.78125 0l0 3.078125l4.015625 0q0.21875 0 0.328125 0.109375q0.125 0.09375 0.125 0.296875q0 0.1875 -0.125 0.296875q-0.109375 0.109375 -0.328125 0.109375l-4.484375 0zm8.31218 0.078125q-0.5625 0 -1.0625 -0.125q-0.5 -0.140625 -0.875 -0.375q-0.21875 -0.140625 -0.3125 -0.265625q-0.078125 -0.125 -0.078125 -0.3125q0 -0.15625 0.078125 -0.25q0.09375 -0.109375 0.234375 -0.109375q0.15625 0 0.421875 0.1875q0.359375 0.21875 0.71875 0.34375q0.359375 0.125 0.875 0.125q0.65625 0 1.015625 -0.21875q0.359375 -0.234375 0.359375 -0.671875q0 -0.265625 -0.140625 -0.421875q-0.125 -0.171875 -0.453125 -0.296875q-0.3125 -0.125 -0.9375 -0.25q-1.0625 -0.234375 -1.515625 -0.609375q-0.453125 -0.390625 -0.453125 -1.046875q0 -0.515625 0.28125 -0.90625q0.28125 -0.40625 0.796875 -0.625q0.515625 -0.234375 1.15625 -0.234375q0.46875 0 0.90625 0.125q0.4375 0.125 0.78125 0.34375q0.40625 0.296875 0.40625 0.609375q0 0.15625 -0.09375 0.265625q-0.09375 0.109375 -0.234375 0.109375q-0.140625 0 -0.4375 -0.203125q-0.328125 -0.21875 -0.625 -0.34375q-0.296875 -0.125 -0.75 -0.125q-0.5625 0 -0.90625 0.265625q-0.34375 0.25 -0.34375 0.671875q0 0.25 0.125 0.421875q0.125 0.15625 0.421875 0.28125q0.296875 0.125 0.84375 0.25q0.828125 0.1875 1.265625 0.40625q0.453125 0.203125 0.640625 0.515625q0.203125 0.3125 0.203125 0.796875q0 0.75 -0.640625 1.21875q-0.640625 0.453125 -1.671875 0.453125zm6.4787903 -0.78125q0.421875 0.03125 0.421875 0.375q0 0.203125 -0.15625 0.3125q-0.140625 0.09375 -0.4375 0.078125l-0.328125 -0.03125q-0.953125 -0.0625 -1.421875 -0.5625q-0.453125 -0.515625 -0.453125 -1.53125l0 -3.015625l-0.796875 0q-0.203125 0 -0.328125 -0.09375q-0.109375 -0.109375 -0.109375 -0.28125q0 -0.171875 0.109375 -0.28125q0.125 -0.109375 0.328125 -0.109375l0.796875 0l0 -1.359375q0 -0.21875 0.125 -0.34375q0.140625 -0.140625 0.375 -0.140625q0.21875 0 0.34375 0.140625q0.140625 0.125 0.140625 0.34375l0 1.359375l1.328125 0q0.1875 0 0.296875 0.109375q0.125 0.109375 0.125 0.28125q0 0.171875 -0.125 0.28125q-0.109375 0.09375 -0.296875 0.09375l-1.328125 0l0 3.0625q0 0.65625 0.265625 0.953125q0.265625 0.296875 0.8125 0.328125l0.3125 0.03125zm1.8769073 0.765625q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -5.0625q0 -0.234375 0.125 -0.359375q0.140625 -0.125 0.359375 -0.125q0.21875 0 0.34375 0.125q0.140625 0.125 0.140625 0.359375l0 5.0625q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125zm0 -7.28125q-0.296875 0 -0.484375 -0.171875q-0.171875 -0.171875 -0.171875 -0.453125q0 -0.25 0.171875 -0.421875q0.1875 -0.171875 0.484375 -0.171875q0.28125 0 0.453125 0.171875q0.1875 0.171875 0.1875 0.421875q0 0.28125 -0.1875 0.453125q-0.171875 0.171875 -0.453125 0.171875zm8.799652 1.234375q1.9375 0 1.9375 2.3125l0 3.25q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.328125 0.125q-0.21875 0 -0.359375 -0.125q-0.140625 -0.125 -0.140625 -0.359375l0 -3.21875q0 -0.8125 -0.296875 -1.171875q-0.28125 -0.359375 -0.890625 -0.359375q-0.734375 0 -1.15625 0.5q-0.421875 0.484375 -0.421875 1.328125l0 2.921875q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -3.21875q0 -0.8125 -0.296875 -1.171875q-0.28125 -0.359375 -0.90625 -0.359375q-0.71875 0 -1.140625 0.5q-0.421875 0.484375 -0.421875 1.328125l0 2.921875q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -5.0625q0 -0.21875 0.125 -0.34375q0.140625 -0.140625 0.359375 -0.140625q0.203125 0 0.328125 0.125q0.140625 0.125 0.140625 0.34375l0 0.578125q0.265625 -0.515625 0.734375 -0.78125q0.46875 -0.28125 1.078125 -0.28125q1.375 0 1.78125 1.140625q0.265625 -0.515625 0.78125 -0.828125q0.515625 -0.3125 1.171875 -0.3125zm6.0990753 0q2.203125 0 2.203125 2.296875l0 3.265625q0 0.21875 -0.125 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.140625 -0.125 -0.359375l0 -0.578125q-0.21875 0.515625 -0.6875 0.796875q-0.46875 0.28125 -1.078125 0.28125q-0.5625 0 -1.046875 -0.21875q-0.46875 -0.234375 -0.75 -0.640625q-0.265625 -0.40625 -0.265625 -0.90625q0 -0.65625 0.328125 -1.015625q0.34375 -0.375 1.109375 -0.53125q0.765625 -0.15625 2.125 -0.15625l0.265625 0l0 -0.40625q0 -0.71875 -0.296875 -1.046875q-0.28125 -0.34375 -0.953125 -0.34375q-0.8125 0 -1.65625 0.453125q-0.3125 0.203125 -0.453125 0.203125q-0.140625 0 -0.234375 -0.109375q-0.09375 -0.109375 -0.09375 -0.28125q0 -0.171875 0.09375 -0.296875q0.109375 -0.125 0.328125 -0.25q0.421875 -0.25 0.953125 -0.375q0.546875 -0.140625 1.0625 -0.140625zm-0.390625 5.296875q0.71875 0 1.171875 -0.484375q0.46875 -0.484375 0.46875 -1.25l0 -0.34375l-0.21875 0q-1.046875 0 -1.609375 0.09375q-0.546875 0.078125 -0.78125 0.296875q-0.234375 0.203125 -0.234375 0.609375q0 0.46875 0.34375 0.78125q0.34375 0.296875 0.859375 0.296875zm7.0631714 -0.015625q0.421875 0.03125 0.421875 0.375q0 0.203125 -0.15625 0.3125q-0.140625 0.09375 -0.4375 0.078125l-0.328125 -0.03125q-0.953125 -0.0625 -1.421875 -0.5625q-0.453125 -0.515625 -0.453125 -1.53125l0 -3.015625l-0.796875 0q-0.203125 0 -0.328125 -0.09375q-0.109375 -0.109375 -0.109375 -0.28125q0 -0.171875 0.109375 -0.28125q0.125 -0.109375 0.328125 -0.109375l0.796875 0l0 -1.359375q0 -0.21875 0.125 -0.34375q0.140625 -0.140625 0.375 -0.140625q0.21875 0 0.34375 0.140625q0.140625 0.125 0.140625 0.34375l0 1.359375l1.328125 0q0.1875 0 0.296875 0.109375q0.125 0.109375 0.125 0.28125q0 0.171875 -0.125 0.28125q-0.109375 0.09375 -0.296875 0.09375l-1.328125 0l0 3.0625q0 0.65625 0.265625 0.953125q0.265625 0.296875 0.8125 0.328125l0.3125 0.03125zm3.8144073 0.78125q-0.828125 0 -1.46875 -0.359375q-0.625 -0.375 -0.96875 -1.0625q-0.34375 -0.703125 -0.34375 -1.609375q0 -0.90625 0.34375 -1.59375q0.34375 -0.703125 0.96875 -1.0625q0.640625 -0.375 1.46875 -0.375q0.828125 0 1.453125 0.375q0.640625 0.359375 0.984375 1.0625q0.34375 0.6875 0.34375 1.59375q0 0.90625 -0.34375 1.609375q-0.34375 0.6875 -0.984375 1.0625q-0.625 0.359375 -1.453125 0.359375zm0 -0.796875q0.859375 0 1.3125 -0.5625q0.46875 -0.578125 0.46875 -1.671875q0 -1.0625 -0.46875 -1.640625q-0.46875 -0.59375 -1.3125 -0.59375q-0.859375 0 -1.328125 0.59375q-0.46875 0.578125 -0.46875 1.640625q0 1.078125 0.453125 1.65625q0.46875 0.578125 1.34375 0.578125zm7.1287994 -5.25q0.5 -0.03125 0.5 0.40625q0 0.203125 -0.109375 0.3125q-0.109375 0.109375 -0.375 0.140625l-0.359375 0.03125q-0.796875 0.078125 -1.1875 0.578125q-0.390625 0.484375 -0.390625 1.15625l0 2.921875q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -5.0625q0 -0.234375 0.140625 -0.359375q0.140625 -0.125 0.34375 -0.125q0.1875 0 0.3125 0.125q0.140625 0.125 0.140625 0.34375l0 0.671875q0.25 -0.53125 0.71875 -0.796875q0.46875 -0.28125 1.0625 -0.328125l0.171875 -0.015625z" fill-rule="nonzero"/><path fill="#f4cccc" d="m233.1076 68.35761l87.49606 0l0 30.992126l-87.49606 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m233.1076 68.35761l87.49606 0l0 30.992126l-87.49606 0z" fill-rule="evenodd"/><path fill="#000000" d="m269.00754 88.46742q-0.90625 0 -1.734375 -0.265625q-0.8125 -0.265625 -1.3125 -0.734375q-0.171875 -0.15625 -0.171875 -0.40625q0 -0.171875 0.09375 -0.296875q0.09375 -0.125 0.234375 -0.125q0.15625 0 0.328125 0.125q1.109375 0.859375 2.546875 0.859375q1.03125 0 1.578125 -0.390625q0.5625 -0.390625 0.5625 -1.125q0 -0.421875 -0.265625 -0.671875q-0.265625 -0.265625 -0.703125 -0.421875q-0.4375 -0.15625 -1.15625 -0.328125q-0.984375 -0.21875 -1.625 -0.46875q-0.625 -0.265625 -1.015625 -0.734375q-0.390625 -0.46875 -0.390625 -1.21875q0 -0.71875 0.390625 -1.265625q0.390625 -0.5625 1.09375 -0.875q0.703125 -0.3125 1.59375 -0.3125q0.84375 0 1.5625 0.265625q0.734375 0.25 1.234375 0.734375q0.1875 0.1875 0.1875 0.421875q0 0.171875 -0.09375 0.296875q-0.09375 0.125 -0.234375 0.125q-0.125 0 -0.34375 -0.140625q-0.59375 -0.46875 -1.09375 -0.65625q-0.5 -0.203125 -1.21875 -0.203125q-0.984375 0 -1.546875 0.421875q-0.546875 0.40625 -0.546875 1.15625q0 0.625 0.484375 0.953125q0.484375 0.3125 1.5 0.5625q1.09375 0.25 1.71875 0.484375q0.625 0.21875 1.03125 0.671875q0.421875 0.4375 0.421875 1.171875q0 0.71875 -0.390625 1.265625q-0.390625 0.53125 -1.109375 0.828125q-0.703125 0.296875 -1.609375 0.296875zm5.0446777 -0.03125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -7.625q0 -0.21875 0.125 -0.34375q0.140625 -0.125 0.359375 -0.125q0.203125 0 0.34375 0.125q0.140625 0.125 0.140625 0.34375l0 7.625q0 0.234375 -0.140625 0.359375q-0.140625 0.125 -0.34375 0.125zm2.784027 0q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -5.0625q0 -0.234375 0.125 -0.359375q0.140625 -0.125 0.359375 -0.125q0.21875 0 0.34375 0.125q0.140625 0.125 0.140625 0.359375l0 5.0625q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125zm0 -7.28125q-0.296875 0 -0.484375 -0.171875q-0.171875 -0.171875 -0.171875 -0.453125q0 -0.25 0.171875 -0.421875q0.1875 -0.171875 0.484375 -0.171875q0.28125 0 0.453125 0.171875q0.1875 0.171875 0.1875 0.421875q0 0.28125 -0.1875 0.453125q-0.171875 0.171875 -0.453125 0.171875zm8.799652 1.234375q1.9375 0 1.9375 2.3125l0 3.25q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.328125 0.125q-0.21875 0 -0.359375 -0.125q-0.140625 -0.125 -0.140625 -0.359375l0 -3.21875q0 -0.8125 -0.296875 -1.171875q-0.28125 -0.359375 -0.890625 -0.359375q-0.734375 0 -1.15625 0.5q-0.421875 0.484375 -0.421875 1.328125l0 2.921875q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -3.21875q0 -0.8125 -0.296875 -1.171875q-0.28125 -0.359375 -0.90625 -0.359375q-0.71875 0 -1.140625 0.5q-0.421875 0.484375 -0.421875 1.328125l0 2.921875q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -5.0625q0 -0.21875 0.125 -0.34375q0.140625 -0.140625 0.359375 -0.140625q0.203125 0 0.328125 0.125q0.140625 0.125 0.140625 0.34375l0 0.578125q0.265625 -0.515625 0.734375 -0.78125q0.46875 -0.28125 1.078125 -0.28125q1.375 0 1.78125 1.140625q0.265625 -0.515625 0.78125 -0.828125q0.515625 -0.3125 1.171875 -0.3125z" fill-rule="nonzero"/><path fill="#d9ead3" d="m282.5035 134.76706l87.49606 0l0 30.992126l-87.49606 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m282.5035 134.76706l87.49606 0l0 30.992126l-87.49606 0z" fill-rule="evenodd"/><path fill="#000000" d="m297.8283 154.87688q-1.1875 0 -2.0625 -0.515625q-0.875 -0.53125 -1.359375 -1.5q-0.46875 -0.984375 -0.46875 -2.3125q0 -1.328125 0.46875 -2.296875q0.484375 -0.984375 1.359375 -1.5q0.875 -0.53125 2.0625 -0.53125q0.8125 0 1.515625 0.265625q0.71875 0.25 1.25 0.734375q0.1875 0.1875 0.1875 0.421875q0 0.171875 -0.09375 0.296875q-0.09375 0.125 -0.21875 0.125q-0.15625 0 -0.359375 -0.140625q-0.609375 -0.46875 -1.109375 -0.65625q-0.5 -0.203125 -1.140625 -0.203125q-1.390625 0 -2.140625 0.90625q-0.75 0.90625 -0.75 2.578125q0 1.671875 0.75 2.578125q0.75 0.90625 2.140625 0.90625q0.640625 0 1.140625 -0.1875q0.5 -0.1875 1.109375 -0.671875q0.203125 -0.125 0.359375 -0.125q0.125 0 0.21875 0.125q0.09375 0.109375 0.09375 0.296875q0 0.234375 -0.1875 0.40625q-0.53125 0.484375 -1.25 0.75q-0.703125 0.25 -1.515625 0.25zm7.358429 -6.078125q1.03125 0 1.546875 0.578125q0.53125 0.578125 0.53125 1.734375l0 3.25q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -3.21875q0 -0.78125 -0.328125 -1.15625q-0.3125 -0.375 -1.0 -0.375q-0.8125 0 -1.296875 0.5q-0.46875 0.484375 -0.46875 1.328125l0 2.921875q0 0.234375 -0.125 0.359375q-0.125 0.125 -0.359375 0.125q-0.234375 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -7.625q0 -0.203125 0.125 -0.328125q0.140625 -0.140625 0.359375 -0.140625q0.234375 0 0.359375 0.125q0.125 0.125 0.125 0.34375l0 3.140625q0.28125 -0.53125 0.796875 -0.796875q0.515625 -0.28125 1.1875 -0.28125zm8.37854 4.625q0.140625 0 0.234375 0.109375q0.09375 0.109375 0.09375 0.28125q0 0.296875 -0.421875 0.546875q-0.4375 0.25 -0.921875 0.375q-0.46875 0.125 -0.921875 0.125q-1.359375 0 -2.15625 -0.796875q-0.78125 -0.8125 -0.78125 -2.21875q0 -0.90625 0.34375 -1.59375q0.359375 -0.6875 0.984375 -1.0625q0.640625 -0.390625 1.4375 -0.390625q1.140625 0 1.8125 0.75q0.671875 0.734375 0.671875 2.0q0 0.25 -0.09375 0.359375q-0.09375 0.109375 -0.3125 0.109375l-3.859375 0q0.09375 2.0625 1.953125 2.0625q0.46875 0 0.796875 -0.125q0.34375 -0.125 0.71875 -0.34375q0.3125 -0.1875 0.421875 -0.1875zm-2.09375 -3.875q-0.765625 0 -1.234375 0.484375q-0.46875 0.484375 -0.546875 1.359375l3.390625 0q-0.015625 -0.890625 -0.4375 -1.359375q-0.421875 -0.484375 -1.171875 -0.484375zm6.308441 5.3125q-0.8125 0 -1.453125 -0.359375q-0.625 -0.375 -0.96875 -1.0625q-0.34375 -0.6875 -0.34375 -1.578125q0 -0.90625 0.359375 -1.59375q0.359375 -0.703125 0.984375 -1.078125q0.640625 -0.390625 1.46875 -0.390625q0.453125 0 0.90625 0.125q0.453125 0.125 0.78125 0.359375q0.21875 0.140625 0.3125 0.28125q0.09375 0.140625 0.09375 0.3125q0 0.171875 -0.09375 0.28125q-0.09375 0.09375 -0.234375 0.09375q-0.078125 0 -0.1875 -0.046875q-0.09375 -0.046875 -0.15625 -0.09375q-0.0625 -0.046875 -0.09375 -0.0625q-0.3125 -0.203125 -0.59375 -0.3125q-0.28125 -0.125 -0.6875 -0.125q-0.875 0 -1.359375 0.59375q-0.484375 0.59375 -0.484375 1.65625q0 1.046875 0.484375 1.625q0.484375 0.578125 1.359375 0.578125q0.40625 0 0.703125 -0.109375q0.296875 -0.125 0.59375 -0.328125q0.140625 -0.09375 0.25 -0.15625q0.125 -0.0625 0.203125 -0.0625q0.140625 0 0.21875 0.109375q0.09375 0.109375 0.09375 0.28125q0 0.15625 -0.09375 0.28125q-0.078125 0.125 -0.296875 0.28125q-0.34375 0.234375 -0.8125 0.375q-0.46875 0.125 -0.953125 0.125zm7.998047 -0.84375q0.203125 0.171875 0.203125 0.375q0 0.1875 -0.125 0.328125q-0.125 0.125 -0.3125 0.125q-0.15625 0 -0.328125 -0.140625l-3.125 -2.703125l0 2.359375q0 0.234375 -0.140625 0.359375q-0.140625 0.125 -0.34375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -7.625q0 -0.21875 0.125 -0.34375q0.140625 -0.125 0.359375 -0.125q0.203125 0 0.34375 0.125q0.140625 0.125 0.140625 0.34375l0 4.875l2.859375 -2.625q0.15625 -0.140625 0.328125 -0.140625q0.1875 0 0.3125 0.140625q0.140625 0.125 0.140625 0.296875q0 0.203125 -0.171875 0.359375l-2.375 2.109375l2.59375 2.265625zm4.2812805 -5.21875q0.765625 0 1.34375 0.390625q0.59375 0.375 0.921875 1.0625q0.328125 0.6875 0.328125 1.609375q0 0.90625 -0.328125 1.59375q-0.328125 0.671875 -0.90625 1.046875q-0.578125 0.359375 -1.359375 0.359375q-0.6875 0 -1.203125 -0.296875q-0.5 -0.296875 -0.765625 -0.84375l0 2.8125q0 0.21875 -0.125 0.34375q-0.125 0.125 -0.359375 0.125q-0.234375 0 -0.359375 -0.140625q-0.125 -0.125 -0.125 -0.328125l0 -7.234375q0 -0.21875 0.125 -0.34375q0.125 -0.140625 0.359375 -0.140625q0.234375 0 0.359375 0.140625q0.125 0.125 0.125 0.34375l0 0.640625q0.265625 -0.546875 0.765625 -0.84375q0.515625 -0.296875 1.203125 -0.296875zm-0.203125 5.265625q0.859375 0 1.328125 -0.578125q0.46875 -0.578125 0.46875 -1.625q0 -1.0625 -0.46875 -1.65625q-0.46875 -0.59375 -1.328125 -0.59375q-0.84375 0 -1.3125 0.578125q-0.453125 0.578125 -0.453125 1.640625q0 1.0625 0.453125 1.65625q0.46875 0.578125 1.3125 0.578125zm6.67157 0.796875q-0.828125 0 -1.46875 -0.359375q-0.625 -0.375 -0.96875 -1.0625q-0.34375 -0.703125 -0.34375 -1.609375q0 -0.90625 0.34375 -1.59375q0.34375 -0.703125 0.96875 -1.0625q0.640625 -0.375 1.46875 -0.375q0.828125 0 1.453125 0.375q0.640625 0.359375 0.984375 1.0625q0.34375 0.6875 0.34375 1.59375q0 0.90625 -0.34375 1.609375q-0.34375 0.6875 -0.984375 1.0625q-0.625 0.359375 -1.453125 0.359375zm0 -0.796875q0.859375 0 1.3125 -0.5625q0.46875 -0.578125 0.46875 -1.671875q0 -1.0625 -0.46875 -1.640625q-0.46875 -0.59375 -1.3125 -0.59375q-0.859375 0 -1.328125 0.59375q-0.46875 0.578125 -0.46875 1.640625q0 1.078125 0.453125 1.65625q0.46875 0.578125 1.34375 0.578125zm4.722534 0.78125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -5.0625q0 -0.234375 0.125 -0.359375q0.140625 -0.125 0.359375 -0.125q0.21875 0 0.34375 0.125q0.140625 0.125 0.140625 0.359375l0 5.0625q0 0.234375 -0.140625 0.359375q-0.125 0.125 -0.34375 0.125zm0 -7.28125q-0.296875 0 -0.484375 -0.171875q-0.171875 -0.171875 -0.171875 -0.453125q0 -0.25 0.171875 -0.421875q0.1875 -0.171875 0.484375 -0.171875q0.28125 0 0.453125 0.171875q0.1875 0.171875 0.1875 0.421875q0 0.28125 -0.1875 0.453125q-0.171875 0.171875 -0.453125 0.171875zm5.237152 1.234375q2.09375 0 2.09375 2.3125l0 3.25q0 0.234375 -0.125 0.359375q-0.125 0.125 -0.359375 0.125q-0.21875 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -3.1875q0 -0.8125 -0.328125 -1.1875q-0.3125 -0.375 -1.0 -0.375q-0.8125 0 -1.296875 0.5q-0.46875 0.484375 -0.46875 1.328125l0 2.921875q0 0.234375 -0.125 0.359375q-0.125 0.125 -0.359375 0.125q-0.234375 0 -0.359375 -0.125q-0.125 -0.125 -0.125 -0.359375l0 -5.0625q0 -0.21875 0.125 -0.34375q0.125 -0.140625 0.359375 -0.140625q0.21875 0 0.34375 0.140625q0.125 0.125 0.125 0.328125l0 0.609375q0.28125 -0.53125 0.796875 -0.8125q0.53125 -0.28125 1.1875 -0.28125zm6.5660706 5.28125q0.421875 0.03125 0.421875 0.375q0 0.203125 -0.15625 0.3125q-0.140625 0.09375 -0.4375 0.078125l-0.328125 -0.03125q-0.953125 -0.0625 -1.421875 -0.5625q-0.453125 -0.515625 -0.453125 -1.53125l0 -3.015625l-0.796875 0q-0.203125 0 -0.328125 -0.09375q-0.109375 -0.109375 -0.109375 -0.28125q0 -0.171875 0.109375 -0.28125q0.125 -0.109375 0.328125 -0.109375l0.796875 0l0 -1.359375q0 -0.21875 0.125 -0.34375q0.140625 -0.140625 0.375 -0.140625q0.21875 0 0.34375 0.140625q0.140625 0.125 0.140625 0.34375l0 1.359375l1.328125 0q0.1875 0 0.296875 0.109375q0.125 0.109375 0.125 0.28125q0 0.171875 -0.125 0.28125q-0.109375 0.09375 -0.296875 0.09375l-1.328125 0l0 3.0625q0 0.65625 0.265625 0.953125q0.265625 0.296875 0.8125 0.328125l0.3125 0.03125zm3.361267 0.78125q-0.5625 0 -1.0625 -0.125q-0.5 -0.140625 -0.875 -0.375q-0.21875 -0.140625 -0.3125 -0.265625q-0.078125 -0.125 -0.078125 -0.3125q0 -0.15625 0.078125 -0.25q0.09375 -0.109375 0.234375 -0.109375q0.15625 0 0.421875 0.1875q0.359375 0.21875 0.71875 0.34375q0.359375 0.125 0.875 0.125q0.65625 0 1.015625 -0.21875q0.359375 -0.234375 0.359375 -0.671875q0 -0.265625 -0.140625 -0.421875q-0.125 -0.171875 -0.453125 -0.296875q-0.3125 -0.125 -0.9375 -0.25q-1.0625 -0.234375 -1.515625 -0.609375q-0.453125 -0.390625 -0.453125 -1.046875q0 -0.515625 0.28125 -0.90625q0.28125 -0.40625 0.796875 -0.625q0.515625 -0.234375 1.15625 -0.234375q0.46875 0 0.90625 0.125q0.4375 0.125 0.78125 0.34375q0.40625 0.296875 0.40625 0.609375q0 0.15625 -0.09375 0.265625q-0.09375 0.109375 -0.234375 0.109375q-0.140625 0 -0.4375 -0.203125q-0.328125 -0.21875 -0.625 -0.34375q-0.296875 -0.125 -0.75 -0.125q-0.5625 0 -0.90625 0.265625q-0.34375 0.25 -0.34375 0.671875q0 0.25 0.125 0.421875q0.125 0.15625 0.421875 0.28125q0.296875 0.125 0.84375 0.25q0.828125 0.1875 1.265625 0.40625q0.453125 0.203125 0.640625 0.515625q0.203125 0.3125 0.203125 0.796875q0 0.75 -0.640625 1.21875q-0.640625 0.453125 -1.671875 0.453125z" fill-rule="nonzero"/><path fill="#000000" fill-opacity="0.0" d="m233.1085 268.03217l-66.74016 0" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" stroke-dasharray="4.0,3.0" d="m233.10852 268.03217l-63.313095 0" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m169.79543 268.03217l1.124588 -1.1246033l-3.0897675 1.1246033l3.0897675 1.1245728z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m75.62205 99.34843l0 19.652092l46.992126 0l0 133.54475" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m75.62205 99.34843l0 19.652084l46.992126 0l0 130.11768" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m122.614174 249.1182l-1.124588 -1.124588l1.124588 3.0897675l1.1245804 -3.0897675z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m171.49606 99.34974l0 19.650558l-48.88189 0l0 133.5463" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m171.49606 99.34974l0 19.650558l-48.88189 0l0 130.1192" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m122.614174 249.1195l-1.124588 -1.124588l1.124588 3.0897675l1.1245804 -3.0897675z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m276.85565 99.34974l0 17.70874l-42.960632 0l0 17.724327" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m276.85565 99.34974l0 17.70874l-42.960632 0l0 14.297249" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m233.89502 131.35573l-1.124588 -1.124588l1.124588 3.0897675l1.1245728 -3.0897675z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m276.85565 99.34974l0 17.70874l49.385803 0l0 17.724327" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m276.85565 99.34974l0 17.70874l49.385803 0l0 14.297249" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m326.24146 131.35573l-1.1245728 -1.124588l1.1245728 3.0897675l1.1246033 -3.0897675z" fill-rule="evenodd"/><path fill="#c9daf8" d="m548.5407 235.66077l87.49603 0l0 30.992126l-87.49603 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m548.5407 235.66077l87.49603 0l0 30.992126l-87.49603 0z" fill-rule="evenodd"/><path fill="#000000" d="m579.47955 247.1612q0.203125 0 0.328125 0.140625q0.125 0.125 0.125 0.359375l0 7.578125q0 0.21875 -0.125 0.359375q-0.125 0.140625 -0.359375 0.140625q-0.234375 0 -0.390625 -0.203125l-4.984375 -6.65625l0 6.359375q0 0.21875 -0.125 0.359375q-0.125 0.140625 -0.34375 0.140625q-0.21875 0 -0.34375 -0.140625q-0.109375 -0.140625 -0.109375 -0.359375l0 -7.578125q0 -0.234375 0.125 -0.359375q0.125 -0.140625 0.359375 -0.140625q0.234375 0 0.40625 0.203125l4.96875 6.65625l0 -6.359375q0 -0.234375 0.125 -0.359375q0.125 -0.140625 0.34375 -0.140625zm8.868103 0q0.203125 0 0.328125 0.140625q0.125 0.125 0.125 0.359375l0 7.578125q0 0.21875 -0.125 0.359375q-0.125 0.140625 -0.359375 0.140625q-0.234375 0 -0.390625 -0.203125l-4.984375 -6.65625l0 6.359375q0 0.21875 -0.125 0.359375q-0.125 0.140625 -0.34375 0.140625q-0.21875 0 -0.34375 -0.140625q-0.109375 -0.140625 -0.109375 -0.359375l0 -7.578125q0 -0.234375 0.125 -0.359375q0.125 -0.140625 0.359375 -0.140625q0.234375 0 0.40625 0.203125l4.96875 6.65625l0 -6.359375q0 -0.234375 0.125 -0.359375q0.125 -0.140625 0.34375 -0.140625zm12.917175 7.953125q0.046875 0.09375 0.046875 0.203125q0 0.171875 -0.140625 0.296875q-0.140625 0.125 -0.328125 0.125q-0.296875 0 -0.421875 -0.296875l-0.84375 -1.9375l-4.53125 0l-0.859375 1.9375q-0.125 0.296875 -0.421875 0.296875q-0.1875 0 -0.34375 -0.125q-0.140625 -0.125 -0.140625 -0.3125q0 -0.09375 0.046875 -0.1875l3.4375 -7.640625q0.078125 -0.15625 0.21875 -0.234375q0.140625 -0.09375 0.3125 -0.09375q0.171875 0 0.3125 0.09375q0.15625 0.078125 0.21875 0.234375l3.4375 7.640625zm-5.859375 -2.421875l3.8125 0l-1.90625 -4.3125l-1.90625 4.3125zm7.78656 3.046875q-0.21875 0 -0.359375 -0.140625q-0.125 -0.140625 -0.125 -0.359375l0 -7.546875q0 -0.21875 0.125 -0.34375q0.140625 -0.125 0.375 -0.125l2.84375 0q1.328125 0 2.0625 0.65625q0.75 0.640625 0.75 1.828125q0 1.1875 -0.75 1.84375q-0.734375 0.65625 -2.0625 0.65625l-2.359375 0l0 3.03125q0 0.21875 -0.140625 0.359375q-0.125 0.140625 -0.359375 0.140625zm2.765625 -4.34375q1.9375 0 1.9375 -1.6875q0 -1.671875 -1.9375 -1.671875l-2.265625 0l0 3.359375l2.265625 0zm4.9744263 4.34375q-0.21875 0 -0.359375 -0.140625q-0.125 -0.140625 -0.125 -0.359375l0 -7.578125q0 -0.234375 0.125 -0.359375q0.140625 -0.140625 0.359375 -0.140625q0.234375 0 0.359375 0.140625q0.140625 0.125 0.140625 0.359375l0 7.578125q0 0.21875 -0.140625 0.359375q-0.125 0.140625 -0.359375 0.140625z" fill-rule="nonzero"/><path fill="#c9daf8" d="m548.5407 193.79199l87.49603 0l0 30.992126l-87.49603 0z" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m548.5407 193.79199l87.49603 0l0 30.992126l-87.49603 0z" fill-rule="evenodd"/><path fill="#000000" d="m589.5417 213.87056q-0.28125 0 -0.484375 -0.1875q-0.1875 -0.1875 -0.1875 -0.484375q0 -0.296875 0.1875 -0.484375q0.203125 -0.203125 0.484375 -0.203125q0.28125 0 0.46875 0.203125q0.1875 0.1875 0.1875 0.484375q0 0.296875 -0.1875 0.484375q-0.1875 0.1875 -0.46875 0.1875zm2.7480469 0q-0.28125 0 -0.484375 -0.1875q-0.1875 -0.1875 -0.1875 -0.484375q0 -0.296875 0.1875 -0.484375q0.203125 -0.203125 0.484375 -0.203125q0.28125 0 0.46875 0.203125q0.1875 0.1875 0.1875 0.484375q0 0.296875 -0.1875 0.484375q-0.1875 0.1875 -0.46875 0.1875zm2.7479858 0q-0.28125 0 -0.484375 -0.1875q-0.1875 -0.1875 -0.1875 -0.484375q0 -0.296875 0.1875 -0.484375q0.203125 -0.203125 0.484375 -0.203125q0.28125 0 0.46875 0.203125q0.1875 0.1875 0.1875 0.484375q0 0.296875 -0.1875 0.484375q-0.1875 0.1875 -0.46875 0.1875z" fill-rule="nonzero"/><path fill="#000000" fill-opacity="0.0" d="m122.620316 283.52823l0 14.9730835l75.49606 0l0 20.90091" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m122.620316 283.52823l0 14.9730835l75.49608 0l0 17.473846" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m198.1164 315.97516l-1.124588 -1.1246033l1.124588 3.0897827l1.1245728 -3.0897827z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m276.85654 283.52823l0 14.9730835l-78.74016 0l0 20.90091" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m276.85654 283.52823l0 14.9730835l-78.74014 0l0 17.473846" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m198.1164 315.97516l-1.124588 -1.1246033l1.124588 3.0897827l1.1245728 -3.0897827z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m500.5223 334.89435l24.009003 0l0 0.06298828l24.022522 0" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m500.5223 334.89435l24.009003 0l0 0.06298828l20.595398 0" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m545.1267 334.95734l-1.1245728 1.1246033l3.0897827 -1.1246033l-3.0897827 -1.1245728z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m500.5223 334.89435l24.009003 0l0 -41.858246l24.022522 0" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m500.5223 334.89435l24.009003 0l0 -41.858246l20.595398 0" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m545.1267 293.0361l-1.1245728 1.1245728l3.0897827 -1.1245728l-3.0897827 -1.1246033z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m500.5223 334.89435l24.009003 0l0 -83.74802l24.022522 0" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m500.5223 334.89435l24.009003 0l0 -83.74802l20.595398 0" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m545.1267 251.14633l-1.1245728 1.1245728l3.0897827 -1.1245728l-3.0897827 -1.124588z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m500.5223 334.89435l24.009003 0l0 -125.60629l24.022522 0" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m500.5223 334.89435l24.009003 0l0 -125.60629l20.595398 0" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m545.1267 209.28806l-1.1245728 1.124588l3.0897827 -1.124588l-3.0897827 -1.124588z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m233.88803 165.75919l0 17.70752l42.960632 0l0 17.694061" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m233.88805 165.75919l0 17.70752l42.960617 0l0 14.266968" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m276.84866 197.73367l-1.1245728 -1.124588l1.1245728 3.0897675l1.1246033 -3.0897675z" fill-rule="evenodd"/><path fill="#000000" fill-opacity="0.0" d="m326.25156 165.75919l0 17.70752l-49.385834 0l0 17.694061" fill-rule="evenodd"/><path stroke="#000000" stroke-width="1.0" stroke-linejoin="round" stroke-linecap="butt" d="m326.25156 165.75919l0 17.70752l-49.385834 0l0 14.266968" fill-rule="evenodd"/><path fill="#000000" stroke="#000000" stroke-width="1.0" stroke-linecap="butt" d="m276.86572 197.73367l-1.1245728 -1.124588l1.1245728 3.0897675l1.1246033 -3.0897675z" fill-rule="evenodd"/></g></svg>
\ No newline at end of file
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/convert_pure_conv_to_depthwise.cc b/tensorflow/contrib/lite/toco/graph_transformations/convert_pure_conv_to_depthwise.cc
index d38db85280d7bd935a47cda70227d383a513fbac..0fffab574ddd8ad75ec07ae4442f363a36ed289e 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/convert_pure_conv_to_depthwise.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/convert_pure_conv_to_depthwise.cc
@@ -33,6 +33,11 @@ bool ConvertPureConvToDepthwise::Run(Model* model, std::size_t op_index) {
   if (conv_op->stride_width != conv_op->stride_height) {
     return false;
   }
+  if ((conv_op->dilation_width_factor != 1) ||
+      (conv_op->dilation_height_factor != 1)) {
+    // Depthwise conv does not support dilation
+    return false;
+  }
   auto& weights_array = model->GetArray(conv_op->inputs[1]);
   if (!weights_array.buffer) {
     // Yield until the weights are resolved as a constant array.
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h b/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h
index 640afc7c74d7284fb9e212ab23d74a8215314add..27c5044bb3e06e4a052ff0c4984226fb9d113f95 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h
+++ b/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h
@@ -128,6 +128,7 @@ DECLARE_GRAPH_TRANSFORMATION(IdentifyL2Pool)
 DECLARE_GRAPH_TRANSFORMATION(IdentifyLstmCell)
 DECLARE_GRAPH_TRANSFORMATION(SplitLstmCellInputs)
 DECLARE_GRAPH_TRANSFORMATION(MergeLstmCellInputs)
+DECLARE_GRAPH_TRANSFORMATION(MergeReshapeIntoPrecedingTranspose)
 DECLARE_GRAPH_TRANSFORMATION(IdentifyRelu1)
 DECLARE_GRAPH_TRANSFORMATION(IdentifyPRelu)
 DECLARE_GRAPH_TRANSFORMATION(IdentifyDilatedConv)
@@ -152,7 +153,8 @@ DECLARE_GRAPH_TRANSFORMATION(ResolveConstantUnaryOperator)
 DECLARE_GRAPH_TRANSFORMATION(CreateIm2colArrays)
 DECLARE_GRAPH_TRANSFORMATION(DropIm2colArrays)
 DECLARE_GRAPH_TRANSFORMATION(ReadFakeQuantMinMax)
-DECLARE_GRAPH_TRANSFORMATION(ReorderActivationFunctions)
+DECLARE_GRAPH_TRANSFORMATION(ReorderElementwiseUnary)
+DECLARE_GRAPH_TRANSFORMATION(ReorderReshapeTranspose)
 DECLARE_GRAPH_TRANSFORMATION(ResolveReorderAxes)
 DECLARE_GRAPH_TRANSFORMATION(ResolveTensorFlowConcat)
 DECLARE_GRAPH_TRANSFORMATION(ResolveTensorFlowMatMul)
@@ -173,6 +175,7 @@ DECLARE_GRAPH_TRANSFORMATION(ResolveStridedSliceAttributes)
 DECLARE_GRAPH_TRANSFORMATION(ResolveSliceAttributes)
 DECLARE_GRAPH_TRANSFORMATION(ResolveMeanAttributes)
 DECLARE_GRAPH_TRANSFORMATION(ResolveTransposeAttributes)
+DECLARE_GRAPH_TRANSFORMATION(ResolveConstantRandomUniform)
 DECLARE_GRAPH_TRANSFORMATION(ResolveConstantRange)
 DECLARE_GRAPH_TRANSFORMATION(ResolveConstantShapeOrRank)
 DECLARE_GRAPH_TRANSFORMATION(ResolveConstantStack)
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/hardcode_min_max.cc b/tensorflow/contrib/lite/toco/graph_transformations/hardcode_min_max.cc
index 5cc82da5d544846cc095046ceccf0664525aae41..437e30a91803bfc847bf246875fa2924b7c0d3fe 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/hardcode_min_max.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/hardcode_min_max.cc
@@ -95,30 +95,37 @@ bool HardcodeMinMaxForConcatenation(Model* model, Operator* op) {
   overall_minmax.min = overall_min;
   overall_minmax.max = overall_max;
   bool changed = false;
-  for (const auto& input : op->inputs) {
-    auto& array = model->GetArray(input);
-    if (!array.minmax) {
-      changed = true;
-    } else if (!(overall_minmax == array.GetMinMax())) {
-      changed = true;
-      LOG(WARNING)
-          << "Tweaking the MinMax of array " << input << ", which is "
-          << "an input to " << LogName(*op) << ", because we want all inputs "
-          << "and outputs of a Concatenation operator to have the same MinMax "
-          << "so that it can be implemented as a pure byte-copy, no "
-             "arithmetic.";
+  if (model->flags.change_concat_input_ranges()) {
+    for (const auto& input : op->inputs) {
+      auto& array = model->GetArray(input);
+      if (!array.minmax) {
+        changed = true;
+      } else if (!(overall_minmax == array.GetMinMax())) {
+        changed = true;
+        LOG(WARNING)
+            << "Tweaking the MinMax of array " << input << ", which is "
+            << "an input to " << LogName(*op) << ", because we want all inputs "
+            << "and outputs of a Concatenation operator to have the same "
+            << "MinMax so that it can be implemented as a pure byte-copy, no "
+               "arithmetic.";
+      }
+      array.GetOrCreateMinMax() = overall_minmax;
     }
-    array.GetOrCreateMinMax() = overall_minmax;
   }
   if (!output.minmax) {
     changed = true;
   } else if (!(overall_minmax == output.GetMinMax())) {
-    changed = true;
-    LOG(WARNING)
-        << "Tweaking the MinMax of the output array of " << LogName(*op)
-        << ", because we want all inputs "
-        << "and outputs of a Concatenation operator to have the same MinMax "
-        << "so that it can be implemented as a pure byte-copy, no arithmetic.";
+    if (model->flags.change_concat_input_ranges()) {
+      changed = true;
+      LOG(WARNING)
+          << "Tweaking the MinMax of the output array of " << LogName(*op)
+          << ", because we want all inputs "
+          << "and outputs of a Concatenation operator to have the same MinMax "
+          << "so that it can be implemented as a pure byte-copy, no "
+          << "arithmetic.";
+    } else {
+      return false;
+    }
   }
   output.GetOrCreateMinMax() = overall_minmax;
 
@@ -223,8 +230,11 @@ bool PropagateMinMaxAmongArrays(Model* model,
     if (array.minmax) {
       CHECK(*array.minmax == *reference_minmax)
           << "Both the following arrays have minmax, and they disagree: "
-          << reference_array_name << " and " << array_name
-          << ". Expected that either only one of them would have minmax, or at "
+          << reference_array_name << " (" << reference_minmax->min << ","
+          << reference_minmax->max << ") and " << array_name << " ("
+          << array.minmax->min << "," << array.minmax->max
+          << "). Expected that either only one of them would have minmax, or "
+             "at "
              "least that they would agree.";
     } else {
       array.GetOrCreateMinMax() = *reference_minmax;
@@ -332,6 +342,7 @@ bool HardcodeMinMax::Run(Model* model, std::size_t op_index) {
     case OperatorType::kPad:
     case OperatorType::kGather:
     case OperatorType::kTranspose:
+    case OperatorType::kMean:
       changed = HardcodeMinMaxFromFirstInput(model, op);
       break;
 
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/make_initial_dequantize_operator.cc b/tensorflow/contrib/lite/toco/graph_transformations/make_initial_dequantize_operator.cc
index 935da9f966ca63095faa17476be3a559d1a0193a..183b3d3f2e0ac74099236817e1b6cf5390f2d920 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/make_initial_dequantize_operator.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/make_initial_dequantize_operator.cc
@@ -78,15 +78,21 @@ bool AddDequantizeOperatorToInput(const string& input_name, const Operator* op,
   image_input_op->outputs = {dequantized_input_name};
   model->operators.emplace(model->operators.begin(), image_input_op);
 
-  CHECK(input_array.final_data_type == ArrayDataType::kUint8);
-  input_array.data_type = ArrayDataType::kUint8;
   dequantized_input_array.data_type = ArrayDataType::kFloat;
   const auto& input_minmax = input_array.GetMinMax();
   auto& dequantized_input_minmax = dequantized_input_array.GetOrCreateMinMax();
   dequantized_input_minmax = input_minmax;
   auto& input_qparams = input_array.GetOrCreateQuantizationParams();
-  GetQuantizationParamsFromMinMax<ArrayDataType::kUint8>(input_minmax,
-                                                         &input_qparams);
+  input_array.data_type = input_array.final_data_type;
+  if (input_array.data_type == ArrayDataType::kUint8) {
+    GetQuantizationParamsFromMinMax<ArrayDataType::kUint8>(input_minmax,
+                                                           &input_qparams);
+  } else if (input_array.data_type == ArrayDataType::kInt16) {
+    GetQuantizationParamsFromMinMax<ArrayDataType::kInt16>(input_minmax,
+                                                           &input_qparams);
+  } else {
+    LOG(FATAL) << "unhandled data type";
+  }
 
   transformation->AddMessageF(
       "Created %s"
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/merge_reshape_into_preceding_transpose.cc b/tensorflow/contrib/lite/toco/graph_transformations/merge_reshape_into_preceding_transpose.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5065004093434475172a39efdcfd26c10c49148b
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/merge_reshape_into_preceding_transpose.cc
@@ -0,0 +1,190 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <algorithm>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/graph_transformations/remove_trivial_passthrough.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/runtime/types.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+namespace {
+
+bool OperatorReady(const Model& model, const Operator* op) {
+  if (!model.HasArray(op->inputs[0]) || !model.HasArray(op->inputs[1]) ||
+      !model.HasArray(op->outputs[0])) {
+    // Arrays are missing.
+    return false;
+  }
+
+  if (!model.GetArray(op->inputs[0]).has_shape() ||
+      !model.GetArray(op->outputs[0]).has_shape()) {
+    // Input and output needs the shape.
+    return false;
+  }
+
+  if (!model.GetArray(op->inputs[1]).buffer) {
+    // Buffer needs to be a constant.
+    return false;
+  }
+
+  return true;
+}
+
+// Returns whether the reshape could be a transpose.
+std::vector<int32> ReshapeToTranspose(const Model& model,
+                                      const TensorFlowReshapeOperator* op) {
+  CHECK(!op->shape.empty());
+  CHECK(model.HasArray(op->inputs[0]));
+  CHECK(model.HasArray(op->outputs[0]));
+
+  const auto& input_array = model.GetArray(op->inputs[0]);
+  const auto& output_array = model.GetArray(op->outputs[0]);
+
+  CHECK(input_array.has_shape());
+  CHECK(output_array.has_shape());
+
+  std::vector<int> in_shape = input_array.shape().dims();
+  std::vector<int> out_shape = output_array.shape().dims();
+
+  std::vector<int> one_indices;
+  std::vector<int> not_one_indices;
+
+  // Separate into one indices and not one indices.
+  for (int i = 0; i < in_shape.size(); i++) {
+    if (in_shape[i] == 1) {
+      one_indices.push_back(i);
+    } else {
+      not_one_indices.push_back(i);
+    }
+  }
+
+  // Reorder the vertices.
+  std::vector<int> perm;
+  perm.reserve(in_shape.size());
+  int one_index = 0;
+  int not_one_index = 0;
+  for (const auto val : out_shape) {
+    if (val == 1) {
+      perm.push_back(one_indices[one_index]);
+      one_index++;
+    } else {
+      perm.push_back(not_one_indices[not_one_index]);
+      not_one_index++;
+    }
+  }
+
+  return perm;
+}
+
+}  // namespace
+
+// When a transpose is fed into a reshape, it is possible for the two operators
+// to be merged if the reshape does not affect memory ordering and does not
+// affects the number of dimensions. This only occurs when only unary dimensions
+// are shifting position.
+bool MergeReshapeIntoPrecedingTranspose::Run(Model* model,
+                                             std::size_t op_index) {
+  auto it = model->operators.begin() + op_index;
+  auto* reshape_op = ConvertOperator<TensorFlowReshapeOperator*>(
+      it->get(), OperatorType::kTensorFlowReshape);
+
+  if (reshape_op == nullptr) {
+    return false;
+  }
+
+  if (!OperatorReady(*model, reshape_op) || reshape_op->shape.empty()) {
+    return false;
+  }
+
+  const string intermediate_name = reshape_op->inputs[0];
+  const string output_name = reshape_op->outputs[0];
+
+  // Guarantee the input is only consume by the reshape.
+  if (CountOpsWithInput(*model, intermediate_name) != 1) {
+    return false;
+  }
+
+  // Check for the parent operator.
+  const auto& transpose_it = FindOpWithOutput(*model, intermediate_name);
+  if (transpose_it == model->operators.end()) {
+    return false;
+  }
+
+  // Find the parent operator and guarantee it is a transpose.
+  TransposeOperator* transpose_op = ConvertOperator<TransposeOperator*>(
+      transpose_it->get(), OperatorType::kTranspose);
+
+  if (transpose_op == nullptr) {
+    return false;
+  }
+
+  if (!OperatorReady(*model, transpose_op) || transpose_op->perm.empty()) {
+    return false;
+  }
+
+  if (!ReshapeIsEquivalentToTranspose(*model, reshape_op,
+                                      false /*allow_extra_unary_dimensions*/)) {
+    return false;
+  }
+
+  // Check that the intermediate is not an output array.
+  if (!IsDiscardableArray(*model, intermediate_name)) {
+    AddMessageF(
+        "Cannot fuse %s and %s as it would invalidate the transpose "
+        "output array.",
+        LogName(*transpose_op), LogName(*reshape_op));
+    return false;
+  }
+
+  AddMessageF("Merging operations %s and %s", LogName(*transpose_op),
+              LogName(*reshape_op));
+
+  // const auto& intermediate_array = model->GetArray(intermediate_name);
+  // const auto& output_array = model->GetArray(output_name);
+
+  auto merged_perm = ReshapeToTranspose(*model, reshape_op);
+
+  // Combine the permutations.
+  const auto& transpose_perm = transpose_op->perm;
+  for (int i = 0; i < merged_perm.size(); i++) {
+    merged_perm[i] = transpose_perm[merged_perm[i]];
+  }
+
+  // Remove the reshape as passthrough operation.
+  if (!RemoveTrivialPassthroughOp(this, model, op_index)) {
+    return false;
+  }
+
+  // Update transpose_op's constant buffer to contain the new permutation.
+  model->GetArray(transpose_op->inputs[1])
+      .GetMutableBuffer<ArrayDataType::kInt32>()
+      .data = merged_perm;
+  transpose_op->perm = merged_perm;
+
+  // transpose_ops's shape will likely has changed.
+  model->GetArray(transpose_op->outputs[0]).clear_shape();
+
+  return true;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/propagate_array_data_types.cc b/tensorflow/contrib/lite/toco/graph_transformations/propagate_array_data_types.cc
index 778da39bf13563cbbdbe54f1140595b057253ae3..89ad58f887f3644125b64e9341e572c6b435edab 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/propagate_array_data_types.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/propagate_array_data_types.cc
@@ -50,78 +50,108 @@ bool PropagateArrayDataTypes::Run(Model* model, std::size_t op_index) {
     old_output_data_types[output] = model->GetArray(output).data_type;
   }
   // Do the actual output data types propagation.
-  if (op->type == OperatorType::kDequantize ||
-      op->type == OperatorType::kResizeBilinear) {
-    // These operators unconditionally produce float outputs
-    SetDataTypeForAllOutputs(model, op, ArrayDataType::kFloat);
-  } else if (op->type == OperatorType::kTensorFlowLess ||
-             op->type == OperatorType::kTensorFlowLessEqual ||
-             op->type == OperatorType::kTensorFlowGreater ||
-             op->type == OperatorType::kTensorFlowGreaterEqual) {
-    // These operators unconditionally produce bool outputs
-    SetDataTypeForAllOutputs(model, op, ArrayDataType::kBool);
-  } else if (op->type == OperatorType::kRank ||
-             op->type == OperatorType::kTensorFlowShape) {
-    // These operators only produce int32 outputs.
-    SetDataTypeForAllOutputs(model, op, ArrayDataType::kInt32);
-  } else if (op->type == OperatorType::kTensorFlowSplit ||
-             op->type == OperatorType::kTensorFlowConcat ||
-             op->type == OperatorType::kFill) {
-    // These operators produce an output with the same type as their 2nd input
-    CHECK_GE(op->inputs.size(), 2);
-    const ArrayDataType data_type = model->GetArray(op->inputs[1]).data_type;
-    SetDataTypeForAllOutputs(model, op, data_type);
-  } else if (op->type == OperatorType::kTransposeConv) {
-    // These operators produce an output with the same type as their 3rd input
-    CHECK_GE(op->inputs.size(), 3);
-    const ArrayDataType data_type = model->GetArray(op->inputs[2]).data_type;
-    SetDataTypeForAllOutputs(model, op, data_type);
-  } else if (op->type == OperatorType::kCast) {
-    // Data type of the Cast op is specified.
-    CHECK_EQ(op->outputs.size(), 1);
-    auto* cast_op = static_cast<CastOperator*>(op);
-    model->GetArray(op->outputs[0]).data_type = cast_op->dst_data_type;
-  } else if (op->type == OperatorType::kArgMax) {
-    // Data type of the ArgMax op is specified.
-    CHECK_EQ(op->outputs.size(), 1);
-    auto* argmax_op = static_cast<ArgMaxOperator*>(op);
-    model->GetArray(op->outputs[0]).data_type = argmax_op->output_data_type;
-  } else if (op->type == OperatorType::kRange) {
-    auto* range_op = static_cast<RangeOperator*>(op);
-    // Output type of the Range op can be set via an attribute
-    ArrayDataType data_type;
-    if (range_op->dtype != ArrayDataType::kNone) {
-      // Use the type if specified
-      data_type = range_op->dtype;
-    } else {
-      // Otherwise use the first input
-      CHECK_GE(op->inputs.size(), 1);
-      data_type = model->GetArray(op->inputs[0]).data_type;
+  switch (op->type) {
+    case OperatorType::kDequantize:
+    case OperatorType::kResizeBilinear:
+      // These operators unconditionally produce float outputs
+      SetDataTypeForAllOutputs(model, op, ArrayDataType::kFloat);
+      break;
+    case OperatorType::kTensorFlowLess:
+    case OperatorType::kTensorFlowLessEqual:
+    case OperatorType::kTensorFlowGreater:
+    case OperatorType::kTensorFlowGreaterEqual:
+      // These operators unconditionally produce bool outputs
+      SetDataTypeForAllOutputs(model, op, ArrayDataType::kBool);
+      break;
+    case OperatorType::kRank:
+    case OperatorType::kTensorFlowShape:
+      // These operators only produce int32 outputs.
+      SetDataTypeForAllOutputs(model, op, ArrayDataType::kInt32);
+      break;
+    case OperatorType::kTensorFlowSplit:
+    case OperatorType::kTensorFlowConcat:
+    case OperatorType::kFill: {
+      // These operators produce an output with the same type as their 2nd input
+      CHECK_GE(op->inputs.size(), 2);
+      const ArrayDataType data_type = model->GetArray(op->inputs[1]).data_type;
+      SetDataTypeForAllOutputs(model, op, data_type);
+      break;
     }
-    CHECK_EQ(op->outputs.size(), 1);
-    SetDataTypeForAllOutputs(model, op, data_type);
-  } else if (op->type == OperatorType::kTensorFlowUnsupported) {
-    auto* unsupported_op = static_cast<TensorFlowUnsupportedOperator*>(op);
-    // Some output tensors from the op could be eliminated by optimization.
-    // This can make unsupported_op->output_data_types have more elements than
-    // op->outputs.
-    if (unsupported_op->output_data_types.size() < op->outputs.size()) {
+    case OperatorType::kTransposeConv: {
+      // These operators produce an output with the same type as their 3rd input
+      CHECK_GE(op->inputs.size(), 3);
+      const ArrayDataType data_type = model->GetArray(op->inputs[2]).data_type;
+      SetDataTypeForAllOutputs(model, op, data_type);
+      break;
+    }
+    case OperatorType::kCast: {
+      // Data type of the Cast op is specified.
+      CHECK_EQ(op->outputs.size(), 1);
+      auto* cast_op = static_cast<CastOperator*>(op);
+      model->GetArray(op->outputs[0]).data_type = cast_op->dst_data_type;
+      break;
+    }
+    case OperatorType::kArgMax: {
+      // Data type of the ArgMax op is specified.
+      CHECK_EQ(op->outputs.size(), 1);
+      auto* argmax_op = static_cast<ArgMaxOperator*>(op);
+      model->GetArray(op->outputs[0]).data_type = argmax_op->output_data_type;
+      break;
+    }
+    case OperatorType::kRange: {
+      auto* range_op = static_cast<RangeOperator*>(op);
+      // Output type of the Range op can be set via an attribute
+      ArrayDataType data_type;
+      if (range_op->dtype != ArrayDataType::kNone) {
+        // Use the type if specified
+        data_type = range_op->dtype;
+      } else {
+        // Otherwise use the first input
+        CHECK_GE(op->inputs.size(), 1);
+        data_type = model->GetArray(op->inputs[0]).data_type;
+      }
+      CHECK_EQ(op->outputs.size(), 1);
+      SetDataTypeForAllOutputs(model, op, data_type);
+      break;
+    }
+    case OperatorType::kRandomUniform: {
+      auto* rand_op = static_cast<RandomUniformOperator*>(op);
+      // The output type of RandomUniform is specified with an attribute
+      if (rand_op->dtype == ArrayDataType::kNone) {
+        return false;
+      }
+      CHECK_EQ(op->outputs.size(), 1);
+      SetDataTypeForAllOutputs(model, op, rand_op->dtype);
+      break;
+    }
+    case OperatorType::kTensorFlowUnsupported: {
+      auto* unsupported_op = static_cast<TensorFlowUnsupportedOperator*>(op);
+      // Some output tensors from the op could be eliminated by optimization.
+      // This can make unsupported_op->output_data_types have more elements than
+      // op->outputs.
+      if (unsupported_op->output_data_types.size() < op->outputs.size()) {
+        return false;
+      }
+      for (int i = 0; i < op->outputs.size(); ++i) {
+        auto output = op->outputs[i];
+        auto data_type = unsupported_op->output_data_types[i];
+        model->GetArray(output).data_type = data_type;
+      }
+      break;
+    }
+    case OperatorType::kExpandDims: {
+      // Yield on ExpandDim until it is converted to Reshape
       return false;
     }
-    for (int i = 0; i < op->outputs.size(); ++i) {
-      auto output = op->outputs[i];
-      auto data_type = unsupported_op->output_data_types[i];
-      model->GetArray(output).data_type = data_type;
+    default: {
+      // These operators produce outputs with the same type as their 1st input
+      CHECK_GT(op->inputs.size(), 0);
+      const ArrayDataType data_type = model->GetArray(op->inputs[0]).data_type;
+      SetDataTypeForAllOutputs(model, op, data_type);
+      break;
     }
-  } else if (op->type == OperatorType::kExpandDims) {
-    // Yield on ExpandDim until it is converted to Reshape
-    return false;
-  } else {
-    // These operators produce outputs with the same type as their 1st input
-    CHECK_GT(op->inputs.size(), 0);
-    const ArrayDataType data_type = model->GetArray(op->inputs[0]).data_type;
-    SetDataTypeForAllOutputs(model, op, data_type);
   }
+
   // Return true if any output data type changed, false if none changed.
   for (const auto& output : op->outputs) {
     if (old_output_data_types[output] != model->GetArray(output).data_type) {
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc b/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc
index 676736cfc523c03c9f4d99c404eb2b5209209945..a648b770f841fa755634cad2d48f031901e5a24a 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc
@@ -38,6 +38,16 @@ void ComputeConvSizes(const Shape& input_shape, int output_depth, int kwidth,
   const int input_height = input_shape.dims(1);
   const int batch = input_shape.dims(0);
 
+  CHECK_GE(input_width, 1);
+  CHECK_GE(input_height, 1);
+  CHECK_GE(batch, 1);
+  CHECK_GE(kwidth, 1);
+  CHECK_GE(kheight, 1);
+  CHECK_GE(stride_width, 1);
+  CHECK_GE(stride_height, 1);
+  CHECK_GE(dilation_width_factor, 1);
+  CHECK_GE(dilation_height_factor, 1);
+
   int dilated_kwidth = dilation_width_factor * (kwidth - 1) + 1;
   int dilated_kheight = dilation_height_factor * (kheight - 1) + 1;
 
@@ -392,8 +402,7 @@ void ProcessSpaceToDepthOperator(Model* model, SpaceToDepthOperator* op) {
                          depth * block_size * block_size}));
 }
 
-void ProcessFillOperator(Model* model, FillOperator* op) {
-  CHECK_EQ(op->inputs.size(), 2);
+void ProcessOpWithShapeInput(Model* model, Operator* op) {
   CHECK_EQ(op->outputs.size(), 1);
   auto& output_array = model->GetArray(op->outputs[0]);
   if (output_array.has_shape()) {
@@ -1470,6 +1479,7 @@ bool PropagateFixedSizes::Run(Model* model, std::size_t op_index) {
     case OperatorType::kPRelu:
     case OperatorType::kSoftmax:
     case OperatorType::kLogSoftmax:
+    case OperatorType::kLog:
     case OperatorType::kLogistic:
     case OperatorType::kTanh:
     case OperatorType::kLocalResponseNormalization:
@@ -1529,7 +1539,8 @@ bool PropagateFixedSizes::Run(Model* model, std::size_t op_index) {
                                   static_cast<SpaceToDepthOperator*>(op));
       break;
     case OperatorType::kFill:
-      ProcessFillOperator(model, static_cast<FillOperator*>(op));
+      CHECK_EQ(op->inputs.size(), 2);
+      ProcessOpWithShapeInput(model, op);
       break;
     case OperatorType::kFullyConnected:
       ProcessFullyConnectedOperator(model,
@@ -1659,6 +1670,10 @@ bool PropagateFixedSizes::Run(Model* model, std::size_t op_index) {
       // transforms that remove them, so we avoid propagating shapes through
       // them and let things settle once they've been removed.
       break;
+    case OperatorType::kRandomUniform:
+      CHECK_EQ(op->inputs.size(), 1);
+      ProcessOpWithShapeInput(model, op);
+      break;
     default:
       // Unimplemented, another graph transformation should drop it.
       LOG(FATAL) << "Unhandled operator type " << OperatorTypeName(op->type);
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/quantize.cc b/tensorflow/contrib/lite/toco/graph_transformations/quantize.cc
index 9679ea0a776f9049699b087fd34f6a9088257c06..f50830ae606d2b39330840692dc07db0878d5840 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/quantize.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/quantize.cc
@@ -44,6 +44,7 @@ bool SupportsQuantization(const Operator& op) {
          type == OperatorType::kTensorFlowMinimum ||
          type == OperatorType::kTensorFlowMaximum ||
          type == OperatorType::kLogistic || type == OperatorType::kSoftmax ||
+         type == OperatorType::kLogSoftmax ||
          type == OperatorType::kTensorFlowSplit || type == OperatorType::kSub ||
          type == OperatorType::kSqueeze || type == OperatorType::kPad ||
          type == OperatorType::kTensorFlowReshape ||
@@ -52,7 +53,7 @@ bool SupportsQuantization(const Operator& op) {
          type == OperatorType::kStridedSlice ||
          type == OperatorType::kDepthToSpace ||
          type == OperatorType::kLstmCell || type == OperatorType::kGather ||
-         type == OperatorType::kTranspose;
+         type == OperatorType::kTranspose || type == OperatorType::kMean;
 }
 
 template <ArrayDataType A>
@@ -394,6 +395,19 @@ bool ChooseHardcodedQuantizationForOperatorOutput(
                                  *quantization_params));
     return true;
   }
+  if (op.type == OperatorType::kLogSoftmax) {
+    // LogSoftmax has range: [LogSoftmaxOperator::kOutputRangeMin, 0].
+    *quantized_data_type = GetQuantizedDataType(array, *quantized_data_type);
+    const QuantizationPoints qp = GetQuantizationPoints(*quantized_data_type);
+    quantization_params->zero_point = qp.max_value;
+    quantization_params->scale =
+        -LogSoftmaxOperator::kOutputRangeMin / (qp.max_value + 1);
+    // While not strictly necessary, it is easier to interpret output data and
+    // quantization if the scale is similar to others (such as power of 2).
+    CHECK(IsExactlyRepresentable(LogSoftmaxOperator::kOutputRangeMin / 2,
+                                 *quantized_data_type, *quantization_params));
+    return true;
+  }
   if (op.type == OperatorType::kTanh) {
     // Tanh has the range: [-1, 1].
     *quantized_data_type = GetQuantizedDataType(array, *quantized_data_type);
@@ -431,7 +445,8 @@ bool ChooseQuantizationForOperatorOutput(
       (op.type == OperatorType::kSpaceToDepth) ||
       (op.type == OperatorType::kTensorFlowReshape) ||
       (op.type == OperatorType::kTensorFlowSplit) ||
-      (op.type == OperatorType::kConcatenation)) {
+      (op.type == OperatorType::kConcatenation &&
+       model->flags.change_concat_input_ranges())) {
     int data_input_index = 0;
     if (op.type == OperatorType::kTensorFlowSplit) {
       data_input_index = 1;
@@ -472,6 +487,44 @@ bool ChooseQuantizationForOperatorOutput(
 
   return true;
 }
+
+// Fixes array minmax info to match the quantization parameters.
+// This is required for when quantization parameters change for an array during
+// quantization (such as ChooseQuantizationForOperatorOutput).
+void FixMinMaxPostQuantization(ArrayDataType quantized_data_type,
+                               const QuantizationParams& quantization_params,
+                               MinMax* minmax) {
+  double qmin, qmax;
+  switch (quantized_data_type) {
+    case ArrayDataType::kUint8:
+      qmin = 0;
+      qmax = 255;
+      break;
+    case ArrayDataType::kInt16:
+      qmin = -32768;
+      qmax = 32767;
+      break;
+    default:
+      // No update required.
+      return;
+  }
+
+  // Compute new minmax values.
+  double min =
+      (qmin - quantization_params.zero_point) * quantization_params.scale;
+  double max =
+      (qmax - quantization_params.zero_point) * quantization_params.scale;
+
+  // If we are close to the existing minmax values don't bother changing them.
+  // This prevents propagating small floating point precision errors.
+  constexpr double kMinMaxThreshold = 1e-5;
+  const double width = max - min;
+  if (std::abs(min - minmax->min) > kMinMaxThreshold * width ||
+      std::abs(max - minmax->max) > kMinMaxThreshold * width) {
+    minmax->min = min;
+    minmax->max = max;
+  }
+}
 }  // namespace
 
 bool Quantize::Run(Model* model, std::size_t op_index) {
@@ -618,12 +671,21 @@ bool Quantize::Run(Model* model, std::size_t op_index) {
                                             &quantization_params)) {
       changed = true;
       const auto& output = op.outputs[output_index];
+      auto& output_array = model->GetArray(output);
+
+      // Fix up the min/max information on the output array to match the chosen
+      // quantization parameters.
+      CHECK(output_array.minmax)
+          << "Output array named " << output << " lacks minmax";
+      auto& output_minmax = output_array.GetMinMax();
+      FixMinMaxPostQuantization(quantized_data_type, quantization_params,
+                                &output_minmax);
+
       QuantizeArray(this, model, output, quantized_data_type,
                     quantization_params);
+
       const auto& dequantized_output =
           AvailableArrayName(*model, output + "_dequantized");
-      const auto& output_array = model->GetArray(output);
-      const auto& output_minmax = output_array.GetMinMax();
       auto& dequantized_output_array =
           model->GetOrCreateArray(dequantized_output);
       dequantized_output_array.data_type = ArrayDataType::kFloat;
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/read_fake_quant_min_max.cc b/tensorflow/contrib/lite/toco/graph_transformations/read_fake_quant_min_max.cc
index 11f8d4b6eea836c5fe4efcbd5136e6183a59dc62..bdcca5b7caf61a62203debaa32c4d7a9b2eb43fa 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/read_fake_quant_min_max.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/read_fake_quant_min_max.cc
@@ -72,6 +72,13 @@ bool ReadFakeQuantMinMax::Run(Model* model, std::size_t op_index) {
     minmax.min = min_array.GetBuffer<ArrayDataType::kFloat>().data[0];
     minmax.max = max_array.GetBuffer<ArrayDataType::kFloat>().data[0];
     // We always want [min, max] to contain 0.
+    if (minmax.min > 0 || minmax.max < 0) {
+      LOG(ERROR) << "For " << LogName(*fakequant_op) << " the MinMax range "
+                 << "[" << minmax.min << ", " << minmax.max
+                 << "] does not contain 0. "
+                 << "Proceeding by tweaking it to contain 0, which will result "
+                    "in poor accuracy.";
+    }
     minmax.min = std::min(minmax.min, 0.);
     minmax.max = std::max(minmax.max, 0.);
 
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/reorder_activation_functions.cc b/tensorflow/contrib/lite/toco/graph_transformations/reorder_activation_functions.cc
deleted file mode 100644
index 9852c86c21b9a0714bc728e60b5d9dfe61ff52d1..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/toco/graph_transformations/reorder_activation_functions.cc
+++ /dev/null
@@ -1,137 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <vector>
-
-#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
-#include "tensorflow/contrib/lite/toco/model.h"
-#include "tensorflow/contrib/lite/toco/runtime/types.h"
-#include "tensorflow/contrib/lite/toco/tooling_util.h"
-#include "tensorflow/core/platform/logging.h"
-
-namespace toco {
-
-bool ReorderActivationFunctions::Run(Model* model, std::size_t op_index) {
-  const auto ac_it = model->operators.begin() + op_index;
-  std::unique_ptr<Operator>& ac_op = *ac_it;
-  DCHECK(ac_op);
-
-  if (ac_op->type != OperatorType::kRelu6 &&
-      ac_op->type != OperatorType::kRelu1 &&
-      ac_op->type != OperatorType::kRelu) {
-    return false;
-  }
-
-  auto exchange_it = FindOpWithOutput(*model, ac_op->inputs[0]);
-  if (exchange_it == model->operators.end()) return false;
-  // Find the op producing the array passed to this activation function
-  std::unique_ptr<Operator>& exchange_op = *exchange_it;
-  DCHECK(exchange_op);
-
-  // Allow activation functions to move up over any operator that does not
-  // change the values.
-  switch (exchange_op->type) {
-    case OperatorType::kExpandDims:
-    case OperatorType::kSqueeze:
-    case OperatorType::kTensorFlowReshape:
-    case OperatorType::kTranspose:
-      break;
-    default:
-      return false;
-  }
-
-  DCHECK_EQ(exchange_op->outputs[0], ac_op->inputs[0]);
-  const auto exchange_op_input = exchange_op->inputs[0];
-  const auto intermediate_array = exchange_op->outputs[0];
-  const auto ac_op_output = ac_op->outputs[0];
-
-  int count_ops_consuming_output =
-      CountOpsWithInput(*model, intermediate_array);
-  DCHECK_GE(count_ops_consuming_output, 1);
-  if (count_ops_consuming_output > 1) {
-    AddMessageF(
-        "Not exchanging activation function with %s because it is consumed by "
-        "more than 1 other operator",
-        LogName(*exchange_op));
-    return false;
-  }
-
-  // If the ac_op was originally producing an output_array we can't trivially
-  // reorder as otherwise the output array name would change and break
-  // downstream assumptions. To work around that we perform some renaming below
-  // in that case at the cost of a bit more confusing array names in this rare
-  // case.
-  bool is_ac_op_output =
-      std::find(model->flags.output_arrays().begin(),
-                model->flags.output_arrays().end(),
-                ac_op_output) != model->flags.output_arrays().end();
-  if (is_ac_op_output) {
-    // To preserve the output array name of the activation function we need to
-    // create a temporary to use to pass between ac->ex.
-    //
-    // Original:
-    //  (a) -> EX -> (b) -> AC -> (c)
-    // Now:
-    //  (a) -> AC -> (c') -> EX -> (c)
-    AddMessageF(
-        "Exchanging activation function %s with %s but renaming to preserve "
-        "output array %s",
-        LogName(*ac_op), LogName(*exchange_op), ac_op->outputs[0]);
-
-    auto renamed_ac_op_output =
-        AvailableArrayName(*model, ac_op_output + "_exchange");
-    ac_op->inputs[0] = exchange_op_input;
-    ac_op->outputs[0] = renamed_ac_op_output;
-    model->EraseArray(exchange_op->outputs[0]);
-    exchange_op->inputs[0] = renamed_ac_op_output;
-    exchange_op->outputs[0] = ac_op_output;
-  } else {
-    // Simply swap the order and update consumers to use the exchange_op output
-    // array (b).
-    //
-    // Original:
-    //  (a) -> EX -> (b) -> AC -> (c)
-    // Now:
-    //  (a) -> AC -> (c) -> EX -> (b)
-    AddMessageF("Exchanging activation function %s with %s", LogName(*ac_op),
-                LogName(*exchange_op));
-
-    Operator* consumer = GetFirstOpWithInput(*model, ac_op_output);
-    while (consumer) {
-      for (int i = 0; i < consumer->inputs.size(); ++i) {
-        if (consumer->inputs[i] == ac_op_output) {
-          consumer->inputs[i] = intermediate_array;
-        }
-      }
-      consumer = GetFirstOpWithInput(*model, ac_op_output);
-    }
-    ac_op->inputs[0] = exchange_op_input;
-    exchange_op->inputs[0] = ac_op_output;
-  }
-
-  // Clear shapes; this will allow shape propagation to fix the sizes for us.
-  model->GetOrCreateArray(ac_op->outputs[0]).clear_shape();
-  model->GetOrCreateArray(exchange_op->outputs[0]).clear_shape();
-
-  // Finally, reorder operators.  Note that this only works when there are no
-  // other direct descendents of the exchange_op.
-  ac_op.swap(exchange_op);
-
-  return true;
-}
-
-}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/reorder_elementwise_unary.cc b/tensorflow/contrib/lite/toco/graph_transformations/reorder_elementwise_unary.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9f5b7920cb937b021eb23fc1d5fdc3c1ff18a72d
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/reorder_elementwise_unary.cc
@@ -0,0 +1,153 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <iterator>
+#include <memory>
+#include <string>
+#include <unordered_set>
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+namespace {
+
+bool IsElementwiseOperator(OperatorType optype) {
+  switch (optype) {
+    case OperatorType::kCast:
+    case OperatorType::kExp:
+    case OperatorType::kFloor:
+    case OperatorType::kNeg:
+    case OperatorType::kRelu:
+    case OperatorType::kRelu1:
+    case OperatorType::kRelu6:
+    case OperatorType::kTanh:
+    case OperatorType::kTensorFlowSqrt:
+    case OperatorType::kTensorFlowSquare:
+      return true;
+    default:
+      return false;
+  }
+}
+
+bool IsMoveOperator(OperatorType optype) {
+  switch (optype) {
+    case OperatorType::kDepthToSpace:
+    case OperatorType::kExpandDims:
+    case OperatorType::kSpaceToDepth:
+    case OperatorType::kSqueeze:
+    case OperatorType::kTensorFlowReshape:
+    case OperatorType::kTranspose:
+      return true;
+    default:
+      return false;
+  }
+}
+
+}  // namespace
+
+// Swap elementwise operators such that all value operators occur before all
+// element move operators, e.g. negation then transpose.
+bool ReorderElementwiseUnary::Run(Model* model, std::size_t op_index) {
+  const auto element_op_it = model->operators.begin() + op_index;
+  std::unique_ptr<Operator>& element_op = *element_op_it;
+  if (!IsElementwiseOperator(element_op->type)) {
+    return false;
+  }
+
+  const string intermediate_name = element_op->inputs[0];
+  auto it = FindOpWithOutput(*model, intermediate_name);
+  if (it == model->operators.end()) {
+    AddMessageF("No preceding operator");
+    return false;
+  }
+
+  std::unique_ptr<Operator>& move_op = *it;
+  if (!IsMoveOperator(move_op->type)) {
+    AddMessageF("Preceding operator is not a move operator");
+    return false;
+  }
+
+  if (CountOpsWithInput(*model, intermediate_name) != 1) {
+    AddMessageF("Input %s used elsewhere", intermediate_name);
+    return false;
+  }
+
+  // Check that the intermediate is discardable.
+  if (!IsDiscardableArray(*model, intermediate_name)) {
+    AddMessageF(
+        "Cannot swap elementwise as it would invalidate %s which is "
+        "an output array.",
+        intermediate_name);
+    return false;
+  }
+
+  // op->inputs may change so we need to keep a value by copy.
+  const string input_name = move_op->inputs[0];
+  const string output_name = element_op->outputs[0];
+
+  AddMessageF("Swapping around operators with %s and %s", LogName(*element_op),
+              LogName(*move_op));
+
+  // If the output array is an exit node for the graph then we need to retain
+  // the name as an output node. This makes the naming scheme a little confusing
+  // but is required in this rare case.
+  if (!IsDiscardableArray(*model, output_name)) {
+    // The output name of the sequence needs to stay static, so create a new
+    // array new use for the intermediate.
+    const auto new_intermediate_name =
+        AvailableArrayName(*model, element_op->outputs[0] + "_reorder");
+    AddMessageF("Adding new array %s to preserve output array name %s",
+                new_intermediate_name, output_name);
+
+    element_op->inputs[0] = input_name;
+    element_op->outputs[0] = new_intermediate_name;
+    model->EraseArray(intermediate_name);
+    move_op->inputs[0] = new_intermediate_name;
+    move_op->outputs[0] = output_name;
+  } else {
+    // The intermediate array is now the output array.
+    for (int i = 0; i < model->operators.size(); i++) {
+      Operator* consumer = model->operators[i].get();
+      for (int j = 0; j < consumer->inputs.size(); j++) {
+        if (consumer->inputs[j] == output_name) {
+          consumer->inputs[j] = intermediate_name;
+        }
+      }
+    }
+
+    element_op->inputs[0] = input_name;
+    move_op->inputs[0] = output_name;
+  }
+
+  // Reset both arrays as shape, type, min/max, etc can all change because of
+  // the position swap.
+  model->EraseArray(element_op->outputs[0]);
+  model->EraseArray(move_op->outputs[0]);
+
+  // Reconstruct.
+  model->GetOrCreateArray(element_op->outputs[0]);
+  model->GetOrCreateArray(move_op->outputs[0]);
+
+  // Swap the order of the operators.
+  element_op.swap(move_op);
+
+  return true;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/reorder_reshape_transpose.cc b/tensorflow/contrib/lite/toco/graph_transformations/reorder_reshape_transpose.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9e7fe1b1ccd851dd998e59e75ff798f52f7c6e5a
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/reorder_reshape_transpose.cc
@@ -0,0 +1,248 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <iterator>
+#include <memory>
+#include <string>
+#include <unordered_set>
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+
+namespace {
+
+bool OperatorReady(const Model& model, const Operator* op) {
+  if (!model.HasArray(op->inputs[0]) || !model.HasArray(op->inputs[1]) ||
+      !model.HasArray(op->outputs[0])) {
+    return false;
+  }
+
+  if (!model.GetArray(op->inputs[0]).has_shape() ||
+      !model.GetArray(op->outputs[0]).has_shape()) {
+    // Input and output needs the shape.
+    return false;
+  }
+
+  if (!model.GetArray(op->inputs[1]).buffer) {
+    // Buffer needs to be a constant.
+    return false;
+  }
+
+  return true;
+}
+
+// Utility function to filter out a value.
+void Filter(std::vector<int>* vec, int value) {
+  vec->erase(std::remove(vec->begin(), vec->end(), value), vec->end());
+}
+
+// Computes a new permutation used to swap a reshape-transpose to a
+// transpose-reshape. In this case the permutation operates on the intermediate
+// shape.
+std::vector<int> ComputeNewPerm(std::vector<int> input_dims,
+                                std::vector<int> intermediate_dims,
+                                std::vector<int> perm) {
+  // These are the major axis of the input.
+  std::vector<int> input_indices;
+  for (int i = 0; i < input_dims.size(); i++) {
+    if (input_dims[i] != 1) {
+      input_indices.push_back(i);
+    }
+  }
+
+  // This maps which indices of the input produced the intermediate indices for
+  // non-unary dimensions.
+  std::unordered_map<int, int> intermediate_to_input_indices_map;
+  for (int i = 0; i < intermediate_dims.size(); i++) {
+    if (intermediate_dims[i] != 1) {
+      intermediate_to_input_indices_map[i] =
+          input_indices[intermediate_to_input_indices_map.size()];
+    }
+  }
+
+  // Translate the transpose permutation to a new permutation starting with the
+  // major indices.
+  std::vector<int> new_perm;
+  new_perm.reserve(input_dims.size());
+  for (int i = 0; i < perm.size(); i++) {
+    if (intermediate_dims[perm[i]] == 1) continue;
+
+    new_perm.push_back(intermediate_to_input_indices_map[perm[i]]);
+  }
+
+  // Fill the rest of the transpose in with the ones.
+  for (int index = 0; index < input_dims.size(); index++) {
+    if (input_dims[index] == 1) {
+      new_perm.push_back(index);
+    }
+  }
+
+  CHECK_EQ(new_perm.size(), input_dims.size());
+  return new_perm;
+}
+
+}  // namespace
+
+// Swaps reshape-transpose to transpose-reshape whenever possible. This is
+// possible when the reshape does not affect memory ordering.
+bool ReorderReshapeTranspose::Run(Model* model, std::size_t op_index) {
+  auto transpose_it = model->operators.begin() + op_index;
+
+  TransposeOperator* transpose_op = ConvertOperator<TransposeOperator*>(
+      transpose_it->get(), OperatorType::kTranspose);
+
+  if (transpose_op == nullptr) {
+    return false;
+  }
+
+  if (!OperatorReady(*model, transpose_op) || transpose_op->perm.empty()) {
+    // Wait for values to propagate.
+    return false;
+  }
+
+  // Find the operator that produces the transpose op.
+  auto reshape_it = FindOpWithOutput(*model, transpose_op->inputs[0]);
+  if (reshape_it == model->operators.end()) {
+    return false;
+  }
+
+  TensorFlowReshapeOperator* reshape_op =
+      ConvertOperator<TensorFlowReshapeOperator*>(
+          reshape_it->get(), OperatorType::kTensorFlowReshape);
+  if (reshape_op == nullptr) {
+    return false;
+  }
+
+  // Ignore if the reshape is uninitialized.
+  if (!OperatorReady(*model, reshape_op) || reshape_op->shape.empty()) {
+    return false;
+  }
+
+  // Need to copy to keep static if permutated.
+  const string input_name = reshape_op->inputs[0];
+  const string intermediate_name = reshape_op->outputs[0];
+  const string output_name = transpose_op->outputs[0];
+
+  // Intermediate should not be consumed by any other operators.
+  if (CountOpsWithInput(*model, intermediate_name) != 1) {
+    AddMessageF("Input %s used elsewhere", intermediate_name);
+    return false;
+  }
+
+  // Check that the intermediate is not an output array.
+  if (!IsDiscardableArray(*model, intermediate_name)) {
+    AddMessageF(
+        "Cannot reorder reshape-transpose as it would invalidate %s which is "
+        "an output array.",
+        intermediate_name);
+    return false;
+  }
+
+  // Get the arrays.
+  const auto& input_array = model->GetArray(input_name);
+  const auto& intermediate_array = model->GetArray(intermediate_name);
+  const auto& output_array = model->GetArray(output_name);
+
+  // Get the shapes of each array.
+  Shape input_shape = input_array.shape();
+  Shape intermediate_shape = intermediate_array.shape();
+  Shape output_shape = output_array.shape();
+
+  // Assign ids to non-unary indices.
+  std::vector<int> input_dims = input_shape.dims();
+  std::vector<int> intermediate_dims = intermediate_shape.dims();
+  std::vector<int> output_dims = output_shape.dims();
+
+  // If the reshape is equivalent to a transpose with fewer/more unary
+  // dimensions then it can be moved between the transpose.
+  if (!ReshapeIsEquivalentToTranspose(*model, reshape_op,
+                                      true /*allow_extra_unary_dims*/)) {
+    return false;
+  }
+
+  if (!IsDiscardableArray(*model, output_name)) {
+    // The output name of the sequence needs to stay static, so create a new
+    // array new use for the intermediate.
+    const auto new_intermediate_name =
+        AvailableArrayName(*model, transpose_op->outputs[0] + "_exchange");
+    AddMessageF("Adding new array %s to preserve output array name %s",
+                new_intermediate_name, transpose_op->outputs[0]);
+    transpose_op->inputs[0] = input_name;
+    transpose_op->outputs[0] = new_intermediate_name;
+    reshape_op->inputs[0] = new_intermediate_name;
+    reshape_op->outputs[0] = output_name;
+    model->EraseArray(intermediate_name);
+  } else {
+    // The intermediate array is now the output array.
+    for (int i = 0; i < model->operators.size(); i++) {
+      Operator* consumer = model->operators[i].get();
+      for (int j = 0; j < consumer->inputs.size(); j++) {
+        if (consumer->inputs[j] == output_name) {
+          consumer->inputs[j] = intermediate_name;
+        }
+      }
+    }
+
+    transpose_op->inputs[0] = input_name;
+    reshape_op->inputs[0] = output_name;
+  }
+
+  // If transposes constant buffer is used elsewhere, make a new copy.
+  if (CountOpsWithInput(*model, transpose_op->inputs[1]) != 1) {
+    transpose_op->inputs[1] =
+        AvailableArrayName(*model, transpose_op->inputs[1] + "_copy");
+  }
+
+  // Make the new transpose permutation.
+  const std::vector<int> new_perm =
+      ComputeNewPerm(input_dims, intermediate_dims, transpose_op->perm);
+  CHECK_EQ(input_dims.size(), new_perm.size());
+
+  auto& transpose_array = model->GetOrCreateArray(transpose_op->inputs[1]);
+  transpose_array.GetMutableBuffer<ArrayDataType::kInt32>().data = new_perm;
+  *(transpose_array.mutable_shape()->mutable_dims()) = {
+      static_cast<int>(new_perm.size())};
+  transpose_op->perm = new_perm;
+
+  // If the reshape's constant buffer is reused, create a new one.
+  if (CountOpsWithInput(*model, reshape_op->inputs[1]) != 1) {
+    reshape_op->inputs[1] =
+        AvailableArrayName(*model, reshape_op->inputs[1] + "_copy");
+  }
+
+  // We need to modify the reshape input array to target the new output size.
+  auto& reshape_array = model->GetOrCreateArray(reshape_op->inputs[1]);
+  reshape_array.GetMutableBuffer<ArrayDataType::kInt32>().data = output_dims;
+  *(reshape_array.mutable_shape()->mutable_dims()) = {
+      static_cast<int>(output_shape.dimensions_count())};
+  reshape_op->shape.clear();
+
+  AddMessageF("Swapping around operators between %s and %s", input_name,
+              output_name);
+
+  model->GetOrCreateArray(transpose_op->outputs[0]).clear_shape();
+  model->GetOrCreateArray(reshape_op->outputs[0]).clear_shape();
+
+  // Swap the order of the operators.
+  transpose_it->swap(*reshape_it);
+
+  return true;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_concatenation.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_concatenation.cc
index 064810b53e7c3bee4601204c9dbd976c374a6a60..d916ae0ddf017fe6a2fb2709db6e9de8c258adfc 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_concatenation.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_concatenation.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <vector>
 
 #include "absl/strings/str_cat.h"
+#include "absl/strings/str_join.h"
 #include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
 #include "tensorflow/contrib/lite/toco/model.h"
 #include "tensorflow/contrib/lite/toco/tooling_util.h"
@@ -105,7 +106,8 @@ void ConcatenateTensorBuffers(const std::vector<Array*>& input_arrays,
 // already set (e.g. because of previous pass in TOCO), it doesn't change it and
 // returns. Otherwise it uses the input arrays min and max values to compute the
 // concatenated array min and max.
-void SetMinMaxForConcatenedArray(const std::vector<Array*>& input_arrays,
+void SetMinMaxForConcatenedArray(GraphTransformation* transformation,
+                                 const std::vector<Array*>& input_arrays,
                                  Array* concatenated_array) {
   CHECK(concatenated_array->data_type == ArrayDataType::kFloat);
   // If the minmax is already set, use it
@@ -125,6 +127,9 @@ void SetMinMaxForConcatenedArray(const std::vector<Array*>& input_arrays,
   MinMax& minmax = concatenated_array->GetOrCreateMinMax();
   minmax.min = concat_min;
   minmax.max = concat_max;
+
+  transformation->AddMessageF("Setting concatenated array min/max to %g,%g",
+                              concat_min, concat_max);
 }
 
 }  // namespace
@@ -161,11 +166,14 @@ bool ResolveConstantConcatenation::Run(Model* model, std::size_t op_index) {
     input_arrays.push_back(&model->GetArray(input_name));
   }
 
+  AddMessageF("Performing constant concat of %s into %s",
+              absl::StrJoin(concat_op->inputs, ", "), concatenated_array_name);
+
   switch (concatenated_array.data_type) {
     case ArrayDataType::kFloat:
       ConcatenateTensorBuffers<ArrayDataType::kFloat>(
           input_arrays, concatenation_axis, &concatenated_array);
-      SetMinMaxForConcatenedArray(input_arrays, &concatenated_array);
+      SetMinMaxForConcatenedArray(this, input_arrays, &concatenated_array);
       break;
     case ArrayDataType::kUint8:
       ConcatenateTensorBuffers<ArrayDataType::kUint8>(
@@ -189,13 +197,13 @@ bool ResolveConstantConcatenation::Run(Model* model, std::size_t op_index) {
 
   // Remove all the resolved arrays.
   for (const string& input_name : concat_op->inputs) {
-    // Check to prevent removal of shared tensors
+    // Check to prevent removal of shared tensors.
     if (CountOpsWithInput(*model, input_name) == 1) {
       model->EraseArray(input_name);
     }
   }
 
-  // Remove concatenate operator
+  // Remove concatenate operator.
   model->operators.erase(concat_it);
   return true;
 }
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_gather.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_gather.cc
index d999c2df9483e096f333c6af83e1d9fee873d4d6..debe298a5a93034bcb928d7384b5ec1fc7439e47 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_gather.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_gather.cc
@@ -98,6 +98,16 @@ bool ResolveConstantGather::Run(Model* model, std::size_t op_index) {
   CHECK(coords_array.data_type == ArrayDataType::kInt32)
       << "Only int32 indices are supported";
 
+  // Copy min/max info if present. The ranges of the selected values may be
+  // a subset of the original range but we want to ensure the quantization
+  // params stay the same.
+  if (input_array.minmax) {
+    const auto& input_minmax = input_array.GetMinMax();
+    auto& output_minmax = output_array.GetOrCreateMinMax();
+    output_minmax.min = input_minmax.min;
+    output_minmax.max = input_minmax.max;
+  }
+
   CHECK(!output_array.buffer);
   switch (output_array.data_type) {
     case ArrayDataType::kFloat:
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_random_uniform.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_random_uniform.cc
new file mode 100644
index 0000000000000000000000000000000000000000..88d06d7dc75005c89a69b881aa0064d1162227d5
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_random_uniform.cc
@@ -0,0 +1,116 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <algorithm>
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+#include "tensorflow/core/platform/logging.h"
+
+#include "tensorflow/core/lib/random/philox_random.h"
+#include "tensorflow/core/lib/random/random_distributions.h"
+
+namespace toco {
+
+template <ArrayDataType Type>
+bool ComputeRandomUniformArray(Model* model, RandomUniformOperator* op) {
+  typedef tensorflow::random::UniformDistribution<
+      tensorflow::random::PhiloxRandom, DataType<Type>>
+      Distribution;
+
+  // Allocate output
+  auto& output_array = model->GetArray(op->outputs[0]);
+  CHECK(output_array.data_type == Type);
+  std::vector<DataType<Type>>& data =
+      output_array.GetMutableBuffer<Type>().data;
+  data.resize(RequiredBufferSizeForShape(output_array.shape()));
+
+  // We use the same random number generator and distribution as TensorFlow to
+  // produce the exact same values given the same seeds. See
+  // tensorflow::functor::FillPhiloxRandomTask<Distribution, false> in
+  // //third_party/tensorflow/core/kernels/random_op.cc for the implementation.
+  tensorflow::random::PhiloxRandom generator(op->seed, op->seed2);
+  Distribution dist;
+
+  // The generator creates Distribution::kResultElementCount samples at a time.
+  size_t offset = 0;
+  size_t num_samples = Distribution::kResultElementCount;
+  while (offset < data.size()) {
+    const typename Distribution::ResultType samples = dist(&generator);
+    std::copy(&samples[0],
+              &samples[0] + std::min(num_samples, data.size() - offset),
+              &data[0] + offset);
+    offset += num_samples;
+  }
+
+  return true;
+}
+
+bool ResolveConstantRandomUniform::Run(Model* model, std::size_t op_index) {
+  const auto it = model->operators.begin() + op_index;
+  auto* base_op = it->get();
+  if (base_op->type != OperatorType::kRandomUniform) {
+    return false;
+  }
+  auto* op = static_cast<RandomUniformOperator*>(base_op);
+
+  CHECK_EQ(op->inputs.size(), 1);
+  CHECK_EQ(op->outputs.size(), 1);
+
+  auto& output_array = model->GetArray(op->outputs[0]);
+  if (output_array.data_type == ArrayDataType::kNone) {
+    // Yield until the output type has been set by PropagateArrayDataTypes
+    return false;
+  }
+
+  if (!output_array.has_shape()) {
+    // Yield until the output shape has been set by PropagateFixedShapes
+    return false;
+  }
+
+  if ((op->seed == 0) && (op->seed2 == 0)) {
+    LOG(WARNING) << "RandomUniform op outputting \"" << op->outputs[0]
+                 << "\" is truly random (using /dev/random system entropy). "
+                    "Therefore, cannot resolve as constant. Set \"seed\" or "
+                    "\"seed2\" attr non-zero to fix this";
+    return false;
+  }
+
+  switch (output_array.data_type) {
+    case ArrayDataType::kFloat:
+      if (!ComputeRandomUniformArray<ArrayDataType::kFloat>(model, op)) {
+        return false;
+      }
+      break;
+    // For future support of double or half.
+    // case ArrayDataType::kDouble...
+    default:
+      LOG(FATAL)
+          << "Unsupported data type given to RandomUniform op with output \""
+          << op->outputs[0] << "\"";
+      break;
+  }
+
+  // Erase input arrays if no longer used
+  toco::DeleteArrayIfUsedOnce(op->inputs[0], model);
+
+  // Erase the operator
+  model->operators.erase(it);
+
+  return true;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_unary.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_unary.cc
index d4db6f1c009cd19515655fb31974a2e97cfa42e8..f6c8f79d8d3311dc2294e3ec406a184b2a16a6b5 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_unary.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_constant_unary.cc
@@ -51,6 +51,7 @@ bool ResolveConstantUnaryOperator::Run(Model* model, std::size_t op_index) {
   // Test for unary ops of types that we know how to resolve.
   switch (unary_op->type) {
     case OperatorType::kCast:
+    case OperatorType::kLog:
     case OperatorType::kNeg:
     case OperatorType::kTensorFlowRsqrt:
     case OperatorType::kTensorFlowSqrt:
@@ -218,6 +219,7 @@ bool ResolveConstantUnaryOperator::Run(Model* model, std::size_t op_index) {
     }
     output_float_data[0] = max;
   } else if (unary_op->type == OperatorType::kNeg ||
+             unary_op->type == OperatorType::kLog ||
              unary_op->type == OperatorType::kTensorFlowRsqrt ||
              unary_op->type == OperatorType::kTensorFlowSqrt ||
              unary_op->type == OperatorType::kTensorFlowSquare) {
@@ -231,6 +233,8 @@ bool ResolveConstantUnaryOperator::Run(Model* model, std::size_t op_index) {
       float outval = 0.f;
       if (unary_op->type == OperatorType::kNeg) {
         outval = -val;
+      } else if (unary_op->type == OperatorType::kLog) {
+        outval = std::log(val);
       } else if (unary_op->type == OperatorType::kTensorFlowRsqrt) {
         outval = 1.0f / std::sqrt(val);
       } else if (unary_op->type == OperatorType::kTensorFlowSqrt) {
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_matmul.cc b/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_matmul.cc
index f38203c80fcb7ab8bc1639129fd98e4e342e5cb7..2a236d3f98784e8244942f94d5a250b5bc00a8ad 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_matmul.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/resolve_tensorflow_matmul.cc
@@ -60,6 +60,13 @@ bool ResolveTensorFlowMatMul::Run(Model* model, std::size_t op_index) {
   string input_lhs = matmul_op->inputs[0];
   string input_rhs = transpose_op->outputs[0];
 
+  // Construct the new FullyConnectedOperator.
+  auto* fc_op = new FullyConnectedOperator;
+  fc_op->outputs = matmul_op->outputs;
+
+  // Insert the newly constructed FullyConnectedOperator.
+  model->operators.emplace(matmul_it, fc_op) + 1;
+
   // Find the op producing the array passed to this MatMul
   auto previous_op_it = model->operators.begin();
   bool found = false;
@@ -76,13 +83,6 @@ bool ResolveTensorFlowMatMul::Run(Model* model, std::size_t op_index) {
   }
   Operator* previous_op = (found) ? previous_op_it->get() : nullptr;
 
-  // Construct the new FullyConnectedOperator.
-  auto* fc_op = new FullyConnectedOperator;
-  fc_op->outputs = matmul_op->outputs;
-
-  // Insert the newly constructed FullyConnectedOperator.
-  model->operators.emplace(matmul_it, fc_op) + 1;
-
   // Refresh iterator.
   matmul_it = model->operators.begin();
   for (; matmul_it != model->operators.end(); ++matmul_it) {
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/tests/BUILD b/tensorflow/contrib/lite/toco/graph_transformations/tests/BUILD
index 2f94f9cd8a9ab24809fb3d137b5d05ab12f43003..8dcd4adc90b188c745cadb9815c3c46383705833 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/tests/BUILD
+++ b/tensorflow/contrib/lite/toco/graph_transformations/tests/BUILD
@@ -28,15 +28,3 @@ tf_cc_test(
         "@com_google_googletest//:gtest_main",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/unpartition_embedding_lookup.cc b/tensorflow/contrib/lite/toco/graph_transformations/unpartition_embedding_lookup.cc
index 48c326651f3201b4f7a31ac2440b171841e8ed7b..cbea39bcc09ea6787c055d5aaca7f291c2b47a7f 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/unpartition_embedding_lookup.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/unpartition_embedding_lookup.cc
@@ -165,7 +165,7 @@ bool UnpartitionEmbeddingLookup::Run(Model* model, std::size_t op_index) {
   CHECK(mod_op && mod_op->type == OperatorType::kFloorMod)
       << "Unsupported partition strategy";
   CHECK_EQ(mod_op, GetOpWithOutput(*model, indices_partition_op->inputs[1]))
-      << "Indices and data parition ops require the same partition strategy "
+      << "Indices and data partition ops require the same partition strategy "
          "and inputs";
 
   // Glob together all of the gather data. This is not yet in the correct order.
diff --git a/tensorflow/contrib/lite/toco/import_tensorflow.cc b/tensorflow/contrib/lite/toco/import_tensorflow.cc
index b844e0b9484f55ffaad63e55956ff789036f05e3..155d890c9f23ba206f1f0e6db645a601308cea5b 100644
--- a/tensorflow/contrib/lite/toco/import_tensorflow.cc
+++ b/tensorflow/contrib/lite/toco/import_tensorflow.cc
@@ -74,7 +74,7 @@ const string& GetStringAttr(const NodeDef& node, const string& attr_name) {
   return attr.s();
 }
 
-int GetIntAttr(const NodeDef& node, const string& attr_name) {
+int64 GetIntAttr(const NodeDef& node, const string& attr_name) {
   CHECK(HasAttr(node, attr_name)) << attr_name << " not found in:\n"
                                   << node.DebugString();
   const auto& attr = node.attr().at(attr_name);
@@ -569,6 +569,23 @@ void ConvertBiasAddOperator(const NodeDef& node,
   model->operators.emplace_back(biasadd);
 }
 
+void ConvertRandomUniform(const NodeDef& node,
+                          const TensorFlowImportFlags& tf_import_flags,
+                          Model* model) {
+  CHECK_EQ(node.op(), "RandomUniform");
+  CheckInputsCount(node, tf_import_flags, 1);
+
+  CHECK_EQ(GetDataTypeAttr(node, "T"), DT_INT32);
+  auto op = absl::make_unique<RandomUniformOperator>();
+  op->inputs.push_back(node.input(0));
+  op->outputs.push_back(node.name());
+  op->dtype = ConvertDataType(GetDataTypeAttr(node, "dtype"));
+  op->seed = GetIntAttr(node, "seed");
+  op->seed2 = GetIntAttr(node, "seed2");
+  CHECK(model != nullptr);
+  model->operators.emplace_back(std::move(op));
+}
+
 void ConvertReluOperator(const NodeDef& node,
                          const TensorFlowImportFlags& tf_import_flags,
                          Model* model) {
@@ -594,6 +611,18 @@ void ConvertRelu6Operator(const NodeDef& node,
   model->operators.emplace_back(op);
 }
 
+void ConvertLogOperator(const NodeDef& node,
+                        const TensorFlowImportFlags& tf_import_flags,
+                        Model* model) {
+  CHECK_EQ(node.op(), "Log");
+  CheckInputsCount(node, tf_import_flags, 1);
+
+  auto op = absl::make_unique<LogOperator>();
+  op->inputs.push_back(node.input(0));
+  op->outputs.push_back(node.name());
+  model->operators.emplace_back(std::move(op));
+}
+
 void ConvertLogisticOperator(const NodeDef& node,
                              const TensorFlowImportFlags& tf_import_flags,
                              Model* model) {
@@ -665,6 +694,8 @@ void ConvertFakeQuantWithMinMaxArgs(
   minmax.min = GetFloatAttr(node, "min");
   minmax.max = GetFloatAttr(node, "max");
   op->outputs.push_back(node.name());
+  // tf.fake_quant_with_min_max_args num_bits defaults to 8.
+  op->num_bits = HasAttr(node, "num_bits") ? GetIntAttr(node, "num_bits") : 8;
   model->operators.emplace_back(op);
 }
 
@@ -682,6 +713,7 @@ void ConvertFakeQuantWithMinMaxVars(
     op->inputs.push_back(node.input(i));
   }
   op->outputs.push_back(node.name());
+  op->num_bits = HasAttr(node, "num_bits") ? GetIntAttr(node, "num_bits") : 8;
   model->operators.emplace_back(op);
 }
 
@@ -1343,13 +1375,16 @@ void ConvertFloorOperator(const NodeDef& node,
 void ConvertGatherOperator(const NodeDef& node,
                            const TensorFlowImportFlags& tf_import_flags,
                            Model* model) {
-  CHECK_EQ(node.op(), "Gather");
-  CheckInputsCount(node, tf_import_flags, 2);
+  CHECK(node.op() == "Gather" || node.op() == "GatherV2");
+  if (node.op() == "Gather") CheckInputsCount(node, tf_import_flags, 2);
+  if (node.op() == "GatherV2") CheckInputsCount(node, tf_import_flags, 3);
   const auto indices_data_type = GetDataTypeAttr(node, "Tindices");
   CHECK(indices_data_type == DT_INT32 || indices_data_type == DT_INT64);
   auto* op = new GatherOperator;
   op->inputs.push_back(node.input(0));
   op->inputs.push_back(node.input(1));
+  // TODO(ahentz): we currently ignore the third tensor in GatherV2 but we
+  // should read it an pass it on to the TF Lite Interpreter.
   op->outputs.push_back(node.name());
   model->operators.emplace_back(op);
 }
@@ -1928,7 +1963,7 @@ void ConvertTopKV2Operator(const NodeDef& node,
   // K can be encoded as attr (TopK) convert it to a const.
   if (HasAttr(node, "k")) {
     string k_array = CreateConstArray<ArrayDataType::kInt32>(
-        model, node.name() + "k", {GetIntAttr(node, "k")});
+        model, node.name() + "k", {static_cast<int32>(GetIntAttr(node, "k"))});
     op->inputs.push_back(k_array);
   } else {
     CheckInputsCount(node, tf_import_flags, 2);
@@ -2071,6 +2106,8 @@ std::unique_ptr<Model> ImportTensorFlowGraphDef(
       ConvertLRNOperator(node, tf_import_flags, model);
     } else if (node.op() == "Softmax") {
       ConvertSoftmaxOperator(node, tf_import_flags, model);
+    } else if (node.op() == "Log") {
+      ConvertLogOperator(node, tf_import_flags, model);
     } else if (node.op() == "LogSoftmax") {
       ConvertLogSoftmaxOperator(node, tf_import_flags, model);
     } else if (node.op() == "All") {
@@ -2119,7 +2156,7 @@ std::unique_ptr<Model> ImportTensorFlowGraphDef(
       ConvertCastOperator(node, tf_import_flags, model);
     } else if (node.op() == "Floor") {
       ConvertFloorOperator(node, tf_import_flags, model);
-    } else if (node.op() == "Gather") {
+    } else if (node.op() == "Gather" || node.op() == "GatherV2") {
       ConvertGatherOperator(node, tf_import_flags, model);
     } else if (node.op() == "ResizeBilinear") {
       ConvertResizeBilinearOperator(node, tf_import_flags, model);
@@ -2165,6 +2202,8 @@ std::unique_ptr<Model> ImportTensorFlowGraphDef(
     } else if (node.op() == "DynamicStitch" ||
                node.op() == "ParallelDynamicStitch") {
       ConvertDynamicStitchOperator(node, tf_import_flags, model);
+    } else if (node.op() == "RandomUniform") {
+      ConvertRandomUniform(node, tf_import_flags, model);
     } else {
       ConvertUnsupportedOperator(node, tf_import_flags, model);
     }
diff --git a/tensorflow/contrib/lite/toco/model.h b/tensorflow/contrib/lite/toco/model.h
index 5199e292e19c2ac59dcfc2efd9947cc788b0299d..616d53ae3e355c3baea6af37e3d82f7a8058d840 100644
--- a/tensorflow/contrib/lite/toco/model.h
+++ b/tensorflow/contrib/lite/toco/model.h
@@ -56,10 +56,12 @@ enum class OperatorType {
   kL2Pool,
   kLstmCell,
   kLocalResponseNormalization,
+  kLog,
   kLogistic,
   kMaxPool,
   kFakeQuant,
   kMul,
+  kRandomUniform,
   kRange,
   kRank,
   kRelu,
@@ -590,6 +592,17 @@ struct LogisticOperator : Operator {
   LogisticOperator() : Operator(OperatorType::kLogistic) {}
 };
 
+// Element-wise natural log operator:
+//   x -> ln(x)
+//
+// Inputs:
+//   inputs[0]: required: the input array
+//
+// TensorFlow equivalent: Log
+struct LogOperator : Operator {
+  LogOperator() : Operator(OperatorType::kLog) {}
+};
+
 // Element-wise Tanh operator:
 //   x -> Tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x))
 //
@@ -711,8 +724,7 @@ struct L2PoolOperator : Operator {
 // The expected [min, max] range of values in a given array.
 // Used for quantization only.
 // This information typically comes from special nodes found in quantized
-// models,
-// see FakeQuantOperator, and is used during quantization to resolve
+// models, see FakeQuantOperator, and is used during quantization to resolve
 // actual quantization parameters (see QuantizationParams).
 struct MinMax {
   double min = 0.;
@@ -740,6 +752,7 @@ inline bool operator==(const MinMax& m1, const MinMax& m2) {
 struct FakeQuantOperator : Operator {
   FakeQuantOperator() : Operator(OperatorType::kFakeQuant) {}
   std::unique_ptr<MinMax> minmax;
+  int num_bits = 8;
 };
 
 // Element-wise division operator.
@@ -946,6 +959,13 @@ struct FloorModOperator : Operator {
   FloorModOperator() : Operator(OperatorType::kFloorMod) {}
 };
 
+struct RandomUniformOperator : Operator {
+  RandomUniformOperator() : Operator(OperatorType::kRandomUniform) {}
+  ArrayDataType dtype = ArrayDataType::kNone;
+  int64 seed;
+  int64 seed2;
+};
+
 // Creates a sequence of numbers that begins at start and extends by increments
 // of delta up to but not including limit.
 //
@@ -1309,6 +1329,15 @@ struct SoftmaxOperator : Operator {
 // TensorFlow equivalent: LogSoftmax
 struct LogSoftmaxOperator : Operator {
   LogSoftmaxOperator() : Operator(OperatorType::kLogSoftmax) {}
+
+  // LogSoftmax can in principal have very large negative output, depending on
+  // the input size.  However, input x_i that is less than x_max-10 is
+  // accumulated as exp(x_i-x_max), which is truncated to zero.
+  //
+  // Since we effectively disregard smallish inputs in the normalizing factor,
+  // we also drop them in the output (set to minimum output), and in doing so
+  // make better use of the quantization range / resolution.
+  static constexpr float kOutputRangeMin = -16.0;
 };
 
 // Cast operator.
@@ -1499,7 +1528,14 @@ class Shape {
 
   // We still have that one convenience accessor to avoid
   // the awkward double bracket issue:  shape.dims()[i].
-  int dims(int i) const { return dims_[i]; }
+  int dims(int i) const {
+    // Always check for out-of-bounds accesses, even in optimized builds where
+    // standard assertions are disabled. Out-of-bounds access here is a common
+    // occurrence.
+    CHECK_GE(i, 0);
+    CHECK_GT(dims_.size(), i);
+    return dims_[i];
+  }
 
   bool operator==(const Shape& comp) const {
     return (this->dims_ == comp.dims());
diff --git a/tensorflow/contrib/lite/toco/model_cmdline_flags.cc b/tensorflow/contrib/lite/toco/model_cmdline_flags.cc
index 4264f21c76e6f4a26d1be710874c0edb96a6ca6d..7bbeab7c9d1e42d28f221f1a1134d9d05fe6ab51 100644
--- a/tensorflow/contrib/lite/toco/model_cmdline_flags.cc
+++ b/tensorflow/contrib/lite/toco/model_cmdline_flags.cc
@@ -160,6 +160,16 @@ bool ParseModelFlagsFromCommandLineFlags(
           "Path to an optional file containing a serialized ArraysExtraInfo "
           "proto allowing to pass extra information about arrays not specified "
           "in the input model file, such as extra MinMax information."),
+      Flag("model_flags_file", parsed_flags.model_flags_file.bind(),
+           parsed_flags.model_flags_file.default_value(),
+           "Path to an optional file containing a serialized ModelFlags proto. "
+           "Options specified on the command line will override the values in "
+           "the proto."),
+      Flag("change_concat_input_ranges",
+           parsed_flags.change_concat_input_ranges.bind(),
+           parsed_flags.change_concat_input_ranges.default_value(),
+           "Boolean to change the behavior of min/max ranges for inputs and"
+           " output of the concat operators."),
   };
   bool asked_for_help =
       *argc == 2 && (!strcmp(argv[1], "--help") || !strcmp(argv[1], "-help"));
@@ -182,7 +192,24 @@ void ReadModelFlagsFromCommandLineFlags(
     const ParsedModelFlags& parsed_model_flags, ModelFlags* model_flags) {
   toco::port::CheckInitGoogleIsDone("InitGoogle is not done yet");
 
-// "batch" flag only exists internally
+  // Load proto containing the initial model flags.
+  // Additional flags specified on the command line will overwrite the values.
+  if (parsed_model_flags.model_flags_file.specified()) {
+    string model_flags_file_contents;
+    QCHECK(port::file::GetContents(parsed_model_flags.model_flags_file.value(),
+                                   &model_flags_file_contents,
+                                   port::file::Defaults())
+               .ok())
+        << "Specified --model_flags_file="
+        << parsed_model_flags.model_flags_file.value()
+        << " was not found or could not be read";
+    QCHECK(ParseFromStringEitherTextOrBinary(model_flags_file_contents,
+                                             model_flags))
+        << "Specified --model_flags_file="
+        << parsed_model_flags.model_flags_file.value()
+        << " could not be parsed";
+  }
+
 #ifdef PLATFORM_GOOGLE
   CHECK(!((base::SpecifiedOnCommandLine("batch") &&
            parsed_model_flags.variable_batch.specified())))
@@ -377,12 +404,15 @@ void ReadModelFlagsFromCommandLineFlags(
       parsed_model_flags.allow_nonascii_arrays.value());
   model_flags->set_allow_nonexistent_arrays(
       parsed_model_flags.allow_nonexistent_arrays.value());
+  model_flags->set_change_concat_input_ranges(
+      parsed_model_flags.change_concat_input_ranges.value());
 
   if (parsed_model_flags.arrays_extra_info_file.specified()) {
     string arrays_extra_info_file_contents;
-    port::file::GetContents(parsed_model_flags.arrays_extra_info_file.value(),
-                            &arrays_extra_info_file_contents,
-                            port::file::Defaults());
+    CHECK(port::file::GetContents(
+              parsed_model_flags.arrays_extra_info_file.value(),
+              &arrays_extra_info_file_contents, port::file::Defaults())
+              .ok());
     ParseFromStringEitherTextOrBinary(arrays_extra_info_file_contents,
                                       model_flags->mutable_arrays_extra_info());
   }
diff --git a/tensorflow/contrib/lite/toco/model_flags.proto b/tensorflow/contrib/lite/toco/model_flags.proto
index 42e0f54826dd809a801a8ac1bfd0a5a7660382a8..d23e80c464c9fe9d717d4af8093fa5dee04dca6d 100644
--- a/tensorflow/contrib/lite/toco/model_flags.proto
+++ b/tensorflow/contrib/lite/toco/model_flags.proto
@@ -98,8 +98,8 @@ message ArraysExtraInfo {
   message Entry {
     // Next ID to use: 7.
     optional string name = 1;
-    optional float min = 2;
-    optional float max = 3;
+    optional double min = 2;
+    optional double max = 3;
     optional IODataType data_type = 4;
     optional InputArrayShape shape = 5;
     optional float constant_float_value = 6;
@@ -128,7 +128,7 @@ message ArraysExtraInfo {
 //   optional int32 input_dims = 11 [ default = 4];
 //   repeated int32 input_shape = 13;
 //
-// Next ID to USE: 19.
+// Next ID to USE: 20.
 message ModelFlags {
   // Information about the input arrays, i.e. the arrays from which input
   // activations will be read.
@@ -175,4 +175,8 @@ message ModelFlags {
   // If set, this ArraysExtraInfo allows to pass extra information about arrays
   // not specified in the input model file, such as extra MinMax information.
   optional ArraysExtraInfo arrays_extra_info = 18;
+
+  // When set to false, toco will not change the input ranges and the output
+  // ranges of concat operator to the overlap of all input ranges.
+  optional bool change_concat_input_ranges = 19 [default = true];
 }
diff --git a/tensorflow/contrib/lite/toco/python/BUILD b/tensorflow/contrib/lite/toco/python/BUILD
index 17115047d2ef93cce7004926c2b1a4bfa58f6243..6c4f8e12cdd5b3222997c4a2b0ac243cc74324e0 100644
--- a/tensorflow/contrib/lite/toco/python/BUILD
+++ b/tensorflow/contrib/lite/toco/python/BUILD
@@ -45,9 +45,6 @@ py_binary(
     name = "toco_wrapper",
     srcs = ["toco_wrapper.py"],
     srcs_version = "PY2AND3",
-    deps = [
-        "//tensorflow:tensorflow_py",
-    ],
 )
 
 tf_py_test(
@@ -63,15 +60,3 @@ tf_py_test(
     ],
     tags = ["no_pip"],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/lite/toco/python/toco_from_protos_test.py b/tensorflow/contrib/lite/toco/python/toco_from_protos_test.py
index c35b6f99259b762aa83d92d21512169a7ab50b70..3761e0095ebb06b9e26eca55a36718b92058e47b 100644
--- a/tensorflow/contrib/lite/toco/python/toco_from_protos_test.py
+++ b/tensorflow/contrib/lite/toco/python/toco_from_protos_test.py
@@ -50,6 +50,7 @@ class TocoFromProtosTest(googletest.TestCase):
     toco_flags.output_format = toco_flags_pb2.TFLITE
     toco_flags.inference_input_type = types_pb2.FLOAT
     toco_flags.inference_type = types_pb2.FLOAT
+    toco_flags.allow_custom_ops = True;
     model_flags = model_flags_pb2.ModelFlags()
     input_array = model_flags.input_arrays.add()
     input_array.name = TensorName(in_tensor)
diff --git a/tensorflow/contrib/lite/toco/python/toco_python_api.cc b/tensorflow/contrib/lite/toco/python/toco_python_api.cc
index 8a5e483f3f1676ebed3244bd6f7eb610fad21557..153c117d17e4564d7cb0aaea64d792f63a587d91 100644
--- a/tensorflow/contrib/lite/toco/python/toco_python_api.cc
+++ b/tensorflow/contrib/lite/toco/python/toco_python_api.cc
@@ -75,7 +75,8 @@ PyObject* TocoConvert(PyObject* model_flags_proto_txt_raw,
       toco::Import(toco_flags, model_flags, input_contents_txt);
   toco::Transform(toco_flags, model.get());
   string output_file_contents_txt;
-  Export(toco_flags, *model, &output_file_contents_txt);
+  Export(toco_flags, *model, toco_flags.allow_custom_ops(),
+         &output_file_contents_txt);
 
   // Convert arguments back to byte (py3) or str (py2)
   return TOCO_FROM_CPPSTRING_TO_PY(output_file_contents_txt.data(),
diff --git a/tensorflow/contrib/lite/toco/python/toco_wrapper.py b/tensorflow/contrib/lite/toco/python/toco_wrapper.py
index e39b5f22c7c8ffafaf72129be6f54090e6761dc3..6d6b500d7eccd353f566a4bad76df35e0e849d95 100644
--- a/tensorflow/contrib/lite/toco/python/toco_wrapper.py
+++ b/tensorflow/contrib/lite/toco/python/toco_wrapper.py
@@ -22,14 +22,19 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import os
 import sys
-import tensorflow as tf
 
 
 def main():
   # Pip installs the binary in aux-bin off of main site-package install.
   # Just find it and exec, passing all arguments in the process.
   # TODO(aselle): it is unfortunate to use all of tensorflow to lookup binary.
-  binary = os.path.join(tf.__path__[0], 'aux-bin/toco')
-  os.execvp(binary, sys.argv)
+  print("""TOCO from pip install is currently not working on command line.
+Please use the python TOCO API or use
+bazel run tensorflow/contrib/lite:toco -- <args> from a TensorFlow source dir.
+""")
+  sys.exit(1)
+  # TODO(aselle): Replace this when we find a way to run toco without
+  # blowing up executable size.
+  # binary = os.path.join(tf.__path__[0], 'aux-bin/toco')
+  # os.execvp(binary, sys.argv)
diff --git a/tensorflow/contrib/lite/toco/tensorflow_graph_matching/BUILD b/tensorflow/contrib/lite/toco/tensorflow_graph_matching/BUILD
index 0c1a1141fca91e7d27fe48ffae4f834ae92a1e08..336e94de1ed3238d64f521cf1347acc8f0737de7 100644
--- a/tensorflow/contrib/lite/toco/tensorflow_graph_matching/BUILD
+++ b/tensorflow/contrib/lite/toco/tensorflow_graph_matching/BUILD
@@ -88,15 +88,3 @@ cc_library(
         "//tensorflow/core:protos_all_cc",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/lite/toco/tflite/BUILD b/tensorflow/contrib/lite/toco/tflite/BUILD
index 9d3e1daf1258c6bc076dac566129174430bb761d..e0191801a0f0076565c51085ec293524d63cbe88 100644
--- a/tensorflow/contrib/lite/toco/tflite/BUILD
+++ b/tensorflow/contrib/lite/toco/tflite/BUILD
@@ -137,15 +137,3 @@ tf_cc_test(
         "@flatbuffers",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/lite/toco/tflite/export.cc b/tensorflow/contrib/lite/toco/tflite/export.cc
index 27719599708a7eb14f72a82f8e5d76b3b8af9dc4..335b496dccdbdb7e342515868e1d7195c98f0351 100644
--- a/tensorflow/contrib/lite/toco/tflite/export.cc
+++ b/tensorflow/contrib/lite/toco/tflite/export.cc
@@ -300,6 +300,17 @@ void Export(const Model& model, bool allow_custom_ops,
   std::set<string> error_summary;
   auto op_codes = ExportOperatorCodes(model, ops_by_type, operators_map,
                                       &builder, &error_summary);
+  const string fake_quant_operation_name = "FAKE_QUANT";
+  if (error_summary.count(fake_quant_operation_name) != 0) {
+    LOG(ERROR)
+        << fake_quant_operation_name
+        << " operation was not converted. If running quantized make sure you "
+           "are passing --inference_type=QUANTIZED_UINT8 and values for "
+           "--std_values and --mean_values.";
+    // Remove the fake quant operation from the errors, since it shouldn't
+    // be provided a custom implementation.
+    error_summary.erase(fake_quant_operation_name);
+  }
   if (!allow_custom_ops && !error_summary.empty()) {
     LOG(QFATAL) << "Some of the operators in the model are not supported by "
                    "the standard TensorFlow Lite runtime. If you have a custom "
diff --git a/tensorflow/contrib/lite/toco/tflite/operator.cc b/tensorflow/contrib/lite/toco/tflite/operator.cc
index 0989bfe5a3de9a7c0f62b272b0be84df1f4ddcb0..0e057fd2527fd7057a5ac9c9e14b6e793eebc849 100644
--- a/tensorflow/contrib/lite/toco/tflite/operator.cc
+++ b/tensorflow/contrib/lite/toco/tflite/operator.cc
@@ -204,17 +204,22 @@ class BatchToSpaceND
                    TocoOperator* op) const override {}
 };
 
-class Cast : public CustomOperator<CastOperator> {
+class Cast : public BuiltinOperator<CastOperator, ::tflite::CastOptions,
+                                    ::tflite::BuiltinOptions_CastOptions> {
  public:
-  using CustomOperator::CustomOperator;
-  void WriteOptions(const TocoOperator& op,
-                    flexbuffers::Builder* fbb) const override {
-    fbb->Int("src_data_type", DataType::Serialize(op.src_data_type));
-    fbb->Int("dst_data_type", DataType::Serialize(op.dst_data_type));
+  using BuiltinOperator::BuiltinOperator;
+  flatbuffers::Offset<TfLiteOptions> WriteOptions(
+      const TocoOperator& op,
+      flatbuffers::FlatBufferBuilder* builder) const override {
+    return ::tflite::CreateCastOptions(*builder,
+                                       DataType::Serialize(op.src_data_type),
+                                       DataType::Serialize(op.dst_data_type));
   }
-  void ReadOptions(const flexbuffers::Map& m, TocoOperator* op) const override {
-    op->src_data_type = DataType::Deserialize(m["src_data_type"].AsInt64());
-    op->dst_data_type = DataType::Deserialize(m["dst_data_type"].AsInt64());
+
+  void ReadOptions(const TfLiteOptions& options,
+                   TocoOperator* op) const override {
+    op->src_data_type = DataType::Deserialize(options.in_data_type());
+    op->dst_data_type = DataType::Deserialize(options.out_data_type());
   }
 };
 
@@ -255,12 +260,15 @@ class FakeQuant : public CustomOperator<FakeQuantOperator> {
                     flexbuffers::Builder* fbb) const override {
     fbb->Float("min", op.minmax->min);
     fbb->Float("max", op.minmax->max);
+    fbb->Int("num_bits", op.num_bits);
   }
   void ReadOptions(const flexbuffers::Map& m, TocoOperator* op) const override {
     auto* minmax = new MinMax;
     minmax->min = m["min"].AsFloat();
     minmax->max = m["max"].AsFloat();
     op->minmax.reset(minmax);
+    const auto& num_bits = m["num_bits"];
+    op->num_bits = num_bits.IsInt() ? num_bits.AsInt32() : 8;
   }
 };
 
@@ -657,6 +665,23 @@ class TopK_V2 : public BuiltinOperator<TopKV2Operator, ::tflite::TopKV2Options,
                    TocoOperator* op) const override {}
 };
 
+class ArgMax : public BuiltinOperator<ArgMaxOperator, ::tflite::ArgMaxOptions,
+                                      ::tflite::BuiltinOptions_ArgMaxOptions> {
+ public:
+  using BuiltinOperator::BuiltinOperator;
+  flatbuffers::Offset<TfLiteOptions> WriteOptions(
+      const TocoOperator& op,
+      flatbuffers::FlatBufferBuilder* builder) const override {
+    return ::tflite::CreateArgMaxOptions(
+        *builder, DataType::Serialize(op.output_data_type));
+  }
+
+  void ReadOptions(const TfLiteOptions& options,
+                   TocoOperator* op) const override {
+    op->output_data_type = DataType::Deserialize(options.output_type());
+  }
+};
+
 class TensorFlowUnsupported : public BaseOperator {
  public:
   using BaseOperator::BaseOperator;
@@ -827,9 +852,12 @@ std::vector<std::unique_ptr<BaseOperator>> BuildOperatorList() {
       new TopK_V2(::tflite::BuiltinOperator_TOPK_V2, OperatorType::kTopK_V2));
   ops.emplace_back(
       new Lstm(::tflite::BuiltinOperator_LSTM, OperatorType::kLstmCell));
+  ops.emplace_back(
+      new Cast(::tflite::BuiltinOperator_CAST, OperatorType::kCast));
+  ops.emplace_back(
+      new ArgMax(::tflite::BuiltinOperator_ARG_MAX, OperatorType::kArgMax));
 
   // Custom Operators.
-  ops.emplace_back(new Cast("CAST", OperatorType::kCast));
   ops.emplace_back(
       new DepthToSpace("DEPTH_TO_SPACE", OperatorType::kDepthToSpace));
   ops.emplace_back(new FakeQuant("FAKE_QUANT", OperatorType::kFakeQuant));
@@ -855,7 +883,7 @@ std::vector<std::unique_ptr<BaseOperator>> BuildOperatorList() {
   ops.emplace_back(
       new SimpleOperator<Relu6Operator>("RELU6", OperatorType::kRelu6));
   ops.emplace_back(
-      new SimpleOperator<Relu1Operator>("PRELU", OperatorType::kPRelu));
+      new SimpleOperator<PReluOperator>("PRELU", OperatorType::kPRelu));
   ops.emplace_back(new SimpleOperator<LogisticOperator>(
       "LOGISTIC", OperatorType::kLogistic));
   ops.emplace_back(
@@ -865,6 +893,8 @@ std::vector<std::unique_ptr<BaseOperator>> BuildOperatorList() {
       "LOG_SOFTMAX", OperatorType::kLogSoftmax));
   ops.emplace_back(new SimpleOperator<TensorFlowMaximumOperator>(
       "MAXIMUM", OperatorType::kTensorFlowMaximum));
+  ops.emplace_back(new SimpleOperator<TensorFlowMinimumOperator>(
+      "MINIMUM", OperatorType::kTensorFlowMinimum));
 
   return ops;
 }
diff --git a/tensorflow/contrib/lite/toco/tflite/operator_test.cc b/tensorflow/contrib/lite/toco/tflite/operator_test.cc
index f7a213ecfc539e009f78e7c0e424d36a38b3486c..a947630e28458f3f2c73cd62c52ba8c1e40e9912 100644
--- a/tensorflow/contrib/lite/toco/tflite/operator_test.cc
+++ b/tensorflow/contrib/lite/toco/tflite/operator_test.cc
@@ -111,6 +111,8 @@ TEST_F(OperatorTest, SimpleOperators) {
                                           OperatorType::kLogSoftmax);
   CheckSimpleOperator<TensorFlowMaximumOperator>(
       "MAXIMUM", OperatorType::kTensorFlowMaximum);
+  CheckSimpleOperator<TensorFlowMinimumOperator>(
+      "MINIMUM", OperatorType::kTensorFlowMinimum);
 }
 
 TEST_F(OperatorTest, BuiltinAdd) {
@@ -131,7 +133,7 @@ TEST_F(OperatorTest, BuiltinMean) {
   EXPECT_EQ(op.keep_dims, output_toco_op->keep_dims);
 }
 
-TEST_F(OperatorTest, CustomCast) {
+TEST_F(OperatorTest, BuiltinCast) {
   CastOperator op;
   op.src_data_type = ArrayDataType::kFloat;
   op.dst_data_type = ArrayDataType::kUint8;
@@ -163,10 +165,12 @@ TEST_F(OperatorTest, CustomFakeQuant) {
   minmax->min = -10;
   minmax->max = 200;
   op.minmax.reset(minmax);
+  op.num_bits = 16;
   auto output_toco_op = SerializeAndDeserialize(
       GetOperator("FAKE_QUANT", OperatorType::kFakeQuant), op);
   EXPECT_EQ(op.minmax->min, output_toco_op->minmax->min);
   EXPECT_EQ(op.minmax->max, output_toco_op->minmax->max);
+  EXPECT_EQ(op.num_bits, output_toco_op->num_bits);
 }
 
 TEST_F(OperatorTest, CustomFullyConnected) {
@@ -391,6 +395,13 @@ TEST_F(OperatorTest, BuiltinTopKV2) {
   ASSERT_NE(nullptr, output_toco_op.get());
 }
 
+TEST_F(OperatorTest, BuiltinArgMax) {
+  ArgMaxOperator op;
+  auto output_toco_op = SerializeAndDeserialize(
+      GetOperator("ARG_MAX", OperatorType::kArgMax), op);
+  EXPECT_EQ(op.output_data_type, output_toco_op->output_data_type);
+}
+
 TEST_F(OperatorTest, TensorFlowUnsupported) {
   TensorFlowUnsupportedOperator op;
   op.tensorflow_op = "MyCustomUnsupportedOp";
diff --git a/tensorflow/contrib/lite/toco/toco_saved_model.cc b/tensorflow/contrib/lite/toco/toco_saved_model.cc
index 91a742b9e0d3c7ba5b5b955a3da27d7bf3d48871..26f55a66c729894a990258080e397bb42ea98a13 100644
--- a/tensorflow/contrib/lite/toco/toco_saved_model.cc
+++ b/tensorflow/contrib/lite/toco/toco_saved_model.cc
@@ -35,9 +35,12 @@ const tensorflow::SavedModelBundle* LoadSavedModel(
       << "Model is not saved in the supported SavedModel format.\n";
 
   // Gets the tags identifying the MetaGraphDef from the command line arguments.
-  QCHECK(parsed_toco_flags.savedmodel_tagset.specified())
-      << "Missing required flag --savedmodel_tagset.\n";
-  const string tags_str = parsed_toco_flags.savedmodel_tagset.value();
+  string tags_str;
+  if (parsed_toco_flags.savedmodel_tagset.specified()) {
+    tags_str = parsed_toco_flags.savedmodel_tagset.value();
+  } else {
+    tags_str = parsed_toco_flags.savedmodel_tagset.default_value();
+  }
   auto tags = absl::StrSplit(tags_str, ',');
 
   // Loads MetaGraphDef.
diff --git a/tensorflow/contrib/lite/toco/toco_tooling.cc b/tensorflow/contrib/lite/toco/toco_tooling.cc
index 30dd6fab9ebbad9c2add7f830f9b58a73f41714b..96c5ebd64f3343d454a9c445a6edcd318b08c953 100644
--- a/tensorflow/contrib/lite/toco/toco_tooling.cc
+++ b/tensorflow/contrib/lite/toco/toco_tooling.cc
@@ -74,11 +74,14 @@ void MakeGeneralGraphTransformationsSet(
   transformations->Add(new ResolveTensorFlowMatMul);
   transformations->Add(new FuseBinaryIntoPrecedingAffine);
   transformations->Add(new FuseBinaryIntoFollowingAffine);
-  transformations->Add(new ReorderActivationFunctions);
+  transformations->Add(new MergeReshapeIntoPrecedingTranspose);
+  transformations->Add(new ReorderElementwiseUnary);
+  transformations->Add(new ReorderReshapeTranspose);
   transformations->Add(new ResolveBatchNormalization);
   transformations->Add(new ResolveConstantBinaryOperator);
   transformations->Add(new ResolveConstantFill);
   transformations->Add(new ResolveConstantGather);
+  transformations->Add(new ResolveConstantRandomUniform);
   transformations->Add(new ResolveConstantRange);
   transformations->Add(new ResolveConstantStack);
   transformations->Add(new ResolveConstantStridedSlice);
@@ -127,20 +130,26 @@ bool SupportsPreallocatedWorkspace(FileFormat format) {
 }
 
 bool IsRealValued(toco::ArrayDataType type) {
+  // TODO(benoitjacob) - this is hardcoding that uint8 and int16 are only used
+  // for quantized real-number values, and no other integer type is ever used
+  // for that. This is dirty, should be resolved as part of a more general push
+  // to more explicitly distinguish between true-integers and
+  // integers used as quantized values representing real numbers.
   return static_cast<bool>(type == toco::ArrayDataType::kFloat ||
-                           type == toco::ArrayDataType::kUint8);
+                           type == toco::ArrayDataType::kUint8 ||
+                           type == toco::ArrayDataType::kInt16);
 }
 
 void SetFinalDataTypeOnInputs(const TocoFlags& toco_flags, Model* model) {
   const FileFormat output_format = toco_flags.output_format();
   ArrayDataType type;
-  if (toco_flags.has_inference_input_type()) {
+  if (!SupportsQuantization(output_format)) {
+    // Data type is implicitly float for non-quantized formats
+    type = ArrayDataType::kFloat;
+  } else if (toco_flags.has_inference_input_type()) {
     type = ConvertIODataTypeToArrayDataType(toco_flags.inference_input_type());
   } else if (toco_flags.has_inference_type()) {
     type = ConvertIODataTypeToArrayDataType(toco_flags.inference_type());
-  } else if (!SupportsQuantization(output_format)) {
-    // Data type is implicitly float for non-quantized formats
-    type = ArrayDataType::kFloat;
   } else {
     // Nothing to do. Data types stay as-is.
     return;
@@ -195,11 +204,6 @@ std::unique_ptr<Model> Import(const TocoFlags& toco_flags,
 }
 
 void Transform(const TocoFlags& toco_flags, Model* model) {
-  // Clean up after import.
-  SetFinalDataTypeOnInputs(toco_flags, model);
-  UseArraysExtraInfo(model);
-  FinishBuildingRNNStates(model);
-
   const FileFormat output_format = toco_flags.output_format();
   const IODataType inference_type = toco_flags.inference_type();
 
@@ -212,6 +216,11 @@ void Transform(const TocoFlags& toco_flags, Model* model) {
         << "Quantized inference is not allowed with float inputs.";
   }
 
+  // Clean up after import.
+  SetFinalDataTypeOnInputs(toco_flags, model);
+  UseArraysExtraInfo(model, quantize_output);
+  FinishBuildingRNNStates(model);
+
   // Remove unused ops before performing any other optimizations. This is to
   // stop optimizations from crossing the input/output boundaries. For example
   // this will stop BatchNorm fusing if the output node is in between a conv
diff --git a/tensorflow/contrib/lite/toco/tooling_util.cc b/tensorflow/contrib/lite/toco/tooling_util.cc
index f3f50487ff74904bf3708fa4c86f522997b55ca0..bd2d5f7df039b61d50c3f7ab9fa7d5d9efd75afb 100644
--- a/tensorflow/contrib/lite/toco/tooling_util.cc
+++ b/tensorflow/contrib/lite/toco/tooling_util.cc
@@ -291,12 +291,14 @@ const char* OperatorTypeName(OperatorType type) {
     HANDLE_OPERATORTYPENAME_CASE(Dequantize)
     HANDLE_OPERATORTYPENAME_CASE(L2Normalization)
     HANDLE_OPERATORTYPENAME_CASE(LocalResponseNormalization)
+    HANDLE_OPERATORTYPENAME_CASE(Log)
     HANDLE_OPERATORTYPENAME_CASE(Logistic)
     HANDLE_OPERATORTYPENAME_CASE(LstmCell)
     HANDLE_OPERATORTYPENAME_CASE(MaxPool)
     HANDLE_OPERATORTYPENAME_CASE(L2Pool)
     HANDLE_OPERATORTYPENAME_CASE(FakeQuant)
     HANDLE_OPERATORTYPENAME_CASE(Mul)
+    HANDLE_OPERATORTYPENAME_CASE(RandomUniform)
     HANDLE_OPERATORTYPENAME_CASE(Relu)
     HANDLE_OPERATORTYPENAME_CASE(Relu1)
     HANDLE_OPERATORTYPENAME_CASE(Relu6)
@@ -1377,12 +1379,22 @@ void ResolveModelFlags(const ModelFlags& model_flags, Model* model) {
     const float mean_value = input_array_proto.mean_value();
     const float std_value = input_array_proto.std_value();
     MinMax input_minmax;
-    input_minmax.min = (0.f - mean_value) / std_value;
-    input_minmax.max = (255.f - mean_value) / std_value;
+    float qmin = 0, qmax = 255;
+    if (input_array.data_type == ArrayDataType::kInt16) {
+      qmin = -32768;
+      qmax = 32767;
+    }
+    input_minmax.min = (qmin - mean_value) / std_value;
+    input_minmax.max = (qmax - mean_value) / std_value;
     if (input_array.minmax) {
       if (input_array_proto.has_mean_value() ||
           input_array_proto.has_std_value()) {
-        CHECK(input_minmax == *input_array.minmax)
+        const double width = input_minmax.max - input_minmax.min;
+        const double kMinMaxAllowedDiff = 1e-6 * width;
+        CHECK(std::abs(input_minmax.min - input_array.minmax->min) <
+                  kMinMaxAllowedDiff &&
+              std::abs(input_minmax.max - input_array.minmax->max) <
+                  kMinMaxAllowedDiff)
             << input_minmax.min << ", " << input_minmax.max
             << " != " << input_array.minmax->min << ", "
             << input_array.minmax->max;
@@ -1402,7 +1414,8 @@ void ResolveModelFlags(const ModelFlags& model_flags, Model* model) {
       CHECK(input_array.shape().dims_size());
     }
   }
-
+  model->flags.set_change_concat_input_ranges(
+      model_flags.change_concat_input_ranges());
   model->flags.set_allow_nonascii_arrays(model_flags.allow_nonascii_arrays());
   model->flags.set_allow_nonexistent_arrays(
       model_flags.allow_nonexistent_arrays());
@@ -1920,6 +1933,35 @@ bool IsDiscardableArray(const Model& model, const string& array_name) {
   return true;
 }
 
+bool ReshapeIsEquivalentToTranspose(const Model& model,
+                                    const TensorFlowReshapeOperator* op,
+                                    bool allow_extra_unary_dims) {
+  CHECK(!op->shape.empty());
+  CHECK(model.HasArray(op->inputs[0]));
+  CHECK(model.HasArray(op->outputs[0]));
+
+  const auto& input_array = model.GetArray(op->inputs[0]);
+  const auto& output_array = model.GetArray(op->outputs[0]);
+
+  CHECK(input_array.has_shape());
+  CHECK(output_array.has_shape());
+
+  std::vector<int> in_shape = input_array.shape().dims();
+  std::vector<int> out_shape = output_array.shape().dims();
+
+  // If the reshape changes the number of dimensions so it cannot be interpreted
+  // as a transpose.
+  if (!allow_extra_unary_dims && in_shape.size() != out_shape.size()) {
+    return false;
+  }
+
+  in_shape.erase(std::remove(in_shape.begin(), in_shape.end(), 1),
+                 in_shape.end());
+  out_shape.erase(std::remove(out_shape.begin(), out_shape.end(), 1),
+                  out_shape.end());
+  return in_shape == out_shape;
+}
+
 void CheckFinalDataTypesSatisfied(const Model& model) {
   for (const auto& array_entry : model.GetArrayMap()) {
     const auto& array = *array_entry.second;
@@ -1970,19 +2012,19 @@ void FinishBuildingRNNStates(Model* model) {
   }
 }
 
-void UseArraysExtraInfo(Model* model) {
+void UseArraysExtraInfo(Model* model, bool quantize_output) {
   for (const auto& entry : model->flags.arrays_extra_info().entries()) {
     if (!model->HasArray(entry.name())) {
       continue;
     }
     auto& array = model->GetArray(entry.name());
-    auto& minmax = array.GetOrCreateMinMax();
     if (entry.has_min() || entry.has_max()) {
       CHECK_EQ(entry.has_min(), entry.has_max());
+      auto& minmax = array.GetOrCreateMinMax();
       minmax.min = entry.min();
       minmax.max = entry.max();
     }
-    if (entry.has_data_type()) {
+    if (entry.has_data_type() && quantize_output) {
       array.final_data_type =
           ConvertIODataTypeToArrayDataType(entry.data_type());
     }
@@ -1997,11 +2039,12 @@ void UseArraysExtraInfo(Model* model) {
     }
     if (entry.has_constant_float_value()) {
       CHECK(array.has_shape());
-      CHECK(array.data_type == ArrayDataType::kFloat);
-      auto& data = array.GetMutableBuffer<ArrayDataType::kFloat>().data;
-      data.resize(RequiredBufferSizeForShape(array.shape()));
-      for (float& f : data) {
-        f = entry.constant_float_value();
+      if (array.data_type == ArrayDataType::kFloat) {
+        auto& data = array.GetMutableBuffer<ArrayDataType::kFloat>().data;
+        data.resize(RequiredBufferSizeForShape(array.shape()));
+        for (float& f : data) {
+          f = entry.constant_float_value();
+        }
       }
     }
   }
diff --git a/tensorflow/contrib/lite/toco/tooling_util.h b/tensorflow/contrib/lite/toco/tooling_util.h
index d3b7224fe3a773e389ad8fc9a40f0a0fad4debe5..dfd81173c3d3bf31a0ce688ce5434cd37fb959c7 100644
--- a/tensorflow/contrib/lite/toco/tooling_util.h
+++ b/tensorflow/contrib/lite/toco/tooling_util.h
@@ -169,10 +169,23 @@ void GetQuantizationParamsFromMinMax(const MinMax& minmax,
       ::tflite::ChooseQuantizationParams<Integer>(rmin, rmax);
 }
 
+template <typename T>
+T ConvertOperator(Operator* o, OperatorType type) {
+  if (o != nullptr && o->type == type) {
+    return static_cast<T>(o);
+  }
+
+  return nullptr;
+}
+
 void CheckIsReadyForQuantization(const Model& model);
 void UseDefaultMinMaxRangeValues(Model* model, double default_ranges_min,
                                  double default_ranges_max);
 
+bool ReshapeIsEquivalentToTranspose(const Model& model,
+                                    const TensorFlowReshapeOperator* op,
+                                    bool allow_extra_unary_dims);
+
 inline int Offset(const Shape& shape, const std::vector<int>& indices) {
   DCHECK_EQ(shape.dimensions_count(), indices.size());
   const int dims_count = shape.dimensions_count();
@@ -272,7 +285,7 @@ ArrayDataType ConvertIODataTypeToArrayDataType(IODataType type);
 // already quantized, then case (a) should hold.
 void FinishBuildingRNNStates(Model* model);
 
-void UseArraysExtraInfo(Model* model);
+void UseArraysExtraInfo(Model* model, bool quantize_output);
 
 }  // namespace toco
 
diff --git a/tensorflow/contrib/lite/tools/BUILD b/tensorflow/contrib/lite/tools/BUILD
index b5abbc0712599814e078d19bc015bc7bf1812f95..44fde69a1e1536b8d2ecff16876248cfe66a9b8a 100644
--- a/tensorflow/contrib/lite/tools/BUILD
+++ b/tensorflow/contrib/lite/tools/BUILD
@@ -91,18 +91,6 @@ cc_library(
     deps = ["//tensorflow/contrib/lite:framework"],
 )
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 cc_library(
     name = "verifier",
     srcs = ["verifier.cc"],
diff --git a/tensorflow/contrib/lookup/BUILD b/tensorflow/contrib/lookup/BUILD
index 8ca03f4193f260ce32f942ccaf76a8260b282156..f616207d462954341dd0c4b2722471b50c06c917 100644
--- a/tensorflow/contrib/lookup/BUILD
+++ b/tensorflow/contrib/lookup/BUILD
@@ -46,16 +46,5 @@ tf_py_test(
         "//tensorflow/python:variables",
     ],
     grpc_enabled = True,
-)
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
+    tags = ["no_windows"],  # TODO: needs investigation on Windows
 )
diff --git a/tensorflow/contrib/lookup/lookup_ops.py b/tensorflow/contrib/lookup/lookup_ops.py
index a03e731be32c5964cb4aece8e8a67525883a4e7c..4942d941765951ed2ee5555138e91a202b96bf7c 100644
--- a/tensorflow/contrib/lookup/lookup_ops.py
+++ b/tensorflow/contrib/lookup/lookup_ops.py
@@ -298,7 +298,7 @@ class MutableHashTable(LookupInterface):
   table = tf.contrib.lookup.MutableHashTable(key_dtype=tf.string,
                                              value_dtype=tf.int64,
                                              default_value=-1)
-  table.insert(keys, values)
+  sess.run(table.insert(keys, values))
   out = table.lookup(query_keys)
   print(out.eval())
   ```
diff --git a/tensorflow/contrib/losses/BUILD b/tensorflow/contrib/losses/BUILD
index 56942115213a762e532971a81da768b53b8537d8..728f75f8ef1eb3b107dbd0ab4ffbecd63787bf3e 100644
--- a/tensorflow/contrib/losses/BUILD
+++ b/tensorflow/contrib/losses/BUILD
@@ -97,15 +97,3 @@ py_test(
         "//third_party/py/numpy",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/makefile/BUILD b/tensorflow/contrib/makefile/BUILD
index 701eeb44fe3f814cb3fb1cedd8618753946cc3e5..1abb46f4d41d2a9cc60d0cd9de865070689ddbfc 100644
--- a/tensorflow/contrib/makefile/BUILD
+++ b/tensorflow/contrib/makefile/BUILD
@@ -3,12 +3,3 @@
 licenses(["notice"])  # Apache 2.0
 
 package(default_visibility = ["//visibility:private"])
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = ["**/OWNERS"],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/makefile/proto_text_cc_files.txt b/tensorflow/contrib/makefile/proto_text_cc_files.txt
index 77c936d8c5b99033ff5c5e149a6ce6613b603132..76428bc1d4e682e000998a6e28fc290e218c2341 100644
--- a/tensorflow/contrib/makefile/proto_text_cc_files.txt
+++ b/tensorflow/contrib/makefile/proto_text_cc_files.txt
@@ -12,6 +12,7 @@ tensorflow/core/platform/posix/env.cc
 tensorflow/core/platform/posix/load_library.cc
 tensorflow/core/platform/posix/env_time.cc
 tensorflow/core/platform/file_system.cc
+tensorflow/core/platform/file_system_helper.cc
 tensorflow/core/platform/env.cc
 tensorflow/core/platform/env_time.cc
 tensorflow/core/platform/setround.cc
diff --git a/tensorflow/contrib/makefile/tf_op_files.txt b/tensorflow/contrib/makefile/tf_op_files.txt
index 7a7683c95369aa929d93591e6bf78fd945ce36bc..d4c3f2eda8be0c70e961afe582983b9f73769c77 100644
--- a/tensorflow/contrib/makefile/tf_op_files.txt
+++ b/tensorflow/contrib/makefile/tf_op_files.txt
@@ -151,6 +151,7 @@ tensorflow/core/kernels/decode_bmp_op.cc
 tensorflow/core/kernels/depthtospace_op.cc
 tensorflow/core/kernels/data_format_ops.cc
 tensorflow/core/kernels/spacetodepth_op.cc
+tensorflow/core/kernels/dense_update_functor.cc
 tensorflow/core/kernels/dense_update_ops.cc
 tensorflow/core/kernels/deep_conv2d.cc
 tensorflow/core/kernels/decode_wav_op.cc
@@ -228,6 +229,11 @@ tensorflow/core/kernels/cast_op_impl_int64.cc
 tensorflow/core/kernels/cast_op_impl_int8.cc
 tensorflow/core/kernels/cast_op_impl_uint16.cc
 tensorflow/core/kernels/cast_op_impl_uint8.cc
+tensorflow/core/kernels/boosted_trees/prediction_ops.cc
+tensorflow/core/kernels/boosted_trees/resource_ops.cc
+tensorflow/core/kernels/boosted_trees/resources.cc
+tensorflow/core/kernels/boosted_trees/stats_ops.cc
+tensorflow/core/kernels/boosted_trees/training_ops.cc
 tensorflow/core/kernels/bias_op.cc
 tensorflow/core/kernels/bcast_ops.cc
 tensorflow/core/kernels/batch_norm_op.cc
@@ -286,6 +292,7 @@ tensorflow/core/ops/data_flow_ops.cc
 tensorflow/core/ops/ctc_ops.cc
 tensorflow/core/ops/control_flow_ops.cc
 tensorflow/core/ops/candidate_sampling_ops.cc
+tensorflow/core/ops/boosted_trees_ops.cc
 tensorflow/core/ops/array_ops.cc
 tensorflow/core/ops/array_grad.cc
 tensorflow/core/kernels/spacetobatch_functor.cc
@@ -295,3 +302,5 @@ tensorflow/core/kernels/warn_about_ints.cc
 tensorflow/core/kernels/segment_reduction_ops.cc
 tensorflow/core/kernels/batch_util.cc
 tensorflow/core/ops/audio_ops.cc
+tensorflow/core/kernels/decode_proto_op.cc
+tensorflow/core/kernels/encode_proto_op.cc
diff --git a/tensorflow/contrib/makefile/tf_proto_files.txt b/tensorflow/contrib/makefile/tf_proto_files.txt
index d569bde637b20e0ca55c48c616855332abd9fb13..1f254692d7a8fb7af3ce795428464c48f5997a54 100644
--- a/tensorflow/contrib/makefile/tf_proto_files.txt
+++ b/tensorflow/contrib/makefile/tf_proto_files.txt
@@ -18,6 +18,7 @@ tensorflow/core/protobuf/device_properties.proto
 tensorflow/core/protobuf/rewriter_config.proto
 tensorflow/core/protobuf/tensor_bundle.proto
 tensorflow/core/lib/core/error_codes.proto
+tensorflow/core/kernels/boosted_trees/boosted_trees.proto
 tensorflow/core/framework/versions.proto
 tensorflow/core/framework/variable.proto
 tensorflow/core/framework/types.proto
diff --git a/tensorflow/contrib/memory_stats/BUILD b/tensorflow/contrib/memory_stats/BUILD
index 72424c32e7b756e6c50965f38135869e03ba730f..63843b993c16363a80b64622af665aaa64e05830 100644
--- a/tensorflow/contrib/memory_stats/BUILD
+++ b/tensorflow/contrib/memory_stats/BUILD
@@ -79,15 +79,3 @@ cuda_py_test(
         "//tensorflow/python:random_ops",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/meta_graph_transform/BUILD b/tensorflow/contrib/meta_graph_transform/BUILD
index 4b5b1c3e15d36b7602791856416ece54d24798b2..24400789f8a937c88b86141704f7977494c1495e 100644
--- a/tensorflow/contrib/meta_graph_transform/BUILD
+++ b/tensorflow/contrib/meta_graph_transform/BUILD
@@ -59,15 +59,3 @@ filegroup(
         "**/*.py",
     ]),
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/metrics/BUILD b/tensorflow/contrib/metrics/BUILD
index e90c525113348532a3ebdadde7e712bf2d98cee9..5ca42f41c1c5055bf1917ad175b7b30666b18d4b 100644
--- a/tensorflow/contrib/metrics/BUILD
+++ b/tensorflow/contrib/metrics/BUILD
@@ -97,14 +97,3 @@ py_test(
         "//third_party/py/numpy",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-)
diff --git a/tensorflow/contrib/metrics/python/ops/metric_ops.py b/tensorflow/contrib/metrics/python/ops/metric_ops.py
index 81f05e7ce587ed1da67a17efbbeb809dbe7fc0b3..2bf281b7916e296660089234c8487f188a597e5d 100644
--- a/tensorflow/contrib/metrics/python/ops/metric_ops.py
+++ b/tensorflow/contrib/metrics/python/ops/metric_ops.py
@@ -62,7 +62,8 @@ def _safe_div(numerator, denominator, name):
       0,
       name=name)
 
-
+@deprecated(None, 'Please switch to tf.metrics.true_positives. Note that the '
+            'order of the labels and predictions arguments has been switched.')
 def streaming_true_positives(predictions,
                              labels,
                              weights=None,
@@ -106,7 +107,8 @@ def streaming_true_positives(predictions,
       updates_collections=updates_collections,
       name=name)
 
-
+@deprecated(None, 'Please switch to tf.metrics.true_negatives. Note that the '
+            'order of the labels and predictions arguments has been switched.')
 def streaming_true_negatives(predictions,
                              labels,
                              weights=None,
@@ -150,7 +152,8 @@ def streaming_true_negatives(predictions,
       updates_collections=updates_collections,
       name=name)
 
-
+@deprecated(None, 'Please switch to tf.metrics.false_positives. Note that the '
+            'order of the labels and predictions arguments has been switched.')
 def streaming_false_positives(predictions,
                               labels,
                               weights=None,
@@ -194,7 +197,8 @@ def streaming_false_positives(predictions,
       updates_collections=updates_collections,
       name=name)
 
-
+@deprecated(None, 'Please switch to tf.metrics.false_negatives. Note that the '
+            'order of the labels and predictions arguments has been switched.')
 def streaming_false_negatives(predictions,
                               labels,
                               weights=None,
@@ -237,7 +241,7 @@ def streaming_false_negatives(predictions,
       updates_collections=updates_collections,
       name=name)
 
-
+@deprecated(None, 'Please switch to tf.metrics.mean')
 def streaming_mean(values,
                    weights=None,
                    metrics_collections=None,
@@ -286,7 +290,7 @@ def streaming_mean(values,
       updates_collections=updates_collections,
       name=name)
 
-
+@deprecated(None, 'Please switch to tf.metrics.mean_tensor')
 def streaming_mean_tensor(values,
                           weights=None,
                           metrics_collections=None,
@@ -340,9 +344,8 @@ def streaming_mean_tensor(values,
       name=name)
 
 
-@deprecated(None,
-            'Please switch to tf.metrics.accuracy. Note that the order of the '
-            'labels and predictions arguments has been switched.')
+@deprecated(None, 'Please switch to tf.metrics.accuracy. Note that the order '
+                  'of the labels and predictions arguments has been switched.')
 def streaming_accuracy(predictions,
                        labels,
                        weights=None,
@@ -399,7 +402,8 @@ def streaming_accuracy(predictions,
       updates_collections=updates_collections,
       name=name)
 
-
+@deprecated(None, 'Please switch to tf.metrics.precision. Note that the order '
+                  'of the labels and predictions arguments has been switched.')
 def streaming_precision(predictions,
                         labels,
                         weights=None,
@@ -455,7 +459,8 @@ def streaming_precision(predictions,
       updates_collections=updates_collections,
       name=name)
 
-
+@deprecated(None, 'Please switch to tf.metrics.recall. Note that the order '
+                  'of the labels and predictions arguments has been switched.')
 def streaming_recall(predictions,
                      labels,
                      weights=None,
@@ -975,8 +980,8 @@ def streaming_curve_points(labels=None,
     return points, update_op
 
 
-@deprecated(None, 'Please switch to tf.metrics.auc. Note that the order of the '
-            'labels and predictions arguments has been switched.')
+@deprecated(None, 'Please switch to tf.metrics.auc. Note that the order of '
+                  'the labels and predictions arguments has been switched.')
 def streaming_auc(predictions,
                   labels,
                   weights=None,
@@ -1797,9 +1802,9 @@ def streaming_sensitivity_at_specificity(predictions,
       name=name)
 
 
-@deprecated(
-    None, 'Please switch to tf.metrics.precision_at_thresholds. Note that the '
-    'order of the labels and predictions arguments has been switched.')
+@deprecated(None,
+            'Please switch to tf.metrics.precision_at_thresholds. Note that '
+            'the order of the labels and predictions arguments are switched.')
 def streaming_precision_at_thresholds(predictions,
                                       labels,
                                       thresholds,
@@ -2706,7 +2711,9 @@ def streaming_sparse_average_precision_at_top_k(top_k_predictions,
       name=name)
 
 
-@deprecated(None, 'Please switch to tf.metrics.mean.')
+@deprecated(None,
+            'Please switch to tf.metrics.mean_absolute_error. Note that the '
+            'order of the labels and predictions arguments has been switched.')
 def streaming_mean_absolute_error(predictions,
                                   labels,
                                   weights=None,
@@ -2825,7 +2832,9 @@ def streaming_mean_relative_error(predictions,
       updates_collections=updates_collections,
       name=name)
 
-
+@deprecated(None,
+            'Please switch to tf.metrics.mean_squared_error. Note that the '
+            'order of the labels and predictions arguments has been switched.')
 def streaming_mean_squared_error(predictions,
                                  labels,
                                  weights=None,
@@ -2883,7 +2892,10 @@ def streaming_mean_squared_error(predictions,
       updates_collections=updates_collections,
       name=name)
 
-
+@deprecated(
+    None,
+    'Please switch to tf.metrics.root_mean_squared_error. Note that the '
+    'order of the labels and predictions arguments has been switched.')
 def streaming_root_mean_squared_error(predictions,
                                       labels,
                                       weights=None,
diff --git a/tensorflow/contrib/model_pruning/BUILD b/tensorflow/contrib/model_pruning/BUILD
index ca3f13479ed32e9ab3d43dfe9a392ef8466ce5f2..f50575b2cf311e33f7b7c77488bc94b8d24c70ec 100644
--- a/tensorflow/contrib/model_pruning/BUILD
+++ b/tensorflow/contrib/model_pruning/BUILD
@@ -125,15 +125,3 @@ py_library(
         ":rnn_cells",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/model_pruning/examples/cifar10/BUILD b/tensorflow/contrib/model_pruning/examples/cifar10/BUILD
index e7848adcc5ac126a2b85ef6dcb0ffa355b8b0628..30ea9122229c72950bee280f7a6c5eda4ac2fdbf 100644
--- a/tensorflow/contrib/model_pruning/examples/cifar10/BUILD
+++ b/tensorflow/contrib/model_pruning/examples/cifar10/BUILD
@@ -68,15 +68,3 @@ py_binary(
         "//tensorflow/contrib/model_pruning:pruning",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/mpi_collectives/BUILD b/tensorflow/contrib/mpi_collectives/BUILD
index 9f9802b8fe12356c0da82ebb2b48b565cf3f7319..a7be92a35e0d62a61f7923ac61bb2c1267d039c6 100644
--- a/tensorflow/contrib/mpi_collectives/BUILD
+++ b/tensorflow/contrib/mpi_collectives/BUILD
@@ -126,15 +126,3 @@ tf_py_test(
     ],
     tags = ["manual"],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/nccl/BUILD b/tensorflow/contrib/nccl/BUILD
index 94d01efee1546feca89a7e88acedf915b1dfb3a4..334e70318dd88185cecd93ebeb2587861b7999b9 100644
--- a/tensorflow/contrib/nccl/BUILD
+++ b/tensorflow/contrib/nccl/BUILD
@@ -31,7 +31,7 @@ tf_custom_op_library(
         "kernels/nccl_ops.cc",
     ],
     deps = if_cuda([
-        "@nccl_archive//:nccl",
+        "@local_config_nccl//:nccl",
         "//tensorflow/core:gpu_headers_lib",
     ]),
 )
@@ -61,7 +61,7 @@ tf_cuda_cc_test(
             "//tensorflow/core:test",
             "//tensorflow/core:test_main",
             "//tensorflow/core:testlib",
-            "@nccl_archive//:nccl",
+            "@local_config_nccl//:nccl",
         ],
 )
 
@@ -80,7 +80,7 @@ tf_kernel_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:proto_text",
         "//tensorflow/core:stream_executor",
-        "@nccl_archive//:nccl",
+        "@local_config_nccl//:nccl",
     ],
     alwayslink = 1,
 )
@@ -141,15 +141,3 @@ cuda_py_test(
         "notap",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/nccl/kernels/nccl_manager.cc b/tensorflow/contrib/nccl/kernels/nccl_manager.cc
index 913935b38246f1c5c0f7da4c1ea1f986bc00891b..b9b482a6981e03144c6d00f2a38b71959b4b3621 100644
--- a/tensorflow/contrib/nccl/kernels/nccl_manager.cc
+++ b/tensorflow/contrib/nccl/kernels/nccl_manager.cc
@@ -76,6 +76,8 @@ struct NcclManager::Communicator {
 namespace {
 ncclDataType_t ToNcclType(DataType t) {
   switch (t) {
+    case DT_HALF:
+      return ncclHalf;
     case DT_FLOAT:
       return ncclFloat;
     case DT_DOUBLE:
diff --git a/tensorflow/contrib/nccl/kernels/nccl_manager.h b/tensorflow/contrib/nccl/kernels/nccl_manager.h
index bb219e0edc8a2c4ba0ce0583cbe4018a4fa3a1d1..6ff8cea84eb912d5e5c891c40efc617661725a63 100644
--- a/tensorflow/contrib/nccl/kernels/nccl_manager.h
+++ b/tensorflow/contrib/nccl/kernels/nccl_manager.h
@@ -20,7 +20,7 @@ limitations under the License.
 #include <unordered_map>
 #include <vector>
 
-#include "src/nccl.h"
+#include "third_party/nccl/nccl.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/platform/mutex.h"
diff --git a/tensorflow/contrib/nccl/kernels/nccl_manager_test.cc b/tensorflow/contrib/nccl/kernels/nccl_manager_test.cc
index 985b2bae2566c38dfb2c71a899e4b03bbb8fa55d..06ca65e33ad6f5fb6620144231dd368379dcc190 100644
--- a/tensorflow/contrib/nccl/kernels/nccl_manager_test.cc
+++ b/tensorflow/contrib/nccl/kernels/nccl_manager_test.cc
@@ -48,35 +48,9 @@ static std::vector<BaseGPUDevice*> GetGPUDevices() {
   return gpus;
 }
 
+template <typename Scalar>
 class NcclManagerTest : public ::testing::Test {
- protected:
-  static void SetUpTestCase() {
-    setenv("NCCL_DEBUG", "INFO", 1 /* replace */);
-    devices = new std::vector<BaseGPUDevice*>(GetGPUDevices());
-    CHECK(!devices->empty());
-    LOG(ERROR) << "Running test with " << devices->size() << " gpus";
-  }
-  static void TearDownTestCase() {
-    for (auto device : *devices) delete device;
-    delete devices;
-  }
-
-  static Allocator* gpu_allocator(BaseGPUDevice* device) {
-    return device->GetStepAllocator(AllocatorAttributes(),
-                                    nullptr /* step_resource_manager */);
-  }
-
-  static std::vector<BaseGPUDevice*>* devices;
-
-  template <typename Scalar>
-  perftools::gputools::DeviceMemory<Scalar> AsDeviceMemory(
-      const Scalar* cuda_memory) {
-    perftools::gputools::DeviceMemoryBase wrapped(
-        const_cast<Scalar*>(cuda_memory));
-    perftools::gputools::DeviceMemory<Scalar> typed(wrapped);
-    return typed;
-  }
-
+ public:
   // A single all-reduce to apply.
   struct TestCase {
     string key;
@@ -89,42 +63,52 @@ class NcclManagerTest : public ::testing::Test {
     int num_completed = 0;
   };
 
+  static void SetUpTestCase() {
+    setenv("NCCL_DEBUG", "INFO", 1 /* replace */);
+    devices_ = new std::vector<BaseGPUDevice*>(GetGPUDevices());
+    CHECK(!devices_->empty());
+    LOG(ERROR) << "Running test with " << devices_->size() << " gpus";
+  }
+
+  static void TearDownTestCase() {
+    for (auto device : *devices_) delete device;
+    delete devices_;
+  }
+
   TestCase* MakeTestCase(int num_ranks, ncclRedOp_t reduction_op,
                          TensorShape shape, float value_offset) {
     TestCase* test_case = new TestCase();
-    test_case->expected = Tensor(DT_FLOAT, shape);
+    test_case->expected = Tensor(data_type_, shape);
     if (reduction_op == ncclProd) {
-      test::FillFn<float>(&test_case->expected, [](int) { return 1; });
+      test::FillFn<Scalar>(&test_case->expected,
+                           [](int) { return static_cast<Scalar>(1); });
     } else if (reduction_op == ncclSum) {
-      test::FillFn<float>(&test_case->expected, [](int) { return 0; });
+      test::FillFn<Scalar>(&test_case->expected,
+                           [](int) { return static_cast<Scalar>(0); });
     } else if (reduction_op == ncclMax) {
-      test::FillFn<float>(&test_case->expected, [](int) {
-        return -1 * std::numeric_limits<float>::max();
-      });
+      test::FillFn<Scalar>(&test_case->expected, [](int) { return -max_; });
     } else if (reduction_op == ncclMin) {
-      test::FillFn<float>(&test_case->expected, [](int) {
-        return std::numeric_limits<float>::max();
-      });
+      test::FillFn<Scalar>(&test_case->expected, [](int) { return max_; });
     } else {
       LOG(FATAL) << "Invalid reduction_op " << reduction_op;
     }
 
-    int mult = 1;
-    for (int i = 0; i < num_ranks; ++i) {
-      auto* device = devices->at(i % devices->size());
+    float value_scale = 0.01;  // Small scale to avoid fp16 overflow.
+    for (int rank = 0; rank < num_ranks; ++rank) {
+      auto* device = GetDevice(rank);
       auto* stream = device->tensorflow_gpu_device_info()->stream;
 
-      Tensor in_cpu(DT_FLOAT, shape);
-      test::FillFn<float>(&in_cpu, [mult, value_offset](int index) {
-        return value_offset + (index + 1) * mult;
+      Tensor in_cpu(data_type_, shape);
+      test::FillFn<Scalar>(&in_cpu, [&](int index) {
+        return static_cast<Scalar>((index + 1) * value_scale + value_offset);
       });
       for (int j = 0; j < shape.num_elements(); ++j) {
-        auto in_val = in_cpu.flat<float>()(j);
-        auto out_expr = test_case->expected.flat<float>();
+        auto in_val = in_cpu.flat<Scalar>()(j);
+        auto out_expr = test_case->expected.template flat<Scalar>();
         if (reduction_op == ncclProd) {
-          out_expr(j) *= in_val;
+          out_expr(j) = out_expr(j) * in_val;
         } else if (reduction_op == ncclSum) {
-          out_expr(j) += in_val;
+          out_expr(j) = out_expr(j) + in_val;
         } else if (reduction_op == ncclMax) {
           if (in_val > out_expr(j)) {
             out_expr(j) = in_val;
@@ -136,26 +120,18 @@ class NcclManagerTest : public ::testing::Test {
         }
       }
 
-      mult *= 10;
-      test_case->ins.emplace_back(gpu_allocator(device), DT_FLOAT, shape);
-      test_case->outs.emplace_back(gpu_allocator(device), DT_FLOAT, shape);
+      value_scale *= 10;
+      test_case->ins.emplace_back(GpuAllocator(device), data_type_, shape);
+      test_case->outs.emplace_back(GpuAllocator(device), data_type_, shape);
 
       const Tensor& in_gpu = test_case->ins.back();
-      auto in_gpu_mem = AsDeviceMemory(in_gpu.flat<float>().data());
-      stream->ThenMemcpy(&in_gpu_mem, in_cpu.flat<float>().data(),
+      auto in_gpu_mem = AsDeviceMemory(in_gpu.flat<Scalar>().data());
+      stream->ThenMemcpy(&in_gpu_mem, in_cpu.flat<Scalar>().data(),
                          in_cpu.TotalBytes());
     }
     return test_case;
   }
 
-  NcclManager::DoneCallback CreateDoneCallback(TestCase* test_case) {
-    return [this, test_case](Status s) {
-      mutex_lock l(test_case->mu);
-      ++test_case->num_completed;
-      test_case->final_status.Update(s);
-    };
-  }
-
   void VerifyResults(const string& case_label, TestCase* test_case) {
     // Wait for the done callback to be called.
     {
@@ -168,41 +144,84 @@ class NcclManagerTest : public ::testing::Test {
       test_case->mu.unlock();
     }
     // Copy memory to host and verify.
-    for (int i = 0; i < test_case->outs.size(); ++i) {
-      auto* device = devices->at(i % devices->size());
+    for (int rank = 0; rank < test_case->outs.size(); ++rank) {
+      auto* device = GetDevice(rank);
       auto* stream = device->tensorflow_gpu_device_info()->stream;
-      const Tensor& out_gpu = test_case->outs[i];
-      Tensor out_cpu(DT_FLOAT, out_gpu.shape());
-      auto out_gpu_mem = AsDeviceMemory(out_gpu.flat<float>().data());
-      stream->ThenMemcpy(out_cpu.flat<float>().data(), out_gpu_mem,
+      const Tensor& out_gpu = test_case->outs[rank];
+      Tensor out_cpu(data_type_, out_gpu.shape());
+      auto out_gpu_mem = AsDeviceMemory(out_gpu.flat<Scalar>().data());
+      stream->ThenMemcpy(out_cpu.flat<Scalar>().data(), out_gpu_mem,
                          out_cpu.TotalBytes());
       SE_ASSERT_OK(stream->BlockHostUntilDone());
-      test::ExpectTensorEqual<float>(test_case->expected, out_cpu);
+      test::ExpectTensorNear<Scalar>(test_case->expected, out_cpu, 0.01);
     }
   }
+
+  NcclManager::DoneCallback CreateDoneCallback(TestCase* test_case) {
+    return [this, test_case](Status s) {
+      mutex_lock l(test_case->mu);
+      ++test_case->num_completed;
+      test_case->final_status.Update(s);
+    };
+  }
+
+  static BaseGPUDevice* GetDevice(size_t rank) {
+    return devices_->at(rank % devices_->size());
+  }
+
+ private:
+  static Allocator* GpuAllocator(BaseGPUDevice* device) {
+    return device->GetStepAllocator(AllocatorAttributes(),
+                                    nullptr /* step_resource_manager */);
+  }
+
+  static perftools::gputools::DeviceMemory<Scalar> AsDeviceMemory(
+      const Scalar* cuda_memory) {
+    perftools::gputools::DeviceMemoryBase wrapped(
+        const_cast<Scalar*>(cuda_memory));
+    perftools::gputools::DeviceMemory<Scalar> typed(wrapped);
+    return typed;
+  }
+
+ private:
+  static std::vector<BaseGPUDevice*>* devices_;
+  static const DataType data_type_;
+  static const Scalar max_;
 };
-std::vector<BaseGPUDevice*>* NcclManagerTest::devices = nullptr;
+
+template <typename Scalar>
+std::vector<BaseGPUDevice*>* NcclManagerTest<Scalar>::devices_ = nullptr;
+template <typename Scalar>
+const DataType NcclManagerTest<Scalar>::data_type_ =
+    DataTypeToEnum<Scalar>::value;
+template <typename Scalar>
+const Scalar NcclManagerTest<Scalar>::max_ =
+    Eigen::NumTraits<Scalar>::highest();
+
+// Instantiate tests for float and half.
+using TypeList = ::testing::Types<float, Eigen::half>;
+TYPED_TEST_CASE(NcclManagerTest, TypeList);
 
 // Test basic sum reduction.
-TEST_F(NcclManagerTest, BasicSumReduction) {
+TYPED_TEST(NcclManagerTest, BasicSumReduction) {
   const int num_ranks = 3;
 
   for (int op = 0; op < 4; ++op) {
     ncclRedOp_t reduction_op = static_cast<ncclRedOp_t>(op);
-    std::unique_ptr<TestCase> test_case(
-        MakeTestCase(num_ranks, reduction_op, TensorShape({2, 3}), 0));
-    for (int device_num = 0; device_num < num_ranks; ++device_num) {
-      auto* device = devices->at(device_num % devices->size());
+    std::unique_ptr<typename TestFixture::TestCase> test_case(
+        this->MakeTestCase(num_ranks, reduction_op, TensorShape({2, 3}), 0.0f));
+    for (int rank = 0; rank < num_ranks; ++rank) {
+      auto* device = this->GetDevice(rank);
       auto* event_mgr = device->tensorflow_gpu_device_info()->event_mgr;
       auto* stream = device->tensorflow_gpu_device_info()->stream;
       NcclManager::instance()->AddToAllReduce(
           num_ranks, "allreduce", reduction_op, device->executor(),
-          device->gpu_id(), event_mgr, stream, &test_case->ins[device_num],
-          &test_case->outs[device_num], CreateDoneCallback(test_case.get()));
+          device->gpu_id(), event_mgr, stream, &test_case->ins[rank],
+          &test_case->outs[rank], this->CreateDoneCallback(test_case.get()));
     }
 
     LOG(ERROR) << "Verifying results";
-    VerifyResults("test_case", test_case.get());
+    this->VerifyResults("test_case", test_case.get());
   }
 }
 
@@ -213,7 +232,7 @@ TEST_F(NcclManagerTest, BasicSumReduction) {
 // with num_ranks > devices->size(), for some GPUs (e.g. K20m).
 // To test the higher settings, increase num_ranks,
 // num_collectives_per_iteration and time_limit_micros.
-TEST_F(NcclManagerTest, MultipleCallers) {
+TYPED_TEST(NcclManagerTest, MultipleCallers) {
   const int num_ranks = 1;                      // 2;
   const int num_collectives_per_iteration = 1;  // 1000;
   const int num_threads = 3;
@@ -223,49 +242,49 @@ TEST_F(NcclManagerTest, MultipleCallers) {
   srand(Env::Default()->NowMicros());
 
   for (;;) {
-    std::vector<std::pair<int, int>> case_and_device_num;
-    std::vector<std::unique_ptr<TestCase>> test_cases;
+    std::vector<std::pair<int, int>> case_and_rank;
+    std::vector<std::unique_ptr<typename TestFixture::TestCase>> test_cases;
     for (int i = 0; i < num_collectives_per_iteration; ++i) {
-      test_cases.emplace_back(
-          MakeTestCase(num_ranks, ncclSum,
-                       TensorShape({100, i % 5 + 1, i % 3 + 1}), i + 0.1 * i));
+      test_cases.emplace_back(this->MakeTestCase(
+          num_ranks, ncclSum, TensorShape({100, i % 5 + 1, i % 3 + 1}),
+          1.1f * i));
       for (int j = 0; j < num_ranks; ++j) {
-        case_and_device_num.emplace_back(i, j);
+        case_and_rank.emplace_back(i, j);
       }
     }
 
-    for (int i = 0; i < num_ranks; ++i) {
-      auto* device = devices->at(i % devices->size());
+    for (int rank = 0; rank < num_ranks; ++rank) {
+      auto* device = this->GetDevice(rank);
       auto* stream = device->tensorflow_gpu_device_info()->stream;
       SE_ASSERT_OK(stream->BlockHostUntilDone());
     }
 
-    std::shuffle(case_and_device_num.begin(), case_and_device_num.end(),
+    std::shuffle(case_and_rank.begin(), case_and_rank.end(),
                  std::mt19937(std::random_device()()));
 
-    mutex mu;  // guards case_and_device_num.
+    mutex mu;  // guards case_and_rank.
     std::unique_ptr<thread::ThreadPool> pool(
         new thread::ThreadPool(Env::Default(), "test", num_threads));
-    const int to_schedule = case_and_device_num.size();
+    const int to_schedule = case_and_rank.size();
     for (int i = 0; i < to_schedule; ++i) {
       auto fn = [&]() {
-        int device_num;
+        int rank;
         int test_num;
         {
           mutex_lock l(mu);
-          test_num = case_and_device_num.back().first;
-          device_num = case_and_device_num.back().second;
-          case_and_device_num.pop_back();
+          test_num = case_and_rank.back().first;
+          rank = case_and_rank.back().second;
+          case_and_rank.pop_back();
         }
-        auto* device = devices->at(device_num % devices->size());
+        auto* device = this->GetDevice(rank);
         auto* event_mgr = device->tensorflow_gpu_device_info()->event_mgr;
         auto* stream = device->tensorflow_gpu_device_info()->stream;
-        TestCase* test_case = test_cases[test_num].get();
+        typename TestFixture::TestCase* test_case = test_cases[test_num].get();
         NcclManager::instance()->AddToAllReduce(
             num_ranks, strings::StrCat("allreduce", test_num), ncclSum,
             device->executor(), device->gpu_id(), event_mgr, stream,
-            &test_case->ins[device_num], &test_case->outs[device_num],
-            CreateDoneCallback(test_case));
+            &test_case->ins[rank], &test_case->outs[rank],
+            this->CreateDoneCallback(test_case));
       };
       pool->Schedule(fn);
     }
@@ -274,7 +293,8 @@ TEST_F(NcclManagerTest, MultipleCallers) {
     LOG(ERROR) << "Verifying results for " << num_collectives_per_iteration
                << " collectives";
     for (int i = 0; i < test_cases.size(); ++i) {
-      VerifyResults(strings::StrCat("collective", i), test_cases[i].get());
+      this->VerifyResults(strings::StrCat("collective", i),
+                          test_cases[i].get());
     }
 
     int64 delta = Env::Default()->NowMicros() - start;
diff --git a/tensorflow/contrib/nccl/kernels/nccl_ops.cc b/tensorflow/contrib/nccl/kernels/nccl_ops.cc
index 266d4f6f0de0274dca2bfc9022503f09b0ca7d42..c2b76caef38a4af248387b65701b8f8936e8431f 100644
--- a/tensorflow/contrib/nccl/kernels/nccl_ops.cc
+++ b/tensorflow/contrib/nccl/kernels/nccl_ops.cc
@@ -17,7 +17,7 @@ limitations under the License.
 
 #include <vector>
 
-#include "src/nccl.h"
+#include "third_party/nccl/nccl.h"
 #include "tensorflow/contrib/nccl/kernels/nccl_manager.h"
 #include "tensorflow/core/framework/op_kernel.h"
 
diff --git a/tensorflow/contrib/nccl/kernels/nccl_rewrite.cc b/tensorflow/contrib/nccl/kernels/nccl_rewrite.cc
index a4de46a93fab1dfe93b47f2789cc533bc447e43a..4676e937e56e35cdec5d2ac57fa07b7bda5fe291 100644
--- a/tensorflow/contrib/nccl/kernels/nccl_rewrite.cc
+++ b/tensorflow/contrib/nccl/kernels/nccl_rewrite.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include "tensorflow/core/lib/strings/str_util.h"
 #if GOOGLE_CUDA
 
 #include <forward_list>
@@ -254,7 +255,7 @@ class NcclReplacePass : public GraphOptimizationPass {
     // Find reduction and broadcast ops and replace them with Send/Recv ops.
     for (Node* node : graph->op_nodes()) {
       StringPiece type = node->type_string();
-      if (!type.starts_with("Nccl")) {
+      if (!str_util::StartsWith(type, "Nccl")) {
         continue;
       }
       if (type == "NcclReduce") {
diff --git a/tensorflow/contrib/nccl/ops/nccl_ops.cc b/tensorflow/contrib/nccl/ops/nccl_ops.cc
index 8eb804c2e988f313ba1b340217cae20f1f5502c7..a353a34b80add119fcdc8bc4230eddf5a77b30e8 100644
--- a/tensorflow/contrib/nccl/ops/nccl_ops.cc
+++ b/tensorflow/contrib/nccl/ops/nccl_ops.cc
@@ -25,7 +25,7 @@ REGISTER_OP("NcclAllReduce")
     .Input("input: T")
     .Output("data: T")
     .Attr("reduction: {'min', 'max', 'prod', 'sum'}")
-    .Attr("T: {float, float64, int32, int64}")
+    .Attr("T: {half, float, float64, int32, int64}")
     .Attr("num_devices: int")
     .Attr("shared_name: string")
     .SetIsStateful()
@@ -51,7 +51,7 @@ REGISTER_OP("NcclReduce")
     .Input("input: num_devices * T")
     .Output("data: T")
     .Attr("reduction: {'min', 'max', 'prod', 'sum'}")
-    .Attr("T: {float, float64, int32, int64}")
+    .Attr("T: {half, float, float64, int32, int64}")
     .Attr("num_devices: int")
     .SetIsStateful()
     .SetShapeFn(shape_inference::UnchangedShape)
@@ -69,7 +69,7 @@ reduction: the reduction operation to perform.
 REGISTER_OP("_NcclReduceSend")
     .Input("input: T")
     .Attr("reduction: {'min', 'max', 'prod', 'sum'}")
-    .Attr("T: {float, float64, int32, int64}")
+    .Attr("T: {half, float, float64, int32, int64}")
     .Attr("num_devices: int")
     .Attr("shared_name: string")
     .SetIsStateful()
@@ -92,7 +92,7 @@ REGISTER_OP("_NcclReduceRecv")
     .Input("input: T")
     .Output("data: T")
     .Attr("reduction: {'min', 'max', 'prod', 'sum'}")
-    .Attr("T: {float, float64, int32, int64}")
+    .Attr("T: {half, float, float64, int32, int64}")
     .Attr("num_devices: int")
     .Attr("shared_name: string")
     .SetIsStateful()
@@ -118,7 +118,7 @@ shared_name: Identifier that is shared between ops of the same reduce.
 REGISTER_OP("NcclBroadcast")
     .Input("input: T")
     .Output("output: T")
-    .Attr("T: {float, float64, int32, int64}")
+    .Attr("T: {half, float, float64, int32, int64}")
     .Attr("shape: shape")
     .SetIsStateful()
     .SetShapeFn(shape_inference::UnchangedShape)
@@ -135,7 +135,7 @@ shape: The shape of the input tensor.
 
 REGISTER_OP("_NcclBroadcastSend")
     .Input("input: T")
-    .Attr("T: {float, float64, int32, int64}")
+    .Attr("T: {half, float, float64, int32, int64}")
     .Attr("num_devices: int")
     .Attr("shared_name: string")
     .SetIsStateful()
@@ -157,7 +157,7 @@ shared_name: Identifier that is shared between ops of the same broadcast.
 REGISTER_OP("_NcclBroadcastRecv")
     .Input("shape: int32")
     .Output("output: T")
-    .Attr("T: {float, float64, int32, int64}")
+    .Attr("T: {half, float, float64, int32, int64}")
     .Attr("num_devices: int")
     .Attr("shared_name: string")
     .SetIsStateful()
diff --git a/tensorflow/contrib/nccl/python/ops/nccl_ops_test.py b/tensorflow/contrib/nccl/python/ops/nccl_ops_test.py
index 98fe394c5b38294700617591992d3207b0a4706b..423a8689aeee062fb58eaf9d6d9b980b0998754e 100644
--- a/tensorflow/contrib/nccl/python/ops/nccl_ops_test.py
+++ b/tensorflow/contrib/nccl/python/ops/nccl_ops_test.py
@@ -72,7 +72,7 @@ class NcclTestCase(test.TestCase):
           two.
       device_sets: Tuple of virtual devices to run test on.
     """
-    for dtype in [np.float32, np.int32, np.int64, np.float64]:
+    for dtype in [np.float16, np.float32, np.int32, np.int64, np.float64]:
       # Create session inside outer loop to test use of
       # same communicator across multiple sessions.
       with self.test_session(use_gpu=True) as sess:
diff --git a/tensorflow/contrib/nearest_neighbor/BUILD b/tensorflow/contrib/nearest_neighbor/BUILD
index 9500c18b1df9d772dfb827bc2b3d33e0a65974f6..6fa762446705310a60cbdd9302c1a5083b69f065 100644
--- a/tensorflow/contrib/nearest_neighbor/BUILD
+++ b/tensorflow/contrib/nearest_neighbor/BUILD
@@ -111,15 +111,3 @@ tf_py_test(
         "//tensorflow/python:client_testlib",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/nn/BUILD b/tensorflow/contrib/nn/BUILD
index 5543eb6c6e3785978e9c878f309b9bd0863b0b0a..ef7ab2264655ca0148a9c045bba04018d9599dfc 100644
--- a/tensorflow/contrib/nn/BUILD
+++ b/tensorflow/contrib/nn/BUILD
@@ -98,14 +98,3 @@ py_test(
         "//third_party/py/numpy",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-)
diff --git a/tensorflow/contrib/opt/BUILD b/tensorflow/contrib/opt/BUILD
index bacf15bbd6140caf647552f0dca02209634ae56b..c57c5e3f29f1e36fa2f36f8113cb208be6c6be3e 100644
--- a/tensorflow/contrib/opt/BUILD
+++ b/tensorflow/contrib/opt/BUILD
@@ -265,14 +265,3 @@ py_test(
         "//third_party/py/numpy",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-)
diff --git a/tensorflow/contrib/optimizer_v2/BUILD b/tensorflow/contrib/optimizer_v2/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..26ea9135f57fb9fe95e61023bccb97d1d4f5ea1c
--- /dev/null
+++ b/tensorflow/contrib/optimizer_v2/BUILD
@@ -0,0 +1,205 @@
+# Prototype of OptimizerV2.
+
+package(
+    default_visibility = ["//tensorflow:internal"],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+load("//tensorflow:tensorflow.bzl", "py_test")
+load("//tensorflow:tensorflow.bzl", "cuda_py_test")
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
+
+py_library(
+    name = "optimizer_v2_py",
+    srcs = ["optimizer_v2_symbols.py"],
+    srcs_version = "PY2AND3",
+    visibility = ["//visibility:public"],
+    deps = [
+        ":training",
+        "//tensorflow/python:util",
+    ],
+)
+
+py_library(
+    name = "training",
+    srcs = [
+        "adadelta.py",
+        "adagrad.py",
+        "adam.py",
+        "gradient_descent.py",
+        "momentum.py",
+        "optimizer_v2.py",
+        "rmsprop.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python:state_ops",
+        "//tensorflow/python:training",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python:variables",
+    ],
+)
+
+cuda_py_test(
+    name = "adadelta_test",
+    size = "medium",
+    srcs = ["adadelta_test.py"],
+    additional_deps = [
+        ":training",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:embedding_ops",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python:variables",
+        "//third_party/py/numpy",
+    ],
+)
+
+cuda_py_test(
+    name = "adagrad_test",
+    size = "small",
+    srcs = ["adagrad_test.py"],
+    additional_deps = [
+        ":training",
+        "//tensorflow/python:embedding_ops",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:client_testlib",
+        "//third_party/py/numpy",
+    ],
+)
+
+cuda_py_test(
+    name = "adam_test",
+    size = "small",
+    srcs = ["adam_test.py"],
+    additional_deps = [
+        ":training",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:client_testlib",
+        "//third_party/py/numpy",
+    ],
+)
+
+cuda_py_test(
+    name = "checkpointable_utils_test",
+    srcs = ["checkpointable_utils_test.py"],
+    additional_deps = [
+        ":training",
+        "@six_archive//:six",
+        "//tensorflow/contrib/eager/python:checkpointable_utils",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:init_ops",
+        "//tensorflow/python:layers",
+        "//tensorflow/python:layers_base",
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python:state_ops",
+        "//tensorflow/python:training",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python:variables",
+        "//tensorflow/python/eager:context",
+        "//tensorflow/python/eager:test",
+        "//tensorflow/python/keras",
+    ],
+    tags = ["notsan"],
+)
+
+cuda_py_test(
+    name = "gradient_descent_test",
+    size = "medium",
+    srcs = ["gradient_descent_test.py"],
+    additional_deps = [
+        ":training",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:embedding_ops",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python:resources",
+        "//tensorflow/python:variables",
+    ],
+)
+
+cuda_py_test(
+    name = "momentum_test",
+    size = "medium",
+    srcs = ["momentum_test.py"],
+    additional_deps = [
+        ":training",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:embedding_ops",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python:resources",
+        "//tensorflow/python:variables",
+        "//tensorflow/python/eager:context",
+    ],
+)
+
+cuda_py_test(
+    name = "optimizer_v2_test",
+    size = "medium",
+    srcs = ["optimizer_v2_test.py"],
+    additional_deps = [
+        ":training",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:clip_ops",
+        "//tensorflow/python:gradients",
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python:state_ops",
+        "//tensorflow/python:variables",
+    ],
+)
+
+cuda_py_test(
+    name = "rmsprop_test",
+    size = "small",
+    srcs = ["rmsprop_test.py"],
+    additional_deps = [
+        ":training",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:embedding_ops",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:client_testlib",
+        "//third_party/py/numpy",
+    ],
+)
diff --git a/tensorflow/contrib/optimizer_v2/adadelta.py b/tensorflow/contrib/optimizer_v2/adadelta.py
new file mode 100644
index 0000000000000000000000000000000000000000..b206f9f61bd56581e5105b2bc635c69abbc9af4c
--- /dev/null
+++ b/tensorflow/contrib/optimizer_v2/adadelta.py
@@ -0,0 +1,113 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Adadelta for TensorFlow."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.optimizer_v2 import optimizer_v2
+from tensorflow.python.training import training_ops
+
+
+class AdadeltaOptimizer(optimizer_v2.OptimizerV2):
+  """Optimizer that implements the Adadelta algorithm.
+
+  See [M. D. Zeiler](http://arxiv.org/abs/1212.5701)
+  ([pdf](http://arxiv.org/pdf/1212.5701v1.pdf))
+  """
+
+  def __init__(self, learning_rate=0.001, rho=0.95, epsilon=1e-8,
+               use_locking=False, name="Adadelta"):
+    """Construct a new Adadelta optimizer.
+
+    Some of the args below are hyperparameters, where a hyperparameter is
+    defined as a scalar Tensor, a regular Python value or a callable (which
+    will be evaluated when `apply_gradients` is called) returning a scalar
+    Tensor or a Python value.
+
+    Args:
+      learning_rate: A float hyperparameter. The learning rate.
+        To match the exact form in the original paper use 1.0.
+      rho: A float hyperparameter. The decay rate.
+      epsilon: A float hyperparameter. A constant epsilon used to better
+        condition the grad update.
+      use_locking: If `True` use locks for update operations.
+      name: Optional name prefix for the operations created when applying
+        gradients.  Defaults to "Adadelta".
+    """
+    super(AdadeltaOptimizer, self).__init__(use_locking, name)
+    self._set_hyper("learning_rate", learning_rate)
+    self._set_hyper("rho", rho)
+    self._set_hyper("epsilon", epsilon)
+
+  def _create_vars(self, var_list, state):
+    for v in var_list:
+      state.zeros_slot(v, "accum")
+      state.zeros_slot(v, "accum_update")
+
+  def _apply_dense(self, grad, var, state):
+    accum = state.get_slot(var, "accum")
+    accum_update = state.get_slot(var, "accum_update")
+    return training_ops.apply_adadelta(
+        var,
+        accum,
+        accum_update,
+        state.get_hyper("learning_rate", var.dtype.base_dtype),
+        state.get_hyper("rho", var.dtype.base_dtype),
+        state.get_hyper("epsilon", var.dtype.base_dtype),
+        grad,
+        use_locking=self._use_locking)
+
+  def _resource_apply_dense(self, grad, var, state):
+    accum = state.get_slot(var, "accum")
+    accum_update = state.get_slot(var, "accum_update")
+    return training_ops.resource_apply_adadelta(
+        var.handle,
+        accum.handle,
+        accum_update.handle,
+        state.get_hyper("learning_rate", var.dtype.base_dtype),
+        state.get_hyper("rho", var.dtype.base_dtype),
+        state.get_hyper("epsilon", var.dtype.base_dtype),
+        grad,
+        use_locking=self._use_locking)
+
+  def _apply_sparse(self, grad, var, state):
+    accum = state.get_slot(var, "accum")
+    accum_update = state.get_slot(var, "accum_update")
+    return training_ops.sparse_apply_adadelta(
+        var,
+        accum,
+        accum_update,
+        state.get_hyper("learning_rate", var.dtype.base_dtype),
+        state.get_hyper("rho", var.dtype.base_dtype),
+        state.get_hyper("epsilon", var.dtype.base_dtype),
+        grad.values,
+        grad.indices,
+        use_locking=self._use_locking)
+
+  def _resource_apply_sparse(self, grad, var, indices, state):
+    accum = state.get_slot(var, "accum")
+    accum_update = state.get_slot(var, "accum_update")
+    return training_ops.resource_sparse_apply_adadelta(
+        var.handle,
+        accum.handle,
+        accum_update.handle,
+        state.get_hyper("learning_rate", var.dtype.base_dtype),
+        state.get_hyper("rho", var.dtype.base_dtype),
+        state.get_hyper("epsilon", var.dtype.base_dtype),
+        grad,
+        indices,
+        use_locking=self._use_locking)
diff --git a/tensorflow/contrib/optimizer_v2/adadelta_test.py b/tensorflow/contrib/optimizer_v2/adadelta_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..31cfec0d50d691cb9e618400fa4b37708a8a3ba2
--- /dev/null
+++ b/tensorflow/contrib/optimizer_v2/adadelta_test.py
@@ -0,0 +1,167 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for Adadelta Optimizer."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.optimizer_v2 import adadelta
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import embedding_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+
+class AdadeltaOptimizerTest(test.TestCase):
+
+  def doTestBasic(self, use_resource=False):
+    num_updates = 4  # number of ADADELTA steps to perform
+    for dtype in [dtypes.half, dtypes.float32]:
+      for grad in [0.2, 0.1, 0.01]:
+        for lr in [1.0, 0.5, 0.1]:
+          with self.test_session():
+            var0_init = [1.0, 2.0]
+            var1_init = [3.0, 4.0]
+            if use_resource:
+              var0 = resource_variable_ops.ResourceVariable(
+                  var0_init, dtype=dtype)
+              var1 = resource_variable_ops.ResourceVariable(
+                  var1_init, dtype=dtype)
+            else:
+              var0 = variables.Variable(var0_init, dtype=dtype)
+              var1 = variables.Variable(var1_init, dtype=dtype)
+
+            grads = constant_op.constant([grad, grad], dtype=dtype)
+
+            accum = 0.0
+            accum_update = 0.0
+
+            # ADADELTA gradient optimizer
+            rho = 0.95
+            epsilon = 1e-8
+            adadelta_opt = adadelta.AdadeltaOptimizer(lr, rho, epsilon)
+            adadelta_update = adadelta_opt.apply_gradients(
+                zip([grads, grads], [var0, var1]))
+
+            opt_vars = adadelta_opt.variables()
+            self.assertStartsWith(opt_vars[0].name, var0._shared_name)
+            self.assertStartsWith(opt_vars[1].name, var0._shared_name)
+            self.assertStartsWith(opt_vars[2].name, var1._shared_name)
+            self.assertStartsWith(opt_vars[3].name, var1._shared_name)
+            self.assertEqual(4, len(opt_vars))
+
+            variables.global_variables_initializer().run()
+
+            # Assign slots
+            slot = [None] * 2
+            slot_update = [None] * 2
+            self.assertEqual(["accum", "accum_update"],
+                             adadelta_opt.get_slot_names())
+            slot[0] = adadelta_opt.get_slot(var0, "accum")
+            self.assertEquals(slot[0].get_shape(), var0.get_shape())
+            self.assertFalse(slot[0] in variables.trainable_variables())
+
+            slot_update[0] = adadelta_opt.get_slot(var0, "accum_update")
+            self.assertEquals(slot_update[0].get_shape(), var0.get_shape())
+            self.assertFalse(slot_update[0] in variables.trainable_variables())
+
+            slot[1] = adadelta_opt.get_slot(var1, "accum")
+            self.assertEquals(slot[1].get_shape(), var1.get_shape())
+            self.assertFalse(slot[1] in variables.trainable_variables())
+
+            slot_update[1] = adadelta_opt.get_slot(var1, "accum_update")
+            self.assertEquals(slot_update[1].get_shape(), var1.get_shape())
+            self.assertFalse(slot_update[1] in variables.trainable_variables())
+
+            # Fetch params to validate initial values
+            self.assertAllClose(var0_init, var0.eval())
+            self.assertAllClose(var1_init, var1.eval())
+
+            update = [None] * num_updates
+            tot_update = 0
+            for step in range(num_updates):
+              # Run adadelta update for comparison
+              adadelta_update.run()
+
+              # Perform initial update without previous accum values
+              accum = accum * rho + (grad**2) * (1 - rho)
+              update[step] = (np.sqrt(accum_update + epsilon) *
+                              (1. / np.sqrt(accum + epsilon)) * grad)
+              accum_update = (accum_update * rho + (update[step]**2) *
+                              (1.0 - rho))
+              tot_update += update[step] * lr
+
+              # Check that the accumulators have been updated
+              for slot_idx in range(2):
+                self.assertAllCloseAccordingToType(
+                    np.array([accum, accum], dtype=dtype.as_numpy_dtype()),
+                    slot[slot_idx].eval(),
+                    rtol=1e-5)
+
+                self.assertAllCloseAccordingToType(
+                    np.array(
+                        [accum_update, accum_update],
+                        dtype=dtype.as_numpy_dtype()),
+                    slot_update[slot_idx].eval(),
+                    rtol=1e-5)
+
+              # Check that the parameters have been updated
+              self.assertAllCloseAccordingToType(
+                  np.array(
+                      [var0_init[0] - tot_update, var0_init[1] - tot_update],
+                      dtype=dtype.as_numpy_dtype()),
+                  var0.eval(),
+                  rtol=1e-5)
+
+              self.assertAllCloseAccordingToType(
+                  np.array(
+                      [var1_init[0] - tot_update, var1_init[1] - tot_update],
+                      dtype=dtype.as_numpy_dtype()),
+                  var1.eval(),
+                  rtol=1e-5)
+
+  def testBasic(self):
+    self.doTestBasic(use_resource=False)
+
+  def testResourceBasic(self):
+    self.doTestBasic(use_resource=True)
+
+  def testMinimizeSparseResourceVariable(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        var0 = resource_variable_ops.ResourceVariable([[1.0, 2.0]], dtype=dtype)
+        x = constant_op.constant([[4.0], [5.0]], dtype=dtype)
+        pred = math_ops.matmul(embedding_ops.embedding_lookup([var0], [0]), x)
+        loss = pred * pred
+        sgd_op = adadelta.AdadeltaOptimizer(
+            1.0, 1.0, 1.0).minimize(loss)
+        variables.global_variables_initializer().run()
+        # Fetch params to validate initial values
+        self.assertAllCloseAccordingToType([[1.0, 2.0]], var0.eval())
+        # Run 1 step of sgd
+        sgd_op.run()
+        # Validate updated params
+        self.assertAllCloseAccordingToType(
+            [[-111, -138]], var0.eval())
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/optimizer_v2/adagrad.py b/tensorflow/contrib/optimizer_v2/adagrad.py
new file mode 100644
index 0000000000000000000000000000000000000000..c333d1e089047e707c3f13acddae268d935b2b3e
--- /dev/null
+++ b/tensorflow/contrib/optimizer_v2/adagrad.py
@@ -0,0 +1,118 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Adagrad optimizer for TensorFlow."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.optimizer_v2 import optimizer_v2
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gen_array_ops
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.training import training_ops
+
+
+class AdagradOptimizer(optimizer_v2.OptimizerV2):
+  """Optimizer that implements the Adagrad algorithm.
+
+  See this [paper](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf)
+  or this
+  [intro](http://cs.stanford.edu/~ppasupat/a9online/uploads/proximal_notes.pdf).
+  """
+
+  def __init__(self, learning_rate, initial_accumulator_value=0.1,
+               use_locking=False, name="Adagrad"):
+    """Construct a new Adagrad optimizer.
+
+    The learning_rate arg below is a hyperparameter, where a hyperparameter is
+    defined as a scalar Tensor, a regular Python value or a callable (which
+    will be evaluated when `apply_gradients` is called) returning a scalar
+    Tensor or a Python value.
+
+    Args:
+      learning_rate: A float hyperparameter. The learning rate.
+      initial_accumulator_value: A floating point value.
+        Starting value for the accumulators, must be positive.
+      use_locking: If `True` use locks for update operations.
+      name: Optional name prefix for the operations created when applying
+        gradients.  Defaults to "Adagrad".
+
+    Raises:
+      ValueError: If the `initial_accumulator_value` is invalid.
+    """
+    if initial_accumulator_value <= 0.0:
+      raise ValueError("initial_accumulator_value must be positive: %s" %
+                       initial_accumulator_value)
+    super(AdagradOptimizer, self).__init__(use_locking, name)
+    self._set_hyper("learning_rate", learning_rate)
+
+    self._initial_accumulator_value = initial_accumulator_value
+
+  def _create_vars(self, var_list, state):
+    for v in var_list:
+      # TODO(isaprykin): Delete colocate_with(v) from other optimizers and
+      # confirm that colocation will happen anyway.
+      dtype = v.dtype.base_dtype
+      if v.get_shape().is_fully_defined():
+        init = init_ops.constant_initializer(self._initial_accumulator_value,
+                                             dtype=dtype)
+      else:
+        # Use a Tensor instead of initializer if variable does not have static
+        # shape.
+        init_constant = gen_array_ops.fill(
+            array_ops.shape(v), self._initial_accumulator_value)
+        init = math_ops.cast(init_constant, dtype)
+      state.create_slot_with_initializer(v, init, v.get_shape(), dtype,
+                                         "accumulator")
+
+  def _apply_dense(self, grad, var, state):
+    acc = state.get_slot(var, "accumulator")
+    return training_ops.apply_adagrad(
+        var,
+        acc,
+        state.get_hyper("learning_rate", var.dtype.base_dtype),
+        grad,
+        use_locking=self._use_locking)
+
+  def _resource_apply_dense(self, grad, var, state):
+    acc = state.get_slot(var, "accumulator")
+    return training_ops.resource_apply_adagrad(
+        var.handle,
+        acc.handle,
+        state.get_hyper("learning_rate", var.dtype.base_dtype),
+        grad,
+        use_locking=self._use_locking)
+
+  def _apply_sparse(self, grad, var, state):
+    acc = state.get_slot(var, "accumulator")
+    return training_ops.sparse_apply_adagrad(
+        var,
+        acc,
+        state.get_hyper("learning_rate", var.dtype.base_dtype),
+        grad.values,
+        grad.indices,
+        use_locking=self._use_locking)
+
+  def _resource_apply_sparse(self, grad, var, indices, state):
+    acc = state.get_slot(var, "accumulator")
+    return training_ops.resource_sparse_apply_adagrad(
+        var.handle,
+        acc.handle,
+        state.get_hyper("learning_rate", var.dtype.base_dtype),
+        grad,
+        indices,
+        use_locking=self._use_locking)
diff --git a/tensorflow/contrib/optimizer_v2/adagrad_test.py b/tensorflow/contrib/optimizer_v2/adagrad_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..18191c3ef2cb78f63b6558c289b36b6107b6c171
--- /dev/null
+++ b/tensorflow/contrib/optimizer_v2/adagrad_test.py
@@ -0,0 +1,282 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Functional tests for aggregate operations."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.optimizer_v2 import adagrad
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import embedding_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+
+class AdagradOptimizerTest(test.TestCase):
+
+  def doTestBasic(self, use_locking=False, use_resource=False):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        if use_resource:
+          var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype)
+          var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype)
+        else:
+          var0 = variables.Variable([1.0, 2.0], dtype=dtype)
+          var1 = variables.Variable([3.0, 4.0], dtype=dtype)
+        grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
+        grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
+        ada_opt = adagrad.AdagradOptimizer(
+            3.0, initial_accumulator_value=0.1, use_locking=use_locking)
+        ada_update = ada_opt.apply_gradients(
+            zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], var0.eval())
+        self.assertAllClose([3.0, 4.0], var1.eval())
+        # Run 3 steps of adagrad
+        for _ in range(3):
+          ada_update.run()
+        # Validate updated params
+        self.assertAllCloseAccordingToType(
+            np.array([-1.6026098728179932, -0.6026098728179932]), var0.eval())
+        self.assertAllCloseAccordingToType(
+            np.array([2.715679168701172, 3.715679168701172]), var1.eval())
+
+  def testBasic(self):
+    self.doTestBasic(use_locking=False)
+
+  def testBasicResource(self):
+    self.doTestBasic(use_locking=False, use_resource=True)
+
+  def testBasicLocked(self):
+    self.doTestBasic(use_locking=True)
+
+  def testMinimizeSparseResourceVariable(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        var0 = resource_variable_ops.ResourceVariable(
+            [[1.0, 2.0], [3.0, 4.0]], dtype=dtype)
+        x = constant_op.constant([[4.0], [5.0]], dtype=dtype)
+        pred = math_ops.matmul(embedding_ops.embedding_lookup([var0], [0]), x)
+        loss = pred * pred
+        sgd_op = adagrad.AdagradOptimizer(1.0).minimize(loss)
+        variables.global_variables_initializer().run()
+        # Fetch params to validate initial values
+        self.assertAllCloseAccordingToType(
+            [[1.0, 2.0], [3.0, 4.0]], var0.eval())
+        # Run 1 step of sgd
+        sgd_op.run()
+        # Validate updated params
+        self.assertAllCloseAccordingToType(
+            [[0, 1], [3, 4]], var0.eval(), atol=0.01)
+
+  def testTensorLearningRate(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        var0 = variables.Variable([1.0, 2.0], dtype=dtype)
+        var1 = variables.Variable([3.0, 4.0], dtype=dtype)
+        grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
+        grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
+        ada_opt = adagrad.AdagradOptimizer(
+            constant_op.constant(3.0), initial_accumulator_value=0.1)
+        ada_update = ada_opt.apply_gradients(
+            zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], var0.eval())
+        self.assertAllClose([3.0, 4.0], var1.eval())
+        # Run 3 steps of adagrad
+        for _ in range(3):
+          ada_update.run()
+        # Validate updated params
+        self.assertAllCloseAccordingToType(
+            np.array([-1.6026098728179932, -0.6026098728179932]), var0.eval())
+        self.assertAllCloseAccordingToType(
+            np.array([2.715679168701172, 3.715679168701172]), var1.eval())
+
+  def testSparseBasic(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        var0 = variables.Variable([[1.0], [2.0]], dtype=dtype)
+        var1 = variables.Variable([[3.0], [4.0]], dtype=dtype)
+        grads0 = ops.IndexedSlices(
+            constant_op.constant(
+                [0.1], shape=[1, 1], dtype=dtype),
+            constant_op.constant([0]),
+            constant_op.constant([2, 1]))
+        grads1 = ops.IndexedSlices(
+            constant_op.constant(
+                [0.01], shape=[1, 1], dtype=dtype),
+            constant_op.constant([1]),
+            constant_op.constant([2, 1]))
+        ada_opt = adagrad.AdagradOptimizer(3.0, initial_accumulator_value=0.1)
+        ada_update = ada_opt.apply_gradients(
+            zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+        # Fetch params to validate initial values
+        self.assertAllClose([[1.0], [2.0]], var0.eval())
+        self.assertAllClose([[3.0], [4.0]], var1.eval())
+        # Run 3 step of sgd
+        for _ in range(3):
+          ada_update.run()
+        # Validate updated params
+        self.assertAllCloseAccordingToType(
+            np.array([[-1.6026098728179932], [2.0]]), var0.eval())
+        self.assertAllCloseAccordingToType(
+            np.array([[3.0], [3.715679168701172]]), var1.eval())
+
+  def testSparseRepeatedIndices(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        repeated_index_update_var = variables.Variable(
+            [[1.0], [2.0]], dtype=dtype)
+        aggregated_update_var = variables.Variable(
+            [[1.0], [2.0]], dtype=dtype)
+        grad_repeated_index = ops.IndexedSlices(
+            constant_op.constant(
+                [0.1, 0.1], shape=[2, 1], dtype=dtype),
+            constant_op.constant([1, 1]),
+            constant_op.constant([2, 1]))
+        grad_aggregated = ops.IndexedSlices(
+            constant_op.constant(
+                [0.2], shape=[1, 1], dtype=dtype),
+            constant_op.constant([1]),
+            constant_op.constant([2, 1]))
+        repeated_update = adagrad.AdagradOptimizer(3.0).apply_gradients(
+            [(grad_repeated_index, repeated_index_update_var)])
+        aggregated_update = adagrad.AdagradOptimizer(3.0).apply_gradients(
+            [(grad_aggregated, aggregated_update_var)])
+        variables.global_variables_initializer().run()
+        self.assertAllClose(aggregated_update_var.eval(),
+                            repeated_index_update_var.eval())
+        for _ in range(3):
+          repeated_update.run()
+          aggregated_update.run()
+          self.assertAllClose(aggregated_update_var.eval(),
+                              repeated_index_update_var.eval())
+
+  def testSparseRepeatedIndicesResourceVariable(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        var_repeated = resource_variable_ops.ResourceVariable(
+            [1.0, 2.0], dtype=dtype)
+        loss_repeated = math_ops.reduce_sum(
+            embedding_ops.embedding_lookup(var_repeated, [0, 0]))
+        var_aggregated = resource_variable_ops.ResourceVariable(
+            [1.0, 2.0], dtype=dtype)
+        loss_aggregated = 2 * math_ops.reduce_sum(
+            embedding_ops.embedding_lookup(var_aggregated, [0]))
+        update_op_repeated = adagrad.AdagradOptimizer(
+            2.0).minimize(loss_repeated)
+        update_op_aggregated = adagrad.AdagradOptimizer(
+            2.0).minimize(loss_aggregated)
+        variables.global_variables_initializer().run()
+        self.assertAllCloseAccordingToType(
+            var_repeated.eval(), var_aggregated.eval())
+        for _ in range(3):
+          update_op_repeated.run()
+          update_op_aggregated.run()
+          self.assertAllCloseAccordingToType(
+              var_repeated.eval(), var_aggregated.eval())
+
+  def testSparseStability(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        shape = [1, 6]
+        var0 = variables.Variable(
+            [[
+                0.00872496, -0.106952, 0.110467, 0.226505, -0.0147257,
+                -0.0105945
+            ]],
+            dtype=dtype)
+        grads0 = ops.IndexedSlices(
+            constant_op.constant(
+                [[
+                    -5.91278e-05, 5.31673e-05, -2.5779e-06, 4.29153e-05,
+                    -8.4877e-05, -9.48906e-05
+                ]],
+                shape=shape,
+                dtype=dtype),
+            constant_op.constant([0]),
+            constant_op.constant(shape))
+        ada_opt = adagrad.AdagradOptimizer(1.0, initial_accumulator_value=0.1)
+        ada_update = ada_opt.apply_gradients(zip([grads0], [var0]))
+        self.assertEqual(["accumulator"], ada_opt.get_slot_names())
+        slot0 = ada_opt.get_slot(var0, "accumulator")
+        init = variables.global_variables_initializer()
+        for _ in range(100):
+          init.run()
+          ada_update.run()
+          self.assertAllCloseAccordingToType(
+              np.array([[0.1, 0.1, 0.1, 0.1, 0.1, 0.1]]), slot0.eval())
+          self.assertAllCloseAccordingToType(
+              np.array([[
+                  0.00891194, -0.10712013, 0.11047515, 0.22636929, -0.0144573,
+                  -0.01029443
+              ]]), var0.eval())
+
+  def testSharing(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        var0 = variables.Variable([1.0, 2.0], dtype=dtype)
+        var1 = variables.Variable([3.0, 4.0], dtype=dtype)
+        grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
+        grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
+        ada_opt = adagrad.AdagradOptimizer(3.0)
+        # Apply the optimizer twice.  Both applications will use
+        # the same accums.
+        ada_update1 = ada_opt.apply_gradients(
+            zip([grads0, grads1], [var0, var1]))
+        ada_update2 = ada_opt.apply_gradients(
+            zip([grads0, grads1], [var0, var1]))
+        self.assertEqual(["accumulator"], ada_opt.get_slot_names())
+        slot0 = ada_opt.get_slot(var0, "accumulator")
+        self.assertEquals(slot0.get_shape(), var0.get_shape())
+        slot1 = ada_opt.get_slot(var1, "accumulator")
+        self.assertEquals(slot1.get_shape(), var1.get_shape())
+        variables.global_variables_initializer().run()
+
+        # Fetch params to validate initial values.
+        self.assertAllClose([1.0, 2.0], var0.eval())
+        self.assertAllClose([3.0, 4.0], var1.eval())
+        # Mix the first and the second adagrad for 3 steps.
+        ada_update1.run()
+        ada_update2.run()
+        ada_update1.run()
+        # Validate updated params (the same as with only 1 Adagrad).
+        self.assertAllCloseAccordingToType(
+            np.array([-1.6026098728179932, -0.6026098728179932]), var0.eval())
+        self.assertAllCloseAccordingToType(
+            np.array([2.715679168701172, 3.715679168701172]), var1.eval())
+
+  def testDynamicShapeVariable_Ok(self):
+    with self.test_session():
+      v = variable_scope.get_variable("v", initializer=constant_op.constant(1.),
+                                      validate_shape=False)
+      self.assertFalse(v.shape.is_fully_defined())
+      # Creating optimizer should cause no exception.
+      adagrad.AdagradOptimizer(3.0, initial_accumulator_value=0.1)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/optimizer_v2/adam.py b/tensorflow/contrib/optimizer_v2/adam.py
new file mode 100644
index 0000000000000000000000000000000000000000..42b7f92a76c1971e2a63722d769ee006c3f3210b
--- /dev/null
+++ b/tensorflow/contrib/optimizer_v2/adam.py
@@ -0,0 +1,202 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Adam optimizer for TensorFlow."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.optimizer_v2 import optimizer_v2
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import state_ops
+from tensorflow.python.training import training_ops
+
+
+class AdamOptimizer(optimizer_v2.OptimizerV2):
+  """Optimizer that implements the Adam algorithm.
+
+  See [Kingma et al., 2014](http://arxiv.org/abs/1412.6980)
+  ([pdf](http://arxiv.org/pdf/1412.6980.pdf)).
+  """
+
+  def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8,
+               use_locking=False, name="Adam"):
+    """Construct a new Adam optimizer.
+
+    Initialization:
+
+    ```
+    m_0 <- 0 (Initialize initial 1st moment vector)
+    v_0 <- 0 (Initialize initial 2nd moment vector)
+    t <- 0 (Initialize timestep)
+    ```
+
+    The update rule for `variable` with gradient `g` uses an optimization
+    described at the end of section2 of the paper:
+
+    ```
+    t <- t + 1
+    lr_t <- learning_rate * sqrt(1 - beta2^t) / (1 - beta1^t)
+
+    m_t <- beta1 * m_{t-1} + (1 - beta1) * g
+    v_t <- beta2 * v_{t-1} + (1 - beta2) * g * g
+    variable <- variable - lr_t * m_t / (sqrt(v_t) + epsilon)
+    ```
+
+    The default value of 1e-8 for epsilon might not be a good default in
+    general. For example, when training an Inception network on ImageNet a
+    current good choice is 1.0 or 0.1. Note that since AdamOptimizer uses the
+    formulation just before Section 2.1 of the Kingma and Ba paper rather than
+    the formulation in Algorithm 1, the "epsilon" referred to here is "epsilon
+    hat" in the paper.
+
+    The sparse implementation of this algorithm (used when the gradient is an
+    IndexedSlices object, typically because of `tf.gather` or an embedding
+    lookup in the forward pass) does apply momentum to variable slices even if
+    they were not used in the forward pass (meaning they have a gradient equal
+    to zero). Momentum decay (beta1) is also applied to the entire momentum
+    accumulator. This means that the sparse behavior is equivalent to the dense
+    behavior (in contrast to some momentum implementations which ignore momentum
+    unless a variable slice was actually used).
+
+    Some of the args below are hyperparameters where a hyperparameter is
+    defined as a scalar Tensor, a regular Python value or a callable (which
+    will be evaluated when `apply_gradients` is called) returning a scalar
+    Tensor or a Python value.
+
+    Args:
+      learning_rate: A float hyperparameter. The learning rate.
+      beta1: A float hyperparameter. The exponential decay rate for the 1st
+        moment estimates.
+      beta2: A float hyperparameter. The exponential decay rate for the 2nd
+        moment estimates.
+      epsilon: A float hyperparameter. This epsilon is "epsilon hat" in the
+        Kingma and Ba paper (in the formula just before Section 2.1), not the
+        epsilon in Algorithm 1 of the paper.
+      use_locking: If True use locks for update operations.
+      name: Optional name for the operations created when applying gradients.
+        Defaults to "Adam".
+    """
+    super(AdamOptimizer, self).__init__(use_locking, name)
+
+    self._set_hyper("learning_rate", learning_rate)
+    self._set_hyper("beta1", beta1)
+    self._set_hyper("beta2", beta2)
+    self._set_hyper("epsilon", epsilon)
+
+  def _get_beta_accumulators(self, state=None):
+    if state is None:
+      state = self._get_per_graph_state()
+    return (state.get_non_slot("beta1_power"),
+            state.get_non_slot("beta2_power"))
+
+  def _create_vars(self, var_list, state):
+    # Non-slot variables end up on the same device(s).
+    state.create_non_slot(initial_value=state.get_hyper("beta1"),
+                          name="beta1_power")
+    state.create_non_slot(initial_value=state.get_hyper("beta2"),
+                          name="beta2_power")
+
+    # Create slots for the first and second moments.
+    for v in var_list:
+      state.zeros_slot(v, "m")
+      state.zeros_slot(v, "v")
+
+  def _apply_dense(self, grad, var, state):
+    m = state.get_slot(var, "m")
+    v = state.get_slot(var, "v")
+    beta1_power, beta2_power = self._get_beta_accumulators(state)
+    return training_ops.apply_adam(
+        var, m, v,
+        math_ops.cast(beta1_power, var.dtype.base_dtype),
+        math_ops.cast(beta2_power, var.dtype.base_dtype),
+        state.get_hyper("learning_rate", var.dtype.base_dtype),
+        state.get_hyper("beta1", var.dtype.base_dtype),
+        state.get_hyper("beta2", var.dtype.base_dtype),
+        state.get_hyper("epsilon", var.dtype.base_dtype),
+        grad, use_locking=self._use_locking).op
+
+  def _resource_apply_dense(self, grad, var, state):
+    m = state.get_slot(var, "m")
+    v = state.get_slot(var, "v")
+    beta1_power, beta2_power = self._get_beta_accumulators(state)
+    return training_ops.resource_apply_adam(
+        var.handle, m.handle, v.handle,
+        math_ops.cast(beta1_power, grad.dtype.base_dtype),
+        math_ops.cast(beta2_power, grad.dtype.base_dtype),
+        state.get_hyper("learning_rate", grad.dtype.base_dtype),
+        state.get_hyper("beta1", grad.dtype.base_dtype),
+        state.get_hyper("beta2", grad.dtype.base_dtype),
+        state.get_hyper("epsilon", grad.dtype.base_dtype),
+        grad, use_locking=self._use_locking)
+
+  def _apply_sparse_shared(self, grad, var, indices, scatter_add, state):
+    beta1_power, beta2_power = self._get_beta_accumulators(state)
+    beta1_power = math_ops.cast(beta1_power, var.dtype.base_dtype)
+    beta2_power = math_ops.cast(beta2_power, var.dtype.base_dtype)
+    lr_t = state.get_hyper("learning_rate", var.dtype.base_dtype)
+    beta1_t = state.get_hyper("beta1", var.dtype.base_dtype)
+    beta2_t = state.get_hyper("beta2", var.dtype.base_dtype)
+    epsilon_t = state.get_hyper("epsilon", var.dtype.base_dtype)
+    lr = (lr_t * math_ops.sqrt(1 - beta2_power) / (1 - beta1_power))
+    # m_t = beta1 * m + (1 - beta1) * g_t
+    m = state.get_slot(var, "m")
+    m_scaled_g_values = grad * (1 - beta1_t)
+    m_t = state_ops.assign(m, m * beta1_t,
+                           use_locking=self._use_locking)
+    with ops.control_dependencies([m_t]):
+      m_t = scatter_add(m, indices, m_scaled_g_values)
+    # v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
+    v = state.get_slot(var, "v")
+    v_scaled_g_values = (grad * grad) * (1 - beta2_t)
+    v_t = state_ops.assign(v, v * beta2_t, use_locking=self._use_locking)
+    with ops.control_dependencies([v_t]):
+      v_t = scatter_add(v, indices, v_scaled_g_values)
+    v_sqrt = math_ops.sqrt(v_t)
+    var_update = state_ops.assign_sub(var,
+                                      lr * m_t / (v_sqrt + epsilon_t),
+                                      use_locking=self._use_locking)
+    return control_flow_ops.group(*[var_update, m_t, v_t])
+
+  def _apply_sparse(self, grad, var, state):
+    return self._apply_sparse_shared(
+        grad.values, var, grad.indices,
+        lambda x, i, v: state_ops.scatter_add(  # pylint: disable=g-long-lambda
+            x, i, v, use_locking=self._use_locking),
+        state)
+
+  def _resource_scatter_add(self, x, i, v):
+    with ops.control_dependencies(
+        [resource_variable_ops.resource_scatter_add(
+            x.handle, i, v)]):
+      return x.value()
+
+  def _resource_apply_sparse(self, grad, var, indices, state):
+    return self._apply_sparse_shared(
+        grad, var, indices, self._resource_scatter_add, state)
+
+  def _finish(self, state):
+    # Update the power accumulators.
+    beta1_power, beta2_power = self._get_beta_accumulators(state)
+    update_beta1 = beta1_power.assign(
+        beta1_power * state.get_hyper("beta1"),
+        use_locking=self._use_locking)
+    update_beta2 = beta2_power.assign(
+        beta2_power * state.get_hyper("beta2"),
+        use_locking=self._use_locking)
+    return control_flow_ops.group(update_beta1, update_beta2)
diff --git a/tensorflow/contrib/optimizer_v2/adam_test.py b/tensorflow/contrib/optimizer_v2/adam_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..d9ad58b0a607ecef1df097c8858b074361e7892b
--- /dev/null
+++ b/tensorflow/contrib/optimizer_v2/adam_test.py
@@ -0,0 +1,333 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for Adam optimizer."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.optimizer_v2 import adam
+from tensorflow.python.client import session
+from tensorflow.python.eager import context
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+
+def adam_update_numpy(param,
+                      g_t,
+                      t,
+                      m,
+                      v,
+                      alpha=0.001,
+                      beta1=0.9,
+                      beta2=0.999,
+                      epsilon=1e-8):
+  alpha_t = alpha * np.sqrt(1 - beta2**t) / (1 - beta1**t)
+
+  m_t = beta1 * m + (1 - beta1) * g_t
+  v_t = beta2 * v + (1 - beta2) * g_t * g_t
+
+  param_t = param - alpha_t * m_t / (np.sqrt(v_t) + epsilon)
+  return param_t, m_t, v_t
+
+
+class AdamOptimizerTest(test.TestCase):
+
+  def doTestSparse(self, use_resource=False):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        # Initialize variables for numpy implementation.
+        m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
+
+        if use_resource:
+          var0 = resource_variable_ops.ResourceVariable(var0_np)
+          var1 = resource_variable_ops.ResourceVariable(var1_np)
+        else:
+          var0 = variables.Variable(var0_np)
+          var1 = variables.Variable(var1_np)
+        grads0_np_indices = np.array([0, 1], dtype=np.int32)
+        grads0 = ops.IndexedSlices(
+            constant_op.constant(grads0_np),
+            constant_op.constant(grads0_np_indices), constant_op.constant([2]))
+        grads1_np_indices = np.array([0, 1], dtype=np.int32)
+        grads1 = ops.IndexedSlices(
+            constant_op.constant(grads1_np),
+            constant_op.constant(grads1_np_indices), constant_op.constant([2]))
+        opt = adam.AdamOptimizer()
+        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], var0.eval())
+        self.assertAllClose([3.0, 4.0], var1.eval())
+
+        beta1_power, beta2_power = opt._get_beta_accumulators()
+
+        # Run 3 steps of Adam
+        for t in range(1, 4):
+          self.assertAllCloseAccordingToType(0.9**t, beta1_power.eval())
+          self.assertAllCloseAccordingToType(0.999**t, beta2_power.eval())
+          update.run()
+
+          var0_np, m0, v0 = adam_update_numpy(var0_np, grads0_np, t, m0, v0)
+          var1_np, m1, v1 = adam_update_numpy(var1_np, grads1_np, t, m1, v1)
+
+          # Validate updated params
+          self.assertAllCloseAccordingToType(var0_np, var0.eval())
+          self.assertAllCloseAccordingToType(var1_np, var1.eval())
+
+  def testSparse(self):
+    self.doTestSparse(use_resource=False)
+
+  def testResourceSparse(self):
+    self.doTestSparse(use_resource=True)
+
+  def testSparseDevicePlacement(self):
+    for index_dtype in [dtypes.int32, dtypes.int64]:
+      with self.test_session(force_gpu=test.is_gpu_available()):
+        # If a GPU is available, tests that all optimizer ops can be placed on
+        # it (i.e. they have GPU kernels).
+        var = variables.Variable([[1.0], [2.0]])
+        indices = constant_op.constant([0, 1], dtype=index_dtype)
+        gathered_sum = math_ops.reduce_sum(array_ops.gather(var, indices))
+        optimizer = adam.AdamOptimizer(3.0)
+        minimize_op = optimizer.minimize(gathered_sum)
+        variables.global_variables_initializer().run()
+        minimize_op.run()
+
+  def testSparseRepeatedIndices(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        repeated_index_update_var = variables.Variable(
+            [[1.0], [2.0]], dtype=dtype)
+        aggregated_update_var = variables.Variable(
+            [[1.0], [2.0]], dtype=dtype)
+        grad_repeated_index = ops.IndexedSlices(
+            constant_op.constant(
+                [0.1, 0.1], shape=[2, 1], dtype=dtype),
+            constant_op.constant([1, 1]),
+            constant_op.constant([2, 1]))
+        grad_aggregated = ops.IndexedSlices(
+            constant_op.constant(
+                [0.2], shape=[1, 1], dtype=dtype),
+            constant_op.constant([1]),
+            constant_op.constant([2, 1]))
+        repeated_update = adam.AdamOptimizer().apply_gradients(
+            [(grad_repeated_index, repeated_index_update_var)])
+        aggregated_update = adam.AdamOptimizer().apply_gradients(
+            [(grad_aggregated, aggregated_update_var)])
+        variables.global_variables_initializer().run()
+        self.assertAllClose(aggregated_update_var.eval(),
+                            repeated_index_update_var.eval())
+        for _ in range(3):
+          repeated_update.run()
+          aggregated_update.run()
+          self.assertAllClose(aggregated_update_var.eval(),
+                              repeated_index_update_var.eval())
+
+  def doTestBasic(self, use_resource=False):
+    for i, dtype in enumerate([dtypes.half, dtypes.float32, dtypes.float64]):
+      with self.test_session(graph=ops.Graph()):
+        # Initialize variables for numpy implementation.
+        m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
+
+        if use_resource:
+          var0 = resource_variable_ops.ResourceVariable(
+              var0_np, name="var0_%d" % i)
+          var1 = resource_variable_ops.ResourceVariable(
+              var1_np, name="var1_%d" % i)
+        else:
+          var0 = variables.Variable(var0_np)
+          var1 = variables.Variable(var1_np)
+        grads0 = constant_op.constant(grads0_np)
+        grads1 = constant_op.constant(grads1_np)
+
+        opt = adam.AdamOptimizer()
+        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        opt_variables = opt.variables()
+        beta1_power, beta2_power = opt._get_beta_accumulators()
+        self.assertTrue(beta1_power is not None)
+        self.assertTrue(beta2_power is not None)
+        self.assertIn(beta1_power, opt_variables)
+        self.assertIn(beta2_power, opt_variables)
+
+        with ops.Graph().as_default():
+          # Shouldn't return non-slot variables from other graphs.
+          self.assertEqual(0, len(opt.variables()))
+
+        if not context.executing_eagerly():
+          self.evaluate(variables.global_variables_initializer())
+          # Fetch params to validate initial values
+          self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+          self.assertAllClose([3.0, 4.0], self.evaluate(var1))
+
+        beta1_power, beta2_power = opt._get_beta_accumulators()
+
+        # Run 3 steps of Adam
+        for t in range(1, 4):
+          if not context.executing_eagerly():
+            self.evaluate(update)
+          elif t > 1:
+            opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+
+          self.assertAllCloseAccordingToType(0.9**(t + 1),
+                                             self.evaluate(beta1_power))
+          self.assertAllCloseAccordingToType(0.999**(t + 1),
+                                             self.evaluate(beta2_power))
+
+          var0_np, m0, v0 = adam_update_numpy(var0_np, grads0_np, t, m0, v0)
+          var1_np, m1, v1 = adam_update_numpy(var1_np, grads1_np, t, m1, v1)
+
+          # Validate updated params
+          self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
+          self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
+          if use_resource:
+            self.assertEqual("var0_%d/Adam:0" % (i,),
+                             opt.get_slot(var=var0, name="m").name)
+
+  def testBasic(self):
+    with self.test_session():
+      self.doTestBasic(use_resource=False)
+
+  @test_util.run_in_graph_and_eager_modes(reset_test=True)
+  def testResourceBasic(self):
+    self.doTestBasic(use_resource=True)
+
+  def testTensorLearningRate(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        # Initialize variables for numpy implementation.
+        m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
+
+        var0 = variables.Variable(var0_np)
+        var1 = variables.Variable(var1_np)
+        grads0 = constant_op.constant(grads0_np)
+        grads1 = constant_op.constant(grads1_np)
+        opt = adam.AdamOptimizer(constant_op.constant(0.001))
+        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], var0.eval())
+        self.assertAllClose([3.0, 4.0], var1.eval())
+
+        beta1_power, beta2_power = opt._get_beta_accumulators()
+
+        # Run 3 steps of Adam
+        for t in range(1, 4):
+          self.assertAllCloseAccordingToType(0.9**t, beta1_power.eval())
+          self.assertAllCloseAccordingToType(0.999**t, beta2_power.eval())
+          update.run()
+
+          var0_np, m0, v0 = adam_update_numpy(var0_np, grads0_np, t, m0, v0)
+          var1_np, m1, v1 = adam_update_numpy(var1_np, grads1_np, t, m1, v1)
+
+          # Validate updated params
+          self.assertAllCloseAccordingToType(var0_np, var0.eval())
+          self.assertAllCloseAccordingToType(var1_np, var1.eval())
+
+  def testSharing(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        # Initialize variables for numpy implementation.
+        m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
+
+        var0 = variables.Variable(var0_np)
+        var1 = variables.Variable(var1_np)
+        grads0 = constant_op.constant(grads0_np)
+        grads1 = constant_op.constant(grads1_np)
+        opt = adam.AdamOptimizer()
+        update1 = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        update2 = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+
+        beta1_power, beta2_power = opt._get_beta_accumulators()
+
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], var0.eval())
+        self.assertAllClose([3.0, 4.0], var1.eval())
+
+        # Run 3 steps of intertwined Adam1 and Adam2.
+        for t in range(1, 4):
+          self.assertAllCloseAccordingToType(0.9**t, beta1_power.eval())
+          self.assertAllCloseAccordingToType(0.999**t, beta2_power.eval())
+          if t % 2 == 0:
+            update1.run()
+          else:
+            update2.run()
+
+          var0_np, m0, v0 = adam_update_numpy(var0_np, grads0_np, t, m0, v0)
+          var1_np, m1, v1 = adam_update_numpy(var1_np, grads1_np, t, m1, v1)
+
+          # Validate updated params
+          self.assertAllCloseAccordingToType(var0_np, var0.eval())
+          self.assertAllCloseAccordingToType(var1_np, var1.eval())
+
+  def testTwoSessions(self):
+    optimizer = adam.AdamOptimizer()
+    g = ops.Graph()
+    with g.as_default():
+      with session.Session():
+        var0 = variables.Variable(np.array([1.0, 2.0]), name="v0")
+        grads0 = constant_op.constant(np.array([0.1, 0.1]))
+        optimizer.apply_gradients([(grads0, var0)])
+
+    gg = ops.Graph()
+    with gg.as_default():
+      with session.Session():
+        var0 = variables.Variable(np.array([1.0, 2.0]), name="v0")
+        grads0 = constant_op.constant(np.array([0.1, 0.1]))
+
+        # If the optimizer saves any state not keyed by graph the following line
+        # fails.
+        optimizer.apply_gradients([(grads0, var0)])
+
+  def testSlotsUniqueEager(self):
+    with context.eager_mode():
+      v1 = resource_variable_ops.ResourceVariable(1.)
+      v2 = resource_variable_ops.ResourceVariable(1.)
+      opt = adam.AdamOptimizer(1.)
+      opt.minimize(lambda: v1 + v2)
+      # There should be two non-slot variables, and two unique slot variables
+      # for v1 and v2 respectively.
+      self.assertEqual(6, len(set(opt.variables())))
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/optimizer_v2/checkpointable_utils_test.py b/tensorflow/contrib/optimizer_v2/checkpointable_utils_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..54bc23cdefab58bd84c378a2cf99327c48f0a3f1
--- /dev/null
+++ b/tensorflow/contrib/optimizer_v2/checkpointable_utils_test.py
@@ -0,0 +1,731 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# TODO(josh11b): Forked from contrib/eager/python to test OptimizerV2 the same way
+# OptimizerV1 is tested. This file should be removed once the fork is resolved.
+
+import functools
+import os
+
+import six
+
+from tensorflow.contrib.eager.python import checkpointable_utils
+from tensorflow.contrib.optimizer_v2 import adam
+from tensorflow.python.client import session as session_lib
+from tensorflow.python.eager import backprop
+from tensorflow.python.eager import context
+from tensorflow.python.eager import function
+from tensorflow.python.eager import test
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.keras._impl.keras.engine import training
+from tensorflow.python.keras._impl.keras.layers import core
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.training import checkpointable
+from tensorflow.python.training import saver as core_saver
+from tensorflow.python.training import training_util
+
+
+class NonLayerCheckpointable(checkpointable.Checkpointable):
+
+  def __init__(self):
+    super(NonLayerCheckpointable, self).__init__()
+    self.a_variable = checkpointable_utils.add_variable(
+        self, name="a_variable", shape=[])
+
+
+# pylint: disable=not-callable
+class MyModel(training.Model):
+  """A concrete Model for testing."""
+
+  def __init__(self):
+    super(MyModel, self).__init__()
+    self._named_dense = core.Dense(1, use_bias=True)
+    self._second = core.Dense(1, use_bias=False)
+    # We can still track Checkpointables which aren't Layers.
+    self._non_layer = NonLayerCheckpointable()
+
+  def call(self, values):
+    ret = self._second(self._named_dense(values))
+    return ret
+
+
+class _MirroringSaveable(
+    core_saver.BaseSaverBuilder.ResourceVariableSaveable):
+
+  def __init__(self, primary_variable, mirrored_variable, name):
+    self._primary_variable = primary_variable
+    self._mirrored_variable = mirrored_variable
+    super(_MirroringSaveable, self).__init__(
+        self._primary_variable, "", name)
+
+  def restore(self, restored_tensors, restored_shapes):
+    """Restore the same value into both variables."""
+    tensor, = restored_tensors
+    return control_flow_ops.group(
+        self._primary_variable.assign(tensor),
+        self._mirrored_variable.assign(tensor))
+
+
+class _OwnsMirroredVariables(checkpointable.CheckpointableBase):
+  """A Checkpointable object which returns a more complex SaveableObject."""
+
+  def __init__(self):
+    self.non_dep_variable = variable_scope.get_variable(
+        name="non_dep_variable", initializer=6., use_resource=True)
+    self.mirrored = variable_scope.get_variable(
+        name="mirrored", initializer=15., use_resource=True)
+
+  def _gather_saveables_for_checkpoint(self):
+    def _saveable_factory(name=self.non_dep_variable.name):
+      return _MirroringSaveable(
+          primary_variable=self.non_dep_variable,
+          mirrored_variable=self.mirrored,
+          name=name)
+    return {checkpointable.VARIABLE_VALUE_KEY: _saveable_factory}
+
+  # The Saver sorts by name before parsing, so we need a name property.
+  @property
+  def name(self):
+    return self.non_dep_variable.name
+
+
+class CheckpointingTests(test.TestCase):
+
+  @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
+  def testNamingWithOptimizer(self):
+    input_value = constant_op.constant([[3.]])
+    model = MyModel()
+    # A nuisance Model using the same optimizer. Its slot variables should not
+    # go in the checkpoint, since it is never depended on.
+    other_model = MyModel()
+    optimizer = adam.AdamOptimizer(0.001)
+    optimizer_step = training_util.get_or_create_global_step()
+    root_checkpointable = checkpointable_utils.Checkpoint(
+        optimizer=optimizer, model=model, optimizer_step=optimizer_step)
+    if context.executing_eagerly():
+      optimizer.minimize(
+          lambda: model(input_value),
+          global_step=optimizer_step)
+      optimizer.minimize(
+          lambda: other_model(input_value),
+          global_step=optimizer_step)
+    else:
+      train_op = optimizer.minimize(
+          model(input_value), global_step=optimizer_step)
+      optimizer.minimize(
+          other_model(input_value),
+          global_step=optimizer_step)
+      self.evaluate(checkpointable_utils.gather_initializers(
+          root_checkpointable))
+      self.evaluate(train_op)
+    named_variables, serialized_graph = (
+        checkpointable_utils._serialize_object_graph(root_checkpointable))
+    expected_checkpoint_names = (
+        # Created in the root node, so no prefix.
+        "optimizer_step",
+        "model/_second/kernel",
+        "model/_named_dense/kernel",
+        "model/_named_dense/bias",
+        # non-Layer dependency of the model
+        "model/_non_layer/a_variable",
+        # The optimizer creates two non-slot variables
+        "optimizer/beta1_power",
+        "optimizer/beta2_power",
+        # Slot variables
+        "model/_second/kernel/.OPTIMIZER_SLOT/optimizer/m",
+        "model/_second/kernel/.OPTIMIZER_SLOT/optimizer/v",
+        "model/_named_dense/kernel/.OPTIMIZER_SLOT/optimizer/m",
+        "model/_named_dense/kernel/.OPTIMIZER_SLOT/optimizer/v",
+        "model/_named_dense/bias/.OPTIMIZER_SLOT/optimizer/m",
+        "model/_named_dense/bias/.OPTIMIZER_SLOT/optimizer/v",
+    )
+    suffix = "/.ATTRIBUTES/VARIABLE_VALUE"
+    expected_checkpoint_names = [
+        name + suffix for name in expected_checkpoint_names]
+    six.assertCountEqual(self, expected_checkpoint_names,
+                         named_variables.keys())
+    # Check that we've mapped to the right variable objects (not exhaustive)
+    self.assertEqual(
+        "global_step:0",
+        named_variables["optimizer_step" + suffix].name)
+    self.assertEqual(
+        "my_model/dense_1/kernel:0",
+        named_variables["model/_second/kernel" + suffix].name)
+    self.assertEqual(
+        "my_model/dense/kernel:0",
+        named_variables["model/_named_dense/kernel" + suffix].name)
+    self.assertEqual(
+        "beta1_power:0",
+        named_variables["optimizer/beta1_power" + suffix].name)
+    self.assertEqual(
+        "beta2_power:0",
+        named_variables["optimizer/beta2_power" + suffix].name)
+    # Spot check the generated protocol buffers.
+    self.assertEqual("optimizer",
+                     serialized_graph.nodes[0].children[1].local_name)
+    optimizer_node = serialized_graph.nodes[serialized_graph.nodes[0].children[
+        1].node_id]
+    self.assertEqual("beta1_power",
+                     optimizer_node.children[0].local_name)
+    self.assertEqual("beta1_power",
+                     serialized_graph.nodes[optimizer_node.children[0].node_id]
+                     .attributes[0].full_name)
+    self.assertEqual(
+        "my_model/dense/kernel",
+        serialized_graph.nodes[optimizer_node.slot_variables[0]
+                               .original_variable_node_id]
+        .attributes[0].full_name)
+    # We strip off the :0 suffix, as variable.name-based saving does.
+    self.assertEqual(
+        "my_model/dense/kernel/Adam",
+        serialized_graph.nodes[optimizer_node.slot_variables[0]
+                               .slot_variable_node_id]
+        .attributes[0].full_name)
+    self.assertEqual(
+        "my_model/dense/kernel/Adam:0",
+        optimizer.get_slot(
+            var=named_variables["model/_named_dense/kernel" + suffix],
+            name="m").name)
+    self.assertEqual(
+        "model/_named_dense/kernel" + suffix,
+        serialized_graph.nodes[
+            optimizer_node.slot_variables[0]
+            .original_variable_node_id].attributes[0].checkpoint_key)
+    self.assertEqual("m", optimizer_node.slot_variables[0].slot_name)
+    self.assertEqual(
+        "model/_named_dense/kernel/.OPTIMIZER_SLOT/optimizer/m" + suffix,
+        serialized_graph.nodes[
+            optimizer_node.slot_variables[0]
+            .slot_variable_node_id].attributes[0].checkpoint_key)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testSaveRestore(self):
+    model = MyModel()
+    optimizer = adam.AdamOptimizer(0.001)
+    root_checkpointable = checkpointable_utils.Checkpoint(
+        optimizer=optimizer, model=model)
+    input_value = constant_op.constant([[3.]])
+    if context.executing_eagerly():
+      optimizer.minimize(
+          lambda: model(input_value))
+    else:
+      train_op = optimizer.minimize(model(input_value))
+      # TODO(allenl): Make initialization more pleasant when graph building.
+      root_checkpointable.save_counter  # pylint: disable=pointless-statement
+      self.evaluate(checkpointable_utils.gather_initializers(
+          root_checkpointable))
+      self.evaluate(train_op)
+    prefix = os.path.join(self.get_temp_dir(), "ckpt")
+    self.evaluate(state_ops.assign(model._named_dense.variables[1], [42.]))
+    m_bias_slot = optimizer.get_slot(model._named_dense.variables[1], "m")
+    self.evaluate(state_ops.assign(m_bias_slot, [1.5]))
+    save_path = root_checkpointable.save(file_prefix=prefix)
+    self.evaluate(state_ops.assign(model._named_dense.variables[1], [43.]))
+    self.evaluate(state_ops.assign(root_checkpointable.save_counter, 3))
+    optimizer_variables = self.evaluate(optimizer.variables())
+    self.evaluate(state_ops.assign(m_bias_slot, [-2.]))
+    # Immediate restoration
+    status = root_checkpointable.restore(save_path=save_path).assert_consumed()
+    status.run_restore_ops()
+    self.assertAllEqual([42.], self.evaluate(model._named_dense.variables[1]))
+    self.assertAllEqual(1, self.evaluate(root_checkpointable.save_counter))
+    self.assertAllEqual([1.5], self.evaluate(m_bias_slot))
+    if not context.executing_eagerly():
+      return  # Restore-on-create is only supported when executing eagerly
+    on_create_model = MyModel()
+    on_create_optimizer = adam.AdamOptimizer(
+        0.001,
+        # Preserve beta1_power and beta2_power when appying gradients so we can
+        # test that they've been restored correctly.
+        beta1=1.0, beta2=1.0)
+    on_create_root = checkpointable_utils.Checkpoint(
+        optimizer=on_create_optimizer, model=on_create_model)
+    # Deferred restoration
+    status = on_create_root.restore(save_path=save_path)
+    on_create_model(constant_op.constant([[3.]]))  # create variables
+    self.assertAllEqual(1, self.evaluate(on_create_root.save_counter))
+    self.assertAllEqual([42.],
+                        self.evaluate(
+                            on_create_model._named_dense.variables[1]))
+    on_create_m_bias_slot = on_create_optimizer.get_slot(
+        on_create_model._named_dense.variables[1], "m")
+    # Optimizer slot variables are created when the original variable is
+    # restored.
+    self.assertAllEqual([1.5], self.evaluate(on_create_m_bias_slot))
+    self.assertAllEqual(optimizer_variables[2:],
+                        self.evaluate(on_create_optimizer.variables()))
+    dummy_var = resource_variable_ops.ResourceVariable([1.])
+    on_create_optimizer.minimize(loss=dummy_var.read_value)
+    status.assert_consumed()
+    beta1_power, beta2_power = on_create_optimizer._get_beta_accumulators()
+    self.assertAllEqual(optimizer_variables[0], self.evaluate(beta1_power))
+    self.assertAllEqual(optimizer_variables[1], self.evaluate(beta2_power))
+
+  # TODO(allenl): Debug garbage created by this test in python3.
+  def testDeferredRestorationUsageEager(self):
+    """An idiomatic eager execution example."""
+    num_training_steps = 10
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    for training_continuation in range(3):
+      model = MyModel()
+      optimizer = adam.AdamOptimizer(0.001)
+      root = checkpointable_utils.Checkpoint(
+          optimizer=optimizer, model=model,
+          optimizer_step=training_util.get_or_create_global_step())
+      root.restore(core_saver.latest_checkpoint(checkpoint_directory))
+      for _ in range(num_training_steps):
+        # TODO(allenl): Use a Dataset and serialize/checkpoint it.
+        input_value = constant_op.constant([[3.]])
+        optimizer.minimize(
+            lambda: model(input_value),  # pylint: disable=cell-var-from-loop
+            global_step=root.optimizer_step)
+      root.save(file_prefix=checkpoint_prefix)
+      self.assertEqual((training_continuation + 1) * num_training_steps,
+                       root.optimizer_step.numpy())
+
+  def testUsageGraph(self):
+    """Expected usage when graph building."""
+    with context.graph_mode():
+      num_training_steps = 10
+      checkpoint_directory = self.get_temp_dir()
+      checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+      for training_continuation in range(3):
+        with ops.Graph().as_default():
+          model = MyModel()
+          optimizer = adam.AdamOptimizer(0.001)
+          root = checkpointable_utils.Checkpoint(
+              optimizer=optimizer, model=model,
+              global_step=training_util.get_or_create_global_step())
+          input_value = constant_op.constant([[3.]])
+          train_op = optimizer.minimize(
+              model(input_value),
+              global_step=root.global_step)
+          checkpoint_path = core_saver.latest_checkpoint(checkpoint_directory)
+          with self.test_session(graph=ops.get_default_graph()) as session:
+            status = root.restore(save_path=checkpoint_path)
+            status.initialize_or_restore(session=session)
+            if checkpoint_path is None:
+              self.assertEqual(0, training_continuation)
+              with self.assertRaises(AssertionError):
+                status.assert_consumed()
+            else:
+              status.assert_consumed()
+            for _ in range(num_training_steps):
+              session.run(train_op)
+            root.save(file_prefix=checkpoint_prefix, session=session)
+            self.assertEqual((training_continuation + 1) * num_training_steps,
+                             session.run(root.global_step))
+            self.assertEqual(training_continuation + 1,
+                             session.run(root.save_counter))
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testAgnosticUsage(self):
+    """Graph/eager agnostic usage."""
+    # Does create garbage when executing eagerly due to ops.Graph() creation.
+    num_training_steps = 10
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    for training_continuation in range(3):
+      with ops.Graph().as_default(), self.test_session(
+          graph=ops.get_default_graph()), test_util.device(use_gpu=True):
+        model = MyModel()
+        optimizer = adam.AdamOptimizer(0.001)
+        root = checkpointable_utils.Checkpoint(
+            optimizer=optimizer, model=model,
+            global_step=training_util.get_or_create_global_step())
+        checkpoint_path = core_saver.latest_checkpoint(checkpoint_directory)
+        status = root.restore(save_path=checkpoint_path)
+        input_value = constant_op.constant([[3.]])
+        train_fn = functools.partial(
+            optimizer.minimize,
+            functools.partial(model, input_value),
+            global_step=root.global_step)
+        if not context.executing_eagerly():
+          train_fn = functools.partial(self.evaluate, train_fn())
+        status.initialize_or_restore()
+        for _ in range(num_training_steps):
+          train_fn()
+        root.save(file_prefix=checkpoint_prefix)
+        self.assertEqual((training_continuation + 1) * num_training_steps,
+                         self.evaluate(root.global_step))
+        self.assertEqual(training_continuation + 1,
+                         self.evaluate(root.save_counter))
+
+  # pylint: disable=cell-var-from-loop
+  @test_util.run_in_graph_and_eager_modes()
+  def testWithDefun(self):
+    num_training_steps = 2
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    for training_continuation in range(3):
+      with ops.Graph().as_default(), self.test_session(
+          graph=ops.get_default_graph()), test_util.device(use_gpu=True):
+        model = MyModel()
+        # Don't actually train so we can test variable values
+        optimizer = adam.AdamOptimizer(0.)
+        root = checkpointable_utils.Checkpoint(
+            optimizer=optimizer, model=model,
+            global_step=training_util.get_or_create_global_step())
+        checkpoint_path = core_saver.latest_checkpoint(checkpoint_directory)
+        status = root.restore(save_path=checkpoint_path)
+        def train_fn():
+          @function.defun
+          def _call_model(x):
+            return model(x)
+          with backprop.GradientTape() as tape:
+            loss = _call_model(constant_op.constant([[3.]]))
+          gradients = tape.gradient(loss, model.variables)
+          return optimizer.apply_gradients(zip(gradients, model.variables),
+                                           global_step=root.global_step)
+        if not context.executing_eagerly():
+          train_fn = functools.partial(
+              self.evaluate, train_fn())
+        status.initialize_or_restore()
+        for _ in range(num_training_steps):
+          train_fn()
+        if training_continuation > 0:
+          status.assert_consumed()
+          self.assertAllClose([[42.]], self.evaluate(model.variables[0]))
+        else:
+          self.evaluate(model.variables[0].assign([[42.]]))
+        root.save(file_prefix=checkpoint_prefix)
+        self.assertEqual((training_continuation + 1) * num_training_steps,
+                         self.evaluate(root.global_step))
+        self.assertEqual(training_continuation + 1,
+                         self.evaluate(root.save_counter))
+  # pylint: enable=cell-var-from-loop
+
+  def _get_checkpoint_name(self, name):
+    root = checkpointable.Checkpointable()
+    checkpointable_utils.add_variable(
+        root, name=name, shape=[1, 2], dtype=dtypes.float64)
+    named_variables, _ = checkpointable_utils._serialize_object_graph(root)
+    checkpoint_name, = named_variables.keys()
+    with ops.name_scope("root/" + checkpoint_name):
+      pass  # Make sure we can use this as an op name if we prefix it.
+    return checkpoint_name
+
+  def testAnonymousVarsInInit(self):
+
+    class Model(training.Model):
+
+      def __init__(self):
+        super(Model, self).__init__()
+        self.w = resource_variable_ops.ResourceVariable(0.0)
+        self.b = resource_variable_ops.ResourceVariable(0.0)
+        self.vars = [self.w, self.b]
+
+      def call(self, x):
+        return x * self.w + self.b
+
+    with context.eager_mode():
+      model = Model()
+      optimizer = adam.AdamOptimizer(learning_rate=0.05)
+      checkpoint_directory = self.get_temp_dir()
+      checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+      checkpoint = checkpointable_utils.Checkpoint(
+          model=model, optimizer=optimizer)
+      for _ in range(2):
+        checkpoint.save(checkpoint_prefix)
+        with backprop.GradientTape() as tape:
+          loss = (constant_op.constant(1.)
+                  - model(constant_op.constant(1.))) ** 2
+        grad = tape.gradient(loss, model.vars)
+        optimizer.apply_gradients(
+            [(g, v) for g, v in zip(grad, model.vars)])
+
+  @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
+  def testDeferredSlotRestoration(self):
+    checkpoint_directory = self.get_temp_dir()
+
+    root = checkpointable.Checkpointable()
+    root.var = checkpointable_utils.add_variable(
+        root, name="var", initializer=0.)
+    optimizer = adam.AdamOptimizer(0.1)
+    if context.executing_eagerly():
+      optimizer.minimize(root.var.read_value)
+    else:
+      train_op = optimizer.minimize(root.var)
+      # Note that `optimizer` has not been added as a dependency of
+      # `root`. Create a one-off grouping so that slot variables for `root.var`
+      # get initialized too.
+      self.evaluate(checkpointable_utils.gather_initializers(
+          checkpointable_utils.Checkpoint(root=root, optimizer=optimizer)))
+      self.evaluate(train_op)
+    self.evaluate(state_ops.assign(root.var, 12.))
+    no_slots_path = checkpointable_utils.CheckpointableSaver(root).save(
+        os.path.join(checkpoint_directory, "no_slots"))
+    root.optimizer = optimizer
+    self.evaluate(state_ops.assign(root.var, 13.))
+    self.evaluate(state_ops.assign(optimizer.get_slot(name="m", var=root.var),
+                                   14.))
+    slots_path = checkpointable_utils.CheckpointableSaver(root).save(
+        os.path.join(checkpoint_directory, "with_slots"))
+    new_root = checkpointable.Checkpointable()
+    # Load the slot-containing checkpoint (deferred), then immediately overwrite
+    # the non-slot variable (also deferred).
+    slot_status = checkpointable_utils.CheckpointableSaver(
+        new_root).restore(slots_path)
+    no_slot_status = checkpointable_utils.CheckpointableSaver(
+        new_root).restore(no_slots_path)
+    with self.assertRaises(AssertionError):
+      no_slot_status.assert_consumed()
+    new_root.var = checkpointable_utils.add_variable(
+        new_root, name="var", shape=[])
+    no_slot_status.assert_consumed()
+    no_slot_status.run_restore_ops()
+    self.assertEqual(12., self.evaluate(new_root.var))
+    new_root.optimizer = adam.AdamOptimizer(0.1)
+    with self.assertRaisesRegexp(AssertionError, "beta1_power"):
+      slot_status.assert_consumed()
+    self.assertEqual(12., self.evaluate(new_root.var))
+    if context.executing_eagerly():
+      # Slot variables are only created with restoring initializers when
+      # executing eagerly.
+      self.assertEqual(14., self.evaluate(
+          new_root.optimizer.get_slot(name="m", var=new_root.var)))
+    else:
+      self.assertIs(new_root.optimizer.get_slot(name="m", var=new_root.var),
+                    None)
+    if context.executing_eagerly():
+      new_root.optimizer.minimize(new_root.var.read_value)
+    else:
+      train_op = new_root.optimizer.minimize(new_root.var)
+      # The slot variable now exists; restore() didn't create it, but we should
+      # now have a restore op for it.
+      slot_status.run_restore_ops()
+      self.assertEqual(14., self.evaluate(
+          new_root.optimizer.get_slot(name="m", var=new_root.var)))
+      self.evaluate(train_op)
+    slot_status.assert_consumed()
+
+  def testManySavesGraph(self):
+    """Saves after the first should not modify the graph."""
+    with context.graph_mode():
+      graph = ops.Graph()
+      with graph.as_default(), self.test_session(graph):
+        checkpoint_directory = self.get_temp_dir()
+        checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+        obj = checkpointable.Checkpointable()
+        obj.var = variable_scope.get_variable(name="v", initializer=0.)
+        obj.opt = adam.AdamOptimizer(0.1)
+        obj.opt.minimize(obj.var.read_value())
+        self.evaluate(checkpointable_utils.gather_initializers(obj))
+        saver = checkpointable_utils.CheckpointableSaver(obj)
+        saver.save(checkpoint_prefix)
+        before_ops = graph.get_operations()
+        saver.save(checkpoint_prefix)
+        self.assertEqual(before_ops, graph.get_operations())
+
+  def testManyRestoresGraph(self):
+    """Restores after the first should not modify the graph."""
+    with context.graph_mode():
+      graph = ops.Graph()
+      with graph.as_default(), self.test_session(graph):
+        checkpoint_directory = self.get_temp_dir()
+        checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+        obj = checkpointable.Checkpointable()
+        obj.var = variable_scope.get_variable(name="v", initializer=0.)
+        obj.opt = adam.AdamOptimizer(0.1)
+        obj.opt.minimize(obj.var.read_value())
+        self.evaluate(checkpointable_utils.gather_initializers(obj))
+        saver = checkpointable_utils.CheckpointableSaver(obj)
+        save_path = saver.save(checkpoint_prefix)
+        saver.restore(save_path)
+        before_ops = graph.get_operations()
+        saver.restore(save_path)
+        self.assertEqual(before_ops, graph.get_operations())
+
+  def testMultipleGraphsNonSlotVariables(self):
+    with context.graph_mode():
+      checkpoint_directory = self.get_temp_dir()
+      checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+      optimizer = adam.AdamOptimizer(0.001)
+      # Construct a model in one graph
+      first_graph = ops.Graph()
+      first_session = session_lib.Session(graph=first_graph)
+      with first_graph.as_default(), first_session.as_default():
+        first_variable = resource_variable_ops.ResourceVariable([1.])
+        first_root_checkpointable = checkpointable_utils.Checkpoint(
+            optimizer=optimizer, variable=first_variable)
+        train_op = optimizer.minimize(first_variable.read_value)
+        self.evaluate(checkpointable_utils.gather_initializers(
+            first_root_checkpointable))
+        self.evaluate(train_op)
+        self.evaluate(first_variable.assign([1.]))
+        self.evaluate(optimizer.get_slot(
+            var=first_variable, name="m").assign([2.]))
+        beta1_power, _ = optimizer._get_beta_accumulators()
+        self.evaluate(beta1_power.assign(3.))
+
+      # Save and load in a second graph
+      second_graph = ops.Graph()
+      with second_graph.as_default(), session_lib.Session(graph=second_graph):
+        second_variable = resource_variable_ops.ResourceVariable([1.])
+        second_root_checkpointable = checkpointable_utils.Checkpoint(
+            optimizer=optimizer, variable=second_variable)
+        train_op = optimizer.minimize(second_variable.read_value)
+        second_root_checkpointable.restore(None).initialize_or_restore()
+        self.evaluate(train_op)
+        self.evaluate(second_variable.assign([4.]))
+        self.evaluate(optimizer.get_slot(
+            var=second_variable, name="m").assign([5.]))
+        beta1_power, _ = optimizer._get_beta_accumulators()
+        self.evaluate(beta1_power.assign(6.))
+        save_path = second_root_checkpointable.save(checkpoint_prefix)
+        self.evaluate(second_variable.assign([7.]))
+        self.evaluate(optimizer.get_slot(
+            var=second_variable, name="m").assign([8.]))
+        beta1_power, _ = optimizer._get_beta_accumulators()
+        self.assertAllEqual(6., self.evaluate(beta1_power))
+        status = second_root_checkpointable.restore(save_path)
+        status.assert_consumed().run_restore_ops()
+        self.assertAllEqual([4.], self.evaluate(second_variable))
+        self.assertAllEqual([5.], self.evaluate(optimizer.get_slot(
+            var=second_variable, name="m")))
+        beta1_power, _ = optimizer._get_beta_accumulators()
+        self.assertAllEqual(6., self.evaluate(beta1_power))
+
+      # Check that the first graph is unmolested
+      with first_graph.as_default(), first_session.as_default():
+        self.assertAllEqual([1.], self.evaluate(first_variable))
+        self.assertAllEqual([2.], self.evaluate(optimizer.get_slot(
+            var=first_variable, name="m")))
+        beta1_power, _ = optimizer._get_beta_accumulators()
+        self.assertAllEqual(3., self.evaluate(beta1_power))
+
+
+class CheckpointCompatibilityTests(test.TestCase):
+
+  def _initialized_model(self):
+    input_value = constant_op.constant([[3.]])
+    model = MyModel()
+    optimizer = adam.AdamOptimizer(0.001)
+    optimizer_step = training_util.get_or_create_global_step()
+    root_checkpointable = checkpointable_utils.Checkpoint(
+        optimizer=optimizer, model=model, optimizer_step=optimizer_step)
+    train_op = optimizer.minimize(
+        functools.partial(model, input_value),
+        global_step=optimizer_step)
+    self.evaluate(checkpointable_utils.gather_initializers(
+        root_checkpointable))
+    self.evaluate(train_op)
+    # A regular variable, a slot variable, and a non-slot Optimizer variable
+    # with known values to check when loading.
+    self.evaluate(model._named_dense.bias.assign([1.]))
+    self.evaluate(optimizer.get_slot(
+        var=model._named_dense.bias, name="m").assign([2.]))
+    beta1_power, _ = optimizer._get_beta_accumulators()
+    self.evaluate(beta1_power.assign(3.))
+    return root_checkpointable
+
+  def _set_sentinels(self, root_checkpointable):
+    self.evaluate(root_checkpointable.model._named_dense.bias.assign([101.]))
+    self.evaluate(
+        root_checkpointable.optimizer.get_slot(
+            var=root_checkpointable.model._named_dense.bias, name="m")
+        .assign([102.]))
+    beta1_power, _ = root_checkpointable.optimizer._get_beta_accumulators()
+    self.evaluate(beta1_power.assign(103.))
+
+  def _check_sentinels(self, root_checkpointable):
+    self.assertAllEqual(
+        [1.], self.evaluate(root_checkpointable.model._named_dense.bias))
+    self.assertAllEqual([2.], self.evaluate(
+        root_checkpointable.optimizer.get_slot(
+            var=root_checkpointable.model._named_dense.bias, name="m")))
+    beta1_power, _ = root_checkpointable.optimizer._get_beta_accumulators()
+    self.assertAllEqual(3., self.evaluate(beta1_power))
+
+  def _write_name_based_checkpoint(self):
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    with context.graph_mode():
+      save_graph = ops.Graph()
+      with save_graph.as_default(), self.test_session(
+          graph=save_graph) as session:
+        root = self._initialized_model()
+        name_saver = core_saver.Saver()
+        return name_saver.save(
+            sess=session, save_path=checkpoint_prefix,
+            global_step=root.optimizer_step)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testLoadFromNameBasedSaver(self):
+    """Save a name-based checkpoint, load it using the object-based API."""
+    with test_util.device(use_gpu=True):
+      save_path = self._write_name_based_checkpoint()
+      root = self._initialized_model()
+      self._set_sentinels(root)
+      with self.assertRaises(AssertionError):
+        self._check_sentinels(root)
+      object_saver = checkpointable_utils.CheckpointableSaver(root)
+      status = object_saver.restore(save_path)
+      with self.assertRaises(AssertionError):
+        status.assert_consumed()
+      status.run_restore_ops()
+      self._check_sentinels(root)
+      self._set_sentinels(root)
+      status.initialize_or_restore()
+      self._check_sentinels(root)
+
+  # TODO(allenl): Test for the core name-based saver loading object-based
+  # checkpoints once object-based checkpointing is in core.
+
+  def testSaveGraphLoadEager(self):
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    with context.graph_mode():
+      save_graph = ops.Graph()
+      with save_graph.as_default(), self.test_session(
+          graph=save_graph) as session:
+        root = self._initialized_model()
+        object_saver = checkpointable_utils.CheckpointableSaver(root)
+        save_path = object_saver.save(
+            session=session, file_prefix=checkpoint_prefix)
+    with context.eager_mode():
+      root = self._initialized_model()
+      self._set_sentinels(root)
+      root.restore(save_path).assert_consumed()
+      self._check_sentinels(root)
+
+  def testSaveEagerLoadGraph(self):
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    with context.eager_mode():
+      root = self._initialized_model()
+      object_saver = checkpointable_utils.CheckpointableSaver(root)
+      save_path = object_saver.save(file_prefix=checkpoint_prefix)
+    with context.graph_mode():
+      save_graph = ops.Graph()
+      with save_graph.as_default(), self.test_session(
+          graph=save_graph):
+        root = self._initialized_model()
+        self._set_sentinels(root)
+        root.restore(save_path).assert_consumed().run_restore_ops()
+        self._check_sentinels(root)
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/optimizer_v2/gradient_descent.py b/tensorflow/contrib/optimizer_v2/gradient_descent.py
new file mode 100644
index 0000000000000000000000000000000000000000..945c8de5595394341077ae13cae3161c71ad4f98
--- /dev/null
+++ b/tensorflow/contrib/optimizer_v2/gradient_descent.py
@@ -0,0 +1,69 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""GradientDescent optimizer for TensorFlow."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.optimizer_v2 import optimizer_v2
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.training import training_ops
+
+
+class GradientDescentOptimizer(optimizer_v2.OptimizerV2):
+  """Optimizer that implements the gradient descent algorithm."""
+
+  def __init__(self, learning_rate, use_locking=False, name="GradientDescent"):
+    """Construct a new gradient descent optimizer.
+
+    The learning rate arg below is a hyperparameter where a hyperparameter is
+    defined as a scalar Tensor, a regular Python value or a callable (which
+    will be evaluated when `apply_gradients` is called) returning a scalar
+    Tensor or a Python value.
+
+    Args:
+      learning_rate: A float hyperparameter. The learning rate to use.
+      use_locking: If True use locks for update operations.
+      name: Optional name prefix for the operations created when applying
+        gradients. Defaults to "GradientDescent".
+    """
+    super(GradientDescentOptimizer, self).__init__(use_locking, name)
+    self._set_hyper("learning_rate", learning_rate)
+
+  def _apply_dense(self, grad, var, state):
+    return training_ops.apply_gradient_descent(
+        var,
+        state.get_hyper("learning_rate", var.dtype.base_dtype),
+        grad,
+        use_locking=self._use_locking).op
+
+  def _resource_apply_dense(self, grad, handle, state):
+    lr = state.get_hyper("learning_rate", grad.dtype.base_dtype)
+    return training_ops.resource_apply_gradient_descent(
+        handle.handle, lr, grad, use_locking=self._use_locking)
+
+  def _resource_apply_sparse_duplicate_indices(
+      self, grad, handle, indices, state):
+    lr = state.get_hyper("learning_rate", grad.dtype.base_dtype)
+    return resource_variable_ops.resource_scatter_add(
+        handle.handle, indices, -grad * lr)
+
+  def _apply_sparse_duplicate_indices(self, grad, var, state):
+    delta = ops.IndexedSlices(
+        grad.values * state.get_hyper("learning_rate", var.dtype.base_dtype),
+        grad.indices, grad.dense_shape)
+    return var.scatter_sub(delta, use_locking=self._use_locking)
diff --git a/tensorflow/contrib/optimizer_v2/gradient_descent_test.py b/tensorflow/contrib/optimizer_v2/gradient_descent_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..ad9aef804fb250395d0c42fcd145f8a1707237d0
--- /dev/null
+++ b/tensorflow/contrib/optimizer_v2/gradient_descent_test.py
@@ -0,0 +1,223 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Functional test for GradientDescent optimizer."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.optimizer_v2 import gradient_descent
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import embedding_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import resources
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+
+class GradientDescentOptimizerTest(test.TestCase):
+
+  def testBasic(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        var0 = variables.Variable([1.0, 2.0], dtype=dtype)
+        var1 = variables.Variable([3.0, 4.0], dtype=dtype)
+        grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
+        grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
+        optimizer = gradient_descent.GradientDescentOptimizer(3.0)
+        sgd_op = optimizer.apply_gradients(
+            zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+        # Fetch params to validate initial values
+        self.assertAllCloseAccordingToType([1.0, 2.0], var0.eval())
+        self.assertAllCloseAccordingToType([3.0, 4.0], var1.eval())
+        # Run 1 step of sgd
+        sgd_op.run()
+        # Validate updated params
+        self.assertAllCloseAccordingToType([1.0 - 3.0 * 0.1, 2.0 - 3.0 * 0.1],
+                                           var0.eval())
+        self.assertAllCloseAccordingToType([3.0 - 3.0 * 0.01, 4.0 - 3.0 * 0.01],
+                                           var1.eval())
+        self.assertEqual(0, len(optimizer.variables()))
+
+  def testBasicResourceVariable(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype)
+        var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype)
+        grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
+        grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
+        sgd_op = gradient_descent.GradientDescentOptimizer(3.0).apply_gradients(
+            zip([grads0, grads1], [var0, var1]))
+        # TODO(apassos) calling initialize_resources on all resources here
+        # doesn't work because the sessions and graph are reused across unit
+        # tests and this would mean trying to reinitialize variables. Figure out
+        # a long-term solution for this.
+        resources.initialize_resources([var0, var1]).run()
+        # Fetch params to validate initial values
+        self.assertAllCloseAccordingToType([1.0, 2.0], var0.eval())
+        self.assertAllCloseAccordingToType([3.0, 4.0], var1.eval())
+        # Run 1 step of sgd
+        sgd_op.run()
+        # Validate updated params
+        self.assertAllCloseAccordingToType([1.0 - 3.0 * 0.1, 2.0 - 3.0 * 0.1],
+                                           var0.eval())
+        self.assertAllCloseAccordingToType([3.0 - 3.0 * 0.01, 4.0 - 3.0 * 0.01],
+                                           var1.eval())
+
+  def testMinimizeResourceVariable(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        var0 = resource_variable_ops.ResourceVariable([[1.0, 2.0]], dtype=dtype)
+        var1 = resource_variable_ops.ResourceVariable([3.0], dtype=dtype)
+        x = constant_op.constant([[4.0], [5.0]], dtype=dtype)
+        pred = math_ops.matmul(var0, x) + var1
+        loss = pred * pred
+        sgd_op = gradient_descent.GradientDescentOptimizer(1.0).minimize(loss)
+        # TODO(apassos) calling initialize_resources on all resources here
+        # doesn't work because the sessions and graph are reused across unit
+        # tests and this would mean trying to reinitialize variables. Figure out
+        # a long-term solution for this.
+        resources.initialize_resources([var0, var1]).run()
+        # Fetch params to validate initial values
+        self.assertAllCloseAccordingToType([[1.0, 2.0]], var0.eval())
+        self.assertAllCloseAccordingToType([3.0], var1.eval())
+        # Run 1 step of sgd
+        sgd_op.run()
+        # Validate updated params
+        np_pred = 1.0 * 4.0 + 2.0 * 5.0 + 3.0
+        np_grad = 2 * np_pred
+        self.assertAllCloseAccordingToType(
+            [[1.0 - np_grad * 4.0, 2.0 - np_grad * 5.0]], var0.eval())
+        self.assertAllCloseAccordingToType([3.0 - np_grad], var1.eval())
+
+  def testMinimizeSparseResourceVariable(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        var0 = resource_variable_ops.ResourceVariable([[1.0, 2.0]], dtype=dtype)
+        var1 = resource_variable_ops.ResourceVariable([3.0], dtype=dtype)
+        x = constant_op.constant([[4.0], [5.0]], dtype=dtype)
+        pred = math_ops.matmul(embedding_ops.embedding_lookup([var0], [0]), x)
+        pred += var1
+        loss = pred * pred
+        sgd_op = gradient_descent.GradientDescentOptimizer(1.0).minimize(loss)
+        # TODO(apassos) calling initialize_resources on all resources here
+        # doesn't work because the sessions and graph are reused across unit
+        # tests and this would mean trying to reinitialize variables. Figure out
+        # a long-term solution for this.
+        variables.global_variables_initializer().run()
+        # Fetch params to validate initial values
+        self.assertAllCloseAccordingToType([[1.0, 2.0]], var0.eval())
+        self.assertAllCloseAccordingToType([3.0], var1.eval())
+        # Run 1 step of sgd
+        sgd_op.run()
+        # Validate updated params
+        np_pred = 1.0 * 4.0 + 2.0 * 5.0 + 3.0
+        np_grad = 2 * np_pred
+        self.assertAllCloseAccordingToType(
+            [[1.0 - np_grad * 4.0, 2.0 - np_grad * 5.0]], var0.eval())
+        self.assertAllCloseAccordingToType([3.0 - np_grad], var1.eval())
+
+  def testTensorLearningRate(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        var0 = variables.Variable([1.0, 2.0], dtype=dtype)
+        var1 = variables.Variable([3.0, 4.0], dtype=dtype)
+        grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
+        grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
+        lrate = constant_op.constant(3.0)
+        sgd_op = gradient_descent.GradientDescentOptimizer(
+            lrate).apply_gradients(zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+        # Fetch params to validate initial values
+        self.assertAllCloseAccordingToType([1.0, 2.0], var0.eval())
+        self.assertAllCloseAccordingToType([3.0, 4.0], var1.eval())
+        # Run 1 step of sgd
+        sgd_op.run()
+        # Validate updated params
+        self.assertAllCloseAccordingToType([1.0 - 3.0 * 0.1, 2.0 - 3.0 * 0.1],
+                                           var0.eval())
+        self.assertAllCloseAccordingToType([3.0 - 3.0 * 0.01, 4.0 - 3.0 * 0.01],
+                                           var1.eval())
+
+  def testGradWrtRef(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        opt = gradient_descent.GradientDescentOptimizer(3.0)
+        values = [1.0, 3.0]
+        vars_ = [variables.Variable([v], dtype=dtype) for v in values]
+        grads_and_vars = opt.compute_gradients(vars_[0] + vars_[1], vars_)
+        variables.global_variables_initializer().run()
+        for grad, _ in grads_and_vars:
+          self.assertAllCloseAccordingToType([1.0], grad.eval())
+
+  def testWithGlobalStep(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        global_step = variables.Variable(0, trainable=False)
+        var0 = variables.Variable([1.0, 2.0], dtype=dtype)
+        var1 = variables.Variable([3.0, 4.0], dtype=dtype)
+        grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
+        grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
+        sgd_op = gradient_descent.GradientDescentOptimizer(3.0).apply_gradients(
+            zip([grads0, grads1], [var0, var1]), global_step=global_step)
+        variables.global_variables_initializer().run()
+        # Fetch params to validate initial values
+        self.assertAllCloseAccordingToType([1.0, 2.0], var0.eval())
+        self.assertAllCloseAccordingToType([3.0, 4.0], var1.eval())
+        # Run 1 step of sgd
+        sgd_op.run()
+        # Validate updated params and global_step
+        self.assertAllCloseAccordingToType([1.0 - 3.0 * 0.1, 2.0 - 3.0 * 0.1],
+                                           var0.eval())
+        self.assertAllCloseAccordingToType([3.0 - 3.0 * 0.01, 4.0 - 3.0 * 0.01],
+                                           var1.eval())
+        self.assertAllCloseAccordingToType(1, global_step.eval())
+
+  def testSparseBasic(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        var0 = variables.Variable([[1.0], [2.0]], dtype=dtype)
+        var1 = variables.Variable([[3.0], [4.0]], dtype=dtype)
+        grads0 = ops.IndexedSlices(
+            constant_op.constant(
+                [0.1], shape=[1, 1], dtype=dtype),
+            constant_op.constant([0]),
+            constant_op.constant([2, 1]))
+        grads1 = ops.IndexedSlices(
+            constant_op.constant(
+                [0.01], shape=[1, 1], dtype=dtype),
+            constant_op.constant([1]),
+            constant_op.constant([2, 1]))
+        sgd_op = gradient_descent.GradientDescentOptimizer(3.0).apply_gradients(
+            zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+        # Fetch params to validate initial values
+        self.assertAllCloseAccordingToType([[1.0], [2.0]], var0.eval())
+        self.assertAllCloseAccordingToType([[3.0], [4.0]], var1.eval())
+        # Run 1 step of sgd
+        sgd_op.run()
+        # Validate updated params
+        self.assertAllCloseAccordingToType([[1.0 - 3.0 * 0.1], [2.0]],
+                                           var0.eval())
+        self.assertAllCloseAccordingToType([[3.0], [4.0 - 3.0 * 0.01]],
+                                           var1.eval())
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/optimizer_v2/momentum.py b/tensorflow/contrib/optimizer_v2/momentum.py
new file mode 100644
index 0000000000000000000000000000000000000000..0a5aadc2d13074cec440a7b508be56bd195d7517
--- /dev/null
+++ b/tensorflow/contrib/optimizer_v2/momentum.py
@@ -0,0 +1,124 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Momentum for TensorFlow."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.optimizer_v2 import optimizer_v2
+from tensorflow.python.training import training_ops
+
+
+class MomentumOptimizer(optimizer_v2.OptimizerV2):
+  """Optimizer that implements the Momentum algorithm.
+
+  Computes (if `use_nesterov = False`):
+
+  ```
+  accumulation = momentum * accumulation + gradient
+  variable -= learning_rate * accumulation
+  ```
+
+  Note that in the dense version of this algorithm, `accumulation` is updated
+  and applied regardless of a gradient's value, whereas the sparse version (when
+  the gradient is an `IndexedSlices`, typically because of `tf.gather` or an
+  embedding) only updates variable slices and corresponding `accumulation` terms
+  when that part of the variable was used in the forward pass.
+  """
+
+  def __init__(self, learning_rate, momentum,
+               use_locking=False, name="Momentum", use_nesterov=False):
+    """Construct a new Momentum optimizer.
+
+    Some of the args below are hyperparameters, where a hyperparameter is
+    defined as a scalar Tensor, a regular Python value or a callable (which
+    will be evaluated when `apply_gradients` is called) returning a scalar
+    Tensor or a Python value.
+
+    Args:
+      learning_rate: A float hyperparameter. The learning rate.
+      momentum: A float hyperparameter. The momentum.
+      use_locking: If `True` use locks for update operations.
+      name: Optional name prefix for the operations created when applying
+        gradients.  Defaults to "Momentum".
+      use_nesterov: If `True` use Nesterov Momentum.
+        See [Sutskever et al., 2013](
+        http://jmlr.org/proceedings/papers/v28/sutskever13.pdf).
+        This implementation always computes gradients at the value of the
+        variable(s) passed to the optimizer. Using Nesterov Momentum makes the
+        variable(s) track the values called `theta_t + mu*v_t` in the paper.
+
+    @compatibility(eager)
+    When eager execution is enabled, learning_rate and momentum can each be a
+    callable that takes no arguments and returns the actual value to use. This
+    can be useful for changing these values across different invocations of
+    optimizer functions.
+    @end_compatibility
+    """
+    super(MomentumOptimizer, self).__init__(use_locking, name)
+    self._set_hyper("learning_rate", learning_rate)
+    self._set_hyper("momentum", momentum)
+    self._use_nesterov = use_nesterov
+
+  def _create_vars(self, var_list, state):
+    for v in var_list:
+      state.zeros_slot(v, "momentum")
+
+  def _apply_dense(self, grad, var, state):
+    mom = state.get_slot(var, "momentum")
+    return training_ops.apply_momentum(
+        var,
+        mom,
+        state.get_hyper("learning_rate", var.dtype.base_dtype),
+        grad,
+        state.get_hyper("momentum", var.dtype.base_dtype),
+        use_locking=self._use_locking,
+        use_nesterov=self._use_nesterov).op
+
+  def _resource_apply_dense(self, grad, var, state):
+    mom = state.get_slot(var, "momentum")
+    return training_ops.resource_apply_momentum(
+        var.handle,
+        mom.handle,
+        state.get_hyper("learning_rate", var.dtype.base_dtype),
+        grad,
+        state.get_hyper("momentum", var.dtype.base_dtype),
+        use_locking=self._use_locking,
+        use_nesterov=self._use_nesterov)
+
+  def _apply_sparse(self, grad, var, state):
+    mom = state.get_slot(var, "momentum")
+    return training_ops.sparse_apply_momentum(
+        var,
+        mom,
+        state.get_hyper("learning_rate", var.dtype.base_dtype),
+        grad.values,
+        grad.indices,
+        state.get_hyper("momentum", var.dtype.base_dtype),
+        use_locking=self._use_locking,
+        use_nesterov=self._use_nesterov).op
+
+  def _resource_apply_sparse(self, grad, var, indices, state):
+    mom = state.get_slot(var, "momentum")
+    return training_ops.resource_sparse_apply_momentum(
+        var.handle,
+        mom.handle,
+        state.get_hyper("learning_rate", var.dtype.base_dtype),
+        grad,
+        indices,
+        state.get_hyper("momentum", var.dtype.base_dtype),
+        use_locking=self._use_locking,
+        use_nesterov=self._use_nesterov)
diff --git a/tensorflow/contrib/optimizer_v2/momentum_test.py b/tensorflow/contrib/optimizer_v2/momentum_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..26724f66c2a1db1d01577b31b739af18f51d3976
--- /dev/null
+++ b/tensorflow/contrib/optimizer_v2/momentum_test.py
@@ -0,0 +1,582 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for Momentum."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+from six.moves import xrange  # pylint: disable=redefined-builtin
+
+from tensorflow.contrib.optimizer_v2 import momentum as momentum_lib
+from tensorflow.python.eager import context
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import embedding_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+
+class MomentumOptimizerTest(test.TestCase):
+
+  def _update_nesterov_momentum_numpy(self, var, accum, g, lr, momentum):
+    var = var + accum * lr * momentum
+    accum = accum * momentum + g
+    var = var - lr * accum
+    var = var - accum * lr * momentum
+    return var, accum
+
+  def doTestBasic(self, use_resource=False, use_callable_params=False):
+    for i, dtype in enumerate([dtypes.half, dtypes.float32, dtypes.float64]):
+      if use_resource:
+        var0 = resource_variable_ops.ResourceVariable(
+            [1.0, 2.0], dtype=dtype, name="var0_%d" % i)
+        var1 = resource_variable_ops.ResourceVariable(
+            [3.0, 4.0], dtype=dtype, name="var1_%d" % i)
+      else:
+        var0 = variables.Variable([1.0, 2.0], dtype=dtype)
+        var1 = variables.Variable([3.0, 4.0], dtype=dtype)
+      grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
+      grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
+      learning_rate = lambda: 2.0
+      momentum = lambda: 0.9
+      if not use_callable_params:
+        learning_rate = learning_rate()
+        momentum = momentum()
+      mom_opt = momentum_lib.MomentumOptimizer(
+          learning_rate=learning_rate, momentum=momentum)
+      mom_update = mom_opt.apply_gradients(
+          zip([grads0, grads1], [var0, var1]))
+
+      if not context.executing_eagerly():
+        self.evaluate(variables.global_variables_initializer())
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
+
+      # Check we have slots
+      self.assertEqual(["momentum"], mom_opt.get_slot_names())
+      slot0 = mom_opt.get_slot(var0, "momentum")
+      self.assertEquals(slot0.get_shape(), var0.get_shape())
+      slot1 = mom_opt.get_slot(var1, "momentum")
+      self.assertEquals(slot1.get_shape(), var1.get_shape())
+      if not context.executing_eagerly():
+        self.assertFalse(slot0 in variables.trainable_variables())
+        self.assertFalse(slot1 in variables.trainable_variables())
+
+      # Step 1: the momentum accumulators where 0. So we should see a normal
+      # update: v -= grad * learning_rate
+      if not context.executing_eagerly():
+        self.evaluate(mom_update)
+      # Check that the momentum accumulators have been updated.
+      self.assertAllCloseAccordingToType(np.array([0.1, 0.1]),
+                                         self.evaluate(slot0))
+      self.assertAllCloseAccordingToType(np.array([0.01, 0.01]),
+                                         self.evaluate(slot1))
+      # Check that the parameters have been updated.
+      self.assertAllCloseAccordingToType(
+          np.array([1.0 - (0.1 * 2.0), 2.0 - (0.1 * 2.0)]),
+          self.evaluate(var0))
+      self.assertAllCloseAccordingToType(
+          np.array([3.0 - (0.01 * 2.0), 4.0 - (0.01 * 2.0)]),
+          self.evaluate(var1))
+      # Step 2: the momentum accumulators contain the previous update.
+      if context.executing_eagerly():
+        mom_opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+      else:
+        self.evaluate(mom_update)
+      # Check that the momentum accumulators have been updated.
+      self.assertAllCloseAccordingToType(
+          np.array([(0.9 * 0.1 + 0.1), (0.9 * 0.1 + 0.1)]),
+          self.evaluate(slot0))
+      self.assertAllCloseAccordingToType(
+          np.array([(0.9 * 0.01 + 0.01), (0.9 * 0.01 + 0.01)]),
+          self.evaluate(slot1))
+      # Check that the parameters have been updated.
+      self.assertAllCloseAccordingToType(
+          np.array([
+              1.0 - (0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0),
+              2.0 - (0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0)
+          ]), self.evaluate(var0))
+      self.assertAllCloseAccordingToType(
+          np.array([
+              2.98 - ((0.9 * 0.01 + 0.01) * 2.0), 3.98 - (
+                  (0.9 * 0.01 + 0.01) * 2.0)
+          ]), self.evaluate(var1))
+
+  def testBasic(self):
+    with self.test_session():
+      self.doTestBasic(use_resource=False)
+
+  @test_util.run_in_graph_and_eager_modes(reset_test=True)
+  def testResourceBasic(self):
+    self.doTestBasic(use_resource=True)
+
+  def testBasicCallableParams(self):
+    with context.eager_mode():
+      self.doTestBasic(use_resource=True, use_callable_params=True)
+
+  @test_util.run_in_graph_and_eager_modes(reset_test=True)
+  def testVariablesAcrossGraphs(self):
+    optimizer = momentum_lib.MomentumOptimizer(0.01, 0.5)
+    with ops.Graph().as_default():
+      var0 = resource_variable_ops.ResourceVariable(
+          [1.0, 2.0], dtype=dtypes.float32, name="var0")
+      var1 = resource_variable_ops.ResourceVariable(
+          [3.0, 4.0], dtype=dtypes.float32, name="var1")
+      if context.executing_eagerly():
+        loss = lambda: math_ops.reduce_sum(var0 + var1)
+      else:
+        loss = math_ops.reduce_sum(var0 + var1)
+      optimizer.minimize(loss)
+      optimizer_variables = optimizer.variables()
+      self.assertStartsWith(optimizer_variables[0].name, "var0")
+      self.assertStartsWith(optimizer_variables[1].name, "var1")
+      self.assertEquals(2, len(optimizer_variables))
+
+    with ops.Graph().as_default():
+      var2 = resource_variable_ops.ResourceVariable(
+          [1.0, 2.0], dtype=dtypes.float32, name="var2")
+      var3 = resource_variable_ops.ResourceVariable(
+          [3.0, 4.0], dtype=dtypes.float32, name="var3")
+      if context.executing_eagerly():
+        loss = lambda: math_ops.reduce_sum(var2 + var3)
+      else:
+        loss = math_ops.reduce_sum(var2 + var3)
+      optimizer.minimize(loss)
+      optimizer_variables = optimizer.variables()
+      self.assertStartsWith(optimizer_variables[0].name, "var2")
+      self.assertStartsWith(optimizer_variables[1].name, "var3")
+      self.assertEquals(2, len(optimizer_variables))
+
+  def testNesterovMomentum(self):
+    for dtype in [dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        var0 = variables.Variable([1.0, 2.0], dtype=dtype)
+        var1 = variables.Variable([3.0, 4.0], dtype=dtype)
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        accum0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+        accum1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+        cost = 5 * var0 * var0 + 3 * var1
+        global_step = variables.Variable(
+            array_ops.zeros([], dtypes.int64), name="global_step")
+        mom_op = momentum_lib.MomentumOptimizer(
+            learning_rate=2.0, momentum=0.9, use_nesterov=True)
+        opt_op = mom_op.minimize(cost, global_step, [var0, var1])
+        variables.global_variables_initializer().run()
+        for t in range(1, 5):
+          opt_op.run()
+          var0_np, accum0_np = self._update_nesterov_momentum_numpy(
+              var0_np, accum0_np, var0_np * 10, 2.0, 0.9)
+          var1_np, accum1_np = self._update_nesterov_momentum_numpy(var1_np,
+                                                                    accum1_np,
+                                                                    3, 2.0, 0.9)
+          self.assertAllClose(var0_np, var0.eval())
+          self.assertAllClose(var1_np, var1.eval())
+
+  def testSparseNesterovMomentum(self):
+    for dtype in [dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        accum0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+        accum1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+        grads = []
+        for t in range(1, 5):
+          grads.append(var0_np * 10)
+          var0_np, accum0_np = self._update_nesterov_momentum_numpy(
+              var0_np, accum0_np, var0_np * 10, 2.0, 0.9)
+          var1_np, accum1_np = self._update_nesterov_momentum_numpy(var1_np,
+                                                                    accum1_np,
+                                                                    3, 2.0, 0.9)
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        accum0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+        accum1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+        var0 = variables.Variable(var0_np)
+        var1 = variables.Variable(var1_np)
+        loss = 5 * var0 * var0 + 3 * var1
+        mom_op = momentum_lib.MomentumOptimizer(
+            learning_rate=2.0, momentum=0.9, use_nesterov=True)
+        x_feed = array_ops.placeholder(dtype)
+        y_feed = ops.IndexedSlices(
+            x_feed, constant_op.constant([0, 1]), constant_op.constant([2]))
+        grads_and_vars = [(y_feed, var0), (constant_op.constant(
+            [3.0, 3.0], dtype=dtype), var1)]
+        opt_update = mom_op.apply_gradients(grads_and_vars)
+        variables.global_variables_initializer().run()
+        for t in range(1, 5):
+          opt_update.run(feed_dict={x_feed: grads[t - 1]})
+          var0_np, accum0_np = self._update_nesterov_momentum_numpy(
+              var0_np, accum0_np, var0_np * 10, 2.0, 0.9)
+          var1_np, accum1_np = self._update_nesterov_momentum_numpy(var1_np,
+                                                                    accum1_np,
+                                                                    3, 2.0, 0.9)
+          self.assertAllClose(var0_np, var0.eval())
+          self.assertAllClose(var1_np, var1.eval())
+
+  @test_util.run_in_graph_and_eager_modes(reset_test=True)
+  def testMinimizeSparseResourceVariable(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      # This test invokes the ResourceSparseApplyMomentum operation, which
+      # did not have a registered GPU kernel as of April 2018. With graph
+      # execution, the placement algorithm notices this and automatically
+      # places the variable in CPU (host) memory. With eager execution,
+      # the variable would be placed in GPU memory if available, which
+      # would then conflict with the future invocation of the
+      # ResourceSparseApplyMomentum operation.
+      # To work around this discrepancy, for now we force the variable
+      # to be placed on CPU.
+      with ops.device("/cpu:0"):
+        var0 = resource_variable_ops.ResourceVariable([[1.0, 2.0]], dtype=dtype)
+
+      # pylint: disable=cell-var-from-loop
+      def loss():
+        x = constant_op.constant([[4.0], [5.0]], dtype=dtype)
+        pred = math_ops.matmul(embedding_ops.embedding_lookup([var0], [0]), x)
+        return pred * pred
+      # pylint: enable=cell-var-from-loop
+
+      opt = momentum_lib.MomentumOptimizer(learning_rate=1.0, momentum=0.0)
+      sgd_op = opt.minimize(loss)
+      self.evaluate(variables.global_variables_initializer())
+      # Run 1 step of sgd
+      self.evaluate(sgd_op)
+      # Validate updated params
+      self.assertAllCloseAccordingToType([[-111, -138]], self.evaluate(var0))
+
+  @test_util.run_in_graph_and_eager_modes(reset_test=True)
+  def testMinimizeWith2DIndiciesForEmbeddingLookup(self):
+    # This test invokes the ResourceSparseApplyMomentum operation, which
+    # did not have a registered GPU kernel as of April 2018. With graph
+    # execution, the placement algorithm notices this and automatically
+    # places the variable in CPU (host) memory. With eager execution,
+    # the variable would be placed in GPU memory if available, which
+    # would then conflict with the future invocation of the
+    # ResourceSparseApplyMomentum operation.
+    # To work around this discrepancy, for now we force the variable
+    # to be placed on CPU.
+    with ops.device("/cpu:0"):
+      var0 = resource_variable_ops.ResourceVariable(array_ops.ones([2, 2]))
+
+    def loss():
+      return math_ops.reduce_sum(embedding_ops.embedding_lookup(var0, [[1]]))
+
+    opt = momentum_lib.MomentumOptimizer(learning_rate=1.0, momentum=0.0)
+    sgd_op = opt.minimize(loss)
+    self.evaluate(variables.global_variables_initializer())
+    self.evaluate(sgd_op)
+    self.assertAllCloseAccordingToType([[1, 1], [0, 0]], self.evaluate(var0))
+
+  def testTensorLearningRateAndMomentum(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        var0 = variables.Variable([1.0, 2.0], dtype=dtype)
+        var1 = variables.Variable([3.0, 4.0], dtype=dtype)
+        grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
+        grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
+        mom_opt = momentum_lib.MomentumOptimizer(
+            learning_rate=constant_op.constant(2.0),
+            momentum=constant_op.constant(0.9))
+        mom_update = mom_opt.apply_gradients(
+            zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+        # Check we have slots
+        self.assertEqual(["momentum"], mom_opt.get_slot_names())
+        slot0 = mom_opt.get_slot(var0, "momentum")
+        self.assertEquals(slot0.get_shape(), var0.get_shape())
+        self.assertFalse(slot0 in variables.trainable_variables())
+        slot1 = mom_opt.get_slot(var1, "momentum")
+        self.assertEquals(slot1.get_shape(), var1.get_shape())
+        self.assertFalse(slot1 in variables.trainable_variables())
+
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], var0.eval())
+        self.assertAllClose([3.0, 4.0], var1.eval())
+        # Step 1: the momentum accumulators where 0. So we should see a normal
+        # update: v -= grad * learning_rate
+        mom_update.run()
+        # Check that the momentum accumulators have been updated.
+        self.assertAllCloseAccordingToType(np.array([0.1, 0.1]), slot0.eval())
+        self.assertAllCloseAccordingToType(np.array([0.01, 0.01]), slot1.eval())
+        # Check that the parameters have been updated.
+        self.assertAllCloseAccordingToType(
+            np.array([1.0 - (0.1 * 2.0), 2.0 - (0.1 * 2.0)]), var0.eval())
+        self.assertAllCloseAccordingToType(
+            np.array([3.0 - (0.01 * 2.0), 4.0 - (0.01 * 2.0)]), var1.eval())
+        # Step 2: the momentum accumulators contain the previous update.
+        mom_update.run()
+        # Check that the momentum accumulators have been updated.
+        self.assertAllCloseAccordingToType(
+            np.array([(0.9 * 0.1 + 0.1), (0.9 * 0.1 + 0.1)]), slot0.eval())
+        self.assertAllCloseAccordingToType(
+            np.array([(0.9 * 0.01 + 0.01), (0.9 * 0.01 + 0.01)]), slot1.eval())
+        # Check that the parameters have been updated.
+        self.assertAllCloseAccordingToType(
+            np.array([
+                1.0 - (0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0),
+                2.0 - (0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0)
+            ]), var0.eval())
+        self.assertAllCloseAccordingToType(
+            np.array([
+                2.98 - ((0.9 * 0.01 + 0.01) * 2.0), 3.98 - (
+                    (0.9 * 0.01 + 0.01) * 2.0)
+            ]), var1.eval())
+
+  def _dbParamsMom01(self):
+    """Return dist-belief momentum values.
+
+    Return values been generated from the dist-belief momentum unittest,
+    running with a learning rate of 0.1 and a momentum of 0.1.
+
+    These values record how a parameter vector of size 10, initialized with 0.0,
+    gets updated with 10 consecutive momentum steps.  It uses random gradients.
+
+    Returns:
+      db_grad: The gradients to apply
+      db_out: The parameters after the momentum update.
+    """
+    db_grad = [[]] * 10
+    db_out = [[]] * 10
+    # pylint: disable=line-too-long
+    db_grad[0] = [
+        0.00096264342, 0.17914793, 0.93945462, 0.41396621, 0.53037018,
+        0.93197989, 0.78648776, 0.50036013, 0.55345792, 0.96722615
+    ]
+    db_out[0] = [
+        -9.6264346e-05, -0.017914793, -0.093945466, -0.041396622, -0.053037018,
+        -0.093197994, -0.078648776, -0.050036013, -0.055345792, -0.096722618
+    ]
+    db_grad[1] = [
+        0.17075552, 0.88821375, 0.20873757, 0.25236958, 0.57578111, 0.15312378,
+        0.5513742, 0.94687688, 0.16012503, 0.22159521
+    ]
+    db_out[1] = [
+        -0.017181443, -0.10852765, -0.12421377, -0.070773244, -0.11591884,
+        -0.11783017, -0.14165108, -0.14972731, -0.076892875, -0.1285544
+    ]
+    db_grad[2] = [
+        0.35077485, 0.47304362, 0.44412705, 0.44368884, 0.078527533, 0.81223965,
+        0.31168157, 0.43203235, 0.16792089, 0.24644311
+    ]
+    db_out[2] = [
+        -0.053967446, -0.1648933, -0.1716533, -0.1180798, -0.13005978,
+        -0.20151734, -0.17911947, -0.20289968, -0.095839672, -0.15638189
+    ]
+    db_grad[3] = [
+        0.9694621, 0.75035888, 0.28171822, 0.83813518, 0.53807181, 0.3728098,
+        0.81454384, 0.03848977, 0.89759839, 0.93665648
+    ]
+    db_out[3] = [
+        -0.15459226, -0.24556576, -0.20456907, -0.20662397, -0.18528105,
+        -0.24716705, -0.2643207, -0.21206589, -0.18749419, -0.2528303
+    ]
+    db_grad[4] = [
+        0.38578293, 0.8536852, 0.88722926, 0.66276771, 0.13678469, 0.94036359,
+        0.69107032, 0.81897682, 0.5433259, 0.67860287
+    ]
+    db_out[4] = [
+        -0.20323303, -0.33900154, -0.29658359, -0.28175515, -0.20448165,
+        -0.34576839, -0.34194785, -0.29488021, -0.25099224, -0.33033544
+    ]
+    db_grad[5] = [
+        0.27885768, 0.76100707, 0.24625534, 0.81354135, 0.18959245, 0.48038563,
+        0.84163809, 0.41172323, 0.83259648, 0.44941229
+    ]
+    db_out[5] = [
+        -0.23598288, -0.42444581, -0.33041057, -0.3706224, -0.22536094,
+        -0.40366709, -0.43387437, -0.34433398, -0.34060168, -0.38302717
+    ]
+    db_grad[6] = [
+        0.27233034, 0.056316052, 0.5039115, 0.24105175, 0.35697976, 0.75913221,
+        0.73577434, 0.16014607, 0.57500273, 0.071136251
+    ]
+    db_out[6] = [
+        -0.26649091, -0.43862185, -0.38418442, -0.40361428, -0.26314685,
+        -0.48537019, -0.51664448, -0.36529395, -0.40706289, -0.39540997
+    ]
+    db_grad[7] = [
+        0.58697265, 0.2494842, 0.08106143, 0.39954534, 0.15892942, 0.12683646,
+        0.74053431, 0.16033, 0.66625422, 0.73515922
+    ]
+    db_out[7] = [
+        -0.32823896, -0.46498787, -0.39766794, -0.446868, -0.28281838,
+        -0.50622416, -0.59897494, -0.38342294, -0.48033443, -0.47016418
+    ]
+    db_grad[8] = [
+        0.8215279, 0.41994119, 0.95172721, 0.68000203, 0.79439718, 0.43384039,
+        0.55561525, 0.22567581, 0.93331909, 0.29438227
+    ]
+    db_out[8] = [
+        -0.41656655, -0.50961858, -0.49418902, -0.51919359, -0.36422527,
+        -0.55169362, -0.6627695, -0.40780342, -0.58099347, -0.50707781
+    ]
+    db_grad[9] = [
+        0.68297005, 0.67758518, 0.1748755, 0.13266537, 0.70697063, 0.055731893,
+        0.68593478, 0.50580865, 0.12602448, 0.093537711
+    ]
+    db_out[9] = [
+        -0.49369633, -0.58184016, -0.52132869, -0.5396927, -0.44306302,
+        -0.56181377, -0.73774242, -0.46082234, -0.60366184, -0.52012295
+    ]
+    # pylint: enable=line-too-long
+    return db_grad, db_out
+
+  def testLikeDistBeliefMom01(self):
+    with self.test_session():
+      db_grad, db_out = self._dbParamsMom01()
+      num_samples = len(db_grad)
+      var0 = variables.Variable([0.0] * num_samples)
+      grads0 = constant_op.constant([0.0] * num_samples)
+      mom_opt = momentum_lib.MomentumOptimizer(learning_rate=0.1, momentum=0.1)
+      mom_update = mom_opt.apply_gradients(zip([grads0], [var0]))
+      variables.global_variables_initializer().run()
+      for i in xrange(num_samples):
+        mom_update.run(feed_dict={grads0: db_grad[i]})
+        self.assertAllClose(np.array(db_out[i]), var0.eval())
+
+  def testSparse(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        var0 = variables.Variable(array_ops.zeros([4, 2], dtype=dtype))
+        var1 = variables.Variable(constant_op.constant(1.0, dtype, [4, 2]))
+        grads0 = ops.IndexedSlices(
+            constant_op.constant(
+                [[.1, .1]], dtype=dtype),
+            constant_op.constant([1]),
+            constant_op.constant([4, 2]))
+        grads1 = ops.IndexedSlices(
+            constant_op.constant(
+                [[.01, .01], [.01, .01]], dtype=dtype),
+            constant_op.constant([2, 3]),
+            constant_op.constant([4, 2]))
+        mom_opt = momentum_lib.MomentumOptimizer(
+            learning_rate=2.0, momentum=0.9)
+        mom_update = mom_opt.apply_gradients(
+            zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+
+        # Check we have slots
+        self.assertEqual(["momentum"], mom_opt.get_slot_names())
+        slot0 = mom_opt.get_slot(var0, "momentum")
+        self.assertEquals(slot0.get_shape(), var0.get_shape())
+        slot1 = mom_opt.get_slot(var1, "momentum")
+        self.assertEquals(slot1.get_shape(), var1.get_shape())
+
+        # Fetch params to validate initial values
+        self.assertAllClose([0, 0], var0.eval()[0])
+        self.assertAllClose([0, 0], var0.eval()[1])
+        self.assertAllClose([1, 1], var1.eval()[2])
+
+        # Step 1: the momentum accumulators are 0. So we should see a normal
+        # update: v -= grad * learning_rate
+        mom_update.run()
+        # Check that the momentum accumulators have been updated.
+        self.assertAllCloseAccordingToType(np.array([0, 0]), slot0.eval()[0])
+        self.assertAllCloseAccordingToType(np.array([.1, .1]), slot0.eval()[1])
+        self.assertAllCloseAccordingToType(
+            np.array([.01, .01]), slot1.eval()[2])
+        # Check that the parameters have been updated.
+        self.assertAllCloseAccordingToType(np.array([0, 0]), var0.eval()[0])
+        self.assertAllCloseAccordingToType(
+            np.array([-(0.1 * 2.0), -(0.1 * 2.0)]), var0.eval()[1])
+        self.assertAllCloseAccordingToType(
+            np.array([1.0 - (0.01 * 2.0), 1.0 - (0.01 * 2.0)]), var1.eval()[2])
+        # Step 2: the momentum accumulators contain the previous update.
+        mom_update.run()
+        # Check that the momentum accumulators have been updated.
+        self.assertAllClose(np.array([0, 0]), slot0.eval()[0])
+        self.assertAllCloseAccordingToType(
+            np.array([(0.9 * 0.1 + 0.1), (0.9 * 0.1 + 0.1)]), slot0.eval()[1])
+        self.assertAllCloseAccordingToType(
+            np.array([(0.9 * 0.01 + 0.01), (0.9 * 0.01 + 0.01)]),
+            slot1.eval()[2])
+        # Check that the parameters have been updated.
+        self.assertAllClose(np.array([0, 0]), var0.eval()[0])
+        self.assertAllCloseAccordingToType(
+            np.array([
+                -(0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0), -(0.1 * 2.0) - (
+                    (0.9 * 0.1 + 0.1) * 2.0)
+            ]), var0.eval()[1])
+        self.assertAllCloseAccordingToType(
+            np.array([
+                0.98 - ((0.9 * 0.01 + 0.01) * 2.0), 0.98 - (
+                    (0.9 * 0.01 + 0.01) * 2.0)
+            ]), var1.eval()[2])
+
+  def testSharing(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        var0 = variables.Variable([1.0, 2.0], dtype=dtype)
+        var1 = variables.Variable([3.0, 4.0], dtype=dtype)
+        grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
+        grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
+        mom_opt = momentum_lib.MomentumOptimizer(
+            learning_rate=2.0, momentum=0.9)
+        mom_update1 = mom_opt.apply_gradients(
+            zip([grads0, grads1], [var0, var1]))
+        mom_update2 = mom_opt.apply_gradients(
+            zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+
+        self.assertEqual(["momentum"], mom_opt.get_slot_names())
+        slot0 = mom_opt.get_slot(var0, "momentum")
+        self.assertEquals(slot0.get_shape(), var0.get_shape())
+        slot1 = mom_opt.get_slot(var1, "momentum")
+        self.assertEquals(slot1.get_shape(), var1.get_shape())
+
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], var0.eval())
+        self.assertAllClose([3.0, 4.0], var1.eval())
+        # Step 1: the momentum accumulators where 0. So we should see a normal
+        # update: v -= grad * learning_rate
+        mom_update1.run()
+        # Check that the momentum accumulators have been updated.
+        self.assertAllCloseAccordingToType(np.array([0.1, 0.1]), slot0.eval())
+        self.assertAllCloseAccordingToType(np.array([0.01, 0.01]), slot1.eval())
+        # Check that the parameters have been updated.
+        self.assertAllCloseAccordingToType(
+            np.array([1.0 - (0.1 * 2.0), 2.0 - (0.1 * 2.0)]), var0.eval())
+        self.assertAllCloseAccordingToType(
+            np.array([3.0 - (0.01 * 2.0), 4.0 - (0.01 * 2.0)]), var1.eval())
+        # Step 2: the second momentum accumulators contain the previous update.
+        mom_update2.run()
+        # Check that the momentum accumulators have been updated.
+        self.assertAllCloseAccordingToType(
+            np.array([(0.9 * 0.1 + 0.1), (0.9 * 0.1 + 0.1)]), slot0.eval())
+        self.assertAllCloseAccordingToType(
+            np.array([(0.9 * 0.01 + 0.01), (0.9 * 0.01 + 0.01)]), slot1.eval())
+        # Check that the parameters have been updated.
+        self.assertAllCloseAccordingToType(
+            np.array([
+                1.0 - (0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0),
+                2.0 - (0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0)
+            ]), var0.eval())
+        self.assertAllCloseAccordingToType(
+            np.array([
+                2.98 - ((0.9 * 0.01 + 0.01) * 2.0), 3.98 - (
+                    (0.9 * 0.01 + 0.01) * 2.0)
+            ]), var1.eval())
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/optimizer_v2/optimizer_v2.py b/tensorflow/contrib/optimizer_v2/optimizer_v2.py
new file mode 100644
index 0000000000000000000000000000000000000000..25d19578ea8c4f53019657ab85950a814d1a47b8
--- /dev/null
+++ b/tensorflow/contrib/optimizer_v2/optimizer_v2.py
@@ -0,0 +1,1352 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Version 2 of class Optimizer."""
+# pylint: disable=g-bad-name
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import abc
+
+from tensorflow.python.eager import backprop
+from tensorflow.python.eager import context
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import gradients
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops import variables
+from tensorflow.python.training import checkpointable
+from tensorflow.python.training import distribute as distribute_lib
+from tensorflow.python.training import optimizer as optimizer_v1
+from tensorflow.python.training import slot_creator
+from tensorflow.python.util import nest
+
+
+class _OptimizableVariable(object):
+  """Interface for abstracting over variables in the optimizers."""
+
+  @abc.abstractmethod
+  def target(self):
+    """Returns the optimization target for this variable."""
+    raise NotImplementedError("Calling an abstract method.")
+
+  @abc.abstractmethod
+  def update_op(self, optimizer, g, *args):
+    """Returns the update ops for updating the variable."""
+    raise NotImplementedError("Calling an abstract method.")
+
+
+class _RefVariableProcessor(_OptimizableVariable):
+  """Processor for Variable."""
+
+  def __init__(self, v):
+    self._v = v
+
+  def target(self):
+    return self._v._ref()  # pylint: disable=protected-access
+
+  def update_op(self, optimizer, g, *args):
+    if isinstance(g, ops.Tensor):
+      update_op = optimizer._apply_dense(g, self._v, *args)  # pylint: disable=protected-access
+      if self._v.constraint is not None:
+        with ops.control_dependencies([update_op]):
+          return self._v.assign(self._v.constraint(self._v))
+      else:
+        return update_op
+    else:
+      assert isinstance(g, ops.IndexedSlices), ("Gradient ", g, " is neither a "
+                                                "tensor nor IndexedSlices.")
+      if self._v.constraint is not None:
+        raise RuntimeError(
+            "Cannot use a constraint function on a sparse variable.")
+      # pylint: disable=protected-access
+      return optimizer._apply_sparse_duplicate_indices(g, self._v, *args)
+
+
+class _DenseReadResourceVariableProcessor(_OptimizableVariable):
+  """Processor for dense ResourceVariables."""
+
+  def __init__(self, v):
+    self._v = v
+
+  def target(self):
+    return self._v
+
+  def update_op(self, optimizer, g, *args):
+    # pylint: disable=protected-access
+    update_op = optimizer._resource_apply_dense(g, self._v.op.inputs[0], *args)
+    if self._v.constraint is not None:
+      with ops.control_dependencies([update_op]):
+        return self._v.assign(self._v.constraint(self._v))
+    else:
+      return update_op
+
+
+class _DenseResourceVariableProcessor(_OptimizableVariable):
+  """Processor for dense ResourceVariables."""
+
+  def __init__(self, v):
+    self._v = v
+
+  def target(self):
+    return self._v
+
+  def update_op(self, optimizer, g, *args):
+    # pylint: disable=protected-access
+    if isinstance(g, ops.IndexedSlices):
+      if self._v.constraint is not None:
+        raise RuntimeError(
+            "Cannot use a constraint function on a sparse variable.")
+      return optimizer._resource_apply_sparse_duplicate_indices(
+          g.values, self._v, g.indices, *args)
+    update_op = optimizer._resource_apply_dense(g, self._v, *args)
+    if self._v.constraint is not None:
+      with ops.control_dependencies([update_op]):
+        return self._v.assign(self._v.constraint(self._v))
+    else:
+      return update_op
+
+
+class _StreamingModelPortProcessor(_OptimizableVariable):
+  """Processor for streaming ModelPorts."""
+
+  def __init__(self, v):
+    self._v = v
+
+  def target(self):
+    return self._v
+
+  def update_op(self, optimizer, g, *args):
+    return g
+
+
+class _TensorProcessor(_OptimizableVariable):
+  """Processor for ordinary Tensors.
+
+  Even though a Tensor can't really be updated, sometimes it is useful to
+  compute the gradients with respect to a Tensor using the optimizer. Updating
+  the Tensor is, of course, unsupported.
+  """
+
+  def __init__(self, v):
+    self._v = v
+
+  def target(self):
+    return self._v
+
+  def update_op(self, optimizer, g, *args):
+    raise NotImplementedError("Trying to update a Tensor ", self._v)
+
+
+def _get_processor(v):
+  """The processor of v."""
+  if context.executing_eagerly():
+    if isinstance(v, ops.Tensor):
+      return _TensorProcessor(v)
+    else:
+      return _DenseResourceVariableProcessor(v)
+  if v.op.type == "VarHandleOp":
+    return _DenseResourceVariableProcessor(v)
+  if isinstance(v, variables.Variable):
+    return _RefVariableProcessor(v)
+  if v.op.type == "SubmodelPort":
+    return _StreamingModelPortProcessor(v)
+  if isinstance(v, ops.Tensor):
+    return _TensorProcessor(v)
+  raise NotImplementedError("Trying to optimize unsupported type ", v)
+
+
+def _var_key_v2(var):
+  """Key for representing a primary variable, for looking up slots."""
+  # pylint: disable=protected-access
+  if hasattr(var, "_mirrored_container"):
+    mirrored_container = var._mirrored_container()
+    assert mirrored_container is not None
+    if context.executing_eagerly():
+      return mirrored_container._unique_id
+    return mirrored_container._shared_name
+  if context.executing_eagerly():
+    return var._unique_id
+  return var.op.name
+
+
+def _resolve(value, name):
+  if callable(value):
+    value = value()
+  return ops.convert_to_tensor(value, name=name)
+
+
+def _is_dynamic(value):
+  """Returns true if __init__ arg `value` should be re-evaluated each step."""
+  if callable(value): return True
+  # Don't need to do anything special in graph mode, since dynamic values
+  # will propagate correctly automatically.
+  # TODO(josh11b): Add per-device caching across steps using variables for
+  # truly static values once we add distributed support.
+  if context.executing_eagerly() and isinstance(
+      value, resource_variable_ops.ResourceVariable):
+    return True
+  return False
+
+
+class _OptimizerV2State(object):
+  """Holds per-graph and per-step optimizer state.
+
+  Use _init_with_static_hyper() to create the state for a graph, and then
+  _copy_with_dynamic_hyper() to convert that to state for a particular step.
+  The difference between the two is that the former only has hyper
+  parameter values that are static and the latter also has values that
+  can change every step (according to _is_dynamic()).
+  """
+
+  def __init__(self, op_name):
+    self._op_name = op_name
+
+  def _init_with_static_hyper(self, hyper):
+    """Initialize a fresh state object from hyper dict."""
+    # self._hyper contains a dict from name to a dict with the Tensor values.
+    # This dict starts with a single item with key "None" with the hyper
+    # parameter value converted to a Tensor. Other items have dtype keys
+    # with that Tensor cast to that dtype.
+    self._hyper = {name: {None: ops.convert_to_tensor(value, name=name)}
+                   for name, (dynamic, value) in hyper.items() if not dynamic}
+    self._slots = {}
+    self._non_slot_dict = {}
+    # Extra state to help Optimizers implement Checkpointable. Holds information
+    # about variables which will be restored as soon as they're created.
+    self._deferred_dependencies = {}  # Non-slot variables
+    self._deferred_slot_restorations = {}  # Slot variables
+
+  def _copy_with_dynamic_hyper(self, hyper, distribution, non_slot_devices):
+    """Create a new state object for a particular step."""
+    ret = _OptimizerV2State(self._op_name)
+    # pylint: disable=protected-access
+    ret._slots = self._slots
+    ret._non_slot_dict = self._non_slot_dict
+    ret._deferred_dependencies = self._deferred_dependencies
+    ret._deferred_slot_restorations = self._deferred_slot_restorations
+    ret._hyper = {name: {None: _resolve(value, name)}
+                  for name, (dynamic, value) in hyper.items() if dynamic}
+    ret._hyper.update(self._hyper)
+    ret._non_slot_devices = non_slot_devices
+    ret._distribution = distribution
+    return ret
+
+  def _variables(self):
+    """Returns a list of all variables held by self."""
+    optimizer_variables = list(self._non_slot_dict.values())
+    for variable_dict in self._slots.values():
+      for slot_for_variable in variable_dict.values():
+        optimizer_variables.append(slot_for_variable)
+    # Sort variables by name so that the return is deterministic.
+    return sorted(optimizer_variables, key=lambda v: v.name)
+
+  def _slot_dict(self, slot_name):
+    """Returns a dict for caching slots created under the given name.
+
+    Args:
+      slot_name: Name for the slot.
+
+    Returns:
+      A dict that maps primary `Variable` objects to the slot created
+      for that variable, under the given slot name.
+    """
+    named_slots = self._slots.get(slot_name, None)
+    if named_slots is None:
+      named_slots = {}
+      self._slots[slot_name] = named_slots
+    return named_slots
+
+  def create_slot(self, var, val, slot_name, optional_op_name=None):
+    """Find or create a slot for a variable.
+
+    Args:
+      var: A `Variable` object.
+      val: A `Tensor`.  The initial value of the slot.
+      slot_name: Name for the slot.
+      optional_op_name: Name to use when scoping the Variable that
+        needs to be created for the slot.
+
+    Returns:
+      A `Variable` object.
+    """
+    named_slots = self._slot_dict(slot_name)
+    var_key = _var_key_v2(var)
+    if var_key not in named_slots:
+      new_slot_variable = slot_creator.create_slot(
+          var, val, optional_op_name or self._op_name)
+      self._restore_slot_variable(
+          slot_name=slot_name, variable=var,
+          slot_variable=new_slot_variable)
+      named_slots[var_key] = new_slot_variable
+    return named_slots[var_key]
+
+  def create_slot_with_initializer(self, var, initializer, shape, dtype,
+                                   slot_name, optional_op_name=None):
+    """Find or create a slot for a variable, using an Initializer.
+
+    Args:
+      var: A `Variable` object.
+      initializer: An `Initializer`.  The initial value of the slot.
+      shape: Shape of the initial value of the slot.
+      dtype: Type of the value of the slot.
+      slot_name: Name for the slot.
+      optional_op_name: Name to use when scoping the Variable that
+        needs to be created for the slot.
+
+    Returns:
+      A `Variable` object.
+    """
+    named_slots = self._slot_dict(slot_name)
+    var_key = _var_key_v2(var)
+    if var_key not in named_slots:
+      new_slot_variable = slot_creator.create_slot_with_initializer(
+          var, initializer, shape, dtype, optional_op_name or self._op_name)
+      self._restore_slot_variable(
+          slot_name=slot_name, variable=var,
+          slot_variable=new_slot_variable)
+      named_slots[var_key] = new_slot_variable
+    return named_slots[var_key]
+
+  def zeros_slot(self, var, slot_name, optional_op_name=None):
+    """Find or create a slot initialized with 0.0.
+
+    Args:
+      var: A `Variable` object.
+      slot_name: Name for the slot.
+      optional_op_name: Name to use when scoping the Variable that
+        needs to be created for the slot.
+
+    Returns:
+      A `Variable` object.
+    """
+    named_slots = self._slot_dict(slot_name)
+    var_key = _var_key_v2(var)
+    if var_key not in named_slots:
+      new_slot_variable = slot_creator.create_zeros_slot(
+          var, optional_op_name or self._op_name)
+      self._restore_slot_variable(
+          slot_name=slot_name, variable=var,
+          slot_variable=new_slot_variable)
+      named_slots[var_key] = new_slot_variable
+    return named_slots[var_key]
+
+  def _create_or_restore_slot_variable(
+      self, slot_variable_position, slot_name, variable,
+      optional_op_name=None):
+    """Restore a slot variable's value, possibly creating it.
+
+    Called when a variable which has an associated slot variable is created or
+    restored. When executing eagerly, we create the slot variable with a
+    restoring initializer.
+
+    No new variables are created when graph building. Instead,
+    _restore_slot_variable catches these after normal creation and adds restore
+    ops to the graph. This method is nonetheless important when graph building
+    for the case when a slot variable has already been created but `variable`
+    has just been added to a dependency graph (causing us to realize that the
+    slot variable needs to be restored).
+
+    Args:
+      slot_variable_position: A `checkpointable._CheckpointPosition` object
+        indicating the slot variable `Checkpointable` object to be restored.
+      slot_name: The name of this `Optimizer`'s slot to restore into.
+      variable: The variable object this slot is being created for.
+      optional_op_name: Name to use when scoping the Variable that
+        needs to be created for the slot.
+    """
+    slot_variable = self.get_slot(var=variable, name=slot_name)
+    if (slot_variable is None and context.executing_eagerly() and
+        slot_variable_position.is_simple_variable()):
+      initializer = checkpointable.CheckpointInitialValue(
+          checkpoint_position=slot_variable_position)
+      slot_variable = self.create_slot(
+          var=variable,
+          val=initializer,
+          slot_name=slot_name,
+          optional_op_name=optional_op_name)
+      # Optimizers do not have unconditional dependencies on their slot
+      # variables (nor do any other objects). They are only saved if the
+      # variables they were created for are also saved.
+    if slot_variable is not None:
+      # If we've either made this slot variable, or if we've pulled out an
+      # existing slot variable, we should restore it.
+      slot_variable_position.restore(slot_variable)
+    else:
+      # We didn't make the slot variable. Defer restoring until it gets created
+      # normally. We keep a list rather than the one with the highest restore
+      # UID in case slot variables have their own dependencies, in which case
+      # those could differ between restores.
+      variable_key = _var_key_v2(variable)
+      self._deferred_slot_restorations.setdefault(
+          slot_name, {}).setdefault(variable_key, []).append(
+              slot_variable_position)
+
+  def get_slot(self, var, name):
+    """Return a slot named `name` created for `var` by the Optimizer.
+
+    Some `Optimizer` subclasses use additional variables.  For example
+    `Momentum` and `Adagrad` use variables to accumulate updates.  This method
+    gives access to these `Variable` objects if for some reason you need them.
+
+    Use `get_slot_names()` to get the list of slot names created by the
+    `Optimizer`.
+
+    Args:
+      var: A variable passed to `minimize()` or `apply_gradients()`.
+      name: A string.
+
+    Returns:
+      The `Variable` for the slot if it was created, `None` otherwise.
+    """
+    named_slots = self._slots.get(name, None)
+    if not named_slots:
+      return None
+    return named_slots.get(_var_key_v2(var), None)
+
+  def get_slot_names(self):
+    """Return a list of the names of slots created by the `Optimizer`.
+
+    See `get_slot()`.
+
+    Returns:
+      A list of strings.
+    """
+    return sorted(self._slots.keys())
+
+  def create_non_slot(self, initial_value, name, colocate_with=None):
+    """Add an extra variable, not associated with a slot."""
+    v = self._non_slot_dict.get(name, None)
+    if v is None:
+      if colocate_with is None: colocate_with = self._non_slot_devices
+      with self._distribution.colocate_vars_with(colocate_with):
+        # TODO(josh11b): Use get_variable() except for the legacy Adam use case.
+        v = variable_scope.variable(initial_value, name=name, trainable=False)
+      self._non_slot_dict[name] = v
+      deferred_dependencies_list = self._deferred_dependencies.pop(name, ())
+      for checkpoint_position in sorted(
+          deferred_dependencies_list,
+          key=lambda restore: restore.checkpoint.restore_uid,
+          reverse=True):
+        checkpoint_position.restore(v)
+    return v
+
+  def _restore_slot_variable(self, slot_name, variable, slot_variable):
+    """Restore a newly created slot variable's value."""
+    variable_key = _var_key_v2(variable)
+    deferred_restorations = self._deferred_slot_restorations.get(
+        slot_name, {}).pop(variable_key, [])
+    # Iterate over restores, highest restore UID first to minimize the number
+    # of assignments.
+    deferred_restorations.sort(key=lambda position: position.restore_uid,
+                               reverse=True)
+    for checkpoint_position in deferred_restorations:
+      checkpoint_position.restore(slot_variable)
+
+  def get_non_slot(self, name):
+    """Returns the non-slot variable identified by `name`."""
+    return self._non_slot_dict.get(name, None)
+
+  def get_hyper(self, name, dtype=None):
+    """Returns the `name` hyper parameter, optionally cast to `dtype`."""
+    dtype_dict = self._hyper[name]
+    # Do we have the value cast to dtype already cached? This should always
+    # succeed when dtype is None.
+    if dtype in dtype_dict:
+      return dtype_dict[dtype]
+    # Not cached, cast to dtype and save the result in the cache.
+    result = math_ops.cast(dtype_dict[None], dtype)
+    dtype_dict[dtype] = result
+    return result
+
+
+class OptimizerV2(optimizer_v1.Optimizer):
+  """Updated base class for optimizers.
+
+  This class defines the API to add Ops to train a model.  You never use this
+  class directly, but instead instantiate one of its subclasses such as
+  `GradientDescentOptimizer`, `AdagradOptimizer`, or `MomentumOptimizer`.
+
+  ### Usage
+
+  ```python
+  # Create an optimizer with the desired parameters.
+  opt = GradientDescentOptimizer(learning_rate=0.1)
+  # Add Ops to the graph to minimize a cost by updating a list of variables.
+  # "cost" is a Tensor, and the list of variables contains tf.Variable
+  # objects.
+  opt_op = opt.minimize(cost, var_list=<list of variables>)
+  ```
+
+  In the training program you will just have to run the returned Op.
+
+  ```python
+  # Execute opt_op to do one step of training:
+  opt_op.run()
+  ```
+
+  ### Processing gradients before applying them.
+
+  Calling `minimize()` takes care of both computing the gradients and
+  applying them to the variables.  If you want to process the gradients
+  before applying them you can instead use the optimizer in three steps:
+
+  1.  Compute the gradients with `compute_gradients()`.
+  2.  Process the gradients as you wish.
+  3.  Apply the processed gradients with `apply_gradients()`.
+
+  Example:
+
+  ```python
+  # Create an optimizer.
+  opt = GradientDescentOptimizer(learning_rate=0.1)
+
+  # Compute the gradients for a list of variables.
+  grads_and_vars = opt.compute_gradients(loss, <list of variables>)
+
+  # grads_and_vars is a list of tuples (gradient, variable).  Do whatever you
+  # need to the 'gradient' part, for example cap them, etc.
+  capped_grads_and_vars = [(MyCapper(gv[0]), gv[1]) for gv in grads_and_vars]
+
+  # Ask the optimizer to apply the capped gradients.
+  opt.apply_gradients(capped_grads_and_vars)
+  ```
+
+  ### Gating Gradients
+
+  Both `minimize()` and `compute_gradients()` accept a `gate_gradients`
+  argument that controls the degree of parallelism during the application of
+  the gradients.
+
+  The possible values are: `GATE_NONE`, `GATE_OP`, and `GATE_GRAPH`.
+
+  <b>`GATE_NONE`</b>: Compute and apply gradients in parallel.  This provides
+  the maximum parallelism in execution, at the cost of some non-reproducibility
+  in the results.  For example the two gradients of `matmul` depend on the input
+  values: With `GATE_NONE` one of the gradients could be applied to one of the
+  inputs _before_ the other gradient is computed resulting in non-reproducible
+  results.
+
+  <b>`GATE_OP`</b>: For each Op, make sure all gradients are computed before
+  they are used.  This prevents race conditions for Ops that generate gradients
+  for multiple inputs where the gradients depend on the inputs.
+
+  <b>`GATE_GRAPH`</b>: Make sure all gradients for all variables are computed
+  before any one of them is used.  This provides the least parallelism but can
+  be useful if you want to process all gradients before applying any of them.
+
+  ### Slots
+
+  Some optimizer subclasses, such as `MomentumOptimizer` and `AdagradOptimizer`
+  allocate and manage additional variables associated with the variables to
+  train.  These are called <i>Slots</i>.  Slots have names and you can ask the
+  optimizer for the names of the slots that it uses.  Once you have a slot name
+  you can ask the optimizer for the variable it created to hold the slot value.
+
+  This can be useful if you want to log debug a training algorithm, report stats
+  about the slots, etc.
+
+  ### Non-slot variables
+
+  Some optimizer subclasses, such as `AdamOptimizer` have variables that
+  are not associated with the variables to train, just the step itself.
+
+  ### Hyper parameters
+
+  These are arguments passed to the optimizer subclass constructor
+  (the `__init__` method), and then passed to `self._set_hyper()`.
+  They can be either regular Python values (like 1.0), tensors, or
+  callables. If they are callable, the callable will be called during
+  `apply_gradients()` to get the value for the hyper parameter.
+
+  ### State
+
+  Internal methods apre passed a `state` argument with the correct
+  values to use for the slot and non-slot variables, and the hyper
+  parameters.
+  """
+
+  # Values for gate_gradients.
+  GATE_NONE = 0
+  GATE_OP = 1
+  GATE_GRAPH = 2
+
+  def __init__(self, use_locking, name):
+    """Create a new Optimizer.
+
+    This must be called by the constructors of subclasses.
+    Note that Optimizer instances should not bind to a single graph,
+    and so shouldn't keep Tensors as member variables. Generally
+    you should be able to use the _set_hyper()/state.get_hyper()
+    facility instead.
+
+    Args:
+      use_locking: Bool. If True apply use locks to prevent concurrent updates
+        to variables.
+      name: A non-empty string.  The name to use for accumulators created
+        for the optimizer.
+
+    Raises:
+      ValueError: If name is malformed.
+      RuntimeError: If _create_slots has been overridden instead of
+          _create_vars.
+    """
+    # Note: We intentionally don't call parent __init__.
+
+    # Optimizer._create_slots was replaced by _create_vars in OptimizerV2.
+    if (self.__class__._create_slots.__code__ is not  # pylint: disable=protected-access
+        OptimizerV2._create_slots.__code__):
+      raise RuntimeError("Override _create_vars instead of _create_slots when "
+                         "descending from OptimizerV2 (class %s)" %
+                         self.__class__.__name__)
+    if not name:
+      raise ValueError("Must specify the optimizer name")
+
+    self._use_locking = use_locking
+    self._name = name
+    # Map from graph_key to state for that graph. We use the graph_key
+    # since it works in both eager and graph mode, and gives the outer
+    # graph inside functions.
+    tower_context = distribute_lib.get_tower_context()
+    if tower_context is None:
+      # In a cross-tower context for a DistributionStrategy, which means
+      # only one Optimizer will be created, not one per tower.
+      self._per_graph_state = {}
+    else:
+      # We use get_tower_context().merge_call() to get a single dict
+      # shared across all model replicas when running with a
+      # DistributionStrategy.
+      self._per_graph_state = tower_context.merge_call(lambda _: {})
+
+    # Hyper parameters, and whether they should be re-evaluated every step.
+    self._hyper = {}
+
+  def _set_hyper(self, name, value):
+    self._hyper[name] = (_is_dynamic(value), value)
+
+  def minimize(self, loss, global_step=None, var_list=None,
+               gate_gradients=GATE_OP, aggregation_method=None,
+               colocate_gradients_with_ops=False, name=None,
+               grad_loss=None, stop_gradients=None,
+               scale_loss_by_num_towers=None):
+    """Add operations to minimize `loss` by updating `var_list`.
+
+    This method simply combines calls `compute_gradients()` and
+    `apply_gradients()`. If you want to process the gradient before applying
+    them call `compute_gradients()` and `apply_gradients()` explicitly instead
+    of using this function.
+
+    Args:
+      loss: A `Tensor` containing the value to minimize.
+      global_step: Optional `Variable` to increment by one after the
+        variables have been updated.
+      var_list: Optional list or tuple of `Variable` objects to update to
+        minimize `loss`.  Defaults to the list of variables collected in
+        the graph under the key `GraphKeys.TRAINABLE_VARIABLES`.
+      gate_gradients: How to gate the computation of gradients.  Can be
+        `GATE_NONE`, `GATE_OP`, or  `GATE_GRAPH`.
+      aggregation_method: Specifies the method used to combine gradient terms.
+        Valid values are defined in the class `AggregationMethod`.
+      colocate_gradients_with_ops: If True, try colocating gradients with
+        the corresponding op.
+      name: Optional name for the returned operation.
+      grad_loss: Optional. A `Tensor` holding the gradient computed for `loss`.
+      stop_gradients: Optional. A Tensor or list of tensors not to differentiate
+        through.
+      scale_loss_by_num_towers: Optional boolean. If true, scale the loss
+        down by the number of towers. By default, auto-detects whether this
+        is needed.
+
+    Returns:
+      An Operation that updates the variables in `var_list`.  If `global_step`
+      was not `None`, that operation also increments `global_step`.
+
+    Raises:
+      ValueError: If some of the variables are not `Variable` objects.
+
+    @compatibility(eager)
+    When eager execution is enabled, `loss` should be a Python function that
+    takes elements of `var_list` as arguments and computes the value to be
+    minimized. If `var_list` is None, `loss` should take no arguments.
+    Minimization (and gradient computation) is done with respect to the
+    elements of `var_list` if not None, else with respect to any trainable
+    variables created during the execution of the `loss` function.
+    `gate_gradients`, `aggregation_method`, `colocate_gradients_with_ops` and
+    `grad_loss` are ignored when eager execution is enabled.
+    @end_compatibility
+    """
+    grads_and_vars = self.compute_gradients(
+        loss, var_list=var_list, gate_gradients=gate_gradients,
+        aggregation_method=aggregation_method,
+        colocate_gradients_with_ops=colocate_gradients_with_ops,
+        grad_loss=grad_loss, stop_gradients=stop_gradients,
+        scale_loss_by_num_towers=scale_loss_by_num_towers)
+
+    vars_with_grad = [v for g, v in grads_and_vars if g is not None]
+    if not vars_with_grad:
+      raise ValueError(
+          "No gradients provided for any variable, check your graph for ops"
+          " that do not support gradients, between variables %s and loss %s." %
+          ([str(v) for _, v in grads_and_vars], loss))
+
+    return self.apply_gradients(grads_and_vars, global_step=global_step,
+                                name=name)
+
+  def compute_gradients(self, loss, var_list=None,
+                        gate_gradients=GATE_OP,
+                        aggregation_method=None,
+                        colocate_gradients_with_ops=False,
+                        grad_loss=None, stop_gradients=None,
+                        scale_loss_by_num_towers=None):
+    """Compute gradients of `loss` for the variables in `var_list`.
+
+    This is the first part of `minimize()`.  It returns a list
+    of (gradient, variable) pairs where "gradient" is the gradient
+    for "variable".  Note that "gradient" can be a `Tensor`, an
+    `IndexedSlices`, or `None` if there is no gradient for the
+    given variable.
+
+    Args:
+      loss: A Tensor containing the value to minimize or a callable taking
+        no arguments which returns the value to minimize. When eager execution
+        is enabled it must be a callable.
+      var_list: Optional list or tuple of `tf.Variable` to update to minimize
+        `loss`.  Defaults to the list of variables collected in the graph
+        under the key `GraphKeys.TRAINABLE_VARIABLES`.
+      gate_gradients: How to gate the computation of gradients.  Can be
+        `GATE_NONE`, `GATE_OP`, or `GATE_GRAPH`.
+      aggregation_method: Specifies the method used to combine gradient terms.
+        Valid values are defined in the class `AggregationMethod`.
+      colocate_gradients_with_ops: If True, try colocating gradients with
+        the corresponding op.
+      grad_loss: Optional. A `Tensor` holding the gradient computed for `loss`.
+      stop_gradients: Optional. A Tensor or list of tensors not to differentiate
+        through.
+      scale_loss_by_num_towers: Optional boolean. If true, scale the loss
+        down by the number of towers. By default, auto-detects whether this
+        is needed.
+
+    Returns:
+      A list of (gradient, variable) pairs. Variable is always present, but
+      gradient can be `None`.
+
+    Raises:
+      TypeError: If `var_list` contains anything else than `Variable` objects.
+      ValueError: If some arguments are invalid.
+      RuntimeError: If called with eager execution enabled and `loss` is
+        not callable.
+
+    @compatibility(eager)
+    When eager execution is enabled, `gate_gradients`, `aggregation_method`,
+    and `colocate_gradients_with_ops` are ignored.
+    @end_compatibility
+    """
+    # TODO(josh11b): Test that we handle weight decay in a reasonable way.
+    if callable(loss):
+      with backprop.GradientTape() as tape:
+        if var_list is not None:
+          tape.watch(var_list)
+        loss_value = loss()
+
+        # Scale loss for number of towers (callable-loss case). In this case,
+        # we have to be careful to call distribute_lib.get_loss_reduction()
+        # *after* loss() is evaluated, so we know what loss reduction it uses.
+        if scale_loss_by_num_towers is None:
+          scale_loss_by_num_towers = (
+              distribute_lib.get_loss_reduction() == "mean")
+        if scale_loss_by_num_towers:
+          num_towers = distribute_lib.get_distribution_strategy().num_towers
+          if num_towers > 1:
+            loss_value *= 1. / num_towers
+
+      if var_list is None:
+        var_list = tape.watched_variables()
+      grads = tape.gradient(loss_value, var_list, grad_loss)
+      return list(zip(grads, var_list))
+    if context.executing_eagerly():
+      raise RuntimeError(
+          "`loss` passed to Optimizer.compute_gradients should "
+          "be a function when eager execution is enabled.")
+
+    # Scale loss for number of towers (non-callable-loss case).
+    if scale_loss_by_num_towers is None:
+      scale_loss_by_num_towers = (
+          distribute_lib.get_loss_reduction() == "mean")
+    if scale_loss_by_num_towers:
+      num_towers = distribute_lib.get_distribution_strategy().num_towers
+      if num_towers > 1:
+        loss *= 1. / num_towers
+
+    if gate_gradients not in [optimizer_v1.Optimizer.GATE_NONE,
+                              optimizer_v1.Optimizer.GATE_OP,
+                              optimizer_v1.Optimizer.GATE_GRAPH]:
+      raise ValueError("gate_gradients must be one of: Optimizer.GATE_NONE, "
+                       "Optimizer.GATE_OP, Optimizer.GATE_GRAPH.  Not %s" %
+                       gate_gradients)
+    self._assert_valid_dtypes([loss])
+    if grad_loss is not None:
+      self._assert_valid_dtypes([grad_loss])
+    if var_list is None:
+      var_list = (
+          variables.trainable_variables() +
+          ops.get_collection(ops.GraphKeys.TRAINABLE_RESOURCE_VARIABLES))
+    else:
+      var_list = nest.flatten(var_list)
+    # pylint: disable=protected-access
+    var_list += ops.get_collection(ops.GraphKeys._STREAMING_MODEL_PORTS)
+    # pylint: enable=protected-access
+    processors = [_get_processor(v) for v in var_list]
+    if not var_list:
+      raise ValueError("No variables to optimize.")
+    var_refs = [p.target() for p in processors]
+    grads = gradients.gradients(
+        loss, var_refs, grad_ys=grad_loss,
+        gate_gradients=(gate_gradients == optimizer_v1.Optimizer.GATE_OP),
+        aggregation_method=aggregation_method,
+        colocate_gradients_with_ops=colocate_gradients_with_ops,
+        stop_gradients=stop_gradients)
+    if gate_gradients == optimizer_v1.Optimizer.GATE_GRAPH:
+      grads = control_flow_ops.tuple(grads)
+    grads_and_vars = list(zip(grads, var_list))
+    self._assert_valid_dtypes(
+        [v for g, v in grads_and_vars
+         if g is not None and v.dtype != dtypes.resource])
+    return grads_and_vars
+
+  def apply_gradients(self, grads_and_vars, global_step=None, name=None):
+    """Apply gradients to variables.
+
+    This is the second part of `minimize()`. It returns an `Operation` that
+    applies gradients.
+
+    Args:
+      grads_and_vars: List of (gradient, variable) pairs as returned by
+        `compute_gradients()`.
+      global_step: Optional `Variable` to increment by one after the
+        variables have been updated.
+      name: Optional name for the returned operation.  Default to the
+        name passed to the `Optimizer` constructor.
+
+    Returns:
+      An `Operation` that applies the specified gradients. If `global_step`
+      was not None, that operation also increments `global_step`.
+
+    Raises:
+      TypeError: If `grads_and_vars` is malformed.
+      ValueError: If none of the variables have gradients.
+    """
+    # This is a default implementation of apply_gradients() that can be shared
+    # by most optimizers.  It relies on the subclass implementing the following
+    # methods: _create_vars(), _prepare(), _apply_dense(), and _apply_sparse().
+
+    # Filter out variables with gradients of `None`.
+    grads_and_vars = tuple(grads_and_vars)  # Make sure repeat iteration works.
+    if not grads_and_vars:
+      raise ValueError("No variables provided.")
+    filtered = tuple((g, v) for (g, v) in grads_and_vars if g is not None)
+    if not filtered:
+      raise ValueError("No gradients provided for any variable: %s." %
+                       ([str(v) for _, v in grads_and_vars],))
+    return distribute_lib.get_tower_context().merge_call(
+        self._distributed_apply, filtered, global_step=global_step, name=name)
+
+  def _get_or_create_state(self, var_list=None):
+    """Either looks up or creates `_OptimizerV2State`.
+
+    If any variables are available, they should be passed via the `var_list`
+    argument, and these will be used to determine the graph to create/retrieve
+    state for. Otherwise the returned state is for the current default graph.
+
+    Args:
+      var_list: A list of variables to extract a graph from.
+
+    Returns:
+      An `_OptimizerV2State` object.
+    """
+    # Determine the graph_key from the current graph.
+    eager_execution = context.executing_eagerly()
+    if eager_execution or var_list is None:
+      graph = ops.get_default_graph()
+    else:
+      graph = ops._get_graph_from_inputs(var_list)  # pylint: disable=protected-access
+    assert graph is not None
+    graph_key = graph._graph_key  # pylint: disable=protected-access
+
+    # Get the per graph state by looking up the graph_key.
+    if graph_key in self._per_graph_state:
+      per_graph_state = self._per_graph_state[graph_key]
+    else:
+      per_graph_state = _OptimizerV2State(self._name)
+      per_graph_state._init_with_static_hyper(self._hyper)  # pylint: disable=protected-access
+      self._per_graph_state[graph_key] = per_graph_state
+    return per_graph_state
+
+  def _distributed_apply(self, distribution, grads_and_vars, global_step, name):
+    """`apply_gradients` for use with a `DistributionStrategy`."""
+    reduced_grads = distribution.batch_reduce("sum", grads_and_vars)
+    var_list = [v for _, v in grads_and_vars]
+    grads_and_vars = zip(reduced_grads, var_list)
+
+    unwrapped_var_list = [x for v in var_list for x in distribution.unwrap(v)]
+    eager_execution = context.executing_eagerly()
+    if eager_execution:
+      # Give a clear error in this case instead of "name not supported
+      # for Eager Tensors" when we compute non_slot_devices.
+      for v in unwrapped_var_list:
+        if isinstance(v, ops.Tensor):
+          raise NotImplementedError("Trying to update a Tensor ", v)
+
+    with ops.name_scope(name, self._name) as name:
+      per_graph_state = self._get_or_create_state(var_list=unwrapped_var_list)
+      # Include the current value of any dynamic hyper parameters in `state`.
+      non_slot_devices = distribution.non_slot_devices(var_list)
+      state = per_graph_state._copy_with_dynamic_hyper(  # pylint: disable=protected-access
+          self._hyper, distribution, non_slot_devices)
+
+    # Create any slot and non-slot variables we need in `state`.
+    with ops.init_scope():
+      self._create_vars(var_list, state)
+
+    with ops.name_scope(name):  # Re-enter name_scope created above
+      # Give the child class a chance to do something before we start
+      # applying gradients.
+      self._prepare(state)
+
+      def update(v, g):
+        """Update variable `v` using gradient `g`."""
+        assert v is not None
+
+        # Convert the grad to Tensor or IndexedSlices if necessary, and
+        # look up a processor for each variable's type.
+        try:
+          g = ops.convert_to_tensor_or_indexed_slices(g)
+        except TypeError:
+          raise TypeError(
+              "Gradient must be convertible to a Tensor"
+              " or IndexedSlices, or None: %s" % g)
+        if not isinstance(g, (ops.Tensor, ops.IndexedSlices)):
+          raise TypeError(
+              "Gradient must be a Tensor, IndexedSlices, or None: %s" % g)
+        processor = _get_processor(v)
+
+        # We colocate all ops created in _apply_dense or _apply_sparse
+        # on the same device as the variable.
+        # TODO(apassos): figure out how to get the variable name here.
+        scope_name = "" if eager_execution else v.op.name
+        # device_policy is set because non-mirrored tensors will be read in
+        # `update_op`.
+        # TODO(josh11b): Make different state objects for each device to
+        # avoid needing to set the device_policy.
+        with ops.name_scope("update_" + scope_name), \
+            context.context().device_policy(context.DEVICE_PLACEMENT_SILENT):
+          return processor.update_op(self, g, state)
+
+      # Use the processors to update the variables.
+      update_ops = []
+      for grad, var in grads_and_vars:
+        update_ops.extend(distribution.unwrap(distribution.update(
+            var, update, grad)))
+
+      # Give the child class a chance to do something after applying
+      # gradients
+      def finish():
+        # TODO(josh11b): Make different state objects for each device to
+        # avoid needing to set the device_policy.
+        with context.context().device_policy(context.DEVICE_PLACEMENT_SILENT):
+          return self._finish(state)
+
+      update_ops = control_flow_ops.group(update_ops)
+      with ops.control_dependencies([update_ops]):
+        finish_updates = distribution.update_non_slot(non_slot_devices, finish)
+      if finish_updates is None:
+        finish_updates = update_ops
+
+      # Update `global_step` (if any).
+      if global_step is None:
+        apply_updates = distribution.group(finish_updates, name=name)
+      else:
+        with ops.control_dependencies(distribution.unwrap(finish_updates)):
+
+          def update_global_step(global_step):
+            if isinstance(global_step, resource_variable_ops.ResourceVariable):
+              return global_step.assign_add(
+                  ops.convert_to_tensor(1, dtype=global_step.dtype),
+                  read_value=False)
+            else:
+              return state_ops.assign_add(global_step, 1)
+
+          apply_updates = distribution.group(
+              distribution.update(global_step, update_global_step), name=name)
+
+      # Add the training op to the TRAIN_OP graph collection in graph mode.
+      if not eager_execution:
+        if isinstance(apply_updates, ops.Tensor):
+          apply_updates = apply_updates.op
+        train_op = ops.get_collection_ref(ops.GraphKeys.TRAIN_OP)
+        if apply_updates not in train_op:
+          train_op.append(apply_updates)
+
+      return apply_updates
+
+  def get_slot(self, var, name):
+    """Return a slot named `name` created for `var` by the Optimizer.
+
+    Some `Optimizer` subclasses use additional variables.  For example
+    `Momentum` and `Adagrad` use variables to accumulate updates.  This method
+    gives access to these `Variable` objects if for some reason you need them.
+
+    Use `get_slot_names()` to get the list of slot names created by the
+    `Optimizer`.
+
+    Args:
+      var: A variable passed to `minimize()` or `apply_gradients()`.
+      name: A string.
+
+    Returns:
+      The `Variable` for the slot if it was created, `None` otherwise.
+    """
+    state = self._get_state_for_var(var)
+    return state.get_slot(var, name) if state is not None else None
+
+  def get_slot_names(self):
+    """Return a list of the names of slots created by the `Optimizer`.
+
+    See `get_slot()`.
+
+    Returns:
+      A list of strings.
+    """
+    state = self._get_per_graph_state()
+    return state.get_slot_names() if state is not None else []
+
+  def variables(self):
+    """A list of variables which encode the current state of `Optimizer`.
+
+    Includes slot variables and additional global variables created by the
+    optimizer in the current default graph.
+
+    Returns:
+      A list of variables.
+    """
+    state = self._get_per_graph_state()
+    return state._variables() if state is not None else []  # pylint: disable=protected-access
+
+  # --------------
+  # Methods to be implemented by subclasses if they want to use the
+  # inherited implementation of apply_gradients() or compute_gradients().
+  # --------------
+  def _create_vars(self, var_list, state):
+    """Create all slots needed by the variables and any non-slot variables.
+
+    Args:
+      var_list: A list of `Variable` objects.
+      state: An object with these methods:
+        `create_slot(var, val, slot_name, optional_op_name)`,
+        `create_slot_with_initializer(`
+            `var, initializer, shape, dtype, slot_name, optional_op_name)`,
+        `zeros_slot(var, slot_name, optional_op_name)`,
+        `create_non_slot_variable(initial_value, name, colocate_with)`,
+        `get_hyper(name)`
+    """
+    # No slots needed by default
+    pass
+
+  def _prepare(self, state):
+    """Code to execute before applying gradients.
+
+    Note that most uses of _prepare() in Optimizer have been subsumed
+    by explicit support for hyper parameters in OptimizerV2
+
+    Args:
+      state: An object with a `get_hyper(name)` method.
+
+    Returns:
+      Return value will be ignored.
+    """
+    pass
+
+  def _apply_dense(self, grad, var, state):
+    """Add ops to apply dense gradients to `var`.
+
+    Args:
+      grad: A `Tensor`.
+      var: A `Variable` object.
+      state: An object with `get_slot(var, name)`, `get_non_slot(self, name)`,
+        and `get_hyper(name)` methods.
+
+    Returns:
+      An `Operation`.
+    """
+    raise NotImplementedError()
+
+  def _resource_apply_dense(self, grad, handle, state):
+    """Add ops to apply dense gradients to the variable `handle`.
+
+    Args:
+      grad: a `Tensor` representing the gradient.
+      handle: a `Tensor` of dtype `resource` which points to the variable
+       to be updated.
+      state: An object with `get_slot(var, name)`, `get_non_slot(self, name)`,
+        and `get_hyper(name)` methods.
+
+    Returns:
+      An `Operation` which updates the value of the variable.
+    """
+    raise NotImplementedError()
+
+  def _resource_apply_sparse_duplicate_indices(
+      self, grad, handle, indices, state):
+    """Add ops to apply sparse gradients to `handle`, with repeated indices.
+
+    Optimizers which override this method must deal with repeated indices. See
+    the docstring of `_apply_sparse_duplicate_indices` for details. By default
+    the correct behavior, to sum non-unique indices and their associated
+    gradients, is enforced by first pre-processing `grad` and `indices` and
+    passing them on to `_resource_apply_sparse`. Optimizers which deal correctly
+    with duplicate indices may instead override this method to avoid the
+    overhead of summing.
+
+    Args:
+      grad: a `Tensor` representing the gradient for the affected indices.
+      handle: a `Tensor` of dtype `resource` which points to the variable
+       to be updated.
+      indices: a `Tensor` of integral type representing the indices for
+       which the gradient is nonzero. Indices may be repeated.
+      state: An object with `get_slot(var, name)`, `get_non_slot(self, name)`,
+        and `get_hyper(name)` methods.
+
+    Returns:
+      An `Operation` which updates the value of the variable.
+    """
+    # pylint: disable=protected-access
+    summed_grad, unique_indices = optimizer_v1._deduplicate_indexed_slices(
+        values=grad, indices=indices)
+    # pylint: enable=protected-access
+    return self._resource_apply_sparse(
+        summed_grad, handle, unique_indices, state)
+
+  def _resource_apply_sparse(self, grad, handle, indices, state):
+    """Add ops to apply sparse gradients to the variable `handle`.
+
+    Similar to `_apply_sparse`, the `indices` argument to this method has been
+    de-duplicated. Optimizers which deal correctly with non-unique indices may
+    instead override `_resource_apply_sparse_duplicate_indices` to avoid this
+    overhead.
+
+    Args:
+      grad: a `Tensor` representing the gradient for the affected indices.
+      handle: a `Tensor` of dtype `resource` which points to the variable
+       to be updated.
+      indices: a `Tensor` of integral type representing the indices for
+       which the gradient is nonzero. Indices are unique.
+      state: An object with `get_slot(var, name)`, `get_non_slot(self, name)`,
+        and `get_hyper(name)` methods.
+
+    Returns:
+      An `Operation` which updates the value of the variable.
+    """
+    raise NotImplementedError()
+
+  def _apply_sparse_duplicate_indices(self, grad, var, state):
+    """Add ops to apply sparse gradients to `var`, with repeated sparse indices.
+
+    Optimizers which override this method must deal with IndexedSlices objects
+    such as the following:
+
+      IndexedSlicesValue(values=[1, 1], indices=[0, 0], dense_shape=[1])
+
+    The correct interpretation is:
+
+      IndexedSlicesValue(values=[2], indices=[0], dense_shape=[1])
+
+    Many optimizers deal incorrectly with repeated indices when updating based
+    on sparse gradients (e.g. summing squares rather than squaring the sum, or
+    applying momentum terms multiple times). Adding first is always the correct
+    behavior, so this is enforced here by reconstructing the IndexedSlices to
+    have only unique indices, then calling _apply_sparse.
+
+    Optimizers which deal correctly with repeated indices may instead override
+    this method to avoid the overhead of summing indices.
+
+    Args:
+      grad: `IndexedSlices`.
+      var: A `Variable` object.
+      state: An object with `get_slot(var, name)`, `get_non_slot(self, name)`,
+        and `get_hyper(name)` methods.
+
+    Returns:
+      An `Operation`.
+    """
+    # pylint: disable=protected-access
+    summed_values, unique_indices = optimizer_v1._deduplicate_indexed_slices(
+        values=grad.values, indices=grad.indices)
+    # pylint: enable=protected-access
+    gradient_no_duplicate_indices = ops.IndexedSlices(
+        indices=unique_indices,
+        values=summed_values,
+        dense_shape=grad.dense_shape)
+    return self._apply_sparse(gradient_no_duplicate_indices, var, state)
+
+  def _apply_sparse(self, grad, var, state):
+    """Add ops to apply sparse gradients to `var`.
+
+    The IndexedSlices object passed to `grad` in this function is by default
+    pre-processed in `_apply_sparse_duplicate_indices` to remove duplicate
+    indices (see its docstring for details). Optimizers which can tolerate or
+    have correct special cases for duplicate sparse indices may override
+    `_apply_sparse_duplicate_indices` instead of this function, avoiding that
+    overhead.
+
+    Args:
+      grad: `IndexedSlices`, with no repeated indices.
+      var: A `Variable` object.
+      state: An object with `get_slot(var, name)`, `get_non_slot(self, name)`,
+        and `get_hyper(name)` methods.
+
+    Returns:
+      An `Operation`.
+    """
+    raise NotImplementedError()
+
+  def _finish(self, state):
+    """Do what is needed to finish the update.
+
+    This is called inside a scope colocated with any non-slot variables.
+
+    Args:
+      state: An object with `get_slot(var, name)`, `get_non_slot(self, name)`,
+        and `get_hyper(name)` methods.
+
+    Returns:
+      The operation to apply updates, or None if no updates.
+    """
+    return None
+
+  # --------------
+  # Utility methods for subclasses.
+  # --------------
+  def _get_per_graph_state(self):
+    # pylint: disable=protected-access
+    return self._per_graph_state.get(ops.get_default_graph()._graph_key, None)
+
+  def _get_state_for_var(self, var):
+    # pylint: disable=protected-access
+    return self._per_graph_state.get(var._graph_key, None)
+
+  # --------------
+  # Overridden methods from Checkpointable.
+  # --------------
+
+  def _track_checkpointable(self, *args, **kwargs):
+    """Optimizers may not track dependencies. Raises an error."""
+    raise NotImplementedError(
+        "Optimizers may not have dependencies. File a feature request if this "
+        "limitation bothers you.")
+
+  @property
+  def _checkpoint_dependencies(self):
+    """From Checkpointable. Gather graph-specific non-slot variables to save."""
+    current_graph_non_slot_variables = []
+    state = self._get_per_graph_state()
+    if state is not None:
+      for name, variable_object in sorted(
+          state._non_slot_dict.items(),  # pylint: disable=protected-access
+          # Avoid comparing variables
+          key=lambda item: item[0]):
+        current_graph_non_slot_variables.append(
+            checkpointable.CheckpointableReference(
+                name=name, ref=variable_object))
+    # Note: ignores super(); Optimizers may not have any dependencies outside of
+    # state objects.
+    return current_graph_non_slot_variables
+
+  def _lookup_dependency(self, name):
+    """From Checkpointable. Find a non-slot variable in the current graph."""
+    state = self._get_per_graph_state()
+    if state is None:
+      return None
+    else:
+      return state.get_non_slot(name)
+
+  @property
+  def _deferred_dependencies(self):
+    """Lets Checkpointable know where non-slot variables are created.
+
+    If necessary, creates a new state object for the current default graph.
+    Checkpointable will then add entries to that state's deferred dependency
+    dictionary. The state object will check that dictionary when creating
+    non-slot variables, restoring their value if an entry is found.
+
+    Returns:
+      A dictionary which holds deferred dependencies for the current default
+      graph.
+    """
+    state = self._get_or_create_state()
+    return state._deferred_dependencies  # pylint: disable=protected-access
+
+  def _create_or_restore_slot_variable(
+      self, slot_variable_position, slot_name, variable):
+    """Checkpointable: Restore a slot variable's value, possibly creating it.
+
+    Called when a variable which has an associated slot variable is created or
+    restored.
+
+    Args:
+      slot_variable_position: A `checkpointable._CheckpointPosition` object
+        indicating the slot variable `Checkpointable` object to be restored.
+      slot_name: The name of this `Optimizer`'s slot to restore into.
+      variable: The variable object this slot is being created for.
+    """
+    state = self._get_or_create_state(var_list=[variable])
+    state._create_or_restore_slot_variable(  # pylint: disable=protected-access
+        slot_variable_position=slot_variable_position,
+        slot_name=slot_name,
+        variable=variable,
+        optional_op_name=self._name)
+
+  # --------------
+  # Unsupported parent methods
+  # --------------
+  def _slot_dict(self, slot_name):
+    raise NotImplementedError(
+        "_slot_dict() method unsupported in OptimizerV2")
+
+  def _get_or_make_slot(self, var, val, slot_name, op_name):
+    raise NotImplementedError(
+        "_get_or_make_slot() method unsupported in OptimizerV2")
+
+  def _get_or_make_slot_with_initializer(self, var, initializer, shape, dtype,
+                                         slot_name, op_name):
+    raise NotImplementedError(
+        "_get_or_make_slot_with_initializer() method unsupported in "
+        "OptimizerV2")
+
+  def _create_non_slot_variable(self, initial_value, name, colocate_with):
+    raise NotImplementedError(
+        "_create_non_slot_variable() method unsupported in OptimizerV2")
+
+  def _get_non_slot_variable(self, name, graph=None):
+    raise NotImplementedError(
+        "_get_non_slot_variable() method unsupported in OptimizerV2")
+
+  def _non_slot_variables(self):
+    raise NotImplementedError(
+        "_non_slot_variables() method unsupported in OptimizerV2")
diff --git a/tensorflow/contrib/optimizer_v2/optimizer_v2_symbols.py b/tensorflow/contrib/optimizer_v2/optimizer_v2_symbols.py
new file mode 100644
index 0000000000000000000000000000000000000000..24eada06ccdd68090f44c62646040fcd7d659727
--- /dev/null
+++ b/tensorflow/contrib/optimizer_v2/optimizer_v2_symbols.py
@@ -0,0 +1,42 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Distribution-aware version of Optimizer."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# pylint: disable=unused-import
+from tensorflow.contrib.optimizer_v2.adadelta import AdadeltaOptimizer
+from tensorflow.contrib.optimizer_v2.adagrad import AdagradOptimizer
+from tensorflow.contrib.optimizer_v2.adam import AdamOptimizer
+from tensorflow.contrib.optimizer_v2.gradient_descent import GradientDescentOptimizer
+from tensorflow.contrib.optimizer_v2.momentum import MomentumOptimizer
+from tensorflow.contrib.optimizer_v2.optimizer_v2 import OptimizerV2
+from tensorflow.contrib.optimizer_v2.rmsprop import RMSPropOptimizer
+
+from tensorflow.python.util.all_util import remove_undocumented
+
+_allowed_symbols = [
+    'AdadeltaOptimizer',
+    'AdagradOptimizer',
+    'AdamOptimizer',
+    'GradientDescentOptimizer',
+    'MomentumOptimizer',
+    'OptimizerV2',
+    'RMSPropOptimizer',
+]
+
+remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/optimizer_v2/optimizer_v2_test.py b/tensorflow/contrib/optimizer_v2/optimizer_v2_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..8599af32f6f4cc5529cd812e83c02ef3812cb71e
--- /dev/null
+++ b/tensorflow/contrib/optimizer_v2/optimizer_v2_test.py
@@ -0,0 +1,294 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Functional test for OptimizerV2."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.optimizer_v2 import gradient_descent
+from tensorflow.contrib.optimizer_v2 import optimizer_v2
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import clip_ops
+from tensorflow.python.ops import gradients_impl
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+
+class OptimizerTest(test.TestCase):
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testBasic(self):
+    for i, dtype in enumerate([dtypes.half, dtypes.float32, dtypes.float64]):
+      # Note that we name the variables uniquely here since the variables don't
+      # seem to be getting deleted at the end of the loop.
+      var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype,
+                                                    name='a_%d' % i)
+      var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype,
+                                                    name='b_%d' % i)
+      def loss():
+        return 5 * var0 + 3 * var1  # pylint: disable=cell-var-from-loop
+      # Note that for eager execution, minimize expects a function instead of a
+      # Tensor.
+      global_step = resource_variable_ops.ResourceVariable(
+          array_ops.zeros([], dtypes.int64), name='global_step_%d' % i)
+      sgd_op = gradient_descent.GradientDescentOptimizer(3.0)
+
+      self.evaluate(variables.global_variables_initializer())
+      # Fetch params to validate initial values
+      self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+      self.assertAllClose([3.0, 4.0], self.evaluate(var1))
+      # Run 1 step of sgd through optimizer
+      opt_op = sgd_op.minimize(loss, global_step, [var0, var1])
+      self.evaluate(opt_op)
+      # Validate updated params
+      self.assertAllClose([-14., -13.], self.evaluate(var0))
+      self.assertAllClose([-6., -5.], self.evaluate(var1))
+
+  def testAggregationMethod(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        var0 = variables.Variable([1.0, 2.0], dtype=dtype)
+        var1 = variables.Variable([3.0, 4.0], dtype=dtype)
+        cost = 5 * var0 + 3 * var1
+        global_step = variables.Variable(
+            array_ops.zeros([], dtypes.int64), name='global_step')
+        sgd_op = gradient_descent.GradientDescentOptimizer(3.0)
+        opt_op = sgd_op.minimize(
+            cost,
+            global_step, [var0, var1],
+            aggregation_method=gradients_impl.AggregationMethod.
+            EXPERIMENTAL_ACCUMULATE_N)
+
+        variables.global_variables_initializer().run()
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], var0.eval())
+        self.assertAllClose([3.0, 4.0], var1.eval())
+        # Run 1 step of sgd through optimizer
+        opt_op.run()
+        # Validate updated params
+        self.assertAllClose([-14., -13.], var0.eval())
+        self.assertAllClose([-6., -5.], var1.eval())
+
+  def testPrecomputedGradient(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        var0 = variables.Variable([1.0, 2.0], dtype=dtype)
+        var1 = variables.Variable([3.0, 4.0], dtype=dtype)
+        cost = 5 * var0 + 3 * var1
+        grad_loss = constant_op.constant([42, -42], dtype=dtype)
+        global_step = variables.Variable(
+            array_ops.zeros([], dtypes.int64), name='global_step')
+        sgd_op = gradient_descent.GradientDescentOptimizer(3.0)
+        opt_op = sgd_op.minimize(
+            cost, global_step, [var0, var1], grad_loss=grad_loss)
+
+        variables.global_variables_initializer().run()
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], var0.eval())
+        self.assertAllClose([3.0, 4.0], var1.eval())
+        # Run 1 step of sgd through optimizer
+        opt_op.run()
+        # Validate updated params
+        self.assertAllClose([1.0 - 3 * 5 * 42.0, 2.0 - 3 * 5 * (-42.0)],
+                            var0.eval())
+        self.assertAllClose([3.0 - 3 * 3 * 42.0, 4.0 - 3 * 3 * (-42.0)],
+                            var1.eval())
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testNoVariables(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      # pylint: disable=cell-var-from-loop
+      def loss():
+        var0 = resource_variable_ops.ResourceVariable(
+            [1.0, 2.0], dtype=dtype, trainable=False, name='a')
+        var1 = resource_variable_ops.ResourceVariable(
+            [3.0, 4.0], dtype=dtype, trainable=False, name='b')
+        return 5 * var0 + var1
+      # pylint: enable=cell-var-from-loop
+      sgd_op = gradient_descent.GradientDescentOptimizer(3.0)
+      with self.assertRaisesRegexp(ValueError, 'No.*variables'):
+        sgd_op.minimize(loss)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testNoGradients(self):
+    for i, dtype in enumerate([dtypes.half, dtypes.float32, dtypes.float64]):
+      # Note that we name the variables uniquely here since the variables don't
+      # seem to be getting deleted at the end of the loop.
+      var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype,
+                                                    name='a%d' % i)
+      var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype,
+                                                    name='b%d' % i)
+      # pylint: disable=cell-var-from-loop
+      def loss():
+        return 5 * var0
+      # pylint: enable=cell-var-from-loop
+      sgd_op = gradient_descent.GradientDescentOptimizer(3.0)
+      with self.assertRaisesRegexp(ValueError, 'No gradients'):
+        # var1 has no gradient
+        sgd_op.minimize(loss, var_list=[var1])
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testNoGradientsForAnyVariables_Minimize(self):
+    for i, dtype in enumerate([dtypes.half, dtypes.float32, dtypes.float64]):
+      # Note that we name the variables uniquely here since the variables don't
+      # seem to be getting deleted at the end of the loop.
+      var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype,
+                                                    name='a_%d' % i)
+      var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype,
+                                                    name='b_%d' % i)
+      def loss():
+        return constant_op.constant(5.0)
+      sgd_op = gradient_descent.GradientDescentOptimizer(3.0)
+      with self.assertRaisesRegexp(ValueError,
+                                   'No gradients provided for any variable'):
+        sgd_op.minimize(loss, var_list=[var0, var1])
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testNoGradientsForAnyVariables_ApplyGradients(self):
+    for i, dtype in enumerate([dtypes.half, dtypes.float32, dtypes.float64]):
+      # Note that we name the variables uniquely here since the variables don't
+      # seem to be getting deleted at the end of the loop.
+      var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype,
+                                                    name='a_%d' % i)
+      var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype,
+                                                    name='b_%d' % i)
+      sgd_op = gradient_descent.GradientDescentOptimizer(3.0)
+      with self.assertRaisesRegexp(ValueError,
+                                   'No gradients provided for any variable'):
+        sgd_op.apply_gradients([(None, var0), (None, var1)])
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testGradientsAsVariables(self):
+    for i, dtype in enumerate([dtypes.half, dtypes.float32, dtypes.float64]):
+      # Note that we name the variables uniquely here since the variables don't
+      # seem to be getting deleted at the end of the loop.
+      var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype,
+                                                    name='a%d' % i)
+      var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype,
+                                                    name='b%d' % i)
+      def loss():
+        return 5 * var0 + 3 * var1  # pylint: disable=cell-var-from-loop
+      sgd_op = gradient_descent.GradientDescentOptimizer(3.0)
+      grads_and_vars = sgd_op.compute_gradients(loss, [var0, var1])
+      # Convert gradients to tf.Variables
+      converted_grads = [
+          resource_variable_ops.ResourceVariable(array_ops.zeros([2], dtype),
+                                                 name='c_%d_%d' % (i, j))
+          for j, gv in enumerate(grads_and_vars)
+      ]
+      convert_ops = [
+          state_ops.assign(converted_grads[j], gv[0])
+          for j, gv in enumerate(grads_and_vars)
+      ]
+
+      self.evaluate(variables.global_variables_initializer())
+      # Run convert_ops to achieve the gradietns converting
+      self.evaluate(convert_ops)
+      # Fetch params to validate initial values
+      self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+      self.assertAllClose([3.0, 4.0], self.evaluate(var1))
+
+      # Run 1 step of sgd through optimizer
+      converted_grads_and_vars = list(zip(converted_grads, [var0, var1]))
+      opt_op = sgd_op.apply_gradients(converted_grads_and_vars)
+      self.evaluate(opt_op)
+
+      # Validate updated params
+      self.assertAllClose([-14., -13.], self.evaluate(var0))
+      self.assertAllClose([-6., -5.], self.evaluate(var1))
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testComputeGradientsWithTensors(self):
+    x = ops.convert_to_tensor(1.0)
+    def f():
+      return x * x
+    sgd_op = gradient_descent.GradientDescentOptimizer(3.0)
+    grads_and_vars = sgd_op.compute_gradients(f, [x])
+    self.assertEqual(1, len(grads_and_vars))
+    grad, x_as_var = grads_and_vars[0]
+    self.assertIs(x, x_as_var)
+    self.assertEqual(2.0, self.evaluate(grad))
+
+    with self.assertRaises(NotImplementedError):
+      sgd_op.apply_gradients(grads_and_vars)
+
+  def testTrainOp(self):
+    with self.test_session():
+      var0 = variables.Variable([1.0, 2.0])
+      var1 = variables.Variable([3.0, 4.0])
+      cost = 5 * var0 + 3 * var1
+      global_step = variables.Variable(
+          array_ops.zeros([], dtypes.int64), name='global_step')
+      sgd_op = gradient_descent.GradientDescentOptimizer(3.0)
+      opt_op = sgd_op.minimize(cost, global_step, [var0, var1])
+      self.assertTrue(opt_op in ops.get_collection(ops.GraphKeys.TRAIN_OP))
+
+  def testConstraint(self):
+    constraint_01 = lambda x: clip_ops.clip_by_value(x, -0.1, 0.)
+    constraint_0 = lambda x: clip_ops.clip_by_value(x, 0., 1.)
+    with self.test_session():
+      var0 = variables.Variable([1.0, 2.0],
+                                constraint=constraint_01)
+      var1 = variables.Variable([3.0, 4.0],
+                                constraint=constraint_0)
+      cost = 5 * var0 + 3 * var1
+      global_step = variables.Variable(
+          array_ops.zeros([], dtypes.int64), name='global_step')
+      sgd_op = gradient_descent.GradientDescentOptimizer(3.0)
+      opt_op = sgd_op.minimize(cost, global_step, [var0, var1])
+
+      variables.global_variables_initializer().run()
+      # Fetch params to validate initial values
+      self.assertAllClose([1.0, 2.0], var0.eval())
+      self.assertAllClose([3.0, 4.0], var1.eval())
+      # Run 1 step of sgd through optimizer
+      opt_op.run()
+      # Validate updated params
+      self.assertAllClose([-0.1, -0.1], var0.eval())
+      self.assertAllClose([0., 0.], var1.eval())
+
+  def testStopGradients(self):
+    with self.test_session():
+      var0 = variables.Variable([1.0, 2.0], name='var0')
+      var1 = variables.Variable([3.0, 4.0], name='var1')
+      var0_id = array_ops.identity(var0)
+      cost = 5 * var0_id + 3 * var1
+      sgd_op = gradient_descent.GradientDescentOptimizer(3.0)
+      grads_and_vars = sgd_op.compute_gradients(cost, [var0, var1],
+                                                stop_gradients=[var0_id])
+      grad_dict = {var.op.name: grad for grad, var in grads_and_vars}
+      self.assertIsNone(grad_dict['var0'])
+      self.assertIsNotNone(grad_dict['var1'])
+
+  def testDoNotOverrideCreateSlots(self):
+    class ShouldNotOverrideCreateSlots(optimizer_v2.OptimizerV2):
+
+      def _create_slots(self, var_list):
+        """In OptimizerV2 _create_slots was renamed _create_vars."""
+        return var_list
+
+    with self.assertRaises(RuntimeError):
+      ShouldNotOverrideCreateSlots(True, 'name')
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/optimizer_v2/rmsprop.py b/tensorflow/contrib/optimizer_v2/rmsprop.py
new file mode 100644
index 0000000000000000000000000000000000000000..164ff0ea0670bd07d19fa642e2e3cde1ab84612a
--- /dev/null
+++ b/tensorflow/contrib/optimizer_v2/rmsprop.py
@@ -0,0 +1,233 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""RMSprop optimizer for Tensorflow.
+
+rmsprop algorithm [tieleman2012rmsprop]
+
+A detailed description of rmsprop.
+
+- maintain a moving (discounted) average of the square of gradients
+- divide gradient by the root of this average
+
+mean_square = decay * mean_square{t-1} + (1-decay) * gradient ** 2
+mom = momentum * mom{t-1} + learning_rate * g_t / sqrt(mean_square + epsilon)
+delta = - mom
+
+This implementation of RMSProp uses plain momentum, not Nesterov momentum.
+
+The centered version additionally maintains a moving (discounted) average of the
+gradients, and uses that average to estimate the variance:
+
+mean_grad = decay * mean_square{t-1} + (1-decay) * gradient
+mean_square = decay * mean_square{t-1} + (1-decay) * gradient ** 2
+mom = momentum * mom{t-1} + learning_rate * g_t /
+    sqrt(mean_square - mean_grad**2 + epsilon)
+delta = - mom
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.optimizer_v2 import optimizer_v2
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import init_ops
+
+from tensorflow.python.training import training_ops
+
+
+class RMSPropOptimizer(optimizer_v2.OptimizerV2):
+  """Optimizer that implements the RMSProp algorithm.
+
+  See the
+  [paper](http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf).
+  """
+
+  def __init__(self,
+               learning_rate,
+               decay=0.9,
+               momentum=0.0,
+               epsilon=1e-10,
+               use_locking=False,
+               centered=False,
+               name="RMSProp"):
+    """Construct a new RMSProp optimizer.
+
+    Note that in the dense implementation of this algorithm, variables and their
+    corresponding accumulators (momentum, gradient moving average, square
+    gradient moving average) will be updated even if the gradient is zero
+    (i.e. accumulators will decay, momentum will be applied). The sparse
+    implementation (used when the gradient is an `IndexedSlices` object,
+    typically because of `tf.gather` or an embedding lookup in the forward pass)
+    will not update variable slices or their accumulators unless those slices
+    were used in the forward pass (nor is there an "eventual" correction to
+    account for these omitted updates). This leads to more efficient updates for
+    large embedding lookup tables (where most of the slices are not accessed in
+    a particular graph execution), but differs from the published algorithm.
+
+    Some of the args below are hyperparameters, where a hyperparameter is
+    defined as a scalar Tensor, a regular Python value or a callable (which
+    will be evaluated when `apply_gradients` is called) returning a scalar
+    Tensor or a Python value.
+
+    Args:
+      learning_rate: A float hyperparameter. The learning rate.
+      decay: A float hyperparameter. Discounting factor for the history/coming
+        gradient.
+      momentum: A float hyperparameter.
+      epsilon: A float hyperparameter. Small value to avoid zero denominator.
+      use_locking: If True use locks for update operation.
+      centered: If True, gradients are normalized by the estimated variance of
+        the gradient; if False, by the uncentered second moment. Setting this to
+        True may help with training, but is slightly more expensive in terms of
+        computation and memory. Defaults to False.
+      name: Optional name prefix for the operations created when applying
+        gradients. Defaults to "RMSProp".
+    """
+    super(RMSPropOptimizer, self).__init__(use_locking, name)
+    self._set_hyper("learning_rate", learning_rate)
+    self._set_hyper("decay", decay)
+    self._set_hyper("momentum", momentum)
+    self._set_hyper("epsilon", epsilon)
+
+    self._centered = centered
+
+  def _create_vars(self, var_list, state):
+    for v in var_list:
+      if v.get_shape().is_fully_defined():
+        init_rms = init_ops.ones_initializer(dtype=v.dtype.base_dtype)
+      else:
+        init_rms = array_ops.ones_like(v)
+      state.create_slot_with_initializer(v, init_rms, v.get_shape(),
+                                         v.dtype.base_dtype, "rms")
+      if self._centered:
+        state.zeros_slot(v, "mg")
+      state.zeros_slot(v, "momentum")
+
+  def _apply_dense(self, grad, var, state):
+    rms = state.get_slot(var, "rms")
+    mom = state.get_slot(var, "momentum")
+    if self._centered:
+      mg = state.get_slot(var, "mg")
+      return training_ops.apply_centered_rms_prop(
+          var,
+          mg,
+          rms,
+          mom,
+          state.get_hyper("learning_rate", var.dtype.base_dtype),
+          state.get_hyper("decay", var.dtype.base_dtype),
+          state.get_hyper("momentum", var.dtype.base_dtype),
+          state.get_hyper("epsilon", var.dtype.base_dtype),
+          grad,
+          use_locking=self._use_locking).op
+    else:
+      return training_ops.apply_rms_prop(
+          var,
+          rms,
+          mom,
+          state.get_hyper("learning_rate", var.dtype.base_dtype),
+          state.get_hyper("decay", var.dtype.base_dtype),
+          state.get_hyper("momentum", var.dtype.base_dtype),
+          state.get_hyper("epsilon", var.dtype.base_dtype),
+          grad,
+          use_locking=self._use_locking).op
+
+  def _resource_apply_dense(self, grad, var, state):
+    rms = state.get_slot(var, "rms")
+    mom = state.get_slot(var, "momentum")
+    if self._centered:
+      mg = state.get_slot(var, "mg")
+      return training_ops.resource_apply_centered_rms_prop(
+          var.handle,
+          mg.handle,
+          rms.handle,
+          mom.handle,
+          state.get_hyper("learning_rate", var.dtype.base_dtype),
+          state.get_hyper("decay", var.dtype.base_dtype),
+          state.get_hyper("momentum", var.dtype.base_dtype),
+          state.get_hyper("epsilon", var.dtype.base_dtype),
+          grad,
+          use_locking=self._use_locking)
+    else:
+      return training_ops.resource_apply_rms_prop(
+          var.handle,
+          rms.handle,
+          mom.handle,
+          state.get_hyper("learning_rate", var.dtype.base_dtype),
+          state.get_hyper("decay", var.dtype.base_dtype),
+          state.get_hyper("momentum", var.dtype.base_dtype),
+          state.get_hyper("epsilon", var.dtype.base_dtype),
+          grad,
+          use_locking=self._use_locking)
+
+  def _apply_sparse(self, grad, var, state):
+    rms = state.get_slot(var, "rms")
+    mom = state.get_slot(var, "momentum")
+    if self._centered:
+      mg = state.get_slot(var, "mg")
+      return training_ops.sparse_apply_centered_rms_prop(
+          var,
+          mg,
+          rms,
+          mom,
+          state.get_hyper("learning_rate", var.dtype.base_dtype),
+          state.get_hyper("decay", var.dtype.base_dtype),
+          state.get_hyper("momentum", var.dtype.base_dtype),
+          state.get_hyper("epsilon", var.dtype.base_dtype),
+          grad.values,
+          grad.indices,
+          use_locking=self._use_locking)
+    else:
+      return training_ops.sparse_apply_rms_prop(
+          var,
+          rms,
+          mom,
+          state.get_hyper("learning_rate", var.dtype.base_dtype),
+          state.get_hyper("decay", var.dtype.base_dtype),
+          state.get_hyper("momentum", var.dtype.base_dtype),
+          state.get_hyper("epsilon", var.dtype.base_dtype),
+          grad.values,
+          grad.indices,
+          use_locking=self._use_locking)
+
+  def _resource_apply_sparse(self, grad, var, indices, state):
+    rms = state.get_slot(var, "rms")
+    mom = state.get_slot(var, "momentum")
+    if self._centered:
+      mg = self.get_slot(var, "mg")
+      return training_ops.resource_sparse_apply_centered_rms_prop(
+          var.handle,
+          mg.handle,
+          rms.handle,
+          mom.handle,
+          state.get_hyper("learning_rate", var.dtype.base_dtype),
+          state.get_hyper("decay", var.dtype.base_dtype),
+          state.get_hyper("momentum", var.dtype.base_dtype),
+          state.get_hyper("epsilon", var.dtype.base_dtype),
+          grad,
+          indices,
+          use_locking=self._use_locking)
+    else:
+      return training_ops.resource_sparse_apply_rms_prop(
+          var.handle,
+          rms.handle,
+          mom.handle,
+          state.get_hyper("learning_rate", var.dtype.base_dtype),
+          state.get_hyper("decay", var.dtype.base_dtype),
+          state.get_hyper("momentum", var.dtype.base_dtype),
+          state.get_hyper("epsilon", var.dtype.base_dtype),
+          grad,
+          indices,
+          use_locking=self._use_locking)
diff --git a/tensorflow/contrib/optimizer_v2/rmsprop_test.py b/tensorflow/contrib/optimizer_v2/rmsprop_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..ed68f6afbf8bf9678649c1ce6fc59c3b91026dc0
--- /dev/null
+++ b/tensorflow/contrib/optimizer_v2/rmsprop_test.py
@@ -0,0 +1,449 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for rmsprop optimizer."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import copy
+import itertools
+import math
+
+import numpy as np
+
+from tensorflow.contrib.optimizer_v2 import rmsprop
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import embedding_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+_DATA_TYPES = [dtypes.half, dtypes.float32]
+
+_TEST_PARAM_VALUES = [
+    # learning_rate, decay, momentum, epsilon, centered, use_resource
+    [0.5, 0.9, 0.0, 1e-3, True, False],
+    [0.5, 0.9, 0.0, 1e-3, False, False],
+    [0.5, 0.9, 0.0, 1e-3, True, True],
+    [0.5, 0.9, 0.0, 1e-3, False, True],
+    [0.1, 0.9, 0.0, 1e-3, True, False],
+    [0.5, 0.95, 0.0, 1e-3, False, False],
+    [0.5, 0.95, 0.0, 1e-5, True, False],
+    [0.5, 0.95, 0.9, 1e-5, True, False],
+]
+
+_TESTPARAMS = [
+    [data_type] + values
+    for data_type, values in itertools.product(_DATA_TYPES, _TEST_PARAM_VALUES)
+]
+
+
+class RMSPropOptimizerTest(test.TestCase):
+
+  def _rmsprop_update_numpy(self, var, g, mg, rms, mom, lr, decay, momentum,
+                            epsilon, centered):
+    rms_t = rms * decay + (1 - decay) * g * g
+    denom_t = rms_t + epsilon
+    if centered:
+      mg_t = mg * decay + (1 - decay) * g
+      denom_t -= mg_t * mg_t
+    else:
+      mg_t = mg
+    mom_t = momentum * mom + lr * g / np.sqrt(denom_t, dtype=denom_t.dtype)
+    var_t = var - mom_t
+    return var_t, mg_t, rms_t, mom_t
+
+  def _sparse_rmsprop_update_numpy(self, var, gindexs, gvalues, mg, rms, mom,
+                                   lr, decay, momentum, epsilon, centered):
+    mg_t = copy.deepcopy(mg)
+    rms_t = copy.deepcopy(rms)
+    mom_t = copy.deepcopy(mom)
+    var_t = copy.deepcopy(var)
+    for i in range(len(gindexs)):
+      gindex = gindexs[i]
+      gvalue = gvalues[i]
+      rms_t[gindex] = rms[gindex] * decay + (1 - decay) * gvalue * gvalue
+      denom_t = rms_t[gindex] + epsilon
+      if centered:
+        mg_t[gindex] = mg_t[gindex] * decay + (1 - decay) * gvalue
+        denom_t -= mg_t[gindex] * mg_t[gindex]
+      mom_t[gindex] = momentum * mom[gindex] + lr * gvalue / np.sqrt(denom_t)
+      var_t[gindex] = var[gindex] - mom_t[gindex]
+    return var_t, mg_t, rms_t, mom_t
+
+  def testDense(self):
+    # TODO(yori): Use ParameterizedTest when available
+    for (dtype, learning_rate, decay, momentum,
+         epsilon, centered, use_resource) in _TESTPARAMS:
+      with self.test_session(use_gpu=True):
+        # Initialize variables for numpy implementation.
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.2], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.2], dtype=dtype.as_numpy_dtype)
+
+        if use_resource:
+          var0 = resource_variable_ops.ResourceVariable(var0_np)
+          var1 = resource_variable_ops.ResourceVariable(var1_np)
+        else:
+          var0 = variables.Variable(var0_np)
+          var1 = variables.Variable(var1_np)
+        grads0 = constant_op.constant(grads0_np)
+        grads1 = constant_op.constant(grads1_np)
+        opt = rmsprop.RMSPropOptimizer(
+            learning_rate=learning_rate,
+            decay=decay,
+            momentum=momentum,
+            epsilon=epsilon,
+            centered=centered)
+
+        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+
+        mg0 = opt.get_slot(var0, "mg")
+        self.assertEqual(mg0 is not None, centered)
+        mg1 = opt.get_slot(var1, "mg")
+        self.assertEqual(mg1 is not None, centered)
+        rms0 = opt.get_slot(var0, "rms")
+        self.assertTrue(rms0 is not None)
+        rms1 = opt.get_slot(var1, "rms")
+        self.assertTrue(rms1 is not None)
+        mom0 = opt.get_slot(var0, "momentum")
+        self.assertTrue(mom0 is not None)
+        mom1 = opt.get_slot(var1, "momentum")
+        self.assertTrue(mom1 is not None)
+
+        mg0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+        mg1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+        rms0_np = np.array([1.0, 1.0], dtype=dtype.as_numpy_dtype)
+        rms1_np = np.array([1.0, 1.0], dtype=dtype.as_numpy_dtype)
+        mom0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+        mom1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], var0.eval())
+        self.assertAllClose([3.0, 4.0], var1.eval())
+
+        # Run 4 steps of RMSProp
+        for _ in range(1, 5):
+          update.run()
+
+          var0_np, mg0_np, rms0_np, mom0_np = self._rmsprop_update_numpy(
+              var0_np, grads0_np, mg0_np, rms0_np, mom0_np, learning_rate,
+              decay, momentum, epsilon, centered)
+          var1_np, mg1_np, rms1_np, mom1_np = self._rmsprop_update_numpy(
+              var1_np, grads1_np, mg1_np, rms1_np, mom1_np, learning_rate,
+              decay, momentum, epsilon, centered)
+
+          # Validate updated params
+          if centered:
+            self.assertAllCloseAccordingToType(mg0_np, mg0.eval())
+            self.assertAllCloseAccordingToType(mg1_np, mg1.eval())
+          self.assertAllCloseAccordingToType(rms0_np, rms0.eval())
+          self.assertAllCloseAccordingToType(rms1_np, rms1.eval())
+          self.assertAllCloseAccordingToType(mom0_np, mom0.eval())
+          self.assertAllCloseAccordingToType(mom1_np, mom1.eval())
+          self.assertAllCloseAccordingToType(var0_np, var0.eval())
+          self.assertAllCloseAccordingToType(var1_np, var1.eval())
+
+  def testMinimizeSparseResourceVariable(self):
+    for dtype in [dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        var0 = resource_variable_ops.ResourceVariable([[1.0, 2.0]], dtype=dtype)
+        x = constant_op.constant([[4.0], [5.0]], dtype=dtype)
+        pred = math_ops.matmul(embedding_ops.embedding_lookup([var0], [0]), x)
+        loss = pred * pred
+        sgd_op = rmsprop.RMSPropOptimizer(
+            learning_rate=1.0,
+            decay=0.0,
+            momentum=0.0,
+            epsilon=0.0,
+            centered=False).minimize(loss)
+        variables.global_variables_initializer().run()
+        # Fetch params to validate initial values
+        self.assertAllCloseAccordingToType([[1.0, 2.0]], var0.eval())
+        # Run 1 step of sgd
+        sgd_op.run()
+        # Validate updated params
+        self.assertAllCloseAccordingToType(
+            [[0., 1.]], var0.eval(), atol=0.01)
+
+  def testMinimizeSparseResourceVariableCentered(self):
+    for dtype in [dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        var0 = resource_variable_ops.ResourceVariable([[1.0, 2.0]], dtype=dtype)
+        x = constant_op.constant([[4.0], [5.0]], dtype=dtype)
+        pred = math_ops.matmul(embedding_ops.embedding_lookup([var0], [0]), x)
+        loss = pred * pred
+        sgd_op = rmsprop.RMSPropOptimizer(
+            learning_rate=1.0,
+            decay=0.0,
+            momentum=0.0,
+            epsilon=1.0,
+            centered=True).minimize(loss)
+        variables.global_variables_initializer().run()
+        # Fetch params to validate initial values
+        self.assertAllCloseAccordingToType([[1.0, 2.0]], var0.eval())
+        # Run 1 step of sgd
+        sgd_op.run()
+        # Validate updated params
+        self.assertAllCloseAccordingToType(
+            [[-111, -138]], var0.eval(), atol=0.01)
+
+  def testSparse(self):
+    # TODO(yori): Use ParameterizedTest when available
+    for (dtype, learning_rate, decay,
+         momentum, epsilon, centered, _) in _TESTPARAMS:
+      with self.test_session(use_gpu=True):
+        # Initialize variables for numpy implementation.
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01], dtype=dtype.as_numpy_dtype)
+
+        var0 = variables.Variable(var0_np)
+        var1 = variables.Variable(var1_np)
+        grads0_np_indices = np.array([0], dtype=np.int32)
+        grads0 = ops.IndexedSlices(
+            constant_op.constant(grads0_np),
+            constant_op.constant(grads0_np_indices), constant_op.constant([1]))
+        grads1_np_indices = np.array([1], dtype=np.int32)
+        grads1 = ops.IndexedSlices(
+            constant_op.constant(grads1_np),
+            constant_op.constant(grads1_np_indices), constant_op.constant([1]))
+        opt = rmsprop.RMSPropOptimizer(
+            learning_rate=learning_rate,
+            decay=decay,
+            momentum=momentum,
+            epsilon=epsilon,
+            centered=centered)
+        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+
+        mg0 = opt.get_slot(var0, "mg")
+        self.assertEqual(mg0 is not None, centered)
+        mg1 = opt.get_slot(var1, "mg")
+        self.assertEqual(mg1 is not None, centered)
+        rms0 = opt.get_slot(var0, "rms")
+        self.assertTrue(rms0 is not None)
+        rms1 = opt.get_slot(var1, "rms")
+        self.assertTrue(rms1 is not None)
+        mom0 = opt.get_slot(var0, "momentum")
+        self.assertTrue(mom0 is not None)
+        mom1 = opt.get_slot(var1, "momentum")
+        self.assertTrue(mom1 is not None)
+
+        mg0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+        mg1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+        rms0_np = np.array([1.0, 1.0], dtype=dtype.as_numpy_dtype)
+        rms1_np = np.array([1.0, 1.0], dtype=dtype.as_numpy_dtype)
+        mom0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+        mom1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], var0.eval())
+        self.assertAllClose([3.0, 4.0], var1.eval())
+
+        # Run 4 steps of RMSProp
+        for _ in range(1, 5):
+          update.run()
+
+          var0_np, mg0_np, rms0_np, mom0_np = self._sparse_rmsprop_update_numpy(
+              var0_np, grads0_np_indices, grads0_np, mg0_np, rms0_np, mom0_np,
+              learning_rate, decay, momentum, epsilon, centered)
+          var1_np, mg1_np, rms1_np, mom1_np = self._sparse_rmsprop_update_numpy(
+              var1_np, grads1_np_indices, grads1_np, mg1_np, rms1_np, mom1_np,
+              learning_rate, decay, momentum, epsilon, centered)
+
+          # Validate updated params
+          if centered:
+            self.assertAllCloseAccordingToType(mg0_np, mg0.eval())
+            self.assertAllCloseAccordingToType(mg1_np, mg1.eval())
+          self.assertAllCloseAccordingToType(rms0_np, rms0.eval())
+          self.assertAllCloseAccordingToType(rms1_np, rms1.eval())
+          self.assertAllCloseAccordingToType(mom0_np, mom0.eval())
+          self.assertAllCloseAccordingToType(mom1_np, mom1.eval())
+          self.assertAllCloseAccordingToType(var0_np, var0.eval())
+          self.assertAllCloseAccordingToType(var1_np, var1.eval())
+
+  def testWithoutMomentum(self):
+    for dtype in [dtypes.half, dtypes.float32]:
+      with self.test_session(use_gpu=True):
+        var0 = variables.Variable([1.0, 2.0], dtype=dtype)
+        var1 = variables.Variable([3.0, 4.0], dtype=dtype)
+        grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
+        grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
+        opt = rmsprop.RMSPropOptimizer(
+            learning_rate=2.0, decay=0.9, momentum=0.0, epsilon=1.0)
+        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+
+        rms0 = opt.get_slot(var0, "rms")
+        self.assertTrue(rms0 is not None)
+        rms1 = opt.get_slot(var1, "rms")
+        self.assertTrue(rms1 is not None)
+        mom0 = opt.get_slot(var0, "momentum")
+        self.assertTrue(mom0 is not None)
+        mom1 = opt.get_slot(var1, "momentum")
+        self.assertTrue(mom1 is not None)
+
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], var0.eval())
+        self.assertAllClose([3.0, 4.0], var1.eval())
+        # Step 1: the rms accumulators where 1. So we should see a normal
+        # update: v -= grad * learning_rate
+        update.run()
+        # Check the root mean square accumulators.
+        self.assertAllCloseAccordingToType(
+            np.array([0.901, 0.901]), rms0.eval())
+        self.assertAllCloseAccordingToType(
+            np.array([0.90001, 0.90001]), rms1.eval())
+        # Check the parameters.
+        self.assertAllCloseAccordingToType(
+            np.array([
+                1.0 - (0.1 * 2.0 / math.sqrt(0.901 + 1.0)),
+                2.0 - (0.1 * 2.0 / math.sqrt(0.901 + 1.0))
+            ]), var0.eval())
+        self.assertAllCloseAccordingToType(
+            np.array([
+                3.0 - (0.01 * 2.0 / math.sqrt(0.90001 + 1.0)),
+                4.0 - (0.01 * 2.0 / math.sqrt(0.90001 + 1.0))
+            ]), var1.eval())
+        # Step 2: the root mean square accumulators contain the previous update.
+        update.run()
+        # Check the rms accumulators.
+        self.assertAllCloseAccordingToType(
+            np.array([0.901 * 0.9 + 0.001, 0.901 * 0.9 + 0.001]), rms0.eval())
+        self.assertAllCloseAccordingToType(
+            np.array([0.90001 * 0.9 + 1e-5, 0.90001 * 0.9 + 1e-5]), rms1.eval())
+        # Check the parameters.
+        self.assertAllCloseAccordingToType(
+            np.array([
+                1.0 - (0.1 * 2.0 / math.sqrt(0.901 + 1.0)) -
+                (0.1 * 2.0 / math.sqrt(0.901 * 0.9 + 0.001 + 1.0)),
+                2.0 - (0.1 * 2.0 / math.sqrt(0.901 + 1.0)) -
+                (0.1 * 2.0 / math.sqrt(0.901 * 0.9 + 0.001 + 1.0))
+            ]), var0.eval())
+        self.assertAllCloseAccordingToType(
+            np.array([
+                3.0 - (0.01 * 2.0 / math.sqrt(0.90001 + 1.0)) -
+                (0.01 * 2.0 / math.sqrt(0.90001 * 0.9 + 1e-5 + 1.0)),
+                4.0 - (0.01 * 2.0 / math.sqrt(0.90001 + 1.0)) -
+                (0.01 * 2.0 / math.sqrt(0.90001 * 0.9 + 1e-5 + 1.0))
+            ]), var1.eval())
+
+  def testWithMomentum(self):
+    for dtype in [dtypes.half, dtypes.float32]:
+      with self.test_session(use_gpu=True):
+        var0 = variables.Variable([1.0, 2.0], dtype=dtype)
+        var1 = variables.Variable([3.0, 4.0], dtype=dtype)
+        grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
+        grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
+
+        opt = rmsprop.RMSPropOptimizer(
+            learning_rate=2.0, decay=0.9, momentum=0.5, epsilon=1e-5)
+        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+
+        rms0 = opt.get_slot(var0, "rms")
+        self.assertTrue(rms0 is not None)
+        rms1 = opt.get_slot(var1, "rms")
+        self.assertTrue(rms1 is not None)
+        mom0 = opt.get_slot(var0, "momentum")
+        self.assertTrue(mom0 is not None)
+        mom1 = opt.get_slot(var1, "momentum")
+        self.assertTrue(mom1 is not None)
+
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], var0.eval())
+        self.assertAllClose([3.0, 4.0], var1.eval())
+        # Step 1: rms = 1, mom = 0. So we should see a normal
+        # update: v -= grad * learning_rate
+        update.run()
+        # Check the root mean square accumulators.
+        self.assertAllCloseAccordingToType(
+            np.array([0.901, 0.901]), rms0.eval())
+        self.assertAllCloseAccordingToType(
+            np.array([0.90001, 0.90001]), rms1.eval())
+        # Check the momentum accumulators
+        self.assertAllCloseAccordingToType(
+            np.array([(0.1 * 2.0 / math.sqrt(0.901 + 1e-5)),
+                      (0.1 * 2.0 / math.sqrt(0.901 + 1e-5))]), mom0.eval())
+        self.assertAllCloseAccordingToType(
+            np.array([(0.01 * 2.0 / math.sqrt(0.90001 + 1e-5)),
+                      (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5))]), mom1.eval())
+
+        # Check that the parameters.
+        self.assertAllCloseAccordingToType(
+            np.array([
+                1.0 - (0.1 * 2.0 / math.sqrt(0.901 + 1e-5)),
+                2.0 - (0.1 * 2.0 / math.sqrt(0.901 + 1e-5))
+            ]), var0.eval())
+        self.assertAllCloseAccordingToType(
+            np.array([
+                3.0 - (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5)),
+                4.0 - (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5))
+            ]), var1.eval())
+
+        # Step 2: the root mean square accumulators contain the previous update.
+        update.run()
+        # Check the rms accumulators.
+        self.assertAllCloseAccordingToType(
+            np.array([0.901 * 0.9 + 0.001, 0.901 * 0.9 + 0.001]), rms0.eval())
+        self.assertAllCloseAccordingToType(
+            np.array([0.90001 * 0.9 + 1e-5, 0.90001 * 0.9 + 1e-5]), rms1.eval())
+        self.assertAllCloseAccordingToType(
+            np.array([
+                0.5 * (0.1 * 2.0 / math.sqrt(0.901 + 1e-5)) +
+                (0.1 * 2.0 / math.sqrt(0.901 * 0.9 + 0.001 + 1e-5)),
+                0.5 * (0.1 * 2.0 / math.sqrt(0.901 + 1e-5)) +
+                (0.1 * 2.0 / math.sqrt(0.901 * 0.9 + 0.001 + 1e-5))
+            ]), mom0.eval())
+        self.assertAllCloseAccordingToType(
+            np.array([
+                0.5 * (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5)) +
+                (0.01 * 2.0 / math.sqrt(0.90001 * 0.9 + 2e-5)),
+                0.5 * (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5)) +
+                (0.01 * 2.0 / math.sqrt(0.90001 * 0.9 + 2e-5))
+            ]), mom1.eval())
+
+        # Check the parameters.
+        self.assertAllCloseAccordingToType(
+            np.array([
+                1.0 - (0.1 * 2.0 / math.sqrt(0.901 + 1e-5)) -
+                (0.5 * (0.1 * 2.0 / math.sqrt(0.901 + 1e-5)) +
+                 (0.1 * 2.0 / math.sqrt(0.901 * 0.9 + 0.001 + 1e-5))),
+                2.0 - (0.1 * 2.0 / math.sqrt(0.901 + 1e-5)) -
+                (0.5 * (0.1 * 2.0 / math.sqrt(0.901 + 1e-5)) +
+                 (0.1 * 2.0 / math.sqrt(0.901 * 0.9 + 0.001 + 1e-5)))
+            ]), var0.eval())
+
+        self.assertAllCloseAccordingToType(
+            np.array([
+                3.0 - (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5)) -
+                (0.5 * (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5)) +
+                 (0.01 * 2.0 / math.sqrt(0.90001 * 0.9 + 2e-5))),
+                4.0 - (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5)) -
+                (0.5 * (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5)) +
+                 (0.01 * 2.0 / math.sqrt(0.90001 * 0.9 + 2e-5)))
+            ]), var1.eval())
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/periodic_resample/BUILD b/tensorflow/contrib/periodic_resample/BUILD
index bd9078ae76ee27ec26c09d1aa2012f871cbdf5e9..6ca7fe8b6e59b0dc24be76262d4f54f387e53e48 100644
--- a/tensorflow/contrib/periodic_resample/BUILD
+++ b/tensorflow/contrib/periodic_resample/BUILD
@@ -94,18 +94,6 @@ py_test(
 #     srcs_version = "PY2AND3",
 # )
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 filegroup(
     name = "custom_op_sources",
     srcs = glob(
diff --git a/tensorflow/contrib/predictor/BUILD b/tensorflow/contrib/predictor/BUILD
index a80f060b91df3b6d5e2ca9ff63c721382f0cbb0a..36e21af618f5af744ce793509813eaf36e1b8479 100644
--- a/tensorflow/contrib/predictor/BUILD
+++ b/tensorflow/contrib/predictor/BUILD
@@ -8,18 +8,6 @@ exports_files(["LICENSE"])
 
 load("//tensorflow:tensorflow.bzl", "py_test")
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 py_library(
     name = "predictor",
     srcs = ["__init__.py"],
diff --git a/tensorflow/contrib/proto/BUILD b/tensorflow/contrib/proto/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..046652cbc5a2f11a3e75fdcc7b91ec00be21d300
--- /dev/null
+++ b/tensorflow/contrib/proto/BUILD
@@ -0,0 +1,16 @@
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+py_library(
+    name = "proto",
+    srcs = [
+        "__init__.py",
+    ],
+    deps = [
+        "//tensorflow/contrib/proto/python/ops:decode_proto_op_py",
+        "//tensorflow/contrib/proto/python/ops:encode_proto_op_py",
+    ],
+)
diff --git a/tensorflow/contrib/proto/__init__.py b/tensorflow/contrib/proto/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..bc5a49de78e251cb4a854fc11a7b13b39820127d
--- /dev/null
+++ b/tensorflow/contrib/proto/__init__.py
@@ -0,0 +1,28 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Ops and modules related to proto.
+
+@@decode_proto
+@@encode_proto
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.proto.python.ops.decode_proto_op import decode_proto
+from tensorflow.contrib.proto.python.ops.encode_proto_op import encode_proto
+
+from tensorflow.python.util.all_util import remove_undocumented
+remove_undocumented(__name__)
diff --git a/tensorflow/contrib/proto/python/ops/BUILD b/tensorflow/contrib/proto/python/ops/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..f17065477e1e14d24a16338b1a11d98da44639fe
--- /dev/null
+++ b/tensorflow/contrib/proto/python/ops/BUILD
@@ -0,0 +1,44 @@
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+load(
+    "//tensorflow:tensorflow.bzl",
+    "tf_gen_op_wrapper_py",
+)
+
+py_library(
+    name = "decode_proto_op_py",
+    srcs = ["decode_proto_op.py"],
+    deps = [
+        ":gen_decode_proto_op_py",
+        "//tensorflow/python:framework_ops",
+    ],
+)
+
+tf_gen_op_wrapper_py(
+    name = "gen_decode_proto_op_py",
+    out = "gen_decode_proto_op.py",
+    deps = [
+        "//tensorflow/core:decode_proto_ops_op_lib",
+    ],
+)
+
+py_library(
+    name = "encode_proto_op_py",
+    srcs = ["encode_proto_op.py"],
+    deps = [
+        ":gen_encode_proto_op_py",
+        "//tensorflow/python:framework_ops",
+    ],
+)
+
+tf_gen_op_wrapper_py(
+    name = "gen_encode_proto_op_py",
+    out = "gen_encode_proto_op.py",
+    deps = [
+        "//tensorflow/core:encode_proto_ops_op_lib",
+    ],
+)
diff --git a/tensorflow/contrib/proto/python/ops/decode_proto_op.py b/tensorflow/contrib/proto/python/ops/decode_proto_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..7dc000ebe49724e7571ade500eb29de25be89485
--- /dev/null
+++ b/tensorflow/contrib/proto/python/ops/decode_proto_op.py
@@ -0,0 +1,25 @@
+# =============================================================================
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+# pylint: disable=wildcard-import,unused-import
+"""Protocol Buffer decoding from tensors."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.proto.python.ops.gen_decode_proto_op import decode_proto_v2 as decode_proto
+from tensorflow.python.framework import ops
+ops.NotDifferentiable("DecodeProtoV2")
diff --git a/tensorflow/contrib/proto/python/ops/encode_proto_op.py b/tensorflow/contrib/proto/python/ops/encode_proto_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..ac12198b2e462b73238778e91c1e9b6be156182c
--- /dev/null
+++ b/tensorflow/contrib/proto/python/ops/encode_proto_op.py
@@ -0,0 +1,25 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+# pylint: disable=wildcard-import,unused-import
+"""Protocol Buffer encoding from tensors."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.proto.python.ops.gen_encode_proto_op import encode_proto
+from tensorflow.python.framework import ops
+
+ops.NotDifferentiable("EncodeProto")
diff --git a/tensorflow/contrib/quantization/BUILD b/tensorflow/contrib/quantization/BUILD
index c19a31afb2a1a86159eae5c94bbd83daa28caaeb..2de10e8faefa80d609e490f26ef97f6bf513debd 100644
--- a/tensorflow/contrib/quantization/BUILD
+++ b/tensorflow/contrib/quantization/BUILD
@@ -49,15 +49,3 @@ filegroup(
         "**/*.py",
     ]),
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/quantize/BUILD b/tensorflow/contrib/quantize/BUILD
index 0b7629620418340d803753be0df1f04c342dc490..b9918fdee1ece2bae1ab1459985066a35b6431be 100644
--- a/tensorflow/contrib/quantize/BUILD
+++ b/tensorflow/contrib/quantize/BUILD
@@ -246,15 +246,3 @@ py_test(
         "//tensorflow/python:platform_test",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/quantize/README.md b/tensorflow/contrib/quantize/README.md
index 348c824a4072c3329ac4a3441c19c71598bc9c03..c83623ec947c1550991352a9dd9a5c6ee9282290 100644
--- a/tensorflow/contrib/quantize/README.md
+++ b/tensorflow/contrib/quantize/README.md
@@ -2,14 +2,17 @@
 
 tf.contrib.quantize provides tools for transforming graphs to include ops to
 model quantization of weights, biases and activations during both training and
-inference. This is done using the
+inference. The details of the transformation implemented in this package is
+described here [1].
+
+This is done using the
 [fake quantization op](https://www.tensorflow.org/versions/r0.12/api_docs/python/array_ops/fake_quantization).
 
-Recent literature has shown that fixed point networks provide comparable
-performance to floating point networks [1]. This is achieved by modeling the
-quantization operation during training in both the forward and backward passes.
+Literature has shown that fixed point networks provide comparable performance to
+floating point networks [2]. This is achieved by modeling the quantization
+operation during training in both the forward and backward passes.
 The fake quantization operator achieves this by modeling the quantizer as a pass
-through estimator [2]. Note that during back propagation, the parameters are
+through estimator [3]. Note that during back propagation, the parameters are
 updated at high precision as this is needed to ensure sufficient precision in
 accumulating tiny adjustments to the parameters. However, for the forward pass,
 the parameters and activations are quantized to the desired lower precision.
@@ -61,9 +64,11 @@ These rewrites are an active area of research and experimentation, so the
 rewrites and quantized training will likely not work across all models, though
 we hope to work towards generalizing these techniques.
 
+[1] B.Jacob et al., "Quantization and Training of Neural Networks for Efficient
+Integer-Arithmetic-Only Inference", https://arxiv.org/abs/1712.05877
 
-[1] P.Gysel, "HARDWARE-ORIENTED APPROXIMATION OF CONVOLUTIONAL
+[2] P.Gysel et al., "HARDWARE-ORIENTED APPROXIMATION OF CONVOLUTIONAL
 NEURAL NETWORKS", https://arxiv.org/pdf/1604.03168.pdf
 
-[2] Y.Bengio, "Estimating or Propagating Gradients Through Stochastic Neurons
-for Conditional Computation", https://arxiv.org/abs/1308.3432
+[3] Y.Bengio et al., "Estimating or Propagating Gradients Through Stochastic
+Neurons for Conditional Computation", https://arxiv.org/abs/1308.3432
diff --git a/tensorflow/contrib/quantize/python/fold_batch_norms.py b/tensorflow/contrib/quantize/python/fold_batch_norms.py
index 5750be6f4cbd501ec85656a66b9002a470b1a863..aa0ef643088ef36b84596d08f78c29594ceca2d6 100644
--- a/tensorflow/contrib/quantize/python/fold_batch_norms.py
+++ b/tensorflow/contrib/quantize/python/fold_batch_norms.py
@@ -134,9 +134,9 @@ def _FoldFusedBatchNorms(graph, is_training, freeze_batch_norm_delay):
 
       nodes_modified_count = graph_editor.reroute_ts(bias_add_tensor,
                                                      match.output_tensor)
-      if nodes_modified_count != 1:
-        raise ValueError(
-            'Unexpected inputs to op: %s' % match.output_tensor.name)
+      if nodes_modified_count == 0:
+        raise ValueError('Folding batch norms failed, %s had no outputs.' %
+                         match.output_tensor.name)
 
 
 def _FindFusedBatchNorms(graph):
@@ -545,7 +545,7 @@ def _GetBatchNormParams(graph, context, has_scaling):
         gamma_tensor = graph.get_tensor_by_name(op.name + ':0')
 
   if not has_scaling:
-    gamma_tensor = array_ops.ones(batch_mean_tensor.shape)
+    gamma_tensor = array_ops.ones(moving_mean_tensor.shape)
 
   return _BatchNormMatch(
       layer_op=None,
diff --git a/tensorflow/contrib/quantize/python/quant_ops.py b/tensorflow/contrib/quantize/python/quant_ops.py
index a4f7b1b22139588be29171126d43b872d6658168..5c0e17dc8646ce7850e26ffaa80c0201cea456af 100644
--- a/tensorflow/contrib/quantize/python/quant_ops.py
+++ b/tensorflow/contrib/quantize/python/quant_ops.py
@@ -51,7 +51,6 @@ def LastValueQuantize(inputs,
                       per_channel=False,
                       init_min=-6.0,
                       init_max=6.0,
-                      updates_collection=ops.GraphKeys.UPDATE_OPS,
                       vars_collection=ops.GraphKeys.MOVING_AVERAGE_VARIABLES,
                       name_prefix='LastValueQuant',
                       reuse=None,
@@ -69,8 +68,6 @@ def LastValueQuantize(inputs,
       quantization ranges per output channel.
     init_min: a float scalar, the initial value for variable min.
     init_max: a float scalar, the initial value for variable max.
-    updates_collection: (Optional) collections to collect the update ops for
-      computation.
     vars_collection: (Optional) collection where to store variables for
       quantization interval ends.
     name_prefix: name_prefix for created nodes.
@@ -133,7 +130,6 @@ def LastValueQuantize(inputs,
     # TFLite requires that 0.0 if always in the [min; max] range.
     batch_min = math_ops.minimum(batch_min, 0.0)
     assign_min = state_ops.assign(min_var, batch_min, name='AssignMinLast')
-    ops.add_to_collection(updates_collection, assign_min.op)
 
     if per_channel:
       if input_dim >= 2:
@@ -146,7 +142,6 @@ def LastValueQuantize(inputs,
     # TFLite requires that 0.0 if always in the [min; max] range.
     batch_max = math_ops.maximum(batch_max, 0.0)
     assign_max = state_ops.assign(max_var, batch_max, name='AssignMaxLast')
-    ops.add_to_collection(updates_collection, assign_max.op)
 
     return _FakeQuantWithMinMaxVars(
         inputs,
@@ -163,7 +158,6 @@ def MovingAvgQuantize(inputs,
                       init_min=-6.0,
                       init_max=6.0,
                       ema_decay=0.999,
-                      updates_collection=ops.GraphKeys.UPDATE_OPS,
                       vars_collection=ops.GraphKeys.MOVING_AVERAGE_VARIABLES,
                       name_prefix='MovingAvgQuantize',
                       reuse=None,
@@ -182,8 +176,6 @@ def MovingAvgQuantize(inputs,
     init_min: a float scalar, the initial value for variable min.
     init_max: a float scalar, the initial value for variable max.
     ema_decay: EMA decay parameter.
-    updates_collection: (Optional) collections to collect the update ops for
-      computation.
     vars_collection: (Optional) collection where to store variables for
       quantization interval ends.
     name_prefix: name_prefix for created nodes.
@@ -246,7 +238,6 @@ def MovingAvgQuantize(inputs,
     batch_min = math_ops.minimum(batch_min, 0.0)
     assign_min = moving_averages.assign_moving_average(
         min_var, batch_min, ema_decay, name='AssignMinEma')
-    ops.add_to_collection(updates_collection, assign_min.op)
 
     if per_channel:
       if input_dim >= 2:
@@ -260,7 +251,6 @@ def MovingAvgQuantize(inputs,
     batch_max = math_ops.maximum(batch_max, 0.0)
     assign_max = moving_averages.assign_moving_average(
         max_var, batch_max, ema_decay, name='AssignMaxEma')
-    ops.add_to_collection(updates_collection, assign_max.op)
 
     return _FakeQuantWithMinMaxVars(
         inputs,
diff --git a/tensorflow/contrib/quantize/python/quantize.py b/tensorflow/contrib/quantize/python/quantize.py
index 019d123a68602fb15c1ae914f3d5621290deeb00..d2d0426d233aaadb4ffd0fb222c77ade0a98278c 100644
--- a/tensorflow/contrib/quantize/python/quantize.py
+++ b/tensorflow/contrib/quantize/python/quantize.py
@@ -27,6 +27,7 @@ from tensorflow.contrib.quantize.python import quant_ops
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import tf_logging as logging
 
 # Quantizable operation types that are supported by the quantization rewrite.
 _QUANTIZABLE_TYPES = {'Conv2D', 'MatMul', 'DepthwiseConv2dNative'}
@@ -41,9 +42,16 @@ def Quantize(graph,
              activation_bits=8,
              ema_decay=0.999,
              quant_delay=None,
-             vars_collection=ops.GraphKeys.GLOBAL_VARIABLES):
+             vars_collection=ops.GraphKeys.GLOBAL_VARIABLES,
+             scope=None):
   """Updates graph with quantization operations.
 
+  Currently we quantize the following tensors:
+  * Conv/MatMul: Quantize the weights if it matches.
+  * Activation: Quantize the output if it matches.
+  * Bypass/Post-activation Bypass: Quantize both input and output
+    if it matches.
+
   Args:
     graph: Graph to modify.
     is_training: Whether quantizing training graph or eval graph.
@@ -57,13 +65,21 @@ def Quantize(graph,
       training.
     vars_collection: (Optional) Collection where to store the variables for
       quantization interval ends.
+    scope: The scope to be transformed. If it's not None, only the ops which
+      are in this scope will be transformed.
   Raises:
     ValueError: When quantization fails.
   """
+  if scope and not scope.endswith('/'):
+    scope += '/'
+
   input_to_ops_map = input_to_ops.InputToOps(graph)
   for layer_match in _FindLayersToQuantize(graph):
     # Quantize the weights.
     context = _GetContextFromOp(layer_match.layer_op)
+
+    # If `scope` is given, only quantize it if the consumer of weights
+    # (the layer op) is in the right scope.
     _InsertQuantOp(
         context,
         'weights_quant',
@@ -74,7 +90,8 @@ def Quantize(graph,
         quant_delay=quant_delay,
         narrow_range=True,
         vars_collection=vars_collection,
-        bits=weight_bits)
+        bits=weight_bits,
+        consumer_scope=scope)
 
     # Quantize the activations.
     consumer_ops = input_to_ops_map.ConsumerOperations(
@@ -82,6 +99,9 @@ def Quantize(graph,
     add_context = context
     if layer_match.bypass_op:
       add_context = re.search(r'^(.*)/([^/]+)', context).group(1)
+
+    # If `scope` is given, only quantize it if the producer of weights
+    # (usually it's the layer op) is in the right scope.
     _InsertQuantOp(
         add_context,
         'act_quant',
@@ -93,11 +113,14 @@ def Quantize(graph,
         quant_delay=quant_delay,
         vars_collection=vars_collection,
         bits=activation_bits,
-        init_min=0.0)
+        init_min=0.0,
+        producer_scope=scope)
 
     # Quantize the inputs and output to the bypass (if it exists). The input to
     # the bypass is the bias add, and the output is the activation.
     if layer_match.bypass_op is not None:
+      # If `scope` is given, only quantize it if the both the producer and the
+      # consumer are in the right scope.
       _InsertQuantOp(
           context,
           'conv_quant',
@@ -107,7 +130,9 @@ def Quantize(graph,
           ema_decay=ema_decay,
           quant_delay=quant_delay,
           vars_collection=vars_collection,
-          bits=activation_bits)
+          bits=activation_bits,
+          producer_scope=scope,
+          consumer_scope=scope)
       _InsertQuantOp(
           add_context,
           'add_quant',
@@ -118,12 +143,16 @@ def Quantize(graph,
           ema_decay=ema_decay,
           quant_delay=quant_delay,
           vars_collection=vars_collection,
-          bits=activation_bits)
+          bits=activation_bits,
+          producer_scope=scope,
+          consumer_scope=scope)
 
     # Quantize bypass ops that occur after the activation.
     if layer_match.post_activation_bypass_op is not None:
       post_activation_bypass_context = re.search(
           r'^(.*)/([^/]+)', layer_match.post_activation_bypass_op.name).group(1)
+      # If `scope` is given, only quantize it if the producer is in the right
+      # scope.
       _InsertQuantOp(
           post_activation_bypass_context,
           'post_activation_bypass_quant',
@@ -135,7 +164,8 @@ def Quantize(graph,
           ema_decay=ema_decay,
           quant_delay=quant_delay,
           vars_collection=vars_collection,
-          bits=activation_bits)
+          bits=activation_bits,
+          producer_scope=scope)
 
 
 def _FindLayersToQuantize(graph):
@@ -305,7 +335,8 @@ def _FindLayersToQuantize(graph):
   # the output of the final BiasAdd must be quantized. So we treat the BiasAdd
   # as the 'activation_op' in the _LayerMatch, to ensure that it's output is
   # quantized.
-  final_layer_matcher = graph_matcher.GraphMatcher(bias_add_pattern)
+  final_layer_matcher = graph_matcher.GraphMatcher(
+      graph_matcher.OneofPattern([bias_add_pattern, folded_bias_add_pattern]))
   for match_result in final_layer_matcher.match_graph(graph):
     layer_op = match_result.get_op(layer_pattern)
     weight_tensor = match_result.get_tensor(weight_identity_pattern)
@@ -381,7 +412,9 @@ def _InsertQuantOp(context,
                    ema_decay=0.999,
                    quant_delay=None,
                    vars_collection=ops.GraphKeys.GLOBAL_VARIABLES,
-                   narrow_range=False):
+                   narrow_range=False,
+                   producer_scope=None,
+                   consumer_scope=None):
   """Inserts a quant op between a producer op and (multiple) consumer ops.
 
   Args:
@@ -406,16 +439,42 @@ def _InsertQuantOp(context,
       quantization interval ends.
     narrow_range: Whether to use the narrow quantization range
       [1; 2^bits - 1] or wide range [0; 2^bits - 1].
+    producer_scope: The restriction of producer scope. If not None, the new op
+      will be inserted only when the producer is in this scope.
+    consumer_scope: The restriction of producer scope. If not None, the new op
+      will be inserted only when all the consumers are in this scope.
   Raises:
     ValueError: When producer operation is not directly connected to the
       consumer operation.
   """
+  if producer_scope and not producer.name.startswith(producer_scope):
+    logging.info(
+        '_InsertQuantOp ignores context="%s" name="%s" '
+        'because producer "%s" is not in scope "%s"',
+        context, name, producer.name, producer_scope)
+    return
+
+  if consumer_scope:
+    consumers_in_scope = []
+    for consumer in consumers:
+      if consumer.name.startswith(consumer_scope):
+        consumers_in_scope.append(consumer)
+      else:
+        logging.info(
+            '_InsertQuantOp context="%s" name="%s" ignores '
+            'consumer "%s" because it is not in scope "%s"',
+            context, name, consumer.name, consumer_scope)
+        return
+    consumers = consumers_in_scope
+
   name_prefix = _AddContextToName(context, name)
   # This is needed on TPU where name_scope == 'TPUReplicate/loop', and
   # name_prefix starts with 'TPUReplicate/loop/'; without dropping it
   # variables are created as TPUReplicate/loop/TPUReplicate/loop/..., which
   # breaks things later.
-  name_prefix = common.DropStringPrefix(name_prefix, ops.get_name_scope() + '/')
+  name_scope = ops.get_name_scope()
+  if name_scope:
+    name_prefix = common.DropStringPrefix(name_prefix, name_scope + '/')
 
   inputs = producer.outputs[0]
   # Prevent ops from being quantized multiple times. Bypass ops can sometimes
@@ -463,11 +522,16 @@ def _InsertQuantOp(context,
         lambda: inputs,
         name=name_prefix + '/delayed_quant')
 
-  nodes_modified_count = graph_editor.reroute_ts(
-      [quant], [inputs], can_modify=consumers)
-  if nodes_modified_count != len(consumers):
-    raise ValueError('Some inputs not quantized for ops: [%s]' % ', '.join(
-        [consumer.name for consumer in consumers]))
+  if consumers:
+    tensors_modified_count = graph_editor.reroute_ts(
+        [quant], [inputs], can_modify=consumers)
+    # Some operations can have multiple output tensors going to the same
+    # consumer. Since consumers is a set, we need to ensure that
+    # tensors_modified_count is greater than or equal to the length of the set
+    # of consumers.
+    if tensors_modified_count < len(consumers):
+      raise ValueError('No inputs quantized for ops: [%s]' % ', '.join(
+          [consumer.name for consumer in consumers]))
 
 
 def _GetContextFromOp(op):
diff --git a/tensorflow/contrib/quantize/python/quantize_graph.py b/tensorflow/contrib/quantize/python/quantize_graph.py
index 0b74b438ac317967bbe10ad936b451de6f69d62c..11d052d7f491dc029d1bda9b47364d6e9c880a67 100644
--- a/tensorflow/contrib/quantize/python/quantize_graph.py
+++ b/tensorflow/contrib/quantize/python/quantize_graph.py
@@ -28,7 +28,8 @@ def _create_graph(input_graph=None,
                   weight_bits=8,
                   activation_bits=8,
                   quant_delay=None,
-                  freeze_bn_delay=None):
+                  freeze_bn_delay=None,
+                  scope=None):
   """Rewrites an input_graph in place for simulated quantization.
 
   The graph has fake quantization ops inserted to simulate the error
@@ -48,6 +49,8 @@ def _create_graph(input_graph=None,
       frozen and used instead of batch statistics during training.
       freeze_bn_delay should be greater than quant_delay and should correspond
       to the number of steps when training has almost converged
+    scope: The scope to be transformed. If it's not None, only the ops which
+      are in this scope will be transformed.
 
   Raises:
     ValueError: If elements contains an element that isn't a tf.Tensor or
@@ -66,7 +69,8 @@ def _create_graph(input_graph=None,
         is_training,
         quant_delay=quant_delay,
         weight_bits=weight_bits,
-        activation_bits=activation_bits)
+        activation_bits=activation_bits,
+        scope=scope)
 
 
 def create_training_graph(input_graph=None, quant_delay=0):
@@ -133,7 +137,8 @@ def experimental_create_training_graph(input_graph=None,
                                        weight_bits=8,
                                        activation_bits=8,
                                        quant_delay=0,
-                                       freeze_bn_delay=None):
+                                       freeze_bn_delay=None,
+                                       scope=None):
   """Rewrites a training input_graph in place for simulated quantization.
 
   Variables added by the rewrite get added to the global variables collection.
@@ -165,6 +170,8 @@ def experimental_create_training_graph(input_graph=None,
       frozen and used instead of batch statistics during training.
       freeze_bn_delay should be greater than quant_delay and should correspond
       to when training has almost converged
+    scope: The scope to be transformed. If it's not None, only the ops which
+      are in this scope will be transformed.
 
   Raises:
     ValueError: If elements contains an element that isn't a tf.Tensor or
@@ -177,12 +184,14 @@ def experimental_create_training_graph(input_graph=None,
       weight_bits=weight_bits,
       activation_bits=activation_bits,
       quant_delay=quant_delay,
-      freeze_bn_delay=freeze_bn_delay)
+      freeze_bn_delay=freeze_bn_delay,
+      scope=scope)
 
 
 def experimental_create_eval_graph(input_graph=None,
                                    weight_bits=8,
-                                   activation_bits=8):
+                                   activation_bits=8,
+                                   scope=None):
   """Rewrites an eval input_graph in place for simulated quantization.
 
   Variables added by the rewrite get added to the global variables collection.
@@ -200,8 +209,8 @@ def experimental_create_eval_graph(input_graph=None,
       default graph.
     weight_bits: Number of bits to use for quantizing weights.
     activation_bits: Number of bits to use for quantizing activations.
-
-
+    scope: The scope to be transformed. If it's not None, only the ops which
+      are in this scope will be transformed.
 
   Raises:
     ValueError: If elements contains an element that isn't a tf.Tensor or
@@ -211,4 +220,5 @@ def experimental_create_eval_graph(input_graph=None,
       input_graph=input_graph,
       is_training=False,
       weight_bits=weight_bits,
-      activation_bits=activation_bits)
+      activation_bits=activation_bits,
+      scope=scope)
diff --git a/tensorflow/contrib/quantize/python/quantize_graph_test.py b/tensorflow/contrib/quantize/python/quantize_graph_test.py
index b9d03c1bc059fe7bcce75978f503cbbf76090dbd..caf8ff28d50d2880d491d04c1ed368597519dcd7 100644
--- a/tensorflow/contrib/quantize/python/quantize_graph_test.py
+++ b/tensorflow/contrib/quantize/python/quantize_graph_test.py
@@ -66,6 +66,20 @@ class QuantizeGraphTest(test_util.TensorFlowTestCase):
     for fn in rewrite_fns:
       test_fn(fn)
 
+  def _RunTestOverExperimentalRewritesWithScope(self, test_fn, scope):
+    def with_absent_scope(fn):
+      def fn_with_absent_scope(*args):
+        fn(*args, scope=scope)
+      return fn_with_absent_scope
+    rewrite_fns = [
+        with_absent_scope(
+            quantize_graph.experimental_create_training_graph),
+        with_absent_scope(
+            quantize_graph.experimental_create_eval_graph),
+    ]
+    for fn in rewrite_fns:
+      test_fn(fn)
+
   def testRewrite(self):
     self._RunTestOverAllRewrites(self._TestRewrite)
 
@@ -99,6 +113,34 @@ class QuantizeGraphTest(test_util.TensorFlowTestCase):
       # Ensure that variables were added.
       self.assertTrue(len(orig_variable_names) < len(q_variables))
 
+  def testWithPreActivationBypass(self):
+    self._RunTestOverAllRewrites(self._TestWithPreActivationBypass)
+
+  def _TestWithPreActivationBypass(self, rewrite_fn):
+    # Tests that the default graph is correctly used when no args are provided
+    # to rewrite_fn.
+    with ops.Graph().as_default() as g:
+      self._ConvLayer(pre_activation_bypass=True, scope='scope1')
+      rewrite_fn()
+
+      op_names = [op.name for op in g.get_operations()]
+      self.assertTrue(
+          any('scope1/add_quant/' in name for name in op_names))
+
+  def testWithPostActivationBypass(self):
+    self._RunTestOverAllRewrites(self._TestWithPostActivationBypass)
+
+  def _TestWithPostActivationBypass(self, rewrite_fn):
+    # Tests that the default graph is correctly used when no args are provided
+    # to rewrite_fn.
+    with ops.Graph().as_default() as g:
+      self._ConvLayer(post_activation_bypass=True, scope='scope1')
+      rewrite_fn()
+
+      op_names = [op.name for op in g.get_operations()]
+      self.assertTrue(any(
+          'scope1/post_activation_bypass_quant/' in name for name in op_names))
+
   def testQuantDelay(self):
     self._RunTestOverTrainingRewrites(self._TestQuantDelay)
 
@@ -224,20 +266,66 @@ class QuantizeGraphTest(test_util.TensorFlowTestCase):
       graph_def_after = str(g.as_graph_def())
       self.assertEqual(graph_def_before, graph_def_after)
 
-  def _ConvLayer(self):
+  def testRewriteWithScope(self):
+    self._RunTestOverExperimentalRewritesWithScope(
+        self._TestRewriteWithScope, 'scope1')
+
+  def _TestRewriteWithScope(self, rewrite_fn):
+    graph = ops.Graph()
+    with graph.as_default():
+      scope1_output = self._ConvLayer(scope='scope1')
+      self._ConvLayer(input_tensor=scope1_output, scope='scope2')
+
+    rewrite_fn(graph)
+
+    op_names = [op.name for op in graph.get_operations()]
+    # The weights and activation of scope1 is quantized, but not scope2.
+    self.assertTrue(
+        any('scope1/Conv/act_quant' in name for name in op_names))
+    self.assertTrue(
+        any('scope1/Conv/weights_quant' in name for name in op_names))
+    self.assertFalse(
+        any('scope2/Conv/act_quant' in name for name in op_names))
+    self.assertFalse(
+        any('scope2/Conv/weights_quant' in name for name in op_names))
+
+  def testRewriteWithNonMatchingScope(self):
+    self._RunTestOverExperimentalRewritesWithScope(
+        self._TestRewriteWithNonMatchingScope, 'NonExistingScope')
+
+  def _TestRewriteWithNonMatchingScope(self, rewrite_fn):
+    graph = ops.Graph()
+    with graph.as_default():
+      self._ConvLayer()
+
+    op_names_before_rewrite = set([op.name for op in graph.get_operations()])
+    rewrite_fn(graph)
+    op_names_after_rewrite = set([op.name for op in graph.get_operations()])
+
+    # No ops should be inserted or removed.
+    self.assertEqual(op_names_before_rewrite, op_names_after_rewrite)
+
+  def _ConvLayer(
+      self, input_tensor=None, scope='test', pre_activation_bypass=False,
+      post_activation_bypass=False):
     """Add a basic convolution layer to the default graph."""
     batch_size, height, width, depth = 5, 128, 128, 3
-    inputs = array_ops.zeros((batch_size, height, width, depth))
+    if input_tensor is None:
+      input_tensor = array_ops.zeros((batch_size, height, width, depth))
     weight_init = init_ops.truncated_normal_initializer
-    conv = layers.conv2d(
-        inputs,
-        32, [5, 5],
-        stride=2,
-        padding='SAME',
-        weights_initializer=weight_init(0.09),
-        activation_fn=None,
-        scope='test')
-    _ = nn_ops.relu6(conv)
+    with ops.name_scope(scope):
+      output = layers.conv2d(
+          input_tensor,
+          depth, [5, 5],
+          padding='SAME',
+          weights_initializer=weight_init(0.09),
+          activation_fn=None)
+      if pre_activation_bypass:
+        output += input_tensor
+      output = nn_ops.relu6(output)
+      if post_activation_bypass:
+        output += input_tensor
+    return output
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/contrib/quantize/python/quantize_test.py b/tensorflow/contrib/quantize/python/quantize_test.py
index 98f05c8bfc13094aff2839b2a6aa0da5c653da2b..d37c83d6839f02c52a72cac97c9238c135dc2f66 100644
--- a/tensorflow/contrib/quantize/python/quantize_test.py
+++ b/tensorflow/contrib/quantize/python/quantize_test.py
@@ -247,6 +247,53 @@ class QuantizeTest(test_util.TensorFlowTestCase):
       self.assertTrue(not op.name.startswith('name_scope/name_scope/'),
                       'Broken op: %s' % op.name)
 
+  def testWithNullNameScope(self):
+    self._RunTestOverParameters(self._TestWithNullNameScope)
+
+  def _TestWithNullNameScope(self, is_training):
+    graph = ops.Graph()
+    with graph.as_default():
+      with graph.name_scope(None):
+        batch_size, height, width, depth = 5, 128, 128, 32
+        input1 = array_ops.zeros((batch_size, height, width, depth))
+        _ = conv2d(
+            input1,
+            32, [5, 5],
+            padding='SAME',
+            weights_initializer=self._WeightInit(0.09),
+            activation_fn=None,
+            scope='test')
+
+        quantize.Quantize(graph, is_training, weight_bits=8, activation_bits=8)
+        # Passes if Quantize() does not crash.
+
+  def testWithNonMatchingNameScope(self):
+    self._RunTestOverParameters(self._testWithNonMatchingNameScope)
+
+  def _testWithNonMatchingNameScope(self, is_training):
+    graph = ops.Graph()
+    with graph.as_default():
+      with graph.name_scope('name_scope'):
+        batch_size, height, width, depth = 5, 128, 128, 3
+        input1 = array_ops.zeros((batch_size, height, width, depth))
+        _ = conv2d(
+            input1,
+            32, [5, 5],
+            stride=2,
+            padding='SAME',
+            weights_initializer=self._WeightInit(0.09),
+            activation_fn=None,
+            scope='test')
+
+    op_names_before_quantize = set([op.name for op in graph.get_operations()])
+    quantize.Quantize(
+        graph, is_training, weight_bits=8, activation_bits=8,
+        scope='NonExisting/')
+    op_names_after_quantize = set([op.name for op in graph.get_operations()])
+
+    # No ops should be inserted or removed.
+    self.assertEqual(op_names_before_quantize, op_names_after_quantize)
+
   def _WeightInit(self, stddev):
     """Returns truncated normal variable initializer.
 
diff --git a/tensorflow/contrib/receptive_field/BUILD b/tensorflow/contrib/receptive_field/BUILD
index e975aeaea7ee78f8e912be8ab1be61b9acc7b418..9325a14745c1db2f8c311602143175e736fc3c5f 100644
--- a/tensorflow/contrib/receptive_field/BUILD
+++ b/tensorflow/contrib/receptive_field/BUILD
@@ -106,15 +106,3 @@ py_test(
         "//third_party/py/numpy",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/recurrent/BUILD b/tensorflow/contrib/recurrent/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..b3cb04ce26d96333f516f1298c8d5c331964f05b
--- /dev/null
+++ b/tensorflow/contrib/recurrent/BUILD
@@ -0,0 +1,106 @@
+# Recurrent library.
+
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+load("//tensorflow:tensorflow.bzl", "cuda_py_tests")
+
+py_library(
+    name = "recurrent_py",
+    srcs = ["python/recurrent_api.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":functional_rnn_ops_py",
+        ":recurrent_ops_py",
+    ],
+)
+
+py_library(
+    name = "recurrent_ops_py",
+    srcs = ["python/ops/recurrent.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/contrib/framework:framework_py",
+        "//tensorflow/contrib/util:util_py",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:function",
+        "//tensorflow/python:functional_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform",
+    ],
+)
+
+py_library(
+    name = "functional_rnn_ops_py",
+    srcs = ["python/ops/functional_rnn.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":recurrent_ops_py",
+        "//tensorflow/contrib/framework:framework_py",
+        "//tensorflow/contrib/util:util_py",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:function",
+        "//tensorflow/python:gradients",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:standard_ops",
+    ],
+)
+
+cuda_py_tests(
+    name = "recurrent_ops_test",
+    size = "small",
+    srcs = ["python/kernel_tests/recurrent_test.py"],
+    additional_deps = [
+        ":recurrent_ops_py",
+        "//third_party/py/numpy",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:random_ops",
+        "//tensorflow/python:random_seed",
+        "//tensorflow/python:script_ops",
+        "//tensorflow/python:variables",
+    ],
+    tags = ["nopip"],
+)
+
+cuda_py_tests(
+    name = "functional_rnn_ops_test",
+    size = "small",
+    srcs = ["python/kernel_tests/functional_rnn_test.py"],
+    additional_deps = [
+        ":functional_rnn_ops_py",
+        "//third_party/py/numpy",
+        "//tensorflow/contrib/layers:layers_py",
+        "//tensorflow/contrib/tpu:tpu",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:gradients",
+        "//tensorflow/python:init_ops",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:rnn",
+        "//tensorflow/python:rnn_cell",
+        "//tensorflow/python:util",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python:variables",
+    ],
+    tags = ["nopip"],
+)
diff --git a/tensorflow/contrib/recurrent/README.md b/tensorflow/contrib/recurrent/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..86e10eee517f69d1316b76af85cb13a2bfea2984
--- /dev/null
+++ b/tensorflow/contrib/recurrent/README.md
@@ -0,0 +1,13 @@
+# Recurrent computation library
+
+The recurrent computation library contains code to perform recurrent
+computations.
+
+Its chief application is to implement recurrent neural networks (RNNs, LSTMs,
+etc), which is implemented in `functional_rnn.py`. Similar techniques may be
+used to implement deep networks.
+
+The computation saves the activations in the forward pass, and computes the
+gradients in the backward pass using a single accumulator.
+
+The `functional_rnn` interface is compatible with the `dynamic_rnn` API.
diff --git a/tensorflow/contrib/recurrent/python/kernel_tests/functional_rnn_test.py b/tensorflow/contrib/recurrent/python/kernel_tests/functional_rnn_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..0f19ac7dbe0cee2eb6c780ec5ea6266bc847abd7
--- /dev/null
+++ b/tensorflow/contrib/recurrent/python/kernel_tests/functional_rnn_test.py
@@ -0,0 +1,163 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for Functional RNN."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+
+from tensorflow.contrib.recurrent.python.ops import functional_rnn
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gradients_impl
+from tensorflow.python.ops import rnn as rnn_lib
+from tensorflow.python.ops import rnn_cell_impl
+from tensorflow.python.ops import variables
+import tensorflow.python.ops.nn_grad  # pylint: disable=unused-import
+import tensorflow.python.ops.tensor_array_grad  # pylint: disable=unused-import
+from tensorflow.python.platform import test as test_lib
+from tensorflow.python.platform import tf_logging as logging
+
+
+def _CreateStackedLstmCell(*cell_sizes):
+  subcells = [rnn_cell_impl.LSTMCell(cell_size) for cell_size in cell_sizes]
+  return rnn_cell_impl.MultiRNNCell(subcells)
+
+
+class FunctionalRnnTest(test_util.TensorFlowTestCase):
+
+  _BATCH_SIZE = 3
+  _TOTAL_TIME = 5
+  _INPUT_SIZE = 11
+  _NUM_UNITS = 7
+
+  # Set this to some output if you want to use it.
+  _LSTM_GRAPH_DEF_FILEPATH = None
+
+  _CELLDEFS = {
+      'gru': (rnn_cell_impl.GRUCell, [_NUM_UNITS]),
+      'lstm': (rnn_cell_impl.LSTMCell, [_NUM_UNITS]),
+      'stacked_lstm': (_CreateStackedLstmCell, [_NUM_UNITS] * 3)
+  }
+
+  def _CreateCell(self, celldef_name):
+    func, args = self._CELLDEFS[celldef_name]
+    return func(*args)
+
+  def _CreateInputs(self):
+    inputs = np.random.random([FunctionalRnnTest._BATCH_SIZE,
+                               FunctionalRnnTest._TOTAL_TIME,
+                               FunctionalRnnTest._INPUT_SIZE])
+    # Always leave one time slot empty, to check max_length behavior.
+    sequence_length = np.random.randint(
+        0, high=FunctionalRnnTest._TOTAL_TIME - 1,
+        size=FunctionalRnnTest._BATCH_SIZE,
+        dtype=np.int)
+    return (inputs, sequence_length)
+
+  def _CreateRnnGraph(self, create_rnn_computation_func, cell, tf_inputs,
+                      tf_sequence_length, initial_state=None,
+                      time_major=None, scope=None):
+    tf_result = create_rnn_computation_func(cell=cell, inputs=tf_inputs,
+                                            sequence_length=tf_sequence_length,
+                                            initial_state=initial_state,
+                                            dtype=dtypes.float32,
+                                            time_major=time_major,
+                                            scope=scope)
+    grad = gradients_impl.gradients(tf_result, variables.trainable_variables())
+    return {'inference': tf_result, 'grad': grad}
+
+  def _MaybeResetVariables(self, variable_cache, sess, var_list):
+    """Possibly resets the variables to a previously seen value."""
+    reset_ops = []
+    fetches = []
+    for var in var_list:
+      if var.name in variable_cache:
+        reset_ops += [var.assign(variable_cache[var.name])]
+      else:
+        fetches += [(var.name, var)]
+    if reset_ops:
+      sess.run(reset_ops)
+    if fetches:
+      val = sess.run(dict(fetches))
+      for n, v in val.items():
+        assert n not in variable_cache
+        variable_cache[n] = v
+
+  def _RunRnn(self, numpy_inputs, numpy_slen, cell_name, variable_cache,
+              is_dynamic):
+    with ops.Graph().as_default() as graph:
+      tf_inputs = array_ops.placeholder(
+          dtypes.float32, shape=numpy_inputs.shape)
+      tf_slen = array_ops.placeholder(dtypes.int32)
+      feeds = {tf_inputs: numpy_inputs, tf_slen: numpy_slen}
+      cell = self._CreateCell(cell_name)
+      fn = rnn_lib.dynamic_rnn if is_dynamic else functional_rnn.functional_rnn
+      fetches = self._CreateRnnGraph(fn, cell, tf_inputs, tf_slen)
+      with self.test_session(graph=graph) as sess:
+        sess.run(variables.global_variables_initializer())
+        # Note that cell.trainable_variables it not always set.
+        self._MaybeResetVariables(variable_cache, sess,
+                                  variables.trainable_variables())
+        val = sess.run(fetches, feed_dict=feeds)
+      graph_def = graph.as_graph_def()
+      return graph_def, val
+
+  def testRunLstm(self):
+    """Runs a simple LSTM. Does not check output."""
+    np_inputs, np_slen = self._CreateInputs()
+    var_cache = {}
+    graphdef, _ = self._RunRnn(np_inputs, np_slen, 'lstm', var_cache, False)
+    logging.info('graphdef: %s', graphdef)
+    if self._LSTM_GRAPH_DEF_FILEPATH:
+      with open(self._LSTM_GRAPH_DEF_FILEPATH, 'w') as f:
+        f.write(str(graphdef))
+
+  def testLstm(self):
+    """Checks an LSTM against the reference implementation."""
+    np_inputs, np_slen = self._CreateInputs()
+    var_cache = {}
+    _, func_rnn = self._RunRnn(np_inputs, np_slen, 'lstm', var_cache, False)
+    _, dyn_rnn = self._RunRnn(np_inputs, np_slen, 'lstm', var_cache, True)
+    self.assertAllClose(dyn_rnn['inference'], func_rnn['inference'])
+    self.assertAllClose(dyn_rnn['grad'], func_rnn['grad'])
+
+  def testGru(self):
+    """Checks a GRU cell against the reference implementation."""
+    np_inputs, np_slen = self._CreateInputs()
+    var_cache = {}
+    _, func_rnn = self._RunRnn(np_inputs, np_slen, 'gru', var_cache, False)
+    _, dyn_rnn = self._RunRnn(np_inputs, np_slen, 'gru', var_cache, True)
+    self.assertAllClose(dyn_rnn['inference'], func_rnn['inference'])
+    self.assertAllClose(dyn_rnn['grad'], func_rnn['grad'])
+
+  def testStackedLstm(self):
+    """Checks a stacked LSTM cell against the reference implementation."""
+    np_inputs, np_slen = self._CreateInputs()
+    var_cache = {}
+    args = [np_inputs, np_slen, 'stacked_lstm', var_cache]
+    _, func_rnn = self._RunRnn(*(args + [False]))
+    _, dyn_rnn = self._RunRnn(*(args + [True]))
+    self.assertAllClose(dyn_rnn['inference'], func_rnn['inference'])
+    self.assertAllClose(dyn_rnn['grad'], func_rnn['grad'])
+
+
+if __name__ == '__main__':
+  test_lib.main()
diff --git a/tensorflow/contrib/recurrent/python/kernel_tests/recurrent_test.py b/tensorflow/contrib/recurrent/python/kernel_tests/recurrent_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..00fbd4fbb8205ceb649616050314e400be1785a5
--- /dev/null
+++ b/tensorflow/contrib/recurrent/python/kernel_tests/recurrent_test.py
@@ -0,0 +1,192 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for Recurrent ops."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+
+from six.moves import xrange  # pylint: disable=redefined-builtin
+
+from tensorflow.contrib.recurrent.python.ops import recurrent
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import function
+from tensorflow.python.framework import random_seed
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gradients_impl
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import random_ops
+from tensorflow.python.platform import test as test_lib
+from tensorflow.python.platform import tf_logging as logging
+
+
+_ElmanState = collections.namedtuple('ElmanState', ('h'))
+_ElmanTheta = collections.namedtuple('ElmanTheta', ('w', 'b'))
+_ElmanInputs = collections.namedtuple('ElmanInputs', ('x'))
+
+
+# TODO(drpng): add test for max length computation.
+class RecurrentTest(test_util.TensorFlowTestCase):
+
+  def testBasic(self):
+    # pylint:disable=invalid-name
+    _PolyState = collections.namedtuple('PolyState', ('value', 'x_power'))
+    _PolyTheta = collections.namedtuple('PolyTheta', ('x'))
+    _PolyInputs = collections.namedtuple('PolyInputs', ('coeff'))
+    # pylint:enable=invalid-name
+
+    def Poly(theta, state, inputs):
+      next_state = _PolyState(
+          value=state.value + inputs.coeff * state.x_power,
+          x_power=state.x_power * theta.x)
+      return next_state, []
+
+    with self.test_session() as sess:
+      theta = _PolyTheta(x=array_ops.constant(2.0))
+      state = _PolyState(
+          value=array_ops.constant(0.0),
+          x_power=array_ops.constant(1.0))
+      inputs = _PolyInputs(coeff=array_ops.constant([1., 2., 3.]))
+
+      # x = 2
+      # 1 + 2*x + 3*x^2
+      ret = recurrent.Recurrent(theta, state, inputs, Poly)
+
+      acc, state = sess.run(ret)
+      self.assertAllClose(acc.value, [1., 5., 17.])
+      self.assertAllClose(acc.x_power, [2., 4., 8.])
+      self.assertAllClose(state.value, 17.)
+      self.assertAllClose(state.x_power, 8.)
+
+      y = ret[1].value
+      dx, d_coeff = gradients_impl.gradients(ys=[y], xs=[theta.x, inputs.coeff])
+      dx_val, d_coeff_val = sess.run([dx, d_coeff])
+
+      # 2 + 6*x
+      self.assertAllClose(dx_val, 14.)
+      self.assertAllClose(d_coeff_val, [1., 2., 4.])
+
+      # acc = [1, 1+2x, 1+2x+3x^2]
+      # sum(acc) = 3 + 4x + 3x^2
+      acc = ret[0].value
+      dx, d_coeff = gradients_impl.gradients(
+          ys=[math_ops.reduce_sum(acc)], xs=[theta.x, inputs.coeff])
+      dx_val, d_coeff_val = sess.run([dx, d_coeff])
+      # 4 + 6*x
+      self.assertAllClose(dx_val, 16.)
+      self.assertAllClose(d_coeff_val, [3., 4., 4.])
+
+  @staticmethod
+  def Rand(shape):
+    return random_ops.random_uniform(
+        shape, minval=-0.2, maxval=0.2, dtype=dtypes.float64)
+
+  @staticmethod
+  def Elman(theta, state0, inputs):
+    h0, w, b, x = state0.h, theta.w, theta.b, inputs.x
+    xw = math_ops.matmul(array_ops.concat([x, h0], axis=1), w)
+    h1 = math_ops.sigmoid(xw + b)
+    state1 = _ElmanState(h=h1)
+    return (state1, state1)
+
+  @staticmethod
+  def ElmanGrad(theta, state0, inputs, extras, dstate1):
+
+    @function.Defun()
+    def Grad(h0, w, b, x, h1, dh1):
+      del b
+      # We hand-roll the gradient for the 2nd half of the cell as a demo.
+      dxwb = (dh1 * (1 - h1) * h1)
+      dxw, db = dxwb, math_ops.reduce_sum(dxwb, axis=0)
+
+      # Uses tf.gradient for the 1nd half of the cell as a demo.
+      xw = math_ops.matmul(array_ops.concat([x, h0], axis=1), w)
+      dh0, dx, dw = gradients_impl.gradients(
+          ys=[xw], xs=[h0, x, w], grad_ys=[dxw])
+
+      return dh0, dx, dw, db
+
+    dh0, dx, dw, db = Grad(state0.h, theta.w, theta.b, inputs.x,
+                           extras.h, dstate1.h)
+    dstate0 = _ElmanState(h=dh0)
+    dinputs = _ElmanInputs(x=dx)
+    return (_ElmanTheta(w=dw, b=db), dstate0, dinputs)
+
+  @staticmethod
+  def ElmanOut(state1):
+    return _ElmanState(x=state1.h)
+
+  @staticmethod
+  def ElmanOutGrad(dout):
+    return _ElmanState(h=dout.x)
+
+  def testElman(self):
+    for seqlen, use_grad in [(1, False), (1, True), (7, False), (7, True)]:
+      logging.info('== Elman: seqlen=%s, use_grad=%s', seqlen, use_grad)
+      self._ParameterizedTestElman(seqlen, use_grad)
+
+  def _ParameterizedTestElman(self, seqlen, use_grad):
+
+    with self.test_session() as sess:
+      random_seed.set_random_seed(342462)
+
+      batch = 3
+      dims = 4
+      theta = _ElmanTheta(w=RecurrentTest.Rand([2 * dims, dims]),
+                          b=RecurrentTest.Rand([dims]))
+      state0 = _ElmanState(h=RecurrentTest.Rand([batch, dims]))
+      inputs = _ElmanInputs(x=RecurrentTest.Rand([seqlen, batch, dims]))
+
+      # Statically unrolled.
+      s = state0
+      out = []
+      for i in xrange(seqlen):
+        inp = _ElmanInputs(x=inputs.x[i, :])
+        s, _ = RecurrentTest.Elman(theta, s, inp)
+        out += [s.h]
+      acc0, final0 = array_ops.stack(out), s.h
+      loss0 = math_ops.reduce_sum(acc0) + math_ops.reduce_sum(final0)
+      (dw0, db0, dh0, di0) = gradients_impl.gradients(
+          loss0, [theta.w, theta.b, state0.h, inputs.x])
+
+      acc1, final1 = recurrent.Recurrent(
+          theta=theta,
+          state0=state0,
+          inputs=inputs,
+          cell_fn=RecurrentTest.Elman,
+          cell_grad=RecurrentTest.ElmanGrad if use_grad else None)
+      assert isinstance(acc1, _ElmanState)
+      assert isinstance(final1, _ElmanState)
+      acc1, final1 = acc1.h, final1.h
+      loss1 = math_ops.reduce_sum(acc1) + math_ops.reduce_sum(final1)
+      (dw1, db1, dh1, di1) = gradients_impl.gradients(
+          loss1, [theta.w, theta.b, state0.h, inputs.x])
+
+      # Fetches a few values and compare them.
+      (acc0, acc1, final0, final1, dw0, dw1, db0, db1, dh0, dh1, di0,
+       di1) = sess.run(
+           [acc0, acc1, final0, final1, dw0, dw1, db0, db1, dh0, dh1, di0, di1])
+      self.assertAllClose(acc0, acc1)
+      self.assertAllClose(final0, final1)
+      self.assertAllClose(dw0, dw1)
+      self.assertAllClose(db0, db1)
+      self.assertAllClose(dh0, dh1)
+      self.assertAllClose(di0, di1)
+
+if __name__ == '__main__':
+  test_lib.main()
diff --git a/tensorflow/contrib/recurrent/python/ops/functional_rnn.py b/tensorflow/contrib/recurrent/python/ops/functional_rnn.py
new file mode 100644
index 0000000000000000000000000000000000000000..a085474c1bf6117ba5663139c78d8f08f71392d3
--- /dev/null
+++ b/tensorflow/contrib/recurrent/python/ops/functional_rnn.py
@@ -0,0 +1,396 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""A tf.nn.dynamic_rnn variant, built on the Recurrent class.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import copy
+
+from tensorflow.contrib.recurrent.python.ops import recurrent
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import function
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.util import nest
+
+
+def _GetDTypesFromStructure(struct):
+  dtypes_list = []
+  for x in nest.flatten(struct):
+    x = ops.convert_to_tensor(x)
+    dtypes_list.append(x.dtype)
+  return dtypes_list
+
+
+def _SetShapeFromTemplate(struct, struct_template):
+  as_list = nest.flatten(struct)
+  template_as_list = nest.flatten(struct_template)
+  for element, template in zip(as_list, template_as_list):
+    element.set_shape(template.shape)
+
+
+class _FunctionalRnnCell(object):
+  """Wrapper around RNNCell which separates state from computation.
+
+  This class accomplishes the following:
+  * Turn the cell's `__call__` function into a pure function. The global
+    side effects are separated as `theta`. They are the variables created
+    for the weights of the computation.
+  * Unless the output is aliased as part of the state, extend the state to
+    contain the output so that we store the history in `Recurrent`.
+  * Set static shapes as required.
+  """
+
+  def __init__(self, rnn_cell, seq_inputs, initial_state):
+    assert initial_state is not None
+
+    # TODO(drpng): Dtype needs to be configurable.
+    input_dtypes = [dtypes.float32] + _GetDTypesFromStructure(initial_state)
+    # See _index.
+    like_inputs_t = nest.map_structure(
+        lambda x: array_ops.stop_gradient(array_ops.gather(x, 0)), seq_inputs)
+    input_structure = (like_inputs_t, initial_state)
+
+    @function.Defun(*input_dtypes)
+    def FlatCellStep(*flat_inputs):
+      """The flattened version of `rnn_cell`."""
+      inputs_t, state0 = nest.pack_sequence_as(input_structure, flat_inputs)
+      _SetShapeFromTemplate(state0, initial_state)
+      _SetShapeFromTemplate(inputs_t, like_inputs_t)
+      outputs_t, state1 = rnn_cell(inputs_t, state0)
+      state_list = nest.flatten(state1)
+      self._output_shape = outputs_t.shape
+
+      if outputs_t in state_list:
+        output_index_in_state = state_list.index(outputs_t)
+      else:
+        output_index_in_state = None
+
+      if output_index_in_state is None:
+        self._prepend_output = True
+        self._output_state_idx = 0
+        return [outputs_t] + state_list
+      else:
+        self._output_state_idx = output_index_in_state
+        self._prepend_output = False
+        # To save memory, we don't store return the output separately
+        # from the state list, since we know it's the same.
+        return state_list
+
+    def _ToPureFunction(func):
+      # NOTE: This forces the creating of the function.
+      if func.captured_inputs:
+        pure_func = copy.copy(func)
+        # pylint: disable=protected-access
+        pure_func._extra_inputs = []
+        return pure_func
+      return func
+
+    pure_flat_cell_step = _ToPureFunction(FlatCellStep)
+
+    def CellStep(theta, extended_state0, inputs_t):
+      """Performs one time steps on structured inputs.
+
+      The purpose of this function is to turn the parameters into flattened
+      versions, and to resolve the parameter order difference between
+      `Recurrent` and `RNNCell`.
+
+      In the event the cell returns a transformed output that is not aliased
+      within its state, the `extended_state0` also contains the output as its
+      first element.
+
+      Args:
+        theta: Weights required for the computation. A structure of tensors.
+        extended_state0: the state0, and possibly the output at the previous
+          time step. A structure of tensors.
+        inputs_t: the inputs at time t.
+
+      Returns:
+        A pair of the next state (inclusive of the output), and an empty list
+        (unused `extras`).
+        The next state is congruent to state0.
+      """
+      extended_state0_flat = nest.flatten(extended_state0)
+      state0_flat = self.MaybeRemoveOutputFromState(extended_state0_flat)
+      full_inputs = [inputs_t] + state0_flat + theta
+      # Note that the thetas are additional inputs appeneded as extra
+      # parameters.
+      cell_out = pure_flat_cell_step(*full_inputs)
+      return cell_out, []
+
+    self._cell_step = CellStep
+    self._theta = FlatCellStep.captured_inputs
+    self._zero_state = rnn_cell.zero_state
+    self._state_template = initial_state
+    self._output_size = rnn_cell.output_size
+
+  @property
+  def extended_initial_state(self):
+    if self._prepend_output:
+      return [array_ops.zeros(self._output_shape), self._state_template]
+    else:
+      # The base case, where the output is just the hidden state.
+      return self._state_template
+
+  @property
+  def cell_step(self):
+    return self._cell_step
+
+  @property
+  def theta(self):
+    return self._theta
+
+  @property
+  def state_template(self):
+    return self._state_template
+
+  @property
+  def output_shape(self):
+    return self._output_shape
+
+  def GetOutputFromState(self, state):
+    return nest.flatten(state)[self._output_state_idx]
+
+  def MaybeRemoveOutputFromState(self, flat_state):
+    if self._prepend_output:
+      return flat_state[1:]
+    return flat_state
+
+
+def _ApplyLengthsToBatch(sequence_lengths, tf_output):
+  # TODO(drpng): just use Update so that we don't carry over the gradients?
+  """Sets the output to be zero at the end of the sequence."""
+  # output is batch major.
+  batch_size, max_time, vector_size = tf_output.shape
+  output_time = array_ops.tile(math_ops.range(0, max_time), [batch_size])
+  output_time = array_ops.reshape(output_time, [batch_size, max_time])
+  lengths = array_ops.tile(
+      array_ops.reshape(sequence_lengths, [-1, 1]), [1, max_time])
+  is_less = math_ops.cast(
+      math_ops.less(output_time, lengths), dtype=dtypes.float32)
+  keep_mask = array_ops.tile(
+      array_ops.expand_dims(is_less, -1),
+      [1, 1, vector_size])
+  final_output = keep_mask * tf_output
+  return final_output
+
+
+def _PickFinalStateFromHistory(acc_state, sequence_length):
+  """Implements acc_state[sequence_length - 1]."""
+  # This will work on all platforms, unlike the regular slice.
+  last_value = []
+  for state_var in nest.flatten(acc_state):
+    # We compute the following with matrix operations:
+    # last_var = state_var[sequence_length - 1]
+    shape = array_ops.shape(state_var)
+    max_time, batch_size = shape[0], shape[1]
+    output_time = array_ops.tile(math_ops.range(0, max_time), [batch_size])
+    output_time = array_ops.reshape(output_time, [batch_size, max_time])
+    lengths = array_ops.tile(array_ops.reshape(sequence_length,
+                                               [-1, 1]), [1, max_time])
+    last_idx = math_ops.cast(math_ops.equal(output_time, lengths - 1),
+                             dtype=dtypes.float32)
+    last_idx = array_ops.transpose(last_idx)
+    last_idx_for_bcast = array_ops.expand_dims(last_idx, -1)
+    sliced = math_ops.multiply(last_idx_for_bcast, state_var)
+    last_var = math_ops.reduce_sum(sliced, 0)
+    last_value += [last_var]
+  return nest.pack_sequence_as(acc_state, last_value)
+
+
+def _PostProcessOutput(extended_acc_state, extended_final_state, func_cell,
+                       total_time, inputs_lengths):
+  """Post-process output of recurrent.
+
+  This function takes the accumulated extended state and extracts the requested
+  state and output.
+
+  When `inputs_lengths` has been set, it extracts the output from the
+  accumulated state. It also sets outputs past.
+
+  It also sets the static shape information.
+
+  Args:
+    extended_acc_state: A structure containing the accumulated state at each
+      time. It may contain the output at each time as well.
+    extended_final_state: A structure containing the final state. It may
+      contain the output at the final time.
+    func_cell: The functional wrapper around the cell.
+    total_time: A scalar integer tensor.
+    inputs_lengths: An integer tensor with one entry per input.
+
+  Returns:
+    A tuple with the outputs at each time, and the final state.
+  """
+  if inputs_lengths is None:
+    flat_final_state = func_cell.MaybeRemoveOutputFromState(
+        nest.flatten(extended_final_state))
+    tf_state = nest.pack_sequence_as(func_cell.state_template, flat_final_state)
+  else:
+    # The accumulated state is over the entire sequence, so we pick it
+    # out from the acc_state sequence.
+    flat_acc_state = func_cell.MaybeRemoveOutputFromState(
+        nest.flatten(extended_acc_state))
+    acc_state = nest.pack_sequence_as(
+        func_cell.state_template, flat_acc_state)
+    tf_state = _PickFinalStateFromHistory(acc_state, inputs_lengths)
+
+  output_from_state = func_cell.GetOutputFromState(extended_acc_state)
+  tf_output = array_ops.transpose(output_from_state, [1, 0, 2])
+  tf_output.set_shape(
+      [func_cell.output_shape[0], total_time, func_cell.output_shape[1]])
+  if inputs_lengths is not None:
+    # Need set the outputs to zero.
+    tf_output = _ApplyLengthsToBatch(inputs_lengths, tf_output)
+    # tf_output = array_ops.zeros([4, 3, 5])
+  _SetShapeFromTemplate(tf_state, func_cell.state_template)
+  return tf_output, tf_state
+
+
+# pylint: disable=invalid-name
+def functional_rnn(cell, inputs, sequence_length=None,
+                   initial_state=None, dtype=None, time_major=False,
+                   scope=None, use_tpu=False):
+  """Same interface as `tf.nn.dynamic_rnn`."""
+  with variable_scope.variable_scope(scope or 'rnn'):
+    if not time_major:
+      inputs = nest.map_structure(
+          lambda t: array_ops.transpose(t, [1, 0, 2]), inputs)
+    inputs_flat = nest.flatten(inputs)
+    batch_size = array_ops.shape(inputs_flat[0])[1]
+    if initial_state is None:
+      initial_state = cell.zero_state(batch_size, dtype)
+    func_cell = _FunctionalRnnCell(cell, inputs, initial_state)
+  extended_acc_state, extended_final_state = recurrent.Recurrent(
+      theta=func_cell.theta,
+      state0=func_cell.extended_initial_state,
+      inputs=inputs,
+      cell_fn=func_cell.cell_step,
+      use_tpu=use_tpu)
+  return _PostProcessOutput(extended_acc_state, extended_final_state,
+                            func_cell, inputs_flat[0].shape[0], sequence_length)
+
+
+def bidirectional_functional_rnn(
+    cell_fw,
+    cell_bw,
+    inputs,
+    initial_state_fw=None,
+    initial_state_bw=None,
+    dtype=None,
+    sequence_length=None,
+    time_major=False,
+    use_tpu=False,
+    scope=None):
+  """Creates a bidirectional recurrent neural network.
+
+  Performs fully dynamic unrolling of inputs in both directions. Built to be API
+  compatible with `tf.nn.bidirectional_dynamic_rnn`, but implemented with
+  functional control flow for TPU compatibility.
+
+  Args:
+    cell_fw: An instance of `tf.contrib.rnn.RNNCell`.
+    cell_bw: An instance of `tf.contrib.rnn.RNNCell`.
+    inputs: The RNN inputs. If time_major == False (default), this must be a
+      Tensor (or hierarchical structure of Tensors) of shape
+      [batch_size, max_time, ...]. If time_major == True, this must be a Tensor
+      (or hierarchical structure of Tensors) of shape:
+      [max_time, batch_size, ...]. The first two dimensions must match across
+      all the inputs, but otherwise the ranks and other shape components may
+      differ.
+    initial_state_fw: An optional initial state for `cell_fw`. Should match
+      `cell_fw.zero_state` in structure and type.
+    initial_state_bw: An optional initial state for `cell_bw`. Should match
+      `cell_bw.zero_state` in structure and type.
+    dtype: (optional) The data type for the initial state and expected output.
+      Required if initial_states are not provided or RNN state has a
+      heterogeneous dtype.
+    sequence_length: An optional int32/int64 vector sized [batch_size]. Used to
+      copy-through state and zero-out outputs when past a batch element's
+      sequence length. So it's more for correctness than performance.
+    time_major: Whether the `inputs` tensor is in "time major" format.
+    use_tpu: Whether to enable TPU-compatible operation. If True, does not truly
+      reverse `inputs` in the backwards RNN. Once b/69305369 is fixed, we can
+      remove this flag.
+    scope: An optional scope name for the dynamic RNN.
+
+  Returns:
+    outputs: A tuple of `(output_fw, output_bw)`. The output of the forward and
+      backward RNN. If time_major == False (default), these will
+      be Tensors shaped: [batch_size, max_time, cell.output_size]. If
+      time_major == True, these will be Tensors shaped:
+      [max_time, batch_size, cell.output_size]. Note, if cell.output_size is a
+      (possibly nested) tuple of integers or TensorShape objects, then the
+      output for that direction will be a tuple having the same structure as
+      cell.output_size, containing Tensors having shapes corresponding to the
+      shape data in cell.output_size.
+    final_states: A tuple of `(final_state_fw, final_state_bw)`. A Tensor or
+      hierarchical structure of Tensors indicating the final cell state in each
+      direction. Must have the same structure and shape as cell.zero_state.
+
+  Raises:
+    ValueError: If `initial_state_fw` is None or `initial_state_bw` is None and
+      `dtype` is not provided.
+  """
+  # Keep this code in sync with tf.nn.dynamic_rnn for compatibility.
+  with variable_scope.variable_scope(scope or 'bidirectional_rnn'):
+    # Forward direction
+    with variable_scope.variable_scope('fw') as fw_scope:
+      output_fw, output_state_fw = functional_rnn(
+          cell=cell_fw, inputs=inputs, sequence_length=sequence_length,
+          initial_state=initial_state_fw, dtype=dtype,
+          time_major=time_major, scope=fw_scope, use_tpu=use_tpu)
+    # Backward direction
+    if not time_major:
+      time_dim = 1
+      batch_dim = 0
+    else:
+      time_dim = 0
+      batch_dim = 1
+
+    def _reverse(input_, seq_lengths, seq_dim, batch_dim):
+      if seq_lengths is not None:
+        return array_ops.reverse_sequence(
+            input=input_, seq_lengths=seq_lengths,
+            seq_dim=seq_dim, batch_dim=batch_dim)
+      else:
+        # See b/69305369.
+        assert not use_tpu, (
+            'Bidirectional with variable sequence lengths unsupported on TPU')
+        return array_ops.reverse(input_, axis=[seq_dim])
+
+    with variable_scope.variable_scope('bw') as bw_scope:
+      inputs_reverse = _reverse(
+          inputs, seq_lengths=sequence_length,
+          seq_dim=time_dim, batch_dim=batch_dim)
+      tmp, output_state_bw = functional_rnn(
+          cell=cell_bw, inputs=inputs_reverse, sequence_length=sequence_length,
+          initial_state=initial_state_bw, dtype=dtype,
+          time_major=time_major, scope=bw_scope, use_tpu=use_tpu)
+
+  output_bw = _reverse(
+      tmp, seq_lengths=sequence_length,
+      seq_dim=time_dim, batch_dim=batch_dim)
+
+  outputs = (output_fw, output_bw)
+  output_states = (output_state_fw, output_state_bw)
+
+  return (outputs, output_states)
+# pylint: enable=invalid-name
diff --git a/tensorflow/contrib/recurrent/python/ops/recurrent.py b/tensorflow/contrib/recurrent/python/ops/recurrent.py
new file mode 100644
index 0000000000000000000000000000000000000000..fa16b82ab62f27d034c3ca7584e7e1ca14be6f9b
--- /dev/null
+++ b/tensorflow/contrib/recurrent/python/ops/recurrent.py
@@ -0,0 +1,720 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Recurrent computation.
+
+The main interface of this module is Recurrent().
+A recurrent computation describes an auto-regressive process, where outputs
+of one time step are fed to the output of the next time step.
+
+This module uses:
+  theta: the "weights" each RNN uses.
+  state0: the initial state of each RNN.
+  cell_fn: A python function describing RNN cell. It must has the following
+    signature:
+         cell_fn: (theta, state0, inputs) -> (state1, extras)
+    state1 is the next RNN state, extras are computed by cell_fn
+    and the library forwards extras to cell_fn's gradient function.
+  cell_grad: A python function describing the backprop gradient function
+    for the RNN cell. It must has the following signature:
+         cell_grad: (theta, state0, inputs, extras, dstate1) -> (
+                  dtheta, dstate0, dinputs)
+    dstate1 is what the backprop algorithm provides representing
+    gradients of state1 w.r.t. the final loss.
+
+In this module, we handle structures of tensors for theta, state0, inputs,
+and extras. The structure is an arbitrarily nested python structure, such
+as a dictionary of named tuples.
+
+Because the computation is a left-to-right chain, a single in-place accumulator
+can be used rather than a stack. Thus a special gradient was written to reduce
+unnecessary memory usage.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import function
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import functional_ops
+from tensorflow.python.ops import gradients_impl
+from tensorflow.python.ops import inplace_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.inplace_ops import alias_inplace_update
+from tensorflow.python.util import nest
+
+
+def _AssertIsCompatible(a, b):
+  """Checks that `a` and `b` are nested structures of the same type."""
+  # TODO(drpng): implement.
+  del a
+  del b
+
+
+def _Index(struct, index):
+  """Returns a structure with `x[index]` for each tensor `x` in the structure.
+
+  Args:
+    struct: A structure of tensors.
+    index: A scalar integer tensor. Performance is better if `index` is
+      on the host memory.
+
+  Returns:
+    A structure of tensors congruent to `struct`.
+    For each key in `ret`, `rets[key] = struct[key][index]`.
+  """
+  index = ops.convert_to_tensor(index)
+  index.get_shape().assert_has_rank(0)
+  return nest.map_structure(lambda x: x[index], struct)
+
+
+def _Update(struct_acc, struct_x, t):
+  """Updates t-th row in accumulators.
+
+  Args:
+    struct_acc: The accumulators. A structure of tensors.
+    struct_x: The new values. A structure of tensors congruent to `struct_acc`.
+    t: A scalar integer. Performance is better if `t` is on the device
+      memory.
+
+  Returns:
+    A structure of tensors. Say, ret is a returned dictionary. Then, for
+    each key, we have:
+      ret[key] = struct_acc[key];
+      ret[key][t, :] = struct_x[key]
+  """
+  to_skip_update = set()
+  acc_lst = nest.flatten(struct_acc)
+  x_lst = nest.flatten(struct_x)
+  t = math_ops.to_int32([t])  # tf.to_int32 casts on-device tensors.
+  lst = []
+  for acc, x in zip(acc_lst, x_lst):
+    if acc in to_skip_update:
+      # Until b/62105730 is fixed, we need to avoid inplace update for tensors
+      # of rank 1.  could reshape to handle it, but we don't really need the
+      # values applied to these, so just skip their modification.
+      lst += [acc]
+    else:
+      lst += [alias_inplace_update(acc, t, array_ops.expand_dims(x, 0))]
+  return nest.pack_sequence_as(struct_acc, lst)
+
+
+def _SeqLenDim(struct):
+  """Returns the 0-th dim size of tensors in a structure of tensors.
+
+  This is the max sequence length according to the shape of the inputs.
+
+  Args:
+    struct: A structure of tensors. Every tensor's 0-th dim has the same size.
+
+  Returns:
+    A scalar tensor which is the size of 0-th dim of every tensors in struct.
+  """
+  xs = nest.flatten(struct)
+  assert xs
+  dim0 = array_ops.shape(xs[0])[0]
+  return dim0
+
+
+def _Flatten(struct):
+  """Flattens a structure."""
+  return nest.flatten(struct)
+
+
+def _Pack(elements, struct_template):
+  """Packs the list of tensors according to the structure.
+
+  In the event that `elements` should be a scalar, `struct_template` must
+  contain exactly one non-trivial element (for instance, `[[], {'x':elt}]`).
+
+  Args:
+    elements: Elements to be packed. A list of tensor, or a single tensor.
+    struct_template: The container structure in which to pack them.
+  Returns:
+    A python structure of the same type as `struct_template`, containing
+    `elements` as its contained elements.
+  """
+  if not nest.is_sequence(elements):
+    return nest.pack_sequence_as(struct_template, [elements])
+  return nest.pack_sequence_as(struct_template, elements)
+
+
+def _EmptyAcc(slen, struct_template):
+  """Creates a set of accumulators for tensors in structure.
+
+  Args:
+    slen: The sequence length. A scalar tensor.
+    struct_template: A structure of tensors.
+
+  Returns:
+    A structure congruent to `struct_template`. Say ret is a returned
+    dictionary. Then, `ret.key`, a tensor, has the same dtype as
+    `struct_template.key`. The tensor's shape has 1 more dimension
+    than the tensor `struct_template.key`. The extra 0-th dimension is of size
+    `slen`. E.g., if `slen=10` and `struct_template.key`'s shape is `[3, 5]`,
+    then, `ret.key`'s shape is `[10, 3, 5]`.
+  """
+
+  def _EmptyAccForTensor(tensor):
+    return inplace_ops.empty(
+        array_ops.concat([[slen], array_ops.shape(tensor)], axis=0),
+        tensor.dtype,
+        init=True)
+
+  return nest.map_structure(_EmptyAccForTensor, struct_template)
+
+
+def _EmptyLike(struct):
+  """Creates a set of empty initialized tensors.
+
+  Args:
+    struct: A structure of tensors.
+
+  Returns:
+    A struct of tensors. Each tensor has the same shape and dtype as
+    its corresponding tensor in `struct`. And each tensor is initialized.
+  """
+  return nest.map_structure(
+      lambda x: inplace_ops.empty_like(x, init=True), struct)
+
+
+def _Add(struct_x, struct_y):
+  """Adds tensors in `struct_x` with respective tensors in `struct_y`.
+
+  Args:
+    struct_x: A struct of tensors.
+    struct_y: A struct of tensors congruent to `struct_x`.
+
+  Returns:
+    A struct of tensors. Each element of the returned value
+  equals `x + y`, with corresponding values in `struct_x` and `struct_y`.
+  """
+  list_x = nest.flatten(struct_x)
+  list_y = nest.flatten(struct_y)
+  z = []
+  for x, y in zip(list_x, list_y):
+    z += [math_ops.add(x, y)]
+  return nest.pack_sequence_as(struct_x, z)
+
+
+def _Dtypes(struct):
+  """Returns all tensors' data types in a list."""
+  return [x.dtype for x in nest.flatten(struct)]
+
+
+def _ConvertNoneGradientToZeros(xs, dxs):
+  """Sanitize dxs so that None becomes zeros appropriately.
+
+  Args:
+    xs: A list of tensors.
+    dxs: A list of tensors. dxs[i] corresponds to xs[i]'s gradient.
+
+  Returns:
+    A structure same as `dxs` with `None` replaced by a zero tensor.
+  """
+  list_xs = nest.flatten(xs)
+  list_dxs = nest.flatten(dxs)
+
+  # If x does not get any backprop-ed gradient, propagate zeros.
+  rets = []
+  for (x, dx) in zip(list_xs, list_dxs):
+    if dx is None:
+      rets.append(array_ops.zeros_like(x))
+    else:
+      rets.append(dx)
+
+  return nest.pack_sequence_as(dxs, rets)
+
+
+# All structures are flattened for use internally. This is for simplicity
+# and also to use the Defun construct.
+# In the forward pass (inference), the computation is structured as follows.
+# Forward: [gradient = _Recurrent.Grad]
+#   Flatten structures, create accumulators.
+#   for t = 0..max_input_length:
+#     Defun ForwardLoopBody:
+#       Defun Fwd: flatten/pack around cell_fn
+#       state1 = Fwd(inputs[t], state0)
+#       acc_state += [state1]
+#   Pack structures.
+# During the backward pass (backpropping the gradient from the last time
+# step to the first, through the structure), the computation is structured
+# as follows.
+# Grad:
+#   Flatten structures.
+#   Defun Backward:
+#     Create create accumulated derivatives: d_theta, d_inputs, d_acc_state.
+#     Regarding the note at the top of the file, there is only one accumulator
+#     for d_theta accumulated over the whole sequence.
+#     for t = max_input_length -1..0:
+#       Defun BackwardLoopBody:
+#         Retrieve acc_state[t] computed in the forward pass.
+#         Defun Bak: flatten/back around cell_fn_grad.
+#         d_state1 is d_state0 from previous step (ie next time).
+#         d_acc_state[dev_t] += d_state1
+#         d_theta_t, d_state0, d_inputs_t, = Bak()
+#         d_inputs[dev_t] += d_inputs
+#         d_theta += d_theta_t
+#         d_acc_state[t] += d_state1
+#   Pack structures and return.
+class _Recurrent(object):
+  """A helper class to construct a recurrent neural net."""
+
+  def __init__(self, cell_fn, cell_grad, theta, state0, inputs,
+               max_input_length, extras, use_tpu):
+    """RNN helper class.
+
+    Args:
+      cell_fn: A python function, which computes:
+         state1, extras = cell_fn(theta, state0, inputs[t, :])
+      cell_grad: A python function which computes:
+         dtheta, dstate0, dinputs[t, :] = cell_grad(
+           theta, state0, inputs[t, :], extras, dstate1)
+      theta: weights. A structure of tensors.
+      state0: initial state. A structure of tensors.
+      inputs: inputs. A structure of tensors.
+      max_input_length: None, or the maximum effective length of the input over
+        all batches. A scalar tensor.
+      extras: A structure of tensors. The 2nd return value of every
+        invocation of cell_fn is a structure of tensors with matching keys
+        and shapes of this `extras`.
+      use_tpu: A boolean indicating whether the computation is mean to
+        run on a TPU.
+    """
+    self._theta = theta
+    self._state = state0
+    self._inputs = inputs
+    self._max_input_length = self._MaybeComputeMaxInputLength(
+        inputs, max_input_length)
+    self._cell_fn = cell_fn
+    self._cell_grad = cell_grad
+    self._extras = extras
+
+    # pylint: disable=unbalanced-tuple-unpacking
+
+    # NOTE: TF Function (Fwd, Bak, ForwardLoopBody, BackwardLoopBody,
+    # Forward and Backward defined below) simply takes a list of
+    # Tensors and returns a list of Tensors. When we pass in a
+    # structure (a list of structures of Tensors), we use _Flatten to
+    # convert the structure into a list of tensor. Conversely, the
+    # following code often uses _Pack to formulate a structure from a
+    # list of tensors based on a "template".
+
+    # Wraps cell_fn in a TF Function:
+    #    state1 = cell_fn(theta, state0, inputs)
+    fwd_sig = [self._theta, self._state, self._inputs]
+
+    compiled = use_tpu
+    noinline = not compiled
+    dev_t_type = dtypes.int32 if use_tpu else dtypes.int64
+
+    @function.Defun(*_Dtypes(fwd_sig))
+    def Fwd(*args):
+      (theta, state0, inputs) = _Pack(args, fwd_sig)
+      state1, extras = self._cell_fn(theta, state0, inputs)
+      assert not function.get_extra_args(), (
+          'cell_fn is not pure with extra args: %s.' %
+          (function.get_extra_args()))
+      _AssertIsCompatible(state1, self._state)
+      _AssertIsCompatible(extras, self._extras)
+      return _Flatten([state1, extras])
+
+    # Wraps cell_fn in a TF Function as a for-loop's body.
+    #
+    # The loop state is composed of:
+    #  t: The loop variable. Timestep id.
+    #  dev_t: The loop variable mirrored on the device.
+    #  theta: the recurrent net's weights.
+    #  state0: the previous recurrent state.
+    #  inputs: inputs to the recurrent net. inputs[t, :] are for the timestep t.
+    #  acc_state: Each timestep's computed new state is also stashed into
+    #    acc_state.
+    #  acc_extras: Each timestep's computed extras is stashed into acc_extras
+    fwdloop_sig = [
+        self._theta, self._state, self._inputs, self._state, self._extras
+    ]
+
+    @function.Defun(dtypes.int32, dev_t_type, *_Dtypes(fwdloop_sig))
+    def ForwardLoopBody(*args):
+      """The body of forward loop."""
+      t, dev_t = args[0], args[1]
+      (theta, state0, inputs, acc_state, acc_extras) = _Pack(
+          args[2:], fwdloop_sig)
+      inputs_t = _Index(inputs, t)  # external input at time step t.
+      fwd = Fwd(*_Flatten([theta, state0, inputs_t]))
+      state1, extras = _Pack(fwd, [self._state, self._extras])
+      # Saves state1 and extras in their accumulators.
+      acc_state = _Update(acc_state, state1, dev_t)
+      acc_extras = _Update(acc_extras, extras, dev_t)
+
+      return [math_ops.add(dev_t, 1)] + _Flatten(
+          [theta, state1, inputs, acc_state, acc_extras])
+
+    def Grad(op, *args):
+      """The python grad function for the Forward function."""
+
+      # NOTE: tf.gradient backprops None for int32/int64 while zeros
+      # for float32/float64. For consistency, we always backprop
+      # zeros.
+      args = list(args)
+      for i, dy in enumerate(args):
+        if dy is None:
+          args[i] = array_ops.zeros_like(op.outputs[i])
+      # TODO(drpng): getting the extra state here?
+      op_inputs = [x for x in op.inputs]
+      op_struct = [
+          self._theta, self._state, self._inputs, self._max_input_length,
+          self._extras
+      ]
+      (theta, state0, inputs, max_input_length, _) = _Pack(op_inputs, op_struct)
+      # acc_state and acc_extras are computed by the Forward pass and
+      # needed by the Backward pass.
+      acc_state, _, acc_extras = _Pack([x for x in op.outputs],
+                                       [self._state, self._state, self._extras])
+
+      # Forward computes acc_state, the final state and
+      # acc_extras. tf.gradients gives us their gradients w.r.t. the
+      # final loss. Because acc_extras are not exposed by Compute(),
+      # it has no gradients w.r.t. the final loss (i.e., by
+      # construction, it must be zeros).
+      d_acc_state, d_state1, _ = _Pack(args,
+                                       [self._state, self._state, self._extras])
+      return Backward(*_Flatten([
+          theta, state0, inputs, max_input_length, acc_state, acc_extras,
+          d_acc_state, d_state1
+      ]))
+
+    # Forward calls ForwardLoopBody n times. Each time computes one
+    # time step of the recurrent net.
+    forward_sig = [
+        self._theta, self._state, self._inputs, self._max_input_length,
+        self._extras
+    ]
+
+    @function.Defun(
+        *_Dtypes(forward_sig), python_grad_func=Grad, noinline=noinline)
+    def Forward(*args):
+      """Forward pass of the recurrent net."""
+      theta, state0, inputs, max_input_length, extras = _Pack(args, forward_sig)
+
+      slen_dim = _SeqLenDim(inputs)
+
+      # Creates accumulators for state0 and extras.
+      acc_state = _EmptyAcc(slen_dim, state0)
+      acc_extras = _EmptyAcc(slen_dim, extras)
+
+      dev_t = array_ops.constant(0, dtype=dev_t_type)
+      run = functional_ops.For(
+          start=0,
+          limit=max_input_length,
+          delta=1,
+          inputs=[dev_t] + _Flatten(
+              [theta, state0, inputs, acc_state, acc_extras]),
+          body=ForwardLoopBody,
+          rewrite_with_while=compiled)
+      _, state1, _, acc_state, acc_extras = _Pack(
+          run[1:],
+          [self._theta, self._state, self._inputs, self._state, self._extras])
+
+      return _Flatten([acc_state, state1, acc_extras])
+
+    # The per-step backward computes:
+    #    d_theta, d_state0, d_inputs = cell_grad(
+    #        theta, state0, inputs, extras, d_state1)
+    # where d_state1 is the backprop-ed gradient for state1, and
+    # extras is the computed by the forward step to facilitate the
+    # backward step.
+    bak_sig = [
+        self._theta, self._state, self._inputs, self._extras, self._state
+    ]
+
+    @function.Defun(*_Dtypes(bak_sig))
+    def Bak(*args):
+      """Backward step."""
+      (theta, state0, inputs, extras, d_state1) = _Pack(args, bak_sig)
+      (dtheta, dstate0, dinputs) = self._cell_grad(theta, state0, inputs,
+                                                   extras, d_state1)
+      assert not function.get_extra_args(), (
+          'cell_grad is not pure with extra args: %s.' %
+          (function.get_extra_args()))
+      _AssertIsCompatible(dtheta, self._theta)
+      _AssertIsCompatible(dstate0, self._state)
+      _AssertIsCompatible(dinputs, self._inputs)
+      return _Flatten(
+          _ConvertNoneGradientToZeros([theta, state0, inputs],
+                                      [dtheta, dstate0, dinputs]))
+
+    # Define defuns used by a functional_ops.If in BackwardLoopBody.
+    state_if_sig = [self._state, self._state]
+
+    @function.Defun(*_Dtypes(state_if_sig))
+    def ReturnOrigState0(*args):
+      """Returns original state0 from inputs."""
+      (_, orig_state0) = _Pack(args, state_if_sig)
+      return nest.flatten(orig_state0)
+
+    @function.Defun(*_Dtypes(state_if_sig))
+    def ReturnAccState(*args):
+      """Returns acc_state[t-1] from inputs."""
+      (acc_state, _) = _Pack(args, state_if_sig)
+      return nest.flatten(acc_state)
+
+    # Wraps cell_grad gradient function in a TF Function as a
+    # for-loop's body for the Backward pass.
+    #
+    # The loop state is composed of:
+    #  t: The loop variable. Timestep id.
+    #  state0: the initial state for the entire backward loop.
+    #  dev_t: The loop variable mirrored on the device.
+    #  theta: the recurrent net's weights.
+    #  inputs: inputs to the recurrent net. inputs[t, :] are for the timestep t.
+    #  acc_state: Each timestep's computed new state was stashed into
+    #    acc_state by the Forward pass.
+    #  acc_extras: Each timestep's computed extras was stashed into
+    #    acc_extras by the Forward pass.
+    #  d_theta: All timestep's gradient for theta is accumulated (added) into
+    #      d_theta.
+    #  d_state1: The backprop-ed gradient for the new stated computed by
+    #      timestep t.
+    #  d_inputs: d_inputs[t, :] is populated by the backward time step t.
+    #  d_acc_state: The backprop-ed gradient for acc_state.
+    bakloop_sig = [
+        self._theta, self._state, self._inputs, self._state, self._extras,
+        self._theta, self._state, self._inputs, self._state
+    ]
+
+    @function.Defun(dtypes.int32, dev_t_type, *_Dtypes(bakloop_sig))
+    def BackwardLoopBody(*args):
+      """Backward loop body function."""
+      t, dev_t = args[0], args[1]
+      (theta, orig_state0, inputs, acc_state, acc_extras, d_theta, d_state1,
+       d_inputs, d_acc_state) = _Pack(args[2:], bakloop_sig)
+
+      # The input recurrent state for time step t is previous time step's
+      # output, or the original state0 when on time step 0.
+      state_from_acc = _Index(acc_state, math_ops.maximum(0, t - 1))
+      state0 = functional_ops.If(
+          math_ops.equal(t, array_ops.constant(0, dtypes.int32)),
+          _Flatten([state_from_acc, orig_state0]), ReturnOrigState0,
+          ReturnAccState)
+      state0 = nest.pack_sequence_as(orig_state0, state0)
+
+      # The external inputs for time step t.
+      inputs_t = _Index(inputs, t)
+      # The extras for time step t.
+      extras_t = _Index(acc_extras, t)
+
+      d_state1 = _Add(_Index(d_acc_state, t), d_state1)
+      (d_theta_t, d_state0, d_inputs_t) = _Pack(
+          Bak(*_Flatten([theta, state0, inputs_t, extras_t, d_state1])),
+          [self._theta, self._state, self._inputs])
+      d_theta = _Add(d_theta, d_theta_t)
+      d_inputs = _Update(d_inputs, d_inputs_t, dev_t)
+      return [math_ops.subtract(dev_t, 1)] + _Flatten([
+          theta, orig_state0, inputs, acc_state, acc_extras, d_theta, d_state0,
+          d_inputs, d_acc_state
+      ])
+
+    # Backward calls BackwardLoopBody n times.  Each time computes the backprop
+    # for one time step of the recurrent net.
+    backward_sig = [
+        self._theta, self._state, self._inputs, self._max_input_length,
+        self._state, self._extras, self._state, self._state
+    ]
+
+    @function.Defun(*_Dtypes(backward_sig), noinline=noinline)
+    def Backward(*args):
+      """Backward pass for the recurrent net."""
+      # theta, state0, inputs are Forward's inputs.
+      # acc_state is the accumulated 1st output of Forward.
+      # acc_extras is the accumulated 2nd output of Forward.
+      # d_acc_state is the gradient for acc_state.
+      # d_state1 is the gradient for the final state computed by Forward.
+      (theta, state0, inputs, max_input_length, acc_state, acc_extras,
+       d_acc_state, d_state1) = _Pack(args, backward_sig)
+
+      # Accumulators for gradients.
+      d_theta = _EmptyLike(theta)
+      d_inputs = _EmptyLike(inputs)
+
+      # Loop backwards. Note the loop's limit is open-ended, so goes through
+      # t=0.
+      t = max_input_length - 1
+      dev_t = math_ops.to_int32(t) if use_tpu else math_ops.to_int64(t)
+      run = functional_ops.For(
+          start=t,
+          limit=-1,
+          delta=-1,
+          inputs=[dev_t] + _Flatten([
+              theta, state0, inputs, acc_state, acc_extras, d_theta, d_state1,
+              d_inputs, d_acc_state
+          ]),
+          body=BackwardLoopBody,
+          rewrite_with_while=compiled)
+
+      (theta, state0, inputs, acc_state, acc_extras, d_theta, d_state0,
+       d_inputs, d_acc_state) = _Pack(run[1:], bakloop_sig)
+
+      d_max_input_length = array_ops.constant(0, dtype=max_input_length.dtype)
+      return _Flatten(
+          [d_theta, d_state0, d_inputs, d_max_input_length, acc_extras])
+
+    self._forward = Forward
+
+  def _MaybeComputeMaxInputLength(self, inputs, max_input_length):
+    if max_input_length is not None:
+      return max_input_length
+    return math_ops.reduce_max(array_ops.shape(nest.flatten(inputs)[0])[0])
+
+  def Compute(self):
+    return _Pack(
+        self._forward(*_Flatten([
+            self._theta, self._state, self._inputs, self._max_input_length,
+            self._extras
+        ])), [self._state, self._state, self._extras])[:2]
+
+
+def _GetCellGrad(cell_fn, cell_grad):
+  """Returns the gradient function for cell_fn.
+
+  Args:
+    cell_fn: The recurrent neural net's cell function.
+    cell_grad: If not None, cell_fn's gradient function.
+
+  Returns:
+    Returns cell_grad if not None. Otherwise, assume cell_fn is a python
+    function representing the recurrent neural net's cell function, i.e.,
+      cell_fn: (theta, state0, inputs) -> (state1, extra)
+    returns its default gradient python function, i.e.,
+      cell_grad: (theta, state0, inputs, extras, dstate1) -> (
+                  dtheta, dstate0, dinputs)
+  """
+
+  if cell_grad:
+    return cell_grad
+
+  def CellGrad(theta, state0, inputs, extras, dstate1):
+    """Default gradient function for cell_fn."""
+    # NOTE: The default grad function recomputes the forward
+    # function and does not take advantage of 'extras' returned by
+    # the forward function.
+    del extras
+    state1, extras = cell_fn(theta, state0, inputs)
+    ys = _Flatten([state1])
+    xs = _Flatten([theta, state0, inputs])
+    grad_ys = _Flatten([dstate1])
+    grads = gradients_impl.gradients(ys=ys, xs=xs, grad_ys=grad_ys)
+    return _ConvertNoneGradientToZeros([theta, state0, inputs],
+                                       _Pack(grads, [theta, state0, inputs]))
+
+  return CellGrad
+
+
+def _IsSingleTimeStep(inputs, max_input_length):
+  """Returns True only if the time dimension of inputs is 1."""
+  if not isinstance(max_input_length, ops.Tensor):
+    return max_input_length == 1
+  for x in nest.flatten(inputs):
+    if x.shape.dims is None or x.shape[0].value != 1:
+      return False
+  return True
+
+
+def Recurrent(theta,
+              state0,
+              inputs,
+              cell_fn,
+              cell_grad=None,
+              extras=None,
+              max_input_length=None,
+              use_tpu=False):
+  """Compute a recurrent neural net.
+
+  Roughly, Recurrent() computes the following:
+    state = state0
+    for t in inputs' sequence length:
+      state = cell_fn(theta, state, inputs[t, :])
+      accumulate_state[t, :] = state
+    return accumulate_state, state
+
+  theta, state, inputs are all structures of tensors.
+
+  inputs[t, :] means taking a slice out from every tensor in the inputs.
+
+  accumulate_state[t, :] = state means that we stash every tensor in
+  'state' into a slice of the corresponding tensor in
+  accumulate_state.
+
+  cell_fn is a python callable computing (building up a TensorFlow
+  graph) the recurrent neural network's one forward step. Two calls of
+  cell_fn must describe two identical computations.
+
+  By construction, Recurrent()'s backward computation does not access
+  any intermediate values computed by cell_fn during forward
+  computation. We may extend Recurrent() to support that by taking a
+  customized backward function of cell_fn.
+
+  Args:
+    theta: weights. A structure of tensors.
+    state0: initial state. A structure of tensors.
+    inputs: inputs. A structure of tensors.
+    cell_fn: A python function, which computes:
+      state1, extras = cell_fn(theta, state0, inputs[t, :])
+    cell_grad: A python function which computes:
+      dtheta, dstate0, dinputs[t, :] = cell_grad(
+        theta, state0, inputs[t, :], extras, dstate1)
+    extras: A structure of tensors. The 2nd return value of every
+      invocation of cell_fn is a structure of tensors with matching keys
+      and shapes of  this `extras`.
+    max_input_length: maximum length of effective input. This is used to
+      truncate the computation if the inputs have been allocated to a
+      larger size. A scalar tensor.
+    use_tpu: whether or not we are on TPU.
+
+  Returns:
+    accumulate_state and the final state.
+  """
+  if cell_grad is None and _IsSingleTimeStep(inputs, max_input_length):
+    # The seqlen length is staticly known as 1. Hence, we just need to
+    # call cell_fn once without putting it into a loop.
+    inputs = nest.map_structure(lambda x: array_ops.squeeze(x, axis=0), inputs)
+    state1, _ = cell_fn(theta, state0, inputs)
+    acc_state = nest.map_structure(lambda x: array_ops.expand_dims(x, axis=0),
+                                   state1)
+    return acc_state, state1
+
+  # If cell_grad is not given, derives the gradient function from
+  # cell_fn.
+  cell_grad = _GetCellGrad(cell_fn, cell_grad)
+
+  if extras is None:
+    # Derives 'extras' so that we can allocate extras' accumulator.
+    _, extras = cell_fn(theta, state0, _Index(inputs, 0))
+    extras = nest.map_structure(array_ops.zeros_like, extras)
+  else:
+    _, actual = cell_fn(theta, state0, _Index(inputs, 0))
+    _AssertIsCompatible(extras, actual)
+
+  return _Recurrent(
+      cell_fn=cell_fn,
+      cell_grad=cell_grad,
+      theta=theta,
+      state0=state0,
+      inputs=inputs,
+      max_input_length=max_input_length,
+      extras=extras,
+      use_tpu=use_tpu).Compute()
diff --git a/tensorflow/experimental_api.py b/tensorflow/contrib/recurrent/python/recurrent_api.py
similarity index 53%
rename from tensorflow/experimental_api.py
rename to tensorflow/contrib/recurrent/python/recurrent_api.py
index 63a8aa9cb1dc130a7999c3b248815633998c4cd0..ffe1dcf7dc49554db56ee8e8fabedf976310a554 100644
--- a/tensorflow/experimental_api.py
+++ b/tensorflow/contrib/recurrent/python/recurrent_api.py
@@ -1,4 +1,4 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,26 +12,17 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
-# Bring in all of the public TensorFlow interface into this
-# module.
+"""Recurrent computations library."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-# pylint: disable=g-bad-import-order
-from tensorflow.python import pywrap_tensorflow  # pylint: disable=unused-import
-# pylint: disable=wildcard-import
-from tensorflow.tools.api.generator.api import *  # pylint: disable=redefined-builtin
-# pylint: enable=wildcard-import
-
-from tensorflow.python.util.lazy_loader import LazyLoader
-contrib = LazyLoader('contrib', globals(), 'tensorflow.contrib')
-del LazyLoader
-
-from tensorflow.python.platform import flags  # pylint: disable=g-import-not-at-top
-app.flags = flags  # pylint: disable=undefined-variable
+# pylint: disable=unused-import
+from tensorflow.contrib.recurrent.python.ops import functional_bidirectional_rnn
+from tensorflow.contrib.recurrent.python.ops import functional_rnn
+from tensorflow.contrib.recurrent.python.ops import Recurrent
+# pylint: enable=unused-import
 
 del absolute_import
 del division
diff --git a/tensorflow/contrib/reduce_slice_ops/BUILD b/tensorflow/contrib/reduce_slice_ops/BUILD
index b31f4488f5882a0bc4e419668dba5da72d69b7fe..02b3d66e4612d0f7eb29959d6c9f8472379fe16c 100644
--- a/tensorflow/contrib/reduce_slice_ops/BUILD
+++ b/tensorflow/contrib/reduce_slice_ops/BUILD
@@ -101,15 +101,3 @@ tf_cc_test(
         "//tensorflow/core:testlib",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/remote_fused_graph/pylib/BUILD b/tensorflow/contrib/remote_fused_graph/pylib/BUILD
index 27f0a7f58f78135f1d73ae04bd1e76ef496fa549..3aa8a14f44f38de51ed61f0b894cfd77ea9329f8 100644
--- a/tensorflow/contrib/remote_fused_graph/pylib/BUILD
+++ b/tensorflow/contrib/remote_fused_graph/pylib/BUILD
@@ -38,7 +38,6 @@ py_test(
     size = "small",
     srcs = ["python/ops/remote_fused_graph_ops_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_windows"],
     deps = [
         ":remote_fused_graph_ops_py",
         "//tensorflow/core:protos_all_py",
@@ -48,15 +47,3 @@ py_test(
         "//third_party/py/numpy",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/resampler/BUILD b/tensorflow/contrib/resampler/BUILD
index f0ecc8b85a5db93075d3cf0b55e7df95732bcf94..48345d7030bea431152bbed934af9f500f2c15c5 100644
--- a/tensorflow/contrib/resampler/BUILD
+++ b/tensorflow/contrib/resampler/BUILD
@@ -85,14 +85,3 @@ cuda_py_test(
         "//tensorflow/python:array_ops",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/rnn/BUILD b/tensorflow/contrib/rnn/BUILD
index 7e5e35d0b55c97946c022e55180765d982eaa87a..43c0f7595590802aa80e1012967d377a6ab83d29 100644
--- a/tensorflow/contrib/rnn/BUILD
+++ b/tensorflow/contrib/rnn/BUILD
@@ -321,19 +321,6 @@ tf_cc_test(
     ],
 )
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-            "tools/**",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 tf_gen_op_libs(
     op_lib_names = [
         "lstm_ops",
diff --git a/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py b/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py
index 63fdd91d368d97007280871f3886e5649e6b2e86..c7d85862f65674f60c9f63fd5c649afa75b95cc0 100644
--- a/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py
+++ b/tensorflow/contrib/rnn/python/kernel_tests/rnn_cell_test.py
@@ -842,12 +842,12 @@ class RNNCellTest(test.TestCase):
       batch_size = 3
       input_size = 4
       expected_state_c = np.array(
-          [[6.450831e-04, 4.697885e-04], [9.862894e-05, 7.212213e-04],
-           [4.401947e-04, 9.143004e-04]],
+          [[0.00072015, 0.00036633], [0.00083481, 0.00047266],
+           [0.00085111, 0.00053054]],
           dtype=np.float32)
       expected_state_h = np.array(
-          [[4.621217e-04, 3.365449e-04], [7.438179e-05, 5.439147e-04],
-           [3.347936e-04, 6.953785e-04]],
+          [[0.0005159, 0.00026243], [0.00062958, 0.00035646],
+           [0.00064732, 0.00040351]],
           dtype=np.float32)
       with variable_scope.variable_scope(
           "root", initializer=init_ops.constant_initializer(0.5)):
diff --git a/tensorflow/contrib/rnn/python/ops/rnn_cell.py b/tensorflow/contrib/rnn/python/ops/rnn_cell.py
index 2f6ae9f3678e58dae67bf777991641b10e42ef94..b12e2cd5eddc3f8abdba62781692673a40e41d9b 100644
--- a/tensorflow/contrib/rnn/python/ops/rnn_cell.py
+++ b/tensorflow/contrib/rnn/python/ops/rnn_cell.py
@@ -2891,7 +2891,7 @@ class WeightNormLSTMCell(rnn_cell_impl.RNNCell):
 
     output_size = weight.get_shape().as_list()[1]
     g = vs.get_variable(name, [output_size], dtype=weight.dtype)
-    return nn_impl.l2_normalize(weight, dim=0) * g
+    return nn_impl.l2_normalize(weight, axis=0) * g
 
   def _linear(self,
               args,
diff --git a/tensorflow/contrib/rpc/BUILD b/tensorflow/contrib/rpc/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..597f18c77197127cf99a3fbd0d2d22cac9131792
--- /dev/null
+++ b/tensorflow/contrib/rpc/BUILD
@@ -0,0 +1,13 @@
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+py_library(
+    name = "rpc",
+    srcs = [
+        "__init__.py",
+    ],
+    deps = ["//tensorflow/contrib/rpc/python/ops:rpc_op_py"],
+)
diff --git a/tensorflow/contrib/rpc/__init__.py b/tensorflow/contrib/rpc/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..c65c1a05def92b91ddd75e7aecdb4e4d9b8abe8a
--- /dev/null
+++ b/tensorflow/contrib/rpc/__init__.py
@@ -0,0 +1,28 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Ops and modules related to RPC.
+
+@@rpc
+@@try_rpc
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.rpc.python.ops.rpc_op import rpc
+from tensorflow.contrib.rpc.python.ops.rpc_op import try_rpc
+
+from tensorflow.python.util.all_util import remove_undocumented
+remove_undocumented(__name__)
diff --git a/tensorflow/contrib/rpc/python/ops/BUILD b/tensorflow/contrib/rpc/python/ops/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..84d2a1832f14b61ec313e7a1a00b0672bc410cfb
--- /dev/null
+++ b/tensorflow/contrib/rpc/python/ops/BUILD
@@ -0,0 +1,24 @@
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+load("//tensorflow:tensorflow.bzl", "tf_gen_op_wrapper_py")
+
+py_library(
+    name = "rpc_op_py",
+    srcs = ["rpc_op.py"],
+    deps = [
+        ":gen_rpc_op_py",
+        "//tensorflow/python:framework_ops",
+    ],
+)
+
+tf_gen_op_wrapper_py(
+    name = "gen_rpc_op_py",
+    out = "gen_rpc_op.py",
+    deps = [
+        "//tensorflow/core:rpc_ops_op_lib",
+    ],
+)
diff --git a/tensorflow/contrib/rpc/python/ops/rpc_op.py b/tensorflow/contrib/rpc/python/ops/rpc_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..e1b6c41137828950e73757579dca1eba4adf2ae4
--- /dev/null
+++ b/tensorflow/contrib/rpc/python/ops/rpc_op.py
@@ -0,0 +1,26 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+# pylint: disable=wildcard-import,unused-import
+"""RPC communication."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.rpc.python.ops.gen_rpc_op import rpc
+from tensorflow.contrib.rpc.python.ops.gen_rpc_op import try_rpc
+from tensorflow.python.framework import ops
+ops.NotDifferentiable("Rpc")
+ops.NotDifferentiable("TryRpc")
diff --git a/tensorflow/contrib/saved_model/BUILD b/tensorflow/contrib/saved_model/BUILD
index 245fe07f2bcdaddb2bc47c0e1234dc1f19bd85e3..26fd4e2023806765ea4088f4c13a780ca7338bff 100644
--- a/tensorflow/contrib/saved_model/BUILD
+++ b/tensorflow/contrib/saved_model/BUILD
@@ -48,15 +48,14 @@ py_library(
     ],
 )
 
-py_test(
-    name = "reader_test",
-    size = "small",
-    srcs = ["python/saved_model/reader_test.py"],
+py_library(
+    name = "reader",
+    srcs = ["python/saved_model/reader.py"],
     srcs_version = "PY2AND3",
-    visibility = ["//visibility:private"],
+    tags = ["no_windows"],  # TODO: needs investigation on Windows
+    visibility = ["//visibility:public"],
     deps = [
         ":saved_model_py",
-        "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:lib",
         "//tensorflow/python:variables",
@@ -65,6 +64,19 @@ py_test(
     ],
 )
 
+py_test(
+    name = "reader_test",
+    size = "small",
+    srcs = ["python/saved_model/reader_test.py"],
+    srcs_version = "PY2AND3",
+    tags = ["no_windows"],  # TODO: needs investigation on Windows
+    visibility = ["//visibility:private"],
+    deps = [
+        ":reader",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
 py_test(
     name = "signature_def_utils_test",
     size = "small",
@@ -81,15 +93,3 @@ py_test(
         "//tensorflow/python/saved_model:utils",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/saved_model/cc/saved_model/BUILD b/tensorflow/contrib/saved_model/cc/saved_model/BUILD
index ea4da80ba38389fb63a5dfe3cf608b959939c7ca..3c616c555b88cf5ec948bef3df5c2fef5caed0d4 100644
--- a/tensorflow/contrib/saved_model/cc/saved_model/BUILD
+++ b/tensorflow/contrib/saved_model/cc/saved_model/BUILD
@@ -49,9 +49,3 @@ tf_cc_test(
         "//tensorflow/core:test_main",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(["*"]),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/seq2seq/BUILD b/tensorflow/contrib/seq2seq/BUILD
index ab80c68b1a8e4ff151494e393b68c460846fa8fe..a62069a252155a8bd1c6251d9dd3a4564a81c295 100644
--- a/tensorflow/contrib/seq2seq/BUILD
+++ b/tensorflow/contrib/seq2seq/BUILD
@@ -211,15 +211,3 @@ cuda_py_test(
         "//tensorflow/python:variables",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py b/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py
index 07b3ad71d4698b990fc5fbb1dc30fc787872d495..d508cf3f9db81aa7c3a1174ed13f2310b0595b04 100644
--- a/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py
+++ b/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py
@@ -353,6 +353,42 @@ class AttentionWrapperTest(test.TestCase):
         attention_mechanism_depth=9,
         name='testLuongNotNormalized')
 
+  def testLuongScaledDType(self):
+    # Test case for GitHub issue 18099
+    for dtype in [np.float16, np.float32, np.float64]:
+      num_units = 128
+      encoder_outputs = array_ops.placeholder(dtype, shape=[64, None, 256])
+      encoder_sequence_length = array_ops.placeholder(dtypes.int32, shape=[64])
+      decoder_inputs = array_ops.placeholder(dtype, shape=[64, None, 128])
+      decoder_sequence_length = array_ops.placeholder(dtypes.int32, shape=[64])
+      batch_size = 64
+      attention_mechanism = wrapper.LuongAttention(
+          num_units=num_units,
+          memory=encoder_outputs,
+          memory_sequence_length=encoder_sequence_length,
+          scale=True,
+          dtype=dtype,
+      )
+      cell = rnn_cell.LSTMCell(num_units)
+      cell = wrapper.AttentionWrapper(cell, attention_mechanism)
+
+      helper = helper_py.TrainingHelper(decoder_inputs,
+                                        decoder_sequence_length)
+      my_decoder = basic_decoder.BasicDecoder(
+          cell=cell,
+          helper=helper,
+          initial_state=cell.zero_state(
+              dtype=dtype, batch_size=batch_size))
+
+      final_outputs, final_state, _ = decoder.dynamic_decode(my_decoder)
+      self.assertTrue(
+          isinstance(final_outputs, basic_decoder.BasicDecoderOutput))
+      self.assertEqual(final_outputs.rnn_output.dtype, dtype)
+      self.assertTrue(
+          isinstance(final_state, wrapper.AttentionWrapperState))
+      self.assertTrue(
+          isinstance(final_state.cell_state, rnn_cell.LSTMStateTuple))
+
   def testLuongScaled(self):
     create_attention_mechanism = functools.partial(
         wrapper.LuongAttention, scale=True)
diff --git a/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py b/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
index be537798268b7938bb68e7d96ae2a1d51685433f..f0f143ddfcf17c0e471add804ac4920b02da68e0 100644
--- a/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
+++ b/tensorflow/contrib/seq2seq/python/ops/attention_wrapper.py
@@ -339,7 +339,8 @@ def _luong_score(query, keys, scale):
   if scale:
     # Scalar used in weight scaling
     g = variable_scope.get_variable(
-        "attention_g", dtype=dtype, initializer=1.)
+        "attention_g", dtype=dtype,
+        initializer=init_ops.ones_initializer, shape=())
     score = g * score
   return score
 
@@ -609,8 +610,8 @@ def monotonic_attention(p_choose_i, previous_attention, mode):
   addition, once an input sequence element is attended to at a given output
   timestep, elements occurring before it cannot be attended to at subsequent
   output timesteps.  This function generates attention distributions according
-  to these assumptions.  For more information, see ``Online and Linear-Time
-  Attention by Enforcing Monotonic Alignments''.
+  to these assumptions.  For more information, see `Online and Linear-Time
+  Attention by Enforcing Monotonic Alignments`.
 
   Args:
     p_choose_i: Probability of choosing input sequence/memory element i.  Should
diff --git a/tensorflow/contrib/session_bundle/BUILD b/tensorflow/contrib/session_bundle/BUILD
index 75a753ed89a5ea13b7b79f480511979c38f321e3..9c0885918071c25ab65cb4044bc19ea22c55442a 100644
--- a/tensorflow/contrib/session_bundle/BUILD
+++ b/tensorflow/contrib/session_bundle/BUILD
@@ -17,18 +17,6 @@ load(
     "tf_cc_test",
 )
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-            "g3doc/sitemap.md",
-        ],
-    ),
-)
-
 # TODO(b/32673259): add a test to continuously validate these files.
 filegroup(
     name = "session_bundle_half_plus_two",
@@ -163,6 +151,7 @@ py_test(
     name = "gc_test",
     srcs = ["gc_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["no_windows"],  # TODO: needs investigation on Windows
     visibility = ["//visibility:private"],
     deps = [
         ":gc",
diff --git a/tensorflow/contrib/session_bundle/example/BUILD b/tensorflow/contrib/session_bundle/example/BUILD
index dbbae01f3661b81f35350470c08ec65b3488b7fc..9a56eab431d66c53c1c51341d48bf47eb8926829 100644
--- a/tensorflow/contrib/session_bundle/example/BUILD
+++ b/tensorflow/contrib/session_bundle/example/BUILD
@@ -10,19 +10,6 @@ exports_files(["LICENSE"])
 
 # vardef("PYTHON_BIN_PATH", "/usr/bin/python")
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-            "g3doc/sitemap.md",
-        ],
-    ),
-    visibility = ["//visibility:public"],
-)
-
 py_binary(
     name = "export_half_plus_two",
     srcs = [
diff --git a/tensorflow/contrib/session_bundle/session_bundle_test.cc b/tensorflow/contrib/session_bundle/session_bundle_test.cc
index 6d997bac9ee8e0fe242455686cc00a016d9bd768..612623ae309f6393beb258138b7b795c2a25d4e1 100644
--- a/tensorflow/contrib/session_bundle/session_bundle_test.cc
+++ b/tensorflow/contrib/session_bundle/session_bundle_test.cc
@@ -30,6 +30,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/public/session.h"
@@ -239,8 +240,8 @@ TEST(LoadSessionBundleFromPath, BasicTestRunOptionsThreadPoolInvalid) {
 
   // Expect failed session run calls with invalid run-options.
   EXPECT_FALSE(status.ok());
-  EXPECT_TRUE(StringPiece(status.error_message())
-                  .contains("Invalid inter_op_thread_pool: 2"))
+  EXPECT_TRUE(str_util::StrContains(status.error_message(),
+                                    "Invalid inter_op_thread_pool: 2"))
       << status.error_message();
 }
 
@@ -314,8 +315,8 @@ TEST_F(SessionBundleTest, ServingGraphEmpty) {
   });
   status_ = LoadSessionBundleFromPath(options_, path, &bundle_);
   EXPECT_FALSE(status_.ok());
-  EXPECT_TRUE(StringPiece(status_.error_message())
-                  .contains("Expected exactly one serving GraphDef"))
+  EXPECT_TRUE(str_util::StrContains(status_.error_message(),
+                                    "Expected exactly one serving GraphDef"))
       << status_.error_message();
 }
 
@@ -330,8 +331,9 @@ TEST_F(SessionBundleTest, ServingGraphAnyIncorrectType) {
   });
   status_ = LoadSessionBundleFromPath(options_, path, &bundle_);
   EXPECT_FALSE(status_.ok());
-  EXPECT_TRUE(StringPiece(status_.error_message())
-                  .contains("Expected Any type_url for: tensorflow.GraphDef"))
+  EXPECT_TRUE(
+      str_util::StrContains(status_.error_message(),
+                            "Expected Any type_url for: tensorflow.GraphDef"))
       << status_.error_message();
 }
 
@@ -347,7 +349,8 @@ TEST_F(SessionBundleTest, ServingGraphAnyValueCorrupted) {
   });
   status_ = LoadSessionBundleFromPath(options_, path, &bundle_);
   EXPECT_FALSE(status_.ok());
-  EXPECT_TRUE(StringPiece(status_.error_message()).contains("Failed to unpack"))
+  EXPECT_TRUE(
+      str_util::StrContains(status_.error_message(), "Failed to unpack"))
       << status_.error_message();
 }
 
@@ -362,9 +365,9 @@ TEST_F(SessionBundleTest, AssetFileAnyIncorrectType) {
   });
   status_ = LoadSessionBundleFromPath(options_, path, &bundle_);
   EXPECT_FALSE(status_.ok());
-  EXPECT_TRUE(
-      StringPiece(status_.error_message())
-          .contains("Expected Any type_url for: tensorflow.serving.AssetFile"))
+  EXPECT_TRUE(str_util::StrContains(
+      status_.error_message(),
+      "Expected Any type_url for: tensorflow.serving.AssetFile"))
       << status_.error_message();
 }
 
@@ -380,7 +383,8 @@ TEST_F(SessionBundleTest, AssetFileAnyValueCorrupted) {
   });
   status_ = LoadSessionBundleFromPath(options_, path, &bundle_);
   EXPECT_FALSE(status_.ok());
-  EXPECT_TRUE(StringPiece(status_.error_message()).contains("Failed to unpack"))
+  EXPECT_TRUE(
+      str_util::StrContains(status_.error_message(), "Failed to unpack"))
       << status_.error_message();
 }
 
@@ -395,8 +399,8 @@ TEST_F(SessionBundleTest, InitOpTooManyValues) {
   });
   status_ = LoadSessionBundleFromPath(options_, path, &bundle_);
   EXPECT_FALSE(status_.ok());
-  EXPECT_TRUE(StringPiece(status_.error_message())
-                  .contains("Expected exactly one serving init op"))
+  EXPECT_TRUE(str_util::StrContains(status_.error_message(),
+                                    "Expected exactly one serving init op"))
       << status_.error_message();
 }
 
diff --git a/tensorflow/contrib/session_bundle/signature_test.cc b/tensorflow/contrib/session_bundle/signature_test.cc
index 741b7fde9bdb40e8d0d7e4396676dfff036970d6..b1ff55552e0932ddc100adc4a257016fa3923120 100644
--- a/tensorflow/contrib/session_bundle/signature_test.cc
+++ b/tensorflow/contrib/session_bundle/signature_test.cc
@@ -26,6 +26,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/public/session.h"
 
@@ -33,8 +34,8 @@ namespace tensorflow {
 namespace serving {
 namespace {
 
-static bool HasSubstr(const string& base, const string& substr) {
-  bool ok = StringPiece(base).contains(substr);
+static bool HasSubstr(StringPiece base, StringPiece substr) {
+  bool ok = str_util::StrContains(base, substr);
   EXPECT_TRUE(ok) << base << ", expected substring " << substr;
   return ok;
 }
@@ -69,8 +70,8 @@ TEST(GetClassificationSignature, MissingSignature) {
   ClassificationSignature signature;
   const Status status = GetClassificationSignature(meta_graph_def, &signature);
   ASSERT_FALSE(status.ok());
-  EXPECT_TRUE(StringPiece(status.error_message())
-                  .contains("Expected a classification signature"))
+  EXPECT_TRUE(str_util::StrContains(status.error_message(),
+                                    "Expected a classification signature"))
       << status.error_message();
 }
 
@@ -86,8 +87,8 @@ TEST(GetClassificationSignature, WrongSignatureType) {
   ClassificationSignature signature;
   const Status status = GetClassificationSignature(meta_graph_def, &signature);
   ASSERT_FALSE(status.ok());
-  EXPECT_TRUE(StringPiece(status.error_message())
-                  .contains("Expected a classification signature"))
+  EXPECT_TRUE(str_util::StrContains(status.error_message(),
+                                    "Expected a classification signature"))
       << status.error_message();
 }
 
@@ -122,8 +123,8 @@ TEST(GetNamedClassificationSignature, MissingSignature) {
   const Status status =
       GetNamedClassificationSignature("foo", meta_graph_def, &signature);
   ASSERT_FALSE(status.ok());
-  EXPECT_TRUE(StringPiece(status.error_message())
-                  .contains("Missing signature named \"foo\""))
+  EXPECT_TRUE(str_util::StrContains(status.error_message(),
+                                    "Missing signature named \"foo\""))
       << status.error_message();
 }
 
@@ -141,9 +142,9 @@ TEST(GetNamedClassificationSignature, WrongSignatureType) {
   const Status status =
       GetNamedClassificationSignature("foo", meta_graph_def, &signature);
   ASSERT_FALSE(status.ok());
-  EXPECT_TRUE(
-      StringPiece(status.error_message())
-          .contains("Expected a classification signature for name \"foo\""))
+  EXPECT_TRUE(str_util::StrContains(
+      status.error_message(),
+      "Expected a classification signature for name \"foo\""))
       << status.error_message();
 }
 
@@ -176,8 +177,8 @@ TEST(GetRegressionSignature, MissingSignature) {
   RegressionSignature signature;
   const Status status = GetRegressionSignature(meta_graph_def, &signature);
   ASSERT_FALSE(status.ok());
-  EXPECT_TRUE(StringPiece(status.error_message())
-                  .contains("Expected a regression signature"))
+  EXPECT_TRUE(str_util::StrContains(status.error_message(),
+                                    "Expected a regression signature"))
       << status.error_message();
 }
 
@@ -193,8 +194,8 @@ TEST(GetRegressionSignature, WrongSignatureType) {
   RegressionSignature signature;
   const Status status = GetRegressionSignature(meta_graph_def, &signature);
   ASSERT_FALSE(status.ok());
-  EXPECT_TRUE(StringPiece(status.error_message())
-                  .contains("Expected a regression signature"))
+  EXPECT_TRUE(str_util::StrContains(status.error_message(),
+                                    "Expected a regression signature"))
       << status.error_message();
 }
 
@@ -227,8 +228,8 @@ TEST(GetNamedSignature, MissingSignature) {
   Signature signature;
   const Status status = GetNamedSignature("foo", meta_graph_def, &signature);
   ASSERT_FALSE(status.ok());
-  EXPECT_TRUE(StringPiece(status.error_message())
-                  .contains("Missing signature named \"foo\""))
+  EXPECT_TRUE(str_util::StrContains(status.error_message(),
+                                    "Missing signature named \"foo\""))
       << status.error_message();
 }
 
@@ -370,7 +371,7 @@ TEST(RunClassification, RunNotOk) {
   const Status status = RunClassification(signature, input_tensor, &session,
                                           &classes_tensor, nullptr);
   ASSERT_FALSE(status.ok());
-  EXPECT_TRUE(StringPiece(status.error_message()).contains("Data is gone"))
+  EXPECT_TRUE(str_util::StrContains(status.error_message(), "Data is gone"))
       << status.error_message();
 }
 
@@ -386,7 +387,8 @@ TEST(RunClassification, TooManyOutputs) {
   const Status status = RunClassification(signature, input_tensor, &session,
                                           &classes_tensor, nullptr);
   ASSERT_FALSE(status.ok());
-  EXPECT_TRUE(StringPiece(status.error_message()).contains("Expected 1 output"))
+  EXPECT_TRUE(
+      str_util::StrContains(status.error_message(), "Expected 1 output"))
       << status.error_message();
 }
 
@@ -402,8 +404,9 @@ TEST(RunClassification, WrongBatchOutputs) {
   const Status status = RunClassification(signature, input_tensor, &session,
                                           &classes_tensor, nullptr);
   ASSERT_FALSE(status.ok());
-  EXPECT_TRUE(StringPiece(status.error_message())
-                  .contains("Input batch size did not match output batch size"))
+  EXPECT_TRUE(
+      str_util::StrContains(status.error_message(),
+                            "Input batch size did not match output batch size"))
       << status.error_message();
 }
 
@@ -449,7 +452,7 @@ TEST_F(RunRegressionTest, RunNotOk) {
   const Status status =
       RunRegression(signature_, input_tensor_, &session_, &output_tensor_);
   ASSERT_FALSE(status.ok());
-  EXPECT_TRUE(StringPiece(status.error_message()).contains("Data is gone"))
+  EXPECT_TRUE(str_util::StrContains(status.error_message(), "Data is gone"))
       << status.error_message();
 }
 
@@ -460,8 +463,9 @@ TEST_F(RunRegressionTest, MismatchedSizeForBatchInputAndOutput) {
   const Status status =
       RunRegression(signature_, input_tensor_, &session_, &output_tensor_);
   ASSERT_FALSE(status.ok());
-  EXPECT_TRUE(StringPiece(status.error_message())
-                  .contains("Input batch size did not match output batch size"))
+  EXPECT_TRUE(
+      str_util::StrContains(status.error_message(),
+                            "Input batch size did not match output batch size"))
       << status.error_message();
 }
 
@@ -488,7 +492,7 @@ TEST(GetSignatures, MissingSignature) {
   const auto status = GetSignatures(meta_graph_def, &read_signatures);
   EXPECT_EQ(tensorflow::error::FAILED_PRECONDITION, status.code());
   EXPECT_TRUE(
-      StringPiece(status.error_message()).contains("Expected exactly one"))
+      str_util::StrContains(status.error_message(), "Expected exactly one"))
       << status.error_message();
 }
 
@@ -502,9 +506,9 @@ TEST(GetSignatures, WrongProtoInAny) {
   Signatures read_signatures;
   const auto status = GetSignatures(meta_graph_def, &read_signatures);
   EXPECT_EQ(tensorflow::error::FAILED_PRECONDITION, status.code());
-  EXPECT_TRUE(StringPiece(status.error_message())
-                  .contains("Expected Any type_url for: "
-                            "tensorflow.serving.Signatures"))
+  EXPECT_TRUE(str_util::StrContains(status.error_message(),
+                                    "Expected Any type_url for: "
+                                    "tensorflow.serving.Signatures"))
       << status.error_message();
 }
 
@@ -519,7 +523,7 @@ TEST(GetSignatures, JunkInAny) {
   Signatures read_signatures;
   const auto status = GetSignatures(meta_graph_def, &read_signatures);
   EXPECT_EQ(tensorflow::error::FAILED_PRECONDITION, status.code());
-  EXPECT_TRUE(StringPiece(status.error_message()).contains("Failed to unpack"))
+  EXPECT_TRUE(str_util::StrContains(status.error_message(), "Failed to unpack"))
       << status.error_message();
 }
 
@@ -567,7 +571,7 @@ TEST(GetSignatures, MultipleSignaturesNotOK) {
   const auto status = GetSignatures(meta_graph_def, &read_signatures);
   EXPECT_EQ(tensorflow::error::FAILED_PRECONDITION, status.code());
   EXPECT_TRUE(
-      StringPiece(status.error_message()).contains("Expected exactly one"))
+      str_util::StrContains(status.error_message(), "Expected exactly one"))
       << status.error_message();
 }
 
@@ -641,8 +645,8 @@ TEST(GetGenericSignature, WrongSignatureType) {
   const Status status =
       GetGenericSignature("generic_bindings", meta_graph_def, &signature);
   ASSERT_FALSE(status.ok());
-  EXPECT_TRUE(StringPiece(status.error_message())
-                  .contains("Expected a generic signature:"))
+  EXPECT_TRUE(str_util::StrContains(status.error_message(),
+                                    "Expected a generic signature:"))
       << status.error_message();
 }
 
diff --git a/tensorflow/contrib/signal/BUILD b/tensorflow/contrib/signal/BUILD
index a83fc20596c8ad7e1cf94ede8b10d82e25f47b17..fdecceff526a860a274354e53e824b98d11418a6 100644
--- a/tensorflow/contrib/signal/BUILD
+++ b/tensorflow/contrib/signal/BUILD
@@ -130,15 +130,3 @@ cuda_py_tests(
         "//tensorflow/python:platform_test",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/signal/python/kernel_tests/shape_ops_test.py b/tensorflow/contrib/signal/python/kernel_tests/shape_ops_test.py
index 1c052354b8afcc5fd8a53b783cc5c676588cf48c..64cc8c7ea54673ac748be73e677575331d8e1cc9 100644
--- a/tensorflow/contrib/signal/python/kernel_tests/shape_ops_test.py
+++ b/tensorflow/contrib/signal/python/kernel_tests/shape_ops_test.py
@@ -338,7 +338,7 @@ class FrameTest(test.TestCase):
 
   def test_constant_folding(self):
     """frame should be constant foldable for constant inputs."""
-    for pad_end in [False, True]:
+    for pad_end in [True, False]:
       g = ops.Graph()
       with g.as_default():
         frame_length, frame_step = 32, 16
diff --git a/tensorflow/contrib/slim/BUILD b/tensorflow/contrib/slim/BUILD
index c2f106c2b28029f05648716bb08cd2531729fb36..516e3ea073268e9b113a1e13577551ccacbf4206 100644
--- a/tensorflow/contrib/slim/BUILD
+++ b/tensorflow/contrib/slim/BUILD
@@ -178,15 +178,3 @@ py_test(
         "//tensorflow/python:summary",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/slim/python/slim/data/BUILD b/tensorflow/contrib/slim/python/slim/data/BUILD
index 5daabbd62e7e63608a7a86a8b7fb0bc0d570b28b..eef043e83276dcdffe491ee9b981c8de0894f592 100644
--- a/tensorflow/contrib/slim/python/slim/data/BUILD
+++ b/tensorflow/contrib/slim/python/slim/data/BUILD
@@ -61,6 +61,7 @@ py_test(
     name = "dataset_data_provider_test",
     srcs = ["dataset_data_provider_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["no_windows"],  # TODO: needs investigation on Windows
     deps = [
         ":dataset",
         ":dataset_data_provider",
@@ -193,15 +194,3 @@ py_test(
         "//third_party/py/numpy",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/slim/python/slim/evaluation.py b/tensorflow/contrib/slim/python/slim/evaluation.py
index 3caf4e02da3aa2d7e586c4e76807a11f84585ea6..5cfd5ee82e2a0fce33311a8783d2d4ceb031544d 100644
--- a/tensorflow/contrib/slim/python/slim/evaluation.py
+++ b/tensorflow/contrib/slim/python/slim/evaluation.py
@@ -230,6 +230,7 @@ def evaluation_loop(master,
                     max_number_of_evaluations=None,
                     session_config=None,
                     timeout=None,
+                    timeout_fn=None,
                     hooks=None):
   """Runs TF-Slim's Evaluation Loop.
 
@@ -261,6 +262,9 @@ def evaluation_loop(master,
       configure the `Session`. If left as `None`, the default will be used.
     timeout: The maximum amount of time to wait between checkpoints. If left as
       `None`, then the process will wait indefinitely.
+    timeout_fn: Optional function to call after a timeout.  If the function
+      returns True, then it means that no new checkpoints will be generated and
+      the iterator will exit.  The function is called with no arguments.
     hooks: A list of additional `SessionRunHook` objects to pass during
       repeated evaluations.
 
@@ -298,4 +302,5 @@ def evaluation_loop(master,
       hooks=all_hooks,
       config=session_config,
       max_number_of_evaluations=max_number_of_evaluations,
-      timeout=timeout)
+      timeout=timeout,
+      timeout_fn=timeout_fn)
diff --git a/tensorflow/contrib/slim/python/slim/evaluation_test.py b/tensorflow/contrib/slim/python/slim/evaluation_test.py
index c24bd048512daaae116e732ac437f7c9b6f6d7fc..94fc12ca814721acf62f16b72ffa50473043cc8b 100644
--- a/tensorflow/contrib/slim/python/slim/evaluation_test.py
+++ b/tensorflow/contrib/slim/python/slim/evaluation_test.py
@@ -177,6 +177,17 @@ class EvaluationTest(test.TestCase):
     # The timeout kicked in.
     self.assertLess(end, start + 1.1)
 
+  def testTimeoutFnOnEvaluationLoop(self):
+    # We require a mutable object (e.g. list but not an int) to maintain state
+    # across calls of a nested function.
+    timeout_fn_calls = [0]
+    def _TimeoutFn():
+      timeout_fn_calls[0] += 1
+      return timeout_fn_calls[0] >= 3
+    # Need not do any evaluation, but should just call timeout_fn repeatedly.
+    evaluation.evaluation_loop('', '', '', timeout=0, timeout_fn=_TimeoutFn)
+    self.assertEqual(timeout_fn_calls[0], 3)
+
   def testMonitorCheckpointsLoopTimeout(self):
     ret = list(
         evaluation_lib.checkpoints_iterator(
diff --git a/tensorflow/contrib/slim/python/slim/nets/BUILD b/tensorflow/contrib/slim/python/slim/nets/BUILD
index 7f03aaf085cf26e3f5f940f4388828006a02ef42..8bbdf96384683c68648367c6433eeb89c64c22bf 100644
--- a/tensorflow/contrib/slim/python/slim/nets/BUILD
+++ b/tensorflow/contrib/slim/python/slim/nets/BUILD
@@ -317,15 +317,3 @@ py_test(
         "//tensorflow/python:variables",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/solvers/BUILD b/tensorflow/contrib/solvers/BUILD
index 87b67486ad413ad537aa8cb68f9f7bef729dd488..5247288d54aaf4e3020d38618b74f1118a69a105 100644
--- a/tensorflow/contrib/solvers/BUILD
+++ b/tensorflow/contrib/solvers/BUILD
@@ -93,16 +93,3 @@ cuda_py_test(
         "//tensorflow/python:platform_test",
     ],
 )
-
-# All files
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/sparsemax/BUILD b/tensorflow/contrib/sparsemax/BUILD
index fcfaa2aba4e8ab086a9eac053188f8fbd4f6f39a..b729fff261192be22c6a56fa9ca0a641f302c570 100644
--- a/tensorflow/contrib/sparsemax/BUILD
+++ b/tensorflow/contrib/sparsemax/BUILD
@@ -65,15 +65,3 @@ cuda_py_tests(
         "//tensorflow/python:platform_test",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/sparsemax/__init__.py b/tensorflow/contrib/sparsemax/__init__.py
index 19d213fb3e8f615190d67862b1928205f31146b4..7bc726f4a84d683517b73814193429220f864735 100644
--- a/tensorflow/contrib/sparsemax/__init__.py
+++ b/tensorflow/contrib/sparsemax/__init__.py
@@ -14,7 +14,7 @@
 # ==============================================================================
 """Module that implements sparsemax and sparsemax loss, see [1].
 
-[1] https://arxiv.org/abs/1602.02068
+[1]: https://arxiv.org/abs/1602.02068
 
 ## Sparsemax
 
diff --git a/tensorflow/contrib/sparsemax/python/ops/sparsemax.py b/tensorflow/contrib/sparsemax/python/ops/sparsemax.py
index 890ca20f4cabd65146e803e54e554a5c97e72427..e617af2ff1b731eddb5b72469a1cd67e7cfd163f 100644
--- a/tensorflow/contrib/sparsemax/python/ops/sparsemax.py
+++ b/tensorflow/contrib/sparsemax/python/ops/sparsemax.py
@@ -31,7 +31,7 @@ def sparsemax(logits, name=None):
   """Computes sparsemax activations [1].
 
   For each batch `i` and class `j` we have
-    sparsemax[i, j] = max(logits[i, j] - tau(logits[i, :]), 0)
+    $$sparsemax[i, j] = max(logits[i, j] - tau(logits[i, :]), 0)$$
 
   [1]: https://arxiv.org/abs/1602.02068
 
diff --git a/tensorflow/contrib/specs/BUILD b/tensorflow/contrib/specs/BUILD
index 084953a0a226cde46ebd9d2031d20cb839180ca8..055b04db8a5654ebf6fee45547d58f0375f9a554 100644
--- a/tensorflow/contrib/specs/BUILD
+++ b/tensorflow/contrib/specs/BUILD
@@ -60,15 +60,3 @@ tf_py_test(
         "//tensorflow/python:variables",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/staging/BUILD b/tensorflow/contrib/staging/BUILD
index bc4a289468c257e7e5e2bd437b8d6d1235980495..0c86f3db1d5bc262f27440754c86f8c63e16b690 100644
--- a/tensorflow/contrib/staging/BUILD
+++ b/tensorflow/contrib/staging/BUILD
@@ -6,18 +6,6 @@ licenses(["notice"])  # Apache 2.0
 
 exports_files(["LICENSE"])
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 py_library(
     name = "staging",
     srcs = ["__init__.py"],
diff --git a/tensorflow/contrib/stat_summarizer/BUILD b/tensorflow/contrib/stat_summarizer/BUILD
index 5fd02efbf6327b20eade6785007930eed3fd4e03..30be14c10cd8576ded75b8489cc89d439a9cc282 100644
--- a/tensorflow/contrib/stat_summarizer/BUILD
+++ b/tensorflow/contrib/stat_summarizer/BUILD
@@ -31,16 +31,5 @@ tf_py_test(
         "//tensorflow/python:math_ops",
         "//tensorflow/python:variables",
     ],
-)
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
+    tags = ["no_windows"],
 )
diff --git a/tensorflow/contrib/stateless/BUILD b/tensorflow/contrib/stateless/BUILD
index 6e259e1d32be64f3b593faf73e8af4f704d72349..dcbef2881df7b5543d664c4b385927f52ae2cbaa 100644
--- a/tensorflow/contrib/stateless/BUILD
+++ b/tensorflow/contrib/stateless/BUILD
@@ -38,15 +38,3 @@ cuda_py_test(
         "//tensorflow/python:random_ops",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/summary/BUILD b/tensorflow/contrib/summary/BUILD
index 80563c5e150dfb74ef11bc912e95345a1a015212..f88b03ec4c2b1f250091594ea12d7d1862029fa2 100644
--- a/tensorflow/contrib/summary/BUILD
+++ b/tensorflow/contrib/summary/BUILD
@@ -15,7 +15,6 @@ py_test(
     srcs = ["summary_ops_test.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":summary_ops",
         ":summary_test_util",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:errors",
@@ -23,6 +22,7 @@ py_test(
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:platform",
         "//tensorflow/python:state_ops",
+        "//tensorflow/python:summary_ops_v2",
         "//tensorflow/python:training",
         "//tensorflow/python/eager:function",
         "//tensorflow/python/eager:test",
@@ -35,7 +35,6 @@ py_test(
     srcs = ["summary_ops_graph_test.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":summary_ops",
         ":summary_test_util",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
@@ -44,31 +43,9 @@ py_test(
         "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
+        "//tensorflow/python:summary_ops_v2",
         "//tensorflow/python:training",
-        "@six_archive//:six",
-    ],
-)
-
-py_library(
-    name = "summary_ops",
-    srcs = ["summary_ops.py"],
-    srcs_version = "PY2AND3",
-    visibility = ["//tensorflow:internal"],
-    deps = [
-        "//tensorflow/core:protos_all_py",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:constant_op",
-        "//tensorflow/python:control_flow_ops",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:layers_base",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:resource_variable_ops",
-        "//tensorflow/python:summary_op_util",
-        "//tensorflow/python:summary_ops_gen",
-        "//tensorflow/python:training",
-        "//tensorflow/python:util",
-        "//tensorflow/python/eager:context",
+        "//tensorflow/python:variables",
         "@six_archive//:six",
     ],
 )
@@ -79,22 +56,10 @@ py_library(
     srcs_version = "PY2AND3",
     visibility = ["//tensorflow:internal"],
     deps = [
-        ":summary_ops",
+        "//tensorflow/python:summary_ops_v2",
     ],
 )
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 # NOTE: target cannot be testonly because it needs to be in the pip
 # package. Sigh.
 py_library(
@@ -104,8 +69,10 @@ py_library(
     visibility = ["//tensorflow:internal"],
     deps = [
         "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:lib",
         "//tensorflow/python:platform",
+        "//tensorflow/python:summary_ops_v2",
         "@org_sqlite//:python",
     ],
 )
diff --git a/tensorflow/contrib/summary/summary.py b/tensorflow/contrib/summary/summary.py
index 2d6d7ea6a3eff2562ba8def4117e3aa6f818b6fd..99ced53e1167ec5486d0b75cff81ffbf857c2be7 100644
--- a/tensorflow/contrib/summary/summary.py
+++ b/tensorflow/contrib/summary/summary.py
@@ -61,23 +61,23 @@ from __future__ import division
 from __future__ import print_function
 
 # pylint: disable=unused-import
-from tensorflow.contrib.summary.summary_ops import all_summary_ops
-from tensorflow.contrib.summary.summary_ops import always_record_summaries
-from tensorflow.contrib.summary.summary_ops import audio
-from tensorflow.contrib.summary.summary_ops import create_db_writer
-from tensorflow.contrib.summary.summary_ops import create_file_writer
-from tensorflow.contrib.summary.summary_ops import create_summary_file_writer
-from tensorflow.contrib.summary.summary_ops import eval_dir
-from tensorflow.contrib.summary.summary_ops import flush
-from tensorflow.contrib.summary.summary_ops import generic
-from tensorflow.contrib.summary.summary_ops import graph
-from tensorflow.contrib.summary.summary_ops import histogram
-from tensorflow.contrib.summary.summary_ops import image
-from tensorflow.contrib.summary.summary_ops import import_event
-from tensorflow.contrib.summary.summary_ops import initialize
-from tensorflow.contrib.summary.summary_ops import never_record_summaries
-from tensorflow.contrib.summary.summary_ops import record_summaries_every_n_global_steps
-from tensorflow.contrib.summary.summary_ops import scalar
-from tensorflow.contrib.summary.summary_ops import should_record_summaries
-from tensorflow.contrib.summary.summary_ops import summary_writer_initializer_op
-from tensorflow.contrib.summary.summary_ops import SummaryWriter
+from tensorflow.python.ops.summary_ops_v2 import all_summary_ops
+from tensorflow.python.ops.summary_ops_v2 import always_record_summaries
+from tensorflow.python.ops.summary_ops_v2 import audio
+from tensorflow.python.ops.summary_ops_v2 import create_db_writer
+from tensorflow.python.ops.summary_ops_v2 import create_file_writer
+from tensorflow.python.ops.summary_ops_v2 import create_summary_file_writer
+from tensorflow.python.ops.summary_ops_v2 import eval_dir
+from tensorflow.python.ops.summary_ops_v2 import flush
+from tensorflow.python.ops.summary_ops_v2 import generic
+from tensorflow.python.ops.summary_ops_v2 import graph
+from tensorflow.python.ops.summary_ops_v2 import histogram
+from tensorflow.python.ops.summary_ops_v2 import image
+from tensorflow.python.ops.summary_ops_v2 import import_event
+from tensorflow.python.ops.summary_ops_v2 import initialize
+from tensorflow.python.ops.summary_ops_v2 import never_record_summaries
+from tensorflow.python.ops.summary_ops_v2 import record_summaries_every_n_global_steps
+from tensorflow.python.ops.summary_ops_v2 import scalar
+from tensorflow.python.ops.summary_ops_v2 import should_record_summaries
+from tensorflow.python.ops.summary_ops_v2 import summary_writer_initializer_op
+from tensorflow.python.ops.summary_ops_v2 import SummaryWriter
diff --git a/tensorflow/contrib/summary/summary_ops_graph_test.py b/tensorflow/contrib/summary/summary_ops_graph_test.py
index 3aba04540eba12092d884cca10e23546eb91c91d..ae8336daaf8ea9113716b90b6ea9be9de7303596 100644
--- a/tensorflow/contrib/summary/summary_ops_graph_test.py
+++ b/tensorflow/contrib/summary/summary_ops_graph_test.py
@@ -16,27 +16,220 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import os
 import tempfile
+import time
 
 import six
 
-from tensorflow.contrib.summary import summary_ops
 from tensorflow.contrib.summary import summary_test_util
 from tensorflow.core.framework import graph_pb2
 from tensorflow.core.framework import node_def_pb2
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import summary_ops_v2 as summary_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import gfile
 from tensorflow.python.platform import test
 from tensorflow.python.training import training_util
 
 get_all = summary_test_util.get_all
 
 
-class DbTest(summary_test_util.SummaryDbTest):
+class GraphFileTest(test_util.TensorFlowTestCase):
+
+  def testSummaryOps(self):
+    logdir = self.get_temp_dir()
+    writer = summary_ops.create_file_writer(logdir, max_queue=0)
+    with writer.as_default(), summary_ops.always_record_summaries():
+      summary_ops.generic('tensor', 1, step=1)
+      summary_ops.scalar('scalar', 2.0, step=1)
+      summary_ops.histogram('histogram', [1.0], step=1)
+      summary_ops.image('image', [[[[1.0]]]], step=1)
+      summary_ops.audio('audio', [[1.0]], 1.0, 1, step=1)
+    with self.test_session() as sess:
+      sess.run(summary_ops.summary_writer_initializer_op())
+      sess.run(summary_ops.all_summary_ops())
+    # The working condition of the ops is tested in the C++ test so we just
+    # test here that we're calling them correctly.
+    self.assertTrue(gfile.Exists(logdir))
+
+  def testSummaryName(self):
+    logdir = self.get_temp_dir()
+    writer = summary_ops.create_file_writer(logdir, max_queue=0)
+    with writer.as_default(), summary_ops.always_record_summaries():
+      summary_ops.scalar('scalar', 2.0, step=1)
+    with self.test_session() as sess:
+      sess.run(summary_ops.summary_writer_initializer_op())
+      sess.run(summary_ops.all_summary_ops())
+    events = summary_test_util.events_from_logdir(logdir)
+    self.assertEqual(2, len(events))
+    self.assertEqual('scalar', events[1].summary.value[0].tag)
+
+  def testSummaryNameScope(self):
+    logdir = self.get_temp_dir()
+    writer = summary_ops.create_file_writer(logdir, max_queue=0)
+    with writer.as_default(), summary_ops.always_record_summaries():
+      with ops.name_scope('scope'):
+        summary_ops.scalar('scalar', 2.0, step=1)
+    with self.test_session() as sess:
+      sess.run(summary_ops.summary_writer_initializer_op())
+      sess.run(summary_ops.all_summary_ops())
+    events = summary_test_util.events_from_logdir(logdir)
+    self.assertEqual(2, len(events))
+    self.assertEqual('scope/scalar', events[1].summary.value[0].tag)
+
+  def testSummaryGlobalStep(self):
+    training_util.get_or_create_global_step()
+    logdir = self.get_temp_dir()
+    writer = summary_ops.create_file_writer(logdir, max_queue=0)
+    with writer.as_default(), summary_ops.always_record_summaries():
+      summary_ops.scalar('scalar', 2.0)
+    with self.test_session() as sess:
+      sess.run(variables.global_variables_initializer())
+      sess.run(summary_ops.summary_writer_initializer_op())
+      step, _ = sess.run(
+          [training_util.get_global_step(), summary_ops.all_summary_ops()])
+    events = summary_test_util.events_from_logdir(logdir)
+    self.assertEqual(2, len(events))
+    self.assertEqual(step, events[1].step)
+
+  def testMaxQueue(self):
+    logdir = self.get_temp_dir()
+    writer = summary_ops.create_file_writer(
+        logdir, max_queue=1, flush_millis=999999)
+    with writer.as_default(), summary_ops.always_record_summaries():
+      summary_ops.scalar('scalar', 2.0, step=1)
+    with self.test_session() as sess:
+      sess.run(summary_ops.summary_writer_initializer_op())
+      get_total = lambda: len(summary_test_util.events_from_logdir(logdir))
+      # Note: First tf.Event is always file_version.
+      self.assertEqual(1, get_total())
+      sess.run(summary_ops.all_summary_ops())
+      self.assertEqual(1, get_total())
+      # Should flush after second summary since max_queue = 1
+      sess.run(summary_ops.all_summary_ops())
+      self.assertEqual(3, get_total())
+
+  def testFlushFunction(self):
+    logdir = self.get_temp_dir()
+    writer = summary_ops.create_file_writer(
+        logdir, max_queue=999999, flush_millis=999999)
+    with writer.as_default(), summary_ops.always_record_summaries():
+      summary_ops.scalar('scalar', 2.0, step=1)
+      flush_op = summary_ops.flush()
+    with self.test_session() as sess:
+      sess.run(summary_ops.summary_writer_initializer_op())
+      get_total = lambda: len(summary_test_util.events_from_logdir(logdir))
+      # Note: First tf.Event is always file_version.
+      self.assertEqual(1, get_total())
+      sess.run(summary_ops.all_summary_ops())
+      self.assertEqual(1, get_total())
+      sess.run(flush_op)
+      self.assertEqual(2, get_total())
+      # Test "writer" parameter
+      sess.run(summary_ops.all_summary_ops())
+      sess.run(summary_ops.flush(writer=writer))
+      self.assertEqual(3, get_total())
+      sess.run(summary_ops.all_summary_ops())
+      sess.run(summary_ops.flush(writer=writer._resource))  # pylint:disable=protected-access
+      self.assertEqual(4, get_total())
+
+  def testSharedName(self):
+    logdir = self.get_temp_dir()
+    with summary_ops.always_record_summaries():
+      # Create with default shared name (should match logdir)
+      writer1 = summary_ops.create_file_writer(logdir)
+      with writer1.as_default():
+        summary_ops.scalar('one', 1.0, step=1)
+      # Create with explicit logdir shared name (should be same resource/file)
+      shared_name = 'logdir:' + logdir
+      writer2 = summary_ops.create_file_writer(logdir, name=shared_name)
+      with writer2.as_default():
+        summary_ops.scalar('two', 2.0, step=2)
+      # Create with different shared name (should be separate resource/file)
+      writer3 = summary_ops.create_file_writer(logdir, name='other')
+      with writer3.as_default():
+        summary_ops.scalar('three', 3.0, step=3)
+
+    with self.test_session() as sess:
+      # Run init ops across writers sequentially to avoid race condition.
+      # TODO(nickfelt): fix race condition in resource manager lookup or create
+      sess.run(writer1.init())
+      sess.run(writer2.init())
+      time.sleep(1.1)  # Ensure filename has a different timestamp
+      sess.run(writer3.init())
+      sess.run(summary_ops.all_summary_ops())
+      sess.run([writer1.flush(), writer2.flush(), writer3.flush()])
+
+    event_files = iter(sorted(gfile.Glob(os.path.join(logdir, '*tfevents*'))))
+
+    # First file has tags "one" and "two"
+    events = summary_test_util.events_from_file(next(event_files))
+    self.assertEqual('brain.Event:2', events[0].file_version)
+    tags = [e.summary.value[0].tag for e in events[1:]]
+    self.assertItemsEqual(['one', 'two'], tags)
+
+    # Second file has tag "three"
+    events = summary_test_util.events_from_file(next(event_files))
+    self.assertEqual('brain.Event:2', events[0].file_version)
+    tags = [e.summary.value[0].tag for e in events[1:]]
+    self.assertItemsEqual(['three'], tags)
+
+    # No more files
+    self.assertRaises(StopIteration, lambda: next(event_files))
+
+  def testWriterInitAndClose(self):
+    logdir = self.get_temp_dir()
+    with summary_ops.always_record_summaries():
+      writer = summary_ops.create_file_writer(
+          logdir, max_queue=100, flush_millis=1000000)
+      with writer.as_default():
+        summary_ops.scalar('one', 1.0, step=1)
+    with self.test_session() as sess:
+      sess.run(summary_ops.summary_writer_initializer_op())
+      get_total = lambda: len(summary_test_util.events_from_logdir(logdir))
+      self.assertEqual(1, get_total())  # file_version Event
+      # Running init() again while writer is open has no effect
+      sess.run(writer.init())
+      self.assertEqual(1, get_total())
+      sess.run(summary_ops.all_summary_ops())
+      self.assertEqual(1, get_total())
+      # Running close() should do an implicit flush
+      sess.run(writer.close())
+      self.assertEqual(2, get_total())
+      # Running init() on a closed writer should start a new file
+      time.sleep(1.1)  # Ensure filename has a different timestamp
+      sess.run(writer.init())
+      sess.run(summary_ops.all_summary_ops())
+      sess.run(writer.close())
+      files = sorted(gfile.Glob(os.path.join(logdir, '*tfevents*')))
+      self.assertEqual(2, len(files))
+      self.assertEqual(2, len(summary_test_util.events_from_file(files[1])))
+
+  def testWriterFlush(self):
+    logdir = self.get_temp_dir()
+    with summary_ops.always_record_summaries():
+      writer = summary_ops.create_file_writer(
+          logdir, max_queue=100, flush_millis=1000000)
+      with writer.as_default():
+        summary_ops.scalar('one', 1.0, step=1)
+    with self.test_session() as sess:
+      sess.run(summary_ops.summary_writer_initializer_op())
+      get_total = lambda: len(summary_test_util.events_from_logdir(logdir))
+      self.assertEqual(1, get_total())  # file_version Event
+      sess.run(summary_ops.all_summary_ops())
+      self.assertEqual(1, get_total())
+      sess.run(writer.flush())
+      self.assertEqual(2, get_total())
+
+
+class GraphDbTest(summary_test_util.SummaryDbTest):
 
   def testGraphPassedToGraph_isForbiddenForThineOwnSafety(self):
     with self.assertRaises(TypeError):
diff --git a/tensorflow/contrib/summary/summary_ops_test.py b/tensorflow/contrib/summary/summary_ops_test.py
index c756f8b27055f9cf86a311e485d97745a3c7a95b..f1ef218e74bbd225071324a8269fdfeb5de0e038 100644
--- a/tensorflow/contrib/summary/summary_ops_test.py
+++ b/tensorflow/contrib/summary/summary_ops_test.py
@@ -16,12 +16,13 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import os
 import tempfile
+import time
 
 import numpy as np
 import six
 
-from tensorflow.contrib.summary import summary_ops
 from tensorflow.contrib.summary import summary_test_util
 from tensorflow.core.framework import graph_pb2
 from tensorflow.core.framework import node_def_pb2
@@ -33,6 +34,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import summary_ops_v2 as summary_ops
 from tensorflow.python.platform import gfile
 from tensorflow.python.training import training_util
 
@@ -57,7 +59,7 @@ _NUMPY_NUMERIC_TYPES = {
 }
 
 
-class TargetTest(test_util.TensorFlowTestCase):
+class EagerFileTest(test_util.TensorFlowTestCase):
 
   def testShouldRecordSummary(self):
     self.assertFalse(summary_ops.should_record_summaries())
@@ -138,21 +140,22 @@ class TargetTest(test_util.TensorFlowTestCase):
   def testMaxQueue(self):
     logs = tempfile.mkdtemp()
     with summary_ops.create_file_writer(
-        logs, max_queue=2, flush_millis=999999,
+        logs, max_queue=1, flush_millis=999999,
         name='lol').as_default(), summary_ops.always_record_summaries():
       get_total = lambda: len(summary_test_util.events_from_logdir(logs))
       # Note: First tf.Event is always file_version.
       self.assertEqual(1, get_total())
       summary_ops.scalar('scalar', 2.0, step=1)
       self.assertEqual(1, get_total())
+      # Should flush after second summary since max_queue = 1
       summary_ops.scalar('scalar', 2.0, step=2)
       self.assertEqual(3, get_total())
 
-  def testFlush(self):
+  def testFlushFunction(self):
     logs = tempfile.mkdtemp()
-    with summary_ops.create_file_writer(
-        logs, max_queue=999999, flush_millis=999999,
-        name='lol').as_default(), summary_ops.always_record_summaries():
+    writer = summary_ops.create_file_writer(
+        logs, max_queue=999999, flush_millis=999999, name='lol')
+    with writer.as_default(), summary_ops.always_record_summaries():
       get_total = lambda: len(summary_test_util.events_from_logdir(logs))
       # Note: First tf.Event is always file_version.
       self.assertEqual(1, get_total())
@@ -161,9 +164,103 @@ class TargetTest(test_util.TensorFlowTestCase):
       self.assertEqual(1, get_total())
       summary_ops.flush()
       self.assertEqual(3, get_total())
+      # Test "writer" parameter
+      summary_ops.scalar('scalar', 2.0, step=3)
+      summary_ops.flush(writer=writer)
+      self.assertEqual(4, get_total())
+      summary_ops.scalar('scalar', 2.0, step=4)
+      summary_ops.flush(writer=writer._resource)  # pylint:disable=protected-access
+      self.assertEqual(5, get_total())
+
+  def testSharedName(self):
+    logdir = self.get_temp_dir()
+    with summary_ops.always_record_summaries():
+      # Create with default shared name (should match logdir)
+      writer1 = summary_ops.create_file_writer(logdir)
+      with writer1.as_default():
+        summary_ops.scalar('one', 1.0, step=1)
+        summary_ops.flush()
+      # Create with explicit logdir shared name (should be same resource/file)
+      shared_name = 'logdir:' + logdir
+      writer2 = summary_ops.create_file_writer(logdir, name=shared_name)
+      with writer2.as_default():
+        summary_ops.scalar('two', 2.0, step=2)
+        summary_ops.flush()
+      # Create with different shared name (should be separate resource/file)
+      time.sleep(1.1)  # Ensure filename has a different timestamp
+      writer3 = summary_ops.create_file_writer(logdir, name='other')
+      with writer3.as_default():
+        summary_ops.scalar('three', 3.0, step=3)
+        summary_ops.flush()
+
+    event_files = iter(sorted(gfile.Glob(os.path.join(logdir, '*tfevents*'))))
+
+    # First file has tags "one" and "two"
+    events = iter(summary_test_util.events_from_file(next(event_files)))
+    self.assertEqual('brain.Event:2', next(events).file_version)
+    self.assertEqual('one', next(events).summary.value[0].tag)
+    self.assertEqual('two', next(events).summary.value[0].tag)
+    self.assertRaises(StopIteration, lambda: next(events))
+
+    # Second file has tag "three"
+    events = iter(summary_test_util.events_from_file(next(event_files)))
+    self.assertEqual('brain.Event:2', next(events).file_version)
+    self.assertEqual('three', next(events).summary.value[0].tag)
+    self.assertRaises(StopIteration, lambda: next(events))
+
+    # No more files
+    self.assertRaises(StopIteration, lambda: next(event_files))
+
+  def testWriterInitAndClose(self):
+    logdir = self.get_temp_dir()
+    get_total = lambda: len(summary_test_util.events_from_logdir(logdir))
+    with summary_ops.always_record_summaries():
+      writer = summary_ops.create_file_writer(
+          logdir, max_queue=100, flush_millis=1000000)
+      self.assertEqual(1, get_total())  # file_version Event
+      # Calling init() again while writer is open has no effect
+      writer.init()
+      self.assertEqual(1, get_total())
+      try:
+        # Not using .as_default() to avoid implicit flush when exiting
+        writer.set_as_default()
+        summary_ops.scalar('one', 1.0, step=1)
+        self.assertEqual(1, get_total())
+        # Calling .close() should do an implicit flush
+        writer.close()
+        self.assertEqual(2, get_total())
+        # Calling init() on a closed writer should start a new file
+        time.sleep(1.1)  # Ensure filename has a different timestamp
+        writer.init()
+        files = sorted(gfile.Glob(os.path.join(logdir, '*tfevents*')))
+        self.assertEqual(2, len(files))
+        get_total = lambda: len(summary_test_util.events_from_file(files[1]))
+        self.assertEqual(1, get_total())  # file_version Event
+        summary_ops.scalar('two', 2.0, step=2)
+        writer.close()
+        self.assertEqual(2, get_total())
+      finally:
+        # Clean up by resetting default writer
+        summary_ops.create_file_writer(None).set_as_default()
+
+  def testWriterFlush(self):
+    logdir = self.get_temp_dir()
+    get_total = lambda: len(summary_test_util.events_from_logdir(logdir))
+    with summary_ops.always_record_summaries():
+      writer = summary_ops.create_file_writer(
+          logdir, max_queue=100, flush_millis=1000000)
+      self.assertEqual(1, get_total())  # file_version Event
+      with writer.as_default():
+        summary_ops.scalar('one', 1.0, step=1)
+        self.assertEqual(1, get_total())
+        writer.flush()
+        self.assertEqual(2, get_total())
+        summary_ops.scalar('two', 2.0, step=2)
+      # Exiting the "as_default()" should do an implicit flush of the "two" tag
+      self.assertEqual(3, get_total())
 
 
-class DbTest(summary_test_util.SummaryDbTest):
+class EagerDbTest(summary_test_util.SummaryDbTest):
 
   def testIntegerSummaries(self):
     step = training_util.create_global_step()
diff --git a/tensorflow/contrib/summary/summary_test_internal.py b/tensorflow/contrib/summary/summary_test_internal.py
deleted file mode 100644
index d0d3384735fb1eb1a048c7aa6da0037ee9fc6936..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/summary/summary_test_internal.py
+++ /dev/null
@@ -1,60 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Internal helpers for tests in this directory."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import functools
-import os
-
-import sqlite3
-
-from tensorflow.contrib.summary import summary_ops
-from tensorflow.python.framework import test_util
-
-
-class SummaryDbTest(test_util.TensorFlowTestCase):
-  """Helper for summary database testing."""
-
-  def setUp(self):
-    super(SummaryDbTest, self).setUp()
-    self.db_path = os.path.join(self.get_temp_dir(), 'DbTest.sqlite')
-    if os.path.exists(self.db_path):
-      os.unlink(self.db_path)
-    self.db = sqlite3.connect(self.db_path)
-    self.create_db_writer = functools.partial(
-        summary_ops.create_db_writer,
-        db_uri=self.db_path,
-        experiment_name='experiment',
-        run_name='run',
-        user_name='user')
-
-  def tearDown(self):
-    self.db.close()
-    super(SummaryDbTest, self).tearDown()
-
-
-def get_one(db, q, *p):
-  return db.execute(q, p).fetchone()[0]
-
-
-def get_all(db, q, *p):
-  return unroll(db.execute(q, p).fetchall())
-
-
-def unroll(list_of_tuples):
-  return sum(list_of_tuples, ())
diff --git a/tensorflow/contrib/summary/summary_test_util.py b/tensorflow/contrib/summary/summary_test_util.py
index 8506c4be9c4ca8305b62da17c7246e6e18313bd3..b4ae43302cb22ad17c04050eb84433c470757bf1 100644
--- a/tensorflow/contrib/summary/summary_test_util.py
+++ b/tensorflow/contrib/summary/summary_test_util.py
@@ -24,10 +24,10 @@ import os
 
 import sqlite3
 
-from tensorflow.contrib.summary import summary_ops
 from tensorflow.core.util import event_pb2
 from tensorflow.python.framework import test_util
 from tensorflow.python.lib.io import tf_record
+from tensorflow.python.ops import summary_ops_v2 as summary_ops
 from tensorflow.python.platform import gfile
 
 
diff --git a/tensorflow/contrib/tensor_forest/BUILD b/tensorflow/contrib/tensor_forest/BUILD
index 1e4cc3f0952ef74a1c89b7ed2d8c357fa8847ad5..136856c0156c41046f9af61cdd6e3d5f8213309e 100644
--- a/tensorflow/contrib/tensor_forest/BUILD
+++ b/tensorflow/contrib/tensor_forest/BUILD
@@ -16,20 +16,6 @@ package(default_visibility = ["//visibility:public"])
 
 exports_files(["LICENSE"])
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-            "kernels/v4/*",
-            "proto/*",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 # ---------------------------------- V2 ops ------------------------------------------#
 filegroup(
     name = "v2_op_sources",
@@ -553,7 +539,6 @@ py_test(
     srcs = ["client/random_forest_test.py"],
     srcs_version = "PY2AND3",
     tags = [
-        "no_windows",
         "nomac",  # b/63258195
         "notsan",
     ],
diff --git a/tensorflow/contrib/tensor_forest/hybrid/BUILD b/tensorflow/contrib/tensor_forest/hybrid/BUILD
index a2a3b485f6aa0ae827bbaa7812823730bd8db3b8..b7185e09c70fbeb33ed559cde1dfeaf348a7e126 100644
--- a/tensorflow/contrib/tensor_forest/hybrid/BUILD
+++ b/tensorflow/contrib/tensor_forest/hybrid/BUILD
@@ -11,18 +11,6 @@ package(default_visibility = ["//visibility:public"])
 
 exports_files(["LICENSE"])
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 filegroup(
     name = "custom_op_sources",
     srcs = glob(
diff --git a/tensorflow/contrib/tensor_forest/kernels/v4/BUILD b/tensorflow/contrib/tensor_forest/kernels/v4/BUILD
index 794b76d8583c3608d540d34a5aaf1d1a799f35e3..b1b1559383a1d26a80d4974e2773f5b27ce1f2be 100644
--- a/tensorflow/contrib/tensor_forest/kernels/v4/BUILD
+++ b/tensorflow/contrib/tensor_forest/kernels/v4/BUILD
@@ -11,11 +11,6 @@ licenses(["notice"])  # Apache 2.0
 
 exports_files(["LICENSE"])
 
-filegroup(
-    name = "all_files",
-    srcs = glob(["**/*"]),
-)
-
 DECISION_TREE_RESOURCE_DEPS = [
     ":decision_node_evaluator",
     ":input_data",
diff --git a/tensorflow/contrib/tensor_forest/proto/BUILD b/tensorflow/contrib/tensor_forest/proto/BUILD
index 1cfef44af1aaee3c105664398200524f2770f7d7..04fd6a9839509d2d02b7cf947acc4505c28cbdcd 100644
--- a/tensorflow/contrib/tensor_forest/proto/BUILD
+++ b/tensorflow/contrib/tensor_forest/proto/BUILD
@@ -6,14 +6,6 @@ load("//tensorflow/core:platform/default/build_config.bzl", "tf_proto_library")
 
 package(default_visibility = ["//visibility:public"])
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 tf_proto_library(
     name = "fertile_stats_proto",
     srcs = ["fertile_stats.proto"],
diff --git a/tensorflow/contrib/tensorboard/BUILD b/tensorflow/contrib/tensorboard/BUILD
index d833744d0c7e85b9f336f60a3becfd043bc3821d..2b6a2b2f3c711f48812063e98e05735e2d9b4141 100644
--- a/tensorflow/contrib/tensorboard/BUILD
+++ b/tensorflow/contrib/tensorboard/BUILD
@@ -9,6 +9,7 @@ exports_files(["LICENSE"])
 
 # For platform specific build config
 load("//tensorflow/core:platform/default/build_config.bzl", "tf_proto_library")
+load("//tensorflow:tensorflow.bzl", "py_test")
 
 tf_proto_library(
     name = "protos_all",
@@ -81,6 +82,7 @@ py_test(
     size = "small",
     srcs = ["plugins/trace/trace_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["no_windows"],
     deps = [
         ":trace",
         "//tensorflow/python:client_testlib",
@@ -88,15 +90,3 @@ py_test(
         "//tensorflow/python:platform",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/tensorboard/db/BUILD b/tensorflow/contrib/tensorboard/db/BUILD
index 4175d8adb58a85728519042a9870e8c4590232ba..3f6b4cdc9ad10f5089f28af35a8be408918c7f90 100644
--- a/tensorflow/contrib/tensorboard/db/BUILD
+++ b/tensorflow/contrib/tensorboard/db/BUILD
@@ -135,9 +135,3 @@ tf_cc_binary(
         "//tensorflow/core/lib/db:sqlite",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(["*"]),
-    visibility = ["//tensorflow:__pkg__"],
-)
diff --git a/tensorflow/contrib/tensorboard/db/summary_file_writer.cc b/tensorflow/contrib/tensorboard/db/summary_file_writer.cc
index 85b3e7231bcb433e9510522597c03c5f764f06cf..3f24f58f03aac2ba6d368d7eccf8731f611a81b4 100644
--- a/tensorflow/contrib/tensorboard/db/summary_file_writer.cc
+++ b/tensorflow/contrib/tensorboard/db/summary_file_writer.cc
@@ -132,7 +132,7 @@ class SummaryFileWriter : public SummaryWriterInterface {
   Status WriteEvent(std::unique_ptr<Event> event) override {
     mutex_lock ml(mu_);
     queue_.emplace_back(std::move(event));
-    if (queue_.size() >= max_queue_ ||
+    if (queue_.size() > max_queue_ ||
         env_->NowMicros() - last_flush_ > 1000 * flush_millis_) {
       return InternalFlush();
     }
diff --git a/tensorflow/contrib/tensorboard/db/summary_file_writer_test.cc b/tensorflow/contrib/tensorboard/db/summary_file_writer_test.cc
index c61b4655961664a6c9c22a5f6d6f26a55c34bfcd..cd3f712256f2293ed725745f8cbe48109856ef86 100644
--- a/tensorflow/contrib/tensorboard/db/summary_file_writer_test.cc
+++ b/tensorflow/contrib/tensorboard/db/summary_file_writer_test.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/refcount.h"
 #include "tensorflow/core/lib/io/path.h"
 #include "tensorflow/core/lib/io/record_reader.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/util/event.pb.h"
@@ -58,7 +59,7 @@ class SummaryFileWriterTest : public ::testing::Test {
     TF_CHECK_OK(env_.GetChildren(testing::TmpDir(), &files));
     bool found = false;
     for (const string& f : files) {
-      if (StringPiece(f).contains(test_name)) {
+      if (str_util::StrContains(f, test_name)) {
         if (found) {
           return errors::Unknown("Found more than one file for ", test_name);
         }
diff --git a/tensorflow/contrib/tensorrt/BUILD b/tensorflow/contrib/tensorrt/BUILD
index 906cc3f0344e7cb641589bd522e33d658150d3b5..2f316767b35e190c7e438a253a7395b0c5c2ee16 100644
--- a/tensorflow/contrib/tensorrt/BUILD
+++ b/tensorflow/contrib/tensorrt/BUILD
@@ -272,15 +272,3 @@ tf_cc_test(
         "//tensorflow/core:test_main",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/tensorrt/convert/convert_graph.cc b/tensorflow/contrib/tensorrt/convert/convert_graph.cc
index ff8cc6374d40dc0b49721a784e25015c76541d03..b412b296e02751427b80e7c1596f2530942519c6 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_graph.cc
+++ b/tensorflow/contrib/tensorrt/convert/convert_graph.cc
@@ -405,7 +405,13 @@ tensorflow::Status ConvertGraphDefToTensorRT(
                          max_mem_per_engine, static_graph_properties,
                          &output_edge_map, precision_mode);
     if (precision_mode == INT8MODE) {
-      TF_RETURN_IF_ERROR(GetCalibNode(&p));
+      tensorflow::Status status = GetCalibNode(&p);
+      if (status != tensorflow::Status::OK()) {
+        LOG(WARNING) << "subgraph conversion error for subgraph_index:" << count
+                     << " due to: \"" << status.ToString()
+                     << "\" SKIPPING......( " << subgraph_node_names.size()
+                     << " nodes)";
+      }
     } else {
       tensorflow::Status status = ConvertSubGraphToTensorRT(&p);
       if (status != tensorflow::Status::OK()) {
@@ -414,8 +420,8 @@ tensorflow::Status ConvertGraphDefToTensorRT(
                      << "\" SKIPPING......( " << subgraph_node_names.size()
                      << " nodes)";
       }
-      count++;
     }
+    count++;
   }
   graph.ToGraphDef(new_graph_def);
   return tensorflow::Status::OK();
diff --git a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
index e920a797fe428620ef62a2b67c07f35d85ef5211..567b4af88df65b8dd83c85a8430370bff611e917 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
+++ b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
@@ -443,7 +443,9 @@ class Converter {
        * 2) Control dependency inputs contain caret at the beginning and we
        *    remove this and annotate the edge as a control dependency.
        ************************************************************************/
-      string name = input_name[0] == '^' ? input_name.substr(1) : input_name;
+      // skip control nodes
+      if (input_name[0] == '^' ) continue;
+      string name =  input_name;
       auto first = name.find_first_of(':');
       if (first != string::npos && first + 2 == name.size() &&
           name[first + 1] == '0')
@@ -2262,6 +2264,7 @@ tensorflow::Status InjectCalibrationNode(tensorrt::convert::SubGraphParams& s) {
   auto ws = new tensorflow::tensorrt::TRTWeightStore();
   TF_CHECK_OK(weight_rmgr->Create(calib_op_name, calib_op_name, ws));
   Converter converter(op_res->network_, ws, s.precision_mode == FP16MODE);
+
   std::vector<string> input_names;
   std::vector<tensorflow::DataType> input_dtypes;
   for (const std::pair<int, int>& input : s.input_inds) {
@@ -2270,20 +2273,41 @@ tensorflow::Status InjectCalibrationNode(tensorrt::convert::SubGraphParams& s) {
     int output_idx = input.second;
     tensorflow::Node* node = s.graph.FindNodeId(node_id);
     auto node_name = node->name();
-    input_names.push_back(node_name);  // insert original node name without port
-    // TODO(jie): alternative :)
-    if (!s.graph_properties.HasOutputProperties(node_name))
+    // input_names should use the node name in the graph
+    // here it should be the input tensor name -> matching the binding
+    // insert original node name without port
+    auto tensor_name = node_name;
+    if (output_idx != 0) {
+      tensor_name = StrCat(tensor_name, ":", output_idx);
+    }
+
+    VLOG(2) << "input name: " << node_name << " tensor_name: " << tensor_name
+            << " idx: " << output_idx;
+
+    auto shape_inference_node_name = node_name;
+    auto shape_inference_output_idx = output_idx;
+    // rewire the shape inference to original node in the graph
+    if (s.output_edge_map->count(tensor_name)) {
+      shape_inference_node_name = s.output_edge_map->at(tensor_name).second;
+      shape_inference_output_idx = s.output_edge_map->at(tensor_name).first;
+    }
+    if (shape_inference_output_idx < 0) continue;
+    VLOG(2) << "shapeinference name: " << shape_inference_node_name
+            << " idx: " << shape_inference_output_idx;
+
+    if (!s.graph_properties.HasOutputProperties(shape_inference_node_name))
       return tensorflow::errors::Internal("failed to find input node: " +
-                                          node_name);
+                                          shape_inference_node_name);
 
-    auto op_info_vec = s.graph_properties.GetOutputProperties(node_name);
-    if (static_cast<int>(op_info_vec.size()) < output_idx)
+    auto op_info_vec =
+        s.graph_properties.GetOutputProperties(shape_inference_node_name);
+    if (static_cast<int>(op_info_vec.size()) <= shape_inference_output_idx)
       return tensorflow::errors::Internal(
-          "accessing output index of: ", output_idx, ", at node: ", node_name,
-          "with output entry from shape_map: ", op_info_vec.size());
-
-    auto op_info = op_info_vec.at(output_idx);
+          "accessing output index of: ", shape_inference_output_idx,
+          ", at node: ", shape_inference_node_name,
+          " with output entry from shape_map: ", op_info_vec.size());
 
+    auto op_info = op_info_vec.at(shape_inference_output_idx);
     tensorflow::DataType tf_dtype = op_info.dtype();
     input_dtypes.push_back(tf_dtype);
 
@@ -2294,16 +2318,23 @@ tensorflow::Status InjectCalibrationNode(tensorrt::convert::SubGraphParams& s) {
                    << "' failed";
       return type_status;
     }
-    TF_CHECK_OK(ConvertDType(tf_dtype, &dtype));
 
     VLOG(2) << "accessing output index of: " << output_idx
             << ", at node: " << node_name
             << "with output entry from shape_map: " << op_info_vec.size();
-
     // TODO(ben,jie): update TRT input format/dimension
     nvinfer1::DimsCHW input_dim_psuedo_chw;
     for (int i = 0; i < 3; i++) input_dim_psuedo_chw.d[i] = 1;
 
+    // TODO(jie): TRT 3.x only support 4 dimensional input tensor.
+    //            update the code once TRT 4.0 comes out.
+    if (op_info.shape().dim_size() != 4) {
+      string err_str = "Require 4 dimensional input.";
+      StrAppend(&err_str, " Got ", op_info.shape().dim_size(), " ",
+                shape_inference_node_name);
+      return tensorflow::errors::Unimplemented(err_str);
+    }
+
     for (int i = 1; i < op_info.shape().dim_size(); i++) {
       VLOG(2) << "dimension: " << i
               << " , size: " << op_info.shape().dim(i).size();
@@ -2312,8 +2343,11 @@ tensorflow::Status InjectCalibrationNode(tensorrt::convert::SubGraphParams& s) {
 
     // TODO(ben,jie): proper way to restore input tensor name?
     auto input_tensor_name = node_name;
-    if (output_idx != 0) input_tensor_name = StrCat(node_name, ":", output_idx);
+    if (output_idx != 0) {
+      input_tensor_name = StrCat(node_name, ":", output_idx);
+    }
 
+    input_names.push_back(input_tensor_name);
     nvinfer1::ITensor* input_tensor = converter.network()->addInput(
         input_tensor_name.c_str(), dtype, input_dim_psuedo_chw);
 
@@ -2377,11 +2411,13 @@ tensorflow::Status InjectCalibrationNode(tensorrt::convert::SubGraphParams& s) {
     tensor->setType(trt_dtype);
   }
 
-  VLOG(2) << "finished output";
+  VLOG(2) << "Finished processing outputs";
 
   // Build the engine
   op_res->builder_->setMaxBatchSize(s.max_batch_size);
   op_res->builder_->setMaxWorkspaceSize(s.max_workspace_size_bytes);
+  VLOG(0) << "Max batch size= " << s.max_batch_size
+          << " max workspace size= " << s.max_workspace_size_bytes;
 
   // Build the TRT op
   // TODO(sami,ben,jie): proper naming!
@@ -2475,7 +2511,7 @@ tensorflow::Status ConvertSubGraphToTensorRTNodeDef(
   std::vector<string> input_names;
   std::vector<tensorflow::DataType> input_dtypes;
   for (const std::pair<int, int>& input : s.input_inds) {
-    VLOG(2) << "parsing input!!!!!";
+    VLOG(2) << "parsing input. Node id= " << input.first ;
     int node_id = input.first;
     int output_idx = input.second;
     tensorflow::Node* node = s.graph.FindNodeId(node_id);
diff --git a/tensorflow/contrib/testing/BUILD b/tensorflow/contrib/testing/BUILD
index 0be6aa755bee50451f6717139fd8e1315789b389..8a40e111d7723b0d1c332b9d2381169c8bed510f 100644
--- a/tensorflow/contrib/testing/BUILD
+++ b/tensorflow/contrib/testing/BUILD
@@ -22,15 +22,3 @@ py_library(
         "//third_party/py/numpy",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/testing/python/framework/fake_summary_writer.py b/tensorflow/contrib/testing/python/framework/fake_summary_writer.py
index 15a415df303df5be44e89c00005cb253ae2af286..eac34afc4adb268e909076ec9c5fb379cc95537b 100644
--- a/tensorflow/contrib/testing/python/framework/fake_summary_writer.py
+++ b/tensorflow/contrib/testing/python/framework/fake_summary_writer.py
@@ -52,6 +52,7 @@ class FakeSummaryWriter(object):
     self._added_graphs = []
     self._added_meta_graphs = []
     self._added_session_logs = []
+    self._added_run_metadata = {}
 
   @property
   def summaries(self):
@@ -127,6 +128,11 @@ class FakeSummaryWriter(object):
     # pylint: disable=unused-argument
     self._added_session_logs.append(session_log)
 
+  def add_run_metadata(self, run_metadata, tag, global_step=None):
+    if (global_step is not None) and (global_step < 0):
+      raise ValueError('Invalid global_step %s.' % global_step)
+    self._added_run_metadata[tag] = run_metadata
+
   def flush(self):
     pass
 
diff --git a/tensorflow/contrib/text/BUILD b/tensorflow/contrib/text/BUILD
index 698fdd830f57eb64c3c4119371f545908bf726e5..38d91f7e496d47ac74415da3bae91bad7f431dce 100644
--- a/tensorflow/contrib/text/BUILD
+++ b/tensorflow/contrib/text/BUILD
@@ -111,14 +111,3 @@ py_test(
         "//tensorflow/python:training",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-)
diff --git a/tensorflow/contrib/tfprof/BUILD b/tensorflow/contrib/tfprof/BUILD
index 28adce71d414d267bd53109751689c6f4d5d7b3b..e7f4ebdd36aa9d21ec1dc71ed200001eb0331704 100644
--- a/tensorflow/contrib/tfprof/BUILD
+++ b/tensorflow/contrib/tfprof/BUILD
@@ -20,15 +20,3 @@ py_library(
         "//tensorflow/python/profiler:tfprof_logger",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/timeseries/BUILD b/tensorflow/contrib/timeseries/BUILD
index 6ba069778ccf5bfba94921ac47db9233c63c0cfe..f2b8786a527289fe20de86447355fbf552cd265e 100644
--- a/tensorflow/contrib/timeseries/BUILD
+++ b/tensorflow/contrib/timeseries/BUILD
@@ -31,15 +31,3 @@ py_library(
         "//tensorflow/contrib/timeseries/python/timeseries/state_space_models:test_utils",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/timeseries/examples/BUILD b/tensorflow/contrib/timeseries/examples/BUILD
index bb86ecb2209f9bed3ad6c37f4b23bc7b361e1bd6..32e948a009741b126e21a64473ac2d020a25a7af 100644
--- a/tensorflow/contrib/timeseries/examples/BUILD
+++ b/tensorflow/contrib/timeseries/examples/BUILD
@@ -25,7 +25,10 @@ py_test(
     srcs = ["predict_test.py"],
     data = ["data/period_trend.csv"],
     srcs_version = "PY2AND3",
-    tags = ["notsan"],  # b/67513579
+    tags = [
+        "no_windows",  # TODO: needs investigation on Windows
+        "notsan",  # b/67513579
+    ],
     deps = [
         ":predict",
         "//tensorflow/python:client_testlib",
@@ -106,15 +109,3 @@ py_test(
         "//tensorflow/python/estimator:estimator_py",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/timeseries/examples/lstm_test.py b/tensorflow/contrib/timeseries/examples/lstm_test.py
index ca56e38ca079f71b38cf29605a295a50929945e8..c58e24e6d9748868791d21b0ff4ec28ca2f646c3 100644
--- a/tensorflow/contrib/timeseries/examples/lstm_test.py
+++ b/tensorflow/contrib/timeseries/examples/lstm_test.py
@@ -36,17 +36,14 @@ class LSTMExampleTest(test.TestCase):
   def test_periodicity_learned(self):
     (observed_times, observed_values,
      all_times, predicted_values) = lstm.train_and_predict(
-         training_steps=100, estimator_config=_SeedRunConfig(),
+         training_steps=2, estimator_config=_SeedRunConfig(),
          export_directory=self.get_temp_dir())
     self.assertAllEqual([100], observed_times.shape)
     self.assertAllEqual([100, 5], observed_values.shape)
     self.assertAllEqual([200], all_times.shape)
     self.assertAllEqual([200, 5], predicted_values.shape)
-    self.assertGreater(
-        predicted_values[100, 4]
-        - predicted_values[115, 4],  # Amplitude of fifth component
-        0.2)
-
+    # TODO(allenl): Make the model deterministic so you can check something
+    # substantive.
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/timeseries/python/timeseries/BUILD b/tensorflow/contrib/timeseries/python/timeseries/BUILD
index ed3ed4c0e1731df62e9197aa7471fd6a31e9858e..d2746032a04946cdfab4b5ac968ea3add5f6b51d 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/BUILD
+++ b/tensorflow/contrib/timeseries/python/timeseries/BUILD
@@ -88,10 +88,14 @@ py_library(
         "//tensorflow/python:array_ops",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
+        "//tensorflow/python:parsing_ops",
         "//tensorflow/python:tensor_shape",
+        "//tensorflow/python:tensor_util",
         "//tensorflow/python:training",
+        "//tensorflow/python:util",
         "//tensorflow/python/estimator:estimator_py",
         "//tensorflow/python/estimator:export",
+        "//tensorflow/python/feature_column",
     ],
 )
 
@@ -132,7 +136,6 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":feature_keys",
-        "//tensorflow/contrib/framework:framework_py",
         "//tensorflow/contrib/layers:layers_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:control_flow_ops",
@@ -141,6 +144,7 @@ py_library(
         "//tensorflow/python:math_ops",
         "//tensorflow/python:state_ops",
         "//tensorflow/python:summary",
+        "//tensorflow/python:training",
         "//tensorflow/python:util",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python/estimator:estimator_py",
@@ -156,23 +160,30 @@ py_test(
         "head_test.py",
     ],
     srcs_version = "PY2AND3",
-    tags = [
-        "no_pip_gpu",  # b/63391119
-    ],
+    tags = ["no_pip_gpu"],  # b/63391119
     deps = [
+        ":estimators",
         ":feature_keys",
         ":head",
+        ":input_pipeline",
         ":model",
         ":state_management",
+        "//tensorflow/contrib/timeseries/examples:lstm",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:metrics",
+        "//tensorflow/python:session",
         "//tensorflow/python:training",
         "//tensorflow/python:variables",
         "//tensorflow/python/estimator:estimator_py",
+        "//tensorflow/python/feature_column",
+        "//tensorflow/python/saved_model:loader",
+        "//tensorflow/python/saved_model:tag_constants",
+        "//third_party/py/numpy",
+        "@six_archive//:six",
     ],
 )
 
@@ -235,6 +246,7 @@ py_test(
     ],
     srcs_version = "PY2AND3",
     tags = [
+        "no_oss",
         "no_pip",  # b/64527635
         "no_pip_gpu",  # b/63391119
     ],
@@ -427,6 +439,7 @@ py_test(
     srcs_version = "PY2AND3",
     tags = [
         "no_pip_gpu",  # b/63391119
+        "no_windows",  # TODO: needs investigation on Windows
     ],
     deps = [
         ":feature_keys",
@@ -442,15 +455,3 @@ py_test(
         "//third_party/py/numpy",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/timeseries/python/timeseries/estimators.py b/tensorflow/contrib/timeseries/python/timeseries/estimators.py
index 469cea4fd2fca65373eef85b1931a267e6e60238..886e1846e2a4f75503a47a3ff92adf97f814053f 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/estimators.py
+++ b/tensorflow/contrib/timeseries/python/timeseries/estimators.py
@@ -44,7 +44,7 @@ class TimeSeriesRegressor(estimator_lib.Estimator):
   """An Estimator to fit and evaluate a time series model."""
 
   def __init__(self, model, state_manager=None, optimizer=None, model_dir=None,
-               config=None):
+               config=None, head_type=ts_head_lib.TimeSeriesRegressionHead):
     """Initialize the Estimator.
 
     Args:
@@ -55,6 +55,8 @@ class TimeSeriesRegressor(estimator_lib.Estimator):
           from tf.train.Optimizer. Defaults to Adam with step size 0.02.
       model_dir: See `Estimator`.
       config: See `Estimator`.
+      head_type: The kind of head to use for the model (inheriting from
+          `TimeSeriesRegressionHead`).
     """
     input_statistics_generator = math_utils.InputStatisticsFromMiniBatch(
         dtype=model.dtype, num_features=model.num_features)
@@ -63,8 +65,8 @@ class TimeSeriesRegressor(estimator_lib.Estimator):
     if optimizer is None:
       optimizer = train.AdamOptimizer(0.02)
     self._model = model
-    ts_regression_head = ts_head_lib.time_series_regression_head(
-        model, state_manager, optimizer,
+    ts_regression_head = head_type(
+        model=model, state_manager=state_manager, optimizer=optimizer,
         input_statistics_generator=input_statistics_generator)
     model_fn = ts_regression_head.create_estimator_spec
     super(TimeSeriesRegressor, self).__init__(
diff --git a/tensorflow/contrib/timeseries/python/timeseries/estimators_test.py b/tensorflow/contrib/timeseries/python/timeseries/estimators_test.py
index f4304f2560a82b666f87f302a821a39b0e9e140e..9f161c1695f415ad28c41ad0c00bc0b056399b96 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/estimators_test.py
+++ b/tensorflow/contrib/timeseries/python/timeseries/estimators_test.py
@@ -19,6 +19,7 @@ from __future__ import print_function
 import tempfile
 
 import numpy
+import six
 
 from tensorflow.contrib.timeseries.python.timeseries import ar_model
 from tensorflow.contrib.timeseries.python.timeseries import estimators
@@ -126,6 +127,33 @@ class TimeSeriesRegressorTest(test.TestCase):
             signatures=signatures,
             session=sess)
 
+        # Test cold starting
+        six.assertCountEqual(
+            self,
+            [feature_keys.FilteringFeatures.TIMES,
+             feature_keys.FilteringFeatures.VALUES],
+            signatures.signature_def[
+                feature_keys.SavedModelLabels.COLD_START_FILTER].inputs.keys())
+        batch_numpy_times = numpy.tile(
+            numpy.arange(30, dtype=numpy.int64)[None, :], (10, 1))
+        batch_numpy_values = numpy.ones([10, 30, 1])
+        state = saved_model_utils.cold_start_filter(
+            signatures=signatures,
+            session=sess,
+            features={
+                feature_keys.FilteringFeatures.TIMES: batch_numpy_times,
+                feature_keys.FilteringFeatures.VALUES: batch_numpy_values
+            }
+        )
+        predict_times = numpy.tile(
+            numpy.arange(30, 45, dtype=numpy.int64)[None, :], (10, 1))
+        predictions = saved_model_utils.predict_continuation(
+            continue_from=state,
+            times=predict_times,
+            signatures=signatures,
+            session=sess)
+        self.assertAllEqual([10, 15, 1], predictions["mean"].shape)
+
   def test_fit_restore_fit_ar_regressor(self):
     def _estimator_fn(model_dir):
       return estimators.ARRegressor(
diff --git a/tensorflow/contrib/timeseries/python/timeseries/head.py b/tensorflow/contrib/timeseries/python/timeseries/head.py
index 3d7e61529014ff5045c3b64fb945ceb9c902dd0d..a28a5872b850b51630240bdeb3ff22f372613523 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/head.py
+++ b/tensorflow/contrib/timeseries/python/timeseries/head.py
@@ -39,27 +39,18 @@ from tensorflow.python.util import nest
 from tensorflow.python.summary import summary
 
 
-def time_series_regression_head(model,
-                                state_manager,
-                                optimizer,
-                                input_statistics_generator=None):
-  """Creates a `_Head` for time series regression.
+class _NoStatePredictOutput(export_lib.PredictOutput):
 
-  Args:
-    model: A model for time series regression.
-    state_manager: A state manager.
-    optimizer: An optimizer.
-    input_statistics_generator: A input statistics generator.
-
-  Returns:
-    An instance of `_Head` for time series regression.
-  """
-  return _TimeSeriesRegressionHead(model, state_manager, optimizer,
-                                   input_statistics_generator)
+  def as_signature_def(self, receiver_tensors):
+    no_state_receiver_tensors = {
+        key: value for key, value in receiver_tensors.items()
+        if not key.startswith(feature_keys.State.STATE_PREFIX)}
+    return super(_NoStatePredictOutput, self).as_signature_def(
+        receiver_tensors=no_state_receiver_tensors)
 
 
-class _TimeSeriesRegressionHead(head_lib._Head):  # pylint:disable=protected-access
-  """See `time_series_regression_head`."""
+class TimeSeriesRegressionHead(head_lib._Head):  # pylint:disable=protected-access
+  """Determines input and output signatures for a time series model."""
 
   def __init__(self,
                model,
@@ -67,6 +58,15 @@ class _TimeSeriesRegressionHead(head_lib._Head):  # pylint:disable=protected-acc
                optimizer,
                input_statistics_generator=None,
                name=None):
+    """Creates a `_Head` for time series regression.
+
+    Args:
+      model: A model for time series regression.
+      state_manager: A state manager.
+      optimizer: An optimizer.
+      input_statistics_generator: A input statistics generator.
+      name: An optional name for the model.
+    """
     self.model = model
     self.state_manager = state_manager
     self.optimizer = optimizer
@@ -154,8 +154,10 @@ class _TimeSeriesRegressionHead(head_lib._Head):  # pylint:disable=protected-acc
       no_state_features = {
           k: v for k, v in features.items()
           if not k.startswith(feature_keys.State.STATE_PREFIX)}
-      cold_filtering_outputs = self.create_loss(
-          no_state_features, estimator_lib.ModeKeys.EVAL)
+      # Ignore any state management when cold-starting. The model's default
+      # start state is replicated across the batch.
+      cold_filtering_outputs = self.model.define_loss(
+          features=no_state_features, mode=estimator_lib.ModeKeys.EVAL)
     return estimator_lib.EstimatorSpec(
         mode=estimator_lib.ModeKeys.PREDICT,
         export_outputs={
@@ -165,7 +167,7 @@ class _TimeSeriesRegressionHead(head_lib._Head):  # pylint:disable=protected-acc
                 export_lib.PredictOutput(
                     state_to_dictionary(filtering_outputs.end_state)),
             feature_keys.SavedModelLabels.COLD_START_FILTER:
-                export_lib.PredictOutput(
+                _NoStatePredictOutput(
                     state_to_dictionary(cold_filtering_outputs.end_state))
         },
         # Likely unused, but it is necessary to return `predictions` to satisfy
@@ -253,6 +255,58 @@ class _TimeSeriesRegressionHead(head_lib._Head):  # pylint:disable=protected-acc
         return self._serving_ops(features)
 
 
+class OneShotPredictionHead(TimeSeriesRegressionHead):
+  """A time series head which exports a single stateless serving signature.
+
+  The serving default signature exported by this head expects `times`, `values`,
+  and any exogenous features, but no state. `values` has shape `[batch_size,
+  filter_length, num_features]` and `times` has shape `[batch_size,
+  total_length]`, where `total_length > filter_length`. Any exogenous features
+  must have their shapes prefixed by the shape of the `times` feature.
+
+  When serving, first performs filtering on the series up to `filter_length`
+  starting from the default start state for the model, then computes predictions
+  on the remainder of the series, returning them.
+
+  Model state is neither accepted nor returned, so filtering must be performed
+  each time predictions are requested when using this head.
+  """
+
+  def _serving_ops(self, features):
+    """Add ops for serving to the graph."""
+    with variable_scope.variable_scope("model", use_resource=True):
+      filtering_features = {}
+      prediction_features = {}
+      values_length = array_ops.shape(
+          features[feature_keys.FilteringFeatures.VALUES])[1]
+      for key, value in features.items():
+        if key == feature_keys.State.STATE_TUPLE:
+          # Ignore state input. The model's default start state is replicated
+          # across the batch.
+          continue
+        if key == feature_keys.FilteringFeatures.VALUES:
+          filtering_features[key] = value
+        else:
+          filtering_features[key] = value[:, :values_length]
+          prediction_features[key] = value[:, values_length:]
+      cold_filtering_outputs = self.model.define_loss(
+          features=filtering_features, mode=estimator_lib.ModeKeys.EVAL)
+      prediction_features[feature_keys.State.STATE_TUPLE] = (
+          cold_filtering_outputs.end_state)
+    with variable_scope.variable_scope("model", reuse=True):
+      prediction_outputs = self.model.predict(
+          features=prediction_features)
+    return estimator_lib.EstimatorSpec(
+        mode=estimator_lib.ModeKeys.PREDICT,
+        export_outputs={
+            feature_keys.SavedModelLabels.PREDICT:
+                _NoStatePredictOutput(prediction_outputs),
+        },
+        # Likely unused, but it is necessary to return `predictions` to satisfy
+        # the Estimator's error checking.
+        predictions={})
+
+
 def _check_feature_shapes_compatible_with(features,
                                           compatible_with_name,
                                           compatible_with_value,
diff --git a/tensorflow/contrib/timeseries/python/timeseries/head_test.py b/tensorflow/contrib/timeseries/python/timeseries/head_test.py
index 3415061cfd87358cccaf36dcb301fb36986bbde6..c606db76a668235ab6a837159b9dec072b5fd801 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/head_test.py
+++ b/tensorflow/contrib/timeseries/python/timeseries/head_test.py
@@ -18,12 +18,20 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import numpy
+import six
+
+from tensorflow.contrib.timeseries.examples import lstm as lstm_example
+from tensorflow.contrib.timeseries.python.timeseries import estimators as ts_estimators
 from tensorflow.contrib.timeseries.python.timeseries import feature_keys
 from tensorflow.contrib.timeseries.python.timeseries import head as ts_head_lib
+from tensorflow.contrib.timeseries.python.timeseries import input_pipeline
 from tensorflow.contrib.timeseries.python.timeseries import model
 from tensorflow.contrib.timeseries.python.timeseries import state_management
 
+from tensorflow.python.client import session as session_lib
 from tensorflow.python.estimator import estimator_lib
+from tensorflow.python.feature_column import feature_column
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
@@ -31,6 +39,9 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import metrics
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
+from tensorflow.python.saved_model import loader
+from tensorflow.python.saved_model import tag_constants
+from tensorflow.python.training import adam
 from tensorflow.python.training import coordinator as coordinator_lib
 from tensorflow.python.training import queue_runner_impl
 from tensorflow.python.training import training as train
@@ -90,7 +101,7 @@ class EvaluationMetricsTests(test.TestCase):
                       .count_up_to(10),
                       dtype=dtypes.float32), (1, 1, 1))
       }
-      model_fn = ts_head_lib.time_series_regression_head(
+      model_fn = ts_head_lib.TimeSeriesRegressionHead(
           model=_TickerModel(),
           state_manager=state_management.PassthroughStateManager(),
           optimizer=train.GradientDescentOptimizer(0.001)).create_estimator_spec
@@ -127,7 +138,7 @@ class _StubModel(object):
 
 
 def _stub_model_fn():
-  return ts_head_lib.time_series_regression_head(
+  return ts_head_lib.TimeSeriesRegressionHead(
       model=_StubModel(),
       state_manager=state_management.PassthroughStateManager(),
       optimizer=train.AdamOptimizer(0.001)).create_estimator_spec
@@ -263,5 +274,76 @@ class PredictFeatureCheckingTests(test.TestCase):
           mode=estimator_lib.ModeKeys.PREDICT)
 
 
+class OneShotTests(test.TestCase):
+
+  def test_one_shot_prediction_head_export(self):
+    model_dir = self.get_temp_dir()
+    categorical_column = feature_column.categorical_column_with_hash_bucket(
+        key="categorical_exogenous_feature", hash_bucket_size=16)
+    exogenous_feature_columns = [
+        feature_column.numeric_column(
+            "2d_exogenous_feature", shape=(2,)),
+        feature_column.embedding_column(
+            categorical_column=categorical_column, dimension=10)]
+    estimator = ts_estimators.TimeSeriesRegressor(
+        model=lstm_example._LSTMModel(
+            num_features=5, num_units=128,
+            exogenous_feature_columns=exogenous_feature_columns),
+        optimizer=adam.AdamOptimizer(0.001),
+        config=estimator_lib.RunConfig(tf_random_seed=4),
+        state_manager=state_management.ChainingStateManager(),
+        head_type=ts_head_lib.OneShotPredictionHead,
+        model_dir=model_dir)
+    train_features = {
+        feature_keys.TrainEvalFeatures.TIMES: numpy.arange(
+            20, dtype=numpy.int64),
+        feature_keys.TrainEvalFeatures.VALUES: numpy.tile(numpy.arange(
+            20, dtype=numpy.float32)[:, None], [1, 5]),
+        "2d_exogenous_feature": numpy.ones([20, 2]),
+        "categorical_exogenous_feature": numpy.array(
+            ["strkey"] * 20)[:, None]
+    }
+    train_input_fn = input_pipeline.RandomWindowInputFn(
+        input_pipeline.NumpyReader(train_features), shuffle_seed=2,
+        num_threads=1, batch_size=16, window_size=16)
+    estimator.train(input_fn=train_input_fn, steps=5)
+    input_receiver_fn = estimator.build_raw_serving_input_receiver_fn()
+    export_location = estimator.export_savedmodel(self.get_temp_dir(),
+                                                  input_receiver_fn)
+    graph = ops.Graph()
+    with graph.as_default():
+      with session_lib.Session() as session:
+        signatures = loader.load(
+            session, [tag_constants.SERVING], export_location)
+        self.assertEqual([feature_keys.SavedModelLabels.PREDICT],
+                         list(signatures.signature_def.keys()))
+        predict_signature = signatures.signature_def[
+            feature_keys.SavedModelLabels.PREDICT]
+        six.assertCountEqual(
+            self,
+            [feature_keys.FilteringFeatures.TIMES,
+             feature_keys.FilteringFeatures.VALUES,
+             "2d_exogenous_feature",
+             "categorical_exogenous_feature"],
+            predict_signature.inputs.keys())
+        features = {
+            feature_keys.TrainEvalFeatures.TIMES: numpy.tile(
+                numpy.arange(35, dtype=numpy.int64)[None, :], [2, 1]),
+            feature_keys.TrainEvalFeatures.VALUES: numpy.tile(numpy.arange(
+                20, dtype=numpy.float32)[None, :, None], [2, 1, 5]),
+            "2d_exogenous_feature": numpy.ones([2, 35, 2]),
+            "categorical_exogenous_feature": numpy.tile(numpy.array(
+                ["strkey"] * 35)[None, :, None], [2, 1, 1])
+        }
+        feeds = {
+            graph.as_graph_element(input_value.name): features[input_key]
+            for input_key, input_value in predict_signature.inputs.items()}
+        fetches = {output_key: graph.as_graph_element(output_value.name)
+                   for output_key, output_value
+                   in predict_signature.outputs.items()}
+        output = session.run(fetches, feed_dict=feeds)
+        self.assertAllEqual((2, 15, 5), output["mean"].shape)
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/timeseries/python/timeseries/state_space_models/BUILD b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/BUILD
index c86d06e9236962cbabbc56afa1cfe213e0c78bc0..5d33e23a427bd54fd02b0eb7489f84d189e05e35 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/state_space_models/BUILD
+++ b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/BUILD
@@ -40,6 +40,7 @@ py_test(
     timeout = "long",  # Moderate but for asan
     srcs = ["state_space_model_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["no_windows"],  # TODO: needs investigation on Windows
     deps = [
         ":state_space_model",
         "//tensorflow/contrib/layers:layers_py",
@@ -268,15 +269,3 @@ py_library(
         "//third_party/py/numpy",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/tpu/BUILD b/tensorflow/contrib/tpu/BUILD
index eea19e9465e482dfd1ea9a144435c23a2ecf1467..3e489d38b68fb759fb7a404ed108714e6a3c2216 100644
--- a/tensorflow/contrib/tpu/BUILD
+++ b/tensorflow/contrib/tpu/BUILD
@@ -46,7 +46,7 @@ py_library(
     deps = [
         ":tpu_lib",
         ":tpu_py",
-        "//tensorflow/contrib/summary:summary_ops",
+        "//tensorflow/contrib/training:training_py",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:control_flow_ops",
@@ -56,6 +56,7 @@ py_library(
         "//tensorflow/python:platform",
         "//tensorflow/python:state_ops",
         "//tensorflow/python:summary",
+        "//tensorflow/python:summary_ops_v2",
         "//tensorflow/python:training",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python:variables",
@@ -119,6 +120,8 @@ py_library(
     srcs = ["python/profiler/__init__.py"],
     srcs_version = "PY2AND3",
     deps = [
+        "//tensorflow/contrib/tpu/profiler:tpu_profiler_analysis_pb2_grpc",
+        "//tensorflow/contrib/tpu/profiler:tpu_profiler_analysis_proto_py",
         "//tensorflow/contrib/tpu/profiler:trace_events_proto_py",
         "//tensorflow/python:util",
     ],
@@ -157,6 +160,7 @@ py_library(
     name = "tpu_lib",
     srcs = [
         "python/tpu/__init__.py",
+        "python/tpu/bfloat16.py",
         "python/tpu/device_assignment.py",
         "python/tpu/topology.py",
         "python/tpu/tpu.py",
@@ -212,6 +216,7 @@ tf_py_test(
         ":datasets",
     ],
     grpc_enabled = True,
+    tags = ["no_windows"],
 )
 
 tf_py_test(
@@ -225,6 +230,7 @@ tf_py_test(
         "//tensorflow/python:framework",
         "//tensorflow/python:layers",
     ],
+    tags = ["no_windows"],  # TODO: needs investigation on Windows
 )
 
 tf_py_test(
@@ -238,6 +244,17 @@ tf_py_test(
     ],
 )
 
+tf_py_test(
+    name = "bfloat16_test",
+    size = "small",
+    srcs = ["python/tpu/bfloat16_test.py"],
+    additional_deps = [
+        ":tpu",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework",
+    ],
+)
+
 tf_py_test(
     name = "tpu_infeed_test",
     size = "small",
@@ -281,16 +298,3 @@ tf_py_test(
         "//tensorflow/python:framework_test_lib",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        include = [
-            "**/*",
-        ],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-)
diff --git a/tensorflow/contrib/tpu/__init__.py b/tensorflow/contrib/tpu/__init__.py
index ea6e874f2d952b03e8cdabeee00ccfe1b076a0d0..bb60f3e2d771b351058322b908dfe68df4abed30 100644
--- a/tensorflow/contrib/tpu/__init__.py
+++ b/tensorflow/contrib/tpu/__init__.py
@@ -53,6 +53,7 @@ from __future__ import print_function
 # pylint: disable=wildcard-import,unused-import
 from tensorflow.contrib.tpu.python import profiler
 from tensorflow.contrib.tpu.python.ops.tpu_ops import *
+from tensorflow.contrib.tpu.python.tpu.bfloat16 import *
 from tensorflow.contrib.tpu.python.tpu.device_assignment import *
 from tensorflow.contrib.tpu.python.tpu.topology import *
 from tensorflow.contrib.tpu.python.tpu.tpu import *
diff --git a/tensorflow/contrib/tpu/ops/replication_ops.cc b/tensorflow/contrib/tpu/ops/replication_ops.cc
index cba71c6b98e1079de6c6c4c32fa2ffc44a9ce71e..3bdf7c2f83b037984a45cea99910df87c967aa40 100644
--- a/tensorflow/contrib/tpu/ops/replication_ops.cc
+++ b/tensorflow/contrib/tpu/ops/replication_ops.cc
@@ -27,6 +27,7 @@ REGISTER_OP("TPUReplicateMetadata")
     .Attr("topology: string = \"\"")
     .Attr("device_assignment: list(int) = []")
     .Attr("computation_shape: list(int) = []")
+    .Attr("host_compute_core: list(string) = []")
     .SetShapeFn(shape_inference::UnknownShape);
 
 REGISTER_OP("TPUReplicatedInput")
@@ -68,6 +69,7 @@ REGISTER_OP("TPUReplicate")
     .Attr("num_replicas: int >= 1")
     .Attr("topology: string = \"\"")
     .Attr("device_assignment: list(int) = []")
+    .Attr("host_compute_core: list(string) = []")
     .Attr("computation_shape: list(int) = []")
     .Attr("Tinputs: list(type) >= 0")
     .Attr("Tbroadcast_inputs: list(type) >= 0")
diff --git a/tensorflow/contrib/tpu/profiler/BUILD b/tensorflow/contrib/tpu/profiler/BUILD
index 0a52d0b13b7c8749ad44377659714d297ffec3ee..1c32993e8e546a17b8b3c289a306ad8f8388c345 100644
--- a/tensorflow/contrib/tpu/profiler/BUILD
+++ b/tensorflow/contrib/tpu/profiler/BUILD
@@ -6,18 +6,6 @@ load("//tensorflow/core:platform/default/build_config.bzl", "tf_proto_library")
 load("//tensorflow/core:platform/default/build_config.bzl", "tf_proto_library_cc")
 load("//tensorflow/core:platform/default/build_config.bzl", "tf_additional_all_protos")
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 tf_proto_library(
     name = "tpu_profiler_proto",
     srcs = ["tpu_profiler.proto"],
@@ -127,7 +115,5 @@ py_library(
     srcs = ["tpu_profiler_analysis_pb2_grpc.py"],
     srcs_version = "PY2AND3",
     visibility = ["//visibility:public"],
-    deps = [
-        ":tpu_profiler_analysis_proto_py",
-    ],
+    deps = [":tpu_profiler_analysis_proto_py"],
 )
diff --git a/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc b/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc
index e6811d4ad204edb318638c698090479436f38ecd..6b198dbc16e544686e35c1ffee8a7f4d3955dafc 100644
--- a/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc
+++ b/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc
@@ -64,14 +64,20 @@ Status ValidateHostPortPair(const string& host_port) {
   return Status::OK();
 }
 
-ProfileResponse Profile(const string& service_addr, int duration_ms,
-                        const string& repository_root, const string& session_id,
-                        const ProfileOptions& opts) {
+// Returns whether the returned trace is empty.
+// Failure are handled by CHECK, i.e. abort()
+bool Profile(const string& service_addr, const string& logdir, int duration_ms,
+             const string& repository_root, const string& session_id,
+             const ProfileOptions& opts) {
   ProfileRequest request;
   request.set_duration_ms(duration_ms);
   request.set_max_events(kMaxEvents);
-  request.set_repository_root(repository_root);
-  request.set_session_id(session_id);
+  if (tensorflow::str_util::StartsWith(repository_root, "gs://")) {
+    // For backward compatibilities, only generate tracetable etc when the
+    // user provide a GCS path for model directory.
+    request.set_repository_root(repository_root);
+    request.set_session_id(session_id);
+  }
   request.add_tools("input_pipeline");
   request.add_tools("overview_page");
   *request.mutable_opts() = opts;
@@ -90,7 +96,31 @@ ProfileResponse Profile(const string& service_addr, int duration_ms,
           channel_args));
   ProfileResponse response;
   TF_QCHECK_OK(FromGrpcStatus(stub->Profile(&context, request, &response)));
-  return response;
+
+  if (!response.encoded_trace().empty()) {
+    TF_CHECK_OK(tensorflow::tpu::WriteTensorboardTPUProfile(
+        logdir, session_id, "", response, &std::cout));
+    // Print this at the end so that it's not buried in irrelevant LOG messages.
+    std::cout
+        << "NOTE: using the trace duration " << duration_ms << "ms."
+        << std::endl
+        << "Set an appropriate duration (with --duration_ms) if you "
+           "don't see a full step in your trace or the captured trace is too "
+           "large."
+        << std::endl;
+  }
+
+  return response.encoded_trace().empty();
+}
+
+// Start a new profiling session that include all the hosts included in
+// hostnames, for the time interval of duration_ms. Possibly save the profiling
+// result in the directory specified by repository_root and session_id.
+bool NewSession(const string& service_addr,
+                const std::vector<tensorflow::string>& hostnames,
+                int duration_ms, const string& repository_root,
+                const string& session_id, const ProfileOptions& opts) {
+  return true;
 }
 
 }  // namespace
@@ -100,12 +130,16 @@ ProfileResponse Profile(const string& service_addr, int duration_ms,
 int main(int argc, char** argv) {
   tensorflow::string FLAGS_service_addr;
   tensorflow::string FLAGS_logdir;
+  tensorflow::string FLAGS_workers_list;
   int FLAGS_duration_ms = 2000;
   int FLAGS_num_tracing_attempts = 3;
   bool FLAGS_include_dataset_ops = true;
   std::vector<tensorflow::Flag> flag_list = {
       tensorflow::Flag("service_addr", &FLAGS_service_addr,
                        "Address of TPU profiler service e.g. localhost:8466"),
+      tensorflow::Flag("workers_list", &FLAGS_workers_list,
+                       "The list of worker TPUs that we are about to profile "
+                       "in the current session."),
       tensorflow::Flag("logdir", &FLAGS_logdir,
                        "Path of TensorBoard log directory e.g. /tmp/tb_log, "
                        "gs://tb_bucket"),
@@ -149,18 +183,30 @@ int main(int argc, char** argv) {
   constexpr char kProfilePluginDirectory[] = "plugins/profile/";
   tensorflow::string repository_root =
       ::tensorflow::io::JoinPath(FLAGS_logdir, kProfilePluginDirectory);
+  std::vector<tensorflow::string> hostnames =
+      tensorflow::str_util::Split(FLAGS_workers_list, ",");
+
+  bool empty_trace = false;
   while (true) {
     std::cout << "Starting to profile TPU traces for " << duration_ms << " ms. "
               << "Remaining attempt(s): " << remaining_attempts-- << std::endl;
-    response = tensorflow::tpu::Profile(FLAGS_service_addr, duration_ms,
-                                        repository_root, session_id, opts);
-    if (remaining_attempts <= 0 || !response.encoded_trace().empty()) break;
+    if (hostnames.empty()) {
+      empty_trace = tensorflow::tpu::Profile(FLAGS_service_addr, FLAGS_logdir,
+                                             duration_ms, repository_root,
+                                             session_id, opts);
+    } else {
+      tensorflow::string tpu_master = FLAGS_service_addr;
+      empty_trace =
+          tensorflow::tpu::NewSession(tpu_master, hostnames, duration_ms,
+                                      repository_root, session_id, opts);
+    }
+    if (remaining_attempts <= 0 || !empty_trace) break;
     std::cout << "No trace event is collected. Automatically retrying."
               << std::endl
               << std::endl;
   }
 
-  if (response.encoded_trace().empty()) {
+  if (empty_trace) {
     std::cout << "No trace event is collected after "
               << FLAGS_num_tracing_attempts << " attempt(s). "
               << "Perhaps, you want to try again (with more attempts?)."
@@ -171,13 +217,5 @@ int main(int argc, char** argv) {
     return 0;
   }
 
-  TF_CHECK_OK(tensorflow::tpu::WriteTensorboardTPUProfile(
-      FLAGS_logdir, session_id, response, &std::cout));
-  // Print this at the end so that it's not buried in irrelevant LOG messages.
-  std::cout
-      << "NOTE: using the trace duration " << duration_ms << "ms." << std::endl
-      << "Set an appropriate duration (with --duration_ms) if you "
-         "don't see a full step in your trace or the captured trace is too "
-         "large."
-      << std::endl;
+  return 0;
 }
diff --git a/tensorflow/contrib/tpu/profiler/dump_tpu_profile.cc b/tensorflow/contrib/tpu/profiler/dump_tpu_profile.cc
index ebd6185faad28ae7a22eb33f6b358eb2344c9c22..ae508583f848a94d9a52d8663af96d85d8fff74c 100644
--- a/tensorflow/contrib/tpu/profiler/dump_tpu_profile.cc
+++ b/tensorflow/contrib/tpu/profiler/dump_tpu_profile.cc
@@ -41,6 +41,7 @@ namespace {
 using ::tensorflow::io::JoinPath;
 using ::tensorflow::protobuf::util::JsonOptions;
 using ::tensorflow::protobuf::util::MessageToJsonString;
+using ::tensorflow::strings::StrCat;
 
 constexpr char kGraphRunPrefix[] = "tpu_profiler.hlo_graph.";
 constexpr char kJsonOpProfileFileName[] = "op_profile.json";
@@ -61,28 +62,33 @@ Status WriteGzippedDataToFile(const string& filename, const string& data) {
   return Status::OK();
 }
 
-Status DumpTraceToLogDirectory(StringPiece run_dir, const string& encoded_trace,
-                               std::ostream* os) {
+Status DumpTraceToLogDirectory(StringPiece run_dir, const string& host_prefix,
+                               const string& encoded_trace, std::ostream* os) {
   string proto_path = JoinPath(run_dir, kProtoTraceFileName);
   TF_RETURN_IF_ERROR(
       WriteStringToFile(Env::Default(), proto_path, encoded_trace));
   LOG(INFO) << "Dumped raw-proto trace data to " << proto_path;
 
-  string json_path = JoinPath(run_dir, kJsonTraceFileName);
+  string json_path = JoinPath(run_dir, StrCat(host_prefix, kJsonTraceFileName));
   Trace trace;
   trace.ParseFromString(encoded_trace);
-  *os << "Trace contains " << trace.trace_events_size() << " events."
-      << std::endl;
+  if (os) {
+    *os << "Trace contains " << trace.trace_events_size() << " events."
+        << std::endl;
+  }
   TF_RETURN_IF_ERROR(
       WriteGzippedDataToFile(json_path, TraceEventsToJson(trace)));
-  *os << "Dumped JSON trace data to " << json_path << std::endl;
+  if (os) {
+    *os << "Dumped JSON trace data to " << json_path << std::endl;
+  }
   return Status::OK();
 }
 
 Status DumpOpProfileToLogDirectory(StringPiece run_dir,
+                                   const string& host_prefix,
                                    const tpu::op_profile::Profile& profile,
                                    std::ostream* os) {
-  string path = JoinPath(run_dir, kJsonOpProfileFileName);
+  string path = JoinPath(run_dir, StrCat(host_prefix, kJsonOpProfileFileName));
   string json;
   JsonOptions options;
   options.always_print_primitive_fields = true;
@@ -93,49 +99,20 @@ Status DumpOpProfileToLogDirectory(StringPiece run_dir,
         string(status.error_message()));
   }
   TF_RETURN_IF_ERROR(WriteStringToFile(Env::Default(), path, json));
-  *os << "Dumped json op profile data to " << path << std::endl;
+  if (os) {
+    *os << "Dumped json op profile data to " << path << std::endl;
+  }
   return Status::OK();
 }
 
 Status DumpToolDataToLogDirectory(StringPiece run_dir,
+                                  const string& host_prefix,
                                   const tensorflow::ProfileToolData& tool,
                                   std::ostream* os) {
-  string path = JoinPath(run_dir, tool.name());
+  string path = JoinPath(run_dir, StrCat(host_prefix, tool.name()));
   TF_RETURN_IF_ERROR(WriteStringToFile(Env::Default(), path, tool.data()));
-  *os << "Dumped tool data for " << tool.name() << " to " << path << std::endl;
-  return Status::OK();
-}
-
-Status DumpGraphEvents(const string& logdir, const string& run,
-                       const ProfileResponse& response, std::ostream* os) {
-  int num_graphs = response.computation_graph_size();
-  if (response.computation_graph_size() == 0) return Status::OK();
-  // The server might generates multiple graphs for one program; we simply
-  // pick the first one.
-  if (num_graphs > 1) {
-    *os << num_graphs
-        << " TPU program variants observed over the profiling period. "
-        << "One computation graph will be chosen arbitrarily." << std::endl;
-  }
-  // The graph plugin expects the graph in <logdir>/<run>/<event.file>.
-  string run_dir = JoinPath(logdir, strings::StrCat(kGraphRunPrefix, run));
-  TF_RETURN_IF_ERROR(Env::Default()->RecursivelyCreateDir(run_dir));
-  EventsWriter event_writer(JoinPath(run_dir, "events"));
-  Event event;
-  // Add the computation graph.
-  event.set_graph_def(response.computation_graph(0).SerializeAsString());
-  event_writer.WriteEvent(event);
-  *os << "Wrote a HLO graph to " << event_writer.FileName() << std::endl;
-
-  if (response.has_hlo_metadata()) {
-    tensorflow::TaggedRunMetadata tagged_run_metadata;
-    tagged_run_metadata.set_tag(run);
-    tagged_run_metadata.set_run_metadata(
-        response.hlo_metadata().SerializeAsString());
-    tensorflow::Event meta_event;
-    *meta_event.mutable_tagged_run_metadata() = tagged_run_metadata;
-    event_writer.WriteEvent(meta_event);
-    *os << "Wrote HLO ops run metadata to " << event_writer.FileName()
+  if (os) {
+    *os << "Dumped tool data for " << tool.name() << " to " << path
         << std::endl;
   }
   return Status::OK();
@@ -144,27 +121,29 @@ Status DumpGraphEvents(const string& logdir, const string& run,
 }  // namespace
 
 Status WriteTensorboardTPUProfile(const string& logdir, const string& run,
+                                  const string& host,
                                   const ProfileResponse& response,
                                   std::ostream* os) {
   // Dumps profile data to <logdir>/plugins/profile/<run>/.
+  string host_prefix = host.empty() ? "" : StrCat(host, ".");
   string profile_run_dir = JoinPath(logdir, kProfilePluginDirectory, run);
   TF_RETURN_IF_ERROR(Env::Default()->RecursivelyCreateDir(profile_run_dir));
 
   // Ignore computation_graph for now.
   if (!response.encoded_trace().empty()) {
     LOG(INFO) << "Converting trace events to TraceViewer JSON.";
-    TF_RETURN_IF_ERROR(
-        DumpTraceToLogDirectory(profile_run_dir, response.encoded_trace(), os));
+    TF_RETURN_IF_ERROR(DumpTraceToLogDirectory(profile_run_dir, host_prefix,
+                                               response.encoded_trace(), os));
   }
   if (response.has_op_profile() &&
       (response.op_profile().has_by_program_structure() ||
        response.op_profile().has_by_category())) {
-    TF_RETURN_IF_ERROR(DumpOpProfileToLogDirectory(profile_run_dir,
+    TF_RETURN_IF_ERROR(DumpOpProfileToLogDirectory(profile_run_dir, host_prefix,
                                                    response.op_profile(), os));
   }
   for (const auto& tool_data : response.tool_data()) {
-    TF_RETURN_IF_ERROR(
-        DumpToolDataToLogDirectory(profile_run_dir, tool_data, os));
+    TF_RETURN_IF_ERROR(DumpToolDataToLogDirectory(profile_run_dir, host_prefix,
+                                                  tool_data, os));
   }
 
   return Status::OK();
diff --git a/tensorflow/contrib/tpu/profiler/dump_tpu_profile.h b/tensorflow/contrib/tpu/profiler/dump_tpu_profile.h
index 29ef977bacfd61e163be49558c5b94277ed479c1..ecf21b1de2219e8896d5e8b79325a193de0b0fa1 100644
--- a/tensorflow/contrib/tpu/profiler/dump_tpu_profile.h
+++ b/tensorflow/contrib/tpu/profiler/dump_tpu_profile.h
@@ -32,6 +32,7 @@ namespace tpu {
 // Note: this function creates a directory even when all fields in
 // ProfileResponse are unset/empty.
 Status WriteTensorboardTPUProfile(const string& logdir, const string& run,
+                                  const string& host,
                                   const ProfileResponse& response,
                                   std::ostream* os);
 
diff --git a/tensorflow/contrib/tpu/profiler/pip_package/cloud_tpu_profiler/main.py b/tensorflow/contrib/tpu/profiler/pip_package/cloud_tpu_profiler/main.py
index a730d6142d890cc41f72176cf617ac0b0434192c..0b78cf8695091daf797bcb80586397e7ab1c6284 100644
--- a/tensorflow/contrib/tpu/profiler/pip_package/cloud_tpu_profiler/main.py
+++ b/tensorflow/contrib/tpu/profiler/pip_package/cloud_tpu_profiler/main.py
@@ -76,7 +76,7 @@ def main(unused_argv=None):
   else:
     tpu_cluster_resolver = (
         tf.contrib.cluster_resolver.TPUClusterResolver(
-            tpu_names=[FLAGS.tpu_name],
+            [FLAGS.tpu_name],
             zone=FLAGS.tpu_zone,
             project=FLAGS.gcp_project))
     service_addr = tpu_cluster_resolver.get_master()
diff --git a/tensorflow/contrib/tpu/profiler/tf_op_stats.proto b/tensorflow/contrib/tpu/profiler/tf_op_stats.proto
index 20ed7419fde36a0d112900093ed2f44c3af63d75..63955d18068fc9d3b3ca1a657a3fd526edf10e6f 100644
--- a/tensorflow/contrib/tpu/profiler/tf_op_stats.proto
+++ b/tensorflow/contrib/tpu/profiler/tf_op_stats.proto
@@ -66,6 +66,10 @@ message OpMetricsDbResult {
   // The total of the difference between the start times of two
   // consecutive infeed-enqueues (per host) in picoseconds.
   optional uint64 total_host_infeed_enq_start_timestamp_ps_diff = 3;
+  // The total device time in microseconds.
+  optional double total_device_time_in_us = 4;
+  // The total host time in microseconds.
+  optional double total_host_time_in_us = 5;
 }
 
 // Result proto for StepInfo.
@@ -79,6 +83,10 @@ message StepInfoResult {
   optional uint64 infeed_duration_ps = 3;
   // The start time of this step in picoseconds.
   optional uint64 begin_ps = 4;
+  // The waiting time within this step in picoseconds.
+  optional uint64 wait_duration_ps = 5;
+  // The time spent on cross-replica-sum in picoseconds.
+  optional uint64 crs_duration_ps = 6;
 }
 
 // Result proto for a sequence of steps.
@@ -199,10 +207,22 @@ message HostOpsPerTpuStep {
   map<int32, int32> step_diffs = 5;
 }
 
+message HostOpsDetailsPerCore {
+  // Map from core id to HostOpsPerTpuStep.
+  map<int32, HostOpsPerTpuStep> core_map = 1;
+}
+
+message HostOpsDetailsPerHost {
+  // Map from hostname to a map from core id to HostOpsPerTpuStep.
+  map<string, HostOpsDetailsPerCore> host_map = 1;
+}
+
 // Result proto for the host ops for all TPU steps.
 message HostOpsResult {
-  // A sequence of HostOpsPerTpuStep (one for each TPU step)
-  repeated HostOpsPerTpuStep host_op_sequence = 1;
+  reserved 1;  // (was repeated HostOpsPerTpuStep host_op_sequence)
+  // A sequence of records with one for each TPU step. Each record
+  // is a map from hostname to a map from core id to HostOpsPerTpuStep.
+  repeated HostOpsDetailsPerHost hostops_details = 2;
 }
 
 // Result proto for TfStatsHelper.
diff --git a/tensorflow/contrib/tpu/profiler/tpu_profiler.proto b/tensorflow/contrib/tpu/profiler/tpu_profiler.proto
index cddc3cd1b41d6e00409222170e69c429fe6f91f8..8505c4bc69b9444519d0bc5d23bab093b8a57163 100644
--- a/tensorflow/contrib/tpu/profiler/tpu_profiler.proto
+++ b/tensorflow/contrib/tpu/profiler/tpu_profiler.proto
@@ -21,6 +21,17 @@ message ProfileOptions {
   // next-field: 2
 }
 
+message ToolRequestOptions {
+  // Required formats for the tool, it should be one of "json", "proto", "raw"
+  // etc. If not specified (backward compatible), use default format, i.e. most
+  // tools use json format.
+  string output_formats = 2;
+
+  // Whether save the result directly to repository or pass it back to caller.
+  // Default to false for backward compatibilities.
+  bool save_to_repo = 3;
+}
+
 message ProfileRequest {
   // In future, the caller will be able to customize when profiling starts and
   // stops. For now, it collects `duration_ms` milliseconds worth of data.
@@ -30,9 +41,12 @@ message ProfileRequest {
   // events.
   uint64 max_events = 2;
 
-  // required profiling tools name such as "input_pipeline_analyzer" etc
+  // Required profiling tools name such as "input_pipeline_analyzer" etc
   repeated string tools = 3;
 
+  // Specifies the requirement for each tools.
+  map<string, ToolRequestOptions> tool_options = 8;
+
   // Optional profiling options that control how a TF session will be profiled.
   ProfileOptions opts = 4;
 
@@ -43,10 +57,14 @@ message ProfileRequest {
   // The user provided profile session identifier.
   string session_id = 6;
 
+  // The hostname of system where the profile should happen.
+  // We use it as identifier in part of our output filename.
+  string host_name = 7;
+
   // In future, the caller will indicate which TF session is being profiled, and
   // only data relating to that program will be returned. For now, we assume
   // all activity during the profiling period is relevant.
-  // next-field: 7
+  // next-field: 9
 }
 
 message ProfileToolData {
diff --git a/tensorflow/contrib/tpu/profiler/tpu_profiler_analysis_pb2_grpc.py b/tensorflow/contrib/tpu/profiler/tpu_profiler_analysis_pb2_grpc.py
index c28fef22a9d3736748b1b56135302d5ec7845720..8f5148828878b1c03bf35d1d11dc11942128b20c 100644
--- a/tensorflow/contrib/tpu/profiler/tpu_profiler_analysis_pb2_grpc.py
+++ b/tensorflow/contrib/tpu/profiler/tpu_profiler_analysis_pb2_grpc.py
@@ -22,7 +22,7 @@ from __future__ import print_function
 
 import grpc
 
-from third_party.tensorflow.contrib.tpu.profiler import tpu_profiler_analysis_pb2 as third__party_dot_tensorflow_dot_contrib_dot_tpu_dot_profiler_dot_tpu__profiler__analysis__pb2
+from tensorflow.contrib.tpu.profiler import tpu_profiler_analysis_pb2 as third__party_dot_tensorflow_dot_contrib_dot_tpu_dot_profiler_dot_tpu__profiler__analysis__pb2
 
 
 class TPUProfileAnalysisStub(object):
diff --git a/tensorflow/contrib/tpu/proto/BUILD b/tensorflow/contrib/tpu/proto/BUILD
index e1660985676e8c2efe3b01e32b48b211391885b7..fcfbbe1a213b6959b82c20beff02df48517b5e98 100644
--- a/tensorflow/contrib/tpu/proto/BUILD
+++ b/tensorflow/contrib/tpu/proto/BUILD
@@ -4,17 +4,6 @@ exports_files(["LICENSE"])
 
 load("//tensorflow/core:platform/default/build_config.bzl", "tf_proto_library")
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 tf_proto_library(
     name = "tpu_embedding_config_proto",
     srcs = [
diff --git a/tensorflow/contrib/tpu/python/profiler/__init__.py b/tensorflow/contrib/tpu/python/profiler/__init__.py
index bde13f0527a1d8c5f71dd9684b93144ae07d60e4..15ce6aceec299adacd7025f0021cf8b6f6ef765b 100644
--- a/tensorflow/contrib/tpu/python/profiler/__init__.py
+++ b/tensorflow/contrib/tpu/python/profiler/__init__.py
@@ -20,6 +20,7 @@ from __future__ import division
 from __future__ import print_function
 
 # pylint: disable=wildcard-import,unused-import
+from tensorflow.contrib.tpu.profiler.tpu_profiler_analysis_pb2 import *
 from tensorflow.contrib.tpu.profiler.trace_events_pb2 import *
 # pylint: enable=wildcard-import,unused-import
 
diff --git a/tensorflow/contrib/tpu/python/tpu/bfloat16.py b/tensorflow/contrib/tpu/python/tpu/bfloat16.py
new file mode 100644
index 0000000000000000000000000000000000000000..5e49af6408e8aaf2d6bd56335a60724853ac14c2
--- /dev/null
+++ b/tensorflow/contrib/tpu/python/tpu/bfloat16.py
@@ -0,0 +1,77 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+"""Helper context for running models with bfloat16."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.util import tf_contextlib
+
+
+def _get_custom_getter():
+  """Returns a custom getter that this class's methods must be called under.
+
+  All methods of this class must be called under a variable scope that was
+  passed this custom getter. Example:
+
+  ```python
+  network = ConvNetBuilder(...)
+  with tf.variable_scope('cg', custom_getter=network.get_custom_getter()):
+    network.conv(...)
+    # Call more methods of network here
+  ```
+
+  Currently, this custom getter only does anything if self.use_tf_layers is
+  True. In that case, it causes variables to be stored as dtype
+  self.variable_type, then casted to the requested dtype, instead of directly
+  storing the variable as the requested dtype.
+  """
+
+  def inner_custom_getter(getter, *args, **kwargs):
+    """Custom getter that forces variables to have type self.variable_type."""
+    cast_to_bfloat16 = False
+    requested_dtype = kwargs['dtype']
+    if requested_dtype == dtypes.bfloat16:
+      # Only change the variable dtype if doing so does not decrease variable
+      # precision.
+      kwargs['dtype'] = dtypes.float32
+      cast_to_bfloat16 = True
+    var = getter(*args, **kwargs)
+    # This if statement is needed to guard the cast, because batch norm
+    # assigns directly to the return value of this custom getter. The cast
+    # makes the return value not a variable so it cannot be assigned. Batch
+    # norm variables are always in fp32 so this if statement is never
+    # triggered for them.
+    if cast_to_bfloat16:
+      var = math_ops.cast(var, dtypes.bfloat16)
+    return var
+
+  return inner_custom_getter
+
+
+@tf_contextlib.contextmanager
+def bfloat16_scope():
+  """Scope class for bfloat16 variables so that the model uses custom getter.
+
+  This enables variables to be read as bfloat16 type when using get_variable.
+  """
+  with variable_scope.variable_scope(
+      'bfloat16', custom_getter=_get_custom_getter()) as varscope:
+    yield varscope
diff --git a/tensorflow/contrib/tpu/python/tpu/bfloat16_test.py b/tensorflow/contrib/tpu/python/tpu/bfloat16_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..48a01c7308fbf14d2fb3bd29382d98a6ade1d810
--- /dev/null
+++ b/tensorflow/contrib/tpu/python/tpu/bfloat16_test.py
@@ -0,0 +1,50 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+"""Tests for bfloat16 helper."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.tpu.python.tpu import bfloat16
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import variable_scope
+
+from tensorflow.python.platform import test
+
+
+class BFloat16ScopeTest(test.TestCase):
+
+  def testScopeName(self):
+    """Test if name for the variable scope is propogated correctly.
+    """
+    with bfloat16.bfloat16_scope() as bf:
+      self.assertEqual(bf.name, "bfloat16")
+
+  def testRequestedDType(self):
+    """Test if requested dtype is honored in the getter.
+    """
+    with bfloat16.bfloat16_scope() as scope:
+      v1 = variable_scope.get_variable("v1", [])
+      self.assertEqual(v1.dtype.base_dtype, dtypes.float32)
+      v2 = variable_scope.get_variable("v2", [], dtype=dtypes.bfloat16)
+      self.assertEqual(v2.dtype.base_dtype, dtypes.bfloat16)
+      self.assertEqual([dtypes.float32, dtypes.float32],
+                       [v.dtype.base_dtype for v in scope.global_variables()])
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu.py b/tensorflow/contrib/tpu/python/tpu/tpu.py
index 3f2db548ace9e10df7844d8fb461670d27234670..a1690dadffe5770af9416a7c5ad3a7e336f6bc18 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu.py
@@ -25,6 +25,8 @@ from tensorflow.contrib.tpu.python.ops import tpu_ops
 from tensorflow.contrib.tpu.python.tpu import tpu_function
 
 from tensorflow.core.framework import attr_value_pb2
+from tensorflow.python.framework import device as pydev
+from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
@@ -56,6 +58,7 @@ _NOT_IMPLEMENTED_OPS = set([
 _MAX_WARNING_LINES = 5
 
 _TPU_REPLICATE_ATTR = "_tpu_replicate"
+_OUTSIDE_COMPILATION_ATTR = "_xla_outside_compilation"
 
 
 def _tpu_system_device_name(job):
@@ -121,8 +124,16 @@ class TPUReplicateContext(control_flow_ops.XLAControlFlowContext):
   outside the replicated computation.
   """
 
-  def __init__(self, name):
+  def __init__(self, name, num_replicas):
     super(TPUReplicateContext, self).__init__()
+    self._num_replicas = num_replicas
+    self._outer_device_function_stack = None
+    self._oc_dev_fn_stack = None
+    self._outside_compilation_cluster = None
+    self._outside_compilation_counter = 0
+    self._in_gradient_colocation = None
+    self._gradient_colocation_stack = []
+    self._host_compute_core = []
     self._name = name
     self._unsupported_ops = []
 
@@ -136,6 +147,143 @@ class TPUReplicateContext(control_flow_ops.XLAControlFlowContext):
         logging.warning("... and %d more" %
                         (len(self._unsupported_ops) - _MAX_WARNING_LINES))
 
+  def EnterGradientColocation(self, op, gradient_uid):
+    if op is not None:
+      self._gradient_colocation_stack.append(op)
+      if not self._outside_compilation_cluster:
+        try:
+          outside_attr = op.get_attr(_OUTSIDE_COMPILATION_ATTR)
+          if self._in_gradient_colocation:
+            raise NotImplementedError(
+                "Cannot nest gradient colocation operations outside compilation"
+            )
+          if gradient_uid == "__unsupported__":
+            raise NotImplementedError(
+                "No gradient_uid calling gradient within outside_compilation")
+          # When we take the gradient of an op X in an
+          # outside_compilation cluster C in a forward computation we
+          # would like to put the ops corresponding to the gradient of
+          # X into a new outside_compilation cluster C'. However, if
+          # we take the gradient of X twice, the second one should get
+          # yet another new outside_compilation cluster C''.
+          #
+          # The mechanism we adopt is to use a 'root_cluster' which is
+          # the cluster that X was in before we took gradients, and a
+          # 'gradient_uid' which is different for every invocation of
+          # gradients, and put the gradient of X in cluster
+          # 'root_cluster.gradient_uid'.
+          #
+          # When the gradient code adds multiple Ops, it asks them to
+          # be colocated either with the original Op X, or with one of
+          # the preceding Ops that was added to the gradient. In other
+          # words, we want to detect the case where we are colocating
+          # with an Op that is in cluster root_cluster.gradient_uid
+          # and put the new Op in that same cluster if the
+          # gradient_uid is the same (the case that we are in the same
+          # invocation of gradients, and just adding new Ops to the
+          # cluster); and in a different cluster if the gradient_uids
+          # are different (the case that we are in a new invocation of
+          # gradients, taking the gradient of a previously-computed
+          # gradient).
+          self._in_gradient_colocation = op
+          parts = outside_attr.split(".")
+          if len(parts) > 1:
+            uid = parts[-1]
+            if uid == gradient_uid:
+              # Keep using the same cluster
+              cluster = outside_attr
+            else:
+              # We're taking the gradient of a gradient so make a new
+              # cluster attr, adding a new '.uid' on the end to
+              # preserve the invariant that the gradient_uid is the
+              # suffix after the last '.' in the attr.
+              cluster = outside_attr + "." + gradient_uid
+          else:
+            # We're taking the gradient of an Op in the forward pass, so
+            # make a new cluster combining the Op's cluster and the
+            # gradient id.
+            cluster = outside_attr + "." + gradient_uid
+          self._EnterOutsideCompilationScope(cluster=cluster)
+        except ValueError:
+          # The attr was not present: do nothing.
+          pass
+
+  def ExitGradientColocation(self, op, gradient_uid):
+    if op is not None:
+      if not self._gradient_colocation_stack:
+        raise errors.InternalError(
+            op.node_def, op,
+            "Badly nested gradient colocation: empty stack when popping Op " +
+            op.name)
+      last_op = self._gradient_colocation_stack.pop()
+      if op is last_op:
+        if op is self._in_gradient_colocation:
+          self._in_gradient_colocation = None
+          self._ExitOutsideCompilationScope()
+      else:
+        raise errors.InternalError(
+            op.node_def, op, "Badly nested gradient colocation, expected " +
+            last_op + ", got " + op.name)
+
+  def _EnterOutsideCompilationScope(self, cluster=None):
+
+    class FakeOp(object):
+      """A helper class to determine the current device.
+
+      Supports only the device set/get methods needed to run the
+      graph's _apply_device_function method.
+      """
+
+      def __init__(self):
+        self._device = ""
+
+      @property
+      def device(self):
+        return self._device
+
+      def _set_device(self, device):
+        self._device = device.to_string()
+
+    if self._outside_compilation_cluster:
+      raise NotImplementedError("Cannot nest outside_compilation clusters")
+    if cluster:
+      self._outside_compilation_cluster = cluster
+    else:
+      self._outside_compilation_cluster = str(self._outside_compilation_counter)
+      self._outside_compilation_counter += 1
+    graph = ops.get_default_graph()
+    fake_op = FakeOp()
+    graph._apply_device_functions(fake_op)  # pylint: disable=protected-access
+    device = pydev.DeviceSpec.from_string(fake_op.device)
+    if (device.device_type == "TPU_REPLICATED_CORE" and
+        device.device_index is not None):
+      self._host_compute_core.append(self._outside_compilation_cluster + ":" +
+                                     str(device.device_index))
+    self._oc_dev_fn_stack = graph._device_function_stack  # pylint: disable=protected-access
+    graph._device_function_stack = self._outer_device_function_stack  # pylint: disable=protected-access
+
+  def _ExitOutsideCompilationScope(self):
+    if not self._outside_compilation_cluster:
+      raise NotImplementedError(
+          "Attempted to exit outside_compilation scope when not in scope")
+    self._outside_compilation_cluster = None
+    graph = ops.get_default_graph()
+    graph._device_function_stack = self._oc_dev_fn_stack  # pylint: disable=protected-access
+
+  def Enter(self):
+    if not self._outer_device_function_stack:
+      # Capture the device function stack at the time of first entry
+      # since that is the stack that will be used outside_compilation.
+      graph = ops.get_default_graph()
+      self._outer_device_function_stack = list(graph._device_function_stack)  # pylint: disable=protected-access
+    super(TPUReplicateContext, self).Enter()
+
+  def Exit(self):
+    super(TPUReplicateContext, self).Exit()
+
+  def HostComputeCore(self):
+    return self._host_compute_core
+
   def AddOp(self, op):
     self._AddOpInternal(op)
 
@@ -157,9 +305,16 @@ class TPUReplicateContext(control_flow_ops.XLAControlFlowContext):
       raise ValueError("TPU computations cannot be nested")
     op._set_attr(_TPU_REPLICATE_ATTR,
                  attr_value_pb2.AttrValue(s=compat.as_bytes(self._name)))
-    # pylint: enable=protected-access
-    op.graph.prevent_feeding(op)
-    op.graph.prevent_fetching(op)
+    if self._outside_compilation_cluster:
+      op._set_attr(
+          _OUTSIDE_COMPILATION_ATTR,
+          attr_value_pb2.AttrValue(
+              s=compat.as_bytes(self._outside_compilation_cluster)))
+    if self._num_replicas > 1 or not self._outside_compilation_cluster:
+      # Prevent feeding or fetching anything that is being compiled,
+      # and any replicated outside_compilation Op.
+      op.graph.prevent_feeding(op)
+      op.graph.prevent_fetching(op)
 
   def AddValue(self, val):
     result = val
@@ -181,6 +336,45 @@ class TPUReplicateContext(control_flow_ops.XLAControlFlowContext):
     return None
 
 
+def outside_compilation(computation, args=None):
+  """Builds part of a computation outside any current TPU replicate scope.
+
+  Args:
+    computation: A Python function that builds the computation to
+      place on the host.
+    args: Inputs to pass to computation.
+  Returns:
+    The Tensors returned by computation.
+  """
+  graph = ops.get_default_graph()
+
+  # If we are in a TPUReplicateContext, signal that we are now
+  # outside_compilation
+  initial_context = graph._get_control_flow_context()  # pylint: disable=protected-access
+  context = initial_context
+  while context:
+    if isinstance(context, TPUReplicateContext):
+      context._EnterOutsideCompilationScope()  # pylint: disable=protected-access
+    context = context.outer_context
+
+  retval = computation(*args)
+
+  # If we are in a TPUReplicateContext, signal that we are no longer
+  # outside_compilation
+  final_context = graph._get_control_flow_context()  # pylint: disable=protected-access
+  if initial_context is not final_context:
+    raise NotImplementedError(
+        "Control-flow context cannot be different at start and end of an "
+        "outside_compilation scope")
+  context = initial_context
+  while context:
+    if isinstance(context, TPUReplicateContext):
+      context._ExitOutsideCompilationScope()  # pylint: disable=protected-access
+    context = context.outer_context
+
+  return retval
+
+
 def replicate(computation,
               inputs=None,
               infeed_queue=None,
@@ -280,7 +474,8 @@ def replicate(computation,
     computation_inputs.append(
         tpu_ops.tpu_replicated_input(replicas, name="input{}".format(i)))
 
-  context = TPUReplicateContext(name=graph.unique_name("cluster"))
+  context = TPUReplicateContext(
+      name=graph.unique_name("cluster"), num_replicas=num_replicas)
   try:
     context.Enter()
 
@@ -361,6 +556,12 @@ def replicate(computation,
   finally:
     context.report_unsupported_operations()
     context.Exit()
+    host_compute_core = context.HostComputeCore()
+
+  if host_compute_core:
+    attr_value = attr_value_pb2.AttrValue()
+    attr_value.list.s.extend([compat.as_bytes(x) for x in host_compute_core])
+    metadata._set_attr("host_compute_core", attr_value)  # pylint: disable=protected-access
 
   # Fan-out: Builds a TPUReplicatedOutput node for each output.
   outputs = [tpu_ops.tpu_replicated_output(output_tensors[i], num_replicas,
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_config.py b/tensorflow/contrib/tpu/python/tpu/tpu_config.py
index 38b5ea23103730630ae8e1cdd7b9180a501013c5..cc1a7fd801506e3f0b758c4848205f1c375403d2 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_config.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_config.py
@@ -35,10 +35,16 @@ _TF_CONFIG_ENV = run_config_lib._TF_CONFIG_ENV
 _SERVICE_KEY = run_config_lib._SERVICE_KEY
 _TPU_WORKER_JOB_NAME = 'tpu_worker_job_name'
 _NUM_CORES_PER_HOST = 8
-
 # pylint: enable=protected-access
 
 
+class InputPipelineConfig(object):
+  r"""Please see the definition of these values in TPUConfig."""
+  PER_SHARD_V1 = 1
+  PER_HOST_V1 = 2
+  PER_HOST_V2 = 3
+
+
 # TODO(b/72511246) Provide a simplified api to configure model parallelism.
 class TPUConfig(
     collections.namedtuple('TPUConfig', [
@@ -68,13 +74,16 @@ class TPUConfig(
       partitioned across 4 cores which span two cores in both x and y
       coordinates.  Please refer to @{tf.contrib.tpu.Topology} for the
       geometry of a TPU mesh.
-    per_host_input_for_training: If `True`, `input_fn` is invoked Per-Host
-      rather than Per-Core. With Per-Host input pipeline deployment, `input_fn`
-      is invoked once on each host. With Per-Core input pipeline deployment, it
-      is invoked once for each core. To be precise, with a global batch size
-      `train_batch_size` in `TPUEstimator` constructor, the batch size for each
-      shard is `train_batch_size` // #hosts. With Per-Core input pipeline
-      deployment, the shard batch size is `train_batch_size` // #cores.
+    per_host_input_for_training: If `True`, `PER_HOST_V1`, or `PER_HOST_V2`,
+      `input_fn` is invoked per-host rather than per-core. With per-host input
+      pipeline configuration, `input_fn` is invoked once on each host. With the
+      per-core input pipeline configuration, it is invoked once for each core.
+      With a global batch size `train_batch_size` in `TPUEstimator` constructor,
+      the batch size for each shard is `train_batch_size` // #hosts in the
+      `True` or `PER_HOST_V1` mode. In `PER_HOST_V2` mode, it is
+      `train_batch_size` // #cores. With the per-core input pipeline
+      configuration, the shard batch size is also `train_batch_size` // #cores.
+      Note: per_host_input_for_training==PER_SHARD_V1 only supports mode.TRAIN.
     tpu_job_name: The name of the TPU job. Typically, this name is auto-inferred
       within TPUEstimator, however when using ClusterSpec propagation in more
       esoteric cluster configurations, you may need to specify the job name as a
@@ -117,6 +126,13 @@ class TPUConfig(
         raise ValueError('computation_shape elements can only be 1 or 2; got '
                          'computation_shape={}'.format(computation_shape))
 
+    # per_host_input_for_training may be True, False, or integer in [1..3].
+    # Map legacy values (True, False) to numeric values.
+    if per_host_input_for_training is False:
+      per_host_input_for_training = InputPipelineConfig.PER_SHARD_V1
+    elif per_host_input_for_training is True:
+      per_host_input_for_training = InputPipelineConfig.PER_HOST_V1
+
     # Check initial_infeed_sleep_secs.
     if initial_infeed_sleep_secs:
       util_lib.check_positive_integer(initial_infeed_sleep_secs,
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_context.py b/tensorflow/contrib/tpu/python/tpu/tpu_context.py
index 3bac2db77e95520a6c9c4c17658267a9a6588d94..fbc1173e49fd6e8912f6bfae8a88198eda4f6d5b 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_context.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_context.py
@@ -24,6 +24,7 @@ import copy
 import numpy as np
 
 from tensorflow.contrib.tpu.python.tpu import device_assignment  as tpu_device_assignment
+from tensorflow.contrib.tpu.python.tpu import tpu_config
 from tensorflow.contrib.tpu.python.tpu import tpu_system_metadata as tpu_system_metadata_lib
 from tensorflow.python.estimator import model_fn as model_fn_lib
 from tensorflow.python.platform import tf_logging as logging
@@ -205,7 +206,13 @@ class _TPUContext(object):
     """Return true if input_fn is invoked per-core (other than per-host)."""
     mode = self._assert_mode()
     return (mode == model_fn_lib.ModeKeys.TRAIN and
-            not self._config.tpu_config.per_host_input_for_training)
+            (self._config.tpu_config.per_host_input_for_training is
+             tpu_config.InputPipelineConfig.PER_SHARD_V1))
+
+  def is_input_per_host_with_iterators(self):
+    """Return true if input_fn should be run in the per-host v2 config."""
+    return (self._config.tpu_config.per_host_input_for_training is
+            tpu_config.InputPipelineConfig.PER_HOST_V2)
 
   def is_running_on_cpu(self, is_export_mode=False):
     """Determines whether the input_fn and model_fn should be invoked on CPU.
@@ -271,7 +278,8 @@ class _TPUContext(object):
       return global_batch_size
 
     # On TPU
-    if self.is_input_sharded_per_core():
+    if self.is_input_sharded_per_core() or (
+        self.is_input_per_host_with_iterators()):
       # We prohibit per core input sharding for the model parallelism case,
       # therefore it is safe to use num_cores here.
       return global_batch_size // self.num_cores
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
index 152f8c8c69ef7344c1346885cbdf8059e0849db3..7fab19afeecc258c5185f219da2a11f3ffdad056 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
@@ -30,7 +30,6 @@ import six
 from six.moves import queue as Queue  # pylint: disable=redefined-builtin
 from six.moves import xrange  # pylint: disable=redefined-builtin
 
-from tensorflow.contrib.summary import summary_ops as contrib_summary
 from tensorflow.contrib.tpu.python.ops import tpu_ops
 from tensorflow.contrib.tpu.python.tpu import tpu
 from tensorflow.contrib.tpu.python.tpu import tpu_config
@@ -38,6 +37,8 @@ from tensorflow.contrib.tpu.python.tpu import tpu_context
 from tensorflow.contrib.tpu.python.tpu import tpu_feed
 from tensorflow.contrib.tpu.python.tpu import training_loop
 from tensorflow.contrib.tpu.python.tpu import util as util_lib
+from tensorflow.contrib.training.python.training import hparam
+from tensorflow.core.framework import variable_pb2
 from tensorflow.core.framework.summary_pb2 import Summary
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.data.ops import dataset_ops
@@ -53,7 +54,9 @@ from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import summary_ops_v2 as contrib_summary
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import tf_logging as logging
@@ -73,6 +76,8 @@ _ITERATIONS_PER_LOOP_VAR = 'iterations_per_loop'
 _BATCH_SIZE_KEY = 'batch_size'
 _CROSS_REPLICA_SUM_OP = 'CrossReplicaSum'
 _ONE_GIGABYTE = 1024 * 1024 * 1024
+_TPU_ENQUEUE_OPS = '_tpu_enqueue_ops'
+_TPU_TRAIN_OP = '_tpu_train_op'
 
 _RESERVED_PARAMS_KEYS = [_BATCH_SIZE_KEY]
 
@@ -85,6 +90,13 @@ _RESERVED_PARAMS_KEYS = [_BATCH_SIZE_KEY]
 _WRAP_INPUT_FN_INTO_WHILE_LOOP = False
 
 
+ops.register_proto_function(
+    '{}_{}'.format(_TPU_ESTIMATOR, _ITERATIONS_PER_LOOP_VAR),
+    proto_type=variable_pb2.VariableDef,
+    to_proto=resource_variable_ops._to_proto_fn,  # pylint: disable=protected-access
+    from_proto=resource_variable_ops._from_proto_fn)  # pylint: disable=protected-access
+
+
 def _create_global_step(graph):
   graph = graph or ops.get_default_graph()
   if training.get_global_step(graph) is not None:
@@ -740,6 +752,61 @@ def generate_per_host_enqueue_ops_fn_for_host(
   return enqueue_ops_fn, captured_infeed_queue, hooks, is_dataset
 
 
+def generate_per_host_v2_enqueue_ops_fn_for_host(
+    ctx, input_fn, inputs_structure_recorder, device, host_id):
+  """Generates infeed enqueue ops for per-host input_fn on a single host."""
+  del host_id  # unused
+  captured_infeed_queue = _CapturedObject()
+  hooks = []
+
+  with ops.device(device):
+    inputs = _Inputs.from_input_fn(input_fn())
+
+    is_dataset = inputs.is_dataset
+    if not is_dataset:
+      raise TypeError('`input_fn` must return a `Dataset` for the PER_HOST_V2 '
+                      'input pipeline configuration.')
+    if ctx.mode == model_fn_lib.ModeKeys.PREDICT:
+      # TODO(b/XXX): Add predict support for PER_HOST_V2
+      raise TypeError('Most PREDICT not yet supported in PER_HOST_V2 mode.')
+
+    hooks.append(inputs.dataset_initializer_hook())
+
+  def enqueue_ops_fn():
+    """Generates the per_host enqueue ops."""
+    control_deps = []
+    per_host_sharded_inputs = []
+    num_replicas_per_host = ctx.num_of_replicas_per_host
+    with ops.device(device):
+      if not inputs.is_dataset:
+        raise TypeError('`input_fn` must return a `Dataset` for this mode.')
+      for _ in range(num_replicas_per_host):
+        # Use control dependencies to ensure a deterministic ordering.
+        with ops.control_dependencies(control_deps):
+          features, labels = inputs.features_and_labels()  # Calls get_next()
+
+        inputs_structure_recorder.validate_and_record_structure(
+            features, labels)
+        flattened_inputs = (
+            inputs_structure_recorder.flatten_features_and_labels(
+                features, labels))
+
+        control_deps.extend(flattened_inputs)
+        per_host_sharded_inputs.append(flattened_inputs)
+
+    infeed_queue = tpu_feed.InfeedQueue(
+        number_of_tuple_elements=len(per_host_sharded_inputs[0]))
+    captured_infeed_queue.capture(infeed_queue)
+    infeed_queue.set_configuration_from_sharded_input_tensors(
+        per_host_sharded_inputs)
+
+    per_host_enqueue_ops = infeed_queue.generate_enqueue_ops(
+        per_host_sharded_inputs, tpu_ordinal_function=ctx.tpu_ordinal_function)
+    return per_host_enqueue_ops
+
+  return enqueue_ops_fn, captured_infeed_queue, hooks, is_dataset
+
+
 class _InputPipeline(object):
   """`_InputPipeline` handles invoking `input_fn` and piping to infeed queue.
 
@@ -975,10 +1042,17 @@ class _InputPipeline(object):
         host_device = tpu_host_placement_fn(host_id=host_id)
         with ops.device(host_device):
           with ops.name_scope('input_pipeline_task%d' % (host_id)):
-            enqueue_ops_fn, captured_infeed_queue, hooks, is_dataset = (
-                generate_per_host_enqueue_ops_fn_for_host(
-                    self._ctx, self._input_fn, self._inputs_structure_recorder,
-                    self._batch_axis, host_device, host_id))
+            if self._ctx.is_input_per_host_with_iterators():
+              enqueue_ops_fn, captured_infeed_queue, hooks, is_dataset = (
+                  generate_per_host_v2_enqueue_ops_fn_for_host(
+                      self._ctx, self._input_fn,
+                      self._inputs_structure_recorder, host_device, host_id))
+            else:
+              enqueue_ops_fn, captured_infeed_queue, hooks, is_dataset = (
+                  generate_per_host_enqueue_ops_fn_for_host(
+                      self._ctx, self._input_fn,
+                      self._inputs_structure_recorder, self._batch_axis,
+                      host_device, host_id))
             all_hooks.extend(hooks)
 
             # NOTE(xiejw): We dispatch here based on the return type of the
@@ -1235,7 +1309,10 @@ class _ModelFnWrapper(object):
       batch_size_for_model_fn = self._ctx.batch_size_for_model_fn
 
     if batch_size_for_model_fn is not None:
-      params[_BATCH_SIZE_KEY] = batch_size_for_model_fn
+      if isinstance(params, hparam.HParams):
+        params.add_hparam(_BATCH_SIZE_KEY, batch_size_for_model_fn)
+      else:
+        params[_BATCH_SIZE_KEY] = batch_size_for_model_fn
 
     estimator_spec = self._model_fn(features=features, **kwargs)
     if (self._ctx.is_running_on_cpu(is_export_mode) and
@@ -1724,7 +1801,7 @@ class TPUEstimator(estimator_lib.Estimator):
         labels to match up with the corresponding images. If None is supplied,
         and per_host_input_for_training is True, batches will be sharded based
         on the major dimension. If tpu_config.per_host_input_for_training is
-        False, batch_axis is ignored.
+        False or `PER_HOST_V2`, batch_axis is ignored.
 
     Raises:
       ValueError: `params` has reserved keys already.
@@ -1744,7 +1821,8 @@ class TPUEstimator(estimator_lib.Estimator):
         raise ValueError('`train_batch_size` cannot be `None`')
       util_lib.check_positive_integer(train_batch_size, 'train_batch_size')
 
-      if (not config.tpu_config.per_host_input_for_training and
+      if (config.tpu_config.per_host_input_for_training is
+          tpu_config.InputPipelineConfig.PER_SHARD_V1 and
           config.tpu_config.computation_shape):
         raise ValueError(
             'Model parallelism only supports per host input for training. '
@@ -1873,7 +1951,10 @@ class TPUEstimator(estimator_lib.Estimator):
       # input_fn for use_tpu=True/False.
       batch_size_for_input_fn = ctx.batch_size_for_input_fn
       if batch_size_for_input_fn is not None:
-        kwargs['params'][_BATCH_SIZE_KEY] = batch_size_for_input_fn
+        if isinstance(kwargs['params'], hparam.HParams):
+          kwargs['params'].add_hparam(_BATCH_SIZE_KEY, batch_size_for_input_fn)
+        else:
+          kwargs['params'][_BATCH_SIZE_KEY] = batch_size_for_input_fn
 
       # For export_savedmodel, input_fn is never passed to Estimator. So,
       # `is_export_mode` must be False.
@@ -1943,6 +2024,13 @@ class TPUEstimator(estimator_lib.Estimator):
         enqueue_ops, dequeue_fn, input_hooks, run_infeed_loop_on_coordinator = (
             input_holders.generate_infeed_enqueue_ops_and_dequeue_fn())
 
+        graph = ops.get_default_graph()
+        for enqueue_op in enqueue_ops:
+          if isinstance(enqueue_op, list):
+            graph.get_collection_ref(_TPU_ENQUEUE_OPS).extend(enqueue_op)
+          else:
+            graph.add_to_collection(_TPU_ENQUEUE_OPS, enqueue_op)
+
         if mode == model_fn_lib.ModeKeys.TRAIN:
           loss, host_call, scaffold = (
               _train_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn))
@@ -1956,7 +2044,8 @@ class TPUEstimator(estimator_lib.Estimator):
                   host_ops,
                   run_infeed_loop_on_coordinator=(
                       run_infeed_loop_on_coordinator)),
-              ExamplesPerSecondHook(ctx.global_batch_size),
+              ExamplesPerSecondHook(ctx.global_batch_size,
+                                    output_dir=self.model_dir),
               InstallSignalHandlerHook(),
               training.LoggingTensorHook(
                   {
@@ -1972,11 +2061,14 @@ class TPUEstimator(estimator_lib.Estimator):
           # Validate the TPU training graph to catch basic errors
           _validate_tpu_training_graph()
 
+          train_op = control_flow_ops.group(*update_ops)
+          graph.add_to_collection(_TPU_TRAIN_OP, train_op)
+
           return model_fn_lib.EstimatorSpec(
               mode,
               loss=loss,
               training_hooks=hooks,
-              train_op=control_flow_ops.group(*update_ops),
+              train_op=train_op,
               scaffold=scaffold)
 
         if mode == model_fn_lib.ModeKeys.EVAL:
@@ -2362,6 +2454,10 @@ class _Inputs(object):
   def features_and_labels(self):
     """Gets `features` and `labels`."""
     if self.is_dataset:
+      if self._iterator is None:
+        raise RuntimeError('Internal error: Must call dataset_initializer_hook '
+                           'before calling features_and_labels(). Please file '
+                           'a bug!')
       return _Inputs._parse_inputs(self._iterator.get_next())
 
     return (self._features, self._labels)
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_system_metadata.py b/tensorflow/contrib/tpu/python/tpu/tpu_system_metadata.py
index 493d1848c072caa5254fc87c67badc2e99ec16ee..3ae350c7bb345cabdb74783c3233354d67394d3a 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_system_metadata.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_system_metadata.py
@@ -72,9 +72,9 @@ def _query_tpu_system_metadata(master_address, run_config,
               tpu_core_count += 1
           break
     except errors.DeadlineExceededError:
-      msg = ('Fail to connect Tensorflow master. It could be the TPU worker is '
-             'not ready (still under scheduling) or Tensorflow '
-             'master address is correct: got (%s).' %
+      msg = ('Failed to connect to the Tensorflow master. The TPU worker may '
+             'not be ready (still scheduling) or the Tensorflow master address '
+             'is incorrect: got (%s).' %
              (master_address))
 
       # TODO(xiejw): For local or grpc master we might not need retry logic
@@ -120,7 +120,8 @@ def _query_tpu_system_metadata(master_address, run_config,
     logging.info('*** Num TPU Workers: %d', metadata.num_hosts)
     logging.info('*** Num TPU Cores Per Worker: %d',
                  metadata.num_of_cores_per_host)
-    logging.info('*** Available Devices: %s', metadata.devices)
+    for device in metadata.devices:
+      logging.info('*** Available Device: %s', device)
   else:
     logging.info('Failed to find TPU: %s', metadata)
   return metadata
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_test.py b/tensorflow/contrib/tpu/python/tpu/tpu_test.py
index 336d8260c3c8a5c30efa603e3faeabcc0944b8d0..c3882b8a27bc835f906c47dc5219f280c53800b8 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_test.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_test.py
@@ -37,7 +37,7 @@ class TPUContextTest(test.TestCase):
   def testIsInContext(self):
     """Test that control_flow_util can check that we're in a TPU context."""
     z1 = array_ops.identity(1)
-    context = tpu.TPUReplicateContext(b"context")
+    context = tpu.TPUReplicateContext(b"context", 1)
     context.Enter()
     z2 = array_ops.identity(1)
     context.Exit()
diff --git a/tensorflow/contrib/tpu/tpu_estimator.md b/tensorflow/contrib/tpu/tpu_estimator.md
index 4ef8f9eebdb165e5fe221be8670276bf943159b3..639e70816926aaed850cee62dca6aa819b38de8b 100644
--- a/tensorflow/contrib/tpu/tpu_estimator.md
+++ b/tensorflow/contrib/tpu/tpu_estimator.md
@@ -172,7 +172,7 @@ It is always recommended to port a small, simple model first to make sure that
 you are familiar with the basic concepts of `TPUEstimator` and test end-to-end
 behavior. Once your simple model runs, gradually add more functionality.
 In addition, there are several sample models, available at
-[github.com/tensorflow/tpu-demos](https://github.com/tensorflow/tpu-demos).
+[github.com/tensorflow/tpu](https://github.com/tensorflow/tpu).
 
 To convert your code from the vanilla `Estimator` class to use TPUs, change the
 following (note some of the details may change over time):
diff --git a/tensorflow/contrib/training/BUILD b/tensorflow/contrib/training/BUILD
index 6ae2f382528c37ae647b73ea01a7f88c07580c78..4d2bfd3e434e60b3fac408931688e8e486b7e494 100644
--- a/tensorflow/contrib/training/BUILD
+++ b/tensorflow/contrib/training/BUILD
@@ -308,18 +308,6 @@ py_test(
     ],
 )
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 tf_proto_library(
     name = "protos_all",
     srcs = glob(["**/*.proto"]),
diff --git a/tensorflow/contrib/training/python/training/batch_sequences_with_states_test.py b/tensorflow/contrib/training/python/training/batch_sequences_with_states_test.py
index dbdbb08a8252c799924812c83fff7f0631424761..f305197c190b67355338c407a7895a0507941ddb 100644
--- a/tensorflow/contrib/training/python/training/batch_sequences_with_states_test.py
+++ b/tensorflow/contrib/training/python/training/batch_sequences_with_states_test.py
@@ -27,6 +27,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
@@ -517,6 +518,7 @@ class BatchSequencesWithStatesTestWithCApi(BatchSequencesWithStatesTest):
     ops._USE_C_API = self._prev_value
 
 
+@test_util.with_c_api
 class PaddingTest(test.TestCase):
 
   def testPaddingInvalidLengths(self):
diff --git a/tensorflow/contrib/training/python/training/evaluation.py b/tensorflow/contrib/training/python/training/evaluation.py
index 1a5fb45be0ad463f2b4189d97ce4bd41a67a1937..4bb53e867811b27dc95857cfdfe936dd2e3b5c6e 100644
--- a/tensorflow/contrib/training/python/training/evaluation.py
+++ b/tensorflow/contrib/training/python/training/evaluation.py
@@ -36,9 +36,8 @@ out the metrics values to stdout:
 
   # Choose the metrics to compute:
   names_to_values, names_to_updates = tf.contrib.metrics.aggregate_metric_map({
-      "accuracy": tf.contrib.metrics.streaming_accuracy(predictions, labels),
-      "mse": tf.contrib.metrics.streaming_mean_squared_error(
-        predictions, labels),
+      "accuracy": tf.metrics.accuracy(labels, predictions),
+      "mse": tf.metrics.mean_squared_error(labels, predictions),
   })
 
   # Define the summaries to write:
@@ -81,9 +80,8 @@ more summaries and call the evaluate_repeatedly method:
 
   # Choose the metrics to compute:
   names_to_values, names_to_updates = tf.contrib.metrics.aggregate_metric_map({
-      "accuracy": tf.contrib.metrics.streaming_accuracy(predictions, labels),
-      "mse": tf.contrib.metrics.streaming_mean_squared_error(
-          predictions, labels),
+      "accuracy": tf.metrics.accuracy(labels, predictions),
+      "mse": tf.metrics.mean_squared_error(labels, predictions),
   })
 
   # Define the summaries to write:
diff --git a/tensorflow/contrib/training/python/training/evaluation_test.py b/tensorflow/contrib/training/python/training/evaluation_test.py
index b07039916c203940039732c12938e7f342fa72a3..c36d00e8425ccbfe9338b50fc492dc1334d59731 100644
--- a/tensorflow/contrib/training/python/training/evaluation_test.py
+++ b/tensorflow/contrib/training/python/training/evaluation_test.py
@@ -27,7 +27,6 @@ import numpy as np
 from tensorflow.contrib.framework.python.ops import variables
 from tensorflow.contrib.layers.python.layers import layers
 from tensorflow.contrib.losses.python.losses import loss_ops
-from tensorflow.contrib.metrics.python.ops import metric_ops
 from tensorflow.contrib.training.python.training import evaluation
 from tensorflow.contrib.training.python.training import training
 from tensorflow.core.protobuf import config_pb2
@@ -38,6 +37,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import metrics
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variables as variables_lib
 from tensorflow.python.platform import gfile
@@ -196,7 +196,8 @@ class EvaluateOnceTest(test.TestCase):
     logits = logistic_classifier(inputs)
     predictions = math_ops.round(logits)
 
-    accuracy, update_op = metric_ops.streaming_accuracy(predictions, labels)
+    accuracy, update_op = metrics.accuracy(
+        predictions=predictions, labels=labels)
 
     checkpoint_path = evaluation.wait_for_new_checkpoint(checkpoint_dir)
 
@@ -311,7 +312,8 @@ class EvaluateRepeatedlyTest(test.TestCase):
     logits = logistic_classifier(inputs)
     predictions = math_ops.round(logits)
 
-    accuracy, update_op = metric_ops.streaming_accuracy(predictions, labels)
+    accuracy, update_op = metrics.accuracy(
+        predictions=predictions, labels=labels)
 
     final_values = evaluation.evaluate_repeatedly(
         checkpoint_dir=checkpoint_dir,
@@ -365,7 +367,8 @@ class EvaluateRepeatedlyTest(test.TestCase):
     logits = logistic_classifier(inputs)
     predictions = math_ops.round(logits)
 
-    accuracy, update_op = metric_ops.streaming_accuracy(predictions, labels)
+    accuracy, update_op = metrics.accuracy(
+        predictions=predictions, labels=labels)
 
     timeout_fn_calls = [0]
     def timeout_fn():
@@ -417,9 +420,8 @@ class EvaluateRepeatedlyTest(test.TestCase):
     self.assertEqual(final_values['my_var'], expected_value)
 
   def _create_names_to_metrics(self, predictions, labels):
-    accuracy0, update_op0 = metric_ops.streaming_accuracy(predictions, labels)
-    accuracy1, update_op1 = metric_ops.streaming_accuracy(
-        predictions + 1, labels)
+    accuracy0, update_op0 = metrics.accuracy(labels, predictions)
+    accuracy1, update_op1 = metrics.accuracy(labels, predictions + 1)
 
     names_to_values = {'Accuracy': accuracy0, 'Another_accuracy': accuracy1}
     names_to_updates = {'Accuracy': update_op0, 'Another_accuracy': update_op1}
diff --git a/tensorflow/contrib/training/python/training/hparam.py b/tensorflow/contrib/training/python/training/hparam.py
index 95e051e3b5bb9f8075e66891a45c64a27bca68d1..185f70a86d00fc4bd195fdccc2504515c8c42a89 100644
--- a/tensorflow/contrib/training/python/training/hparam.py
+++ b/tensorflow/contrib/training/python/training/hparam.py
@@ -630,6 +630,9 @@ class HParams(object):
   def __str__(self):
     return str(sorted(self.values().items()))
 
+  def __repr__(self):
+    return '%s(%s)' % (type(self).__name__, self.__str__())
+
   @staticmethod
   def _get_kind_name(param_type, is_list):
     """Returns the field name given parameter type and is_list.
diff --git a/tensorflow/contrib/training/python/training/sequence_queueing_state_saver.py b/tensorflow/contrib/training/python/training/sequence_queueing_state_saver.py
index 72231948856b38edd3d022a99a62e6d4c8c5649e..99d486b1833ccaa81a873d457a8edb06f3d9c7a5 100644
--- a/tensorflow/contrib/training/python/training/sequence_queueing_state_saver.py
+++ b/tensorflow/contrib/training/python/training/sequence_queueing_state_saver.py
@@ -1574,8 +1574,9 @@ def _padding(sequences, num_unroll):
   if not sequences:
     return 0, {}
 
-  sequences_dict = {}
-  for key, value in sequences.items():
+  # Sort 'sequences_dict' so 'length' will have a predictable value below.
+  sequences_dict = collections.OrderedDict()
+  for key, value in sorted(sequences.items()):
     if not (isinstance(value, sparse_tensor.SparseTensor) or
             isinstance(value, sparse_tensor.SparseTensorValue)):
       sequences_dict[key] = ops.convert_to_tensor(value)
diff --git a/tensorflow/contrib/util/BUILD b/tensorflow/contrib/util/BUILD
index 6c766e4f1c04fd9cca0b6e03382737d42b6fda20..d9ccda8e89a4c9a1b3f3d24915b9ad3fb4d9be5f 100644
--- a/tensorflow/contrib/util/BUILD
+++ b/tensorflow/contrib/util/BUILD
@@ -75,15 +75,3 @@ py_library(
         "//tensorflow/python:util",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/contrib/util/loader.py b/tensorflow/contrib/util/loader.py
index f4283cd9ed6eb185ec56bdfef45f6ddc5253d128..dca01d26f4c6291074d855322c8215027cfd2ace 100644
--- a/tensorflow/contrib/util/loader.py
+++ b/tensorflow/contrib/util/loader.py
@@ -42,9 +42,10 @@ def load_op_library(path):
     plugin.
   """
   if os.name == 'nt':
-    # To avoid makeing every user_ops aware of windows, re-write
-    # the file extension from .so to .dll.
-    path = re.sub(r'\.so$', '.dll', path)
+    # To avoid making every user_ops aware of windows, re-write
+    # the file extension from .so to .dll if .so file doesn't exist.
+    if not os.path.exists(path):
+      path = re.sub(r'\.so$', '.dll', path)
 
     # Currently we have only some user_ops as dlls on windows - don't try
     # to load them if the dll is not found.
diff --git a/tensorflow/contrib/verbs/BUILD b/tensorflow/contrib/verbs/BUILD
index 80a5d07ea43531ed2532443b6ff9327b9ece6df7..9720fd6e8657de18cf8d7565f834568ae52fdbda 100644
--- a/tensorflow/contrib/verbs/BUILD
+++ b/tensorflow/contrib/verbs/BUILD
@@ -11,18 +11,6 @@ load("//tensorflow:tensorflow.bzl", "tf_cuda_library")
 
 exports_files(["LICENSE"])
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 filegroup(
     name = "c_srcs",
     data = glob([
diff --git a/tensorflow/contrib/verbs/rdma.h b/tensorflow/contrib/verbs/rdma.h
index 94203ee2b3654bffe82d203cde8780a64f63ba2a..c9df6beb6b1d67f14d26d7f0420cb53b6347bf99 100644
--- a/tensorflow/contrib/verbs/rdma.h
+++ b/tensorflow/contrib/verbs/rdma.h
@@ -262,7 +262,7 @@ class RdmaTensorRequest {
   // Receive tensor content (RDMA write was completed).
   //
   // Decode proto if required and/or move to GPU if the content was not
-  // written to it directly (GPU direct is not avaliable). Afterwards,
+  // written to it directly (GPU direct is not available). Afterwards,
   // invoke Done().
   void RecvTensorContent();
 
diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index 1d11410332c76595fd1c3ac5e801c5c161570ca2..c5ca421ced2434c5fae3682b0c52bfe1ef9883c5 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -144,11 +144,14 @@ load(
     "tf_cuda_tests_tags",
     "if_static",
 )
+load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda")
 load(
     "//third_party/mkl:build_defs.bzl",
     "if_mkl",
 )
 
+exports_files(["ops/ops.pbtxt"])
+
 # -----------------------------------------------------------------------------
 # Public targets
 
@@ -253,7 +256,7 @@ tf_proto_library(
     visibility = ["//visibility:public"],
 )
 
-# Minimal lib to detect plafrom
+# Minimal lib to detect platform
 cc_library(
     name = "lib_platform",
     hdrs = [
@@ -261,6 +264,55 @@ cc_library(
     ],
 )
 
+PLATFORM_BASE_HDRS = [
+    "platform/logging.h",
+    "platform/macros.h",
+    "platform/types.h",
+    "platform/cpu_info.h",
+]
+
+PLATFORM_OTHER_HDRS = [
+    "platform/abi.h",
+    "platform/stacktrace.h",
+    "platform/stacktrace_handler.h",
+    "platform/context.h",
+    "platform/cpu_feature_guard.h",
+    "platform/dynamic_annotations.h",
+    "platform/env.h",
+    "platform/env_time.h",
+    "platform/file_system.h",
+    "platform/file_system_helper.h",
+    "platform/fingerprint.h",
+    "platform/init_main.h",
+    "platform/mem.h",
+    "platform/mutex.h",
+    "platform/net.h",
+    "platform/notification.h",
+    "platform/null_file_system.h",
+    "platform/prefetch.h",
+    "platform/profile_utils/clock_cycle_profiler.h",
+    "platform/profile_utils/cpu_utils.h",
+    "platform/protobuf.h",
+    "platform/strong_hash.h",
+    "platform/subprocess.h",
+    "platform/thread_annotations.h",
+]
+
+# Smaller platform libraries that don't depend on "lib" or "lib_internal".
+cc_library(
+    name = "platform_base",
+    srcs = glob([
+        "platform/*/integral_types.h",
+        "platform/*/logging.h",
+        "platform/*/cpu_info.h",
+    ]),
+    hdrs = PLATFORM_BASE_HDRS,
+    deps = [
+        ":lib_platform",
+        "//tensorflow/core/platform/default/build_config:base",
+    ],
+)
+
 # Minimal lib so that tools used for mobile compilation
 # don't have to depend on lib/platformlib.
 cc_library(
@@ -291,7 +343,8 @@ cc_library(
 # tf_cc_test and tf_cc_binary will include the necessary symbols.
 cc_library(
     name = "lib",
-    hdrs = [
+    hdrs = PLATFORM_BASE_HDRS +
+           PLATFORM_OTHER_HDRS + [
         "lib/bfloat16/bfloat16.h",
         "lib/core/arena.h",
         "lib/core/bitmap.h",
@@ -338,33 +391,6 @@ cc_library(
         "lib/strings/str_util.h",
         "lib/strings/strcat.h",
         "lib/strings/stringprintf.h",
-        "platform/abi.h",
-        "platform/context.h",
-        "platform/cpu_feature_guard.h",
-        "platform/cpu_info.h",
-        "platform/dynamic_annotations.h",
-        "platform/env.h",
-        "platform/env_time.h",
-        "platform/file_system.h",
-        "platform/fingerprint.h",
-        "platform/init_main.h",
-        "platform/logging.h",
-        "platform/macros.h",
-        "platform/mem.h",
-        "platform/mutex.h",
-        "platform/net.h",
-        "platform/notification.h",
-        "platform/null_file_system.h",
-        "platform/prefetch.h",
-        "platform/profile_utils/clock_cycle_profiler.h",
-        "platform/profile_utils/cpu_utils.h",
-        "platform/protobuf.h",
-        "platform/stacktrace.h",
-        "platform/strong_hash.h",
-        "platform/subprocess.h",
-        "platform/thread_annotations.h",
-        "platform/types.h",
-        "platform/windows/cpu_info.h",
     ],
     visibility = ["//visibility:public"],
     deps = [
@@ -379,13 +405,13 @@ cc_library(
 )
 
 cc_library(
-    name = "session_message",
-    srcs = ["util/session_message.cc"],
-    hdrs = ["util/session_message.h"],
+    name = "stacktrace",
+    srcs = glob(["platform/*/stacktrace.h"]),
+    hdrs = ["platform/stacktrace.h"],
     deps = [
-        ":framework",
-        ":lib",
-        ":protos_all_cc",
+        ":abi",
+        ":lib_platform",
+        "//tensorflow/core/platform/default/build_config:stacktrace",
     ],
 )
 
@@ -394,11 +420,34 @@ cc_library(
     srcs = ["platform/stacktrace_handler.cc"],
     hdrs = ["platform/stacktrace_handler.h"],
     deps = [
-        ":lib",
+        ":abi",
         ":lib_platform",
+        ":stacktrace",
     ],
 )
 
+cc_library(
+    name = "session_message",
+    srcs = ["util/session_message.cc"],
+    hdrs = ["util/session_message.h"],
+    deps = [
+        ":framework",
+        ":lib",
+        ":protos_all_cc",
+    ],
+)
+
+# Libraries that will eventually be moved into lib/core
+# Note that stringpiece_test can't be place here yet, because we are
+# required to use tf_cc_test, and that rule will change / into _
+cc_library(
+    name = "core_stringpiece",
+    srcs = ["lib/core/stringpiece.cc"],
+    hdrs = ["lib/core/stringpiece.h"],
+    copts = tf_copts(),
+    deps = [":platform_base"],
+)
+
 # Test support library needed for all tests
 # This is currently public, but may be made internal in the
 # future.  Try to avoid depending on it.
@@ -426,6 +475,27 @@ cc_library(
     ] + tf_additional_test_deps(),
 )
 
+# Testing libraries - lite versions that don't depend on all of "lib" or
+# "lib_internal". Instead, they only need a much smaller set of support
+# libraries such as ":platform_base" and ":core_stringpiece".
+cc_library(
+    name = "test_lite",
+    testonly = 1,
+    srcs = [
+        "platform/test.cc",
+    ],
+    hdrs = [
+        "platform/test.h",
+        "platform/test_benchmark.h",
+    ],
+    copts = tf_copts(),
+    deps = [
+        ":lib_platform",
+        ":platform_base",
+        "//tensorflow/core/platform/default/build_config:gtest",
+    ],
+)
+
 # This build rule (along with :framework_internal, :lib, and :lib_internal)
 # purposefully omits the definitions of many declared symbols, which are
 # included in //tensorflow:libtensorflow_framework.so. Using tf_cc_test and tf_cc_binary
@@ -443,6 +513,7 @@ tf_cuda_library(
         "framework/attr_value_util.h",
         "framework/bfloat16.h",
         "framework/cancellation.h",
+        "framework/collective.h",
         "framework/common_shape_fns.h",
         "framework/control_flow.h",  # TODO(josh11b): Make internal?
         "framework/dataset.h",
@@ -482,7 +553,6 @@ tf_cuda_library(
         "framework/type_index.h",
         "framework/type_traits.h",
         "framework/types.h",
-        "framework/visitable_allocator.h",
         "public/version.h",
         "util/activation_mode.h",
         "util/bcast.h",
@@ -613,12 +683,16 @@ tf_gen_op_libs(
     op_lib_names = [
         "batch_ops",
         "bitwise_ops",
+        "boosted_trees_ops",
         "candidate_sampling_ops",
         "checkpoint_ops",
+        "collective_ops",
         "control_flow_ops",
         "ctc_ops",
         "data_flow_ops",
         "dataset_ops",
+        "decode_proto_ops",
+        "encode_proto_ops",
         "function_ops",
         "functional_ops",
         "image_ops",
@@ -635,6 +709,7 @@ tf_gen_op_libs(
         "random_ops",
         "remote_fused_graph_ops",
         "resource_variable_ops",
+        "rpc_ops",
         "scoped_allocator_ops",
         "sdca_ops",
         "set_ops",
@@ -725,13 +800,17 @@ cc_library(
         ":audio_ops_op_lib",
         ":batch_ops_op_lib",
         ":bitwise_ops_op_lib",
+        ":boosted_trees_ops_op_lib",
         ":candidate_sampling_ops_op_lib",
         ":checkpoint_ops_op_lib",
+        ":collective_ops_op_lib",
         ":control_flow_ops_op_lib",
         ":ctc_ops_op_lib",
         ":cudnn_rnn_ops_op_lib",
         ":data_flow_ops_op_lib",
         ":dataset_ops_op_lib",
+        ":decode_proto_ops_op_lib",
+        ":encode_proto_ops_op_lib",
         ":function_ops_op_lib",
         ":functional_ops_op_lib",
         ":image_ops_op_lib",
@@ -748,6 +827,7 @@ cc_library(
         ":random_ops_op_lib",
         ":remote_fused_graph_ops_op_lib",
         ":resource_variable_ops_op_lib",
+        ":rpc_ops_op_lib",
         ":scoped_allocator_ops_op_lib",
         ":script_ops_op_lib",
         ":sdca_ops_op_lib",
@@ -866,13 +946,17 @@ cc_library(
         "//tensorflow/core/kernels:audio",
         "//tensorflow/core/kernels:batch_kernels",
         "//tensorflow/core/kernels:bincount_op",
+        "//tensorflow/core/kernels:boosted_trees_ops",
         "//tensorflow/core/kernels:candidate_sampler_ops",
         "//tensorflow/core/kernels:checkpoint_ops",
+        "//tensorflow/core/kernels:collective_ops",
         "//tensorflow/core/kernels:control_flow_ops",
         "//tensorflow/core/kernels:ctc_ops",
         "//tensorflow/core/kernels:cudnn_rnn_kernels",
         "//tensorflow/core/kernels:data_flow",
         "//tensorflow/core/kernels:dataset_ops",
+        "//tensorflow/core/kernels:decode_proto_op",
+        "//tensorflow/core/kernels:encode_proto_op",
         "//tensorflow/core/kernels:fake_quant_ops",
         "//tensorflow/core/kernels:function_ops",
         "//tensorflow/core/kernels:functional_ops",
@@ -894,6 +978,7 @@ cc_library(
         "//tensorflow/core/kernels:remote_fused_graph_ops",
         "//tensorflow/core/kernels:required",
         "//tensorflow/core/kernels:resource_variable_ops",
+        "//tensorflow/core/kernels:rpc_op",
         "//tensorflow/core/kernels:scoped_allocator_ops",
         "//tensorflow/core/kernels:sdca_ops",
         "//tensorflow/core/kernels:set_kernels",
@@ -924,6 +1009,9 @@ cc_library(
         "//tensorflow/core/kernels:mkl_softmax_op",
         "//tensorflow/core/kernels:mkl_tfconv_op",
         "//tensorflow/core/kernels:mkl_aggregate_ops",
+    ]) + if_cuda([
+        "//tensorflow/core/grappler/optimizers:gpu_swapping_kernels",
+        "//tensorflow/core/grappler/optimizers:gpu_swapping_ops",
     ]),
 )
 
@@ -1618,23 +1706,27 @@ cc_library(
         exclude = [
             "**/*test*",
             "framework/variant.cc",
+            "lib/core/stringpiece.cc",
             "lib/hash/crc32c_accelerate.cc",
             "lib/gif/**/*",
             "lib/jpeg/**/*",
             "platform/**/env_time.cc",
             "platform/**/cuda_libdevice_path.cc",
             "platform/**/device_tracer.cc",
+            "platform/abi.cc",
             "platform/variant_coding.cc",
             "platform/**/variant_cord_coding.cc",
         ],
     ) + tf_additional_lib_srcs(
         exclude = [
             "**/*test*",
+            "lib/core/stringpiece.cc",
             "platform/**/cuda.h",
             "platform/**/cuda_libdevice_path.cc",
             "platform/**/stream_executor.h",
             "platform/**/env_time.cc",
             "platform/**/device_tracer.cc",
+            "platform/abi.cc",
             "platform/variant_coding.cc",
             "platform/**/variant_cord_coding.cc",
         ] +
@@ -1648,6 +1740,8 @@ cc_library(
     deps = tf_additional_lib_deps() + [
         ":lib_hash_crc32c_accelerate_internal",
         ":lib_proto_parsing",
+        ":abi",
+        ":core_stringpiece",
         "//third_party/eigen3",
         "//tensorflow/core/platform/default/build_config:platformlib",
         "@snappy",
@@ -1879,7 +1973,6 @@ FRAMEWORK_INTERNAL_PUBLIC_HEADERS = [
     "framework/tracking_allocator.h",  # only needed for tests
     "framework/unique_tensor_references.h",
     "framework/variant.h",
-    "framework/visitable_allocator.h",
     "platform/variant_coding.h",
     "util/command_line_flags.h",
     "util/env_var.h",
@@ -2157,12 +2250,17 @@ tf_cuda_library(
 CORE_CPU_LIB_HEADERS = CORE_CPU_BASE_HDRS + [
     "common_runtime/allocator_retry.h",
     "common_runtime/bfc_allocator.h",
+    "common_runtime/buf_rendezvous.h",
     "common_runtime/build_graph_options.h",
+    "common_runtime/collective_executor_mgr.h",
+    "common_runtime/collective_param_resolver_local.h",
+    "common_runtime/collective_rma_local.h",
     "common_runtime/constant_folding.h",
     "common_runtime/copy_tensor.h",
     "common_runtime/costmodel_manager.h",
     "common_runtime/debugger_state_interface.h",
     "common_runtime/device_factory.h",
+    "common_runtime/device_resolver_local.h",
     "common_runtime/device_set.h",
     "common_runtime/dma_helper.h",
     "common_runtime/eigen_thread_pool.h",
@@ -2173,6 +2271,7 @@ CORE_CPU_LIB_HEADERS = CORE_CPU_BASE_HDRS + [
     "common_runtime/mkl_cpu_allocator.h",
     "common_runtime/optimization_registry.h",
     "common_runtime/pending_counts.h",
+    "common_runtime/placer.h",
     "common_runtime/process_util.h",
     "common_runtime/profile_handler.h",
     "common_runtime/renamed_device.h",
@@ -2181,10 +2280,11 @@ CORE_CPU_LIB_HEADERS = CORE_CPU_BASE_HDRS + [
     "common_runtime/scoped_allocator.h",
     "common_runtime/scoped_allocator_mgr.h",
     "common_runtime/session_factory.h",
-    "common_runtime/placer.h",
+    "common_runtime/single_threaded_cpu_device.h",
     "common_runtime/stats_publisher_interface.h",
     "common_runtime/step_stats_collector.h",
     "common_runtime/threadpool_device.h",
+    "common_runtime/visitable_allocator.h",
     "graph/gradients.h",
     "graph/quantize_training.h",
 ] + if_mkl(["graph/mkl_graph_util.h"])
@@ -2195,7 +2295,11 @@ tf_cuda_library(
         "common_runtime/accumulate_n_optimizer.cc",
         "common_runtime/allocator_retry.cc",
         "common_runtime/bfc_allocator.cc",
+        "common_runtime/buf_rendezvous.cc",
         "common_runtime/build_graph_options.cc",
+        "common_runtime/collective_executor_mgr.cc",
+        "common_runtime/collective_param_resolver_local.cc",
+        "common_runtime/collective_rma_local.cc",
         "common_runtime/constant_folding.cc",
         "common_runtime/copy_tensor.cc",
         "common_runtime/costmodel_manager.cc",
@@ -2203,6 +2307,7 @@ tf_cuda_library(
         "common_runtime/device.cc",
         "common_runtime/device_factory.cc",
         "common_runtime/device_mgr.cc",
+        "common_runtime/device_resolver_local.cc",
         "common_runtime/device_set.cc",
         "common_runtime/executor.cc",
         "common_runtime/function.cc",
@@ -2581,6 +2686,23 @@ cc_library(
     alwayslink = 1,
 )
 
+# This is the lite version of a main() for tests. It does not include any
+# support for reporting benchmark results when running on TPUs.
+cc_library(
+    name = "test_lite_main",
+    testonly = 1,
+    srcs = ["platform/test_main.cc"],
+    copts = tf_copts(),
+    deps = [
+        ":core_stringpiece",
+        ":lib_platform",
+        ":stacktrace_handler",
+        ":test_lite",
+        "//tensorflow/core/platform/default/build_config:test_lite_main",
+    ],
+    alwayslink = 1,
+)
+
 tf_cc_tests(
     name = "low_level_library_tests",
     size = "small",
@@ -2810,6 +2932,11 @@ tf_cc_tests(
     name = "higher_level_tests",
     size = "small",
     srcs = [
+        "common_runtime/buf_rendezvous_test.cc",
+        "common_runtime/collective_executor_mgr_test.cc",
+        "common_runtime/collective_param_resolver_local_test.cc",
+        "common_runtime/collective_rma_local_test.cc",
+        "common_runtime/device_resolver_local_test.cc",
         "common_runtime/device_set_test.cc",
         "common_runtime/optimization_registry_test.cc",
         "common_runtime/pending_counts_test.cc",
@@ -3233,6 +3360,18 @@ tf_cc_test(
     ],
 )
 
+tf_cc_test(
+    name = "common_runtime_process_util_test",
+    size = "small",
+    srcs = ["common_runtime/process_util_test.cc"],
+    linkstatic = tf_kernel_tests_linkstatic(),
+    deps = [
+        ":core_cpu_internal",
+        ":test",
+        ":test_main",
+    ],
+)
+
 tf_cc_test(
     name = "common_runtime_rendezvous_util_test",
     size = "small",
@@ -3820,18 +3959,6 @@ cc_library(
 # -----------------------------------------------------------------------------
 # Google-internal targets go here (must be at the end).
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 alias(
     name = "android_srcs_no_runtime",
     actual = ":mobile_srcs_no_runtime",
diff --git a/tensorflow/core/api_def/BUILD b/tensorflow/core/api_def/BUILD
index 58dbac4e8edac7079d315fbfcdafbd136793df0b..19d643880966f7607405539a5ad43d8e03dc13fb 100644
--- a/tensorflow/core/api_def/BUILD
+++ b/tensorflow/core/api_def/BUILD
@@ -17,18 +17,6 @@ load(
     "tf_cc_test",
 )
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 filegroup(
     name = "base_api_def",
     srcs = glob(["base_api/*"]),
diff --git a/tensorflow/core/api_def/base_api/api_def_BoostedTreesCalculateBestGainsPerFeature.pbtxt b/tensorflow/core/api_def/base_api/api_def_BoostedTreesCalculateBestGainsPerFeature.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b1921e3507bb1a6e8f175305400e4bfbad068d38
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_BoostedTreesCalculateBestGainsPerFeature.pbtxt
@@ -0,0 +1,87 @@
+op {
+  graph_op_name: "BoostedTreesCalculateBestGainsPerFeature"
+  visibility: HIDDEN
+  in_arg {
+    name: "node_id_range"
+    description: <<END
+A Rank 1 tensor (shape=[2]) to specify the range [first, last] of node ids to process within `stats_summary_list`. The nodes are iterated between the two nodes specified by the tensor, as like `for node_id in range(node_id_range[0], node_id_range[1]+1)` (Note that the last index node_id_range[1] is inclusive).
+END
+  }
+  in_arg {
+    name: "stats_summary_list"
+    description: <<END
+A list of Rank 3 tensor (#shape=[max_splits, bucket, 2]) for accumulated stats summary (gradient/hessian) per node per buckets for each feature. The first dimension of the tensor is the maximum number of splits, and thus not all elements of it will be used, but only the indexes specified by node_ids will be used.
+END
+  }
+  out_arg {
+    name: "node_ids_list"
+    description: <<END
+An output list of Rank 1 tensors indicating possible split node ids for each feature. The length of the list is num_features, but each tensor has different size as each feature provides different possible nodes. See above for details like shapes and sizes.
+END
+  }
+  out_arg {
+    name: "gains_list"
+    description: <<END
+An output list of Rank 1 tensors indicating the best gains for each feature to split for certain nodes. See above for details like shapes and sizes.
+END
+  }
+  out_arg {
+    name: "thresholds_list"
+    description: <<END
+An output list of Rank 1 tensors indicating the bucket id to compare with (as a threshold) for split in each node. See above for details like shapes and sizes.
+END
+  }
+  out_arg {
+    name: "left_node_contribs_list"
+    description: <<END
+A list of Rank 2 tensors indicating the contribution of the left nodes when branching from parent nodes (given by the tensor element in the output node_ids_list) to the left direction by the given threshold for each feature. This value will be used to make the left node value by adding to the parent node value. Second dimension size is 1 for 1-dimensional logits, but would be larger for multi-class problems. See above for details like shapes and sizes.
+END
+  }
+  out_arg {
+    name: "right_node_contribs_list"
+    description: <<END
+A list of Rank 2 tensors, with the same shape/conditions as left_node_contribs_list, but just that the value is for the right node.
+END
+  }
+  attr {
+    name: "l1"
+    description: <<END
+l1 regularization factor on leaf weights, per instance based.
+END
+  }
+  attr {
+    name: "l2"
+    description: <<END
+l2 regularization factor on leaf weights, per instance based.
+END
+  }
+  attr {
+    name: "tree_complexity"
+    description: <<END
+adjustment to the gain, per leaf based.
+END
+  }
+  attr {
+    name: "max_splits"
+    description: <<END
+the number of nodes that can be split in the whole tree. Used as a dimension of output tensors.
+END
+  }
+  attr {
+    name: "num_features"
+    description: <<END
+inferred from the size of `stats_summary_list`; the number of total features.
+END
+  }
+  summary: "Calculates gains for each feature and returns the best possible split information for the feature."
+  description: <<END
+The split information is the best threshold (bucket id), gains and left/right node contributions per node for each feature.
+
+It is possible that not all nodes can be split on each feature. Hence, the list of possible nodes can differ between the features. Therefore, we return `node_ids_list` for each feature, containing the list of nodes that this feature can be used to split.
+
+In this manner, the output is the best split per features and per node, so that it needs to be combined later to produce the best split for each node (among all possible features).
+
+The length of output lists are all of the same length, `num_features`.
+The output shapes are compatible in a way that the first dimension of all tensors of all lists are the same and equal to the number of possible split nodes for each feature.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_BoostedTreesCreateEnsemble.pbtxt b/tensorflow/core/api_def/base_api/api_def_BoostedTreesCreateEnsemble.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..aee73b910f0ae8b542b3741ceeeadb9624126a27
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_BoostedTreesCreateEnsemble.pbtxt
@@ -0,0 +1,23 @@
+op {
+  graph_op_name: "BoostedTreesCreateEnsemble"
+  visibility: HIDDEN
+  in_arg {
+    name: "tree_ensemble_handle"
+    description: <<END
+Handle to the tree ensemble resource to be created.
+END
+  }
+  in_arg {
+    name: "stamp_token"
+    description: <<END
+Token to use as the initial value of the resource stamp.
+END
+  }
+  in_arg {
+    name: "tree_ensemble_serialized"
+    description: <<END
+Serialized proto of the tree ensemble.
+END
+  }
+  summary: "Creates a tree ensemble model and returns a handle to it."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_BoostedTreesDeserializeEnsemble.pbtxt b/tensorflow/core/api_def/base_api/api_def_BoostedTreesDeserializeEnsemble.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b1602ba045b95db060295a6b05f17dd8a06924d8
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_BoostedTreesDeserializeEnsemble.pbtxt
@@ -0,0 +1,26 @@
+op {
+  graph_op_name: "BoostedTreesDeserializeEnsemble"
+  visibility: HIDDEN
+  in_arg {
+    name: "tree_ensemble_handle"
+    description: <<END
+Handle to the tree ensemble.
+END
+  }
+  in_arg {
+    name: "stamp_token"
+    description: <<END
+Token to use as the new value of the resource stamp.
+END
+  }
+  in_arg {
+    name: "tree_ensemble_serialized"
+    description: <<END
+Serialized proto of the ensemble.
+END
+  }
+  summary: "Deserializes a serialized tree ensemble config and replaces current tree"
+  description: <<END
+ensemble.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_BoostedTreesEnsembleResourceHandleOp.pbtxt b/tensorflow/core/api_def/base_api/api_def_BoostedTreesEnsembleResourceHandleOp.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1bce5639a2049dba897fe45680fe98fa45f76c24
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_BoostedTreesEnsembleResourceHandleOp.pbtxt
@@ -0,0 +1,5 @@
+op {
+  graph_op_name: "BoostedTreesEnsembleResourceHandleOp"
+  visibility: HIDDEN
+  summary: "Creates a handle to a BoostedTreesEnsembleResource"
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_BoostedTreesGetEnsembleStates.pbtxt b/tensorflow/core/api_def/base_api/api_def_BoostedTreesGetEnsembleStates.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ef45a92498dadb9b911fbb99a1365f81a72060f2
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_BoostedTreesGetEnsembleStates.pbtxt
@@ -0,0 +1,35 @@
+op {
+  graph_op_name: "BoostedTreesGetEnsembleStates"
+  visibility: HIDDEN
+  in_arg {
+    name: "tree_ensemble_handle"
+    description: <<END
+Handle to the tree ensemble.
+END
+  }
+  out_arg {
+    name: "stamp_token"
+    description: <<END
+Stamp token of the tree ensemble resource.
+END
+  }
+  out_arg {
+    name: "num_trees"
+    description: <<END
+The number of trees in the tree ensemble resource.
+END
+  }
+  out_arg {
+    name: "num_finalized_trees"
+    description: <<END
+The number of trees that were finished successfully.
+END
+  }
+  out_arg {
+    name: "num_attempted_layers"
+    description: <<END
+The number of layers we attempted to build (but not necessarily succeeded).
+END
+  }
+  summary: "Retrieves the tree ensemble resource stamp token."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_BoostedTreesMakeStatsSummary.pbtxt b/tensorflow/core/api_def/base_api/api_def_BoostedTreesMakeStatsSummary.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..dc0856c900d1b1238d0641fef3f2f57b95a209fe
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_BoostedTreesMakeStatsSummary.pbtxt
@@ -0,0 +1,56 @@
+op {
+  graph_op_name: "BoostedTreesMakeStatsSummary"
+  visibility: HIDDEN
+  in_arg {
+    name: "node_ids"
+    description: <<END
+int32 Rank 1 Tensor containing node ids, which each example falls into for the requested layer.
+END
+  }
+  in_arg {
+    name: "gradients"
+    description: <<END
+float32; Rank 2 Tensor (shape=[#examples, 1]) for gradients.
+END
+  }
+  in_arg {
+    name: "hessians"
+    description: <<END
+float32; Rank 2 Tensor (shape=[#examples, 1]) for hessians.
+END
+  }
+  in_arg {
+    name: "bucketized_features_list"
+    description: <<END
+int32 list of Rank 1 Tensors, each containing the bucketized feature (for each feature column).
+END
+  }
+  out_arg {
+    name: "stats_summary"
+    description: <<END
+output Rank 4 Tensor (shape=[#features, #splits, #buckets, 2]) containing accumulated stats put into the corresponding node and bucket. The first index of 4th dimension refers to gradients, and the second to hessians.
+END
+  }
+  attr {
+    name: "max_splits"
+    description: <<END
+int; the maximum number of splits possible in the whole tree.
+END
+  }
+  attr {
+    name: "num_buckets"
+    description: <<END
+int; equals to the maximum possible value of bucketized feature.
+END
+  }
+  attr {
+    name: "num_features"
+    description: <<END
+int; inferred from the size of bucketized_features_list; the number of features.
+END
+  }
+  summary: "Makes the summary of accumulated stats for the batch."
+  description: <<END
+The summary stats contains gradients and hessians accumulated into the corresponding node and bucket for each example.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_BoostedTreesPredict.pbtxt b/tensorflow/core/api_def/base_api/api_def_BoostedTreesPredict.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b23e77a1fa874e8b7569805527912a84c86111e5
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_BoostedTreesPredict.pbtxt
@@ -0,0 +1,41 @@
+op {
+  graph_op_name: "BoostedTreesPredict"
+  visibility: HIDDEN
+  in_arg {
+    name: "bucketized_features"
+    description: <<END
+A list of rank 1 Tensors containing bucket id for each
+feature.
+END
+  }
+  out_arg {
+    name: "logits"
+    description: <<END
+Output rank 2 Tensor containing logits for each example.
+END
+  }
+  attr {
+    name: "num_bucketized_features"
+    description: <<END
+Inferred.
+END
+  }
+  attr {
+    name: "logits_dimension"
+    description: <<END
+scalar, dimension of the logits, to be used for partial logits
+shape.
+END
+  }
+  attr {
+    name: "max_depth"
+    description: <<END
+scalar, max depth of trees. To be used for parallelization costs.
+END
+  }
+  summary: "Runs multiple additive regression ensemble predictors on input instances and"
+  description: <<END
+computes the logits. It is designed to be used during prediction.
+It traverses all the trees and calculates the final score for each instance.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_BoostedTreesSerializeEnsemble.pbtxt b/tensorflow/core/api_def/base_api/api_def_BoostedTreesSerializeEnsemble.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c0b3688d8a3f23d24178343d5b5e6b3ab5741b9f
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_BoostedTreesSerializeEnsemble.pbtxt
@@ -0,0 +1,23 @@
+op {
+  graph_op_name: "BoostedTreesSerializeEnsemble"
+  visibility: HIDDEN
+  in_arg {
+    name: "tree_ensemble_handle"
+    description: <<END
+Handle to the tree ensemble.
+END
+  }
+  out_arg {
+    name: "stamp_token"
+    description: <<END
+Stamp token of the tree ensemble resource.
+END
+  }
+  out_arg {
+    name: "tree_ensemble_serialized"
+    description: <<END
+Serialized proto of the ensemble.
+END
+  }
+  summary: "Serializes the tree ensemble to a proto."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_BoostedTreesTrainingPredict.pbtxt b/tensorflow/core/api_def/base_api/api_def_BoostedTreesTrainingPredict.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7203d3cb5891ff78c441badba690a4a7d018cb83
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_BoostedTreesTrainingPredict.pbtxt
@@ -0,0 +1,69 @@
+op {
+  graph_op_name: "BoostedTreesTrainingPredict"
+  visibility: HIDDEN
+  in_arg {
+    name: "cached_tree_ids"
+    description: <<END
+Rank 1 Tensor containing cached tree ids which is the starting
+tree of prediction.
+END
+  }
+  in_arg {
+    name: "cached_node_ids"
+    description: <<END
+Rank 1 Tensor containing cached node id which is the starting
+node of prediction.
+END
+  }
+  in_arg {
+    name: "bucketized_features"
+    description: <<END
+A list of rank 1 Tensors containing bucket id for each
+feature.
+END
+  }
+  out_arg {
+    name: "partial_logits"
+    description: <<END
+Rank 2 Tensor containing logits update (with respect to cached
+values stored) for each example.
+END
+  }
+  out_arg {
+    name: "tree_ids"
+    description: <<END
+Rank 1 Tensor containing new tree ids for each example.
+END
+  }
+  out_arg {
+    name: "node_ids"
+    description: <<END
+Rank 1 Tensor containing new node ids in the new tree_ids.
+END
+  }
+  attr {
+    name: "num_bucketized_features"
+    description: <<END
+Inferred.
+END
+  }
+  attr {
+    name: "logits_dimension"
+    description: <<END
+scalar, dimension of the logits, to be used for partial logits
+shape.
+END
+  }
+  attr {
+    name: "max_depth"
+    description: <<END
+scalar, max depth of trees. To be used for parallelization costs.
+END
+  }
+  summary: "Runs multiple additive regression ensemble predictors on input instances and"
+  description: <<END
+computes the update to cached logits. It is designed to be used during training.
+It traverses the trees starting from cached tree id and cached node id and
+calculates the updates to be pushed to the cache.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_BoostedTreesUpdateEnsemble.pbtxt b/tensorflow/core/api_def/base_api/api_def_BoostedTreesUpdateEnsemble.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..00f8953875783e87f40ce8c04187919424bae6da
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_BoostedTreesUpdateEnsemble.pbtxt
@@ -0,0 +1,82 @@
+op {
+  graph_op_name: "BoostedTreesUpdateEnsemble"
+  visibility: HIDDEN
+  in_arg {
+    name: "tree_ensemble_handle"
+    description: <<END
+Handle to the ensemble variable.
+END
+  }
+  in_arg {
+    name: "feature_ids"
+    description: <<END
+Rank 1 tensor with ids for each feature. This is the real id of
+the feature that will be used in the split.
+END
+  }
+  in_arg {
+    name: "node_ids"
+    description: <<END
+List of rank 1 tensors representing the nodes for which this feature
+has a split.
+END
+  }
+  in_arg {
+    name: "gains"
+    description: <<END
+List of rank 1 tensors representing the gains for each of the feature's
+split.
+END
+  }
+  in_arg {
+    name: "thresholds"
+    description: <<END
+List of rank 1 tensors representing the thesholds for each of the
+feature's split.
+END
+  }
+  in_arg {
+    name: "left_node_contribs"
+    description: <<END
+List of rank 2 tensors with left leaf contribs for each of
+the feature's splits. Will be added to the previous node values to constitute
+the values of the left nodes.
+END
+  }
+  in_arg {
+    name: "right_node_contribs"
+    description: <<END
+List of rank 2 tensors with right leaf contribs for each
+of the feature's splits. Will be added to the previous node values to constitute
+the values of the right nodes.
+END
+  }
+  attr {
+    name: "max_depth"
+    description: <<END
+Max depth of the tree to build.
+END
+  }
+  attr {
+    name: "learning_rate"
+    description: <<END
+shrinkage const for each new tree.
+END
+  }
+  attr {
+    name: "pruning_mode"
+    description: <<END
+0-No pruning, 1-Pre-pruning, 2-Post-pruning.
+END
+  }
+  attr {
+    name: "num_features"
+    description: <<END
+Number of features that have best splits returned. INFERRED.
+END
+  }
+  summary: "Updates the tree ensemble by either adding a layer to the last tree being grown"
+  description: <<END
+or by starting a new tree.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ClipByValue.pbtxt b/tensorflow/core/api_def/base_api/api_def_ClipByValue.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..803d8970ab78de347936a8dbbd2f39d8d9915f1e
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ClipByValue.pbtxt
@@ -0,0 +1,36 @@
+op {
+  graph_op_name: "ClipByValue"
+  in_arg {
+    name: "t"
+    description: <<END
+A `Tensor`.
+END
+  }
+  in_arg {
+    name: "clip_value_min"
+    description: <<END
+A 0-D (scalar) `Tensor`, or a `Tensor` with the same shape
+as `t`. The minimum value to clip by.
+END
+  }
+  in_arg {
+    name: "clip_value_max"
+    description: <<END
+A 0-D (scalar) `Tensor`, or a `Tensor` with the same shape
+as `t`. The maximum value to clip by.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+A clipped `Tensor` with the same shape as input 't'.
+END
+  }
+  summary: "Clips tensor values to a specified min and max."
+  description: <<END
+Given a tensor `t`, this operation returns a tensor of the same type and
+shape as `t` with its values clipped to `clip_value_min` and `clip_value_max`.
+Any values less than `clip_value_min` are set to `clip_value_min`. Any values
+greater than `clip_value_max` are set to `clip_value_max`.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_CollectiveBcastRecv.pbtxt b/tensorflow/core/api_def/base_api/api_def_CollectiveBcastRecv.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..88049bca365f4a738cac9975d0e14340e1ae401d
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_CollectiveBcastRecv.pbtxt
@@ -0,0 +1,5 @@
+op {
+  graph_op_name: "CollectiveBcastRecv"
+  visibility: SKIP
+  summary: "Receives a tensor value broadcast from another device."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_CollectiveBcastSend.pbtxt b/tensorflow/core/api_def/base_api/api_def_CollectiveBcastSend.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7ff70f5b178af117e694cf7e998423b5ea58ac5a
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_CollectiveBcastSend.pbtxt
@@ -0,0 +1,5 @@
+op {
+  graph_op_name: "CollectiveBcastSend"
+  visibility: SKIP
+  summary: "Broadcasts a tensor value to one or more other devices."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_CollectiveReduce.pbtxt b/tensorflow/core/api_def/base_api/api_def_CollectiveReduce.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..10d9771d46d408d9c0414dab4ae5954a75dfc47e
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_CollectiveReduce.pbtxt
@@ -0,0 +1,5 @@
+op {
+  graph_op_name: "CollectiveReduce"
+  visibility: SKIP
+  summary: "Mutually reduces multiple tensors of identical type and shape."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_DecodeProtoV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_DecodeProtoV2.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c8152f53c4ded035140abd24ba006bf391641cf1
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_DecodeProtoV2.pbtxt
@@ -0,0 +1,116 @@
+op {
+  graph_op_name: "DecodeProtoV2"
+  in_arg {
+    name: "bytes"
+    description: <<END
+Tensor of serialized protos with shape `batch_shape`.
+END
+  }
+  out_arg {
+    name: "sizes"
+    description: <<END
+Tensor of int32 with shape `[batch_shape, len(field_names)]`.
+Each entry is the number of values found for the corresponding field.
+Optional fields may have 0 or 1 values.
+END
+  }
+  out_arg {
+    name: "values"
+    description: <<END
+List of tensors containing values for the corresponding field.
+`values[i]` has datatype `output_types[i]`
+and shape `[batch_shape, max(sizes[...,i])]`.
+END
+  }
+  attr {
+    name: "message_type"
+    description: <<END
+Name of the proto message type to decode.
+END
+  }
+  attr {
+    name: "field_names"
+    description: <<END
+List of strings containing proto field names.
+END
+  }
+  attr {
+    name: "output_types"
+    description: <<END
+List of TF types to use for the respective field in field_names.
+END
+  }
+  attr {
+    name: "descriptor_source"
+    description: <<END
+Either the special value `local://` or a path to a file containing
+a serialized `FileDescriptorSet`.
+END
+  }
+  attr {
+    name: "message_format"
+    description: <<END
+Either `binary` or `text`.
+END
+  }
+  attr {
+    name: "sanitize"
+    description: <<END
+Whether to sanitize the result or not.
+END
+  }
+  summary: <<END
+The op extracts fields from a serialized protocol buffers message into tensors.
+END
+  description: <<END
+The `decode_proto` op extracts fields from a serialized protocol buffers
+message into tensors.  The fields in `field_names` are decoded and converted
+to the corresponding `output_types` if possible.
+
+A `message_type` name must be provided to give context for the field
+names. The actual message descriptor can be looked up either in the
+linked-in descriptor pool or a filename provided by the caller using
+the `descriptor_source` attribute.
+
+Each output tensor is a dense tensor. This means that it is padded to
+hold the largest number of repeated elements seen in the input
+minibatch. (The shape is also padded by one to prevent zero-sized
+dimensions). The actual repeat counts for each example in the
+minibatch can be found in the `sizes` output. In many cases the output
+of `decode_proto` is fed immediately into tf.squeeze if missing values
+are not a concern. When using tf.squeeze, always pass the squeeze
+dimension explicitly to avoid surprises.
+
+For the most part, the mapping between Proto field types and
+TensorFlow dtypes is straightforward. However, there are a few
+special cases:
+
+- A proto field that contains a submessage or group can only be converted
+to `DT_STRING` (the serialized submessage). This is to reduce the
+complexity of the API. The resulting string can be used as input
+to another instance of the decode_proto op.
+
+- TensorFlow lacks support for unsigned integers. The ops represent uint64
+types as a `DT_INT64` with the same twos-complement bit pattern
+(the obvious way). Unsigned int32 values can be represented exactly by
+specifying type `DT_INT64`, or using twos-complement if the caller
+specifies `DT_INT32` in the `output_types` attribute.
+
+The `descriptor_source` attribute selects a source of protocol
+descriptors to consult when looking up `message_type`. This may be a
+filename containing a serialized `FileDescriptorSet` message,
+or the special value `local://`, in which case only descriptors linked
+into the code will be searched; the filename can be on any filesystem
+accessible to TensorFlow.
+
+You can build a `descriptor_source` file using the `--descriptor_set_out`
+and `--include_imports` options to the protocol compiler `protoc`.
+
+The `local://` database only covers descriptors linked into the
+code via C++ libraries, not Python imports. You can link in a proto descriptor
+by creating a cc_library target with alwayslink=1.
+
+Both binary and text proto serializations are supported, and can be
+chosen using the `format` attribute.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_DeepCopy.pbtxt b/tensorflow/core/api_def/base_api/api_def_DeepCopy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fe0fc3823ff724641298c03f74c115dd6211f385
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_DeepCopy.pbtxt
@@ -0,0 +1,15 @@
+op {
+  graph_op_name: "DeepCopy"
+  in_arg {
+    name: "x"
+    description: "The source tensor of type `T`."
+  }
+  out_arg {
+    name: "y"
+    description: <<END
+    y: A `Tensor` of type `T`. A copy of `x`. Guaranteed that `y`
+      is not an alias of `x`.
+END
+  }
+  summary: "Makes a copy of `x`."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Empty.pbtxt b/tensorflow/core/api_def/base_api/api_def_Empty.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..746f561e9251c34c327123efd349bfb57682d7aa
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Empty.pbtxt
@@ -0,0 +1,23 @@
+op {
+  graph_op_name: "Empty"
+  in_arg {
+    name: "shape"
+    description: "1-D. Represents the shape of the output tensor."
+  }
+  attr {
+    name: "init"
+    description:
+        "If True, initialize the returned tensor with the default value "
+        "of dtype.  Otherwise, the implementation is free not to initialize"
+        "the tensor's content."
+  }
+  out_arg {
+    name: "output"
+    description: "A `Tensor` of type `T`."
+  }
+  summary: <<END
+Creates a tensor with the given shape.
+
+This operation creates a tensor of `shape` and `dtype`.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_EncodeProto.pbtxt b/tensorflow/core/api_def/base_api/api_def_EncodeProto.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fdbe47f23615a1bbac30346d46241c4d321bc649
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_EncodeProto.pbtxt
@@ -0,0 +1,81 @@
+op {
+  graph_op_name: "EncodeProto"
+  in_arg {
+    name: "sizes"
+    description: <<END
+Tensor of int32 with shape `[batch_shape, len(field_names)]`.
+END
+  }
+  in_arg {
+    name: "values"
+    description: <<END
+List of tensors containing values for the corresponding field.
+END
+  }
+  out_arg {
+    name: "bytes"
+    description: <<END
+Tensor of serialized protos with shape `batch_shape`.
+END
+  }
+  attr {
+    name: "message_type"
+    description: <<END
+Name of the proto message type to decode.
+END
+  }
+  attr {
+    name: "field_names"
+    description: <<END
+List of strings containing proto field names.
+END
+  }
+  attr {
+    name: "Tinput_types"
+    description: <<END
+The input types.
+END
+  }
+  summary: <<END
+The op serializes protobuf messages provided in the input tensors.
+END
+  description: <<END
+The types of the tensors in `values` must match the schema for the
+fields specified in `field_names`. All the tensors in `values` must
+have a common shape prefix, *batch_shape*.
+
+The `sizes` tensor specifies repeat counts for each field.  The repeat
+count (last dimension) of a each tensor in `values` must be greater
+than or equal to corresponding repeat count in `sizes`.
+
+A `message_type` name must be provided to give context for the field
+names. The actual message descriptor can be looked up either in the
+linked-in descriptor pool or a filename provided by the caller using
+the `descriptor_source` attribute.
+
+The `descriptor_source` attribute selects a source of protocol
+descriptors to consult when looking up `message_type`. This may be a
+filename containing a serialized `FileDescriptorSet` message,
+or the special value `local://`, in which case only descriptors linked
+into the code will be searched; the filename can be on any filesystem
+accessible to TensorFlow.
+
+You can build a `descriptor_source` file using the `--descriptor_set_out`
+and `--include_imports` options to the protocol compiler `protoc`.
+
+The `local://` database only covers descriptors linked into the
+code via C++ libraries, not Python imports. You can link in a proto descriptor
+by creating a cc_library target with alwayslink=1.
+
+There are a few special cases in the value mapping:
+
+Submessage and group fields must be pre-serialized as TensorFlow strings.
+
+TensorFlow lacks support for unsigned int64s, so they must be
+represented as `tf.int64` with the same twos-complement bit pattern
+(the obvious way).
+
+Unsigned int32 values can be represented exactly with `tf.int64`, or
+with sign wrapping if the input is of type `tf.int32`.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_For.pbtxt b/tensorflow/core/api_def/base_api/api_def_For.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a7cd8e1a26e2c75c5d6aaea65699c0545c1be445
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_For.pbtxt
@@ -0,0 +1,29 @@
+op {
+  graph_op_name: "For"
+  in_arg { name: "start" description: "The lower bound. An int32" }
+  in_arg { name: "limit" description: "The upper bound. An int32" }
+  in_arg { name: "delta" description: "The increment. An int32" }
+  in_arg {
+    name: "input"
+    description: "A list of input tensors whose types are T."
+  }
+  out_arg {
+    name: "output"
+    description: "A list of output tensors whose types are T."
+  }
+  attr { name: "T"  description: "A list of dtypes." }
+  attr {
+    name: "body"
+    description: <<END
+    A function that takes a list of tensors (int32, T) and returns another
+    list of tensors (T).
+END
+  }
+  summary: <<END
+  ```python
+   output = input;
+   for i in range(start, limit, delta)
+     output = body(i, output);
+  ```
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_If.pbtxt b/tensorflow/core/api_def/base_api/api_def_If.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7ba5a3f37e1cd55e9f13b3c8f8f11f1f20346de7
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_If.pbtxt
@@ -0,0 +1,40 @@
+op {
+  graph_op_name: "If"
+  in_arg { name: "cond"  description: "The predicate." }
+  in_arg {
+    name: "cond"
+    description: <<END
+      A Tensor. If the tensor is a scalar of non-boolean type, the
+      scalar is converted to a boolean according to the
+      following rule: if the scalar is a numerical value, non-zero means
+      `True` and zero means False; if the scalar is a string, non-empty
+      means `True` and empty means `False`. If the tensor is not a scalar,
+      being empty means False and being non-empty means True.
+END
+  }
+  in_arg {
+    name: "input"
+    description: "A list of input tensors."
+  }
+  out_arg {
+    name: "output"
+    description: "A list of return values."
+  }
+  attr { name: "Tin"  description: "A list of input types." }
+  attr { name: "Tout"  description: "A list of output types." }
+  attr {
+    name: "then_branch"
+    description: <<END
+      A function that takes 'inputs' and returns a list of tensors, whose
+      types are the same as what else_branch returns.
+END
+  }
+  attr {
+    name: "else_branch"
+    description: <<END
+    A function that takes 'inputs' and returns a list of tensors, whose
+    types are the same as what then_branch returns.
+END
+  }
+  summary: "output = cond ? then_branch(input) : else_branch(input)"
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_InplaceAdd.pbtxt b/tensorflow/core/api_def/base_api/api_def_InplaceAdd.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3654286cc350995f8bed497cd662fce3b4150872
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_InplaceAdd.pbtxt
@@ -0,0 +1,28 @@
+op {
+  graph_op_name: "InplaceAdd"
+  in_arg {
+    name: "x"
+    description: "A `Tensor` of type T."
+  }
+  in_arg {
+    name: "i"
+    description: "A vector. Indices into the left-most dimension of `x`."
+  }
+  in_arg {
+    name: "v"
+    description:
+        "A `Tensor` of type T. Same dimension sizes as x except "
+        "the first dimension, which must be the same as i's size."
+  }
+  out_arg {
+    name: "y"
+    description:
+        "A `Tensor` of type T. An alias of `x`. The content "
+        "of `y` is undefined if there are duplicates in `i`."
+  }
+  summary: <<END
+    Adds v into specified rows of x.
+
+    Computes y = x; y[i, :] += v; return y.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_InplaceSub.pbtxt b/tensorflow/core/api_def/base_api/api_def_InplaceSub.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a9480b4a3837893395168785c5b5b9ba74b643d8
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_InplaceSub.pbtxt
@@ -0,0 +1,28 @@
+op {
+  graph_op_name: "InplaceSub"
+  in_arg {
+    name: "x"
+    description: "A `Tensor` of type T."
+  }
+  in_arg {
+    name: "i"
+    description: "A vector. Indices into the left-most dimension of `x`."
+  }
+  in_arg {
+    name: "v"
+    description:
+        "A `Tensor` of type T. Same dimension sizes as x except "
+        "the first dimension, which must be the same as i's size."
+  }
+  out_arg {
+    name: "y"
+    description:
+        "A `Tensor` of type T. An alias of `x`. The content "
+        "of `y` is undefined if there are duplicates in `i`."
+  }
+  summary: <<END
+    Subtracts `v` into specified rows of `x`.
+
+    Computes y = x; y[i, :] -= v; return y.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_InplaceUpdate.pbtxt b/tensorflow/core/api_def/base_api/api_def_InplaceUpdate.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2fcd3659dc771077d34d2ba833a40e7d6be68f53
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_InplaceUpdate.pbtxt
@@ -0,0 +1,28 @@
+op {
+  graph_op_name: "InplaceUpdate"
+  in_arg {
+    name: "x"
+    description: "A tensor of type `T`."
+  }
+  in_arg {
+    name: "i"
+    description: "A vector. Indices into the left-most dimension of `x`."
+  }
+  in_arg {
+    name: "v"
+    description:
+        "A `Tensor` of type T. Same dimension sizes as x except "
+        "the first dimension, which must be the same as i's size."
+  }
+  out_arg {
+    name: "y"
+    description:
+        "A `Tensor` of type T. An alias of `x`. The content "
+        "of `y` is undefined if there are duplicates in `i`."
+  }
+  summary: <<END
+    Updates specified rows with values in `v`.
+
+    Computes `x[i, :] = v; return x`.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_IsBoostedTreesEnsembleInitialized.pbtxt b/tensorflow/core/api_def/base_api/api_def_IsBoostedTreesEnsembleInitialized.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..d54b7ef32a3237607c6d31934aa43f11859a248b
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_IsBoostedTreesEnsembleInitialized.pbtxt
@@ -0,0 +1,17 @@
+op {
+  graph_op_name: "IsBoostedTreesEnsembleInitialized"
+  visibility: HIDDEN
+  in_arg {
+    name: "tree_ensemble_handle"
+    description: <<END
+Handle to the tree ensemble resouce.
+END
+  }
+  out_arg {
+    name: "is_initialized"
+    description: <<END
+output boolean on whether it is initialized or not.
+END
+  }
+  summary: "Checks whether a tree ensemble has been initialized."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_Rpc.pbtxt b/tensorflow/core/api_def/base_api/api_def_Rpc.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..344ef191fd580657acd5ebf75c3b5969f1af1fd2
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Rpc.pbtxt
@@ -0,0 +1,108 @@
+op {
+  graph_op_name: "Rpc"
+  in_arg {
+    name: "address"
+    description: <<END
+`0-D` or `1-D`.  The address (i.e. host_name:port) of the RPC server.
+If this tensor has more than 1 element, then multiple parallel rpc requests
+are sent.  This argument broadcasts with `method` and `request`.
+END
+  }
+  in_arg {
+    name: "method"
+    description: <<END
+`0-D` or `1-D`.  The method address on the RPC server.
+If this tensor has more than 1 element, then multiple parallel rpc requests
+are sent.  This argument broadcasts with `address` and `request`.
+END
+  }
+  in_arg {
+    name: "request"
+    description: <<END
+`0-D` or `1-D`.  Serialized proto strings: the rpc request argument.
+If this tensor has more than 1 element, then multiple parallel rpc requests
+are sent.  This argument broadcasts with `address` and `method`.
+END
+  }
+  out_arg {
+    name: "response"
+    description: <<END
+Same shape as `request`. Serialized proto strings: the rpc responses.
+END
+  }
+  attr {
+    name: "protocol"
+    description: <<END
+RPC protocol to use.  Empty string means use the default protocol.
+Options include 'grpc'.
+END
+  }
+  attr {
+    name: "fail_fast"
+    description: <<END
+`boolean`. If `true` (default), then failures to connect
+(i.e., the server does not immediately respond) cause an RPC failure.
+END
+  }
+  attr {
+    name: "timeout_in_ms"
+    description: <<END
+`int`. If `0` (default), then the kernel will run the RPC
+request and only time out if the RPC deadline passes or the session times out.
+If this value is greater than `0`, then the op will raise an exception if
+the RPC takes longer than `timeout_in_ms`.
+END
+  }
+  summary: <<END
+Perform batches of RPC requests.
+END
+  description: <<END
+This op asynchronously performs either a single RPC request, or a batch
+of requests.  RPC requests are defined by three main parameters:
+
+  - `address` (the host+port or BNS address of the request)
+  - `method` (the RPC method name for the request)
+  - `request` (the serialized proto string, or vector of strings,
+     of the RPC request argument).
+
+For example, if you have an RPC service running on port localhost:2345,
+and its interface is configured with the following proto declaration:
+
+```
+service MyService {
+  rpc MyMethod(MyRequestProto) returns (MyResponseProto) {
+  }
+};
+```
+
+then call this op with arguments:
+
+```
+address = "localhost:2345"
+method = "MyService/MyMethod"
+```
+
+The `request` tensor is a string tensor representing serialized `MyRequestProto`
+strings; and the output string tensor `response` will have the same shape
+and contain (upon successful completion) corresponding serialized
+`MyResponseProto` strings.
+
+For example, to send a single, empty, `MyRequestProto`, call
+this op with `request = ""`.  To send 5 **parallel** empty requests,
+call this op with `request = ["", "", "", "", ""]`.
+
+More generally, one can create a batch of `MyRequestProto` serialized protos
+from regular batched tensors using the `encode_proto` op, and convert
+the response `MyResponseProto` serialized protos to batched tensors
+using the `decode_proto` op.
+
+**NOTE** Working with serialized proto strings is faster than instantiating
+actual proto objects in memory, so no performance degradation is expected
+compared to writing custom kernels for this workflow.
+
+If the connection fails or the remote worker returns an error
+status, the op reraises this exception locally.
+
+See the `TryRpc` op if you prefer to handle RPC failures manually in the graph.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TryRpc.pbtxt b/tensorflow/core/api_def/base_api/api_def_TryRpc.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..bded00e83c7de09da8eb06d353925a83bb4e7134
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TryRpc.pbtxt
@@ -0,0 +1,123 @@
+op {
+  graph_op_name: "TryRpc"
+  in_arg {
+    name: "address"
+    description: <<END
+`0-D` or `1-D`.  The address (i.e. host_name:port) of the RPC server.
+If this tensor has more than 1 element, then multiple parallel rpc requests
+are sent.  This argument broadcasts with `method` and `request`.
+END
+  }
+  in_arg {
+    name: "method"
+    description: <<END
+`0-D` or `1-D`.  The method address on the RPC server.
+If this tensor has more than 1 element, then multiple parallel rpc requests
+are sent.  This argument broadcasts with `address` and `request`.
+END
+  }
+  in_arg {
+    name: "request"
+    description: <<END
+`0-D` or `1-D`.  Serialized proto strings: the rpc request argument.
+If this tensor has more than 1 element, then multiple parallel rpc requests
+are sent.  This argument broadcasts with `address` and `method`.
+END
+  }
+  out_arg {
+    name: "response"
+    description: <<END
+Same shape as `request`. Serialized proto strings: the rpc responses.
+END
+  }
+  out_arg {
+    name: "status_code"
+    description: <<END
+Same shape as `request`.  Values correspond to tensorflow Status enum codes.
+END
+  }
+  out_arg {
+    name: "status_message"
+    description: <<END
+Same shape as `request`.  Values correspond to Status messages
+returned from the RPC calls.
+END
+  }
+  attr {
+    name: "protocol"
+    description: <<END
+RPC protocol to use.  Empty string means use the default protocol.
+Options include 'grpc'.
+END
+  }
+  attr {
+    name: "fail_fast"
+    description: <<END
+`boolean`. If `true` (default), then failures to connect
+(i.e., the server does not immediately respond) cause an RPC failure.
+END
+  }
+  attr {
+    name: "timeout_in_ms"
+    description: <<END
+`int`. If `0` (default), then the kernel will run the RPC
+request and only time out if the RPC deadline passes or the session times out.
+If this value is greater than `0`, then the op will raise an exception if
+the RPC takes longer than `timeout_in_ms`.
+END
+  }
+  summary: <<END
+Perform batches of RPC requests.
+END
+  description: <<END
+This op asynchronously performs either a single RPC request, or a batch
+of requests.  RPC requests are defined by three main parameters:
+
+  - `address` (the host+port or BNS address of the request)
+  - `method` (the method name for the request)
+  - `request` (the serialized proto string, or vector of strings,
+     of the RPC request argument).
+
+For example, if you have an RPC service running on port localhost:2345,
+and its interface is configured with the following proto declaration:
+
+```
+service MyService {
+  rpc MyMethod(MyRequestProto) returns (MyResponseProto) {
+  }
+};
+```
+
+then call this op with arguments:
+
+```
+address = "localhost:2345"
+method = "MyService/MyMethod"
+```
+
+The `request` tensor is a string tensor representing serialized `MyRequestProto`
+strings; and the output string tensor `response` will have the same shape
+and contain (upon successful completion) corresponding serialized
+`MyResponseProto` strings.
+
+For example, to send a single, empty, `MyRequestProto`, call
+this op with `request = ""`.  To send 5 **parallel** empty requests,
+call this op with `request = ["", "", "", "", ""]`.
+
+More generally, one can create a batch of `MyRequestProto` serialized protos
+from regular batched tensors using the `encode_proto` op, and convert
+the response `MyResponseProto` serialized protos to batched tensors
+using the `decode_proto` op.
+
+**NOTE** Working with serialized proto strings is faster than instantiating
+actual proto objects in memory, so no performance degradation is expected
+compared to writing custom kernels for this workflow.
+
+Unlike the standard `Rpc` op, if the connection fails or the remote worker
+returns an error status, this op does **not** reraise the exception.
+Instead, the `status_code` and `status_message` entry for the corresponding RPC
+call is set with the error returned from the RPC call.  The `response` tensor
+will contain valid response values for those minibatch entries whose RPCs did
+not fail; the rest of the entries will have empty strings.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_While.pbtxt b/tensorflow/core/api_def/base_api/api_def_While.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..95a19c6dff99a51aa3228923b6408b3d7a995835
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_While.pbtxt
@@ -0,0 +1,33 @@
+op {
+  graph_op_name: "While"
+  in_arg {
+    name: "input"
+    description: "A list of input tensors whose types are T."
+  }
+  out_arg {
+    name: "output"
+    description: "A list of output tensors whose types are T."
+  }
+  attr { name: "T"  description: "dtype in use." }
+  attr {
+    name: "cond"
+    description: <<END
+      A function takes 'input' and returns a tensor.  If the tensor is
+      a scalar of non-boolean, the scalar is converted to a boolean
+      according to the following rule: if the scalar is a numerical
+      value, non-zero means True and zero means False; if the scalar is
+      a string, non-empty means True and empty means False. If the
+      tensor is not a scalar, non-emptiness means True and False
+      otherwise.
+END
+  }
+  attr {
+    name: "body"
+    description: <<END
+      A function that takes a list of tensors and returns another
+      list of tensors. Both lists have the same types as specified
+      by T.
+END
+  }
+  summary: "output = input; While (Cond(output)) { output = Body(output) }"
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ClipByValue.pbtxt b/tensorflow/core/api_def/python_api/api_def_ClipByValue.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..cacdd5c2ca0838701aff1c085f06d81319612832
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ClipByValue.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ClipByValue"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_DeepCopy.pbtxt b/tensorflow/core/api_def/python_api/api_def_DeepCopy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2d5ed2b432d8ac5e60414409311308dcce7a486d
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_DeepCopy.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "DeepCopy"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_Empty.pbtxt b/tensorflow/core/api_def/python_api/api_def_Empty.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0b863520e987b69df680c84efcbdfca44518c6e8
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Empty.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "Empty"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_For.pbtxt b/tensorflow/core/api_def/python_api/api_def_For.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a58ddf56fe1a8516c57ca203f14ea76414ab55f5
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_For.pbtxt
@@ -0,0 +1 @@
+op { graph_op_name: "For" visibility: HIDDEN }
diff --git a/tensorflow/core/api_def/python_api/api_def_If.pbtxt b/tensorflow/core/api_def/python_api/api_def_If.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a44db5da081692ee26a7931850236d31d2231627
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_If.pbtxt
@@ -0,0 +1 @@
+op { graph_op_name: "If" visibility: HIDDEN }
diff --git a/tensorflow/core/api_def/python_api/api_def_InplaceAdd.pbtxt b/tensorflow/core/api_def/python_api/api_def_InplaceAdd.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..390e3bbf97340472608414af23ad5e6d8ee300ae
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_InplaceAdd.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "InplaceAdd"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_InplaceSub.pbtxt b/tensorflow/core/api_def/python_api/api_def_InplaceSub.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..af9634f9b2b0cfe4e050005b8b05ca127d0523d9
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_InplaceSub.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "InplaceSub"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_InplaceUpdate.pbtxt b/tensorflow/core/api_def/python_api/api_def_InplaceUpdate.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5fa9d778ea6c937e7b8502b6db32d15bfa2ca90d
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_InplaceUpdate.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "InplaceUpdate"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ScatterAdd.pbtxt b/tensorflow/core/api_def/python_api/api_def_ScatterAdd.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4f5b6decf6d1cfc6b3fbd8492824ab95958b060b
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ScatterAdd.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ScatterAdd"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_SlideDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_SlideDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..867116c5da718f66205132d70a93c39464096df6
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_SlideDataset.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "SlideDataset"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_While.pbtxt b/tensorflow/core/api_def/python_api/api_def_While.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f47a9b0fceb77af3dc9fcad3569f8e18b7f44188
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_While.pbtxt
@@ -0,0 +1 @@
+op { graph_op_name: "While" visibility: HIDDEN }
diff --git a/tensorflow/core/common_runtime/bfc_allocator.h b/tensorflow/core/common_runtime/bfc_allocator.h
index e34945dd48a1e54e4ae82dd7ea9959f39a97f2c2..b8e773503c7a2f8024e8a6f58247ad343a762f71 100644
--- a/tensorflow/core/common_runtime/bfc_allocator.h
+++ b/tensorflow/core/common_runtime/bfc_allocator.h
@@ -23,7 +23,7 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/core/common_runtime/allocator_retry.h"
-#include "tensorflow/core/framework/visitable_allocator.h"
+#include "tensorflow/core/common_runtime/visitable_allocator.h"
 #include "tensorflow/core/lib/gtl/stl_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/macros.h"
diff --git a/tensorflow/core/common_runtime/buf_rendezvous.cc b/tensorflow/core/common_runtime/buf_rendezvous.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b57eb2943a7fac79804587043e1958c279923daf
--- /dev/null
+++ b/tensorflow/core/common_runtime/buf_rendezvous.cc
@@ -0,0 +1,166 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/common_runtime/buf_rendezvous.h"
+
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/common_runtime/process_util.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/notification.h"
+
+namespace tensorflow {
+
+BufRendezvous::~BufRendezvous() {
+  mutex_lock l(mu_);
+  if (!hook_table_.empty()) {
+    PurgeTable(errors::Internal("Delete called on non-empty BufRendezvous"),
+               &hook_table_);
+  }
+}
+
+void BufRendezvous::StartAbort(const Status& s) {
+  CHECK(!s.ok());
+  HookTable dummy_table;
+  {
+    mutex_lock l(mu_);
+    status_.Update(s);
+    hook_table_.swap(dummy_table);
+  }
+  PurgeTable(s, &dummy_table);
+}
+
+void BufRendezvous::PurgeTable(const Status& s, HookTable* table) {
+  for (auto& it : *table) {
+    Hook* h = it.second;
+    if (h->cons_cb != nullptr) {
+      h->cons_cb(s, nullptr);
+    }
+    if (h->prod_cb != nullptr) {
+      h->prod_cb(s);
+    }
+    delete h;
+  }
+  table->clear();
+}
+
+string BufRendezvous::Hook::DebugString() const {
+  return strings::StrCat("[dev:", (prod_dev ? prod_dev->name() : "none"),
+                         ", ctx:", reinterpret_cast<uint64>(prod_ctx),
+                         ", val:", reinterpret_cast<uint64>(prod_value),
+                         ", pcb:", reinterpret_cast<uint64>(&prod_cb),
+                         ", ccb:", reinterpret_cast<uint64>(&cons_cb), "]");
+}
+
+void BufRendezvous::ProvideBuf(const string& key, Device* dev,
+                               DeviceContext* dev_ctx, const Tensor* v,
+                               const AllocatorAttributes& attr,
+                               const ProducerCallback& done) {
+  Hook* h = nullptr;
+  Status providebuf_status;
+  do {
+    mutex_lock l(mu_);
+    if (!status_.ok()) {
+      providebuf_status = status_;
+      break;
+    } else {
+      auto it = hook_table_.find(key);
+      if (it == hook_table_.end()) {
+        h = new Hook;
+        it = hook_table_.insert(std::make_pair(key, h)).first;
+      } else {
+        if (it->second->prod_cb != nullptr) {
+          providebuf_status = errors::Internal(
+              "BufRendezvous::ProvideBuf already called for key ", key);
+          break;
+        }
+        h = it->second;
+      }
+      // Populate Hook with all of the prod values.
+      h->prod_dev = dev;
+      h->prod_ctx = dev_ctx;
+      h->prod_value = v;
+      h->prod_attr = attr;
+      h->prod_cb = done;
+      // If consumer is waiting, kick off right away, removing Hook from table.
+      if (h->cons_cb != nullptr) {
+        hook_table_.erase(it);
+      } else {
+        h = nullptr;
+      }
+    }
+  } while (false);
+  if (h) {
+    h->cons_cb(Status::OK(), h);
+  }
+  if (!providebuf_status.ok()) {
+    done(providebuf_status);
+  }
+}
+
+void BufRendezvous::ConsumeBuf(const string& key,
+                               const ConsumerCallback& done) {
+  Hook* existing_hook = nullptr;
+  Status consumebuf_status;
+  do {
+    mutex_lock l(mu_);
+    if (!status_.ok()) {
+      consumebuf_status = status_;
+      break;
+    }
+    auto it = hook_table_.find(key);
+    if (it != hook_table_.end()) {
+      // Prepare to consume immediately.
+      if (it->second->cons_cb) {
+        consumebuf_status =
+            errors::Internal("Second consumer arrived for key ", key);
+        break;
+      }
+      existing_hook = it->second;
+      hook_table_.erase(it);
+      existing_hook->cons_cb = done;
+    } else {
+      // Hang consumer callback on the Hook.
+      Hook* h = new Hook;
+      hook_table_[key] = h;
+      h->cons_cb = done;
+      return;
+    }
+  } while (false);
+  if (existing_hook) {
+    existing_hook->cons_cb(Status::OK(), existing_hook);
+    return;
+  }
+  if (!consumebuf_status.ok()) {
+    done(consumebuf_status, nullptr);
+    return;
+  }
+}
+
+/*static*/
+void BufRendezvous::DoneWithHook(Hook* h) {
+  h->prod_cb(Status::OK());
+  delete h;
+}
+
+void BufRendezvous::LogContents() {
+  mutex_lock l(mu_);
+  LOG(INFO) << strings::StrCat("BufRendezvous ",
+                               strings::Hex(reinterpret_cast<uint64>(this)),
+                               " step_id=", step_id_, " current contents:");
+  for (auto it : hook_table_) {
+    LOG(INFO) << it.first << ":" << it.second->DebugString();
+  }
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/buf_rendezvous.h b/tensorflow/core/common_runtime/buf_rendezvous.h
new file mode 100644
index 0000000000000000000000000000000000000000..e94e88b323ec74a36948ffff4e5718f211efbbb6
--- /dev/null
+++ b/tensorflow/core/common_runtime/buf_rendezvous.h
@@ -0,0 +1,103 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_COMMON_RUNTIME_BUF_RENDEZVOUS_H_
+#define TENSORFLOW_COMMON_RUNTIME_BUF_RENDEZVOUS_H_
+
+#include <functional>
+#include <string>
+
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/gtl/flatmap.h"
+#include "tensorflow/core/platform/mutex.h"
+
+namespace tensorflow {
+class Device;
+class DeviceContext;
+class Tensor;
+
+// EXPERIMENTAL: RDMA oriented producer/consumer rendezvous on a local
+// Tensor value for which DMAHelper::CanUseDMA() is true, i.e. dense
+// numeric types.  Similar to Rendezvous but never owns a Ref on the
+// tensor, instead it uses an explicit callback to the producer when
+// the consumer side is finished with the value.  This allows the
+// producer to perform in-place updates on the source buffer or to take
+// other actions that depend on knowing the consumer has passed a certain
+// execution point.
+class BufRendezvous {
+ public:
+  explicit BufRendezvous(uint64 step_id) : step_id_(step_id) {}
+
+  ~BufRendezvous();
+
+  // Inform all all waiting parties that this BufRendezvous is defunct
+  // because of an error Status interrupting the Step.
+  void StartAbort(const Status& s);
+
+  struct Hook;
+  // Provided by the consumer to be called when access to the buffer
+  // is available.  If the Status arg is not OK, then hook will not
+  // be populated.  Ownership of Hook passes to consumer with the
+  // callback.
+  typedef std::function<void(const Status&, Hook*)> ConsumerCallback;
+  // Provided by the producer to be called when the consumer has finished
+  // reading the buffer and will no longer access it.
+  typedef std::function<void(const Status&)> ProducerCallback;
+
+  struct Hook {
+    Device* prod_dev;
+    DeviceContext* prod_ctx;
+    const Tensor* prod_value;
+    AllocatorAttributes prod_attr;
+    ProducerCallback prod_cb;
+    ConsumerCallback cons_cb;
+    Hook()
+        : prod_dev(nullptr),
+          prod_ctx(nullptr),
+          prod_value(nullptr),
+          prod_cb(nullptr),
+          cons_cb(nullptr) {}
+    string DebugString() const;
+  };
+
+  // Called to advertise availability of a Tensor value corresponding
+  // to key.  That value must stay valid until done is called.
+  void ProvideBuf(const string& key, Device* dev, DeviceContext* dev_ctx,
+                  const Tensor* v, const AllocatorAttributes& attr,
+                  const ProducerCallback& done);
+
+  // Called to request access to a Tensor value corresponding to key.
+  // Consumer is provide with a Hook as soon as availble.
+  void ConsumeBuf(const string& key, const ConsumerCallback& done);
+
+  // Consumer must call this function when it's done reading the Hook provided
+  // by the ConsumerCallback.  This function will invoke the producer callback
+  // and then delete h.
+  static void DoneWithHook(Hook* h);
+
+  // Write the current contents of the table to the INFO log.
+  void LogContents();
+
+ protected:
+  const uint64 step_id_;
+  mutex mu_;
+  Status status_ GUARDED_BY(mu_);
+  typedef gtl::FlatMap<string, Hook*> HookTable;
+  HookTable hook_table_ GUARDED_BY(mu_);
+
+  void PurgeTable(const Status& s, HookTable* table);
+};
+}  // namespace tensorflow
+#endif  // TENSORFLOW_COMMON_RUNTIME_BUF_RENDEZVOUS_H_
diff --git a/tensorflow/core/common_runtime/buf_rendezvous_test.cc b/tensorflow/core/common_runtime/buf_rendezvous_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0e798235bf0649428409a2fa72ac3067736c347a
--- /dev/null
+++ b/tensorflow/core/common_runtime/buf_rendezvous_test.cc
@@ -0,0 +1,197 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/common_runtime/buf_rendezvous.h"
+
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/lib/core/notification.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace {
+
+#define NUM_DEVS 3
+
+class BufRendezvousTest : public ::testing::Test {
+ protected:
+  BufRendezvousTest() {
+    br_.reset(new BufRendezvous(123));
+    fake_dev_ptr_ = reinterpret_cast<Device*>(512LLU);
+    fake_dev_ctx_ = reinterpret_cast<DeviceContext*>(1024LLU);
+    a_ = Tensor(DT_FLOAT, TensorShape({24}));
+    b_ = Tensor(DT_FLOAT, TensorShape({24}));
+  }
+
+  Device* fake_dev_ptr_ = nullptr;
+  DeviceContext* fake_dev_ctx_ = nullptr;
+  Tensor a_;
+  Tensor b_;
+  AllocatorAttributes aa_;
+  std::unique_ptr<BufRendezvous> br_;
+};
+
+TEST_F(BufRendezvousTest, CorrectUseProducerFirst) {
+  Status prod_status;
+  Status cons_status;
+  bool prod_callback_called = false;
+  bool cons_callback_called = false;
+  Notification note;
+  br_->ProvideBuf(
+      "key0", fake_dev_ptr_, fake_dev_ctx_, &a_, aa_,
+      [&note, &prod_status, &prod_callback_called](const Status& s) {
+        prod_status = s;
+        prod_callback_called = true;
+        note.Notify();
+      });
+  EXPECT_FALSE(prod_callback_called);
+  br_->ConsumeBuf("key0", [this, &cons_status, &cons_callback_called](
+                              const Status& s, BufRendezvous::Hook* h) {
+    cons_status = s;
+    cons_callback_called = true;
+    ASSERT_TRUE(h != nullptr);
+    EXPECT_EQ(h->prod_dev, fake_dev_ptr_);
+    EXPECT_EQ(h->prod_ctx, fake_dev_ctx_);
+    EXPECT_EQ(h->prod_value, &a_);
+    br_->DoneWithHook(h);
+  });
+  EXPECT_TRUE(cons_callback_called);
+  note.WaitForNotification();
+  EXPECT_TRUE(prod_callback_called);
+  TF_EXPECT_OK(cons_status);
+  TF_EXPECT_OK(prod_status);
+}
+
+TEST_F(BufRendezvousTest, CorrectUseConsumerFirst) {
+  Status prod_status;
+  Status cons_status;
+  bool prod_callback_called = false;
+  bool cons_callback_called = false;
+  Notification note;
+  br_->ConsumeBuf("key0", [this, &cons_status, &cons_callback_called](
+                              const Status& s, BufRendezvous::Hook* h) {
+    cons_status = s;
+    cons_callback_called = true;
+    ASSERT_TRUE(h != nullptr);
+    EXPECT_EQ(h->prod_dev, fake_dev_ptr_);
+    EXPECT_EQ(h->prod_ctx, fake_dev_ctx_);
+    EXPECT_EQ(h->prod_value, &a_);
+    br_->DoneWithHook(h);
+  });
+  EXPECT_FALSE(cons_callback_called);
+  br_->ProvideBuf(
+      "key0", fake_dev_ptr_, fake_dev_ctx_, &a_, aa_,
+      [&note, &prod_status, &prod_callback_called](const Status& s) {
+        prod_status = s;
+        prod_callback_called = true;
+        note.Notify();
+      });
+  EXPECT_TRUE(cons_callback_called);
+  note.WaitForNotification();
+  EXPECT_TRUE(prod_callback_called);
+  TF_EXPECT_OK(cons_status);
+  TF_EXPECT_OK(prod_status);
+}
+
+TEST_F(BufRendezvousTest, ErrorDuplicatePut) {
+  bool prod_callback_called = false;
+  br_->ProvideBuf("key0", fake_dev_ptr_, fake_dev_ctx_, &a_, aa_,
+                  [this, &prod_callback_called](const Status& s) {
+                    prod_callback_called = true;
+                  });
+  Status bad_status;
+  Notification note;
+  br_->ProvideBuf("key0", fake_dev_ptr_, fake_dev_ctx_, &a_, aa_,
+                  [&bad_status, &note](const Status& s) {
+                    bad_status = s;
+                    note.Notify();
+                  });
+  note.WaitForNotification();
+  EXPECT_FALSE(bad_status.ok());
+  EXPECT_EQ("BufRendezvous::ProvideBuf already called for key key0",
+            bad_status.error_message());
+  EXPECT_FALSE(prod_callback_called);
+  br_.reset();
+}
+
+TEST_F(BufRendezvousTest, ErrorDeleteNonEmpty) {
+  Status cons_status;
+  br_->ConsumeBuf(
+      "key0", [this, &cons_status](const Status& s, BufRendezvous::Hook* h) {
+        cons_status = s;
+        EXPECT_EQ(h, nullptr);
+      });
+  EXPECT_TRUE(cons_status.ok());
+  br_.reset();
+  EXPECT_FALSE(cons_status.ok());
+  EXPECT_EQ("Delete called on non-empty BufRendezvous",
+            cons_status.error_message());
+}
+
+TEST_F(BufRendezvousTest, AbortNonEmpty) {
+  Status cons_status;
+  Status prod_status;
+  Notification prod_note;
+  Notification cons_note;
+  br_->ConsumeBuf("key0", [this, &cons_note, &cons_status](
+                              const Status& s, BufRendezvous::Hook* h) {
+    cons_status = s;
+    cons_note.Notify();
+  });
+  br_->ProvideBuf("key1", fake_dev_ptr_, fake_dev_ctx_, &a_, aa_,
+                  [this, &prod_note, &prod_status](const Status& s) {
+                    prod_status = s;
+                    prod_note.Notify();
+                  });
+  br_->StartAbort(errors::Internal("Falling sky detected"));
+  prod_note.WaitForNotification();
+  cons_note.WaitForNotification();
+  EXPECT_FALSE(prod_status.ok());
+  EXPECT_EQ(prod_status.error_message(), "Falling sky detected");
+  EXPECT_FALSE(cons_status.ok());
+  EXPECT_EQ(cons_status.error_message(), "Falling sky detected");
+}
+
+TEST_F(BufRendezvousTest, AbortEmpty) {
+  br_->StartAbort(errors::Internal("Falling sky detected"));
+}
+
+TEST_F(BufRendezvousTest, UseAfterAbort) {
+  br_->StartAbort(errors::Internal("Falling sky detected"));
+  Status cons_status;
+  Status prod_status;
+  Notification prod_note;
+  Notification cons_note;
+  br_->ConsumeBuf("key0", [this, &cons_note, &cons_status](
+                              const Status& s, BufRendezvous::Hook* h) {
+    cons_status = s;
+    cons_note.Notify();
+  });
+  br_->ProvideBuf("key1", fake_dev_ptr_, fake_dev_ctx_, &a_, aa_,
+                  [this, &prod_note, &prod_status](const Status& s) {
+                    prod_status = s;
+                    prod_note.Notify();
+                  });
+  prod_note.WaitForNotification();
+  cons_note.WaitForNotification();
+  EXPECT_FALSE(prod_status.ok());
+  EXPECT_EQ(prod_status.error_message(), "Falling sky detected");
+  EXPECT_FALSE(cons_status.ok());
+  EXPECT_EQ(cons_status.error_message(), "Falling sky detected");
+}
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/collective_executor_mgr.cc b/tensorflow/core/common_runtime/collective_executor_mgr.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a5c4946e58edf900ef3e42586f6b484d8f5e4891
--- /dev/null
+++ b/tensorflow/core/common_runtime/collective_executor_mgr.cc
@@ -0,0 +1,114 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/common_runtime/collective_executor_mgr.h"
+
+#include "tensorflow/core/common_runtime/build_graph_options.h"
+#include "tensorflow/core/common_runtime/collective_rma_local.h"
+#include "tensorflow/core/common_runtime/device_mgr.h"
+#include "tensorflow/core/framework/collective.h"
+#include "tensorflow/core/protobuf/config.pb.h"
+
+namespace tensorflow {
+namespace {
+// TODO(tucker): Temporary class just until a real CollectiveExecutor
+// implementation is submitted in a later CL.
+class DummyCollectiveExecutor : public CollectiveExecutor {
+ public:
+  explicit DummyCollectiveExecutor(CollectiveExecutorMgr* ce_mgr)
+      : CollectiveExecutor(ce_mgr) {}
+
+  ~DummyCollectiveExecutor() override {}
+
+  void RecvFromPeer(const string& peer_device, const string& peer_task,
+                    bool peer_is_local, const string& key, Device* to_device,
+                    DeviceContext* to_device_ctx,
+                    const AllocatorAttributes& to_alloc_attr, Tensor* to_tensor,
+                    const DeviceLocality& client_locality,
+                    const StatusCallback& done) override {
+    done(errors::Internal("Unimplemented"));
+  }
+
+  void PostToPeer(const string& peer_device, const string& peer_task,
+                  const string& key, Device* from_device,
+                  DeviceContext* from_device_ctx,
+                  const AllocatorAttributes& from_alloc_attr,
+                  const Tensor* from_tensor,
+                  const DeviceLocality& client_locality,
+                  const StatusCallback& done) override {
+    done(errors::Internal("Unimplemented"));
+  }
+
+ private:
+  TF_DISALLOW_COPY_AND_ASSIGN(DummyCollectiveExecutor);
+};
+}  // namespace
+
+CollectiveExecutorMgr::CollectiveExecutorMgr(
+    const ConfigProto& config, const DeviceMgr* dev_mgr,
+    DeviceResolverInterface* dev_resolver,
+    ParamResolverInterface* param_resolver)
+    : dev_mgr_(dev_mgr),
+      dev_resolver_(dev_resolver),
+      param_resolver_(param_resolver) {}
+
+CollectiveExecutorMgr::~CollectiveExecutorMgr() {
+  for (auto iter : executor_table_) {
+    iter.second->Unref();
+  }
+}
+
+CollectiveExecutor* CollectiveExecutorMgr::FindOrCreate(int64 step_id) {
+  CollectiveExecutor* ce = nullptr;
+  {
+    mutex_lock l(exec_mu_);
+    auto it = executor_table_.find(step_id);
+    if (it != executor_table_.end()) {
+      ce = it->second;
+    } else {
+      ce = new DummyCollectiveExecutor(this);
+      executor_table_[step_id] = ce;
+    }
+    ce->Ref();
+  }
+  return ce;
+}
+
+void CollectiveExecutorMgr::Cleanup(int64 step_id) {
+  CollectiveExecutor* ce = nullptr;
+  {
+    mutex_lock l(exec_mu_);
+    auto it = executor_table_.find(step_id);
+    if (it != executor_table_.end()) {
+      ce = it->second;
+      executor_table_.erase(it);
+    }
+  }
+  if (ce) ce->Unref();
+}
+
+void CollectiveExecutorMgr::GetStepSequenceAsync(
+    const GetStepSequenceRequest* request, GetStepSequenceResponse* response,
+    const StatusCallback& done) {
+  done(errors::Internal(
+      "CollectiveExecutorMgr does not implement GetStepSequence."));
+}
+
+void CollectiveExecutorMgr::RefreshStepIdSequenceAsync(
+    int64 graph_key, const StatusCallback& done) {
+  done(errors::Internal(
+      "CollectiveExecutorMgr does not implement RefreshStepIdSequence."));
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/collective_executor_mgr.h b/tensorflow/core/common_runtime/collective_executor_mgr.h
new file mode 100644
index 0000000000000000000000000000000000000000..4b42e2b4d16c5804e0660079c7a149442b47edb0
--- /dev/null
+++ b/tensorflow/core/common_runtime/collective_executor_mgr.h
@@ -0,0 +1,70 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_COMMON_RUNTIME_COLLECTIVE_EXECUTOR_MGR_H_
+#define TENSORFLOW_COMMON_RUNTIME_COLLECTIVE_EXECUTOR_MGR_H_
+
+#include "tensorflow/core/framework/collective.h"
+#include "tensorflow/core/lib/gtl/flatmap.h"
+
+namespace tensorflow {
+class ConfigProto;
+class DeviceMgr;
+
+class CollectiveExecutorMgr : public CollectiveExecutorMgrInterface {
+ public:
+  CollectiveExecutorMgr(const ConfigProto& config, const DeviceMgr* dev_mgr,
+                        DeviceResolverInterface* dev_resolver,
+                        ParamResolverInterface* param_resolver);
+
+  virtual ~CollectiveExecutorMgr();
+
+  CollectiveExecutor* FindOrCreate(int64 step_id) override;
+
+  void Cleanup(int64 step_id) override;
+
+  ParamResolverInterface* GetParamResolver() const override {
+    return param_resolver_.get();
+  }
+
+  DeviceResolverInterface* GetDeviceResolver() const override {
+    return dev_resolver_.get();
+  }
+
+  void GetStepSequenceAsync(const GetStepSequenceRequest* request,
+                            GetStepSequenceResponse* response,
+                            const StatusCallback& done) override;
+
+  void RefreshStepIdSequenceAsync(int64 graph_key,
+                                  const StatusCallback& done) override;
+
+  int64 NextStepId(int64 graph_key) override {
+    return CollectiveExecutor::kInvalidId;
+  }
+
+  void RetireStepId(int64 graph_key, int64 step_id) override {}
+
+ protected:
+  const DeviceMgr* dev_mgr_;
+  std::unique_ptr<DeviceResolverInterface> dev_resolver_;
+  std::unique_ptr<ParamResolverInterface> param_resolver_;
+  CollectiveRemoteAccess* remote_access_;
+  string task_name_;
+  mutex exec_mu_;
+  // Map from step_id to CollectiveExecutor
+  gtl::FlatMap<int64, CollectiveExecutor*> executor_table_ GUARDED_BY(exec_mu_);
+};
+
+}  // namespace tensorflow
+#endif  // TENSORFLOW_COMMON_RUNTIME_COLLECTIVE_EXECUTOR_MGR_H_
diff --git a/tensorflow/core/common_runtime/collective_executor_mgr_test.cc b/tensorflow/core/common_runtime/collective_executor_mgr_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..34c9163d6a40ba47323afc306cc2803b643e1d8b
--- /dev/null
+++ b/tensorflow/core/common_runtime/collective_executor_mgr_test.cc
@@ -0,0 +1,98 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/common_runtime/collective_executor_mgr.h"
+
+#include "tensorflow/core/common_runtime/collective_param_resolver_local.h"
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/common_runtime/device_factory.h"
+#include "tensorflow/core/common_runtime/device_mgr.h"
+#include "tensorflow/core/common_runtime/device_resolver_local.h"
+#include "tensorflow/core/lib/core/notification.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/public/session_options.h"
+
+namespace tensorflow {
+namespace {
+
+#define NUM_DEVS 3
+
+class CollectiveExecutorMgrTest : public ::testing::Test {
+ protected:
+  CollectiveExecutorMgrTest() {
+    ConfigProto cp;
+    SessionOptions options;
+    auto* device_count = options.config.mutable_device_count();
+    string task_name = "/job:localhost/replica:0/task:0";
+    device_count->insert({"CPU", NUM_DEVS});
+    TF_CHECK_OK(DeviceFactory::AddDevices(options, task_name, &devices_));
+    device_mgr_.reset(new DeviceMgr(devices_));
+    DeviceResolverLocal* drl = new DeviceResolverLocal(device_mgr_.get());
+    cme_.reset(new CollectiveExecutorMgr(
+        cp, device_mgr_.get(), drl,
+        new CollectiveParamResolverLocal(device_mgr_.get(), drl, task_name)));
+  }
+
+  std::unique_ptr<CollectiveExecutorMgr> cme_;
+  std::vector<Device*> devices_;
+  std::unique_ptr<DeviceMgr> device_mgr_;
+};
+
+TEST_F(CollectiveExecutorMgrTest, FindOrCreate) {
+  CollectiveExecutor::Handle* h =
+      new CollectiveExecutor::Handle(cme_->FindOrCreate(1), true);
+  EXPECT_TRUE(h->get());
+  CollectiveExecutor::Handle* h2 =
+      new CollectiveExecutor::Handle(cme_->FindOrCreate(1), true);
+  EXPECT_EQ(h->get(), h2->get());
+  CollectiveExecutor* ce = h->get();
+  delete h;
+  delete h2;
+  CollectiveExecutor::Handle h3(cme_->FindOrCreate(1), true);
+  EXPECT_EQ(ce, h3.get());
+  cme_->Cleanup(1);
+}
+
+TEST_F(CollectiveExecutorMgrTest, StepSequenceRelated) {
+  EXPECT_EQ(CollectiveExecutor::kInvalidId, cme_->NextStepId(123));
+  Notification ss_note;
+  Status ss_status;
+  cme_->RefreshStepIdSequenceAsync(
+      123, [this, &ss_status, &ss_note](const Status& s) {
+        ss_status = s;
+        ss_note.Notify();
+      });
+  ss_note.WaitForNotification();
+  EXPECT_FALSE(ss_status.ok());
+  EXPECT_EQ(ss_status.error_message(),
+            "CollectiveExecutorMgr does not implement RefreshStepIdSequence.");
+  Notification gs_note;
+  Status gs_status;
+  GetStepSequenceRequest* req = nullptr;
+  GetStepSequenceResponse* resp = nullptr;
+  cme_->GetStepSequenceAsync(req, resp,
+                             [this, &gs_status, &gs_note](const Status& s) {
+                               gs_status = s;
+                               gs_note.Notify();
+                             });
+  gs_note.WaitForNotification();
+  EXPECT_FALSE(gs_status.ok());
+  EXPECT_EQ(gs_status.error_message(),
+            "CollectiveExecutorMgr does not implement GetStepSequence.");
+}
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/collective_param_resolver_local.cc b/tensorflow/core/common_runtime/collective_param_resolver_local.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b34950b2f471d756f3d553b296e437c796320fcb
--- /dev/null
+++ b/tensorflow/core/common_runtime/collective_param_resolver_local.cc
@@ -0,0 +1,666 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/common_runtime/collective_param_resolver_local.h"
+
+#include "tensorflow/core/common_runtime/device_mgr.h"
+
+namespace tensorflow {
+
+CollectiveParamResolverLocal::CollectiveParamResolverLocal(
+    const DeviceMgr* dev_mgr, DeviceResolverInterface* dev_resolver,
+    const string& task_name)
+    : dev_mgr_(dev_mgr), dev_resolver_(dev_resolver), task_name_(task_name) {}
+
+void CollectiveParamResolverLocal::CompleteGroupAsync(
+    const CompleteGroupRequest* request, CompleteGroupResponse* response,
+    CancellationManager* cancel_mgr, const StatusCallback& done) {
+  done(
+      errors::Internal("CompleteGroup is not implemented by "
+                       "CollectiveParamResolverLocal which is "
+                       "intended only for non-distributed deployment."));
+}
+
+void CollectiveParamResolverLocal::CompleteGroupLocal(
+    const string& device, CollectiveParams* cp, const GroupRecCallback& done) {
+  VLOG(1) << "CompleteGroupLocal " << cp << ": " << cp->ToString();
+  std::vector<StatusCallback> to_be_called;
+  GroupRec* gr = nullptr;
+  {
+    mutex_lock l(group_mu_);
+    auto it = group_table_.find(cp->group.group_key);
+    if (it == group_table_.end()) {
+      gr = new GroupRec;
+      gr->group.group_key = cp->group.group_key;
+      gr->group.group_size = cp->group.group_size;
+      gr->group.device_type = cp->group.device_type;
+      group_table_[gr->group.group_key].reset(gr);
+      VLOG(2) << "New group_key=" << gr->group.group_key
+              << " group_size=" << gr->group.group_size;
+    } else {
+      gr = it->second.get();
+    }
+  }
+  Status status;
+  {
+    mutex_lock gr_lock(gr->mu);
+    if (!gr->device_set.empty()) {
+      // Check for consistency with existing GroupRec.
+      if (cp->group.device_type != gr->group.device_type) {
+        status = errors::Internal(
+            "Collective Op ", cp->name, " is assigned to device ", device,
+            " with type ", cp->group.device_type.type_string(),
+            " and group_key ", cp->group.group_key, " but that group has type ",
+            gr->group.device_type.type_string());
+      } else if (cp->group.group_size != gr->group.group_size) {
+        status = errors::Internal(
+            "Collective Op ", cp->name, " has group_size ",
+            cp->group.group_size, " and group_key", cp->group.group_key,
+            " but that group has size ", gr->group.group_size);
+      }
+    }
+    if (status.ok()) {
+      // Insert device if not already present.
+      auto it = gr->device_set.find(device);
+      if (it == gr->device_set.end()) {
+        if (gr->device_set.size() == gr->group.group_size) {
+          // The group is already full.
+          status = errors::Internal(
+              "Collective Op ", cp->name, " is assigned to device ", device,
+              " and group_key ", cp->group.group_key,
+              " but that group doesn't contain that device.");
+        } else {
+          // This is a new device that has not yet joined the group.
+          gr->device_set.insert(device);
+          gr->device_list.push_back(device);
+          DeviceNameUtils::ParsedName parsed_device;
+          DeviceNameUtils::ParseFullName(device, &parsed_device);
+          string task_name = strings::StrCat("/job:", parsed_device.job,
+                                             "/replica:", parsed_device.replica,
+                                             "/task:", parsed_device.task);
+          gr->task_set.insert(task_name);
+          gr->task_list.push_back(task_name);
+          gr->group.num_tasks = static_cast<int32>(gr->task_set.size());
+          VLOG(1) << "group_key=" << gr->group.group_key
+                  << " group_size=" << gr->group.group_size
+                  << " dev_set=" << gr->device_set.size();
+        }
+      }
+    }
+
+    if (status.ok()) {
+      // If the group is not yet complete, queue to wait for it.
+      VLOG(2) << "group_size " << gr->group.group_size << " set size "
+              << gr->device_set.size() << " gr " << gr;
+
+      if (gr->device_set.size() < gr->group.group_size) {
+        gr->waiting.push_back(std::bind(done, std::placeholders::_1, gr));
+        return;
+      }
+      CHECK_EQ(gr->device_set.size(), gr->group.group_size);
+      if (!gr->waiting.empty()) {
+        std::swap(to_be_called, gr->waiting);
+      }
+    }
+  }
+  done(status, gr);
+  for (int i = 0; i < to_be_called.size(); ++i) {
+    to_be_called[i](Status::OK());
+  }
+}
+
+namespace {
+
+struct DevRec {
+  string task;
+  string device;
+  int original_rank;
+  int local_rank;
+  int global_rank;
+  const DeviceLocality* locality;
+};
+typedef std::unordered_map<string, DevRec> TaskDeviceMap;
+typedef std::unordered_map<string, TaskDeviceMap> GlobalDeviceMap;
+
+// Create a populated GlobalDeviceMap from CollInstanceParams and localities.
+GlobalDeviceMap BuildDevRecs(const CollInstanceParams& ip,
+                             const std::vector<DeviceLocality>& localities) {
+  GlobalDeviceMap gdm;
+  CHECK_EQ(ip.device_names.size(), ip.task_names.size());
+  CHECK_EQ(ip.device_names.size(), localities.size());
+  for (int i = 0; i < ip.device_names.size(); ++i) {
+    TaskDeviceMap& tdm = gdm[ip.task_names[i]];
+    DevRec* dr = &tdm[ip.device_names[i]];
+    dr->task = ip.task_names[i];
+    dr->device = ip.device_names[i];
+    dr->original_rank = i;
+    dr->local_rank = 0;   // Will be populated later by OrderTaskDeviceMap.
+    dr->global_rank = 0;  // Will be populated later by EstablishGlobalRank.
+    dr->locality = &localities[i];
+  }
+  return gdm;
+}
+
+void OrderTaskDeviceMap(TaskDeviceMap* tdm) {
+  CHECK_GT(tdm->size(), 0);  // Should never be called with 0 devices
+  int least_rank = -1;
+  string next_device;
+  std::set<string> selected;
+  // Starting device is one with the least initial rank.
+  for (const auto& it : *tdm) {
+    if (least_rank < 0 || it.second.original_rank < least_rank) {
+      least_rank = it.second.original_rank;
+      next_device = it.second.device;
+    }
+  }
+  CHECK_GE(least_rank, 0);
+  DeviceNameUtils::ParsedName parsed_name;
+  CHECK(DeviceNameUtils::ParseFullName(next_device, &parsed_name));
+  // NOTE: InterconnectLink has only a device_id, nothing more, so for
+  // the time being if there's more than one device at a task we
+  // assume they're all GPUs.
+
+  int next_rank = 0;
+  while (true) {
+    selected.insert(next_device);
+    DevRec* dr = &(*tdm)[next_device];
+    dr->local_rank = next_rank;
+    ++next_rank;
+    if (selected.size() == tdm->size()) {
+      break;
+    }
+    // For the present time we assume Locality links only cover GPUs.
+    // For multiple CPUs, just take them in order.
+    const InterconnectLink* best_link = nullptr;
+    if (parsed_name.type == "GPU") {
+      for (const InterconnectLink& il : dr->locality->links().link()) {
+        parsed_name.id = il.device_id();
+        string endpoint_device =
+            DeviceNameUtils::ParsedNameToString(parsed_name);
+        if (selected.find(endpoint_device) != selected.end()) {
+          continue;
+        }
+        if (best_link == nullptr || il.strength() > best_link->strength()) {
+          best_link = &il;
+        }
+      }
+    }
+    if (best_link != nullptr) {
+      // Follow the best edge
+      parsed_name.id = best_link->device_id();
+      next_device = DeviceNameUtils::ParsedNameToString(parsed_name);
+    } else {
+      // No good edges, alas. Pick the lowest initial rank among remaining
+      // devices.
+      least_rank = -1;
+      for (const auto& it : *tdm) {
+        if (selected.find(it.second.device) != selected.end()) {
+          continue;
+        }
+        if (least_rank < 0 || it.second.original_rank < least_rank) {
+          least_rank = it.second.original_rank;
+          next_device = it.second.device;
+        }
+      }
+      CHECK_GE(least_rank, 0);
+    }
+  }
+}
+
+// The first time a shared CollectiveParams is established for a
+// shared set of instances we compute a good rank order for all the
+// devices in the group, that is appropriate for a ring algorithm.
+// This order need not be the same across different instance groups
+// sharing the same device group where there is more than one good
+// order.
+GlobalDeviceMap EstablishGlobalRank(
+    CollectiveParams* cp, const std::vector<DeviceLocality>& localities) {
+  VLOG(1) << "EstablishGlobalRank";
+  GlobalDeviceMap gdm = BuildDevRecs(cp->instance, localities);
+  for (auto& iter : gdm) {
+    TaskDeviceMap& tdm = iter.second;
+    OrderTaskDeviceMap(&tdm);
+  }
+  // Connect the global rank order by the order in which tasks first appear.
+  std::set<string> ordered_tasks;
+  int next_rank = 0;
+  for (int i = 0; i < cp->instance.task_names.size(); ++i) {
+    const string& task_name = cp->instance.task_names[i];
+    if (ordered_tasks.find(task_name) != ordered_tasks.end()) {
+      continue;
+    }
+    ordered_tasks.insert(task_name);
+    TaskDeviceMap* tdm = &gdm[task_name];
+    for (auto& it : *tdm) {
+      it.second.global_rank = it.second.local_rank + next_rank;
+    }
+    next_rank += tdm->size();
+  }
+  return gdm;
+}
+
+// Sort cp->instance.device_names lexicographically, but do by first
+// computing a reordering permutation so we can keep cp->instance.task_names
+// in corresponding order.
+void SortDevicesAndTasks(CollectiveParams* cp) {
+  VLOG(1) << "SortDevicesAndTasks " << cp << " instance " << &cp->instance;
+  CHECK(cp);
+  CHECK_EQ(cp->group.group_size, cp->instance.device_names.size());
+  CHECK_EQ(cp->group.group_size, cp->instance.task_names.size());
+  std::vector<int> perm(cp->group.group_size);
+  // TODO(tucker): substitute std::iota when the windows build supports it.
+  // std::iota(perm.begin(), perm.end(), 0);
+  for (int i = 0; i < perm.size(); ++i) {
+    perm[i] = i;
+  }
+  std::sort(perm.begin(), perm.end(), [cp](const int& a, const int& b) {
+    return cp->instance.device_names[a] < cp->instance.device_names[b];
+  });
+  std::vector<string> new_devs;
+  std::vector<string> new_tasks;
+  new_devs.reserve(cp->group.group_size);
+  new_tasks.reserve(cp->group.group_size);
+  for (int pi : perm) {
+    new_devs.push_back(cp->instance.device_names[pi]);
+    new_tasks.push_back(cp->instance.task_names[pi]);
+  }
+  cp->instance.device_names = std::move(new_devs);
+  cp->instance.task_names = std::move(new_tasks);
+  VLOG(1) << "Modified device_names on " << cp;
+}
+
+// Establish the requested number of subdivision permutations based on the
+// ring order implicit in the device order.
+void GenerateSubdivPerms(const string& device, int source_rank,
+                         CollectiveParams* cp) {
+  CHECK_GT(cp->instance.impl_details.subdiv_offsets.size(), 0);
+  cp->instance.impl_details.subdiv_permutations.resize(
+      cp->instance.impl_details.subdiv_offsets.size());
+  // Each subdiv permutation is a ring formed by rotating each
+  // single-task subsequence of devices by an offset.  This makes most
+  // sense when each task has the same number of devices but we can't
+  // depend on that being the case so we'll compute something that
+  // works in any case.
+
+  // Start by counting the devices in each task.
+  // Precondition: device_names must be sorted so that all devices in
+  // the same task are adjacent.
+  VLOG(2) << "Sorted task names: "
+          << str_util::Join(cp->instance.task_names, ", ");
+  std::vector<int> dev_per_task;
+  const string* prior_task_name = &cp->instance.task_names[0];
+  int dev_count = 1;
+  for (int di = 1; di < cp->group.group_size; ++di) {
+    if (cp->instance.task_names[di] != *prior_task_name) {
+      dev_per_task.push_back(dev_count);
+      dev_count = 1;
+      prior_task_name = &cp->instance.task_names[di];
+    } else {
+      ++dev_count;
+    }
+  }
+  dev_per_task.push_back(dev_count);
+  CHECK_EQ(cp->group.num_tasks, dev_per_task.size());
+
+  // Generate a ring permutation for each requested offset.
+  CHECK_GT(cp->instance.impl_details.subdiv_offsets.size(), 0);
+  VLOG(2) << "Setting up perms for cp " << cp << " subdiv_permutations "
+          << &cp->instance.impl_details.subdiv_permutations;
+  cp->instance.impl_details.subdiv_permutations.resize(
+      cp->instance.impl_details.subdiv_offsets.size());
+  cp->subdiv_rank.resize(cp->instance.impl_details.subdiv_offsets.size(), -1);
+  for (int sdi = 0; sdi < cp->instance.impl_details.subdiv_offsets.size();
+       ++sdi) {
+    std::vector<int>& perm = cp->instance.impl_details.subdiv_permutations[sdi];
+    CHECK_EQ(perm.size(), 0);
+    int offset = cp->instance.impl_details.subdiv_offsets[sdi];
+    int prior_dev_count = 0;
+    for (int ti = 0; ti < cp->group.num_tasks; ++ti) {
+      for (int di = 0; di < dev_per_task[ti]; ++di) {
+        int offset_di = (di + offset) % dev_per_task[ti];
+        int permuted_di = prior_dev_count + offset_di;
+        perm.push_back(permuted_di);
+        if (cp->instance.device_names[prior_dev_count + di] == device) {
+          CHECK_EQ(prior_dev_count + di, cp->default_rank);
+          cp->subdiv_rank[sdi] = permuted_di;
+        }
+      }
+      prior_dev_count += dev_per_task[ti];
+    }
+    CHECK_EQ(cp->group.group_size, perm.size());
+  }
+
+  if (cp->instance.type == BROADCAST_COLLECTIVE) {
+    CHECK_GE(source_rank, 0);
+    cp->subdiv_source_rank.resize(
+        cp->instance.impl_details.subdiv_offsets.size(), -1);
+    for (int sdi = 0; sdi < cp->subdiv_source_rank.size(); ++sdi) {
+      for (int j = 0; j < cp->group.group_size; ++j) {
+        if (cp->instance.impl_details.subdiv_permutations[sdi][j] ==
+            source_rank) {
+          cp->subdiv_source_rank[sdi] = j;
+          break;
+        }
+      }
+      CHECK_GE(cp->subdiv_source_rank[sdi], 0);
+    }
+  }
+
+  if (VLOG_IS_ON(1)) {
+    // Log the computed ring order for each subdiv.
+    string buf;
+    for (int sdi = 0;
+         sdi < cp->instance.impl_details.subdiv_permutations.size(); ++sdi) {
+      buf = strings::StrCat("Subdiv ", sdi, " device order:\n");
+      for (int di = 0;
+           di < cp->instance.impl_details.subdiv_permutations[sdi].size();
+           ++di) {
+        int idx = cp->instance.impl_details.subdiv_permutations[sdi][di];
+        strings::StrAppend(&buf, cp->instance.device_names[idx], "\n");
+      }
+      strings::StrAppend(&buf, " subdiv_offsets: ");
+      for (auto o : cp->instance.impl_details.subdiv_offsets)
+        strings::StrAppend(&buf, o, " ");
+      strings::StrAppend(&buf, " SubdivRank: ");
+      for (auto d : cp->subdiv_rank) strings::StrAppend(&buf, d, " ");
+      VLOG(1) << buf;
+    }
+  }
+}
+
+}  // namespace
+
+void CollectiveParamResolverLocal::CompleteTaskIsLocal(const string& task_name,
+                                                       CollectiveParams* cp) {
+  cp->task.is_local.resize(cp->group.group_size, false);
+  for (int i = 0; i < cp->group.group_size; ++i) {
+    cp->task.is_local[i] = (cp->instance.task_names[i] == task_name);
+  }
+}
+
+void CollectiveParamResolverLocal::SetDefaultRank(const string& device,
+                                                  CollectiveParams* cp) {
+  CHECK_EQ(cp->group.group_size, cp->instance.device_names.size()) << cp;
+  for (int i = 0; i < cp->group.group_size; ++i) {
+    if (cp->instance.device_names[i] == device) {
+      cp->default_rank = i;
+      break;
+    }
+  }
+}
+
+Status CollectiveParamResolverLocal::InitInstanceSharedParams(
+    GroupRec* gr, const CollectiveParams* cp, InstanceRec* ir) {
+  VLOG(1) << "InitInstanceSharedParams " << ir;
+  ir->shared.instance = cp->instance;
+  {
+    mutex_lock gl(gr->mu);
+    ir->shared.group = gr->group;
+    ir->shared.instance.device_names.assign(gr->device_list.begin(),
+                                            gr->device_list.end());
+    ir->shared.instance.task_names.assign(gr->task_list.begin(),
+                                          gr->task_list.end());
+    VLOG(2) << "Initialized names for instance: "
+            << ir->shared.instance.ToString();
+  }
+  ir->shared.default_rank = -1;
+
+  // Sort devce_names lexicographcally, keeping task_names in
+  // corresponding order.
+  SortDevicesAndTasks(&ir->shared);
+
+  // Get Locality data for all devices.
+
+  // Set is_local and task_names in *shared prior to invoking
+  // GetDeviceLocalitiesAsync.  In a distributed context this function can be
+  // called by a derived class, some of the devices may be non-local and
+  // GetDeviceLocalitiesAsync will use those fields to launch RPCs.
+  CompleteTaskIsLocal(task_name_, &ir->shared);
+  std::vector<DeviceLocality> localities;
+  Notification note;
+  Status status;
+  dev_resolver_->GetDeviceLocalitiesAsync(ir->shared.instance, &localities,
+                                          [&note, &status](const Status& s) {
+                                            status = s;
+                                            note.Notify();
+                                          });
+  note.WaitForNotification();
+  if (status.ok()) {
+    CompleteDefaultRanking(gr, cp, ir, localities);
+  }
+  return status;
+}
+
+void CollectiveParamResolverLocal::CompleteDefaultRanking(
+    GroupRec* gr, const CollectiveParams* cp, InstanceRec* ir,
+    const std::vector<DeviceLocality>& localities) {
+  // Establish an instance-specific default rank order for devices
+  // based on localities.  This rank order should be a good ring
+  // order, if possible.
+  GlobalDeviceMap gdm = EstablishGlobalRank(&ir->shared, localities);
+  // Reflect the new global ranking on shared
+  size_t num_devices = ir->shared.group.group_size;
+  std::vector<string> new_device_names(num_devices, "");
+  std::vector<string> new_task_names(num_devices, "");
+  for (const auto& git : gdm) {
+    const TaskDeviceMap& tdm = git.second;
+    for (const auto& tit : tdm) {
+      const DevRec& dr = tit.second;
+      new_device_names[dr.global_rank] =
+          ir->shared.instance.device_names[dr.original_rank];
+      new_task_names[dr.global_rank] =
+          ir->shared.instance.task_names[dr.original_rank];
+    }
+  }
+
+  ir->shared.instance.device_names = new_device_names;
+  ir->shared.instance.task_names = new_task_names;
+  if (VLOG_IS_ON(2)) {
+    string buf;
+    for (const auto& d : cp->instance.device_names)
+      strings::StrAppend(&buf, "\n", d);
+    VLOG(2) << "Optimized device order for " << ir->shared.name << ": " << buf;
+  }
+}
+
+void CollectiveParamResolverLocal::CallbackWithStatus(
+    const InstanceRecCallback& done, InstanceRec* irec) {
+  Status s;
+  {
+    mutex_lock l(irec->out_mu);
+    s = irec->status;
+  }
+  done(s, irec);
+}
+
+void CollectiveParamResolverLocal::FindInstanceRec(
+    GroupRec* gr, CollectiveParams* cp, const InstanceRecCallback& done) {
+  InstanceRec* irec = nullptr;
+  bool exit_outside_locks = false;
+  {
+    mutex_lock l(instance_mu_);
+    auto it = instance_table_.find(cp->instance.instance_key);
+    if (it != instance_table_.end()) {
+      irec = it->second.get();
+      {
+        mutex_lock l(irec->in_mu);
+        if (irec->is_init) {
+          exit_outside_locks = true;
+        } else {
+          irec->init_waiters.push_back([this, gr, cp, done](InstanceRec* irec) {
+            CallbackWithStatus(done, irec);
+          });
+          return;
+        }
+      }
+    } else {
+      // Create new InstanceRec.
+      irec = new InstanceRec;
+      instance_table_[cp->instance.instance_key].reset(irec);
+    }
+  }
+  if (exit_outside_locks) {
+    CallbackWithStatus(done, irec);
+    return;
+  }
+  // Initialize the new InstanceRec while holding out_mu.
+  {
+    mutex_lock il(irec->out_mu);
+    irec->known.resize(cp->group.group_size, false);
+    irec->status = InitInstanceSharedParams(gr, cp, irec);
+  }
+  // Prepare to invoke any waiters that accumlated during initialization.
+  std::vector<IRConsumer> init_waiters;
+  {
+    mutex_lock tl(instance_mu_);
+    {
+      mutex_lock l(irec->in_mu);
+      irec->is_init = true;
+      if (!irec->init_waiters.empty()) {
+        std::swap(init_waiters, irec->init_waiters);
+      }
+    }
+  }
+  CallbackWithStatus(done, irec);
+  for (auto& f : init_waiters) {
+    f(irec);
+  }
+}
+
+void CollectiveParamResolverLocal::CompleteParamsAsync(
+    const string& device, CollectiveParams* cp, CancellationManager* cancel_mgr,
+    const StatusCallback& done) {
+  VLOG(1) << "CompleteParams " << device << " for " << cp << ": "
+          << cp->ToString();
+  CompleteGroupLocal(
+      device, cp, [this, device, cp, done](const Status& s, GroupRec* gr) {
+        if (s.ok()) {
+          CompleteInstanceLocal(device, gr, cp, cp->is_source, done);
+        } else {
+          done(s);
+        }
+      });
+}
+
+void CollectiveParamResolverLocal::CompleteInstanceAsync(
+    const CompleteInstanceRequest* request, CompleteInstanceResponse* response,
+    CancellationManager* cancel_mgr, const StatusCallback& done) {
+  done(
+      errors::Internal("CompleteInstance is not implemented by "
+                       "CollectiveParamResolverLocal which is "
+                       "intended only for non-distributed deployment."));
+}
+
+void CollectiveParamResolverLocal::CompleteInstanceLocal(
+    const string& device, GroupRec* gr, CollectiveParams* cp, bool is_source,
+    const StatusCallback& done) {
+  VLOG(1) << "CompleteInstanceLocal " << device
+          << " instance_key: " << cp->instance.instance_key << " gr " << gr;
+
+  // Populate the group portion of *cp from *gr.  Most of it should already
+  // match.
+  DCHECK_EQ(cp->group.group_key, gr->group.group_key);
+  DCHECK_EQ(cp->group.group_size, gr->group.group_size);
+  DCHECK_EQ(cp->group.device_type, gr->group.device_type);
+  cp->group = gr->group;
+
+  // Get the shared InstanceRec for this instance.
+  FindInstanceRec(gr, cp,
+                  [this, device, gr, cp, is_source, done](const Status& s,
+                                                          InstanceRec* ir) {
+                    if (s.ok()) {
+                      CompleteInstanceFromInitializedIRec(device, gr, cp, ir,
+                                                          is_source, done);
+                    } else {
+                      done(s);
+                    }
+                  });
+}
+
+void CollectiveParamResolverLocal::CompleteInstanceFromInitializedIRec(
+    const string& device, GroupRec* gr, CollectiveParams* cp, InstanceRec* ir,
+    bool is_source, const StatusCallback& done) {
+  // Populate the fields common across instance.
+  {
+    mutex_lock l(ir->out_mu);
+    // custom operator= does a deep copy.
+    cp->instance = ir->shared.instance;
+  }
+  // Populate the fields common across task, also default_rank.
+  SetDefaultRank(device, cp);
+  CompleteTaskIsLocal(task_name_, cp);
+  // If broadcast, may need to wait for source discovery.
+  if (cp->instance.type == BROADCAST_COLLECTIVE) {
+    CompleteInstanceSource(ir, cp, is_source,
+                           [this, ir, device, cp, done](InstanceRec* irec) {
+                             CHECK_EQ(ir, irec);
+                             Status s;
+                             int source_rank;
+                             {
+                               mutex_lock l(irec->out_mu);
+                               s = irec->status;
+                               source_rank = ir->source_rank;
+                             }
+                             if (s.ok()) {
+                               GenerateSubdivPerms(device, source_rank, cp);
+                             }
+                             done(s);
+                           });
+    return;
+  } else {
+    GenerateSubdivPerms(device, 0, cp);
+  }
+  done(Status::OK());
+}
+
+void CollectiveParamResolverLocal::CompleteInstanceSource(InstanceRec* ir,
+                                                          CollectiveParams* cp,
+                                                          bool is_source,
+                                                          const IRConsumer& f) {
+  std::vector<IRConsumer> ready_waiters;
+  {
+    mutex_lock l(ir->out_mu);
+    CHECK_EQ(cp->group.group_size, ir->known.size());
+    CHECK_GE(cp->default_rank, 0);
+    if (!ir->known[cp->default_rank]) {
+      ir->known[cp->default_rank] = true;
+      ++ir->known_count;
+      if (is_source) {
+        if (ir->source_rank >= 0) {
+          ir->status = errors::Internal("Instance ", cp->instance.instance_key,
+                                        " already has source ", ir->source_rank,
+                                        ", recevied second claim from ",
+                                        cp->default_rank);
+        } else {
+          ir->source_rank = cp->default_rank;
+        }
+      }
+    }
+    if (ir->known_count < ir->shared.group.group_size) {
+      ir->known_waiters.push_back(f);
+      return;
+    }
+    CHECK_EQ(ir->known_count, ir->shared.group.group_size);
+    CHECK_GE(ir->source_rank, 0);
+    if (!ir->known_waiters.empty()) {
+      ready_waiters = std::move(ir->known_waiters);
+    }
+  }
+  f(ir);
+  for (auto& f : ready_waiters) {
+    f(ir);
+  }
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/collective_param_resolver_local.h b/tensorflow/core/common_runtime/collective_param_resolver_local.h
new file mode 100644
index 0000000000000000000000000000000000000000..ff3415b0a909b22cb573ff0e6d6cb210924c092a
--- /dev/null
+++ b/tensorflow/core/common_runtime/collective_param_resolver_local.h
@@ -0,0 +1,209 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_COMMON_RUNTIME_COLLECTIVE_PARAM_RESOLVER_LOCAL_H_
+#define TENSORFLOW_COMMON_RUNTIME_COLLECTIVE_PARAM_RESOLVER_LOCAL_H_
+
+#include <string>
+
+#include "tensorflow/core/framework/collective.h"
+#include "tensorflow/core/lib/gtl/flatmap.h"
+
+namespace tensorflow {
+class CompleteGroupRequest;
+class CompleteGroupResponse;
+class CompleteInstanceRequest;
+class CompleteInstanceResponse;
+class DeviceMgr;
+
+// Implements ParamResolverInterface for a single-task context.
+// It also implements the functionality necessary to serve as the
+// group leader for param resolution in a multi-task context.
+class CollectiveParamResolverLocal : public ParamResolverInterface {
+ public:
+  CollectiveParamResolverLocal(const DeviceMgr* dev_mgr,
+                               DeviceResolverInterface* dev_resolver,
+                               const string& task_name);
+
+  ~CollectiveParamResolverLocal() override {}
+
+  void CompleteParamsAsync(const string& device, CollectiveParams* cp,
+                           CancellationManager* cancel_mgr,
+                           const StatusCallback& done) override;
+
+  void CompleteGroupAsync(const CompleteGroupRequest* request,
+                          CompleteGroupResponse* response,
+                          CancellationManager* cancel_mgr,
+                          const StatusCallback& done) override;
+
+  void CompleteInstanceAsync(const CompleteInstanceRequest* request,
+                             CompleteInstanceResponse* response,
+                             CancellationManager* cancel_mgr,
+                             const StatusCallback& done) override;
+
+ protected:
+  // Used to complete/verify CollGroup.
+  struct GroupRec {
+    CollGroupParams group;
+    mutex mu;
+    Status status GUARDED_BY(mu);
+    std::set<string> device_set GUARDED_BY(mu);
+    std::vector<string> device_list GUARDED_BY(mu);
+    std::set<string> task_set GUARDED_BY(mu);
+    std::vector<string> task_list GUARDED_BY(mu);
+    std::vector<StatusCallback> waiting GUARDED_BY(mu);
+  };
+
+  // Finds the GroupRec that corresponds to cp->group_key.
+  // Also populates cp->group from that group_rec.
+  // Will wait until GroupRec is fully populated or an error arises before
+  // calling done.  Callback GroupRec* arg is only valid if status is ok.
+  // Ownership of GroupRec stays with this object and does not pass to the
+  // callback.
+  typedef std::function<void(const Status& s, GroupRec* gr)> GroupRecCallback;
+  void CompleteGroupLocal(const string& device, CollectiveParams* cp,
+                          const GroupRecCallback& done)
+      LOCKS_EXCLUDED(group_mu_);
+
+  // Used to complete/verify CollInstance.
+  struct InstanceRec;
+  typedef std::function<void(InstanceRec*)> IRConsumer;
+  struct InstanceRec {
+    // This structure has two mutexes so that a possibly long
+    // initialization can be done without holding the instance_mu_
+    // table lock the whole time (which can cause an excessive number
+    // of threads to block on it), and because the compiler may not
+    // permit mutex locks to be taken in more than one order.
+    //
+    // out_mu guards access to most of the fields.
+    // in_mu guards access to a queue of comsumer callbacks wanting to
+    // read the fields guarded by out_mu.
+    //
+    // The in_mu should be locked only while holding instance_mu_; the
+    // out_mu should be locked only while not holding
+    // instance_mu_.
+    //
+    // When is_init is false (the initial value) any potential user
+    // other than the creator should queue a callback on init_waiters.
+    // As soon as the shared member of this structure is fully
+    // initialized is_init will be set true and those callbacks will
+    // be invoked.
+    //
+    // Once inserted in the table this structure will never be replaced
+    // so users can capture the pointer while holding instance_mu_,
+    // drop that lock, then take a lock on out_mu before
+    // reading/modifying its values.
+    mutex in_mu;
+    bool is_init GUARDED_BY(in_mu);
+    std::vector<IRConsumer> init_waiters GUARDED_BY(in_mu);
+
+    // Values to be shared by all instances, constant after initialization.
+    mutex out_mu;
+    CollectiveParams shared GUARDED_BY(out_mu);
+    // If an error occurs during initialization this structure stays in
+    // the table with a non-OK status.  Purging the table and restarting
+    // needs to be done at a higher level.
+    Status status GUARDED_BY(out_mu);
+
+    // These fields are used to count the instances that have called
+    // in and become known while resolving broadcast source identity.
+    int source_rank GUARDED_BY(out_mu);
+    int known_count GUARDED_BY(out_mu);
+    std::vector<bool> known GUARDED_BY(out_mu);
+    std::vector<IRConsumer> known_waiters GUARDED_BY(out_mu);
+
+    InstanceRec() : is_init(false), source_rank(-1), known_count(0) {}
+  };
+
+  // Find the InstanceRec with the same instance_key as cp.  If it doesn't
+  // already exist, create and initialize from gr and cp.
+  //
+  // Precondition: *gr must be a complete GroupRec, i.e. the value set
+  // by CompleteGroupLocal. *cp must be populated with all the fields
+  // required by InitInstanceSharedParams.  Ownership of InstanceRec stays
+  // with this object and does not pass to the callback.
+  typedef std::function<void(const Status& s, InstanceRec* ir)>
+      InstanceRecCallback;
+  void FindInstanceRec(GroupRec* gr, CollectiveParams* cp,
+                       const InstanceRecCallback& done)
+      LOCKS_EXCLUDED(instance_mu_, gr->mu, group_mu_);
+
+  // Populate *ir with device membership from gr, then initialize to be specific
+  // to cp->instance_key, i.e. order the devices and tasks.
+  //
+  // Preconditions:
+  //  cp is populated with all DeviceLocalities
+  Status InitInstanceSharedParams(GroupRec* gr, const CollectiveParams* cp,
+                                  InstanceRec* ir)
+      EXCLUSIVE_LOCKS_REQUIRED(ir->out_mu) LOCKS_EXCLUDED(gr->mu);
+
+  // Establishes the final order of ir->shared.instance.device_names and
+  // ir->shared.instance.task_names by considering localities of all devices.
+  void CompleteDefaultRanking(GroupRec* gr, const CollectiveParams* cp,
+                              InstanceRec* ir,
+                              const std::vector<DeviceLocality>& localities)
+      EXCLUSIVE_LOCKS_REQUIRED(ir->out_mu);
+
+  // Finish populating *cp.
+  // Precondition: *gr has been fully populated by CompleteGroupLocal.
+  void CompleteInstanceLocal(const string& device, GroupRec* gr,
+                             CollectiveParams* cp, bool is_source,
+                             const StatusCallback& done)
+      LOCKS_EXCLUDED(instance_mu_, gr->mu, group_mu_);
+
+  // Finish populating *cp from fully initialized *ir.
+  // Precondition: *gr and *ir are fully populated.
+  void CompleteInstanceFromInitializedIRec(const string& device, GroupRec* gr,
+                                           CollectiveParams* cp,
+                                           InstanceRec* ir, bool is_source,
+                                           const StatusCallback& done)
+      LOCKS_EXCLUDED(ir->out_mu);
+
+  // Complete source data for a broadcast instance.
+  // Precondition: *cp has complete group data and default_rank.
+  void CompleteInstanceSource(InstanceRec* ir, CollectiveParams* cp,
+                              bool is_source, const IRConsumer& f)
+      LOCKS_EXCLUDED(ir->out_mu);
+
+  // If cp.device_names contains only devices local to this process
+  // populates *localities, else returns an error.
+  Status GetLocalDeviceLocalities(const CollectiveParams& cp,
+                                  std::vector<DeviceLocality>* localities);
+
+  // Sets CollTaskParams.is_local and CollectiveParams.default_rank.
+  // Precondition: cp->device_names is fully populated and in final order.
+  void CompleteTaskIsLocal(const string& task_name, CollectiveParams* cp);
+
+  // Sets cp->instance_default_rank according to location of device in
+  // current ordering of cp->instance.device_names.
+  void SetDefaultRank(const string& device, CollectiveParams* cp);
+
+  // Helper to grab status under lock, invoke callback out of lock.
+  void CallbackWithStatus(const InstanceRecCallback& done, InstanceRec* irec)
+      LOCKS_EXCLUDED(irec->out_mu);
+
+  const DeviceMgr* dev_mgr_;
+  DeviceResolverInterface* dev_resolver_;
+  string task_name_;
+  mutex group_mu_;
+  gtl::FlatMap<int32, std::unique_ptr<GroupRec>> group_table_
+      GUARDED_BY(group_mu_);
+  mutex instance_mu_;
+  gtl::FlatMap<int32, std::unique_ptr<InstanceRec>> instance_table_
+      GUARDED_BY(instance_mu_);
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_COMMON_RUNTIME_COLLECTIVE_PARAM_RESOLVER_LOCAL_H_
diff --git a/tensorflow/core/common_runtime/collective_param_resolver_local_test.cc b/tensorflow/core/common_runtime/collective_param_resolver_local_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4e3c7125f2b319e4fc3fa227187277688977a313
--- /dev/null
+++ b/tensorflow/core/common_runtime/collective_param_resolver_local_test.cc
@@ -0,0 +1,151 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/common_runtime/collective_executor_mgr.h"
+
+#include "tensorflow/core/common_runtime/collective_param_resolver_local.h"
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/common_runtime/device_factory.h"
+#include "tensorflow/core/common_runtime/device_mgr.h"
+#include "tensorflow/core/common_runtime/device_resolver_local.h"
+#include "tensorflow/core/lib/core/notification.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/public/session_options.h"
+
+namespace tensorflow {
+namespace {
+
+#define NUM_DEVS 3
+
+class CollectiveParamResolverLocalTest : public ::testing::Test {
+ protected:
+  CollectiveParamResolverLocalTest() {
+    ConfigProto cp;
+    SessionOptions options;
+    string task_name = "/job:localhost/replica:0/task:0";
+    auto* device_count = options.config.mutable_device_count();
+    device_count->insert({"CPU", NUM_DEVS});
+    TF_CHECK_OK(DeviceFactory::AddDevices(options, task_name, &devices_));
+    device_mgr_.reset(new DeviceMgr(devices_));
+    drl_.reset(new DeviceResolverLocal(device_mgr_.get()));
+    prl_.reset(new CollectiveParamResolverLocal(device_mgr_.get(), drl_.get(),
+                                                task_name));
+  }
+
+  std::vector<Device*> devices_;
+  std::unique_ptr<DeviceMgr> device_mgr_;
+  std::unique_ptr<DeviceResolverLocal> drl_;
+  std::unique_ptr<CollectiveParamResolverLocal> prl_;
+};
+
+TEST_F(CollectiveParamResolverLocalTest, CompleteParamsReduction1Task) {
+  CollectiveParams cps[NUM_DEVS];
+  Status statuses[NUM_DEVS];
+  Notification note[NUM_DEVS];
+  for (int i = 0; i < NUM_DEVS; ++i) {
+    CollectiveParams* cp = &cps[i];
+    cp->group.group_key = 1;
+    cp->group.group_size = 3;
+    cp->group.device_type = DeviceType("CPU");
+    cp->group.num_tasks = 1;
+    cp->instance.instance_key = 7;
+    cp->instance.type = REDUCTION_COLLECTIVE;
+    cp->instance.data_type = DataType(DT_FLOAT);
+    cp->instance.shape = TensorShape({5});
+    cp->instance.device_names.push_back(
+        strings::StrCat("/job:localhost/replica:0/task:0/device:CPU:", i));
+    cp->instance.impl_details.subdiv_offsets.push_back(0);
+    cp->is_source = false;
+    Env::Default()->SchedClosure([this, i, cp, &note, &statuses]() {
+      prl_->CompleteParamsAsync(cp->instance.device_names[0], cp,
+                                nullptr /*CancellationManager*/,
+                                [this, &statuses, &note, i](const Status& s) {
+                                  statuses[i] = s;
+                                  note[i].Notify();
+                                });
+    });
+  }
+  for (int i = 0; i < NUM_DEVS; ++i) {
+    note[i].WaitForNotification();
+  }
+  for (int i = 0; i < NUM_DEVS; ++i) {
+    TF_ASSERT_OK(statuses[i]);
+    ASSERT_EQ(cps[i].instance.device_names.size(), 3);
+    for (int j = 0; j < NUM_DEVS; ++j) {
+      EXPECT_EQ(
+          strings::StrCat("/job:localhost/replica:0/task:0/device:CPU:", j),
+          cps[i].instance.device_names[j]);
+      EXPECT_TRUE(cps[i].task.is_local[j]);
+    }
+    EXPECT_EQ(cps[i].subdiv_rank[0], i);
+    EXPECT_EQ(cps[i].subdiv_source_rank.size(), 0);
+    EXPECT_FALSE(cps[i].is_source);
+    EXPECT_EQ(cps[i].default_rank, i);
+  }
+}
+
+TEST_F(CollectiveParamResolverLocalTest, CompleteParamsBroadcast1Task) {
+  CollectiveParams cps[NUM_DEVS];
+  Status statuses[NUM_DEVS];
+  Notification note[NUM_DEVS];
+  for (int i = 0; i < NUM_DEVS; ++i) {
+    CollectiveParams* cp = &cps[i];
+    cp->group.group_key = 1;
+    cp->group.group_size = 3;
+    cp->group.device_type = DeviceType("CPU");
+    cp->group.num_tasks = 1;
+    cp->instance.instance_key = 3;
+    cp->instance.type = BROADCAST_COLLECTIVE;
+    cp->instance.data_type = DataType(DT_FLOAT);
+    cp->instance.shape = TensorShape({5});
+    cp->instance.device_names.push_back(
+        strings::StrCat("/job:localhost/replica:0/task:0/device:CPU:", i));
+    cp->instance.impl_details.subdiv_offsets.push_back(0);
+    cp->is_source = (i == 1);
+    Env::Default()->SchedClosure([this, i, cp, &note, &statuses]() {
+      prl_->CompleteParamsAsync(cp->instance.device_names[0], cp,
+                                nullptr /*CancellationManager*/,
+                                [this, &statuses, &note, i](const Status& s) {
+                                  statuses[i] = s;
+                                  note[i].Notify();
+                                });
+    });
+  }
+  for (int i = 0; i < NUM_DEVS; ++i) {
+    note[i].WaitForNotification();
+  }
+  for (int i = 0; i < NUM_DEVS; ++i) {
+    TF_ASSERT_OK(statuses[i]);
+    ASSERT_EQ(cps[i].instance.device_names.size(), 3);
+    for (int j = 0; j < NUM_DEVS; ++j) {
+      EXPECT_EQ(
+          strings::StrCat("/job:localhost/replica:0/task:0/device:CPU:", j),
+          cps[i].instance.device_names[j]);
+      EXPECT_TRUE(cps[i].task.is_local[j]);
+    }
+    ASSERT_GT(cps[i].subdiv_rank.size(), 0);
+    EXPECT_EQ(cps[i].subdiv_rank[0], i);
+    ASSERT_GT(cps[i].subdiv_source_rank.size(), 0);
+    EXPECT_EQ(cps[i].subdiv_source_rank[0], 1);
+    EXPECT_EQ(cps[i].is_source, (i == 1));
+    EXPECT_EQ(cps[i].default_rank, i);
+  }
+}
+
+// TEST_F(CollectiveParamResolverLocalTest,
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/collective_rma_local.cc b/tensorflow/core/common_runtime/collective_rma_local.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ad9b32ce3514dcfb29662d781ca6f1febd406c89
--- /dev/null
+++ b/tensorflow/core/common_runtime/collective_rma_local.cc
@@ -0,0 +1,108 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/common_runtime/collective_rma_local.h"
+
+#include "tensorflow/core/common_runtime/copy_tensor.h"
+#include "tensorflow/core/common_runtime/dma_helper.h"
+
+namespace tensorflow {
+
+void CollectiveRemoteAccessLocal::StartAbort(const Status& s) {
+  buf_rendezvous_.StartAbort(s);
+}
+
+void CollectiveRemoteAccessLocal::RecvFromPeer(
+    const string& peer_device, const string& peer_task, bool peer_is_local,
+    const string& key, Device* to_device, DeviceContext* to_device_ctx,
+    const AllocatorAttributes& to_alloc_attr, Tensor* to_tensor,
+    const DeviceLocality& client_locality, const StatusCallback& done) {
+  VLOG(1) << "RecvFromPeer " << this << " from " << peer_device << " key "
+          << key;
+  if (!peer_is_local) {
+    done(
+        errors::Internal("CollectiveRemoteAccessLocal::RecvFromPeer "
+                         "called with peer_is_local=false"));
+    return;
+  }
+  buf_rendezvous_.ConsumeBuf(
+      key, [this, to_tensor, to_device_ctx, to_device, to_alloc_attr, done](
+               const Status& s, BufRendezvous::Hook* hook) {
+        if (!s.ok()) {
+          done(s);
+          delete hook;
+        } else {
+          int64 recv_bytes = to_tensor->TotalBytes();
+          CHECK_EQ(recv_bytes, hook->prod_value->TotalBytes());
+          MemCpyAsync(hook->prod_ctx,    // src DeviceContext
+                      to_device_ctx,     // dst DeviceContext
+                      hook->prod_dev,    // src Device
+                      to_device,         // dst Device
+                      hook->prod_attr,   // src AllocatorAttributes
+                      to_alloc_attr,     // dst AllocatorAttributes
+                      hook->prod_value,  // src Tensor*
+                      to_tensor,         // dst Tensor*
+                      [hook, done](const Status& s) {
+                        done(s);
+                        hook->prod_cb(s);
+                        delete hook;
+                      });
+        }
+      });
+}
+
+void CollectiveRemoteAccessLocal::PostToPeer(
+    const string& peer_device, const string& peer_task, const string& key,
+    Device* from_device, DeviceContext* from_device_ctx,
+    const AllocatorAttributes& from_alloc_attr, const Tensor* from_tensor,
+    const DeviceLocality& client_locality, const StatusCallback& done) {
+  VLOG(1) << "PostToPeer " << this << " key " << key
+          << " step_id_=" << step_id_;
+  buf_rendezvous_.ProvideBuf(key, from_device, from_device_ctx, from_tensor,
+                             from_alloc_attr, done);
+}
+
+/*static*/
+void CollectiveRemoteAccessLocal::MemCpyAsync(
+    DeviceContext* src_dev_ctx, DeviceContext* dst_dev_ctx, Device* src_dev,
+    Device* dst_dev, const AllocatorAttributes& src_attr,
+    const AllocatorAttributes& dst_attr, const Tensor* src, Tensor* dst,
+    const StatusCallback& done) {
+  // We want a real copy to happen, i.e. the bytes inside of src should be
+  // transferred to the buffer backing dst.  If src and dst are on different
+  // devices then CopyTensor::ViaDMA will do just that.  But if they're both
+  // the same CPU, then it will actually just reset dst to point to src.
+  // Since this routine is used for copying between devices and within a
+  // device, we need to detect and bypass the wrong-semantics case.
+  const DeviceType src_device_type(
+      src_attr.on_host() ? DEVICE_CPU : src_dev->attributes().device_type());
+  const DeviceType dst_device_type(
+      dst_attr.on_host() ? DEVICE_CPU : dst_dev->attributes().device_type());
+  const bool non_cpu_src = src_device_type != DeviceType(DEVICE_CPU);
+  const bool non_cpu_dst = dst_device_type != DeviceType(DEVICE_CPU);
+  if (non_cpu_src) CHECK(src_dev_ctx);
+  if (non_cpu_dst) CHECK(dst_dev_ctx);
+  if (non_cpu_src || non_cpu_dst) {
+    CopyTensor::ViaDMA("",  // edge name (non-existent)
+                       src_dev_ctx, dst_dev_ctx, src_dev, dst_dev, src_attr,
+                       dst_attr, src, dst, done);
+  } else {
+    int64 bytes = src->TotalBytes();
+    DCHECK_EQ(dst->TotalBytes(), bytes);
+    memcpy(DMAHelper::base(dst), DMAHelper::base(src), bytes);
+    done(Status::OK());
+  }
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/collective_rma_local.h b/tensorflow/core/common_runtime/collective_rma_local.h
new file mode 100644
index 0000000000000000000000000000000000000000..d25dd5f04acb4814931f33bc8fa710ebdc215d68
--- /dev/null
+++ b/tensorflow/core/common_runtime/collective_rma_local.h
@@ -0,0 +1,88 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_COMMON_RUNTIME_COLLECTIVE_RMA_LOCAL_ACCESS_H_
+#define TENSORFLOW_COMMON_RUNTIME_COLLECTIVE_RMA_LOCAL_ACCESS_H_
+#include "tensorflow/core/common_runtime/buf_rendezvous.h"
+#include "tensorflow/core/common_runtime/device_mgr.h"
+#include "tensorflow/core/framework/collective.h"
+#include "tensorflow/core/framework/rendezvous.h"
+
+namespace tensorflow {
+
+// Basic implementation of PerStepCollectiveRemoteAccess.
+class CollectiveRemoteAccessLocal : public PerStepCollectiveRemoteAccess {
+ public:
+  CollectiveRemoteAccessLocal(const DeviceMgr* dev_mgr,
+                              DeviceResolverInterface* dev_resolver,
+                              int64 step_id)
+      : dev_mgr_(dev_mgr),
+        dev_resolver_(dev_resolver),
+        buf_rendezvous_(step_id),
+        step_id_(step_id) {}
+
+  virtual ~CollectiveRemoteAccessLocal() {}
+
+  void StartAbort(const Status& s);
+
+  void RecvFromPeer(const string& peer_device, const string& peer_task,
+                    bool peer_is_local, const string& key, Device* to_device,
+                    DeviceContext* to_device_ctx,
+                    const AllocatorAttributes& to_alloc_attr, Tensor* to_tensor,
+                    const DeviceLocality& client_locality,
+                    const StatusCallback& done) override;
+
+  void PostToPeer(const string& peer_device, const string& peer_task,
+                  const string& key, Device* from_device,
+                  DeviceContext* from_device_ctx,
+                  const AllocatorAttributes& from_alloc_attr,
+                  const Tensor* from_tensor,
+                  const DeviceLocality& client_locality,
+                  const StatusCallback& done) override;
+
+  void GetDeviceLocalitiesAsync(const CollInstanceParams& ci_params,
+                                std::vector<DeviceLocality>* localities,
+                                const StatusCallback& done) override {
+    dev_resolver_->GetDeviceLocalitiesAsync(ci_params, localities, done);
+  }
+
+  void GetLocalityAsync(const string& device, const string& task,
+                        DeviceLocality* locality,
+                        const StatusCallback& done) override {
+    dev_resolver_->GetLocalityAsync(device, task, locality, done);
+  }
+
+  void ClearTask(const string& task) override {
+    dev_resolver_->ClearTask(task);
+  }
+
+  // Copy utility that always copies bytes from src to dst even if
+  // they are on the same device, unlike CopyTensor::ViaDMA which will
+  // just change the dst buffer pointer in that case.
+  static void MemCpyAsync(DeviceContext* src_dev_ctx,
+                          DeviceContext* dst_dev_ctx, Device* src_dev,
+                          Device* dst_dev, const AllocatorAttributes& src_attr,
+                          const AllocatorAttributes& dst_attr,
+                          const Tensor* src, Tensor* dst,
+                          const StatusCallback& done);
+
+ protected:
+  const DeviceMgr* dev_mgr_;               // not owned
+  DeviceResolverInterface* dev_resolver_;  // not owned
+  BufRendezvous buf_rendezvous_;
+  int64 step_id_;
+};
+
+}  // namespace tensorflow
+#endif  // TENSORFLOW_COMMON_RUNTIME_COLLECTIVE_RMA_LOCAL_ACCESS_H_
diff --git a/tensorflow/core/common_runtime/collective_rma_local_test.cc b/tensorflow/core/common_runtime/collective_rma_local_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..dcd4272d96b5f855660509bf69de4585128f836c
--- /dev/null
+++ b/tensorflow/core/common_runtime/collective_rma_local_test.cc
@@ -0,0 +1,148 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/common_runtime/collective_rma_local.h"
+
+#include "tensorflow/core/common_runtime/buf_rendezvous.h"
+#include "tensorflow/core/common_runtime/collective_param_resolver_local.h"
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/common_runtime/device_factory.h"
+#include "tensorflow/core/common_runtime/device_mgr.h"
+#include "tensorflow/core/common_runtime/device_resolver_local.h"
+#include "tensorflow/core/common_runtime/dma_helper.h"
+#include "tensorflow/core/lib/core/notification.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/public/session_options.h"
+
+namespace tensorflow {
+namespace {
+
+#define NUM_DEVS 3
+static const int kStepId = 123;
+
+class CollectiveRemoteAccessLocalTest : public ::testing::Test {
+ protected:
+  const string kTaskName = "/job:localhost/replica:0/task:0";
+
+  CollectiveRemoteAccessLocalTest() {
+    ConfigProto cp;
+    SessionOptions options;
+    auto* device_count = options.config.mutable_device_count();
+    device_count->insert({"CPU", NUM_DEVS});
+    TF_CHECK_OK(DeviceFactory::AddDevices(options, kTaskName, &devices_));
+    device_mgr_.reset(new DeviceMgr(devices_));
+    drl_.reset(new DeviceResolverLocal(device_mgr_.get()));
+    prl_.reset(new CollectiveParamResolverLocal(device_mgr_.get(), drl_.get(),
+                                                kTaskName));
+    rma_.reset(new CollectiveRemoteAccessLocal(device_mgr_.get(), drl_.get(),
+                                               kStepId));
+  }
+
+  std::vector<Device*> devices_;
+  std::unique_ptr<DeviceMgr> device_mgr_;
+  std::unique_ptr<DeviceResolverLocal> drl_;
+  std::unique_ptr<CollectiveParamResolverLocal> prl_;
+  std::unique_ptr<CollectiveRemoteAccessLocal> rma_;
+};
+
+TEST_F(CollectiveRemoteAccessLocalTest, PostRecvCPU0) {
+  Device* cpu0 = nullptr;
+  AllocatorAttributes attr;
+  DeviceLocality dev_locality;
+  TF_ASSERT_OK(device_mgr_->LookupDevice(kTaskName + "/device:CPU:0", &cpu0));
+  Tensor sink_tensor(DT_FLOAT, TensorShape({8}));
+  Notification recv_note;
+  Status recv_status;
+  rma_->RecvFromPeer(kTaskName + "/device:CPU:0", kTaskName, true /*is_local*/,
+                     "key_0", cpu0 /*to_device*/, nullptr /*to_device_ctx*/,
+                     attr /*to_alloc_attr*/, &sink_tensor, dev_locality,
+                     [this, &recv_note, &recv_status](const Status& s) {
+                       recv_status = s;
+                       recv_note.Notify();
+                     });
+  Tensor source_tensor(DT_FLOAT, TensorShape({8}));
+  for (int i = 0; i < 8; ++i) {
+    source_tensor.flat<float>()(i) = i / 2;
+  }
+  // Tensors have distinct storage.
+  EXPECT_NE(DMAHelper::base(&source_tensor), DMAHelper::base(&sink_tensor));
+  Notification send_note;
+  Status send_status;
+  rma_->PostToPeer(kTaskName + "/device:CPU:0", kTaskName, "key_0",
+                   cpu0 /*from_device*/, nullptr /*from_device_ctx*/,
+                   attr /*to_alloc_attr*/, &source_tensor, dev_locality,
+                   [this, &send_note, &send_status](const Status& s) {
+                     send_status = s;
+                     send_note.Notify();
+                   });
+  recv_note.WaitForNotification();
+  send_note.WaitForNotification();
+  TF_EXPECT_OK(recv_status);
+  TF_EXPECT_OK(send_status);
+  // Sink tensor gets the source tensor values.
+  for (int i = 0; i < 8; ++i) {
+    EXPECT_EQ(sink_tensor.flat<float>()(i), i / 2);
+  }
+  // And still has distinct storage.
+  EXPECT_NE(DMAHelper::base(&source_tensor), DMAHelper::base(&sink_tensor));
+}
+
+TEST_F(CollectiveRemoteAccessLocalTest, PostRecvCPU1_2) {
+  Device* cpu2 = nullptr;
+  AllocatorAttributes attr;
+  DeviceLocality dev_locality;
+  TF_ASSERT_OK(device_mgr_->LookupDevice(kTaskName + "/device:CPU:2", &cpu2));
+  Tensor sink_tensor(DT_FLOAT, TensorShape({8}));
+  Notification recv_note;
+  Status recv_status;
+  rma_->RecvFromPeer(kTaskName + "/device:CPU:1", kTaskName, true /*is_local*/,
+                     "key_0", cpu2 /*to_device*/, nullptr /*to_device_ctx*/,
+                     attr /*to_alloc_attr*/, &sink_tensor, dev_locality,
+                     [this, &recv_note, &recv_status](const Status& s) {
+                       recv_status = s;
+                       recv_note.Notify();
+                     });
+  Tensor source_tensor(DT_FLOAT, TensorShape({8}));
+  for (int i = 0; i < 8; ++i) {
+    source_tensor.flat<float>()(i) = i / 2;
+  }
+  // Tensors have distinct storage.
+  EXPECT_NE(DMAHelper::base(&source_tensor), DMAHelper::base(&sink_tensor));
+  Device* cpu1 = nullptr;
+  TF_ASSERT_OK(device_mgr_->LookupDevice(kTaskName + "/device:CPU:1", &cpu1));
+  Notification send_note;
+  Status send_status;
+  rma_->PostToPeer(kTaskName + "/device:CPU:2", kTaskName, "key_0",
+                   cpu1 /*from_device*/, nullptr /*from_device_ctx*/,
+                   attr /*to_alloc_attr*/, &source_tensor, dev_locality,
+                   [this, &send_note, &send_status](const Status& s) {
+                     send_status = s;
+                     send_note.Notify();
+                   });
+  recv_note.WaitForNotification();
+  send_note.WaitForNotification();
+  TF_EXPECT_OK(recv_status);
+  TF_EXPECT_OK(send_status);
+  // Sink tensor gets the source tensor values.
+  for (int i = 0; i < 8; ++i) {
+    EXPECT_EQ(sink_tensor.flat<float>()(i), i / 2);
+  }
+  // And still has distinct storage.
+  EXPECT_NE(DMAHelper::base(&source_tensor), DMAHelper::base(&sink_tensor));
+}
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/device_resolver_local.cc b/tensorflow/core/common_runtime/device_resolver_local.cc
new file mode 100644
index 0000000000000000000000000000000000000000..17ef4a228449560c619b51dabbac0e67ec1d4db8
--- /dev/null
+++ b/tensorflow/core/common_runtime/device_resolver_local.cc
@@ -0,0 +1,49 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/common_runtime/device_resolver_local.h"
+
+#include "tensorflow/core/common_runtime/device_mgr.h"
+
+namespace tensorflow {
+
+void DeviceResolverLocal::GetDeviceLocalitiesAsync(
+    const CollInstanceParams& ci_params,
+    std::vector<DeviceLocality>* localities, const StatusCallback& done) {
+  localities->clear();
+  for (const string& device_name : ci_params.device_names) {
+    Device* dev;
+    Status s = dev_mgr_->LookupDevice(device_name, &dev);
+    if (!s.ok()) {
+      done(s);
+      return;
+    }
+    localities->push_back(dev->attributes().locality());
+  }
+  done(Status::OK());
+}
+
+void DeviceResolverLocal::GetLocalityAsync(const string& device,
+                                           const string& task,
+                                           DeviceLocality* locality,
+                                           const StatusCallback& done) {
+  Device* dev;
+  Status s = dev_mgr_->LookupDevice(device, &dev);
+  if (s.ok()) {
+    *locality = dev->attributes().locality();
+  }
+  done(s);
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/device_resolver_local.h b/tensorflow/core/common_runtime/device_resolver_local.h
new file mode 100644
index 0000000000000000000000000000000000000000..098eccdf842ea754c445e9cb83a2b270ec82e386
--- /dev/null
+++ b/tensorflow/core/common_runtime/device_resolver_local.h
@@ -0,0 +1,48 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_COMMON_RUNTIME_DEVICE_RESOLVER_LOCAL_H_
+#define TENSORFLOW_COMMON_RUNTIME_DEVICE_RESOLVER_LOCAL_H_
+
+#include <string>
+
+#include "tensorflow/core/framework/collective.h"
+#include "tensorflow/core/framework/device_attributes.pb.h"
+
+namespace tensorflow {
+class DeviceMgr;
+
+// Implements DeviceResolverInterface in a single-task context.
+class DeviceResolverLocal : public DeviceResolverInterface {
+ public:
+  DeviceResolverLocal(const DeviceMgr* dev_mgr) : dev_mgr_(dev_mgr) {}
+
+  virtual ~DeviceResolverLocal() {}
+
+  void GetDeviceLocalitiesAsync(const CollInstanceParams& ci_params,
+                                std::vector<DeviceLocality>* localities,
+                                const StatusCallback& done) override;
+
+  void GetLocalityAsync(const string& device, const string& task,
+                        DeviceLocality* locality,
+                        const StatusCallback& done) override;
+
+  void ClearTask(const string& task) override {}
+
+ protected:
+  const DeviceMgr* dev_mgr_;
+};
+
+}  // namespace tensorflow
+#endif  // TENSORFLOW_COMMON_RUNTIME_DEVICE_RESOLVER_LOCAL_H_
diff --git a/tensorflow/core/common_runtime/device_resolver_local_test.cc b/tensorflow/core/common_runtime/device_resolver_local_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f5a6471ff731578d377ccfc9ad146847ae3f221c
--- /dev/null
+++ b/tensorflow/core/common_runtime/device_resolver_local_test.cc
@@ -0,0 +1,87 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/common_runtime/device_resolver_local.h"
+
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/common_runtime/device_factory.h"
+#include "tensorflow/core/common_runtime/device_mgr.h"
+#include "tensorflow/core/lib/core/notification.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/public/session_options.h"
+
+namespace tensorflow {
+namespace {
+
+#define NUM_DEVS 3
+
+class DeviceResolverLocalTest : public ::testing::Test {
+ protected:
+  DeviceResolverLocalTest() {
+    ConfigProto cp;
+    SessionOptions options;
+    string task_name = "/job:localhost/replica:0/task:0";
+    auto* device_count = options.config.mutable_device_count();
+    device_count->insert({"CPU", NUM_DEVS});
+    TF_CHECK_OK(DeviceFactory::AddDevices(options, task_name, &devices_));
+    device_mgr_.reset(new DeviceMgr(devices_));
+    drl_.reset(new DeviceResolverLocal(device_mgr_.get()));
+  }
+
+  std::vector<Device*> devices_;
+  std::unique_ptr<DeviceMgr> device_mgr_;
+  std::unique_ptr<DeviceResolverLocal> drl_;
+};
+
+TEST_F(DeviceResolverLocalTest, GetDeviceLocalitiesKnown) {
+  CollectiveParams cp;
+  std::vector<DeviceLocality> localities;
+  cp.instance.device_names.push_back(
+      "/job:localhost/replica:0/task:0/device:CPU:1");
+  cp.instance.device_names.push_back(
+      "/job:localhost/replica:0/task:0/device:CPU:2");
+  Notification note;
+  Status status;
+  drl_->GetDeviceLocalitiesAsync(cp.instance, &localities,
+                                 [this, &note, &status](const Status& s) {
+                                   status = s;
+                                   note.Notify();
+                                 });
+  note.WaitForNotification();
+  TF_EXPECT_OK(status);
+  EXPECT_EQ(2, localities.size());
+}
+
+TEST_F(DeviceResolverLocalTest, GetDeviceLocalitiesUnknown) {
+  CollectiveParams cp;
+  std::vector<DeviceLocality> localities;
+  // In some builds there may be 1 GPU, but there should never be 9.
+  cp.instance.device_names.push_back(
+      "/job:localhost/replica:0/task:0/device:GPU:9");
+  Notification note;
+  Status status;
+  drl_->GetDeviceLocalitiesAsync(cp.instance, &localities,
+                                 [this, &note, &status](const Status& s) {
+                                   status = s;
+                                   note.Notify();
+                                 });
+  note.WaitForNotification();
+  EXPECT_FALSE(status.ok());
+  EXPECT_EQ(0, localities.size());
+}
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/direct_session.cc b/tensorflow/core/common_runtime/direct_session.cc
index 25cfb9e524cd12c92fc5edb01f0d4bed64fb872f..0479061daffb5b73d21f351a6ae3dd1108f21888 100644
--- a/tensorflow/core/common_runtime/direct_session.cc
+++ b/tensorflow/core/common_runtime/direct_session.cc
@@ -27,6 +27,7 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/graph_optimizer.h"
 #include "tensorflow/core/common_runtime/memory_types.h"
 #include "tensorflow/core/common_runtime/optimization_registry.h"
+#include "tensorflow/core/common_runtime/process_util.h"
 #include "tensorflow/core/common_runtime/step_stats_collector.h"
 #include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/framework/graph.pb_text.h"
@@ -69,20 +70,6 @@ auto* direct_session_runs = monitoring::Counter<0>::New(
     "/tensorflow/core/direct_session_runs",
     "The number of times DirectSession::Run() has been called.");
 
-int32 NumInterOpThreadsFromSessionOptions(const SessionOptions& options) {
-  const int32 t = options.config.inter_op_parallelism_threads();
-  if (t != 0) return t;
-  // Default to using the number of cores available in the process.
-  return port::NumSchedulableCPUs();
-}
-
-thread::ThreadPool* NewThreadPoolFromSessionOptions(
-    const SessionOptions& options) {
-  const int32 num_threads = NumInterOpThreadsFromSessionOptions(options);
-  VLOG(1) << "Direct session inter op parallelism threads: " << num_threads;
-  return new thread::ThreadPool(options.env, "Compute", num_threads);
-}
-
 Status NewThreadPoolFromThreadPoolOptions(
     const SessionOptions& options,
     const ThreadPoolOptionProto& thread_pool_options, int pool_number,
diff --git a/tensorflow/core/common_runtime/direct_session_test.cc b/tensorflow/core/common_runtime/direct_session_test.cc
index ee3896061858bd65d03171b97cae0ec850f82ad9..8ddc9958b2259f4da6dc1750c6c79a706c804be8 100644
--- a/tensorflow/core/common_runtime/direct_session_test.cc
+++ b/tensorflow/core/common_runtime/direct_session_test.cc
@@ -39,6 +39,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
 #include "tensorflow/core/protobuf/rewriter_config.pb.h"
@@ -80,6 +81,7 @@ class DirectSessionMinusAXTest : public ::testing::Test {
     test::FillValues<float>(&a_tensor, a_values);
     Node* a = test::graph::Constant(&graph, a_tensor);
     a->set_assigned_device_name("/job:localhost/replica:0/task:0/cpu:0");
+    a_ = a->name();
 
     Tensor x_tensor(DT_FLOAT, TensorShape({2, 1}));
     test::FillValues<float>(&x_tensor, {1, 1});
@@ -96,12 +98,18 @@ class DirectSessionMinusAXTest : public ::testing::Test {
     y_neg_ = y_neg->name();
     y_neg->set_assigned_device_name("/job:localhost/replica:0/task:0/cpu:1");
 
+    Node* z = test::graph::Unary(&graph, "Identity", y_neg);
+    z_ = z->name();
+    z->set_assigned_device_name("/job:localhost/replica:0/task:0/cpu:1");
+
     test::graph::ToGraphDef(&graph, &def_);
   }
 
+  string a_;
   string x_;
   string y_;
   string y_neg_;
+  string z_;
   GraphDef def_;
 };
 
@@ -132,7 +140,6 @@ TEST_F(DirectSessionMinusAXTest, RunSimpleNetwork_Callable) {
   auto session = CreateSession();
   ASSERT_TRUE(session != nullptr);
   TF_ASSERT_OK(session->Create(def_));
-  std::vector<std::pair<string, Tensor>> inputs;
 
   // Run the test twice to ensure that the Make/Run/Release cycle is hermetic.
   for (int i = 0; i < 2; ++i) {
@@ -155,22 +162,175 @@ TEST_F(DirectSessionMinusAXTest, RunSimpleNetwork_Callable) {
 
     Status s = session->RunCallable(handle, {}, nullptr, nullptr);
     EXPECT_TRUE(errors::IsInvalidArgument(s));
-    EXPECT_TRUE(StringPiece(s.error_message())
-                    .contains("`fetch_tensors` must be provided"));
+    EXPECT_TRUE(str_util::StrContains(s.error_message(),
+                                      "`fetch_tensors` must be provided"));
 
     TF_ASSERT_OK(session->ReleaseCallable(handle));
 
     std::vector<Tensor> outputs;
     s = session->RunCallable(handle, {}, &outputs, nullptr);
     EXPECT_TRUE(errors::IsInvalidArgument(s));
-    EXPECT_TRUE(
-        StringPiece(s.error_message())
-            .contains("Attempted to run callable after handle was released"));
+    EXPECT_TRUE(str_util::StrContains(
+        s.error_message(),
+        "Attempted to run callable after handle was released"));
 
     s = session->RunCallable(handle + 1, {}, &outputs, nullptr);
     EXPECT_TRUE(errors::IsInvalidArgument(s));
     EXPECT_TRUE(
-        StringPiece(s.error_message()).contains("No such callable handle"));
+        str_util::StrContains(s.error_message(), "No such callable handle"));
+  }
+}
+
+TEST_F(DirectSessionMinusAXTest, TestTensorConnection) {
+  Initialize({3, 2, -1, 0});
+  auto session = CreateSession();
+  ASSERT_TRUE(session != nullptr);
+  TF_ASSERT_OK(session->Create(def_));
+
+  {
+    // Directly wire the output of node a to the output of node y, making the
+    // callable graph into "Neg(a);".
+    CallableOptions callable_options;
+    TensorConnection* c = callable_options.add_tensor_connection();
+    c->set_from_tensor(a_ + ":0");
+    c->set_to_tensor(y_ + ":0");
+    callable_options.add_fetch(y_neg_ + ":0");
+
+    Session::CallableHandle handle;
+    TF_ASSERT_OK(session->MakeCallable(callable_options, &handle));
+    std::vector<Tensor> outputs;
+    TF_ASSERT_OK(session->RunCallable(handle, {}, &outputs, nullptr));
+    ASSERT_EQ(1, outputs.size());
+    auto mat = outputs[0].matrix<float>();
+    ASSERT_TRUE(outputs[0].IsInitialized());
+    EXPECT_FLOAT_EQ(-3.0, mat(0, 0));
+    EXPECT_FLOAT_EQ(-2.0, mat(0, 1));
+    EXPECT_FLOAT_EQ(1.0, mat(1, 0));
+    EXPECT_FLOAT_EQ(0.0, mat(1, 1));
+    TF_ASSERT_OK(session->ReleaseCallable(handle));
+  }
+
+  {
+    // Directly wire the output of node a to the output of node y, making the
+    // callable graph into "Neg(a);"; also fetch the result of a.
+    CallableOptions callable_options;
+    TensorConnection* c = callable_options.add_tensor_connection();
+    c->set_from_tensor(a_ + ":0");
+    c->set_to_tensor(y_ + ":0");
+    callable_options.add_fetch(a_ + ":0");
+    callable_options.add_fetch(y_neg_ + ":0");
+
+    Session::CallableHandle handle;
+    TF_ASSERT_OK(session->MakeCallable(callable_options, &handle));
+    std::vector<Tensor> outputs;
+    TF_ASSERT_OK(session->RunCallable(handle, {}, &outputs, nullptr));
+    ASSERT_EQ(2, outputs.size());
+    auto mat_a = outputs[0].matrix<float>();
+    ASSERT_TRUE(outputs[0].IsInitialized());
+    EXPECT_FLOAT_EQ(3.0, mat_a(0, 0));
+    EXPECT_FLOAT_EQ(2.0, mat_a(0, 1));
+    EXPECT_FLOAT_EQ(-1.0, mat_a(1, 0));
+    EXPECT_FLOAT_EQ(0.0, mat_a(1, 1));
+
+    auto mat_y_neg = outputs[1].matrix<float>();
+    ASSERT_TRUE(outputs[1].IsInitialized());
+    EXPECT_FLOAT_EQ(-3.0, mat_y_neg(0, 0));
+    EXPECT_FLOAT_EQ(-2.0, mat_y_neg(0, 1));
+    EXPECT_FLOAT_EQ(1.0, mat_y_neg(1, 0));
+    EXPECT_FLOAT_EQ(0.0, mat_y_neg(1, 1));
+    TF_ASSERT_OK(session->ReleaseCallable(handle));
+  }
+
+  {
+    // Wire the output of "Neg(Matmul(a, x))" to the output of "a",
+    // creating an invalid cycle.
+    CallableOptions callable_options;
+    TensorConnection* c = callable_options.add_tensor_connection();
+    c->set_from_tensor(y_ + ":0");
+    c->set_to_tensor(a_ + ":0");
+    callable_options.add_fetch(y_ + ":0");
+
+    Session::CallableHandle handle;
+    Status s = session->MakeCallable(callable_options, &handle);
+    EXPECT_TRUE(errors::IsInvalidArgument(s));
+    EXPECT_TRUE(
+        str_util::StrContains(s.error_message(), "would create a cycle"));
+  }
+
+  {
+    // Attempt to wire a non-existent node to a node that does exist.
+    CallableOptions callable_options;
+    TensorConnection* c = callable_options.add_tensor_connection();
+    c->set_from_tensor("unknown_node:0");
+    c->set_to_tensor(y_ + ":0");
+    callable_options.add_fetch(y_ + ":0");
+
+    Session::CallableHandle handle;
+    Status s = session->MakeCallable(callable_options, &handle);
+    EXPECT_TRUE(errors::IsInvalidArgument(s));
+    EXPECT_TRUE(str_util::StrContains(s.error_message(), "unknown node"));
+  }
+
+  {
+    // Attempt to wire a non-existent output from a node that does
+    // exist to another node.
+    CallableOptions callable_options;
+    TensorConnection* c = callable_options.add_tensor_connection();
+    c->set_from_tensor(a_ + ":17");
+    c->set_to_tensor(y_ + ":0");
+    callable_options.add_fetch(y_ + ":0");
+
+    Session::CallableHandle handle;
+    Status s = session->MakeCallable(callable_options, &handle);
+    EXPECT_TRUE(errors::IsInvalidArgument(s));
+    EXPECT_TRUE(str_util::StrContains(s.error_message(), "unknown edge"));
+  }
+
+  {
+    // Attempt to wire a tensor to a node that doesn't exist.
+    CallableOptions callable_options;
+    TensorConnection* c = callable_options.add_tensor_connection();
+    c->set_from_tensor(a_ + ":0");
+    c->set_to_tensor("unknown_node:0");
+    callable_options.add_fetch(y_ + ":0");
+
+    Session::CallableHandle handle;
+    Status s = session->MakeCallable(callable_options, &handle);
+    EXPECT_TRUE(errors::IsNotFound(s));
+    EXPECT_TRUE(
+        str_util::StrContains(s.error_message(), "unable to find feed output"));
+  }
+
+  {
+    // Attempt to wire two tensors to the same tensor.
+    CallableOptions callable_options;
+    TensorConnection* c1 = callable_options.add_tensor_connection();
+    c1->set_from_tensor(a_ + ":0");
+    c1->set_to_tensor(y_neg_ + ":0");
+    TensorConnection* c2 = callable_options.add_tensor_connection();
+    c2->set_from_tensor(x_ + ":0");
+    c2->set_to_tensor(y_neg_ + ":0");
+    callable_options.add_fetch(z_ + ":0");
+
+    Session::CallableHandle handle;
+    Status s = session->MakeCallable(callable_options, &handle);
+    EXPECT_TRUE(errors::IsInvalidArgument(s));
+    EXPECT_TRUE(str_util::StrContains(s.error_message(), "fed more than once"));
+  }
+
+  {
+    // Attempt to wire a tensor to a tensor that is also being fed.
+    CallableOptions callable_options;
+    TensorConnection* c = callable_options.add_tensor_connection();
+    c->set_from_tensor(a_ + ":0");
+    c->set_to_tensor(y_ + ":0");
+    callable_options.add_feed(y_ + ":0");
+    callable_options.add_fetch(y_neg_ + ":0");
+
+    Session::CallableHandle handle;
+    Status s = session->MakeCallable(callable_options, &handle);
+    EXPECT_TRUE(errors::IsInvalidArgument(s));
+    EXPECT_TRUE(str_util::StrContains(s.error_message(), "fed more than once"));
   }
 }
 
@@ -567,7 +727,7 @@ TEST(DirectSessionTest, MultipleFeedTest) {
       {first_identity->name() + ":0", second_identity->name() + ":0"}, {},
       &outputs);
   EXPECT_TRUE(errors::IsInvalidArgument(s));
-  EXPECT_TRUE(StringPiece(s.error_message()).contains("fed more than once"));
+  EXPECT_TRUE(str_util::StrContains(s.error_message(), "fed more than once"));
 }
 
 TEST(DirectSessionTest, MultipleFeedTest_Callable) {
@@ -650,7 +810,56 @@ TEST(DirectSessionTest, MultipleFeedTest_Callable) {
           {first_identity->name() + ":0", second_identity->name() + ":0"}, {}),
       &handle);
   EXPECT_TRUE(errors::IsInvalidArgument(s));
-  EXPECT_TRUE(StringPiece(s.error_message()).contains("fed more than once"));
+  EXPECT_TRUE(str_util::StrContains(s.error_message(), "fed more than once"));
+}
+
+TEST(DirectSessionTest, TestTensorConnectionUseTwice) {
+  Graph graph(OpRegistry::Global());
+
+  Tensor a_tensor(DT_FLOAT, TensorShape({2, 2}));
+  test::FillValues<float>(&a_tensor, {1.0, 2.0, 3.0, 4.0});
+  Node* a = test::graph::Constant(&graph, a_tensor);
+
+  Tensor dummy_tensor(DT_FLOAT, TensorShape({1}));
+  test::FillValues<float>(&dummy_tensor, {-1.0});
+
+  Node* left = test::graph::Constant(&graph, dummy_tensor);
+  Node* right = test::graph::Constant(&graph, dummy_tensor);
+
+  // y = A * x
+  Node* y = test::graph::Add(&graph, left, right);
+
+  GraphDef def;
+  test::graph::ToGraphDef(&graph, &def);
+
+  auto session = CreateSession();
+  ASSERT_TRUE(session != nullptr);
+  TF_ASSERT_OK(session->Create(def));
+
+  CallableOptions callable_options;
+  // Directly wire the output of node a to the outputs of nodes left
+  // and right, making the callable graph into "a + a;".
+  TensorConnection* c_left = callable_options.add_tensor_connection();
+  c_left->set_from_tensor(a->name() + ":0");
+  c_left->set_to_tensor(left->name() + ":0");
+  TensorConnection* c_right = callable_options.add_tensor_connection();
+  c_right->set_from_tensor(a->name() + ":0");
+  c_right->set_to_tensor(right->name() + ":0");
+
+  callable_options.add_fetch(y->name() + ":0");
+
+  Session::CallableHandle handle;
+  TF_ASSERT_OK(session->MakeCallable(callable_options, &handle));
+  std::vector<Tensor> outputs;
+  TF_ASSERT_OK(session->RunCallable(handle, {}, &outputs, nullptr));
+  ASSERT_EQ(1, outputs.size());
+  auto mat = outputs[0].matrix<float>();
+  ASSERT_TRUE(outputs[0].IsInitialized());
+  EXPECT_FLOAT_EQ(2.0, mat(0, 0));
+  EXPECT_FLOAT_EQ(4.0, mat(0, 1));
+  EXPECT_FLOAT_EQ(6.0, mat(1, 0));
+  EXPECT_FLOAT_EQ(8.0, mat(1, 1));
+  TF_ASSERT_OK(session->ReleaseCallable(handle));
 }
 
 TEST(DirectSessionTest, FetchMultipleTimes) {
@@ -845,8 +1054,8 @@ TEST(DirectSessionTest, PartialRunMissingFeed) {
   s = session->PRun(handle, {{first_const->name(), value_11}},
                     {third_identity->name() + ":0"}, &outputs);
   ASSERT_TRUE(errors::IsInvalidArgument(s));
-  EXPECT_TRUE(StringPiece(s.error_message())
-                  .contains("can't be computed from the feeds"));
+  EXPECT_TRUE(str_util::StrContains(s.error_message(),
+                                    "can't be computed from the feeds"));
 }
 
 TEST(DirectSessionTest, PartialRunMultiOutputFeed) {
@@ -875,8 +1084,8 @@ TEST(DirectSessionTest, PartialRunMultiOutputFeed) {
   // Fetch fourth_identity without feeds.
   s = session->PRun(handle, {}, {fourth_identity->name() + ":0"}, &outputs);
   ASSERT_TRUE(errors::IsInvalidArgument(s));
-  EXPECT_TRUE(StringPiece(s.error_message())
-                  .contains("can't be computed from the feeds"));
+  EXPECT_TRUE(str_util::StrContains(s.error_message(),
+                                    "can't be computed from the feeds"));
 
   // Feed switch_node:1 and fetch fourth_identity.
   s = session->PRun(handle, {{switch_node->name() + ":1", bool_value}},
diff --git a/tensorflow/core/common_runtime/eager/BUILD b/tensorflow/core/common_runtime/eager/BUILD
index a619cac9a47b8bd464e9688143cffaea41b50c93..941a0e61c75cbf807636a5feed65339bffef4338 100644
--- a/tensorflow/core/common_runtime/eager/BUILD
+++ b/tensorflow/core/common_runtime/eager/BUILD
@@ -135,17 +135,34 @@ tf_cc_test(
     ],
 )
 
-# -----------------------------------------------------------------------------
-# Google-internal targets.
+cc_library(
+    name = "execute",
+    srcs = ["execute.cc"],
+    hdrs = ["execute.h"],
+    deps = [
+        ":context",
+        ":copy_to_device_node",
+        ":kernel_and_device",
+        ":tensor_handle",
+        "//tensorflow/core:core_cpu_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+    ],
+)
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
+cc_library(
+    name = "execute_node",
+    hdrs = ["execute_node.h"],
+    deps = [
+        ":context",
+        ":eager_executor",
+        ":execute",
+        ":kernel_and_device",
+        ":tensor_handle",
+        "//tensorflow/core:core_cpu_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+    ],
 )
diff --git a/tensorflow/core/common_runtime/eager/context.cc b/tensorflow/core/common_runtime/eager/context.cc
index 0566329f185e7871f020b395890578e0084f3b8f..d3fe6a7edeabecdaba0d894cca700608b896026d 100644
--- a/tensorflow/core/common_runtime/eager/context.cc
+++ b/tensorflow/core/common_runtime/eager/context.cc
@@ -15,32 +15,22 @@ limitations under the License.
 
 #include "tensorflow/core/common_runtime/eager/context.h"
 
-namespace tensorflow {
+#include "tensorflow/core/common_runtime/process_util.h"
 
-ContextDevicePlacementPolicy PlacementPolicy(
-    bool soft_placement, ContextDevicePlacementPolicy original_policy) {
-  if (!soft_placement) {
-    return original_policy;
-  }
-  if (original_policy == DEVICE_PLACEMENT_EXPLICIT ||
-      original_policy == DEVICE_PLACEMENT_SILENT_FOR_INT32) {
-    return DEVICE_PLACEMENT_SILENT;
-  }
-  return original_policy;
-}
+namespace tensorflow {
 
 EagerContext::EagerContext(const SessionOptions& opts,
                            ContextDevicePlacementPolicy default_policy,
                            bool async, std::unique_ptr<DeviceMgr> device_mgr,
                            Rendezvous* rendezvous)
-    : soft_placement_(opts.config.allow_soft_placement()),
-      policy_(PlacementPolicy(soft_placement_, default_policy)),
+    : policy_(default_policy),
       device_manager_(std::move(device_mgr)),
       devices_(device_manager_->ListDevices()),
       rendezvous_(rendezvous),
-      pflr_(new ProcessFunctionLibraryRuntime(device_manager_.get(), opts.env,
-                                              TF_GRAPH_DEF_VERSION,
-                                              &func_lib_def_, {})),
+      thread_pool_(NewThreadPoolFromSessionOptions(opts)),
+      pflr_(new ProcessFunctionLibraryRuntime(
+          device_manager_.get(), opts.env, TF_GRAPH_DEF_VERSION, &func_lib_def_,
+          {}, thread_pool_.get())),
       log_device_placement_(opts.config.log_device_placement()),
       async_default_(async) {
   if (async_default_) {
diff --git a/tensorflow/core/common_runtime/eager/context.h b/tensorflow/core/common_runtime/eager/context.h
index bc97219dae532f058b63515be1ec24fcedf9bf10..6665df27d09a73d4c30756cf01e383834fcae339 100644
--- a/tensorflow/core/common_runtime/eager/context.h
+++ b/tensorflow/core/common_runtime/eager/context.h
@@ -30,6 +30,7 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/rendezvous_mgr.h"
 #include "tensorflow/core/framework/rendezvous.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
 #include "tensorflow/core/lib/gtl/map_util.h"
 #include "tensorflow/core/lib/gtl/stl_util.h"
@@ -43,23 +44,18 @@ namespace tensorflow {
 
 // Note: there's a copy enum in eager/c_api.h. It should be kept in sync.
 enum ContextDevicePlacementPolicy {
-  // Running operations with input tensors on the wrong device will fail. When
-  // soft placement is enabled acts like TFE_DEVICE_PLACEMENT_SILENT.
+  // Running operations with input tensors on the wrong device will fail.
   DEVICE_PLACEMENT_EXPLICIT = 0,
   // Copy the tensor to the right device but log a warning.
   DEVICE_PLACEMENT_WARN = 1,
-  // Silently copy the tensor, which has a performance cost since the
-  // operation will be blocked till the copy completes.
+  // Silently copy the tensor, which has a performance cost since the operation
+  // will be blocked till the copy completes. This is the default policy.
   DEVICE_PLACEMENT_SILENT = 2,
   // Default placement policy which silently copies int32 tensors but not other
-  // dtypes.  When soft placement is enabled acts like
-  // TFE_DEVICE_PLACEMENT_SILENT.
+  // dtypes.
   DEVICE_PLACEMENT_SILENT_FOR_INT32 = 3,
 };
 
-ContextDevicePlacementPolicy PlacementPolicy(
-    bool soft_placement, ContextDevicePlacementPolicy original_policy);
-
 class EagerContext {
  public:
   explicit EagerContext(const SessionOptions& opts,
@@ -116,8 +112,6 @@ class EagerContext {
 
   Device* HostCPU() { return devices_[0]; }
 
-  bool SoftPlacement() { return soft_placement_; }
-
   uint64 NextId() { return executor_.NextId(); }
 
   void ExecutorAdd(EagerNode* node) { executor_.Add(node); }
@@ -148,7 +142,6 @@ class EagerContext {
   FunctionLibraryDefinition* FuncLibDef() { return &func_lib_def_; }
 
  private:
-  const bool soft_placement_;
   const ContextDevicePlacementPolicy policy_;
 
   // Note: we cannot use C++11 thread_local here as there is no concept of a
@@ -168,6 +161,8 @@ class EagerContext {
   FunctionLibraryDefinition func_lib_def_ GUARDED_BY(functions_mu_){
       OpRegistry::Global(), {}};
 
+  std::unique_ptr<thread::ThreadPool> thread_pool_;
+
   // One FunctionLibraryRuntime per device.
   // func_libs[i] is the FunctionLibraryRuntime corresponding to
   // session->devices[i].
diff --git a/tensorflow/core/common_runtime/eager/execute.cc b/tensorflow/core/common_runtime/eager/execute.cc
new file mode 100644
index 0000000000000000000000000000000000000000..98e8471102b5e0a8813f97d5f99f9a383f0b6225
--- /dev/null
+++ b/tensorflow/core/common_runtime/eager/execute.cc
@@ -0,0 +1,130 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/common_runtime/eager/execute.h"
+
+#include <vector>
+
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/common_runtime/eager/context.h"
+#include "tensorflow/core/common_runtime/eager/copy_to_device_node.h"
+#include "tensorflow/core/common_runtime/eager/kernel_and_device.h"
+#include "tensorflow/core/common_runtime/eager/tensor_handle.h"
+#include "tensorflow/core/framework/step_stats.pb.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/gtl/inlined_vector.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/mutex.h"
+
+namespace tensorflow {
+
+Status EagerExecute(EagerContext* ctx, Device* device,
+                    const gtl::InlinedVector<TensorHandle*, 4>& op_inputs,
+                    KernelAndDevice* kernel, NodeExecStats* maybe_stats,
+                    TensorHandle** retvals, int num_retvals) {
+  if (device == nullptr) {
+    // TODO(apassos) debug how the assignment below might return a different
+    // device from the one requested above.
+    device = kernel->device();
+  }
+
+  std::vector<Tensor> outputs(1);
+  const MemoryTypeVector* output_memory_types = nullptr;
+  output_memory_types = &kernel->kernel()->output_memory_types();
+  std::vector<Tensor> inputs(op_inputs.size());
+  for (int i = 0; i < op_inputs.size(); ++i) {
+    const Tensor* input_tensor = nullptr;
+    TF_RETURN_IF_ERROR(op_inputs[i]->Tensor(&input_tensor));
+    inputs[i] = *input_tensor;
+  }
+  // WARNING: kernel->Run utilizes the FunctionLibraryRuntime
+  // (ctx->func_lib(device)), which in turn holds a pointer to func_lib_def.
+  // But knowledge of the implementation
+  // of FunctionLibraryRuntime tells us that func_lib_def is not accessed by
+  // FunctionLibraryRuntime::Run(), so there is no thread-safety concern here.
+  // This is quite subtle. Re-work things to make this better?  (Would it make
+  // sense for FunctionLibraryRuntime to ensure thread-safe access to
+  // FunctionLibraryDefinition?).  TODO(apassos) figure out how to record stats
+  // for ops which are a part of functions.
+  // TODO(agarwal): change Run to take vector of handles ?
+  TF_RETURN_IF_ERROR(kernel->Run(&inputs, &outputs, maybe_stats));
+  if (maybe_stats != nullptr) {
+    maybe_stats->set_op_end_rel_micros(Env::Default()->NowMicros() -
+                                       maybe_stats->all_start_micros());
+    mutex_lock ml(*ctx->MetadataMu());
+    if (ctx->ShouldStoreMetadata()) {
+      auto* step_stats = ctx->RunMetadataProto()->mutable_step_stats();
+      // Lazily initialize the RunMetadata with information about all devices if
+      // this is the first call.
+      while (step_stats->dev_stats_size() < ctx->devices()->size()) {
+        step_stats->add_dev_stats();
+      }
+      // Find the current device's index.
+      int device_idx = 0;
+      for (int i = 0; i < ctx->devices()->size(); ++i) {
+        if (ctx->devices()->at(i) == device) {
+          device_idx = i;
+          break;
+        }
+      }
+      // Populate the device stats for this device.
+      auto* dev_stats = step_stats->mutable_dev_stats(device_idx);
+      dev_stats->set_device(device->name());
+      *dev_stats->add_node_stats() = *maybe_stats;
+    }
+  }
+  DCHECK_EQ(num_retvals, outputs.size());
+  Device* op_device = device;
+  for (int i = 0; i < num_retvals; ++i) {
+    Device* d = op_device;
+    if (d != nullptr && output_memory_types != nullptr &&
+        (*output_memory_types)[i] == HOST_MEMORY) {
+      d = nullptr;
+    }
+    if (retvals[i] == nullptr) {
+      retvals[i] = new TensorHandle(outputs[i], d, op_device, ctx);
+    } else {
+      retvals[i]->SetTensorAndDevice(outputs[i], d, op_device);
+    }
+  }
+  return Status::OK();
+}
+
+Status EagerCopyToDevice(TensorHandle* h, EagerContext* ctx,
+                         const char* device_name, TensorHandle** result) {
+  TF_RETURN_IF_ERROR(ctx->GetStatus());
+  Device* dstd = ctx->HostCPU();
+  if (device_name != nullptr && strlen(device_name) > 0) {
+    TF_RETURN_IF_ERROR(ctx->device_mgr()->LookupDevice(device_name, &dstd));
+  }
+  if (ctx->Async()) {
+    // Note that `h` may not be currently ready. However execution order will
+    // make sure that `h` is ready before the copy is actually done.
+    CopyToDeviceNode* node = new CopyToDeviceNode(h, dstd, ctx);
+    TensorHandle* output = node->dst();
+    // Note that calling Add makes `node` accessible by the EagerExecutor
+    // thread. So further accesses need to be thread-safe.
+    ctx->ExecutorAdd(node);
+    *result = output;
+    return Status::OK();
+  } else {
+    TF_RETURN_IF_ERROR(h->CopyToDevice(ctx, dstd, result));
+    return Status::OK();
+  }
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/eager/execute.h b/tensorflow/core/common_runtime/eager/execute.h
new file mode 100644
index 0000000000000000000000000000000000000000..0f6ad031e1dd03bc9c3b90778ed1b37e958085e0
--- /dev/null
+++ b/tensorflow/core/common_runtime/eager/execute.h
@@ -0,0 +1,41 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_EXECUTE_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_EXECUTE_H_
+
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/common_runtime/eager/context.h"
+#include "tensorflow/core/common_runtime/eager/kernel_and_device.h"
+#include "tensorflow/core/common_runtime/eager/tensor_handle.h"
+#include "tensorflow/core/framework/step_stats.pb.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/gtl/inlined_vector.h"
+
+namespace tensorflow {
+
+// Low-level utility to execute the kernel specified by kernel on device device,
+// with the inputs op_inputs, in the context ctx.
+Status EagerExecute(EagerContext* ctx, Device* device,
+                    const gtl::InlinedVector<TensorHandle*, 4>& op_inputs,
+                    KernelAndDevice* kernel, NodeExecStats* maybe_stats,
+                    TensorHandle** retvals, int num_retvals);
+
+// Low-level utility to copy a tensor handle from one device to another.
+Status EagerCopyToDevice(TensorHandle* h, EagerContext* ctx,
+                         const char* device_name, TensorHandle** result);
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_EXECUTE_H_
diff --git a/tensorflow/core/common_runtime/eager/execute_node.h b/tensorflow/core/common_runtime/eager/execute_node.h
new file mode 100644
index 0000000000000000000000000000000000000000..93018dd96914c0d091c7242a9c053fabce434e78
--- /dev/null
+++ b/tensorflow/core/common_runtime/eager/execute_node.h
@@ -0,0 +1,88 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_EXECUTE_NODE_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_EXECUTE_NODE_H_
+
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/common_runtime/eager/context.h"
+#include "tensorflow/core/common_runtime/eager/eager_executor.h"
+#include "tensorflow/core/common_runtime/eager/execute.h"
+#include "tensorflow/core/common_runtime/eager/kernel_and_device.h"
+#include "tensorflow/core/common_runtime/eager/tensor_handle.h"
+#include "tensorflow/core/framework/step_stats.pb.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/gtl/inlined_vector.h"
+
+namespace tensorflow {
+
+class ExecuteNode : public EagerNode {
+ public:
+  ExecuteNode(uint64 id, EagerContext* ctx, Device* op_device,
+              const tensorflow::gtl::InlinedVector<TensorHandle*, 4>& inputs,
+              KernelAndDevice* kernel, NodeExecStats* maybe_stats,
+              const DataTypeVector& output_dtypes,
+              const tensorflow::gtl::InlinedVector<TensorHandle*, 2>& retvals)
+      : EagerNode(id),
+        ctx_(ctx),
+        op_device_(op_device),
+        inputs_(inputs),
+        kernel_(kernel),
+        maybe_stats_(maybe_stats),
+        retvals_(retvals) {
+    for (auto handle : inputs_) {
+      handle->Ref();
+    }
+    for (auto handle : retvals_) {
+      handle->Ref();
+    }
+  }
+
+  ~ExecuteNode() override {
+    for (auto handle : inputs_) {
+      handle->Unref();
+    }
+    for (auto handle : retvals_) {
+      handle->Unref();
+    }
+  }
+
+  tensorflow::Status Run() override {
+    const Status status =
+        EagerExecute(ctx_, op_device_, inputs_, kernel_, maybe_stats_.get(),
+                     retvals_.begin(), retvals_.size());
+    if (status.ok()) {
+      return status;
+    } else {
+      return Status(status.code(),
+                    strings::StrCat("Got error, \"", status.error_message(),
+                                    "\" while executing kernel ",
+                                    kernel_->kernel()->def().DebugString()));
+    }
+  }
+
+ private:
+  tensorflow::EagerContext* ctx_;
+  tensorflow::Device* op_device_;
+  tensorflow::gtl::InlinedVector<TensorHandle*, 4> inputs_;
+  tensorflow::KernelAndDevice* kernel_;
+  std::unique_ptr<NodeExecStats> maybe_stats_;
+  tensorflow::gtl::InlinedVector<TensorHandle*, 2> retvals_;
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_EAGER_EXECUTE_NODE_H_
diff --git a/tensorflow/core/common_runtime/eager/tensor_handle.cc b/tensorflow/core/common_runtime/eager/tensor_handle.cc
index 328cd5dd5c7aa53d56a0e2579f87643fe50acc6b..8e11f7b7104ce22cc585e2b03fcfd914e0eb80aa 100644
--- a/tensorflow/core/common_runtime/eager/tensor_handle.cc
+++ b/tensorflow/core/common_runtime/eager/tensor_handle.cc
@@ -47,7 +47,7 @@ namespace tensorflow {
 bool TensorHandle::IsReady() {
   if (node_id == 0) return true;
   mutex_lock l(ctx_mutex_);
-  return ctx_ == nullptr;
+  return is_ready_;
 }
 
 Status TensorHandle::WaitReady() {
@@ -55,7 +55,7 @@ Status TensorHandle::WaitReady() {
   EagerExecutor* executor = nullptr;
   {
     mutex_lock l(ctx_mutex_);
-    if (ctx_ == nullptr) return Status::OK();
+    if (is_ready_) return Status::OK();
     executor = ctx_->Executor();
   }
   return executor->WaitFor(node_id);
@@ -97,9 +97,10 @@ void TensorHandle::SetTensorAndDevice(const tensorflow::Tensor& tensor,
                                       tensorflow::Device* device,
                                       tensorflow::Device* op_device) {
   mutex_lock l(ctx_mutex_);
-  DCHECK(node_id > 0 && ctx_) << "SetTensorAndDevice should be only called  "
-                              << "on non-ready handles.";
-  ctx_ = nullptr;
+  DCHECK(node_id > 0 && !is_ready_)
+      << "SetTensorAndDevice should be only called  "
+      << "on non-ready handles.";
+  is_ready_ = true;
   tensor_ = tensor;
   device_ = device;
   op_device_ = op_device;
@@ -122,7 +123,7 @@ Status TensorHandle::CopyToDevice(EagerContext* ctx, tensorflow::Device* dstd,
   const bool both_on_cpu = src_cpu && dst_cpu;
   if (is_same_device || both_on_cpu) {
     dstd = dst_cpu ? nullptr : dstd;
-    *output = new tensorflow::TensorHandle(*src, dstd, dstd);
+    *output = new tensorflow::TensorHandle(*src, dstd, dstd, ctx);
     return tensorflow::Status::OK();
   }
   if (!dst_cpu && (src->dtype() != tensorflow::DT_VARIANT &&
@@ -139,7 +140,7 @@ Status TensorHandle::CopyToDevice(EagerContext* ctx, tensorflow::Device* dstd,
   tensorflow::Tensor dst(dstd->GetAllocator(attr), src->dtype(), src->shape());
   if (src->shape().num_elements() == 0) {
     dstd = dst_cpu ? nullptr : dstd;
-    *output = new tensorflow::TensorHandle(dst, dstd, dstd);
+    *output = new tensorflow::TensorHandle(dst, dstd, dstd, ctx);
     return tensorflow::Status::OK();
   }
   tensorflow::DeviceContext* src_device_context = nullptr;
@@ -170,7 +171,7 @@ Status TensorHandle::CopyToDevice(EagerContext* ctx, tensorflow::Device* dstd,
   n.WaitForNotification();
   if (status.ok()) {
     dstd = dst_cpu ? nullptr : dstd;
-    *output = new tensorflow::TensorHandle(dst, dstd, dstd);
+    *output = new tensorflow::TensorHandle(dst, dstd, dstd, ctx);
   }
   return status;
 }
diff --git a/tensorflow/core/common_runtime/eager/tensor_handle.h b/tensorflow/core/common_runtime/eager/tensor_handle.h
index eb69a13c064f3223a3a4e03214a95a198f310bdb..d66c4d95e2a5513680f81e3f7c1875266b2dfb02 100644
--- a/tensorflow/core/common_runtime/eager/tensor_handle.h
+++ b/tensorflow/core/common_runtime/eager/tensor_handle.h
@@ -49,13 +49,14 @@ namespace tensorflow {
 // (unrelated to python TensorHandle).
 class TensorHandle : public core::RefCounted {
  public:
-  TensorHandle(const Tensor& t, Device* d, Device* op_device)
+  TensorHandle(const Tensor& t, Device* d, Device* op_device, EagerContext* ctx)
       : dtype(t.dtype()),
         node_id(0),
         tensor_(t),
         device_(d),
         op_device_(op_device),
-        ctx_(nullptr) {}
+        ctx_(ctx),
+        is_ready_(true) {}
 
   TensorHandle(uint64 node_id, DataType dtype, EagerContext* ctx)
       : dtype(dtype),
@@ -63,7 +64,8 @@ class TensorHandle : public core::RefCounted {
         tensor_(dtype),
         device_(nullptr),
         op_device_(nullptr),
-        ctx_(ctx) {
+        ctx_(ctx),
+        is_ready_(ctx == nullptr) {
     DCHECK_GT(node_id, 0);
   }
 
@@ -88,6 +90,12 @@ class TensorHandle : public core::RefCounted {
   Status CopyToDevice(EagerContext* ctx, tensorflow::Device* dstd,
                       TensorHandle** output);
 
+  // Warning: can return nullptr for CPU tensors.
+  EagerContext* Context() {
+    mutex_lock ml(ctx_mutex_);
+    return ctx_;
+  }
+
   // dtype for the handle. It must be the same as t.dtype() once the handle is
   // ready.
   const DataType dtype;
@@ -126,6 +134,7 @@ class TensorHandle : public core::RefCounted {
   // typically true when the handle was produced during async execution.
   // `ctx` object is not owned and should outlive this handle.
   EagerContext* ctx_ GUARDED_BY(ctx_mutex_);
+  bool is_ready_ GUARDED_BY(ctx_mutex_);
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/executor.cc b/tensorflow/core/common_runtime/executor.cc
index b06b75d6585f01640374eb7ab9842bf441cf9411..0c461a9ee98ca61fb3d3f165d93adf0e5cec7ee7 100644
--- a/tensorflow/core/common_runtime/executor.cc
+++ b/tensorflow/core/common_runtime/executor.cc
@@ -258,6 +258,13 @@ struct NodeItem {
   // Return array of per-output allocator attributes.
   const AllocatorAttributes* output_attrs() const { return output_attr_base(); }
 
+  // Return array of expected input index from which each output should
+  // be forwarded:
+  // kNeverForward (-2) for DO NOT FORWARD (must allocate).
+  // kNoReservation (-1) for no expected forwarding.
+  // 0... for forward from that input.
+  const int* forward_from() const { return forward_from_base(); }
+
  private:
   friend class GraphView;
 
@@ -267,6 +274,7 @@ struct NodeItem {
   //   AllocatorAttributes output_attr[num_outputs];
   //   uint8               input_type[num_inputs];
   //   uint8               output_type[num_outputs];
+  //   int                 forward_from[num_outputs];
 
   // Return pointer to variable length section.
   char* var() const {
@@ -292,6 +300,13 @@ struct NodeItem {
         sizeof(AllocatorAttributes) * num_outputs + sizeof(uint8) * num_inputs);
   }
 
+  int* forward_from_base() const {
+    return reinterpret_cast<int*>(var() + sizeof(EdgeInfo) * num_output_edges +
+                                  sizeof(AllocatorAttributes) * num_outputs +
+                                  sizeof(uint8) * num_inputs +
+                                  sizeof(uint8) * num_outputs);
+  }
+
   TF_DISALLOW_COPY_AND_ASSIGN(NodeItem);
 };
 
@@ -466,7 +481,8 @@ size_t GraphView::NodeItemBytes(const Node* n) {
       + num_output_edges * sizeof(EdgeInfo)        // output_edges[...]
       + num_outputs * sizeof(AllocatorAttributes)  // output_attr[...]
       + num_inputs * sizeof(uint8)                 // input_type[num_inputs]
-      + num_outputs * sizeof(uint8);               // output_type[num_outputs]
+      + num_outputs * sizeof(uint8)                // output_type[num_outputs]
+      + num_outputs * sizeof(int);                 // forward_from[num_outputs]
   static constexpr size_t kItemAlignment = sizeof(NodeItem*);
   static_assert(kItemAlignment % alignof(NodeItem) == 0,
                 "NodeItem must be aligned with kItemAlignment");
@@ -737,8 +753,8 @@ Status InferAllocAttr(const Node* n, const Node* dst,
       VLOG(2) << "node " << n->name() << " is the sink of an RPC in";
     } else if ((local_dev_name.type == "CPU" || n->IsHostRecv()) &&
                parsed_src_name.type != "CPU") {
-      // Value is going to be the sink of a local DMA from GPU to CPU (or other
-      // types of accelerators).
+      // Value is going to be the sink of a local DMA from GPU to CPU (or
+      // other types of accelerators).
       attr->set_gpu_compatible(true);
       VLOG(2) << "node " << n->name() << " is the sink of a gpu->cpu copy";
     } else {
@@ -1022,7 +1038,8 @@ class ExecutorState {
     int total_input_tensors = 0;
     std::vector<const Node*>* nodes = nullptr;
 
-    // Lock ordering: ExecutorState.mu_ < mu.
+    // Lock ordering: ExecutorState.mu_ < mu;
+    // during structured traversal: parent_frame->mu < mu.
     mutex mu;
 
     void InitializeFrameInfo(const string& enter_name) {
@@ -1090,7 +1107,8 @@ class ExecutorState {
     void ActivateLoopInvs(const GraphView* gview, int64 iter,
                           TaggedNodeSeq* ready) EXCLUSIVE_LOCKS_REQUIRED(mu);
 
-    // Add a new loop invariant and make it available to all active iterations.
+    // Add a new loop invariant and make it available to all active
+    // iterations.
     void AddLoopInv(const NodeItem* item, const Entry& value,
                     TaggedNodeSeq* ready) EXCLUSIVE_LOCKS_REQUIRED(mu);
 
@@ -1147,8 +1165,8 @@ class ExecutorState {
         if (front_index_ == ready_.size()) {
           ready_.clear();
         } else {
-          // Lots of unused entries at beginning of vector: move everything down
-          // to start of vector.
+          // Lots of unused entries at beginning of vector: move everything
+          // down to start of vector.
           ready_.erase(ready_.begin(), ready_.begin() + front_index_);
         }
         front_index_ = 0;
@@ -1596,6 +1614,7 @@ void ExecutorState::Process(TaggedNode tagged_node, int64 scheduled_usec) {
       params.frame_iter = FrameAndIter(input_frame->frame_id, input_iter);
       params.is_input_dead = is_input_dead;
       params.output_attr_array = item.output_attrs();
+      params.forward_from_array = nullptr;  // later: item.forward_from();
 
       if (item.kernel_is_async) {
         // Asynchronous computes.
@@ -2333,8 +2352,9 @@ void ExecutorState::DeleteFrame(FrameState* frame, TaggedNodeSeq* ready) {
   FrameState* parent_frame = frame->parent_frame;
   const int64 parent_iter = frame->parent_iter;
   if (parent_frame != nullptr) {
-    mutex_lock paranet_frame_lock(parent_frame->mu);
+    mutex_lock parent_frame_lock(parent_frame->mu);
     // Propagate all the dead exits to the parent frame.
+    mutex_lock this_frame_lock(frame->mu);
     for (const Node* node : frame->dead_exits) {
       auto parent_iter_state = parent_frame->GetIteration(parent_iter);
       for (const Edge* e : node->out_edges()) {
@@ -2603,7 +2623,7 @@ void ExecutorImpl::RunAsync(const Args& args, DoneCallback done) {
   (new ExecutorState(args, this))->RunAsync(std::move(done));
 }
 
-}  // end namespace
+}  // namespace
 
 Status NewLocalExecutor(const LocalExecutorParams& params,
                         std::unique_ptr<const Graph> graph,
@@ -2629,4 +2649,4 @@ Status CreateNonCachedKernel(Device* device, FunctionLibraryRuntime* flib,
 
 void DeleteNonCachedKernel(OpKernel* kernel) { delete kernel; }
 
-}  // end namespace tensorflow
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/function.cc b/tensorflow/core/common_runtime/function.cc
index 37c59a16f56809fe8d5f88c05b824bcbdcc7cf4e..d310520ebde5a6143b84e5879c9e1c23ab0b5935 100644
--- a/tensorflow/core/common_runtime/function.cc
+++ b/tensorflow/core/common_runtime/function.cc
@@ -796,16 +796,17 @@ void FunctionLibraryRuntimeImpl::Run(const Options& opts, Handle handle,
       done(status);
     };
   }
-  if (!parent_->IsInstantiatedOnDevice(device_name_, handle)) {
-    parent_->Run(run_opts, handle, args, rets, done);
-    return;
-  }
 
   if (run_opts.runner == nullptr) {
     run_opts.runner = &default_runner_;
   }
   DCHECK(run_opts.runner != nullptr);
 
+  if (!parent_->IsInstantiatedOnDevice(device_name_, handle)) {
+    parent_->Run(run_opts, handle, args, rets, done);
+    return;
+  }
+
   Executor::Args* exec_args = new Executor::Args;
   // Inherit the step_id from the caller.
   exec_args->step_id = run_opts.step_id;
diff --git a/tensorflow/core/common_runtime/function_test.cc b/tensorflow/core/common_runtime/function_test.cc
index d17ef4d4590e5932e43a0bb01fe1e05ab2c4f873..61b2f0e60f7ea6ca7f7b36f21845766399489795 100644
--- a/tensorflow/core/common_runtime/function_test.cc
+++ b/tensorflow/core/common_runtime/function_test.cc
@@ -39,6 +39,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/public/session_options.h"
 #include "tensorflow/core/public/version.h"
@@ -53,8 +54,8 @@ Status GetOpSig(const string& op, const OpDef** sig) {
   return OpRegistry::Global()->LookUpOpDef(op, sig);
 }
 
-void HasError(const Status& s, const string& substr) {
-  EXPECT_TRUE(StringPiece(s.ToString()).contains(substr))
+void HasError(const Status& s, StringPiece substr) {
+  EXPECT_TRUE(str_util::StrContains(s.ToString(), substr))
       << s << ", expected substring " << substr;
 }
 
@@ -240,7 +241,7 @@ class FunctionLibraryRuntimeTest : public ::testing::Test {
     Status status2 = Run(flr, handle, opts, args, std::move(rets));
     EXPECT_TRUE(errors::IsInvalidArgument(status2));
     EXPECT_TRUE(
-        StringPiece(status2.error_message()).contains("remote execution."));
+        str_util::StrContains(status2.error_message(), "remote execution."));
 
     return status;
   }
@@ -310,7 +311,7 @@ class FunctionLibraryRuntimeTest : public ::testing::Test {
     Status status2 = Run(flr, handle, opts, args, std::move(rets));
     EXPECT_TRUE(errors::IsInvalidArgument(status2));
     EXPECT_TRUE(
-        StringPiece(status2.error_message()).contains("remote execution."));
+        str_util::StrContains(status2.error_message(), "remote execution."));
 
     return status;
   }
diff --git a/tensorflow/core/common_runtime/function_threadpool_test.cc b/tensorflow/core/common_runtime/function_threadpool_test.cc
index 6223a4e648baeb9cd7a2595c74881cddbf9a6f0b..2d09e83d013591ceaa91c9a7e0fe929a328742a3 100644
--- a/tensorflow/core/common_runtime/function_threadpool_test.cc
+++ b/tensorflow/core/common_runtime/function_threadpool_test.cc
@@ -39,6 +39,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/public/session_options.h"
 #include "tensorflow/core/public/version.h"
@@ -153,7 +154,7 @@ class FunctionLibraryRuntimeTest : public ::testing::Test {
     Status status2 = Run(flr, handle, opts, args, std::move(rets));
     EXPECT_TRUE(errors::IsInvalidArgument(status2));
     EXPECT_TRUE(
-        StringPiece(status2.error_message()).contains("remote execution."));
+        str_util::StrContains(status2.error_message(), "remote execution."));
 
     return status;
   }
diff --git a/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.h b/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.h
index 0a586344ccf2228a23059d68e7aa2d7a8f4eadba..208697361d2dfc4f3b8290ea511d15c9bd86857b 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.h
@@ -19,7 +19,7 @@ limitations under the License.
 #include <memory>
 
 #include "tensorflow/core/common_runtime/gpu/gpu_id.h"
-#include "tensorflow/core/framework/visitable_allocator.h"
+#include "tensorflow/core/common_runtime/visitable_allocator.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/stream_executor.h"
 #include "tensorflow/core/platform/types.h"
diff --git a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.cc b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.cc
index 63ed0b8be16ecb187113311db5283c8d4f3b1a5e..b0ca7e31096de836921a2c469fc7201738059d83 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.cc
@@ -85,8 +85,8 @@ GPUDebugAllocator::~GPUDebugAllocator() { delete base_allocator_; }
 
 void* GPUDebugAllocator::AllocateRaw(size_t alignment, size_t num_bytes) {
   num_bytes += (2 * MASK_BYTES);
-
   void* allocated_ptr = base_allocator_->AllocateRaw(alignment, num_bytes);
+  if (allocated_ptr == nullptr) return allocated_ptr;
 
   // Return the pointer after the header
   void* rv = static_cast<char*>(allocated_ptr) + MASK_BYTES;
@@ -102,11 +102,13 @@ void* GPUDebugAllocator::AllocateRaw(size_t alignment, size_t num_bytes) {
   return rv;
 }
 void GPUDebugAllocator::DeallocateRaw(void* ptr) {
-  CHECK(CheckHeader(ptr)) << "before_mask has been overwritten";
-  CHECK(CheckFooter(ptr)) << "after_mask has been overwritten";
+  if (ptr != nullptr) {
+    CHECK(CheckHeader(ptr)) << "before_mask has been overwritten";
+    CHECK(CheckFooter(ptr)) << "after_mask has been overwritten";
 
-  // Backtrack to the beginning of the header.
-  ptr = static_cast<void*>(static_cast<char*>(ptr) - MASK_BYTES);
+    // Backtrack to the beginning of the header.
+    ptr = static_cast<void*>(static_cast<char*>(ptr) - MASK_BYTES);
+  }
   // Deallocate the memory
   base_allocator_->DeallocateRaw(ptr);
 }
@@ -168,10 +170,12 @@ GPUNanResetAllocator::~GPUNanResetAllocator() { delete base_allocator_; }
 
 void* GPUNanResetAllocator::AllocateRaw(size_t alignment, size_t num_bytes) {
   void* allocated_ptr = base_allocator_->AllocateRaw(alignment, num_bytes);
+  if (allocated_ptr == nullptr) return allocated_ptr;
 
   // Initialize the buffer to Nans
   size_t req_size = base_allocator_->RequestedSize(allocated_ptr);
-  std::vector<float> nans(req_size / sizeof(float), std::nanf(""));
+  std::vector<float> nans((req_size + sizeof(float) - 1) / sizeof(float),
+                          std::nanf(""));
   gpu::DeviceMemory<float> nan_ptr{
       gpu::DeviceMemoryBase{static_cast<float*>(allocated_ptr), req_size}};
 
@@ -182,13 +186,16 @@ void* GPUNanResetAllocator::AllocateRaw(size_t alignment, size_t num_bytes) {
   return allocated_ptr;
 }
 void GPUNanResetAllocator::DeallocateRaw(void* ptr) {
-  // Reset the buffer to Nans
-  size_t req_size = base_allocator_->RequestedSize(ptr);
-  std::vector<float> nans(req_size / sizeof(float), std::nanf(""));
-  gpu::DeviceMemory<float> nan_ptr{
-      gpu::DeviceMemoryBase{static_cast<float*>(ptr), req_size}};
-  if (!stream_exec_->SynchronousMemcpy(&nan_ptr, &nans[0], req_size)) {
-    LOG(ERROR) << "Could not initialize to NaNs";
+  if (ptr != nullptr) {
+    // Reset the buffer to Nans
+    size_t req_size = base_allocator_->RequestedSize(ptr);
+    std::vector<float> nans((req_size + sizeof(float) - 1) / sizeof(float),
+                            std::nanf(""));
+    gpu::DeviceMemory<float> nan_ptr{
+        gpu::DeviceMemoryBase{static_cast<float*>(ptr), req_size}};
+    if (!stream_exec_->SynchronousMemcpy(&nan_ptr, &nans[0], req_size)) {
+      LOG(ERROR) << "Could not initialize to NaNs";
+    }
   }
 
   // Deallocate the memory
diff --git a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h
index 0db08dc9759c9306ebd99b4acf4967128ef04895..adce3a84368ced958002443721016778cb6df028 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h
@@ -21,7 +21,7 @@ limitations under the License.
 #include <unordered_map>
 
 #include "tensorflow/core/common_runtime/gpu/gpu_id.h"
-#include "tensorflow/core/framework/visitable_allocator.h"
+#include "tensorflow/core/common_runtime/visitable_allocator.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/stream_executor.h"
 #include "tensorflow/core/platform/types.h"
diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.cc b/tensorflow/core/common_runtime/gpu/gpu_device.cc
index 52fd20e479918dea8fb07694e21a8db9fede8467..0b9e8f9cc2d8b65834c7963112427ac2e24a5789 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_device.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_device.cc
@@ -257,6 +257,7 @@ BaseGPUDevice::BaseGPUDevice(const SessionOptions& options, const string& name,
                                                          physical_device_desc)),
       gpu_allocator_(gpu_allocator),
       cpu_allocator_(cpu_allocator),
+      scoped_allocator_mgr_(new ScopedAllocatorMgr(name)),
       tf_gpu_id_(tf_gpu_id),
       sync_every_op_(sync_every_op),
       max_streams_(max_streams) {
diff --git a/tensorflow/core/common_runtime/gpu/pool_allocator.h b/tensorflow/core/common_runtime/gpu/pool_allocator.h
index 38d669ea07c91bc1a892ecf925b3141f2ca506dd..91ce830df8521e7fe8284dd3c52d1bbf667891cd 100644
--- a/tensorflow/core/common_runtime/gpu/pool_allocator.h
+++ b/tensorflow/core/common_runtime/gpu/pool_allocator.h
@@ -24,7 +24,7 @@ limitations under the License.
 #include <map>
 #include <memory>
 #include <vector>
-#include "tensorflow/core/framework/visitable_allocator.h"
+#include "tensorflow/core/common_runtime/visitable_allocator.h"
 #include "tensorflow/core/lib/core/bits.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/macros.h"
diff --git a/tensorflow/core/common_runtime/graph_execution_state.cc b/tensorflow/core/common_runtime/graph_execution_state.cc
index 2f17af273ff8cdc83a112ef350fde88346c7e13d..6a3e6906a3e3f076fd3b4731e36f86a1960742a6 100644
--- a/tensorflow/core/common_runtime/graph_execution_state.cc
+++ b/tensorflow/core/common_runtime/graph_execution_state.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <memory>
 #include <string>
 #include <unordered_set>
+#include <utility>
 #include <vector>
 
 #include "tensorflow/core/common_runtime/device.h"
@@ -27,6 +28,7 @@ limitations under the License.
 #include "tensorflow/core/framework/graph_def_util.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/versions.pb.h"
+#include "tensorflow/core/graph/algorithm.h"
 #include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/graph/subgraph.h"
@@ -237,6 +239,50 @@ void GraphExecutionState::RestoreStatefulNodes(Graph* graph) {
   }
 }
 
+namespace {
+
+class TensorConnectionPruneRewrite : public subgraph::PruneRewrite {
+ public:
+  TensorConnectionPruneRewrite(const string* endpoint_name,
+                               NodeBuilder::NodeOut from_tensor)
+      : subgraph::PruneRewrite(endpoint_name, nullptr /* device_info */),
+        from_tensor_(std::move(from_tensor)) {}
+
+  Status AddNode(Graph* g, NodeBuilder::NodeOut feed_tensor,
+                 Node** out_node) override {
+    Status s;
+    auto check_no_cycle_fn = [this, feed_tensor, &s](Node* n) {
+      if (n == feed_tensor.node) {
+        s.Update(errors::InvalidArgument(
+            "Requested Tensor connection between nodes \"",
+            feed_tensor.node->name(), "\" and \"", from_tensor_.node->name(),
+            "\" would create a cycle."));
+      }
+    };
+    ReverseDFSFrom(*g, {from_tensor_.node}, std::move(check_no_cycle_fn),
+                   nullptr);
+    TF_RETURN_IF_ERROR(s);
+
+    TF_RETURN_IF_ERROR(
+        NodeBuilder(strings::StrCat("_identity_", feed_tensor.node->name(), "_",
+                                    feed_tensor.index),
+                    "Identity")
+            .Input(from_tensor_)
+            .Attr("T",
+                  BaseType(from_tensor_.node->output_type(from_tensor_.index)))
+            .Finalize(g, out_node));
+
+    (*out_node)->set_assigned_device_name(
+        feed_tensor.node->assigned_device_name());
+    return Status::OK();
+  }
+
+ private:
+  NodeBuilder::NodeOut from_tensor_;
+};
+
+}  // namespace
+
 Status GraphExecutionState::PruneGraph(
     const BuildGraphOptions& options, Graph* graph,
     subgraph::RewriteGraphMetadata* out_rewrite_metadata) {
@@ -265,12 +311,48 @@ Status GraphExecutionState::PruneGraph(
           new subgraph::SendFetchRewrite(&fetch, device_info));
     }
   }
+
+  for (const TensorConnection& tensor_connection :
+       options.callable_options.tensor_connection()) {
+    Node* from_node = nullptr;
+    TensorId from_id(ParseTensorName(tensor_connection.from_tensor()));
+
+    for (Node* n : graph->nodes()) {
+      if (n->name() == from_id.first) {
+        from_node = n;
+        break;
+      }
+    }
+    if (from_node == nullptr) {
+      return errors::InvalidArgument(
+          "Requested tensor connection from unknown node: \"",
+          tensor_connection.to_tensor(), "\".");
+    }
+    if (from_id.second >= from_node->num_outputs()) {
+      return errors::InvalidArgument(
+          "Requested tensor connection from unknown edge: \"",
+          tensor_connection.to_tensor(),
+          "\" (actual number of outputs = ", from_node->num_outputs(), ").");
+    }
+
+    feed_rewrites.emplace_back(new TensorConnectionPruneRewrite(
+        &tensor_connection.to_tensor(), {from_node, from_id.second}));
+  }
+
   std::vector<string> target_node_names(
       options.callable_options.target().begin(),
       options.callable_options.target().end());
-  return subgraph::RewriteGraphForExecution(graph, feed_rewrites,
-                                            fetch_rewrites, target_node_names,
-                                            out_rewrite_metadata);
+  TF_RETURN_IF_ERROR(subgraph::RewriteGraphForExecution(
+      graph, feed_rewrites, fetch_rewrites, target_node_names,
+      out_rewrite_metadata));
+
+  CHECK_EQ(out_rewrite_metadata->feed_types.size(),
+           options.callable_options.feed_size() +
+               options.callable_options.tensor_connection_size());
+  for (int i = 0; i < options.callable_options.tensor_connection_size(); ++i) {
+    out_rewrite_metadata->feed_types.pop_back();
+  }
+  return Status::OK();
 }
 
 Status GraphExecutionState::InitBaseGraph(const BuildGraphOptions& options) {
@@ -340,7 +422,13 @@ Status GraphExecutionState::OptimizeGraph(
                       options.callable_options.target().begin(),
                       options.callable_options.target().end());
 
-    if (!options.callable_options.feed().empty()) {
+    for (const TensorConnection& tensor_connection :
+         options.callable_options.tensor_connection()) {
+      item.fetch.push_back(tensor_connection.from_tensor());
+    }
+
+    if (!(options.callable_options.feed().empty() &&
+          options.callable_options.tensor_connection().empty())) {
       std::unordered_set<string> feeds;
       for (const string& feed : options.callable_options.feed()) {
         TensorId id = ParseTensorName(feed);
@@ -349,6 +437,15 @@ Status GraphExecutionState::OptimizeGraph(
         }
         feeds.insert(id.first.ToString());
       }
+      for (const TensorConnection& tensor_connection :
+           options.callable_options.tensor_connection()) {
+        TensorId id = ParseTensorName(tensor_connection.to_tensor());
+        if (id.second != 0) {
+          return errors::InvalidArgument("Unsupported feed: ",
+                                         tensor_connection.to_tensor());
+        }
+        feeds.insert(id.first.ToString());
+      }
       for (const NodeDef& node : original_graph_def_.node()) {
         if (feeds.find(node.name()) == feeds.end()) {
           continue;
diff --git a/tensorflow/core/common_runtime/graph_runner.cc b/tensorflow/core/common_runtime/graph_runner.cc
index 1125d2a34a5adcde5153ea4f039d0bda3159deb4..790f2eaa1e9de96b5cd399dd53a1e49696035f21 100644
--- a/tensorflow/core/common_runtime/graph_runner.cc
+++ b/tensorflow/core/common_runtime/graph_runner.cc
@@ -13,6 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+// TODO(skyewm): this is necessary to make the single_threaded_cpu_device.h
+// include work. Some other include must be including eigen without defining
+// this. Consider defining in this in a BUILD rule.
+#define EIGEN_USE_THREADS
+
 #include "tensorflow/core/common_runtime/graph_runner.h"
 
 #include "tensorflow/core/common_runtime/device_factory.h"
@@ -20,6 +25,7 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/function.h"
 #include "tensorflow/core/common_runtime/memory_types.h"
 #include "tensorflow/core/common_runtime/rendezvous_mgr.h"
+#include "tensorflow/core/common_runtime/single_threaded_cpu_device.h"
 #include "tensorflow/core/framework/log_memory.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor_util.h"
@@ -36,18 +42,6 @@ namespace tensorflow {
 
 namespace {
 
-std::unique_ptr<Device> GetCPUDevice(Env* env) {
-  std::vector<Device*> devices;
-  SessionOptions session_options;
-  session_options.env = env;
-  Status s = DeviceFactory::GetFactory(DEVICE_CPU)
-                 ->CreateDevices(session_options, "", &devices);
-  if (s.ok() && !devices.empty()) {
-    return std::unique_ptr<Device>(devices[0]);
-  }
-  return nullptr;
-}
-
 // A simple rendezvous class.
 // Assumes a single sender and a single receiver, no duplicate sends, and no
 // sends of dead tensors.
@@ -98,7 +92,8 @@ class SimpleRendezvous : public Rendezvous {
 }  // namespace
 
 GraphRunner::GraphRunner(Env* env)
-    : device_deleter_(GetCPUDevice(env)), device_(device_deleter_.get()) {}
+    : device_deleter_(new SingleThreadedCpuDevice(env)),
+      device_(device_deleter_.get()) {}
 GraphRunner::GraphRunner(Device* device) : device_(device) {}
 
 GraphRunner::~GraphRunner() {}
diff --git a/tensorflow/core/common_runtime/mkl_cpu_allocator.h b/tensorflow/core/common_runtime/mkl_cpu_allocator.h
index 55c8411ad017dd8a2e64309bc426d96852a2a696..b2ef51d10b33caf3b6ad9bd494d574abbcceff55 100644
--- a/tensorflow/core/common_runtime/mkl_cpu_allocator.h
+++ b/tensorflow/core/common_runtime/mkl_cpu_allocator.h
@@ -24,7 +24,7 @@ limitations under the License.
 #include <cstdlib>
 #include <string>
 #include "tensorflow/core/common_runtime/bfc_allocator.h"
-#include "tensorflow/core/framework/visitable_allocator.h"
+#include "tensorflow/core/common_runtime/visitable_allocator.h"
 #include "tensorflow/core/lib/strings/numbers.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/mem.h"
diff --git a/tensorflow/core/common_runtime/placer.cc b/tensorflow/core/common_runtime/placer.cc
index e128b9257f2369e25c911f9a9e1d08475706d561..86851c2c075a60a57c6f169cbc7ad81253a94227 100644
--- a/tensorflow/core/common_runtime/placer.cc
+++ b/tensorflow/core/common_runtime/placer.cc
@@ -29,6 +29,7 @@ limitations under the License.
 #include "tensorflow/core/framework/types.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 
 namespace tensorflow {
 
@@ -151,7 +152,8 @@ class ColocationGraph {
       if (attr_value != nullptr && attr_value->has_list()) {
         for (const string& class_spec : attr_value->list().s()) {
           StringPiece spec(class_spec);
-          if (spec.Consume(kColocationGroupPrefixStringPiece)) {
+          if (str_util::ConsumePrefix(&spec,
+                                      kColocationGroupPrefixStringPiece)) {
             found_spec = true;
             TF_RETURN_IF_ERROR(
                 ColocateNodeToGroup(&colocation_group_root, node, spec));
diff --git a/tensorflow/core/common_runtime/placer_test.cc b/tensorflow/core/common_runtime/placer_test.cc
index 098024d2195aad8ef651120181ab271be168f92a..5ad251c892f175dceccc0304bceedc1405bc0123 100644
--- a/tensorflow/core/common_runtime/placer_test.cc
+++ b/tensorflow/core/common_runtime/placer_test.cc
@@ -34,6 +34,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/error_codes.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/test.h"
 
@@ -262,9 +263,9 @@ class PlacerTest : public ::testing::Test {
                 ->attributes()                                          \
                 .device_type())
 
-#define EXPECT_DEVICE_CONTAINS(g, name, device_substr)                        \
-  EXPECT_TRUE(StringPiece(GetNodeByName((g), (name))->assigned_device_name()) \
-                  .contains(device_substr))
+#define EXPECT_DEVICE_CONTAINS(g, name, device_substr) \
+  EXPECT_TRUE(::tensorflow::str_util::StrContains(     \
+      GetNodeByName((g), (name))->assigned_device_name(), device_substr))
 
 // Test that a graph with no constraints will successfully assign nodes to the
 // "best available" device (i.e. prefer GPU over CPU).
@@ -488,11 +489,10 @@ TEST_F(PlacerTest, TestAssignedGpuDeviceToCpuDevice) {
 
   Status s = Place(&g);
   EXPECT_EQ(error::INTERNAL, s.code());
-  EXPECT_TRUE(
-      StringPiece(s.error_message())
-          .contains(
-              "Assigned device '/job:a/replica:0/task:0/device:fakegpu:0' "
-              "does not have registered OpKernel support for TestInput"));
+  EXPECT_TRUE(str_util::StrContains(
+      s.error_message(),
+      "Assigned device '/job:a/replica:0/task:0/device:fakegpu:0' "
+      "does not have registered OpKernel support for TestInput"));
 }
 
 // Test that graphs with reference connections are correctly placed.
@@ -541,15 +541,15 @@ TEST_F(PlacerTest, TestReferenceConnection) {
   {
     Status s = ReferenceTestHelper("VariableCPU", "AssignGPU", "FakeCPU");
     EXPECT_EQ(error::INVALID_ARGUMENT, s.code());
-    EXPECT_TRUE(StringPiece(s.error_message())
-                    .contains("no device type supports both of those nodes"));
+    EXPECT_TRUE(str_util::StrContains(
+        s.error_message(), "no device type supports both of those nodes"));
   }
   TF_EXPECT_OK(ReferenceTestHelper("VariableGPU", "TestAssign", "FakeGPU"));
   {
     Status s = ReferenceTestHelper("VariableGPU", "AssignCPU", "FakeCPU");
     EXPECT_EQ(error::INVALID_ARGUMENT, s.code());
-    EXPECT_TRUE(StringPiece(s.error_message())
-                    .contains("no device type supports both of those nodes"));
+    EXPECT_TRUE(str_util::StrContains(
+        s.error_message(), "no device type supports both of those nodes"));
   }
   TF_EXPECT_OK(ReferenceTestHelper("VariableGPU", "AssignGPU", "FakeGPU"));
 }
@@ -760,8 +760,9 @@ TEST_F(PlacerTest, TestInvalidMultipleColocationGroups) {
   }
 
   Status s = Place(&g);
-  EXPECT_TRUE(StringPiece(s.error_message())
-                  .contains("Cannot colocate nodes 'foo' and 'in' because no "
+  EXPECT_TRUE(
+      str_util::StrContains(s.error_message(),
+                            "Cannot colocate nodes 'foo' and 'in' because no "
                             "device type supports both of those nodes and the "
                             "other nodes colocated with them"));
 }
@@ -824,11 +825,11 @@ TEST_F(PlacerTest, TestColocationGroupWithUnsatisfiableReferenceConnections) {
   }
 
   Status s = Place(&g);
-  EXPECT_TRUE(
-      StringPiece(s.error_message())
-          .contains("Cannot colocate nodes 'var3' and 'assign3' because no "
-                    "device type supports both of those nodes and the other "
-                    "nodes colocated with them."));
+  EXPECT_TRUE(str_util::StrContains(
+      s.error_message(),
+      "Cannot colocate nodes 'var3' and 'assign3' because no "
+      "device type supports both of those nodes and the other "
+      "nodes colocated with them."));
 }
 
 TEST_F(PlacerTest, TestColocationAndReferenceConnections) {
@@ -888,7 +889,7 @@ TEST_F(PlacerTest, TestEmptyDeviceSet) {
 
   Status s = Place(&g, &empty);
   EXPECT_TRUE(
-      StringPiece(s.error_message()).contains("No devices are registered"));
+      str_util::StrContains(s.error_message(), "No devices are registered"));
 }
 
 // Test that placement fails when the requested device forces an
@@ -913,16 +914,17 @@ TEST_F(PlacerTest, TestHeterogeneousDeviceSetFailure) {
   heterogeneous.AddDevice(cpu.get());
   Status s = Place(&g, &heterogeneous);
   EXPECT_EQ(error::INVALID_ARGUMENT, s.code());
-  EXPECT_TRUE(StringPiece(s.error_message())
-                  .contains("colocated with a group of nodes that required "
+  EXPECT_TRUE(
+      str_util::StrContains(s.error_message(),
+                            "colocated with a group of nodes that required "
                             "incompatible device"));
 
   // The error message should contain information that indicates which
   // op types have which registered device types.
-  EXPECT_TRUE(StringPiece(s.error_message()).contains("VariableGPU: FakeGPU"))
+  EXPECT_TRUE(str_util::StrContains(s.error_message(), "VariableGPU: FakeGPU"))
       << s;
   EXPECT_TRUE(
-      StringPiece(s.error_message()).contains("TestAssign: FakeGPU FakeCPU"))
+      str_util::StrContains(s.error_message(), "TestAssign: FakeGPU FakeCPU"))
       << s;
 }
 
@@ -937,7 +939,7 @@ TEST_F(PlacerTest, TestUnknownDevice) {
 
   Status s = Place(&g);
   EXPECT_EQ(error::INVALID_ARGUMENT, s.code());
-  EXPECT_TRUE(StringPiece(s.error_message()).contains("/job:foo"));
+  EXPECT_TRUE(str_util::StrContains(s.error_message(), "/job:foo"));
 }
 
 // Test that placement fails when the combination of partial
@@ -952,7 +954,7 @@ TEST_F(PlacerTest, TestUnknownMergedDevice) {
 
   Status s = Place(&g);
   EXPECT_EQ(error::INVALID_ARGUMENT, s.code());
-  EXPECT_TRUE(StringPiece(s.error_message()).contains("/job:foo"));
+  EXPECT_TRUE(str_util::StrContains(s.error_message(), "/job:foo"));
 }
 
 // Test that placement fails when the previously-assigned device for a
@@ -969,9 +971,9 @@ TEST_F(PlacerTest, TestUnknownAssignedDevice) {
 
   Status s = Place(&g);
   EXPECT_EQ(error::INTERNAL, s.code());
-  EXPECT_TRUE(
-      StringPiece(s.error_message())
-          .contains("Assigned device '/job:foo' does not match any device"));
+  EXPECT_TRUE(str_util::StrContains(
+      s.error_message(),
+      "Assigned device '/job:foo' does not match any device"));
 }
 
 // Test that placement fails when an op with no registered kernels is
@@ -986,12 +988,11 @@ TEST_F(PlacerTest, TestNoKernelsRegistered) {
 
   Status s = Place(&g);
   EXPECT_EQ(error::INVALID_ARGUMENT, s.code());
+  EXPECT_TRUE(str_util::StrContains(
+      s.error_message(),
+      "No OpKernel was registered to support Op 'VariableNoKernels'"));
   EXPECT_TRUE(
-      StringPiece(s.error_message())
-          .contains(
-              "No OpKernel was registered to support Op 'VariableNoKernels'"));
-  EXPECT_TRUE(
-      StringPiece(s.error_message()).contains("<no registered kernels>"));
+      str_util::StrContains(s.error_message(), "<no registered kernels>"));
 }
 
 // Test that placement fails when a kernel is registered but no known
@@ -1011,10 +1012,10 @@ TEST_F(PlacerTest, TestNoDevicesRegistered) {
 
   Status s = Place(&g, &cpu_only);
   EXPECT_EQ(error::INVALID_ARGUMENT, s.code());
-  EXPECT_TRUE(StringPiece(s.error_message())
-                  .contains("No OpKernel was registered to support "
-                            "Op 'VariableGPU'"));
-  EXPECT_TRUE(StringPiece(s.error_message()).contains("device='FakeGPU'"));
+  EXPECT_TRUE(str_util::StrContains(
+      s.error_message(),
+      "No OpKernel was registered to support Op 'VariableGPU'"));
+  EXPECT_TRUE(str_util::StrContains(s.error_message(), "device='FakeGPU'"));
 }
 
 // Test that placement fails when a requested device is malformed.
@@ -1028,8 +1029,8 @@ TEST_F(PlacerTest, TestMalformedDeviceSpecification) {
 
   Status s = Place(&g);
   EXPECT_EQ(error::INVALID_ARGUMENT, s.code());
-  EXPECT_TRUE(StringPiece(s.error_message())
-                  .contains("Malformed device specification '/foo:bar'"));
+  EXPECT_TRUE(str_util::StrContains(
+      s.error_message(), "Malformed device specification '/foo:bar'"));
 }
 
 // Test that placement fails when a previously-assigned device is malformed.
@@ -1045,8 +1046,8 @@ TEST_F(PlacerTest, TestMalformedAssignedDevice) {
 
   Status s = Place(&g);
   EXPECT_EQ(error::INTERNAL, s.code());
-  EXPECT_TRUE(StringPiece(s.error_message())
-                  .contains("Malformed assigned device '/foo:bar'"));
+  EXPECT_TRUE(str_util::StrContains(s.error_message(),
+                                    "Malformed assigned device '/foo:bar'"));
 }
 
 // Test that placement fails when a device was previously assigned to
@@ -1063,9 +1064,8 @@ TEST_F(PlacerTest, TestNonUniqueAssignedDevice) {
 
   Status s = Place(&g);
   EXPECT_EQ(error::INTERNAL, s.code());
-  EXPECT_TRUE(
-      StringPiece(s.error_message())
-          .contains("Assigned device '/job:a' does not match any device"));
+  EXPECT_TRUE(str_util::StrContains(
+      s.error_message(), "Assigned device '/job:a' does not match any device"));
 }
 
 // Test that ops request to be placed on non-existent devices will be relocated
@@ -1099,7 +1099,7 @@ TEST_F(PlacerTest, TestNonexistentGpuNoAllowSoftPlacement) {
   SessionOptions options;
   Status s = Place(&g, &options);
   EXPECT_EQ(error::INVALID_ARGUMENT, s.code());
-  EXPECT_TRUE(StringPiece(s.error_message()).contains("/device:fakegpu:11"));
+  EXPECT_TRUE(str_util::StrContains(s.error_message(), "/device:fakegpu:11"));
 }
 
 // Test that placement fails when a node requests an explicit device that is not
@@ -1116,10 +1116,10 @@ TEST_F(PlacerTest, TestUnsupportedDeviceNoAllowSoftPlacement) {
   SessionOptions options;
   Status s = Place(&g, &options);
   EXPECT_EQ(error::INVALID_ARGUMENT, s.code());
-  EXPECT_TRUE(StringPiece(s.error_message()).contains("/device:fakecpu:0"));
-  EXPECT_TRUE(
-      StringPiece(s.error_message())
-          .contains("no supported kernel for fakecpu devices is available"));
+  EXPECT_TRUE(str_util::StrContains(s.error_message(), "/device:fakecpu:0"));
+  EXPECT_TRUE(str_util::StrContains(
+      s.error_message(),
+      "no supported kernel for fakecpu devices is available"));
 }
 
 // Test that placement fails when a node requests an explicit device that is not
@@ -1137,9 +1137,9 @@ TEST_F(PlacerTest, TestNonExistentDevice) {
   Status s = Place(&g, &options);
   EXPECT_EQ(error::INVALID_ARGUMENT, s.code());
   LOG(WARNING) << s.error_message();
-  EXPECT_TRUE(StringPiece(s.error_message())
-                  .contains("was explicitly assigned to /job:foo/replica:17 "
-                            "but available devices"));
+  EXPECT_TRUE(str_util::StrContains(
+      s.error_message(),
+      "was explicitly assigned to /job:foo/replica:17 but available devices"));
 }
 
 TEST_F(PlacerTest, TestUnsupportedDeviceAllowSoftPlacement) {
@@ -1205,8 +1205,8 @@ TEST_F(PlacerTest, TestUnsatisfiableConstraintWithReferenceConnections) {
 
   Status s = Place(&g);
   EXPECT_EQ(error::INVALID_ARGUMENT, s.code());
-  EXPECT_TRUE(StringPiece(s.error_message())
-                  .contains("Cannot colocate nodes 'var' and 'assign'"));
+  EXPECT_TRUE(str_util::StrContains(
+      s.error_message(), "Cannot colocate nodes 'var' and 'assign'"));
 }
 
 // Test that a generator node follows its consumers (where there are several
diff --git a/tensorflow/core/common_runtime/process_function_library_runtime.h b/tensorflow/core/common_runtime/process_function_library_runtime.h
index d69e8bc2a049e9d71ca4ef0298dfe0dc058f2c45..c7b8259f7872d623e112a5e0b21a3851461ba803 100644
--- a/tensorflow/core/common_runtime/process_function_library_runtime.h
+++ b/tensorflow/core/common_runtime/process_function_library_runtime.h
@@ -155,7 +155,10 @@ class ProcessFunctionLibraryRuntime {
 
     string target_device() { return target_device_; }
 
-    FunctionLibraryRuntime::LocalHandle local_handle() { return local_handle_; }
+    FunctionLibraryRuntime::LocalHandle local_handle() {
+      mutex_lock l(mu_);
+      return local_handle_;
+    }
 
     // Initializes the FunctionData object by potentially making an Initialize
     // call to the DistributedFunctionLibraryRuntime.
diff --git a/tensorflow/core/common_runtime/process_function_library_runtime_test.cc b/tensorflow/core/common_runtime/process_function_library_runtime_test.cc
index 2da67b084a04067d56f66dfca208287aa04d7b46..4fbf2abc6714bb27abb76b7f7b791868d0b3bdb7 100644
--- a/tensorflow/core/common_runtime/process_function_library_runtime_test.cc
+++ b/tensorflow/core/common_runtime/process_function_library_runtime_test.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/public/session_options.h"
 #include "tensorflow/core/public/version.h"
@@ -132,7 +133,7 @@ class ProcessFunctionLibraryRuntimeTest : public ::testing::Test {
                    });
     done2.WaitForNotification();
     EXPECT_TRUE(errors::IsNotFound(status));
-    EXPECT_TRUE(StringPiece(status.error_message()).contains("not found."));
+    EXPECT_TRUE(str_util::StrContains(status.error_message(), "not found."));
 
     return Status::OK();
   }
diff --git a/tensorflow/core/common_runtime/process_util.cc b/tensorflow/core/common_runtime/process_util.cc
index d738fef7be5b04f166651644542b2eadbe38715d..7ff360ee2677de033a2a99b4656d5f2044521efd 100644
--- a/tensorflow/core/common_runtime/process_util.cc
+++ b/tensorflow/core/common_runtime/process_util.cc
@@ -15,6 +15,9 @@ limitations under the License.
 
 #include "tensorflow/core/common_runtime/process_util.h"
 
+#ifdef INTEL_MKL
+#include <omp.h>
+#endif
 #include <string.h>
 
 #include "tensorflow/core/lib/core/threadpool.h"
@@ -46,6 +49,34 @@ thread::ThreadPool* ComputePool(const SessionOptions& options) {
   return compute_pool;
 }
 
+int32 NumInterOpThreadsFromSessionOptions(const SessionOptions& options) {
+  const int32 inter_op = options.config.inter_op_parallelism_threads();
+  if (inter_op != 0) return inter_op;
+#ifdef INTEL_MKL
+  // MKL library executes ops in parallel using OMP threads
+  // Set inter_op conservatively to avoid thread oversubscription that could 
+  // lead to severe perf degradations and OMP resource exhaustion
+  const int mkl_intra_op = omp_get_max_threads();
+  CHECK_GE(mkl_intra_op, 1);
+  const int32 mkl_inter_op = std::max(
+          (port::NumSchedulableCPUs() + mkl_intra_op - 1) / mkl_intra_op, 2);
+  VLOG(0) << "Creating new thread pool with default inter op setting: "
+          << mkl_inter_op
+          << ". Tune using inter_op_parallelism_threads for best performance.";
+  return mkl_inter_op;
+#else
+  // Default to using the number of cores available in the process.
+  return port::NumSchedulableCPUs();
+#endif
+}
+
+thread::ThreadPool* NewThreadPoolFromSessionOptions(
+    const SessionOptions& options) {
+  const int32 num_threads = NumInterOpThreadsFromSessionOptions(options);
+  VLOG(1) << "Direct session inter op parallelism threads: " << num_threads;
+  return new thread::ThreadPool(options.env, "Compute", num_threads);
+}
+
 void SchedClosure(std::function<void()> closure) {
   if (port::Tracing::IsActive()) {
     const uint64 id = port::Tracing::UniqueId();
diff --git a/tensorflow/core/common_runtime/process_util.h b/tensorflow/core/common_runtime/process_util.h
index fc3a262fe1c14856819361f29ea9066193181695..5d9266671617320eea4cea60de1ebd7210f3b674 100644
--- a/tensorflow/core/common_runtime/process_util.h
+++ b/tensorflow/core/common_runtime/process_util.h
@@ -30,6 +30,13 @@ namespace tensorflow {
 // using 'options'.  Caller does not take ownership over threadpool.
 thread::ThreadPool* ComputePool(const SessionOptions& options);
 
+// Returns number of inter op threads.
+int32 NumInterOpThreadsFromSessionOptions(const SessionOptions& options);
+
+// Creates a thread pool with number of inter op threads.
+thread::ThreadPool* NewThreadPoolFromSessionOptions(
+    const SessionOptions& options);
+
 // Schedule "closure" in the default thread queue.
 void SchedClosure(std::function<void()> closure);
 
diff --git a/tensorflow/core/common_runtime/process_util_test.cc b/tensorflow/core/common_runtime/process_util_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..46672ac92eef1c1da608720f17a30417f92d04bd
--- /dev/null
+++ b/tensorflow/core/common_runtime/process_util_test.cc
@@ -0,0 +1,38 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/common_runtime/process_util.h"
+
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace {
+
+TEST(ProcessUtilTest, NumThreads) {
+  SessionOptions opts;
+  opts.config.set_inter_op_parallelism_threads(10);
+  EXPECT_EQ(10, NumInterOpThreadsFromSessionOptions(opts));
+}
+
+TEST(ProcessUtilTest, ThreadPool) {
+  SessionOptions opts;
+  opts.config.set_inter_op_parallelism_threads(10);
+
+  thread::ThreadPool* pool = NewThreadPoolFromSessionOptions(opts);
+  EXPECT_EQ(10, pool->NumThreads());
+  delete pool;
+}
+
+}  // anonymous namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/scoped_allocator_mgr.cc b/tensorflow/core/common_runtime/scoped_allocator_mgr.cc
index e1f70404e32edaabca95f913cf0bb86080f8b411..be79cc4507124fbbef8104f87773045c07005ef6 100644
--- a/tensorflow/core/common_runtime/scoped_allocator_mgr.cc
+++ b/tensorflow/core/common_runtime/scoped_allocator_mgr.cc
@@ -103,7 +103,7 @@ ScopedAllocatorContainer::~ScopedAllocatorContainer() {
   // In normal execution the table should be empty and all of its
   // contents deleted via Drop.  When when a step ends early
   // (e.g. through abnormal termination) we need to clean up
-  // explicitly.  So long as graph exection of the associated step has
+  // explicitly.  So long as graph execution of the associated step has
   // completey terminated this should be safe.
   for (auto& it : allocators_) {
     if (it.second.field_index == ScopedAllocator::kBackingIndex) {
diff --git a/tensorflow/core/common_runtime/session_test.cc b/tensorflow/core/common_runtime/session_test.cc
index a074154450694e2135e07f345082393773d97084..feaf29c7bb528c6019da3ae273681997173fd372 100644
--- a/tensorflow/core/common_runtime/session_test.cc
+++ b/tensorflow/core/common_runtime/session_test.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/core/public/session.h"
 
 #include "tensorflow/core/common_runtime/session_factory.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/public/session_options.h"
 
@@ -31,10 +32,9 @@ TEST(SessionTest, InvalidTargetReturnsNull) {
   Session* session;
   Status s = tensorflow::NewSession(options, &session);
   EXPECT_EQ(s.code(), error::NOT_FOUND);
-  EXPECT_TRUE(
-      StringPiece(s.error_message())
-          .contains(
-              "No session factory registered for the given session options"));
+  EXPECT_TRUE(str_util::StrContains(
+      s.error_message(),
+      "No session factory registered for the given session options"));
 }
 
 // Register a fake session factory to test error handling paths in
@@ -44,7 +44,7 @@ class FakeSessionFactory : public SessionFactory {
   FakeSessionFactory() {}
 
   bool AcceptsOptions(const SessionOptions& options) override {
-    return StringPiece(options.target).starts_with("fake");
+    return str_util::StartsWith(options.target, "fake");
   }
 
   Session* NewSession(const SessionOptions& options) override {
@@ -68,9 +68,9 @@ TEST(SessionTest, MultipleFactoriesForTarget) {
   Status s = tensorflow::NewSession(options, &session);
   EXPECT_EQ(s.code(), error::INTERNAL);
   EXPECT_TRUE(
-      StringPiece(s.error_message()).contains("Multiple session factories"));
-  EXPECT_TRUE(StringPiece(s.error_message()).contains("FAKE_SESSION_1"));
-  EXPECT_TRUE(StringPiece(s.error_message()).contains("FAKE_SESSION_2"));
+      str_util::StrContains(s.error_message(), "Multiple session factories"));
+  EXPECT_TRUE(str_util::StrContains(s.error_message(), "FAKE_SESSION_1"));
+  EXPECT_TRUE(str_util::StrContains(s.error_message(), "FAKE_SESSION_2"));
 }
 
 }  // namespace
diff --git a/tensorflow/core/common_runtime/shape_refiner.cc b/tensorflow/core/common_runtime/shape_refiner.cc
index cef50be3b1566de9f05b14783212f90da3107fc6..06dbe049868b2f85e8ebcabe4df5cec2170486b4 100644
--- a/tensorflow/core/common_runtime/shape_refiner.cc
+++ b/tensorflow/core/common_runtime/shape_refiner.cc
@@ -351,6 +351,11 @@ Status ShapeRefiner::UpdateNode(const Node* node, bool relax, bool* refined) {
         }
       }
     }
+    if (node_context->requested_input_tensor_as_partial_shape(dst_input)) {
+      // The input value may have changed. Since we have no way to know if
+      // that's indeed the case, err on the safe side.
+      *refined = true;
+    }
 
     // Also propagate handle shape and dtype of edges which are carrying
     // resource handles.
@@ -426,6 +431,32 @@ Status ShapeRefiner::ConstantPartialShape(InferenceContext* target_context,
   InferenceContext* src_context = GetContext(input_edge->src());
   if (src_context == nullptr) return errors::Internal("Missing src context");
   ShapeHandle src_shape = src_context->output(input_edge->src_output());
+
+  if (src_context->Value(src_context->Rank(src_shape)) == 0) {
+    Tensor t;
+    bool evaluated = false;
+    TF_RETURN_IF_ERROR(
+        EvaluateConstantTensorForEdge(node, dst_idx, &evaluated, &t));
+    if (!evaluated) {
+      return errors::InvalidArgument(
+          "Received a shape scalar with unknown static value.  A static value "
+          "of '-1' is required to represent an unknown shape.");
+    }
+    if (t.dims() == 0) {
+      if (t.dtype() == DT_INT32 && t.scalar<int32>()() == -1) {
+        *result = target_context->UnknownShape();
+        return Status::OK();
+      } else if (t.dtype() == DT_INT64 && t.scalar<int64>()() == -1) {
+        *result = target_context->UnknownShape();
+        return Status::OK();
+      }
+    }
+    return errors::InvalidArgument(
+        "Received an invalid shape scalar with a static value that is not "
+        "'-1': ",
+        t.DebugString());
+  }
+
   TF_RETURN_IF_ERROR(src_context->WithRank(src_shape, 1, &src_shape));
 
   const string& src_op = input_edge->src()->type_string();
diff --git a/tensorflow/core/common_runtime/shape_refiner_test.cc b/tensorflow/core/common_runtime/shape_refiner_test.cc
index adf5a9afff2ebc6848db8811506ebd4a031df2bb..f48638afc0f602e4b0c1376f7e5732f3c637d025 100644
--- a/tensorflow/core/common_runtime/shape_refiner_test.cc
+++ b/tensorflow/core/common_runtime/shape_refiner_test.cc
@@ -26,6 +26,7 @@ limitations under the License.
 #include "tensorflow/core/graph/testlib.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/public/version.h"
 
@@ -143,8 +144,8 @@ TEST_F(ShapeRefinerTest, BadShapes) {
   // an error.
   Status s = m.AddNode(mm.node());
   ASSERT_FALSE(s.ok());
-  ASSERT_TRUE(StringPiece(s.error_message())
-                  .contains("Dimensions must be equal, but are 1 and 2"));
+  ASSERT_TRUE(str_util::StrContains(
+      s.error_message(), "Dimensions must be equal, but are 1 and 2"));
 }
 
 TEST_F(ShapeRefinerTest, SetShape) {
@@ -1032,8 +1033,8 @@ TEST_F(ShapeRefinerTest, ConstantValueAsShape_PackInvalidInput) {
     TF_ASSERT_OK(m.AddNode(input.node()));
   }
   TF_ASSERT_OK(m.AddNode(pack.node()));
-  EXPECT_TRUE(
-      StringPiece(m.AddNode(result).error_message()).contains("but is rank 2"));
+  EXPECT_TRUE(str_util::StrContains(m.AddNode(result).error_message(),
+                                    "but is rank 2"));
 }
 
 TEST_F(ShapeRefinerTest, ConstantValueAsShape_Concat) {
diff --git a/tensorflow/core/common_runtime/single_threaded_cpu_device.h b/tensorflow/core/common_runtime/single_threaded_cpu_device.h
new file mode 100644
index 0000000000000000000000000000000000000000..04d5af9087059232097e7aebeb32141a3046ee63
--- /dev/null
+++ b/tensorflow/core/common_runtime/single_threaded_cpu_device.h
@@ -0,0 +1,82 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_SINGLE_THREADED_CPU_DEVICE_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_SINGLE_THREADED_CPU_DEVICE_H_
+
+#define EIGEN_USE_THREADS
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/common_runtime/eigen_thread_pool.h"
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/lib/core/threadpool.h"
+
+namespace tensorflow {
+
+class Env;
+
+// A simple single-threaded CPU device. This can be used to run inexpensive
+// computations. In particular, using this avoids initializing the global thread
+// pools in LocalDevice.
+class SingleThreadedCpuDevice : public Device {
+ public:
+  SingleThreadedCpuDevice(Env* env)
+      : Device(env, Device::BuildDeviceAttributes("/device:CPU:0", DEVICE_CPU,
+                                                  Bytes(256 << 20),
+                                                  DeviceLocality())) {
+    eigen_worker_threads_.num_threads = 1;
+    eigen_worker_threads_.workers = new thread::ThreadPool(
+        env, "graph_runner", eigen_worker_threads_.num_threads);
+    eigen_threadpool_wrapper_.reset(
+        new EigenThreadPoolWrapper(eigen_worker_threads_.workers));
+    eigen_device_.reset(new Eigen::ThreadPoolDevice(
+        eigen_threadpool_wrapper_.get(), eigen_worker_threads_.num_threads));
+    set_tensorflow_cpu_worker_threads(&eigen_worker_threads_);
+    set_eigen_cpu_device(eigen_device_.get());
+  }
+
+  ~SingleThreadedCpuDevice() override {
+    eigen_threadpool_wrapper_.reset();
+    eigen_device_.reset();
+    delete eigen_worker_threads_.workers;
+  }
+
+  Status Sync() override { return Status::OK(); }
+
+  Status MakeTensorFromProto(const TensorProto& tensor_proto,
+                             const AllocatorAttributes alloc_attrs,
+                             Tensor* tensor) override {
+    Tensor parsed(tensor_proto.dtype());
+    if (!parsed.FromProto(cpu_allocator(), tensor_proto)) {
+      return errors::InvalidArgument("Cannot parse tensor from tensor_proto.");
+    }
+    *tensor = parsed;
+    return Status::OK();
+  }
+
+  Allocator* GetAllocator(AllocatorAttributes attr) override {
+    return cpu_allocator();
+  }
+
+ private:
+  DeviceBase::CpuWorkerThreads eigen_worker_threads_;
+  std::unique_ptr<Eigen::ThreadPoolInterface> eigen_threadpool_wrapper_;
+  std::unique_ptr<Eigen::ThreadPoolDevice> eigen_device_;
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_SINGLE_THREADED_CPU_DEVICE_H_
diff --git a/tensorflow/core/framework/visitable_allocator.h b/tensorflow/core/common_runtime/visitable_allocator.h
similarity index 94%
rename from tensorflow/core/framework/visitable_allocator.h
rename to tensorflow/core/common_runtime/visitable_allocator.h
index ed41b05531acaa1be803ac533854efe6160691b4..8edf922d11ee1662b78771bfdc7c38e0144aee19 100644
--- a/tensorflow/core/framework/visitable_allocator.h
+++ b/tensorflow/core/common_runtime/visitable_allocator.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_CORE_FRAMEWORK_VISITABLE_ALLOCATOR_H_
-#define TENSORFLOW_CORE_FRAMEWORK_VISITABLE_ALLOCATOR_H_
+#ifndef TENSORFLOW_COMMON_RUNTIME_VISITABLE_ALLOCATOR_H_
+#define TENSORFLOW_COMMON_RUNTIME_VISITABLE_ALLOCATOR_H_
 
 #include <functional>
 #include "tensorflow/core/framework/allocator.h"
@@ -76,4 +76,4 @@ class TrackingVisitableAllocator : public TrackingAllocator,
   VisitableAllocator* allocator_;
 };
 }  // namespace tensorflow
-#endif  // TENSORFLOW_CORE_FRAMEWORK_VISITABLE_ALLOCATOR_H_
+#endif  // TENSORFLOW_COMMON_RUNTIME_VISITABLE_ALLOCATOR_H_
diff --git a/tensorflow/core/debug/BUILD b/tensorflow/core/debug/BUILD
index f6fe9edb022dce29286190e9948f385b933c5a07..5fab740e920519abe8ec109615b75555593ec4c8 100644
--- a/tensorflow/core/debug/BUILD
+++ b/tensorflow/core/debug/BUILD
@@ -339,18 +339,3 @@ cc_library(
 #     ],
 #     visibility = ["//visibility:public"],
 # )
-
-# -----------------------------------------------------------------------------
-# Google-internal targets.  These must be at the end for syncrepo.
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/core/distributed_runtime/BUILD b/tensorflow/core/distributed_runtime/BUILD
index 434626bd2da57ce4c4895017c0bb0abef58c6f44..b07cb8cdcb3198a5ca5d63816ce501ef615039f8 100644
--- a/tensorflow/core/distributed_runtime/BUILD
+++ b/tensorflow/core/distributed_runtime/BUILD
@@ -7,18 +7,6 @@ licenses(["notice"])  # Apache 2.0
 
 exports_files(["LICENSE"])
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 filegroup(
     name = "c_srcs",
     data = glob([
diff --git a/tensorflow/core/distributed_runtime/base_rendezvous_mgr.cc b/tensorflow/core/distributed_runtime/base_rendezvous_mgr.cc
index 049eec347c672de8e12f44eda9a6bebccb68043c..bafd9bfc68a3bd79492ec220257a0c145e535455 100644
--- a/tensorflow/core/distributed_runtime/base_rendezvous_mgr.cc
+++ b/tensorflow/core/distributed_runtime/base_rendezvous_mgr.cc
@@ -144,9 +144,9 @@ BaseRemoteRendezvous::~BaseRemoteRendezvous() {
 // Returns true if "device_name" is a valid full name of local device
 // of the "worker".  This helper is purely based on the worker name
 // and device name and does no lookups in the worker->device_mgr.
-static bool IsLocalDevice(const string& worker_name,
+static bool IsLocalDevice(const StringPiece worker_name,
                           const StringPiece device_name) {
-  return device_name.starts_with(worker_name);
+  return str_util::StartsWith(device_name, worker_name);
 }
 
 Status BaseRemoteRendezvous::Initialize(WorkerSession* session) {
diff --git a/tensorflow/core/distributed_runtime/local_master.cc b/tensorflow/core/distributed_runtime/local_master.cc
index aaa4cfa7341c42bf9f7302e8ef30a28b68e6213c..76315462a738b7e70ce8c1f9ca5776d0037e22f9 100644
--- a/tensorflow/core/distributed_runtime/local_master.cc
+++ b/tensorflow/core/distributed_runtime/local_master.cc
@@ -157,6 +157,47 @@ Status LocalMaster::Reset(CallOptions* call_options,
   return ret;
 }
 
+Status LocalMaster::MakeCallable(CallOptions* call_options,
+                                 const MakeCallableRequest* request,
+                                 MakeCallableResponse* response) {
+  Notification n;
+  Status ret;
+  master_impl_->MakeCallable(request, response, [&n, &ret](const Status& s) {
+    ret.Update(s);
+    n.Notify();
+  });
+  TF_RETURN_IF_ERROR(
+      WaitForNotification(call_options, default_timeout_in_ms_, &n));
+  return ret;
+}
+Status LocalMaster::RunCallable(CallOptions* call_options,
+                                const RunCallableRequest* request,
+                                RunCallableResponse* response) {
+  Notification n;
+  Status ret;
+  master_impl_->RunCallable(call_options, request, response,
+                            [&n, &ret](const Status& s) {
+                              ret.Update(s);
+                              n.Notify();
+                            });
+  TF_RETURN_IF_ERROR(
+      WaitForNotification(call_options, default_timeout_in_ms_, &n));
+  return ret;
+}
+Status LocalMaster::ReleaseCallable(CallOptions* call_options,
+                                    const ReleaseCallableRequest* request,
+                                    ReleaseCallableResponse* response) {
+  Notification n;
+  Status ret;
+  master_impl_->ReleaseCallable(request, response, [&n, &ret](const Status& s) {
+    ret.Update(s);
+    n.Notify();
+  });
+  TF_RETURN_IF_ERROR(
+      WaitForNotification(call_options, default_timeout_in_ms_, &n));
+  return ret;
+}
+
 namespace {
 mutex* get_local_master_registry_lock() {
   static mutex local_master_registry_lock(LINKER_INITIALIZED);
diff --git a/tensorflow/core/distributed_runtime/local_master.h b/tensorflow/core/distributed_runtime/local_master.h
index c20b40329ab1712b3dd0cae673d337481ee40196..cad6babad82b9b2ac2953f5497e46bb471699b10 100644
--- a/tensorflow/core/distributed_runtime/local_master.h
+++ b/tensorflow/core/distributed_runtime/local_master.h
@@ -71,6 +71,16 @@ class LocalMaster : public MasterInterface {
   Status Reset(CallOptions* call_options, const ResetRequest* request,
                ResetResponse* response) override;
 
+  Status MakeCallable(CallOptions* call_options,
+                      const MakeCallableRequest* request,
+                      MakeCallableResponse* response) override;
+  Status RunCallable(CallOptions* call_options,
+                     const RunCallableRequest* request,
+                     RunCallableResponse* response) override;
+  Status ReleaseCallable(CallOptions* call_options,
+                         const ReleaseCallableRequest* request,
+                         ReleaseCallableResponse* response);
+
   // Registers the mapping from the given `target` to the given `master`.
   //
   // WARNING: The `master` pointer remains owned by the caller. It is
diff --git a/tensorflow/core/distributed_runtime/master.cc b/tensorflow/core/distributed_runtime/master.cc
index 1a488303ac73b8628b9d3fe4050ad9144724348e..f47502e844f70ed4005e2cd95220fc04341b8bc2 100644
--- a/tensorflow/core/distributed_runtime/master.cc
+++ b/tensorflow/core/distributed_runtime/master.cc
@@ -611,4 +611,55 @@ void Master::Reset(const ResetRequest* req, ResetResponse* resp,
   });
 }
 
+void Master::MakeCallable(const MakeCallableRequest* req,
+                          MakeCallableResponse* resp, MyClosure done) {
+  auto session = FindMasterSession(req->session_handle());
+  if (session == nullptr) {
+    done(errors::Aborted("Session ", req->session_handle(), " is not found."));
+    return;
+  }
+
+  SchedClosure(std::bind(
+      [this, session, req, resp](MyClosure done) {
+        Status s = session->MakeCallable(*req, resp);
+        session->Unref();
+        done(s);
+      },
+      std::move(done)));
+}
+
+void Master::RunCallable(CallOptions* opts, const RunCallableRequest* req,
+                         RunCallableResponse* resp, MyClosure done) {
+  auto session = FindMasterSession(req->session_handle());
+  if (session == nullptr) {
+    done(errors::Aborted("Session ", req->session_handle(), " is not found."));
+    return;
+  }
+
+  SchedClosure(std::bind(
+      [this, session, opts, req, resp](MyClosure done) {
+        Status s = session->RunCallable(opts, *req, resp);
+        session->Unref();
+        done(s);
+      },
+      std::move(done)));
+}
+
+void Master::ReleaseCallable(const ReleaseCallableRequest* req,
+                             ReleaseCallableResponse* resp, MyClosure done) {
+  auto session = FindMasterSession(req->session_handle());
+  if (session == nullptr) {
+    done(errors::Aborted("Session ", req->session_handle(), " is not found."));
+    return;
+  }
+
+  SchedClosure(std::bind(
+      [this, session, req, resp](MyClosure done) {
+        Status s = session->ReleaseCallable(*req, resp);
+        session->Unref();
+        done(s);
+      },
+      std::move(done)));
+}
+
 }  // end namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/master.h b/tensorflow/core/distributed_runtime/master.h
index 678fc46bd7f4644022c5811f8a1c7a9f606be111..dbb337fd484960fbd3bfe47d0bfe0497985de66f 100644
--- a/tensorflow/core/distributed_runtime/master.h
+++ b/tensorflow/core/distributed_runtime/master.h
@@ -61,6 +61,13 @@ class Master {
   // See tensorflow::Reset() and the comment on ResetRequest.
   void Reset(const ResetRequest* req, ResetResponse* resp, MyClosure done);
 
+  void MakeCallable(const MakeCallableRequest* req, MakeCallableResponse* resp,
+                    MyClosure done);
+  void RunCallable(CallOptions* opts, const RunCallableRequest* req,
+                   RunCallableResponse* resp, MyClosure done);
+  void ReleaseCallable(const ReleaseCallableRequest* req,
+                       ReleaseCallableResponse* resp, MyClosure done);
+
  private:
   typedef Master ME;
 
diff --git a/tensorflow/core/distributed_runtime/master_interface.h b/tensorflow/core/distributed_runtime/master_interface.h
index bf6a2db3e27b301c01ca7d5073d175b24417220f..a8ae3cba3cdd3f02aae823d893e027b2bccae2c9 100644
--- a/tensorflow/core/distributed_runtime/master_interface.h
+++ b/tensorflow/core/distributed_runtime/master_interface.h
@@ -89,6 +89,16 @@ class MasterInterface {
   virtual Status Reset(CallOptions* call_options, const ResetRequest* request,
                        ResetResponse* response) = 0;
 
+  virtual Status MakeCallable(CallOptions* call_options,
+                              const MakeCallableRequest* request,
+                              MakeCallableResponse* response) = 0;
+  virtual Status RunCallable(CallOptions* call_options,
+                             const RunCallableRequest* request,
+                             RunCallableResponse* response) = 0;
+  virtual Status ReleaseCallable(CallOptions* call_options,
+                                 const ReleaseCallableRequest* request,
+                                 ReleaseCallableResponse* response) = 0;
+
  protected:
   // NOTE: This should only be called by implementations of this
   // interface whose CreateRunStepResponse() method returns a
diff --git a/tensorflow/core/distributed_runtime/master_session.cc b/tensorflow/core/distributed_runtime/master_session.cc
index 01da54fcb3ce8d5467bbc15d8db3bc970d8727b5..e0a5bb4c537095d2d56ee0844d67443e73e9756f 100644
--- a/tensorflow/core/distributed_runtime/master_session.cc
+++ b/tensorflow/core/distributed_runtime/master_session.cc
@@ -66,13 +66,13 @@ class MasterSession::ReffedClientGraph : public core::RefCounted {
                     std::unique_ptr<ClientGraph> cg,
                     const SessionOptions& session_opts,
                     const StatsPublisherFactory& stats_publisher_factory,
-                    GraphExecutionState* execution_state, bool is_partial,
-                    WorkerCacheInterface* worker_cache, bool should_deregister)
+                    bool is_partial, WorkerCacheInterface* worker_cache,
+                    bool should_deregister)
       : session_handle_(handle),
         client_graph_(std::move(cg)),
         session_opts_(session_opts),
         is_partial_(is_partial),
-        debug_opts_(bopts.callable_options.run_options().debug_options()),
+        callable_opts_(bopts.callable_options),
         worker_cache_(worker_cache),
         should_deregister_(should_deregister) {
     VLOG(1) << "Created ReffedClientGraph for node with "
@@ -80,8 +80,8 @@ class MasterSession::ReffedClientGraph : public core::RefCounted {
 
     stats_publisher_ = stats_publisher_factory(handle, bopts, session_opts);
 
-    // Initialize a name to node map for testing that fetches are reachable.
-    for (Node* n : execution_state->full_graph()->nodes()) {
+    // Initialize a name to node map for processing device stats.
+    for (Node* n : client_graph_->graph.nodes()) {
       name_to_node_.insert({n->name(), n});
     }
   }
@@ -94,12 +94,18 @@ class MasterSession::ReffedClientGraph : public core::RefCounted {
 
   const ClientGraph* client_graph() { return client_graph_.get(); }
 
+  const CallableOptions& callable_options() { return callable_opts_; }
+
   std::unique_ptr<ProfileHandler> GetProfileHandler(uint64 step,
                                                     int64 execution_count,
                                                     const RunOptions& ropts) {
     return stats_publisher_->GetProfileHandler(step, execution_count, ropts);
   }
 
+  int64 get_and_increment_execution_count() {
+    return execution_count_.fetch_add(1);
+  }
+
   // Turn RPC logging on or off, both at the WorkerCache used by this
   // master process, and at each remote worker in use for the current
   // partitions.
@@ -178,6 +184,10 @@ class MasterSession::ReffedClientGraph : public core::RefCounted {
                        CallOptions* opts, const RunStepRequestWrapper& req,
                        MutableRunStepResponseWrapper* resp,
                        CancellationManager* cm, const bool is_last_partial_run);
+  Status RunPartitions(const MasterEnv* env, int64 step_id,
+                       int64 execution_count, PerStepState* pss,
+                       CallOptions* call_opts, const RunCallableRequest& req,
+                       RunCallableResponse* resp, CancellationManager* cm);
 
   // Calls workers to cleanup states for the step "step_id".  Calls
   // `done` when all cleanup RPCs have completed.
@@ -211,10 +221,11 @@ class MasterSession::ReffedClientGraph : public core::RefCounted {
   const std::unique_ptr<ClientGraph> client_graph_;
   const SessionOptions session_opts_;
   const bool is_partial_;
-  const DebugOptions& debug_opts_;
+  const CallableOptions callable_opts_;
   WorkerCacheInterface* const worker_cache_;  // Not owned.
   std::unordered_map<StringPiece, Node*, StringPieceHasher> name_to_node_;
   const bool should_deregister_;
+  std::atomic<int64> execution_count_ = {0};
 
   // Graph partitioned into per-location subgraphs.
   struct Part {
@@ -269,6 +280,17 @@ class MasterSession::ReffedClientGraph : public core::RefCounted {
       const PartitionOptions& popts,
       std::unordered_map<string, GraphDef> graph_partitions);
 
+  // Prepares a number of calls to workers. One call per partition.
+  // This is a generic method that handles Run, PartialRun, and RunCallable.
+  template <class FetchListType, class ClientRequestType,
+            class ClientResponseType>
+  Status RunPartitionsHelper(
+      const std::unordered_map<StringPiece, size_t, StringPieceHasher>& feeds,
+      const FetchListType& fetches, const MasterEnv* env, int64 step_id,
+      int64 execution_count, PerStepState* pss, CallOptions* call_opts,
+      const ClientRequestType& req, ClientResponseType* resp,
+      CancellationManager* cm, bool is_last_partial_run);
+
   // Deregisters the partitions on the workers.  Called in the
   // destructor and does not wait for the rpc completion.
   void DeregisterPartitions();
@@ -411,7 +433,8 @@ Status MasterSession::ReffedClientGraph::DoRegisterPartitions(
     c->req.set_session_handle(session_handle_);
     c->req.mutable_graph_def()->Swap(&graph_partitions[part.name]);
     *c->req.mutable_graph_options() = session_opts_.config.graph_options();
-    *c->req.mutable_debug_options() = debug_opts_;
+    *c->req.mutable_debug_options() =
+        callable_opts_.run_options().debug_options();
     VLOG(2) << "Register " << c->req.graph_def().DebugString();
     auto cb = [c, &done](const Status& s) {
       c->status = s;
@@ -490,24 +513,46 @@ class RunManyGraphs {
   TF_DISALLOW_COPY_AND_ASSIGN(RunManyGraphs);
 };
 
-Status MasterSession::ReffedClientGraph::RunPartitions(
-    const MasterEnv* env, int64 step_id, int64 execution_count,
-    PerStepState* pss, CallOptions* call_opts, const RunStepRequestWrapper& req,
-    MutableRunStepResponseWrapper* resp, CancellationManager* cm,
-    const bool is_last_partial_run) {
-  VLOG(2) << "RunPartitions step_id " << step_id << " execution_count "
-          << execution_count;
-  // Maps the names of fed tensors to their index in `req`.
-  std::unordered_map<StringPiece, size_t, StringPieceHasher> feeds(3);
+namespace {
+Status AddSendFromClientRequest(const RunStepRequestWrapper& client_req,
+                                MutableRunGraphRequestWrapper* worker_req,
+                                size_t index, const string& send_key) {
+  return worker_req->AddSendFromRunStepRequest(client_req, index, send_key);
+}
 
-  for (size_t i = 0; i < req.num_feeds(); ++i) {
-    if (!feeds.insert({req.feed_name(i), i}).second) {
-      return errors::InvalidArgument("Duplicated feeds: ", req.feed_name(i));
-    }
-  }
+Status AddSendFromClientRequest(const RunCallableRequest& client_req,
+                                MutableRunGraphRequestWrapper* worker_req,
+                                size_t index, const string& send_key) {
+  return worker_req->AddSendFromRunCallableRequest(client_req, index, send_key);
+}
 
-  // Prepares a number of calls to workers. One call per partition.
+// TODO(mrry): Add a full-fledged wrapper that avoids TensorProto copies for
+// in-process messages.
+struct RunCallableResponseWrapper {
+  RunCallableResponse* resp;  // Not owned.
+  std::unordered_map<string, TensorProto> fetch_key_to_protos;
+
+  RunMetadata* mutable_metadata() { return resp->mutable_metadata(); }
 
+  Status AddTensorFromRunGraphResponse(
+      const string& tensor_name, MutableRunGraphResponseWrapper* worker_resp,
+      size_t index) {
+    // TODO(b/74355905): Add a specialized implementation that avoids
+    // copying the tensor into the RunCallableResponse when at least
+    // two of the {client, master, worker} are in the same process.
+    return worker_resp->RecvValue(index, &fetch_key_to_protos[tensor_name]);
+  }
+};
+}  // namespace
+
+template <class FetchListType, class ClientRequestType,
+          class ClientResponseType>
+Status MasterSession::ReffedClientGraph::RunPartitionsHelper(
+    const std::unordered_map<StringPiece, size_t, StringPieceHasher>& feeds,
+    const FetchListType& fetches, const MasterEnv* env, int64 step_id,
+    int64 execution_count, PerStepState* pss, CallOptions* call_opts,
+    const ClientRequestType& req, ClientResponseType* resp,
+    CancellationManager* cm, bool is_last_partial_run) {
   // Collect execution cost stats on a smoothly decreasing frequency.
   ExecutorOpts exec_opts;
   if (pss->report_tensor_allocations_upon_oom) {
@@ -553,28 +598,19 @@ Status MasterSession::ReffedClientGraph::RunPartitions(
     // We keep these as separate paths for now, to ensure we aren't
     // inadvertently slowing down the normal run path.
     if (is_partial_) {
-      for (size_t i = 0; i < req.num_feeds(); ++i) {
-        const string& name = req.feed_name(i);
-        const auto iter = part.feed_key.find(name);
+      for (const auto& name_index : feeds) {
+        const auto iter = part.feed_key.find(name_index.first.ToString());
         if (iter == part.feed_key.end()) {
           // The provided feed must be for a different partition.
           continue;
         }
         const string& key = iter->second;
-        auto feeds_iter = feeds.find(name);
-        if (feeds_iter == feeds.end()) {
-          return errors::InvalidArgument("No feed is provided for feed=", name,
-                                         ", key=", key);
-        } else if (feeds_iter->second != static_cast<size_t>(i)) {
-          return errors::Internal("Cannot find feed named \"", name,
-                                  " in request.");
-        }
-        TF_RETURN_IF_ERROR(c->req->AddSendFromRunStepRequest(req, i, key));
+        TF_RETURN_IF_ERROR(AddSendFromClientRequest(req, c->req.get(),
+                                                    name_index.second, key));
       }
       // TODO(suharshs): Make a map from feed to fetch_key to make this faster.
       // For now, we just iterate through partitions to find the matching key.
-      for (int i = 0; static_cast<size_t>(i) < req.num_fetches(); ++i) {
-        const string& req_fetch = req.fetch_name(i);
+      for (const string& req_fetch : fetches) {
         for (const auto& key_fetch : part.key_fetch) {
           if (key_fetch.second == req_fetch) {
             c->req->add_recv_key(key_fetch.first);
@@ -586,9 +622,13 @@ Status MasterSession::ReffedClientGraph::RunPartitions(
       for (const auto& feed_key : part.feed_key) {
         const string& feed = feed_key.first;
         const string& key = feed_key.second;
-        const int64 feed_index = feeds[feed];
+        auto iter = feeds.find(feed);
+        if (iter == feeds.end()) {
+          return errors::Internal("No feed index found for feed: ", feed);
+        }
+        const int64 feed_index = iter->second;
         TF_RETURN_IF_ERROR(
-            c->req->AddSendFromRunStepRequest(req, feed_index, key));
+            AddSendFromClientRequest(req, c->req.get(), feed_index, key));
       }
       for (const auto& key_fetch : part.key_fetch) {
         const string& key = key_fetch.first;
@@ -622,50 +662,115 @@ Status MasterSession::ReffedClientGraph::RunPartitions(
   } else {
     return errors::Cancelled("Step was cancelled");
   }
+  TF_RETURN_IF_ERROR(calls.status());
 
-  // Collects fetches.
-  Status status = calls.status();
-  if (status.ok()) {
-    for (int i = 0; i < num; ++i) {
-      const Part& part = partitions_[i];
-      MutableRunGraphResponseWrapper* run_graph_resp = calls.get(i)->resp.get();
-      for (size_t j = 0; j < run_graph_resp->num_recvs(); ++j) {
-        auto iter = part.key_fetch.find(run_graph_resp->recv_key(j));
-        if (iter == part.key_fetch.end()) {
-          status.Update(errors::Internal("Unexpected fetch key: ",
-                                         run_graph_resp->recv_key(j)));
-          break;
-        }
-        const string& fetch = iter->second;
-        status.Update(
-            resp->AddTensorFromRunGraphResponse(fetch, run_graph_resp, j));
-        if (!status.ok()) {
-          break;
-        }
+  // Collects fetches and metadata.
+  Status status;
+  for (int i = 0; i < num; ++i) {
+    const Part& part = partitions_[i];
+    MutableRunGraphResponseWrapper* run_graph_resp = calls.get(i)->resp.get();
+    for (size_t j = 0; j < run_graph_resp->num_recvs(); ++j) {
+      auto iter = part.key_fetch.find(run_graph_resp->recv_key(j));
+      if (iter == part.key_fetch.end()) {
+        status.Update(errors::Internal("Unexpected fetch key: ",
+                                       run_graph_resp->recv_key(j)));
+        break;
       }
-      if (pss->collect_timeline) {
-        pss->step_stats[i].Swap(run_graph_resp->mutable_step_stats());
+      const string& fetch = iter->second;
+      status.Update(
+          resp->AddTensorFromRunGraphResponse(fetch, run_graph_resp, j));
+      if (!status.ok()) {
+        break;
       }
-      if (pss->collect_costs) {
-        CostGraphDef* cost_graph = run_graph_resp->mutable_cost_graph();
-        for (int j = 0; j < cost_graph->node_size(); ++j) {
-          resp->mutable_metadata()->mutable_cost_graph()->add_node()->Swap(
-              cost_graph->mutable_node(j));
-        }
+    }
+    if (pss->collect_timeline) {
+      pss->step_stats[i].Swap(run_graph_resp->mutable_step_stats());
+    }
+    if (pss->collect_costs) {
+      CostGraphDef* cost_graph = run_graph_resp->mutable_cost_graph();
+      for (int j = 0; j < cost_graph->node_size(); ++j) {
+        resp->mutable_metadata()->mutable_cost_graph()->add_node()->Swap(
+            cost_graph->mutable_node(j));
       }
-      if (pss->collect_partition_graphs) {
-        protobuf::RepeatedPtrField<GraphDef>* partition_graph_defs =
-            resp->mutable_metadata()->mutable_partition_graphs();
-        for (size_t i = 0; i < run_graph_resp->num_partition_graphs(); i++) {
-          partition_graph_defs->Add()->Swap(
-              run_graph_resp->mutable_partition_graph(i));
-        }
+    }
+    if (pss->collect_partition_graphs) {
+      protobuf::RepeatedPtrField<GraphDef>* partition_graph_defs =
+          resp->mutable_metadata()->mutable_partition_graphs();
+      for (size_t i = 0; i < run_graph_resp->num_partition_graphs(); i++) {
+        partition_graph_defs->Add()->Swap(
+            run_graph_resp->mutable_partition_graph(i));
       }
     }
   }
   return status;
 }
 
+Status MasterSession::ReffedClientGraph::RunPartitions(
+    const MasterEnv* env, int64 step_id, int64 execution_count,
+    PerStepState* pss, CallOptions* call_opts, const RunStepRequestWrapper& req,
+    MutableRunStepResponseWrapper* resp, CancellationManager* cm,
+    const bool is_last_partial_run) {
+  VLOG(2) << "RunPartitions step_id " << step_id << " execution_count "
+          << execution_count;
+  // Maps the names of fed tensors to their index in `req`.
+  std::unordered_map<StringPiece, size_t, StringPieceHasher> feeds(3);
+  for (size_t i = 0; i < req.num_feeds(); ++i) {
+    if (!feeds.insert({req.feed_name(i), i}).second) {
+      return errors::InvalidArgument("Duplicated feeds: ", req.feed_name(i));
+    }
+  }
+
+  std::vector<string> fetches;
+  fetches.reserve(req.num_fetches());
+  for (size_t i = 0; i < req.num_fetches(); ++i) {
+    fetches.push_back(req.fetch_name(i));
+  }
+
+  return RunPartitionsHelper(feeds, fetches, env, step_id, execution_count, pss,
+                             call_opts, req, resp, cm, is_last_partial_run);
+}
+
+Status MasterSession::ReffedClientGraph::RunPartitions(
+    const MasterEnv* env, int64 step_id, int64 execution_count,
+    PerStepState* pss, CallOptions* call_opts, const RunCallableRequest& req,
+    RunCallableResponse* resp, CancellationManager* cm) {
+  VLOG(2) << "RunPartitions step_id " << step_id << " execution_count "
+          << execution_count;
+  // Maps the names of fed tensors to their index in `req`.
+  std::unordered_map<StringPiece, size_t, StringPieceHasher> feeds(3);
+  for (size_t i = 0; i < callable_opts_.feed_size(); ++i) {
+    if (!feeds.insert({callable_opts_.feed(i), i}).second) {
+      // MakeCallable will fail if there are two feeds with the same name.
+      return errors::Internal("Duplicated feeds in callable: ",
+                              callable_opts_.feed(i));
+    }
+  }
+
+  // Create a wrapped response object to collect the fetched values and
+  // rearrange them for the RunCallableResponse.
+  RunCallableResponseWrapper wrapped_resp;
+  wrapped_resp.resp = resp;
+
+  TF_RETURN_IF_ERROR(RunPartitionsHelper(
+      feeds, callable_opts_.fetch(), env, step_id, execution_count, pss,
+      call_opts, req, &wrapped_resp, cm, false /* is_last_partial_run */));
+
+  // Collects fetches.
+  // TODO(b/74355905): Add a specialized implementation that avoids
+  // copying the tensor into the RunCallableResponse when at least
+  // two of the {client, master, worker} are in the same process.
+  for (const string& fetch : callable_opts_.fetch()) {
+    TensorProto* fetch_proto = resp->mutable_fetch()->Add();
+    auto iter = wrapped_resp.fetch_key_to_protos.find(fetch);
+    if (iter == wrapped_resp.fetch_key_to_protos.end()) {
+      return errors::Internal("Worker did not return a value for fetch: ",
+                              fetch);
+    }
+    fetch_proto->Swap(&iter->second);
+  }
+  return Status::OK();
+}
+
 namespace {
 
 class CleanupBroadcastHelper {
@@ -829,8 +934,6 @@ void MasterSession::ReffedClientGraph::ProcessDeviceStats(
 // TODO(suharsh,mrry): Build a map from fetch target to set of feeds it depends
 // on once at setup time to prevent us from computing the dependencies
 // everytime.
-// TODO(suharshs,mrry): Consider removing the need for execution_state to reduce
-// contention.
 Status MasterSession::ReffedClientGraph::CheckFetches(
     const RunStepRequestWrapper& req, const RunState* run_state,
     GraphExecutionState* execution_state) {
@@ -840,8 +943,8 @@ Status MasterSession::ReffedClientGraph::CheckFetches(
     // Skip if already fed.
     if (input.second) continue;
     TensorId id(ParseTensorName(input.first));
-    const auto it = name_to_node_.find(id.first);
-    if (it == name_to_node_.end()) {
+    const Node* n = execution_state->get_node_by_name(id.first.ToString());
+    if (n == nullptr) {
       return errors::NotFound("Feed ", input.first, ": not found");
     }
     pending_feeds.insert(id);
@@ -856,11 +959,11 @@ Status MasterSession::ReffedClientGraph::CheckFetches(
   for (size_t i = 0; i < req.num_fetches(); ++i) {
     const string& fetch = req.fetch_name(i);
     const TensorId id(ParseTensorName(fetch));
-    auto it = name_to_node_.find(id.first);
-    if (it == name_to_node_.end()) {
+    const Node* n = execution_state->get_node_by_name(id.first.ToString());
+    if (n == nullptr) {
       return errors::NotFound("Fetch ", fetch, ": not found");
     }
-    stack.push_back(it->second);
+    stack.push_back(n);
   }
 
   // Any tensor needed for fetches can't be in pending_feeds.
@@ -1268,15 +1371,11 @@ WorkerCacheInterface* MasterSession::get_worker_cache() const {
   return env_->worker_cache;
 }
 
-Status MasterSession::StartStep(const BuildGraphOptions& opts, int64* count,
-                                ReffedClientGraph** rcg, bool is_partial) {
+Status MasterSession::StartStep(const BuildGraphOptions& opts, bool is_partial,
+                                ReffedClientGraph** out_rcg, int64* out_count) {
   const uint64 hash = HashBuildGraphOptions(opts);
   {
     mutex_lock l(mu_);
-    // Keep track of how many times this subgraph has been executed in
-    // this session.
-    int64* c = &subgraph_execution_counts_[hash];
-    *count = (*c)++;
     // TODO(suharshs): We cache partial run graphs and run graphs separately
     // because there is preprocessing that needs to only be run for partial
     // run calls.
@@ -1293,13 +1392,14 @@ Status MasterSession::StartStep(const BuildGraphOptions& opts, int64* count,
       WorkerCacheInterface* worker_cache = get_worker_cache();
       auto entry = new ReffedClientGraph(
           handle_, opts, std::move(client_graph), session_opts_,
-          stats_publisher_factory_, execution_state_.get(), is_partial,
-          worker_cache, !should_delete_worker_sessions_);
+          stats_publisher_factory_, is_partial, worker_cache,
+          !should_delete_worker_sessions_);
       iter = m->insert({hash, entry}).first;
       VLOG(1) << "Preparing to execute new graph";
     }
-    *rcg = iter->second;
-    (*rcg)->Ref();
+    *out_rcg = iter->second;
+    (*out_rcg)->Ref();
+    *out_count = (*out_rcg)->get_and_increment_execution_count();
   }
   return Status::OK();
 }
@@ -1318,6 +1418,12 @@ void MasterSession::ClearRunsTable(std::vector<ReffedClientGraph*>* to_unref,
   rcg_map->clear();
 }
 
+namespace {
+uint64 MakeStepId() {
+  return (random::New64() & ((1uLL << 56) - 1)) | (1uLL << 56);
+}
+}  // namespace
+
 Status MasterSession::PartialRunSetup(const PartialRunSetupRequest* req,
                                       PartialRunSetupResponse* resp) {
   std::vector<string> inputs, outputs, targets;
@@ -1334,15 +1440,15 @@ Status MasterSession::PartialRunSetup(const PartialRunSetupRequest* req,
   string handle = std::to_string(partial_run_handle_counter_.fetch_add(1));
 
   ReffedClientGraph* rcg = nullptr;
-  int64 count = 0;
 
   // Prepare.
   BuildGraphOptions opts;
   BuildBuildGraphOptions(*req, &opts);
-  TF_RETURN_IF_ERROR(StartStep(opts, &count, &rcg, true));
+  int64 count;
+  TF_RETURN_IF_ERROR(StartStep(opts, true, &rcg, &count));
   // Keeps the highest 8 bits 0x01: we reserve some bits of the
   // step_id for future use.
-  uint64 step_id = (random::New64() & ((1uLL << 56) - 1)) | (1uLL << 56);
+  const uint64 step_id = MakeStepId();
   TRACEPRINTF("stepid %llu", step_id);
 
   rcg->Ref();
@@ -1587,6 +1693,73 @@ Status MasterSession::CreateDebuggerState(
   return Status::OK();
 }
 
+void MasterSession::FillPerStepState(MasterSession::ReffedClientGraph* rcg,
+                                     const RunOptions& run_options,
+                                     uint64 step_id, int64 count,
+                                     PerStepState* out_pss,
+                                     std::unique_ptr<ProfileHandler>* out_ph) {
+  out_pss->collect_timeline =
+      run_options.trace_level() == RunOptions::FULL_TRACE;
+  out_pss->collect_rpcs = run_options.trace_level() == RunOptions::FULL_TRACE;
+  out_pss->report_tensor_allocations_upon_oom =
+      run_options.report_tensor_allocations_upon_oom();
+  // Build the cost model every 'build_cost_model_every' steps after skipping an
+  // initial 'build_cost_model_after' steps.
+  const int64 build_cost_model_after =
+      session_opts_.config.graph_options().build_cost_model_after();
+  const int64 build_cost_model_every =
+      session_opts_.config.graph_options().build_cost_model();
+  out_pss->collect_costs =
+      build_cost_model_every > 0 &&
+      ((count + 1 - build_cost_model_after) % build_cost_model_every == 0);
+  out_pss->collect_partition_graphs = run_options.output_partition_graphs();
+
+  *out_ph = rcg->GetProfileHandler(step_id, count, run_options);
+  if (*out_ph) {
+    out_pss->collect_timeline = true;
+    out_pss->collect_rpcs = (*out_ph)->should_collect_rpcs();
+  }
+}
+
+Status MasterSession::PostRunCleanup(MasterSession::ReffedClientGraph* rcg,
+                                     uint64 step_id,
+                                     const RunOptions& run_options,
+                                     PerStepState* pss,
+                                     const std::unique_ptr<ProfileHandler>& ph,
+                                     const Status& run_status,
+                                     RunMetadata* out_run_metadata) {
+  Status s = run_status;
+  if (s.ok()) {
+    pss->end_micros = Env::Default()->NowMicros();
+
+    // Schedule post-processing and cleanup to be done asynchronously.
+    rcg->ProcessStats(step_id, pss, ph.get(), run_options, out_run_metadata);
+  } else if (errors::IsCancelled(s)) {
+    mutex_lock l(mu_);
+    if (closed_) {
+      if (garbage_collected_) {
+        s = errors::Cancelled(
+            "Step was cancelled because the session was garbage collected due "
+            "to inactivity.");
+      } else {
+        s = errors::Cancelled(
+            "Step was cancelled by an explicit call to `Session::Close()`.");
+      }
+    }
+  }
+  Ref();
+  rcg->Ref();
+  rcg->CleanupPartitionsAsync(step_id, [this, rcg](const Status& s) {
+    if (!s.ok()) {
+      LOG(ERROR) << "Cleanup partition error: " << s;
+    }
+    rcg->Unref();
+    MarkRunCompletion();
+    Unref();
+  });
+  return s;
+}
+
 Status MasterSession::DoRunWithLocalExecution(
     CallOptions* opts, const RunStepRequestWrapper& req,
     MutableRunStepResponseWrapper* resp) {
@@ -1599,8 +1772,8 @@ Status MasterSession::DoRunWithLocalExecution(
   BuildGraphOptions bgopts;
   BuildBuildGraphOptions(req, &bgopts);
   ReffedClientGraph* rcg = nullptr;
-  int64 count = 0;
-  TF_RETURN_IF_ERROR(StartStep(bgopts, &count, &rcg, false));
+  int64 count;
+  TF_RETURN_IF_ERROR(StartStep(bgopts, false, &rcg, &count));
 
   // Unref "rcg" when out of scope.
   core::ScopedUnref unref(rcg);
@@ -1616,64 +1789,133 @@ Status MasterSession::DoRunWithLocalExecution(
 
   // Keeps the highest 8 bits 0x01: we reserve some bits of the
   // step_id for future use.
-  const uint64 step_id = (random::New64() & ((1uLL << 56) - 1)) | (1uLL << 56);
+  const uint64 step_id = MakeStepId();
   TRACEPRINTF("stepid %llu", step_id);
 
-  pss.collect_timeline = req.options().trace_level() == RunOptions::FULL_TRACE;
-  pss.collect_rpcs = req.options().trace_level() == RunOptions::FULL_TRACE;
-  pss.report_tensor_allocations_upon_oom =
-      req.options().report_tensor_allocations_upon_oom();
-  // Build the cost model every 'build_cost_model_every' steps after skipping an
-  // initial 'build_cost_model_after' steps.
-  const int64 build_cost_model_after =
-      session_opts_.config.graph_options().build_cost_model_after();
-  const int64 build_cost_model_every =
-      session_opts_.config.graph_options().build_cost_model();
-  pss.collect_costs =
-      build_cost_model_every > 0 &&
-      ((count + 1 - build_cost_model_after) % build_cost_model_every == 0);
-  pss.collect_partition_graphs = req.options().output_partition_graphs();
+  std::unique_ptr<ProfileHandler> ph;
+  FillPerStepState(rcg, req.options(), step_id, count, &pss, &ph);
 
-  std::unique_ptr<ProfileHandler> ph =
-      rcg->GetProfileHandler(step_id, count, req.options());
-  if (ph) {
-    pss.collect_timeline = true;
-    pss.collect_rpcs = ph->should_collect_rpcs();
+  Status s = rcg->RunPartitions(env_, step_id, count, &pss, opts, req, resp,
+                                &cancellation_manager_, false);
+  cleanup.release();  // MarkRunCompletion called in PostRunCleanup().
+  return PostRunCleanup(rcg, step_id, req.options(), &pss, ph, s,
+                        resp->mutable_metadata());
+}
+
+Status MasterSession::MakeCallable(const MakeCallableRequest& req,
+                                   MakeCallableResponse* resp) {
+  UpdateLastAccessTime();
+
+  BuildGraphOptions opts;
+  opts.callable_options = req.options();
+  opts.use_function_convention = false;
+
+  ReffedClientGraph* callable;
+
+  {
+    mutex_lock l(mu_);
+    if (closed_) {
+      return errors::FailedPrecondition("Session is closed.");
+    }
+    std::unique_ptr<ClientGraph> client_graph;
+    TF_RETURN_IF_ERROR(execution_state_->BuildGraph(opts, &client_graph));
+    callable = new ReffedClientGraph(handle_, opts, std::move(client_graph),
+                                     session_opts_, stats_publisher_factory_,
+                                     false /* is_partial */, get_worker_cache(),
+                                     !should_delete_worker_sessions_);
+  }
+
+  Status s = BuildAndRegisterPartitions(callable);
+  if (!s.ok()) {
+    callable->Unref();
+    return s;
   }
 
+  uint64 handle;
+  {
+    mutex_lock l(mu_);
+    handle = next_callable_handle_++;
+    callables_[handle] = callable;
+  }
+
+  resp->set_handle(handle);
+  return Status::OK();
+}
+
+Status MasterSession::DoRunCallable(CallOptions* opts, ReffedClientGraph* rcg,
+                                    const RunCallableRequest& req,
+                                    RunCallableResponse* resp) {
+  VLOG(2) << "DoRunCallable req: " << req.DebugString();
+  PerStepState pss;
+  pss.start_micros = Env::Default()->NowMicros();
+  auto cleanup = gtl::MakeCleanup([this] { MarkRunCompletion(); });
+
+  // Prepare.
+  int64 count = rcg->get_and_increment_execution_count();
+
+  // Keeps the highest 8 bits 0x01: we reserve some bits of the
+  // step_id for future use.
+  const uint64 step_id = MakeStepId();
+  TRACEPRINTF("stepid %llu", step_id);
+
+  const RunOptions& run_options = rcg->callable_options().run_options();
+
+  if (run_options.timeout_in_ms() != 0) {
+    opts->SetTimeout(run_options.timeout_in_ms());
+  }
+
+  std::unique_ptr<ProfileHandler> ph;
+  FillPerStepState(rcg, run_options, step_id, count, &pss, &ph);
   Status s = rcg->RunPartitions(env_, step_id, count, &pss, opts, req, resp,
-                                &cancellation_manager_, false);
-  if (s.ok()) {
-    pss.end_micros = Env::Default()->NowMicros();
+                                &cancellation_manager_);
+  cleanup.release();  // MarkRunCompletion called in PostRunCleanup().
+  return PostRunCleanup(rcg, step_id, run_options, &pss, ph, s,
+                        resp->mutable_metadata());
+}
 
-    // Schedule post-processing and cleanup to be done asynchronously.
-    rcg->ProcessStats(step_id, &pss, ph.get(), req.options(),
-                      resp->mutable_metadata());
-  } else if (errors::IsCancelled(s)) {
+Status MasterSession::RunCallable(CallOptions* opts,
+                                  const RunCallableRequest& req,
+                                  RunCallableResponse* resp) {
+  UpdateLastAccessTime();
+  ReffedClientGraph* callable;
+  {
     mutex_lock l(mu_);
     if (closed_) {
-      if (garbage_collected_) {
-        s = errors::Cancelled(
-            "Step was cancelled because the session was garbage collected due "
-            "to inactivity.");
-      } else {
-        s = errors::Cancelled(
-            "Step was cancelled by an explicit call to `Session::Close()`.");
-      }
+      return errors::FailedPrecondition("Session is closed.");
+    }
+    int64 handle = req.handle();
+    if (handle >= next_callable_handle_) {
+      return errors::InvalidArgument("No such callable handle: ", handle);
+    }
+    auto iter = callables_.find(req.handle());
+    if (iter == callables_.end()) {
+      return errors::InvalidArgument(
+          "Attempted to run callable after handle was released: ", handle);
     }
+    callable = iter->second;
+    callable->Ref();
+    ++num_running_;
   }
-  Ref();
-  rcg->Ref();
-  cleanup.release();  // MarkRunCompletion called in done closure.
-  rcg->CleanupPartitionsAsync(step_id, [this, rcg](const Status& s) {
-    if (!s.ok()) {
-      LOG(ERROR) << "Cleanup partition error: " << s;
+  core::ScopedUnref unref_callable(callable);
+  return DoRunCallable(opts, callable, req, resp);
+}
+
+Status MasterSession::ReleaseCallable(const ReleaseCallableRequest& req,
+                                      ReleaseCallableResponse* resp) {
+  UpdateLastAccessTime();
+  ReffedClientGraph* to_unref = nullptr;
+  {
+    mutex_lock l(mu_);
+    auto iter = callables_.find(req.handle());
+    if (iter != callables_.end()) {
+      to_unref = iter->second;
+      callables_.erase(iter);
     }
-    rcg->Unref();
-    MarkRunCompletion();
-    Unref();
-  });
-  return s;
+  }
+  if (to_unref != nullptr) {
+    to_unref->Unref();
+  }
+  return Status::OK();
 }
 
 Status MasterSession::Close() {
@@ -1690,6 +1932,7 @@ Status MasterSession::Close() {
     }
     ClearRunsTable(&to_unref, &run_graphs_);
     ClearRunsTable(&to_unref, &partial_run_graphs_);
+    ClearRunsTable(&to_unref, &callables_);
   }
   for (ReffedClientGraph* rcg : to_unref) rcg->Unref();
   if (should_delete_worker_sessions_) {
diff --git a/tensorflow/core/distributed_runtime/master_session.h b/tensorflow/core/distributed_runtime/master_session.h
index 4bd4e1367aa75730df829a2909005a221b9ab780..a05419904f54054ee439bfa5578bb4abbe0bd45e 100644
--- a/tensorflow/core/distributed_runtime/master_session.h
+++ b/tensorflow/core/distributed_runtime/master_session.h
@@ -89,6 +89,15 @@ class MasterSession : public core::RefCounted {
 
   Status ListDevices(ListDevicesResponse* resp) const;
 
+  Status MakeCallable(const MakeCallableRequest& req,
+                      MakeCallableResponse* resp);
+
+  Status RunCallable(CallOptions* opts, const RunCallableRequest& req,
+                     RunCallableResponse* resp);
+
+  Status ReleaseCallable(const ReleaseCallableRequest& req,
+                         ReleaseCallableResponse* resp);
+
   // Close this session and delete "*this". Returns OK if all known
   // states are cleanup successfully.
   //
@@ -140,6 +149,8 @@ class MasterSession : public core::RefCounted {
   typedef std::unordered_map<uint64, ReffedClientGraph*> RCGMap;
   RCGMap run_graphs_ GUARDED_BY(mu_);
   RCGMap partial_run_graphs_ GUARDED_BY(mu_);
+  int64 next_callable_handle_ GUARDED_BY(mu_) = 0;
+  RCGMap callables_ GUARDED_BY(mu_);
 
   struct PerStepState {
     bool collect_costs = false;
@@ -205,15 +216,28 @@ class MasterSession : public core::RefCounted {
   bool should_delete_worker_sessions_ = false;
   Status DeleteWorkerSessions();
 
-  Status StartStep(const BuildGraphOptions& opts, int64* count,
-                   ReffedClientGraph** graph, bool is_partial);
+  Status StartStep(const BuildGraphOptions& opts, bool is_partial,
+                   ReffedClientGraph** out_rcg, int64* out_count);
   void ClearRunsTable(std::vector<ReffedClientGraph*>* to_unref,
                       RCGMap* rcg_map) EXCLUSIVE_LOCKS_REQUIRED(mu_);
+  void FillPerStepState(MasterSession::ReffedClientGraph* rcg,
+                        const RunOptions& run_options, uint64 step_id,
+                        int64 count, PerStepState* out_pss,
+                        std::unique_ptr<ProfileHandler>* out_ph);
   Status DoRunWithLocalExecution(CallOptions* opts,
                                  const RunStepRequestWrapper& req,
                                  MutableRunStepResponseWrapper* resp);
   Status DoPartialRun(CallOptions* opts, const RunStepRequestWrapper& req,
                       MutableRunStepResponseWrapper* resp);
+  Status DoRunCallable(CallOptions* opts, ReffedClientGraph* rcg,
+                       const RunCallableRequest& req,
+                       RunCallableResponse* resp);
+  Status PostRunCleanup(MasterSession::ReffedClientGraph* rcg, uint64 step_id,
+                        const RunOptions& run_options, PerStepState* pss,
+                        const std::unique_ptr<ProfileHandler>& ph,
+                        const Status& run_status,
+                        RunMetadata* out_run_metadata);
+
   void MarkRunCompletion();
   void UpdateLastAccessTime();
 
diff --git a/tensorflow/core/distributed_runtime/message_wrappers.cc b/tensorflow/core/distributed_runtime/message_wrappers.cc
index 66ebb3080af7cd01021362b5ea0c0b54458aebfc..18668b44d3c6da13b4e8db717da008accb311cbc 100644
--- a/tensorflow/core/distributed_runtime/message_wrappers.cc
+++ b/tensorflow/core/distributed_runtime/message_wrappers.cc
@@ -326,6 +326,20 @@ Status InMemoryRunGraphRequest::AddSendFromRunStepRequest(
   return Status::OK();
 }
 
+// TODO(b/74355905): Add a specialized implementation that avoids
+// copying the tensor when at least two of the {client, master,
+// worker} are in the same process.
+Status InMemoryRunGraphRequest::AddSendFromRunCallableRequest(
+    const RunCallableRequest& run_callable_request, size_t i,
+    const string& send_key) {
+  Tensor tensor;
+  if (!ParseTensorProtoToTensor(run_callable_request.feed(i), &tensor)) {
+    return errors::InvalidArgument("Invalid TensorProto for feed value ", i);
+  }
+  sends_.emplace_back(send_key, std::move(tensor));
+  return Status::OK();
+}
+
 size_t InMemoryRunGraphRequest::num_recvs() const { return recvs_.size(); }
 
 const string& InMemoryRunGraphRequest::recv_key(size_t i) const {
@@ -439,6 +453,18 @@ Status MutableProtoRunGraphRequest::AddSendFromRunStepRequest(
   return Status::OK();
 }
 
+// TODO(b/74355905): Add a specialized implementation that avoids
+// copying the tensor when at least two of the {client, master,
+// worker} are in the same process.
+Status MutableProtoRunGraphRequest::AddSendFromRunCallableRequest(
+    const RunCallableRequest& run_callable_request, size_t i,
+    const string& send_key) {
+  NamedTensorProto* send = request_.add_send();
+  send->set_name(send_key);
+  *send->mutable_tensor() = run_callable_request.feed(i);
+  return Status::OK();
+}
+
 size_t MutableProtoRunGraphRequest::num_recvs() const {
   return request_.recv_key_size();
 }
diff --git a/tensorflow/core/distributed_runtime/message_wrappers.h b/tensorflow/core/distributed_runtime/message_wrappers.h
index 79fa6f926ea6afb351eacf279d3cf493b6d4713f..1f7cdb98a41ec017db1bd7fa0b7c6f9bb2299021 100644
--- a/tensorflow/core/distributed_runtime/message_wrappers.h
+++ b/tensorflow/core/distributed_runtime/message_wrappers.h
@@ -302,6 +302,9 @@ class MutableRunGraphRequestWrapper : public RunGraphRequestWrapper {
   virtual Status AddSendFromRunStepRequest(
       const RunStepRequestWrapper& run_step_request, size_t i,
       const string& send_key) = 0;
+  virtual Status AddSendFromRunCallableRequest(
+      const RunCallableRequest& run_callable_request, size_t i,
+      const string& send_key) = 0;
 
   virtual void add_recv_key(const string& recv_key) = 0;
   virtual void set_is_partial(bool is_partial) = 0;
@@ -334,6 +337,9 @@ class InMemoryRunGraphRequest : public MutableRunGraphRequestWrapper {
   Status AddSendFromRunStepRequest(
       const RunStepRequestWrapper& run_step_request, size_t i,
       const string& send_key) override;
+  Status AddSendFromRunCallableRequest(
+      const RunCallableRequest& run_callable_request, size_t i,
+      const string& send_key) override;
   void add_recv_key(const string& recv_key) override;
   void set_is_partial(bool is_partial) override;
   void set_is_last_partial_run(bool is_last_partial_run) override;
@@ -385,6 +391,9 @@ class MutableProtoRunGraphRequest : public MutableRunGraphRequestWrapper {
   Status AddSendFromRunStepRequest(
       const RunStepRequestWrapper& run_step_request, size_t i,
       const string& send_key) override;
+  Status AddSendFromRunCallableRequest(
+      const RunCallableRequest& run_callable_request, size_t i,
+      const string& send_key) override;
   void add_recv_key(const string& recv_key) override;
   void set_is_partial(bool is_partial) override;
   void set_is_last_partial_run(bool is_last_partial_run) override;
diff --git a/tensorflow/core/distributed_runtime/rpc/BUILD b/tensorflow/core/distributed_runtime/rpc/BUILD
index 9dae1b9859787393464f4a45fc597be7fc41601c..fa0f8c9b5250b100f1c261b99b4466a25467327e 100644
--- a/tensorflow/core/distributed_runtime/rpc/BUILD
+++ b/tensorflow/core/distributed_runtime/rpc/BUILD
@@ -5,18 +5,6 @@ licenses(["notice"])  # Apache 2.0
 
 exports_files(["LICENSE"])
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 filegroup(
     name = "c_srcs",
     data = glob([
@@ -201,7 +189,6 @@ cc_library(
     srcs = ["grpc_worker_service_impl.cc"],
     hdrs = ["grpc_worker_service_impl.h"],
     deps = [
-        ":grpc_serialization_traits",
         "//tensorflow/core:worker_proto_cc",
         "//tensorflow/core/distributed_runtime:tensor_coding",
         "@grpc//:grpc++_unsecure",
@@ -247,22 +234,11 @@ cc_library(
     srcs = ["grpc_master_service_impl.cc"],
     hdrs = ["grpc_master_service_impl.h"],
     deps = [
-        ":grpc_serialization_traits",
         "//tensorflow/core:master_proto_cc",
         "@grpc//:grpc++_unsecure",
     ],
 )
 
-cc_library(
-    name = "grpc_serialization_traits",
-    srcs = [],
-    hdrs = ["grpc_serialization_traits.h"],
-    deps = [
-        "@grpc//:grpc++_unsecure",
-        "@grpc//:grpc_unsecure",
-    ],
-)
-
 cc_library(
     name = "rpc_rendezvous_mgr",
     srcs = ["rpc_rendezvous_mgr.cc"],
@@ -511,3 +487,33 @@ tf_cuda_cc_test(
         "//tensorflow/core/kernels:variable_ops",
     ],
 )
+
+cc_library(
+    name = "grpc_rpc_factory",
+    srcs = [
+        "grpc_rpc_factory.cc",
+    ],
+    hdrs = ["grpc_rpc_factory.h"],
+    deps = [
+        ":grpc_state",
+        ":grpc_util",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core/util/rpc:call_container",
+        "//tensorflow/core/util/rpc:rpc_factory",
+    ],
+)
+
+cc_library(
+    name = "grpc_rpc_factory_registration",
+    srcs = [
+        "grpc_rpc_factory_registration.cc",
+    ],
+    deps = [
+        ":grpc_rpc_factory",
+        "//tensorflow/core/util/rpc:rpc_factory",
+        "//tensorflow/core/util/rpc:rpc_factory_registry",
+    ],
+    alwayslink = 1,
+)
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_master_service.cc b/tensorflow/core/distributed_runtime/rpc/grpc_master_service.cc
index b4d18d8607eaddd75f4e395e71fbd75554645a61..23968e24c87ee1d12cab973b6d9568f1af706a0e 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_master_service.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_master_service.cc
@@ -47,11 +47,11 @@ namespace tensorflow {
 
 class GrpcMasterService : public AsyncServiceInterface {
  public:
-  GrpcMasterService(Master* master, int64 default_timeout_in_ms,
+  GrpcMasterService(Master* master, const ConfigProto& default_session_config,
                     ::grpc::ServerBuilder* builder)
       : master_impl_(master),
-        default_timeout_in_ms_(default_timeout_in_ms),
-        is_shutdown_(false) {
+        is_shutdown_(false),
+        default_session_config_(default_session_config) {
     builder->RegisterService(&master_service_);
     cq_ = builder->AddCompletionQueue();
   }
@@ -111,6 +111,11 @@ class GrpcMasterService : public AsyncServiceInterface {
     ENQUEUE_REQUEST(CloseSession, false);
     ENQUEUE_REQUEST(ListDevices, false);
     ENQUEUE_REQUEST(Reset, false);
+    ENQUEUE_REQUEST(MakeCallable, false);
+    for (int i = 0; i < 100; ++i) {
+      ENQUEUE_REQUEST(RunCallable, true);
+    }
+    ENQUEUE_REQUEST(ReleaseCallable, false);
 
     void* tag;
     bool ok;
@@ -129,12 +134,12 @@ class GrpcMasterService : public AsyncServiceInterface {
 
  private:
   Master* master_impl_ = nullptr;  // Not owned.
-  const int64 default_timeout_in_ms_;
   std::unique_ptr<::grpc::ServerCompletionQueue> cq_;
   grpc::MasterService::AsyncService master_service_;
 
   mutex mu_;
   bool is_shutdown_ GUARDED_BY(mu_);
+  const ConfigProto default_session_config_;
   ::grpc::Alarm* shutdown_alarm_ = nullptr;
 
   template <class RequestMessage, class ResponseMessage>
@@ -144,9 +149,13 @@ class GrpcMasterService : public AsyncServiceInterface {
   // RPC handler for creating a session.
   void CreateSessionHandler(
       MasterCall<CreateSessionRequest, CreateSessionResponse>* call) {
-    master_impl_->CreateSession(&call->request, &call->response,
-                                [call](const Status& status) {
+    CreateSessionRequest* rewritten_req = new CreateSessionRequest;
+    rewritten_req->mutable_config()->MergeFrom(default_session_config_);
+    rewritten_req->MergeFrom(call->request);
+    master_impl_->CreateSession(rewritten_req, &call->response,
+                                [call, rewritten_req](const Status& status) {
                                   call->SendResponse(ToGrpcStatus(status));
+                                  delete rewritten_req;
                                 });
     ENQUEUE_REQUEST(CreateSession, true);
   }
@@ -178,7 +187,7 @@ class GrpcMasterService : public AsyncServiceInterface {
     if (call->request.options().timeout_in_ms() > 0) {
       call_opts->SetTimeout(call->request.options().timeout_in_ms());
     } else {
-      call_opts->SetTimeout(default_timeout_in_ms_);
+      call_opts->SetTimeout(default_session_config_.operation_timeout_in_ms());
     }
     RunStepRequestWrapper* wrapped_request =
         new ProtoRunStepRequest(&call->request);
@@ -232,6 +241,47 @@ class GrpcMasterService : public AsyncServiceInterface {
                         });
     ENQUEUE_REQUEST(Reset, false);
   }
+
+  // RPC handler for making a callable.
+  void MakeCallableHandler(
+      MasterCall<MakeCallableRequest, MakeCallableResponse>* call) {
+    master_impl_->MakeCallable(&call->request, &call->response,
+                               [call](const Status& status) {
+                                 call->SendResponse(ToGrpcStatus(status));
+                               });
+    ENQUEUE_REQUEST(MakeCallable, false);
+  }
+
+  // RPC handler for running a callable.
+  void RunCallableHandler(
+      MasterCall<RunCallableRequest, RunCallableResponse>* call) {
+    auto* trace = TraceRpc("RunCallable/Server", call->client_metadata());
+    CallOptions* call_opts = new CallOptions;
+    // The timeout may be overridden by a non-zero timeout in the
+    // callable's `RunOptions`; this overriding will happen inside the
+    // `MasterSession` implementation.
+    call_opts->SetTimeout(default_session_config_.operation_timeout_in_ms());
+    call->SetCancelCallback([call_opts]() { call_opts->StartCancel(); });
+    master_impl_->RunCallable(call_opts, &call->request, &call->response,
+                              [call, call_opts, trace](const Status& status) {
+                                call->ClearCancelCallback();
+                                delete call_opts;
+                                delete trace;
+                                call->SendResponse(ToGrpcStatus(status));
+                              });
+    ENQUEUE_REQUEST(RunCallable, false);
+  }
+
+  // RPC handler for making a callable.
+  void ReleaseCallableHandler(
+      MasterCall<ReleaseCallableRequest, ReleaseCallableResponse>* call) {
+    master_impl_->ReleaseCallable(&call->request, &call->response,
+                                  [call](const Status& status) {
+                                    call->SendResponse(ToGrpcStatus(status));
+                                  });
+    ENQUEUE_REQUEST(ReleaseCallable, false);
+  }
+
 #undef ENQUEUE_REQUEST
 
   // Start tracing, including the ID attached to the RPC.
@@ -249,10 +299,10 @@ class GrpcMasterService : public AsyncServiceInterface {
   TF_DISALLOW_COPY_AND_ASSIGN(GrpcMasterService);
 };
 
-AsyncServiceInterface* NewGrpcMasterService(Master* master,
-                                            int64 default_timeout_in_ms,
-                                            ::grpc::ServerBuilder* builder) {
-  return new GrpcMasterService(master, default_timeout_in_ms, builder);
+AsyncServiceInterface* NewGrpcMasterService(
+    Master* master, const ConfigProto& default_session_config,
+    ::grpc::ServerBuilder* builder) {
+  return new GrpcMasterService(master, default_session_config, builder);
 }
 
 }  // end namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_master_service.h b/tensorflow/core/distributed_runtime/rpc/grpc_master_service.h
index 473604f257607456d0fb4dcb6d9189f2f6dba135..f0fe5b0c4e96941f2774392273c3e1457219796a 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_master_service.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_master_service.h
@@ -18,6 +18,7 @@ limitations under the License.
 
 #include <memory>
 #include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/protobuf/master.pb.h"
 
 namespace grpc {
 class ServerBuilder;
@@ -28,9 +29,9 @@ namespace tensorflow {
 class AsyncServiceInterface;
 class Master;
 
-AsyncServiceInterface* NewGrpcMasterService(Master* master,
-                                            int64 default_timeout_in_ms,
-                                            ::grpc::ServerBuilder* builder);
+AsyncServiceInterface* NewGrpcMasterService(
+    Master* master, const ConfigProto& default_session_config,
+    ::grpc::ServerBuilder* builder);
 
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.cc b/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.cc
index e2016e824c0bf504af4c624cad253963b223eb35..c832adbbbf8eba1ec512d62470025fb56a39b8a4 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.cc
@@ -36,6 +36,9 @@ static const char* grpcMasterService_method_names[] = {
     "/tensorflow.MasterService/CloseSession",
     "/tensorflow.MasterService/ListDevices",
     "/tensorflow.MasterService/Reset",
+    "/tensorflow.MasterService/MakeCallable",
+    "/tensorflow.MasterService/RunCallable",
+    "/tensorflow.MasterService/ReleaseCallable",
 };
 
 std::unique_ptr<MasterService::Stub> MasterService::NewStub(
@@ -64,7 +67,14 @@ MasterService::Stub::Stub(
       rpcmethod_ListDevices_(grpcMasterService_method_names[5],
                              ::grpc::internal::RpcMethod::NORMAL_RPC, channel),
       rpcmethod_Reset_(grpcMasterService_method_names[6],
-                       ::grpc::internal::RpcMethod::NORMAL_RPC, channel) {}
+                       ::grpc::internal::RpcMethod::NORMAL_RPC, channel),
+      rpcmethod_MakeCallable_(grpcMasterService_method_names[7],
+                              ::grpc::internal::RpcMethod::NORMAL_RPC, channel),
+      rpcmethod_RunCallable_(grpcMasterService_method_names[8],
+                             ::grpc::internal::RpcMethod::NORMAL_RPC, channel),
+      rpcmethod_ReleaseCallable_(grpcMasterService_method_names[9],
+                                 ::grpc::internal::RpcMethod::NORMAL_RPC,
+                                 channel) {}
 
 ::grpc::Status MasterService::Stub::CreateSession(
     ::grpc::ClientContext* context, const CreateSessionRequest& request,
@@ -115,8 +125,29 @@ MasterService::Stub::Stub(
                                              context, request, response);
 }
 
+::grpc::Status MasterService::Stub::MakeCallable(
+    ::grpc::ClientContext* context, const MakeCallableRequest& request,
+    MakeCallableResponse* response) {
+  return ::grpc::internal::BlockingUnaryCall(
+      channel_.get(), rpcmethod_MakeCallable_, context, request, response);
+}
+
+::grpc::Status MasterService::Stub::RunCallable(
+    ::grpc::ClientContext* context, const RunCallableRequest& request,
+    RunCallableResponse* response) {
+  return ::grpc::internal::BlockingUnaryCall(
+      channel_.get(), rpcmethod_RunCallable_, context, request, response);
+}
+
+::grpc::Status MasterService::Stub::ReleaseCallable(
+    ::grpc::ClientContext* context, const ReleaseCallableRequest& request,
+    ReleaseCallableResponse* response) {
+  return ::grpc::internal::BlockingUnaryCall(
+      channel_.get(), rpcmethod_ReleaseCallable_, context, request, response);
+}
+
 MasterService::AsyncService::AsyncService() {
-  for (int i = 0; i < 7; ++i) {
+  for (int i = 0; i < 10; ++i) {
     AddMethod(new ::grpc::internal::RpcServiceMethod(
         grpcMasterService_method_names[i],
         ::grpc::internal::RpcMethod::NORMAL_RPC, nullptr));
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.h b/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.h
index 6ae94b74417c3fb6c4da1589bb9f532cb6d79930..8f1b589698276d5df7aa0245d57bc5bdb4a9f0db 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.h
@@ -25,18 +25,8 @@ limitations under the License.
 #include "grpc++/impl/codegen/stub_options.h"
 #include "grpc++/impl/codegen/sync_stream.h"
 
-#include "tensorflow/core/distributed_runtime/rpc/grpc_serialization_traits.h"
 #include "tensorflow/core/protobuf/master.pb.h"
 
-// Contains potentially large GraphDef.
-TF_GRPC_ALLOW_UNLIMITED_MESSAGE_SIZE(tensorflow::CreateSessionRequest);
-// Contains potentially large GraphDef.
-TF_GRPC_ALLOW_UNLIMITED_MESSAGE_SIZE(tensorflow::ExtendSessionRequest);
-// Contains potentially large TensorProto.
-TF_GRPC_ALLOW_UNLIMITED_MESSAGE_SIZE(tensorflow::RunStepRequest);
-// Contains potentially large StepStats, TensorProto.
-TF_GRPC_ALLOW_UNLIMITED_MESSAGE_SIZE(tensorflow::RunStepResponse);
-
 namespace grpc {
 class CompletionQueue;
 class Channel;
@@ -79,6 +69,15 @@ class MasterService final {
     virtual ::grpc::Status Reset(::grpc::ClientContext* context,
                                  const ResetRequest& request,
                                  ResetResponse* response) = 0;
+    virtual ::grpc::Status MakeCallable(::grpc::ClientContext* context,
+                                        const MakeCallableRequest& request,
+                                        MakeCallableResponse* response) = 0;
+    virtual ::grpc::Status RunCallable(::grpc::ClientContext* context,
+                                       const RunCallableRequest& request,
+                                       RunCallableResponse* response) = 0;
+    virtual ::grpc::Status ReleaseCallable(
+        ::grpc::ClientContext* context, const ReleaseCallableRequest& request,
+        ReleaseCallableResponse* response) = 0;
   };
   class Stub final : public StubInterface {
    public:
@@ -104,6 +103,15 @@ class MasterService final {
     ::grpc::Status Reset(::grpc::ClientContext* context,
                          const ResetRequest& request,
                          ResetResponse* response) override;
+    ::grpc::Status MakeCallable(::grpc::ClientContext* context,
+                                const MakeCallableRequest& request,
+                                MakeCallableResponse* response) override;
+    ::grpc::Status RunCallable(::grpc::ClientContext* context,
+                               const RunCallableRequest& request,
+                               RunCallableResponse* response) override;
+    ::grpc::Status ReleaseCallable(::grpc::ClientContext* context,
+                                   const ReleaseCallableRequest& request,
+                                   ReleaseCallableResponse* response) override;
 
    private:
     std::shared_ptr< ::grpc::ChannelInterface> channel_;
@@ -114,6 +122,9 @@ class MasterService final {
     const ::grpc::internal::RpcMethod rpcmethod_CloseSession_;
     const ::grpc::internal::RpcMethod rpcmethod_ListDevices_;
     const ::grpc::internal::RpcMethod rpcmethod_Reset_;
+    const ::grpc::internal::RpcMethod rpcmethod_MakeCallable_;
+    const ::grpc::internal::RpcMethod rpcmethod_RunCallable_;
+    const ::grpc::internal::RpcMethod rpcmethod_ReleaseCallable_;
   };
   static std::unique_ptr<Stub> NewStub(
       const std::shared_ptr< ::grpc::ChannelInterface>& channel,
@@ -179,6 +190,30 @@ class MasterService final {
       ::grpc::Service::RequestAsyncUnary(6, context, request, response,
                                          new_call_cq, notification_cq, tag);
     }
+    void RequestMakeCallable(
+        ::grpc::ServerContext* context, MakeCallableRequest* request,
+        ::grpc::ServerAsyncResponseWriter<MakeCallableResponse>* response,
+        ::grpc::CompletionQueue* new_call_cq,
+        ::grpc::ServerCompletionQueue* notification_cq, void* tag) {
+      ::grpc::Service::RequestAsyncUnary(7, context, request, response,
+                                         new_call_cq, notification_cq, tag);
+    }
+    void RequestRunCallable(
+        ::grpc::ServerContext* context, RunCallableRequest* request,
+        ::grpc::ServerAsyncResponseWriter<RunCallableResponse>* response,
+        ::grpc::CompletionQueue* new_call_cq,
+        ::grpc::ServerCompletionQueue* notification_cq, void* tag) {
+      ::grpc::Service::RequestAsyncUnary(8, context, request, response,
+                                         new_call_cq, notification_cq, tag);
+    }
+    void RequestReleaseCallable(
+        ::grpc::ServerContext* context, ReleaseCallableRequest* request,
+        ::grpc::ServerAsyncResponseWriter<ReleaseCallableResponse>* response,
+        ::grpc::CompletionQueue* new_call_cq,
+        ::grpc::ServerCompletionQueue* notification_cq, void* tag) {
+      ::grpc::Service::RequestAsyncUnary(9, context, request, response,
+                                         new_call_cq, notification_cq, tag);
+    }
   };
 };
 
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_remote_master.cc b/tensorflow/core/distributed_runtime/rpc/grpc_remote_master.cc
index 1088e9be66ceb7fbddfaed0691423745f362343f..1b92a79a67eae25a1b5e2942006da1246ca24b4b 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_remote_master.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_remote_master.cc
@@ -95,6 +95,28 @@ class GrpcRemoteMaster : public MasterInterface {
                 &MasterServiceStub::Reset);
   }
 
+  Status MakeCallable(CallOptions* call_options,
+                      const MakeCallableRequest* request,
+                      MakeCallableResponse* response) override {
+    ::grpc::ClientContext ctx;
+    return Call(&ctx, call_options, request, response,
+                &MasterServiceStub::MakeCallable);
+  }
+  Status RunCallable(CallOptions* call_options,
+                     const RunCallableRequest* request,
+                     RunCallableResponse* response) override {
+    ::grpc::ClientContext ctx;
+    return Call(&ctx, call_options, request, response,
+                &MasterServiceStub::RunCallable);
+  }
+  Status ReleaseCallable(CallOptions* call_options,
+                         const ReleaseCallableRequest* request,
+                         ReleaseCallableResponse* response) override {
+    ::grpc::ClientContext ctx;
+    return Call(&ctx, call_options, request, response,
+                &MasterServiceStub::ReleaseCallable);
+  }
+
  private:
   // Start tracing, attaching a unique ID to both the trace and the RPC.
   port::Tracing::TraceMe TraceRpc(StringPiece name,
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_rpc_factory.cc b/tensorflow/core/distributed_runtime/rpc/grpc_rpc_factory.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d004abd1c189f72ed25c8e62e66a798d480fee78
--- /dev/null
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_rpc_factory.cc
@@ -0,0 +1,213 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "tensorflow/core/distributed_runtime/rpc/grpc_state.h"
+#include "tensorflow/core/distributed_runtime/rpc/grpc_util.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/refcount.h"
+#include "tensorflow/core/util/rpc/call_container.h"
+#include "tensorflow/core/util/rpc/rpc_factory.h"
+
+#include "tensorflow/core/distributed_runtime/rpc/grpc_rpc_factory.h"
+
+namespace tensorflow {
+
+namespace {
+class GrpcCall {
+ public:
+  explicit GrpcCall(CallContainer<GrpcCall>* container, int index, bool try_rpc,
+                    const string* request_msg, string* response_msg,
+                    int32* status_code, string* status_message)
+      : container_(container),
+        index_(index),
+        try_rpc_(try_rpc),
+        request_msg_(request_msg),
+        response_msg_(response_msg),
+        status_code_(status_code),
+        status_message_(status_message) {}
+
+  void StartCancel() { call_opts_.StartCancel(); }
+
+  void Done(const Status& s) {
+    DCHECK(container_ != nullptr);
+    if (!s.ok() && try_rpc_) {
+      DCHECK(status_code_ != nullptr);
+      DCHECK(status_message_ != nullptr);
+      *status_code_ = s.code();
+      *status_message_ = s.error_message();
+    }
+    container_->Done(s, index_);
+  }
+
+  const string& request() const { return *request_msg_; }
+  string* response() const { return response_msg_; }
+  CallOptions* call_opts() { return &call_opts_; }
+
+ private:
+  CallContainer<GrpcCall>* const container_;
+  const int index_;
+  bool try_rpc_;
+  CallOptions call_opts_;
+  const string* request_msg_;
+  string* response_msg_;
+  int* status_code_;
+  string* status_message_;
+};
+
+}  // namespace
+
+GrpcRPCFactory::GrpcRPCFactory(OpKernelConstruction* ctx, bool fail_fast,
+                               int64 timeout_in_ms)
+    : RPCFactory(), fail_fast_(fail_fast), timeout_in_ms_(timeout_in_ms) {
+  // TODO(ebrevdo): Investigate possible performance improvements by
+  // replacing this thread with a threadpool.
+  polling_thread_ =
+      ctx->env()->StartThread(ThreadOptions(), "rpc_op_grpc_factory", [this]() {
+        void* tag;
+        bool ok;
+        while (completion_queue_.Next(&tag, &ok)) {
+          GrpcClientCQTag* callback_tag = static_cast<GrpcClientCQTag*>(tag);
+          callback_tag->OnCompleted(ok);
+        }
+      });
+}
+
+GrpcRPCFactory::~GrpcRPCFactory() {
+  // The amount of time we wait depends on several parameters, including:
+  //   - the value of the fail_fast attribute.
+  //   - the timeout option of the rpc call in the proto declaration.
+  //   - the network roundtrip time and service's execution time.
+  //
+  // If a connection is made but the service doesn't ever respond, and
+  // there is no timeout option set for this rpc call, then it is
+  // possible the RPC request will wait forever.
+  //
+  completion_queue_.Shutdown();
+  delete polling_thread_;
+}
+
+void GrpcRPCFactory::Call(OpKernelContext* ctx, int64 num_elements,
+                          const Tensor& address_t, const Tensor& method_t,
+                          const Tensor& request_t, const bool try_rpc,
+                          Tensor* response_t, Tensor* status_code_t,
+                          Tensor* status_message_t,
+                          AsyncOpKernel::DoneCallback done) {
+  auto address = address_t.flat<string>();
+  auto method = method_t.flat<string>();
+  auto request = request_t.flat<string>();
+
+  // Stubs are maintained by the GrpcRPCFactory class and will be
+  // deleted when the class is destroyed.
+  ::grpc::GenericStub* singleton_stub = nullptr;
+  if (address.size() == 1) {
+    singleton_stub = GetOrCreateStubForAddress(address(0));
+  }
+  auto get_stub = [&address, this,
+                   singleton_stub](int64 ix) -> ::grpc::GenericStub* {
+    return (address.size() > 1) ? GetOrCreateStubForAddress(address(ix))
+                                : singleton_stub;
+  };
+  auto get_method_ptr = [&method](int64 ix) -> const string* {
+    return (method.size() > 1) ? &(method(ix)) : &(method(0));
+  };
+  auto get_request_ptr = [&request](int64 ix) -> const string* {
+    return (request.size() > 1) ? &(request(ix)) : &(request(0));
+  };
+
+  if (try_rpc) {
+    // In this case status_code will never be set in the response,
+    // so we just set it to OK.
+    DCHECK(status_code_t != nullptr);
+    status_code_t->flat<int32>().setConstant(
+        static_cast<int>(errors::Code::OK));
+  }
+
+  CancellationManager* cm = ctx->cancellation_manager();
+  CancellationToken cancellation_token = cm->get_cancellation_token();
+
+  // This object will delete itself when done.
+  auto* container =
+      new CallContainer<GrpcCall>(ctx, num_elements, fail_fast_, try_rpc,
+                                  std::move(done), cancellation_token);
+
+  auto response = response_t->flat<string>();
+  int32* status_code_ptr = nullptr;
+  string* status_message_ptr = nullptr;
+  if (try_rpc) {
+    status_code_ptr = status_code_t->flat<int32>().data();
+    status_message_ptr = status_message_t->flat<string>().data();
+  }
+  for (int i = 0; i < num_elements; ++i) {
+    container->calls()->emplace_back(
+        container, i, try_rpc, get_request_ptr(i), &response(i),
+        (try_rpc) ? &status_code_ptr[i] : nullptr,
+        (try_rpc) ? &status_message_ptr[i] : nullptr);
+  }
+
+  int i = 0;
+  for (GrpcCall& call : *(container->calls())) {
+    // This object will delete itself when done.
+    new RPCState<string>(get_stub(i), &completion_queue_, *get_method_ptr(i),
+                         call.request(), call.response(),
+                         /*done=*/[&call](const Status& s) { call.Done(s); },
+                         call.call_opts(), fail_fast_, timeout_in_ms_);
+    ++i;
+  }
+
+  // Need to register this callback after all the RPCs are in
+  // flight; otherwise we may try to cancel an RPC *before* it
+  // launches, which is a no-op, and then fall into a deadlock.
+  bool is_cancelled = !cm->RegisterCallback(
+      cancellation_token, [container]() { container->StartCancel(); });
+
+  if (is_cancelled) {
+    ctx->SetStatus(errors::Cancelled("Operation has been cancelled."));
+    // container's reference counter will take care of calling done().
+    container->StartCancel();
+  }
+}
+
+::grpc::GenericStub* GrpcRPCFactory::GetOrCreateStubForAddress(
+    const string& address) {
+  mutex_lock lock(mu_);
+
+  auto stub = stubs_.find(address);
+  if (stub != stubs_.end()) return stub->second.get();
+
+  ChannelPtr channel = CreateChannelForAddress(address);
+  auto* created = new ::grpc::GenericStub(channel);
+  stubs_[address].reset(created);
+  return created;
+}
+
+GrpcRPCFactory::ChannelPtr GrpcRPCFactory::CreateChannelForAddress(
+    const string& address) {
+  ::grpc::ChannelArguments args;
+  args.SetInt(GRPC_ARG_MAX_MESSAGE_LENGTH, std::numeric_limits<int32>::max());
+
+  // Set a standard backoff timeout of 1s instead of the
+  // (sometimes default) 20s.
+  args.SetInt("grpc.testing.fixed_reconnect_backoff_ms", 1000);
+  return ::grpc::CreateCustomChannel(
+      /*target=*/address, ::grpc::InsecureChannelCredentials(), args);
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_rpc_factory.h b/tensorflow/core/distributed_runtime/rpc/grpc_rpc_factory.h
new file mode 100644
index 0000000000000000000000000000000000000000..34ec235aafc1535f27f5943b48e8b8afdcee43ac
--- /dev/null
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_rpc_factory.h
@@ -0,0 +1,59 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_RPC_FACTORY_H_
+#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_RPC_FACTORY_H_
+
+#include "tensorflow/core/distributed_runtime/rpc/grpc_state.h"
+#include "tensorflow/core/distributed_runtime/rpc/grpc_util.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/util/rpc/rpc_factory.h"
+
+namespace tensorflow {
+
+class GrpcRPCFactory : public RPCFactory {
+ public:
+  explicit GrpcRPCFactory(OpKernelConstruction* ctx, bool fail_fast,
+                          int64 timeout_in_ms);
+
+  // Explicit destructor to control destruction order.
+  ~GrpcRPCFactory() override;
+
+  void Call(OpKernelContext* ctx, int64 num_elements, const Tensor& address_t,
+            const Tensor& method_t, const Tensor& request_t, const bool try_rpc,
+            Tensor* response_t, Tensor* status_code_t, Tensor* status_message_t,
+            AsyncOpKernel::DoneCallback done) override;
+
+ protected:
+  typedef std::shared_ptr<::grpc::Channel> ChannelPtr;
+  virtual ChannelPtr CreateChannelForAddress(const string& address);
+
+ private:
+  ::grpc::GenericStub* GetOrCreateStubForAddress(const string& address);
+
+  bool fail_fast_;
+  int64 timeout_in_ms_;
+  ::grpc::CompletionQueue completion_queue_;
+  Thread* polling_thread_;  // Owned.
+
+  mutex mu_;
+  typedef std::unique_ptr<::grpc::GenericStub> StubPtr;
+  std::unordered_map<string, StubPtr> stubs_ GUARDED_BY(mu_);
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_RPC_FACTORY_H_
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_rpc_factory_registration.cc b/tensorflow/core/distributed_runtime/rpc/grpc_rpc_factory_registration.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b884489378464d7271e31e0ae1d180134becc6dc
--- /dev/null
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_rpc_factory_registration.cc
@@ -0,0 +1,34 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/distributed_runtime/rpc/grpc_rpc_factory.h"
+#include "tensorflow/core/util/rpc/rpc_factory.h"
+#include "tensorflow/core/util/rpc/rpc_factory_registry.h"
+
+namespace tensorflow {
+namespace {
+
+// Used for adding the grpc factory to the RPC factory registry.
+struct Value {
+  static RPCFactory* Function(OpKernelConstruction* ctx, bool fail_fast,
+                              int64 timeout_in_ms) {
+    return new GrpcRPCFactory(ctx, fail_fast, timeout_in_ms);
+  }
+};
+
+REGISTER_RPC_FACTORY("grpc", Value::Function);
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_serialization_traits.h b/tensorflow/core/distributed_runtime/rpc/grpc_serialization_traits.h
deleted file mode 100644
index e7f5fb0c6ae24caa3ffe5039d5daddb771c4858d..0000000000000000000000000000000000000000
--- a/tensorflow/core/distributed_runtime/rpc/grpc_serialization_traits.h
+++ /dev/null
@@ -1,217 +0,0 @@
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_SERIALIZATION_TRAITS_H_
-#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_SERIALIZATION_TRAITS_H_
-
-#include "grpc++/impl/codegen/proto_utils.h"
-#include "grpc++/support/slice.h"
-#include "grpc/grpc.h"
-
-namespace grpc {
-
-namespace tensorflow_helper {
-
-const int kGrpcBufferWriterMaxBufferLength = 8192;
-
-class GrpcBufferWriter final
-    : public ::grpc::protobuf::io::ZeroCopyOutputStream {
- public:
-  explicit GrpcBufferWriter(grpc_byte_buffer** bp, int block_size)
-      : block_size_(block_size), byte_count_(0), have_backup_(false) {
-    *bp = grpc_raw_byte_buffer_create(NULL, 0);
-    slice_buffer_ = &(*bp)->data.raw.slice_buffer;
-  }
-
-  ~GrpcBufferWriter() override {
-    if (have_backup_) {
-      grpc_slice_unref(backup_slice_);
-    }
-  }
-
-  bool Next(void** data, int* size) override {
-    if (have_backup_) {
-      slice_ = backup_slice_;
-      have_backup_ = false;
-    } else {
-      slice_ = grpc_slice_malloc(block_size_);
-    }
-    *data = GRPC_SLICE_START_PTR(slice_);
-    // On win x64, int is only 32bit
-    GPR_CODEGEN_ASSERT(GRPC_SLICE_LENGTH(slice_) <= INT_MAX);
-    byte_count_ += * size = (int)GRPC_SLICE_LENGTH(slice_);
-    grpc_slice_buffer_add(slice_buffer_, slice_);
-    return true;
-  }
-
-  void BackUp(int count) override {
-    grpc_slice_buffer_pop(slice_buffer_);
-    if (count == block_size_) {
-      backup_slice_ = slice_;
-    } else {
-      backup_slice_ =
-          grpc_slice_split_tail(&slice_, GRPC_SLICE_LENGTH(slice_) - count);
-      grpc_slice_buffer_add(slice_buffer_, slice_);
-    }
-    // It's dangerous to keep an inlined grpc_slice as the backup slice, since
-    // on a following Next() call, a reference will be returned to this slice
-    // via GRPC_SLICE_START_PTR, which will not be an address held by
-    // slice_buffer_.
-    have_backup_ = backup_slice_.refcount != NULL;
-    byte_count_ -= count;
-  }
-
-  grpc::protobuf::int64 ByteCount() const override { return byte_count_; }
-
- private:
-  const int block_size_;
-  int64_t byte_count_;
-  grpc_slice_buffer* slice_buffer_;
-  bool have_backup_;
-  grpc_slice backup_slice_;
-  grpc_slice slice_;
-};
-
-class GrpcBufferReader final
-    : public ::grpc::protobuf::io::ZeroCopyInputStream {
- public:
-  explicit GrpcBufferReader(grpc_byte_buffer* buffer)
-      : byte_count_(0), backup_count_(0) {
-    (void)grpc_byte_buffer_reader_init(&reader_, buffer);
-  }
-  ~GrpcBufferReader() override { grpc_byte_buffer_reader_destroy(&reader_); }
-
-  bool Next(const void** data, int* size) override {
-    if (backup_count_ > 0) {
-      *data = GRPC_SLICE_START_PTR(slice_) + GRPC_SLICE_LENGTH(slice_) -
-              backup_count_;
-      GPR_CODEGEN_ASSERT(backup_count_ <= INT_MAX);
-      *size = (int)backup_count_;
-      backup_count_ = 0;
-      return true;
-    }
-    if (!grpc_byte_buffer_reader_next(&reader_, &slice_)) {
-      return false;
-    }
-    grpc_slice_unref(slice_);
-    *data = GRPC_SLICE_START_PTR(slice_);
-    // On win x64, int is only 32bit
-    GPR_CODEGEN_ASSERT(GRPC_SLICE_LENGTH(slice_) <= INT_MAX);
-    byte_count_ += * size = (int)GRPC_SLICE_LENGTH(slice_);
-    return true;
-  }
-
-  void BackUp(int count) override { backup_count_ = count; }
-
-  bool Skip(int count) override {
-    const void* data;
-    int size;
-    while (Next(&data, &size)) {
-      if (size >= count) {
-        BackUp(size - count);
-        return true;
-      }
-      // size < count;
-      count -= size;
-    }
-    // error or we have too large count;
-    return false;
-  }
-
-  grpc::protobuf::int64 ByteCount() const override {
-    return byte_count_ - backup_count_;
-  }
-
- private:
-  int64_t byte_count_;
-  int64_t backup_count_;
-  grpc_byte_buffer_reader reader_;
-  grpc_slice slice_;
-};
-
-}  // namespace tensorflow_helper
-
-// Defines specialized serialization/deserialization routines that
-// default to allowing a 2GB max message size.
-//
-// To instantiate this template for a particular type `T`, use
-// `TF_GRPC_ALLOW_UNLIMITED_MESSAGE_SIZE(T)`, as defined below.
-template <typename T>
-class UnlimitedSizeProtoSerializationTraits {
- public:
-  static Status Serialize(const T& msg, grpc_byte_buffer** bp,
-                          bool* own_buffer) {
-    *own_buffer = true;
-    int byte_size = msg.ByteSize();
-    if (byte_size < 0) {
-      return Status(StatusCode::INTERNAL, "Message length was negative");
-    } else if (byte_size <=
-               tensorflow_helper::kGrpcBufferWriterMaxBufferLength) {
-      grpc_slice slice = grpc_slice_malloc(byte_size);
-      GPR_CODEGEN_ASSERT(
-          GRPC_SLICE_END_PTR(slice) ==
-          msg.SerializeWithCachedSizesToArray(GRPC_SLICE_START_PTR(slice)));
-      *bp = grpc_raw_byte_buffer_create(&slice, 1);
-      grpc_slice_unref(slice);
-      return Status::OK;
-    } else {
-      tensorflow_helper::GrpcBufferWriter writer(
-          bp, tensorflow_helper::kGrpcBufferWriterMaxBufferLength);
-      return msg.SerializeToZeroCopyStream(&writer)
-                 ? Status::OK
-                 : Status(StatusCode::INTERNAL, "Failed to serialize message");
-    }
-  }
-
-  static Status Deserialize(grpc_byte_buffer* buffer, T* msg,
-                            int max_message_size = INT_MAX) {
-    if (buffer == nullptr) {
-      return Status(StatusCode::INTERNAL, "No payload");
-    }
-    Status result = Status::OK;
-    {
-      tensorflow_helper::GrpcBufferReader reader(buffer);
-      ::grpc::protobuf::io::CodedInputStream decoder(&reader);
-      if (max_message_size == 0) {
-        // NOTE(mrry): Override maximum message size to 2GB.
-        decoder.SetTotalBytesLimit(INT_MAX, INT_MAX);
-      } else {
-        decoder.SetTotalBytesLimit(max_message_size, max_message_size);
-      }
-      if (!msg->ParseFromCodedStream(&decoder)) {
-        result = Status(StatusCode::INTERNAL, msg->InitializationErrorString());
-      }
-      if (!decoder.ConsumedEntireMessage()) {
-        result = Status(StatusCode::INTERNAL, "Did not read entire message");
-      }
-    }
-    grpc_byte_buffer_destroy(buffer);
-    return result;
-  }
-};
-
-}  // namespace grpc
-
-// For the given protobuf message type `MessageType`, specializes the
-// gRPC serialization and deserialization such that the default
-// maximum message size is 2GB.
-#define TF_GRPC_ALLOW_UNLIMITED_MESSAGE_SIZE(MessageType)             \
-  namespace grpc {                                                    \
-  template <>                                                         \
-  class SerializationTraits<MessageType>                              \
-      : public UnlimitedSizeProtoSerializationTraits<MessageType> {}; \
-  }  // namespace grpc
-
-#endif  // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_SERIALIZATION_TRAITS_H_
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc
index a6f4be3eaf69f40199e64c43dff443e886aa5aa1..be1910358214e3551ce3d53f8a2faa53b21a8695 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc
@@ -183,8 +183,7 @@ Status GrpcServer::Init(
   builder.SetOption(
       std::unique_ptr<::grpc::ServerBuilderOption>(new NoReusePortOption));
   master_impl_ = CreateMaster(&master_env_);
-  master_service_ = NewGrpcMasterService(
-      master_impl_.get(), config.operation_timeout_in_ms(), &builder);
+  master_service_ = NewGrpcMasterService(master_impl_.get(), config, &builder);
   worker_impl_ =
       worker_func ? worker_func(&worker_env_) : NewGrpcWorker(&worker_env_);
   worker_service_ =
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_session.cc b/tensorflow/core/distributed_runtime/rpc/grpc_session.cc
index 120a33f17b0d1f81e50dfbc844f56e3d85def096..fd1c150fa7aab95bee0c492ce553b9c7f58cd487 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_session.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_session.cc
@@ -26,6 +26,7 @@ limitations under the License.
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/protobuf/master.pb.h"
 
@@ -90,6 +91,15 @@ void ReEncodeConsts(GraphDef* gdef) {
 }
 }  // namespace
 
+Status GrpcSession::Handle(string* out_handle) {
+  mutex_lock l(mu_);
+  if (handle_.empty()) {
+    return errors::InvalidArgument("A session is not created yet....");
+  }
+  *out_handle = handle_;
+  return Status::OK();
+}
+
 Status GrpcSession::CreateImpl(CallOptions* call_options,
                                const GraphDef& graph) {
   {
@@ -273,14 +283,9 @@ Status GrpcSession::Run(const std::vector<std::pair<string, Tensor>>& inputs,
 Status GrpcSession::RunProto(CallOptions* call_options,
                              MutableRunStepRequestWrapper* req,
                              MutableRunStepResponseWrapper* resp) {
-  {
-    mutex_lock l(mu_);
-    if (handle_.empty()) {
-      return errors::InvalidArgument("A session is not created yet....");
-    }
-
-    req->set_session_handle(handle_);
-  }
+  string handle;
+  TF_RETURN_IF_ERROR(Handle(&handle));
+  req->set_session_handle(handle);
   return master_->RunStep(call_options, req, resp);
 }
 
@@ -292,14 +297,7 @@ Status GrpcSession::PRunSetup(const std::vector<string>& input_names,
   PartialRunSetupRequest req;
   PartialRunSetupResponse resp;
   CallOptions call_options;
-  {
-    mutex_lock l(mu_);
-    if (handle_.empty()) {
-      return errors::InvalidArgument("A session is not created yet....");
-    }
-
-    req.set_session_handle(handle_);
-  }
+  TF_RETURN_IF_ERROR(Handle(req.mutable_session_handle()));
   for (const string& feed : input_names) {
     req.add_feed(feed);
   }
@@ -399,10 +397,59 @@ Status GrpcSession::Reset(const SessionOptions& options,
   return ret;
 }
 
+Status GrpcSession::MakeCallable(const CallableOptions& callable_options,
+                                 CallableHandle* out_handle) {
+  MakeCallableRequest req;
+  TF_RETURN_IF_ERROR(Handle(req.mutable_session_handle()));
+  *req.mutable_options() = callable_options;
+  MakeCallableResponse resp;
+  CallOptions call_options;
+  call_options.SetTimeout(options_.config.operation_timeout_in_ms());
+  TF_RETURN_IF_ERROR(master_->MakeCallable(&call_options, &req, &resp));
+  *out_handle = resp.handle();
+  return Status::OK();
+}
+
+Status GrpcSession::RunCallable(CallableHandle handle,
+                                const std::vector<Tensor>& feed_tensors,
+                                std::vector<Tensor>* fetch_tensors,
+                                RunMetadata* run_metadata) {
+  RunCallableRequest req;
+  TF_RETURN_IF_ERROR(Handle(req.mutable_session_handle()));
+  req.set_handle(handle);
+  for (const Tensor& feed : feed_tensors) {
+    feed.AsProtoTensorContent(req.mutable_feed()->Add());
+  }
+
+  RunCallableResponse resp;
+  CallOptions call_options;
+  call_options.SetTimeout(options_.config.operation_timeout_in_ms());
+  TF_RETURN_IF_ERROR(master_->RunCallable(&call_options, &req, &resp));
+  for (const TensorProto& fetch : resp.fetch()) {
+    Tensor fetch_tensor;
+    if (!fetch_tensor.FromProto(cpu_allocator(), fetch)) {
+      return errors::Internal(
+          "Could not parse fetched tensor data in response from master.");
+    }
+    fetch_tensors->push_back(std::move(fetch_tensor));
+  }
+  return Status::OK();
+}
+
+Status GrpcSession::ReleaseCallable(CallableHandle handle) {
+  ReleaseCallableRequest req;
+  TF_RETURN_IF_ERROR(Handle(req.mutable_session_handle()));
+  req.set_handle(handle);
+  ReleaseCallableResponse resp;
+  CallOptions call_options;
+  call_options.SetTimeout(options_.config.operation_timeout_in_ms());
+  return master_->ReleaseCallable(&call_options, &req, &resp);
+}
+
 class GrpcSessionFactory : public SessionFactory {
  public:
   bool AcceptsOptions(const SessionOptions& options) override {
-    return StringPiece(options.target).starts_with(kSchemePrefix);
+    return str_util::StartsWith(options.target, kSchemePrefix);
   }
 
   Session* NewSession(const SessionOptions& options) override {
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_session.h b/tensorflow/core/distributed_runtime/rpc/grpc_session.h
index d87956a13515fde533e746d2abd04e4a2f4959ae..63795117f9763434f5ff331d3d2d3bdb99413e81 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_session.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_session.h
@@ -82,20 +82,27 @@ class GrpcSession : public Session {
   Status Close() override;
 
   // NOTE: This API is still experimental and may change.
-  ::tensorflow::Status PRunSetup(const std::vector<string>& input_names,
-                                 const std::vector<string>& output_names,
-                                 const std::vector<string>& target_nodes,
-                                 string* handle) override;
+  Status PRunSetup(const std::vector<string>& input_names,
+                   const std::vector<string>& output_names,
+                   const std::vector<string>& target_nodes,
+                   string* handle) override;
 
   // NOTE: This API is still experimental and may change.
-  ::tensorflow::Status PRun(
-      const string& handle,
-      const std::vector<std::pair<string, Tensor> >& inputs,
-      const std::vector<string>& output_names,
-      std::vector<Tensor>* outputs) override;
+  Status PRun(const string& handle,
+              const std::vector<std::pair<string, Tensor> >& inputs,
+              const std::vector<string>& output_names,
+              std::vector<Tensor>* outputs) override;
 
   Status ListDevices(std::vector<DeviceAttributes>* response) override;
 
+  Status MakeCallable(const CallableOptions& callable_options,
+                      CallableHandle* out_handle) override;
+  Status RunCallable(CallableHandle handle,
+                     const std::vector<Tensor>& feed_tensors,
+                     std::vector<Tensor>* fetch_tensors,
+                     RunMetadata* run_metadata) override;
+  Status ReleaseCallable(CallableHandle handle) override;
+
  protected:
   // Takes ownership of `*master`.
   void SetRemoteMaster(std::unique_ptr<MasterInterface> master);
@@ -111,6 +118,8 @@ class GrpcSession : public Session {
   // The current version of the graph.
   int64 current_graph_version_ GUARDED_BY(mu_);
 
+  Status Handle(string* out_handle) LOCKS_EXCLUDED(mu_);
+
   Status RunHelper(const RunOptions& run_options,
                    const std::vector<std::pair<string, Tensor> >& inputs,
                    const std::vector<string>& output_tensor_names,
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_session_test.cc b/tensorflow/core/distributed_runtime/rpc/grpc_session_test.cc
index 335c3febe20e17e5b5ea57dc68c69e616997e14b..45b15a54a29b481b4888515f18bd913d71c1013c 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_session_test.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_session_test.cc
@@ -120,6 +120,49 @@ TEST(GrpcSessionTest, BasicNonProtoAPI) {
   }
 }
 
+TEST(GrpcSessionTest, BasicCallable) {
+  GraphDef graph;
+  string node_names[3];
+  // c = a * b
+  CreateGraphDef(&graph, node_names);
+
+  std::unique_ptr<test::TestCluster> cluster;
+  TF_CHECK_OK(test::TestCluster::MakeTestCluster(Devices(1, 0), 2, &cluster));
+
+  std::unique_ptr<Session> session(
+      NewRemote(Options(cluster->targets()[0], 1)));
+  ASSERT_TRUE(session != nullptr);
+
+  for (int iters = 0; iters < 25; ++iters) {
+    TF_CHECK_OK(session->Create(graph));
+    {
+      // Just run to target node
+      CallableOptions opts;
+      opts.add_target(node_names[2]);
+      Session::CallableHandle handle;
+      TF_CHECK_OK(session->MakeCallable(opts, &handle));
+      TF_CHECK_OK(session->RunCallable(handle, {}, nullptr, nullptr));
+      TF_CHECK_OK(session->ReleaseCallable(handle));
+    }
+    {
+      // Run to a target node and a real tensor
+      CallableOptions opts;
+      opts.add_target(node_names[1]);
+      opts.add_fetch(node_names[2] + ":0");
+      Session::CallableHandle handle;
+      TF_CHECK_OK(session->MakeCallable(opts, &handle));
+      std::vector<Tensor> outputs;
+      TF_CHECK_OK(session->RunCallable(handle, {}, &outputs, nullptr));
+      ASSERT_EQ(1, outputs.size());
+      ASSERT_TRUE(outputs[0].IsInitialized());
+      ASSERT_EQ(4.0, outputs[0].flat<float>()(0));
+      TF_CHECK_OK(session->ReleaseCallable(handle));
+    }
+
+    TF_CHECK_OK(session->Close());
+  }
+}
+
 TEST(GrpcSessionTest, BasicNonProtoAPIConsistentOrder) {
   GraphDef graph;
   string node_names[3];
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.h b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.h
index 2a2f7e3ffbef10f9f2997fc554f010d3f8689ca2..62b299d5c2c15ee4b4bbf3071ed2f8e194d353e7 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.h
@@ -26,24 +26,16 @@ limitations under the License.
 #include "grpc++/impl/codegen/sync_stream.h"
 #include "grpc++/support/byte_buffer.h"
 
-#include "tensorflow/core/distributed_runtime/rpc/grpc_serialization_traits.h"
 #include "tensorflow/core/distributed_runtime/tensor_coding.h"
 #include "tensorflow/core/protobuf/worker.pb.h"
 
-// Contains potentially large GraphDef.
-TF_GRPC_ALLOW_UNLIMITED_MESSAGE_SIZE(tensorflow::RegisterGraphRequest);
-// Contains potentially large TensorProto.
-TF_GRPC_ALLOW_UNLIMITED_MESSAGE_SIZE(tensorflow::RunGraphRequest);
-// Contains potentially large StepStats, TensorProto.
-TF_GRPC_ALLOW_UNLIMITED_MESSAGE_SIZE(tensorflow::RunGraphResponse);
-
 namespace tensorflow {
 class GrpcByteSource : public TensorResponse::Source {
  public:
-  explicit GrpcByteSource(grpc_byte_buffer* buffer) : buffer_(buffer) {}
+  explicit GrpcByteSource(::grpc::ByteBuffer* buffer) : buffer_(buffer) {}
   ~GrpcByteSource() override { DeleteStream(); }
 
-  typedef ::grpc::tensorflow_helper::GrpcBufferReader Reader;
+  typedef ::grpc::GrpcProtoBufferReader Reader;
 
   protobuf::io::ZeroCopyInputStream* contents() override {
     DeleteStream();
@@ -58,7 +50,7 @@ class GrpcByteSource : public TensorResponse::Source {
     }
   }
 
-  grpc_byte_buffer* buffer_;  // Not owned
+  ::grpc::ByteBuffer* buffer_;  // Not owned
   Reader* stream_ = nullptr;  // Points into space_ if non-nullptr
   char space_[sizeof(Reader)];
 };
@@ -74,17 +66,15 @@ class ServerContext;
 // Support parsing/unparsing of tensorflow::TensorResponse.
 // Wire-format is identical to RecvTensorResponse.
 template <>
-class SerializationTraits<tensorflow::TensorResponse>
-    : public UnlimitedSizeProtoSerializationTraits<tensorflow::TensorResponse> {
+class SerializationTraits<tensorflow::TensorResponse> {
  public:
-  static Status Serialize(const tensorflow::TensorResponse& msg,
-                          grpc_byte_buffer** bp, bool* own_buffer) {
+  static Status Serialize(const tensorflow::TensorResponse& msg, ByteBuffer* bp,
+                          bool* own_buffer) {
     LOG(FATAL) << "TODO(sanjay,jeff): Implement";
     return Status();
   }
-  static Status Deserialize(grpc_byte_buffer* buffer,
-                            tensorflow::TensorResponse* msg,
-                            int max_message_size = INT_MAX) {
+  static Status Deserialize(ByteBuffer* buffer,
+                            tensorflow::TensorResponse* msg) {
     if (buffer == nullptr) {
       return Status(StatusCode::INTERNAL, "No payload");
     }
@@ -98,7 +88,7 @@ class SerializationTraits<tensorflow::TensorResponse>
                             "TensorResponse parse error", s.ToString()));
       }
     }
-    grpc_byte_buffer_destroy(buffer);
+    buffer->Clear();
     return result;
   }
 };
diff --git a/tensorflow/core/framework/allocator.cc b/tensorflow/core/framework/allocator.cc
index a382b8be95f143898a8f52f887b9396f3823372b..1a7e5219cd243a1e0eb64c13317f109fe5a93336 100644
--- a/tensorflow/core/framework/allocator.cc
+++ b/tensorflow/core/framework/allocator.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/core/framework/visitable_allocator.h"
+#include "tensorflow/core/framework/allocator.h"
 
 #include "tensorflow/core/framework/allocator_registry.h"
 #include "tensorflow/core/framework/log_memory.h"
@@ -61,6 +61,26 @@ static bool cpu_allocator_collect_stats = false;
 // If true, cpu allocator collects full stats.
 static bool cpu_allocator_collect_full_stats = false;
 
+// Individual allocations large than this amount will trigger a warning.
+static const double kLargeAllocationWarningThreshold = 0.1;
+
+// If cpu_allocator_collect_stats is true, warn when the total allocated memory
+// exceeds this threshold.
+static const double kTotalAllocationWarningThreshold = 0.5;
+
+// Cache first invocation to port::AvailableRam, as it can be expensive.
+static int64_t LargeAllocationWarningBytes() {
+  static int64_t value = static_cast<int64>(port::AvailableRam() *
+                                            kLargeAllocationWarningThreshold);
+  return value;
+}
+
+static int64_t TotalAllocationWarningBytes() {
+  static int64_t value = static_cast<int64>(port::AvailableRam() *
+                                            kTotalAllocationWarningThreshold);
+  return value;
+}
+
 void EnableCPUAllocatorStats(bool enable) {
   cpu_allocator_collect_stats = enable;
 }
@@ -68,17 +88,19 @@ void EnableCPUAllocatorFullStats(bool enable) {
   cpu_allocator_collect_full_stats = enable;
 }
 
-class CPUAllocator : public VisitableAllocator {
+class CPUAllocator : public Allocator {
  public:
-  CPUAllocator() : allocation_begun_(false) {}
+  CPUAllocator() : total_allocation_warning_triggered_(false) {}
 
   ~CPUAllocator() override {}
 
   string Name() override { return "cpu"; }
 
   void* AllocateRaw(size_t alignment, size_t num_bytes) override {
-    if (!allocation_begun_) {
-      allocation_begun_ = true;
+    if (num_bytes > LargeAllocationWarningBytes()) {
+      LOG(WARNING) << "Allocation of " << num_bytes << " exceeds "
+                   << 100 * kLargeAllocationWarningThreshold
+                   << "% of system memory.";
     }
 
     void* p = port::AlignedMalloc(num_bytes, alignment);
@@ -91,39 +113,25 @@ class CPUAllocator : public VisitableAllocator {
           std::max<int64>(stats_.max_bytes_in_use, stats_.bytes_in_use);
       stats_.max_alloc_size =
           std::max<int64>(stats_.max_alloc_size, alloc_size);
-    }
 
-    // visit each Visitor in alloc_visitors_
-    if (p != nullptr) {
-      for (const Visitor& v : alloc_visitors_) {
-        v(p, num_bytes);
+      if (stats_.bytes_in_use > TotalAllocationWarningBytes() &&
+          !total_allocation_warning_triggered_) {
+        LOG(WARNING) << "Total allocated memory " << stats_.bytes_in_use
+                     << "exceeds " << 100 * kTotalAllocationWarningThreshold
+                     << "% of system memory";
+        total_allocation_warning_triggered_ = true;
       }
     }
-
     return p;
   }
 
   void DeallocateRaw(void* ptr) override {
-    std::size_t alloc_size;
-    bool init_alloc_size = false;
     if (cpu_allocator_collect_stats) {
-      alloc_size = port::MallocExtension_GetAllocatedSize(ptr);
-      init_alloc_size = true;
+      const std::size_t alloc_size =
+          port::MallocExtension_GetAllocatedSize(ptr);
       mutex_lock l(mu_);
       stats_.bytes_in_use -= alloc_size;
     }
-
-    // visit each Visitor in free_visitors_
-    if (ptr != nullptr) {
-      if (!init_alloc_size) {
-        alloc_size = port::MallocExtension_GetAllocatedSize(ptr);
-        init_alloc_size = true;
-      }
-      for (const Visitor& v : free_visitors_) {
-        v(ptr, alloc_size);
-      }
-    }
-
     port::AlignedFree(ptr);
   }
 
@@ -143,35 +151,10 @@ class CPUAllocator : public VisitableAllocator {
     return port::MallocExtension_GetAllocatedSize(ptr);
   }
 
-  // REQUIRES: can only add visitors before the first Allocate call
-
-  void AddAllocVisitor(Visitor visitor) override {
-    mutex_lock lock(visitor_mutex_);
-    CHECK(!allocation_begun_)
-        << "AddAllocVisitor may not be called after allocation has begun.";
-    alloc_visitors_.push_back(visitor);
-  }
-
-  void AddFreeVisitor(Visitor visitor) override {
-    mutex_lock lock(visitor_mutex_);
-    CHECK(!allocation_begun_)
-        << "AddFreeVisitor may not be called after allocation has begun.";
-    free_visitors_.push_back(visitor);
-  }
-
  private:
   mutex mu_;
   AllocatorStats stats_ GUARDED_BY(mu_);
-
-  // visitor_mutex_ protects write access to alloc_visitors_ and free_visitors_.
-  // While write access is mutually exclusive, reads may happen concurrently.
-  // This is okay because we may only append to alloc_visitors_ and
-  // free_visitors_ before first allocation, and subsequently we only read these
-  // vectors.
-  mutex visitor_mutex_;
-  std::vector<Visitor> alloc_visitors_;
-  std::vector<Visitor> free_visitors_;
-  std::atomic<bool> allocation_begun_;
+  bool total_allocation_warning_triggered_ GUARDED_BY(mu_);
 
   TF_DISALLOW_COPY_AND_ASSIGN(CPUAllocator);
 };
diff --git a/tensorflow/core/framework/attr_value_util.cc b/tensorflow/core/framework/attr_value_util.cc
index ebb56d525e52e7351e4159dce44349ce0649921c..87c1ddd15df4f89e29b1d073f4380e65dae531f9 100644
--- a/tensorflow/core/framework/attr_value_util.cc
+++ b/tensorflow/core/framework/attr_value_util.cc
@@ -186,7 +186,7 @@ Status AttrValueHasType(const AttrValue& attr_value, StringPiece type) {
   // check if has_list is false and some other field in attr_value is
   // set to flag the error.  This test can be made more strict once
   // support for GraphDef versions <= 4 is dropped.
-  if (StringPiece(type).starts_with("list(") && !attr_value.has_list()) {
+  if (str_util::StartsWith(type, "list(") && !attr_value.has_list()) {
     if (num_set) {
       return errors::InvalidArgument(
           "AttrValue missing value with expected type '", type, "'");
@@ -197,7 +197,7 @@ Status AttrValueHasType(const AttrValue& attr_value, StringPiece type) {
   }
 
   // Okay to have an empty list, but not to be missing a non-list value.
-  if (num_set == 0 && !StringPiece(type).starts_with("list(")) {
+  if (num_set == 0 && !str_util::StartsWith(type, "list(")) {
     return errors::InvalidArgument(
         "AttrValue missing value with expected type '", type, "'");
   }
@@ -241,29 +241,29 @@ Status AttrValueHasType(const AttrValue& attr_value, StringPiece type) {
 bool ParseAttrValue(StringPiece type, StringPiece text, AttrValue* out) {
   // Parse type.
   string field_name;
-  bool is_list = type.Consume("list(");
-  if (type.Consume("string")) {
+  bool is_list = str_util::ConsumePrefix(&type, "list(");
+  if (str_util::ConsumePrefix(&type, "string")) {
     field_name = "s";
-  } else if (type.Consume("int")) {
+  } else if (str_util::ConsumePrefix(&type, "int")) {
     field_name = "i";
-  } else if (type.Consume("float")) {
+  } else if (str_util::ConsumePrefix(&type, "float")) {
     field_name = "f";
-  } else if (type.Consume("bool")) {
+  } else if (str_util::ConsumePrefix(&type, "bool")) {
     field_name = "b";
-  } else if (type.Consume("type")) {
+  } else if (str_util::ConsumePrefix(&type, "type")) {
     field_name = "type";
-  } else if (type.Consume("shape")) {
+  } else if (str_util::ConsumePrefix(&type, "shape")) {
     field_name = "shape";
-  } else if (type.Consume("tensor")) {
+  } else if (str_util::ConsumePrefix(&type, "tensor")) {
     field_name = "tensor";
-  } else if (type.Consume("func")) {
+  } else if (str_util::ConsumePrefix(&type, "func")) {
     field_name = "func";
-  } else if (type.Consume("placeholder")) {
+  } else if (str_util::ConsumePrefix(&type, "placeholder")) {
     field_name = "placeholder";
   } else {
     return false;
   }
-  if (is_list && !type.Consume(")")) {
+  if (is_list && !str_util::ConsumePrefix(&type, ")")) {
     return false;
   }
 
diff --git a/tensorflow/core/framework/attr_value_util_test.cc b/tensorflow/core/framework/attr_value_util_test.cc
index e4fad917ffe1d4a0790bf1fd56e3c72f841523d8..1a3994736cb5627c590c3029c7b9e163dff2351c 100644
--- a/tensorflow/core/framework/attr_value_util_test.cc
+++ b/tensorflow/core/framework/attr_value_util_test.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/core/framework/attr_value_util.h"
 
+#include <numeric>
 #include <vector>
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
diff --git a/tensorflow/core/framework/collective.cc b/tensorflow/core/framework/collective.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a26f2c2f315df749db2cd5995c4fac981abb73b2
--- /dev/null
+++ b/tensorflow/core/framework/collective.cc
@@ -0,0 +1,120 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/framework/collective.h"
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/hash/hash.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+
+namespace tensorflow {
+
+string CollGroupParams::ToString() const {
+  return strings::StrCat("CollGroupParams {group_key=", group_key,
+                         " group_size=", group_size,
+                         " device_type=", device_type.type_string(),
+                         " num_tasks=", num_tasks, "}");
+}
+
+CollInstanceParams& CollInstanceParams::operator=(
+    const CollInstanceParams& other) {
+  if (this != &other) {
+    instance_key = other.instance_key;
+    type = other.type;
+    data_type = other.data_type;
+    shape = other.shape;
+    device_names.clear();
+    device_names.assign(other.device_names.begin(), other.device_names.end());
+    task_names.assign(other.task_names.begin(), other.task_names.end());
+    impl_details.subdiv_offsets.assign(
+        other.impl_details.subdiv_offsets.begin(),
+        other.impl_details.subdiv_offsets.end());
+    impl_details.subdiv_permutations.clear();
+    for (auto p : other.impl_details.subdiv_permutations) {
+      impl_details.subdiv_permutations.push_back(
+          std::vector<int>(p.begin(), p.end()));
+    }
+    impl_details.subdiv_source_rank.assign(
+        other.impl_details.subdiv_source_rank.begin(),
+        other.impl_details.subdiv_source_rank.end());
+  }
+  return *this;
+}
+
+string CollInstanceParams::ToString() const {
+  string v = strings::StrCat("CollInstanceParams { instance_key=", instance_key,
+                             " type=", type, " data_type=", data_type,
+                             " shape=", shape.DebugString(), " devices {");
+  for (const auto& d : device_names) {
+    strings::StrAppend(&v, d, ",");
+  }
+  strings::StrAppend(&v, "} task_names={");
+  for (const auto& n : task_names) {
+    strings::StrAppend(&v, n, ", ");
+  }
+  strings::StrAppend(&v, "}, subdiv_offsets={");
+  for (const auto& d : impl_details.subdiv_offsets) {
+    strings::StrAppend(&v, d, ",");
+  }
+  strings::StrAppend(&v, "}, subdiv_perms={");
+  for (const auto& p : impl_details.subdiv_permutations) {
+    strings::StrAppend(&v, "{");
+    for (const auto& i : p) {
+      strings::StrAppend(&v, i, ",");
+    }
+    strings::StrAppend(&v, "}");  // one subdiv
+  }
+  strings::StrAppend(&v, "}");  // all subdivs
+  return v;
+}
+
+string CollTaskParams::ToString() const {
+  string v = strings::StrCat("CollTaskParams {is_local={");
+  for (const auto& b : is_local) {
+    strings::StrAppend(&v, static_cast<int>(b), ",");
+  }
+  strings::StrAppend(&v, "}}");
+  return v;
+}
+
+string CollectiveParams::ToString() const {
+  string v = strings::StrCat("CollectiveParams ", name, " {", group.ToString());
+  strings::StrAppend(&v, " ", instance.ToString());
+  strings::StrAppend(&v, " ", task.ToString());
+  strings::StrAppend(&v, " default_rank=", default_rank,
+                     " is_source=", is_source, " subdiv_rank={");
+  for (const auto& r : subdiv_rank) {
+    strings::StrAppend(&v, r, ",");
+  }
+  if (!subdiv_source_rank.empty()) {
+    strings::StrAppend(&v, " subdiv_rank={");
+    for (const auto& r : subdiv_source_rank) {
+      strings::StrAppend(&v, r, ",");
+    }
+    strings::StrAppend(&v, "}");
+  }
+  strings::StrAppend(&v, "}}");
+  return v;
+}
+
+/*static*/ OpKernelContext::Params* CollectiveExecutor::CtxParams(
+    OpKernelContext* ctx) {
+  return ctx->params_;
+}
+
+/*static*/
+int64 CollectiveExecutor::kInvalidId = -1;
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/framework/collective.h b/tensorflow/core/framework/collective.h
new file mode 100644
index 0000000000000000000000000000000000000000..5810c7fa547183001fd4b2bb4aefea1e0b74b32b
--- /dev/null
+++ b/tensorflow/core/framework/collective.h
@@ -0,0 +1,305 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_FRAMEWORK_COLLECTIVE_EXECUTOR_H_
+#define TENSORFLOW_FRAMEWORK_COLLECTIVE_EXECUTOR_H_
+
+#include <string>
+#include <vector>
+
+#include "tensorflow/core/framework/device_base.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/lib/core/refcount.h"
+#include "tensorflow/core/lib/core/status.h"
+
+namespace tensorflow {
+class BufRendezvous;
+class CancellationManager;
+class CompleteGroupRequest;
+class CompleteGroupResponse;
+class CompleteInstanceRequest;
+class CompleteInstanceResponse;
+class DeviceLocality;
+class GetStepSequenceRequest;
+class GetStepSequenceResponse;
+class Op;
+class Tensor;
+
+// Types of supported collective operations.
+enum CollectiveType {
+  REDUCTION_COLLECTIVE = 0,
+  BROADCAST_COLLECTIVE,
+  UNDEFINED_COLLECTIVE,
+};
+
+// Data common to all members of a device group.
+// All members share the same device set but its order is
+// particular to an instance so it is stored there.
+struct CollGroupParams {
+  int32 group_key;
+  int32 group_size;
+  DeviceType device_type;
+  int32 num_tasks;  // number of distinct tasks in group
+  string ToString() const;
+  CollGroupParams() : device_type(DEVICE_CPU) {}
+};
+
+// The best implementation of a collective op depends on many factors
+// including the number of devices involved, the topology of
+// interconnects between them and the sizes of inputs.  This structure
+// is used in generating and representing data movement choreography
+// for each specific algorithm, hence it does not have a single, fixed
+// interpretation.  On first execution the runtime will update this
+// structure with decisions that will guide all subsequent executions.
+struct CollImplDetails {
+  std::vector<std::vector<int>> subdiv_permutations;
+  std::vector<int> subdiv_offsets;
+  // broadcast only: rank of source in each subdiv
+  std::vector<int> subdiv_source_rank;
+};
+
+// Data common to all members of a collective instance.
+struct CollInstanceParams {
+  int32 instance_key;  // Identifies all participating graph nodes.
+  CollectiveType type;
+  DataType data_type;
+  TensorShape shape;
+  // Fully qualified name of device for each member, in default rank order.
+  std::vector<string> device_names;
+  // Task name prefix of corresponding device name.
+  std::vector<string> task_names;
+  CollImplDetails impl_details;
+  string ToString() const;
+  CollInstanceParams& operator=(const struct CollInstanceParams& other);
+};
+
+// Data common to all instance members in the same task.
+struct CollTaskParams {
+  // True for devices that are local to the process, i.e. no RPC needed.
+  std::vector<bool> is_local;
+  string ToString() const;
+};
+
+// Unique to a single CollectiveOp node.
+struct CollectiveParams {
+  CollGroupParams group;
+  CollInstanceParams instance;
+  CollTaskParams task;
+
+  string name;       // node name used only for log or error messages
+  int default_rank;  // index of this op within device_names
+  bool is_source;    // broadcast only
+  // Rank of this device in each subdivision permutation.
+  std::vector<int> subdiv_rank;
+  std::vector<int> subdiv_source_rank;
+  std::unique_ptr<OpKernel> merge_op;  // reduction only
+  std::unique_ptr<OpKernel> final_op;  // reduction only
+  string ToString() const;
+};
+
+class CollectiveExecutor;
+
+// Interface that provides resolution of device localities.
+class DeviceResolverInterface {
+ public:
+  virtual ~DeviceResolverInterface() {}
+
+  // Collects DeviceLocality protobufs from all of the devices identified
+  // in 'col_params'.
+  virtual void GetDeviceLocalitiesAsync(const CollInstanceParams& inst_params,
+                                        std::vector<DeviceLocality>* localities,
+                                        const StatusCallback& done) = 0;
+
+  // Populate *locality with the DeviceLocality of the specified
+  // device.
+  virtual void GetLocalityAsync(const string& device, const string& task,
+                                DeviceLocality* locality,
+                                const StatusCallback& done) = 0;
+
+  // Clear the cache of device data belonging
+  // to the specified task.
+  virtual void ClearTask(const string& task) = 0;
+};
+
+// Interface that provides resolution of shared CollectiveParams fields.
+class ParamResolverInterface {
+ public:
+  virtual ~ParamResolverInterface() {}
+
+  // Called by each collective op at first execution in order to fill out
+  // the CollectiveParams structure with data gathered from the full
+  // (maybe distributed) collection of peer nodes.
+  virtual void CompleteParamsAsync(const string& device, CollectiveParams* cp,
+                                   CancellationManager* cancel_mgr,
+                                   const StatusCallback& done) = 0;
+
+  // Used within a distributed implementation to discover/verify
+  // data shared across a device group.
+  virtual void CompleteGroupAsync(const CompleteGroupRequest* request,
+                                  CompleteGroupResponse* response,
+                                  CancellationManager* cancel_mgr,
+                                  const StatusCallback& done) = 0;
+
+  // Used within a distributed implementation to discover/verify data
+  // shared across an instance group.
+  virtual void CompleteInstanceAsync(const CompleteInstanceRequest* request,
+                                     CompleteInstanceResponse* response,
+                                     CancellationManager* cancel_mgr,
+                                     const StatusCallback& done) = 0;
+};
+
+// Graphs which utilize Collective Ops in a common instance must
+// execute with identical step_ids even if they are disjoint graphs
+// run by otherwise independent tasks.  This interface supplies
+// coordinated step_ids to use in such cases.
+class StepSequenceInterface {
+ public:
+  virtual ~StepSequenceInterface() {}
+
+  // Used with a distributed implementation to coordinate step_id
+  // sequences across tasks.
+  virtual void GetStepSequenceAsync(const GetStepSequenceRequest* request,
+                                    GetStepSequenceResponse* response,
+                                    const StatusCallback& done) = 0;
+
+  // Refresh the local per-graph_key step_id sequence from collective
+  // group leader, if applicable.
+  virtual void RefreshStepIdSequenceAsync(int64 graph_key,
+                                          const StatusCallback& done) = 0;
+
+  // Returns the the step_id that should be used for initiating a new execution
+  // on the specified graph. May return the same step_id multiple times if
+  // RetireStepId or RefreshStepIdReservation is not called.
+  virtual int64 NextStepId(int64 graph_key) = 0;
+
+  // Reports that execution of the given step has completed successfully.
+  // Should be called immediately after a step completes with OK status,
+  // prior to calling NextStepId().  If the step fails, don't call.
+  virtual void RetireStepId(int64 graph_key, int64 step_id) = 0;
+};
+
+// Interface that provides access to per-step CollectiveExecutor
+// instances and various distributed resolution capabilities.
+class CollectiveExecutorMgrInterface : public StepSequenceInterface {
+ public:
+  virtual ~CollectiveExecutorMgrInterface() {}
+
+  // Returns the step-specific CollectiveExecutor, creating if one does not
+  // already exist.  The caller assumes ownership of one Ref on the object.
+  virtual CollectiveExecutor* FindOrCreate(int64 step_id) = 0;
+
+  // If there is a CollectiveExecutor for step_id, remove it from the
+  // table.
+  virtual void Cleanup(int64 step_id) = 0;
+
+  virtual ParamResolverInterface* GetParamResolver() const = 0;
+
+  virtual DeviceResolverInterface* GetDeviceResolver() const = 0;
+};
+
+// Interface that a Collective Op implementation uses to exchange data
+// with peers.  Note that data exchange is currently limited to types
+// for which DMAHelper::CanUseDMA() returns true, i.e.  dense numeric
+// types.
+class PeerAccessInterface {
+ public:
+  virtual ~PeerAccessInterface() {}
+
+  virtual void RecvFromPeer(const string& peer_device, const string& peer_task,
+                            bool peer_is_local, const string& key,
+                            Device* to_device, DeviceContext* to_device_ctx,
+                            const AllocatorAttributes& to_alloc_attr,
+                            Tensor* to_tensor,
+                            const DeviceLocality& client_locality,
+                            const StatusCallback& done) = 0;
+
+  virtual void PostToPeer(const string& peer_device, const string& peer_task,
+                          const string& key, Device* from_device,
+                          DeviceContext* from_device_ctx,
+                          const AllocatorAttributes& from_alloc_attr,
+                          const Tensor* from_tensor,
+                          const DeviceLocality& client_locality,
+                          const StatusCallback& done) = 0;
+};
+
+class PerStepCollectiveRemoteAccess;
+
+// A step-specific object that can execute a collective operation completely
+// described by a CollectiveParams object.
+class CollectiveExecutor : public PeerAccessInterface, public core::RefCounted {
+ public:
+  virtual void StartAbort(const Status& s) {}
+
+  virtual void ExecuteAsync(OpKernelContext* ctx,
+                            const CollectiveParams& col_params,
+                            const string& exec_key, StatusCallback done) {
+    done(errors::Internal(
+        "A collective Op has been called in a context in which "
+        "a CollectiveExecutor has not been provided."));
+  }
+
+  virtual void CompleteParamsAsync(const string& device, CollectiveParams* cp,
+                                   CancellationManager* cancel_mgr,
+                                   StatusCallback done) {
+    cem_->GetParamResolver()->CompleteParamsAsync(device, cp, cancel_mgr, done);
+  }
+
+  virtual PerStepCollectiveRemoteAccess* remote_access() { return nullptr; }
+
+  // Used to designate an invalid group or instance key.
+  static int64 kInvalidId;
+
+  // Lexically scoped handle for Ref.
+  class Handle {
+   public:
+    explicit Handle(CollectiveExecutor* ce, bool inherit_ref) : ce_(ce) {
+      if (!inherit_ref) ce->Ref();
+    }
+    ~Handle() { ce_->Unref(); }
+    CollectiveExecutor* get() const { return ce_; }
+
+   private:
+    CollectiveExecutor* ce_;
+  };
+
+ protected:
+  explicit CollectiveExecutor(CollectiveExecutorMgrInterface* cem)
+      : cem_(cem) {}
+
+  // For use only by derived classes
+  static OpKernelContext::Params* CtxParams(OpKernelContext* ctx);
+  CollectiveExecutorMgrInterface* cem_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(CollectiveExecutor);
+};
+
+// Interface of a helper object that provices a CollectiveExecutor with
+// all of the remote access it needs.
+class CollectiveRemoteAccess : public PeerAccessInterface,
+                               public DeviceResolverInterface {
+ public:
+  virtual ~CollectiveRemoteAccess() {}
+};
+
+// A per-step version of CollectiveRemoteAccess that cleans up outstanding
+// communications in case step execution is abandoned.
+class PerStepCollectiveRemoteAccess : public CollectiveRemoteAccess {
+ public:
+  virtual ~PerStepCollectiveRemoteAccess() {}
+  virtual void StartAbort(const Status& s) = 0;
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_FRAMEWORK_COLLECTIVE_EXECUTOR_H_
diff --git a/tensorflow/core/framework/common_shape_fns.cc b/tensorflow/core/framework/common_shape_fns.cc
index 2fb17c2b020a32060fae45e4b7b29db5b685de06..72eeda7a43eb3255fcdec803771649f0e10bd823 100644
--- a/tensorflow/core/framework/common_shape_fns.cc
+++ b/tensorflow/core/framework/common_shape_fns.cc
@@ -504,8 +504,8 @@ Status Conv3DShape(shape_inference::InferenceContext* c) {
     input_shape =
         c->MakeShape({{dim('N'), dim('0'), dim('1'), dim('2'), dim('C')}});
     stride_planes = strides[2];
-    stride_cols = strides[3];
-    stride_rows = strides[4];
+    stride_rows = strides[3];
+    stride_cols = strides[4];
   } else {
     stride_planes = strides[1];
     stride_rows = strides[2];
diff --git a/tensorflow/core/framework/common_shape_fns.h b/tensorflow/core/framework/common_shape_fns.h
index 7230e0f09cc2cb5ac4b2f107d12bf879c82123a2..789746b4037fbf9f11d34c425272bfc44d8623be 100644
--- a/tensorflow/core/framework/common_shape_fns.h
+++ b/tensorflow/core/framework/common_shape_fns.h
@@ -272,7 +272,7 @@ Status BroadcastBinaryOpOutputShapeFn(InferenceContext* c, int output_index);
 // Shape function for binary operators that broadcast their inputs.
 // Tested by ops/math_ops_test.cc.
 inline Status BroadcastBinaryOpShapeFn(InferenceContext* c) {
-    return BroadcastBinaryOpOutputShapeFn(c, 0);
+  return BroadcastBinaryOpOutputShapeFn(c, 0);
 }
 
 // Shape function for random operations.
diff --git a/tensorflow/core/framework/common_shape_fns_test.cc b/tensorflow/core/framework/common_shape_fns_test.cc
index 5f3e5ad45731750bfd73181c41cd029f23aab55f..13d429b89519cf7cc696a251030a26a0d8ff67f4 100644
--- a/tensorflow/core/framework/common_shape_fns_test.cc
+++ b/tensorflow/core/framework/common_shape_fns_test.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include "tensorflow/core/framework/shape_inference_testutil.h"
 #include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
@@ -140,9 +141,8 @@ TEST(CommonShapeFnsTest, MatMulShapeTest) {
                        {}, {}, {});
     auto s = MatMulShape(&c);
     EXPECT_FALSE(s.ok());
-    EXPECT_TRUE(
-        StringPiece(s.ToString())
-            .contains("Invalid argument: Shape must be rank 2 but is rank 1"));
+    EXPECT_TRUE(str_util::StrContains(
+        s.ToString(), "Invalid argument: Shape must be rank 2 but is rank 1"));
   }
 
   {
@@ -161,10 +161,9 @@ TEST(CommonShapeFnsTest, MatMulShapeTest) {
                        {S({2, 5}), S({3, 4})}, {}, {}, {});
     auto s = MatMulShape(&c);
     EXPECT_FALSE(s.ok());
-    EXPECT_TRUE(
-        StringPiece(s.ToString())
-            .contains(
-                "Invalid argument: Dimensions must be equal, but are 5 and 3"));
+    EXPECT_TRUE(str_util::StrContains(
+        s.ToString(),
+        "Invalid argument: Dimensions must be equal, but are 5 and 3"));
   }
 
   {
@@ -173,9 +172,8 @@ TEST(CommonShapeFnsTest, MatMulShapeTest) {
                        {S({2, 5, 3}), S({3, 5, 4})}, {}, {}, {});
     auto s = MatMulShape(&c);
     EXPECT_FALSE(s.ok());
-    EXPECT_TRUE(
-        StringPiece(s.ToString())
-            .contains("Invalid argument: Shape must be rank 2 but is rank 3"));
+    EXPECT_TRUE(str_util::StrContains(
+        s.ToString(), "Invalid argument: Shape must be rank 2 but is rank 3"));
   }
 
   {
diff --git a/tensorflow/core/framework/dataset.h b/tensorflow/core/framework/dataset.h
index beaf0adbc5e0972faac96ac975c887c0080ec74f..9e7ffe6c0be1ed1d8b9fa8d4be50b9b516c034c6 100644
--- a/tensorflow/core/framework/dataset.h
+++ b/tensorflow/core/framework/dataset.h
@@ -201,7 +201,7 @@ class GraphDefBuilderWrapper {
   // Also looks up the `op_def->name` in the global
   // `WhitelistedStatefulOpRegistry`.
   bool IsOpWhitelisted(const OpDef* op_def) const {
-    return (StringPiece(op_def->name()).ends_with("Dataset") &&
+    return (str_util::EndsWith(op_def->name(), "Dataset") &&
             op_def->output_arg_size() == 1 &&
             op_def->output_arg(0).type() == DT_VARIANT) ||
            dataset::WhitelistedStatefulOpRegistry::Global()->Contains(
@@ -474,11 +474,11 @@ class GraphDatasetBase : public DatasetBase {
   }
 
   // Key for storing the Dataset graph in the serialized format.
-  static const char kDatasetGraphKey[];
+  TF_EXPORT static const char kDatasetGraphKey[];
 
   // Key for storing the output node of the Dataset graph in the serialized
   // format.
-  static const char kDatasetGraphOutputNodeKey[];
+  TF_EXPORT static const char kDatasetGraphOutputNodeKey[];
 
  private:
   Status Serialize(OpKernelContext* ctx, string* serialized_graph_def,
diff --git a/tensorflow/core/framework/function.cc b/tensorflow/core/framework/function.cc
index 3e7b89d4ebc91df42ee81c1c9fe67c68e755f736..bdc1af9fdaeb2e2ba7605b6f67ea55fa0bb7977a 100644
--- a/tensorflow/core/framework/function.cc
+++ b/tensorflow/core/framework/function.cc
@@ -30,6 +30,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
 #include "tensorflow/core/lib/gtl/map_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/util/equal_graph_def.h"
 
 namespace tensorflow {
@@ -278,7 +279,7 @@ class FunctionInstantiationHelper {
       auto it = index_.lower_bound(node_name);
       while (it != index_.end() && it->first <= node_colon_bound) {
         if (it->first == node_name ||
-            tensorflow::StringPiece(it->first).starts_with(node_colon)) {
+            tensorflow::str_util::StartsWith(it->first, node_colon)) {
           nid = it->second.nid;
           break;
         }
@@ -502,7 +503,7 @@ string Print(const NodeDef& n) {
   std::vector<StringPiece> dat;
   std::vector<string> dep;
   for (StringPiece s : n.input()) {
-    if (s.Consume("^")) {
+    if (str_util::ConsumePrefix(&s, "^")) {
       dep.push_back(s.ToString());
     } else {
       dat.push_back(s);
diff --git a/tensorflow/core/framework/function_test.cc b/tensorflow/core/framework/function_test.cc
index 23685e9c536a67ca33fbabdab438e7192c8a47fc..44e1383719c9c903f956fca0b1ba93ec5df4adb4 100644
--- a/tensorflow/core/framework/function_test.cc
+++ b/tensorflow/core/framework/function_test.cc
@@ -496,7 +496,7 @@ MySelect(x:float) -> (z:float) {
 }
 
 static void HasError(const Status& s, const string& substr) {
-  EXPECT_TRUE(StringPiece(s.ToString()).contains(substr))
+  EXPECT_TRUE(str_util::StrContains(s.ToString(), substr))
       << ">>" << s << "<<, expected substring >>" << substr << "<<";
 }
 
diff --git a/tensorflow/core/framework/graph_def_util.cc b/tensorflow/core/framework/graph_def_util.cc
index 896cb3cd7ffe45f2d528761403cfa4aaed902d96..f7539d37be08ce1235f35dcc0a8fd0bfcb12b434 100644
--- a/tensorflow/core/framework/graph_def_util.cc
+++ b/tensorflow/core/framework/graph_def_util.cc
@@ -29,6 +29,7 @@ limitations under the License.
 #include "tensorflow/core/framework/versions.pb_text.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 
 namespace tensorflow {
@@ -94,7 +95,7 @@ static Status RemoveNewDefaultAttrsFromNodeDef(
   std::vector<string> to_remove;
   for (const auto& attr : node_def->attr()) {
     // If the attr is not in consumer_op_def and doesn't start with '_'...
-    if (!StringPiece(attr.first).starts_with("_") &&
+    if (!str_util::StartsWith(attr.first, "_") &&
         FindAttr(attr.first, *consumer_op_def) == nullptr) {
       const OpDef::AttrDef* producer_attr_def =
           FindAttr(attr.first, *producer_op_def);
diff --git a/tensorflow/core/framework/node_def_builder_test.cc b/tensorflow/core/framework/node_def_builder_test.cc
index e836873f667a6971b2c12d44860e5436a04cb93c..cc583df348b8d4d5416e428698fe1a49c29f3637 100644
--- a/tensorflow/core/framework/node_def_builder_test.cc
+++ b/tensorflow/core/framework/node_def_builder_test.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/core/framework/op_def_builder.h"
 #include "tensorflow/core/framework/op_def_util.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/platform/test.h"
 
@@ -82,7 +83,7 @@ class NodeDefBuilderTest : public ::testing::Test {
     EXPECT_FALSE(status.ok()) << SummarizeNodeDef(node_def);
     if (status.ok()) return;
     for (const string& message : messages) {
-      EXPECT_TRUE(StringPiece(status.error_message()).contains(message))
+      EXPECT_TRUE(str_util::StrContains(status.error_message(), message))
           << status << ", " << message;
     }
   }
@@ -103,7 +104,7 @@ class NodeDefBuilderTest : public ::testing::Test {
     }
     EXPECT_FALSE(status.ok()) << SummarizeNodeDef(node_def);
     if (status.ok()) return;
-    EXPECT_TRUE(StringPiece(status.error_message()).contains(message))
+    EXPECT_TRUE(str_util::StrContains(status.error_message(), message))
         << "Actual error: " << status.error_message()
         << "\nDoes not contain: " << message;
   }
diff --git a/tensorflow/core/framework/node_def_util.cc b/tensorflow/core/framework/node_def_util.cc
index 95fb3863144e8150d78f5d21722f6bc102c451ea..bad92ca9b3d8c981a5dc56485d218179190e83d0 100644
--- a/tensorflow/core/framework/node_def_util.cc
+++ b/tensorflow/core/framework/node_def_util.cc
@@ -31,6 +31,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/gtl/map_util.h"
 #include "tensorflow/core/lib/strings/scanner.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/protobuf.h"
 
@@ -131,7 +132,7 @@ Status AttrSlice::Find(StringPiece attr_name,
   // Skip AttachDef for internal attrs since it is a little bit
   // expensive and it is common for them to correctly not be included
   // in a NodeDef.
-  if (!attr_name.starts_with("_") && ndef_ != nullptr) {
+  if (!str_util::StartsWith(attr_name, "_") && ndef_ != nullptr) {
     s = AttachDef(s, *ndef_);
   }
   return s;
@@ -399,7 +400,7 @@ Status ValidateNodeDef(const NodeDef& node_def, const OpDef& op_def) {
   size_t num_inputs = 0;
   // TODO(josh11b): Unify the input field validation.
   for (const string& input : node_def.input()) {
-    if (StringPiece(input).starts_with("^")) {
+    if (str_util::StartsWith(input, "^")) {
       seen_control = true;
       if (input.find(':') != string::npos) {
         return errors::InvalidArgument(
@@ -425,7 +426,7 @@ Status ValidateNodeDef(const NodeDef& node_def, const OpDef& op_def) {
   }
   for (const auto& attr : node_def.attr()) {
     // Allow internal optional attributes with names starting with "_".
-    if (StringPiece(attr.first).starts_with("_")) {
+    if (str_util::StartsWith(attr.first, "_")) {
       continue;
     }
     auto iter = op_attrs.find(attr.first);
diff --git a/tensorflow/core/framework/node_def_util_test.cc b/tensorflow/core/framework/node_def_util_test.cc
index ae3a93eafeefb2be3a85e546c085691a72caf2e1..2a49425dba9edeacf71b0ba41b78c082809ab2ae 100644
--- a/tensorflow/core/framework/node_def_util_test.cc
+++ b/tensorflow/core/framework/node_def_util_test.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/core/framework/op_def_util.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/platform/test.h"
 
@@ -65,7 +66,7 @@ void ExpectFailure(const NodeDef& bad, const OpDef& op_def,
       << "; OpDef: " << SummarizeOpDef(op_def);
 
   LOG(INFO) << "Message: " << status.error_message();
-  EXPECT_TRUE(StringPiece(status.ToString()).contains(message))
+  EXPECT_TRUE(str_util::StrContains(status.ToString(), message))
       << "NodeDef: " << SummarizeNodeDef(bad)
       << "; OpDef: " << SummarizeOpDef(op_def) << "\nActual error: " << status
       << "\nDoes not contain: " << message;
@@ -265,7 +266,7 @@ void ExpectInvalidSyntax(const NodeDef& bad, const string& message) {
   EXPECT_TRUE(errors::IsInvalidArgument(status))
       << status << "; NodeDef: " << SummarizeNodeDef(bad);
 
-  EXPECT_TRUE(StringPiece(status.ToString()).contains(message))
+  EXPECT_TRUE(str_util::StrContains(StringPiece(status.ToString()), message))
       << "NodeDef: " << SummarizeNodeDef(bad) << ", " << status << ", "
       << message;
 }
diff --git a/tensorflow/core/framework/op.cc b/tensorflow/core/framework/op.cc
index fc5467b3c86934908c3f1261c79659c6a0469350..5f68c59fe9ae084569d16a38b99294b02a5c5bd8 100644
--- a/tensorflow/core/framework/op.cc
+++ b/tensorflow/core/framework/op.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/gtl/map_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/host_info.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/mutex.h"
@@ -142,7 +143,7 @@ void OpRegistry::Export(bool include_internal, OpList* ops) const {
   out->Reserve(sorted.size());
 
   for (const auto& item : sorted) {
-    if (include_internal || !StringPiece(item.first).starts_with("_")) {
+    if (include_internal || !str_util::StartsWith(item.first, "_")) {
       *out->Add() = item.second->op_def;
     }
   }
diff --git a/tensorflow/core/framework/op_compatibility_test.cc b/tensorflow/core/framework/op_compatibility_test.cc
index b57bdcb841592578de4a2026d70b0e91bae66b02..c782480f1fa859715c46785faa22d01675c3c16e 100644
--- a/tensorflow/core/framework/op_compatibility_test.cc
+++ b/tensorflow/core/framework/op_compatibility_test.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
@@ -96,7 +97,7 @@ class OpCompatibilityTest : public OpsTestBase {
       ADD_FAILURE() << SummarizeOpDef(old_op_def) << " vs. "
                     << SummarizeOpDef(new_op_def);
     } else {
-      EXPECT_TRUE(StringPiece(status.error_message()).contains(error))
+      EXPECT_TRUE(str_util::StrContains(status.error_message(), error))
           << status << " does not contain " << error;
     }
   }
@@ -118,7 +119,7 @@ class OpCompatibilityTest : public OpsTestBase {
       ADD_FAILURE() << SummarizeNodeDef(*node_def());
     } else {
       EXPECT_TRUE(
-          StringPiece(status.error_message()).contains(validation_error))
+          str_util::StrContains(status.error_message(), validation_error))
           << status << " does not contain " << validation_error;
     }
 
@@ -179,7 +180,7 @@ class OpCompatibilityTest : public OpsTestBase {
                     << SummarizeOpDef(*new_op_def);
     } else {
       EXPECT_TRUE(
-          StringPiece(status.error_message()).contains(compatibility_error))
+          str_util::StrContains(status.error_message(), compatibility_error))
           << status << " does not contain " << compatibility_error;
     }
   }
diff --git a/tensorflow/core/framework/op_def.proto b/tensorflow/core/framework/op_def.proto
index ba545a19949e5574086756dc2092033341be4b30..ca0e5e7133af61a4c8e15be7cf8df903eeb648b0 100644
--- a/tensorflow/core/framework/op_def.proto
+++ b/tensorflow/core/framework/op_def.proto
@@ -126,6 +126,12 @@ message OpDef {
   // -------------------------------------------------------------------------
   // Optimization constraints.
 
+  // Ops are marked as stateful if their behavior depends on some state beyond
+  // their input tensors (e.g. variable reading op) or if they have
+  // a side-effect (e.g. printing or asserting ops). Equivalently, stateless ops
+  // must always produce the same output for the same input and have
+  // no side-effects.
+  //
   // By default Ops may be moved between devices.  Stateful ops should
   // either not be moved, or should only be moved if that state can also
   // be moved (e.g. via some sort of save / restore).
diff --git a/tensorflow/core/framework/op_def_builder.cc b/tensorflow/core/framework/op_def_builder.cc
index 962bc11ccbd2b9abdd4ce26dc3e75c45862cdc74..403bd0b5e22a314309ad0994a879d588c124fe54 100644
--- a/tensorflow/core/framework/op_def_builder.cc
+++ b/tensorflow/core/framework/op_def_builder.cc
@@ -112,9 +112,11 @@ bool ConsumeAttrNumber(StringPiece* sp, int64* out) {
 
 bool ConsumeCompoundAttrType(StringPiece* sp, StringPiece* out) {
   auto capture_begin = sp->begin();
-  if (sp->Consume("numbertype") || sp->Consume("numerictype") ||
-      sp->Consume("quantizedtype") || sp->Consume("realnumbertype") ||
-      sp->Consume("realnumberictype")) {
+  if (str_util::ConsumePrefix(sp, "numbertype") ||
+      str_util::ConsumePrefix(sp, "numerictype") ||
+      str_util::ConsumePrefix(sp, "quantizedtype") ||
+      str_util::ConsumePrefix(sp, "realnumbertype") ||
+      str_util::ConsumePrefix(sp, "realnumberictype")) {
     *out = StringPiece(capture_begin, sp->begin() - capture_begin);
     return true;
   }
@@ -155,32 +157,32 @@ void FinalizeAttr(StringPiece spec, OpDef* op_def,
   bool is_list = ConsumeListPrefix(&spec);
   string type;
   StringPiece type_string;  // Used if type == "type"
-  if (spec.Consume("string")) {
+  if (str_util::ConsumePrefix(&spec, "string")) {
     type = "string";
-  } else if (spec.Consume("int")) {
+  } else if (str_util::ConsumePrefix(&spec, "int")) {
     type = "int";
-  } else if (spec.Consume("float")) {
+  } else if (str_util::ConsumePrefix(&spec, "float")) {
     type = "float";
-  } else if (spec.Consume("bool")) {
+  } else if (str_util::ConsumePrefix(&spec, "bool")) {
     type = "bool";
-  } else if (spec.Consume("type")) {
+  } else if (str_util::ConsumePrefix(&spec, "type")) {
     type = "type";
-  } else if (spec.Consume("shape")) {
+  } else if (str_util::ConsumePrefix(&spec, "shape")) {
     type = "shape";
-  } else if (spec.Consume("tensor")) {
+  } else if (str_util::ConsumePrefix(&spec, "tensor")) {
     type = "tensor";
-  } else if (spec.Consume("func")) {
+  } else if (str_util::ConsumePrefix(&spec, "func")) {
     type = "func";
   } else if (ConsumeCompoundAttrType(&spec, &type_string)) {
     type = "type";
     AttrValue* allowed = attr->mutable_allowed_values();
     VERIFY(ProcessCompoundType(type_string, allowed),
            "Expected to see a compound type, saw: ", type_string);
-  } else if (spec.Consume("{")) {
+  } else if (str_util::ConsumePrefix(&spec, "{")) {
     // e.g. "{ int32, float, bool }" or "{ \"foo\", \"bar\" }"
     AttrValue* allowed = attr->mutable_allowed_values();
     str_util::RemoveLeadingWhitespace(&spec);
-    if (spec.starts_with("\"") || spec.starts_with("'")) {
+    if (str_util::StartsWith(spec, "\"") || str_util::StartsWith(spec, "'")) {
       type = "string";  // "{ \"foo\", \"bar\" }" or "{ 'foo', 'bar' }"
       while (true) {
         StringPiece escaped_string;
@@ -193,11 +195,12 @@ void FinalizeAttr(StringPiece spec, OpDef* op_def,
                "Trouble unescaping \"", escaped_string,
                "\", got error: ", error);
         allowed->mutable_list()->add_s(unescaped);
-        if (spec.Consume(",")) {
+        if (str_util::ConsumePrefix(&spec, ",")) {
           str_util::RemoveLeadingWhitespace(&spec);
-          if (spec.Consume("}")) break;  // Allow ending with ", }".
+          if (str_util::ConsumePrefix(&spec, "}"))
+            break;  // Allow ending with ", }".
         } else {
-          VERIFY(spec.Consume("}"),
+          VERIFY(str_util::ConsumePrefix(&spec, "}"),
                  "Expected , or } after strings in list, not: '", spec, "'");
           break;
         }
@@ -215,11 +218,12 @@ void FinalizeAttr(StringPiece spec, OpDef* op_def,
                  "Unrecognized type string '", type_string, "'");
           allowed->mutable_list()->add_type(dt);
         }
-        if (spec.Consume(",")) {
+        if (str_util::ConsumePrefix(&spec, ",")) {
           str_util::RemoveLeadingWhitespace(&spec);
-          if (spec.Consume("}")) break;  // Allow ending with ", }".
+          if (str_util::ConsumePrefix(&spec, "}"))
+            break;  // Allow ending with ", }".
         } else {
-          VERIFY(spec.Consume("}"),
+          VERIFY(str_util::ConsumePrefix(&spec, "}"),
                  "Expected , or } after types in list, not: '", spec, "'");
           break;
         }
@@ -232,7 +236,8 @@ void FinalizeAttr(StringPiece spec, OpDef* op_def,
 
   // Write the type into *attr.
   if (is_list) {
-    VERIFY(spec.Consume(")"), "Expected ) to close 'list(', not: '", spec, "'");
+    VERIFY(str_util::ConsumePrefix(&spec, ")"),
+           "Expected ) to close 'list(', not: '", spec, "'");
     str_util::RemoveLeadingWhitespace(&spec);
     attr->set_type(strings::StrCat("list(", type, ")"));
   } else {
@@ -240,7 +245,7 @@ void FinalizeAttr(StringPiece spec, OpDef* op_def,
   }
 
   // Read optional minimum constraint at the end.
-  if ((is_list || type == "int") && spec.Consume(">=")) {
+  if ((is_list || type == "int") && str_util::ConsumePrefix(&spec, ">=")) {
     int64 min_limit = -999;
     VERIFY(ConsumeAttrNumber(&spec, &min_limit),
            "Could not parse integer lower limit after '>=', found '", spec,
@@ -250,7 +255,7 @@ void FinalizeAttr(StringPiece spec, OpDef* op_def,
   }
 
   // Parse default value, if present.
-  if (spec.Consume("=")) {
+  if (str_util::ConsumePrefix(&spec, "=")) {
     str_util::RemoveLeadingWhitespace(&spec);
     VERIFY(ParseAttrValue(attr->type(), spec, attr->mutable_default_value()),
            "Could not parse default value '", spec, "'");
diff --git a/tensorflow/core/framework/op_def_util.cc b/tensorflow/core/framework/op_def_util.cc
index c80802aad3a6b84572096f726d90133ac5536526..9be0dc69d2c190274b3f8d473df170f3b4ed3660 100644
--- a/tensorflow/core/framework/op_def_util.cc
+++ b/tensorflow/core/framework/op_def_util.cc
@@ -28,6 +28,7 @@ limitations under the License.
 #include "tensorflow/core/lib/hash/hash.h"
 #include "tensorflow/core/lib/strings/proto_serialization.h"
 #include "tensorflow/core/lib/strings/scanner.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/protobuf.h"
@@ -239,7 +240,7 @@ static Status ValidateArg(const OpDef::ArgDef& arg, const OpDef& op_def,
 Status ValidateOpDef(const OpDef& op_def) {
   using ::tensorflow::strings::Scanner;
 
-  if (!StringPiece(op_def.name()).starts_with("_")) {
+  if (!str_util::StartsWith(op_def.name(), "_")) {
     VALIDATE(Scanner(op_def.name())
                  .One(Scanner::UPPERLETTER)
                  .Any(Scanner::LETTER_DIGIT)
@@ -259,11 +260,11 @@ Status ValidateOpDef(const OpDef& op_def) {
 
     // Validate type
     StringPiece type(attr.type());
-    bool is_list = type.Consume("list(");
+    bool is_list = str_util::ConsumePrefix(&type, "list(");
     bool found = false;
     for (StringPiece valid : {"string", "int", "float", "bool", "type", "shape",
                               "tensor", "func"}) {
-      if (type.Consume(valid)) {
+      if (str_util::ConsumePrefix(&type, valid)) {
         found = true;
         break;
       }
@@ -271,8 +272,9 @@ Status ValidateOpDef(const OpDef& op_def) {
     VALIDATE(found, "Unrecognized type '", type, "' in attr '", attr.name(),
              "'");
     if (is_list) {
-      VALIDATE(type.Consume(")"), "'list(' is missing ')' in attr ",
-               attr.name(), "'s type ", attr.type());
+      VALIDATE(str_util::ConsumePrefix(&type, ")"),
+               "'list(' is missing ')' in attr ", attr.name(), "'s type ",
+               attr.type());
     }
     VALIDATE(type.empty(), "Extra '", type, "' at the end of attr ",
              attr.name(), "'s type ", attr.type());
diff --git a/tensorflow/core/framework/op_def_util_test.cc b/tensorflow/core/framework/op_def_util_test.cc
index 2b9812d4fcbc145540155959b19dd37cf902c1a2..4514d92e387b9de90b767d0a775272469006cf04 100644
--- a/tensorflow/core/framework/op_def_util_test.cc
+++ b/tensorflow/core/framework/op_def_util_test.cc
@@ -57,7 +57,7 @@ class ValidateOpDefTest : public ::testing::Test {
     EXPECT_FALSE(status.ok()) << "Did not see error with: " << message;
     if (!status.ok()) {
       LOG(INFO) << "message: " << status;
-      EXPECT_TRUE(StringPiece(status.ToString()).contains(message))
+      EXPECT_TRUE(str_util::StrContains(status.ToString(), message))
           << "Actual: " << status << "\nExpected to contain: " << message;
     }
   }
diff --git a/tensorflow/core/framework/op_gen_lib.cc b/tensorflow/core/framework/op_gen_lib.cc
index 5f2eb9d99ab11f9862bd277d93af61c05e2517f4..7f23272871abe96dfa2fd7240bfc82015178bda6 100644
--- a/tensorflow/core/framework/op_gen_lib.cc
+++ b/tensorflow/core/framework/op_gen_lib.cc
@@ -50,10 +50,10 @@ string WordWrap(StringPiece prefix, StringPiece str, int width) {
     StringPiece to_append = str.substr(0, space);
     str.remove_prefix(space + 1);
     // Remove spaces at break.
-    while (to_append.ends_with(" ")) {
+    while (str_util::EndsWith(to_append, " ")) {
       to_append.remove_suffix(1);
     }
-    while (str.Consume(" ")) {
+    while (str_util::ConsumePrefix(&str, " ")) {
     }
 
     // Go on to the next line.
@@ -65,8 +65,9 @@ string WordWrap(StringPiece prefix, StringPiece str, int width) {
 }
 
 bool ConsumeEquals(StringPiece* description) {
-  if (description->Consume("=")) {
-    while (description->Consume(" ")) {  // Also remove spaces after "=".
+  if (str_util::ConsumePrefix(description, "=")) {
+    while (str_util::ConsumePrefix(description,
+                                   " ")) {  // Also remove spaces after "=".
     }
     return true;
   }
@@ -98,7 +99,7 @@ static bool StartsWithFieldName(StringPiece line,
                                 const std::vector<string>& multi_line_fields) {
   StringPiece up_to_colon;
   if (!SplitAt(':', &line, &up_to_colon)) return false;
-  while (up_to_colon.Consume(" "))
+  while (str_util::ConsumePrefix(&up_to_colon, " "))
     ;  // Remove leading spaces.
   for (const auto& field : multi_line_fields) {
     if (up_to_colon == field) {
@@ -119,9 +120,9 @@ static bool ConvertLine(StringPiece line,
   StringPiece up_to_colon;
   StringPiece after_colon = line;
   SplitAt(':', &after_colon, &up_to_colon);
-  while (after_colon.Consume(" "))
+  while (str_util::ConsumePrefix(&after_colon, " "))
     ;  // Remove leading spaces.
-  if (!after_colon.Consume("\"")) {
+  if (!str_util::ConsumePrefix(&after_colon, "\"")) {
     // We only convert string fields, so don't convert this line.
     return false;
   }
@@ -181,9 +182,9 @@ string PBTxtToMultiline(StringPiece pbtxt,
 static bool FindMultiline(StringPiece line, size_t colon, string* end) {
   if (colon == StringPiece::npos) return false;
   line.remove_prefix(colon + 1);
-  while (line.Consume(" ")) {
+  while (str_util::ConsumePrefix(&line, " ")) {
   }
-  if (line.Consume("<<")) {
+  if (str_util::ConsumePrefix(&line, "<<")) {
     *end = line.ToString();
     return true;
   }
@@ -228,7 +229,7 @@ string PBTxtFromMultiline(StringPiece multiline_pbtxt) {
     string suffix;
     while (!multiline_pbtxt.empty()) {
       SplitAt('\n', &multiline_pbtxt, &line);
-      if (line.Consume(end)) break;
+      if (str_util::ConsumePrefix(&line, end)) break;
       if (first) {
         first = false;
       } else {
diff --git a/tensorflow/core/framework/op_kernel.cc b/tensorflow/core/framework/op_kernel.cc
index 9ec1c213c3b465fc6a42b076c97bf936066a6b40..05171006b0c7b2f389fd48f57af191bbf13d88b8 100644
--- a/tensorflow/core/framework/op_kernel.cc
+++ b/tensorflow/core/framework/op_kernel.cc
@@ -96,7 +96,7 @@ OpKernel::OpKernel(OpKernelConstruction* context,
       output_memory_types_(context->output_memory_types().begin(),
                            context->output_memory_types().end()),
       graph_def_version_(context->graph_def_version()),
-      is_internal_(StringPiece(type_string()).starts_with("_")),
+      is_internal_(str_util::StartsWith(type_string(), "_")),
       input_name_map_(context->num_inputs()),
       output_name_map_(context->num_outputs()) {
   OP_REQUIRES_OK(context,
@@ -365,7 +365,7 @@ Status OpKernelContext::input_ref_mutex(StringPiece name, mutex** out_mutex) {
 
 const Tensor& OpKernelContext::input(int index) {
   DCHECK_GE(index, 0);
-  DCHECK_LT(index, num_inputs());
+  DCHECK_LT(index, num_inputs()) << " name: " << op_kernel().name();
   DCHECK(!input_is_ref(index));
   const Tensor& tensor = *((*params_->inputs)[index].tensor);
   record_tensor_reference(tensor);
@@ -420,8 +420,8 @@ bool OpKernelContext::forward_input_to_output_with_shape(
                                ? AllocatorAttributes()
                                : output_alloc_attr(output_index);
   std::unique_ptr<Tensor> new_tensor = forward_input(
-      input_index, expected_output_dtype(output_index), output_shape,
-      output_memory_type(output_index), output_attr);
+      input_index, output_index, expected_output_dtype(output_index),
+      output_shape, output_memory_type(output_index), output_attr);
   if (new_tensor != nullptr) {
     // Transfer ownership to the output slot in OpKernelContext.
     outputs_[output_index] = TensorValue(new_tensor.release());
@@ -461,35 +461,66 @@ Status OpKernelContext::forward_input_to_output_with_shape(
 }
 
 std::unique_ptr<Tensor> OpKernelContext::forward_input(
-    int input_index, DataType output_dtype, const TensorShape& output_shape,
-    MemoryType output_memory_type, const AllocatorAttributes& output_attr) {
+    int input_index, int output_index, DataType output_dtype,
+    const TensorShape& output_shape, MemoryType output_memory_type,
+    const AllocatorAttributes& output_attr) {
   DCHECK_GE(input_index, 0);
   DCHECK_LT(input_index, num_inputs());
   const TensorValue& input = (*params_->inputs)[input_index];
-  // Check that input tensor exists, is not a ref, and has no other consumers.
-  if (input.tensor == nullptr || input.is_ref() || !input->RefCountIsOne()) {
+  // Check whether at graph construction time this output was marked
+  // either for no forwarding or with a reservation for this input.
+  // If it's reserved for this input we'll skip the refcount and
+  // AllocatorAttribute checks.
+  // TODO(tucker): Maybe we should skip all of the checks?
+  bool never_forward =
+      (params_->forward_from_array != nullptr && output_index >= 0 &&
+       params_->forward_from_array[output_index] == Params::kNeverForward);
+  if (never_forward) return nullptr;
+  bool forward_expected =
+      (params_->forward_from_array != nullptr && output_index >= 0 &&
+       params_->forward_from_array[output_index] == input_index);
+  if (!forward_expected && params_->forward_from_array != nullptr) {
+    // Check for possibly conflicting forward.
+    for (int i = 0; i < num_outputs(); ++i) {
+      if (params_->forward_from_array[i] == input_index) {
+        // This input is reserved for output i.
+        return nullptr;
+      }
+    }
+  }
+  // Check that input tensor exists and is not a ref.
+  if (input.tensor == nullptr || input.is_ref()) {
+    CHECK(!forward_expected);
     return nullptr;
   }
   // Check that input type matches.
   if (input_dtype(input_index) != output_dtype) {
+    CHECK(!forward_expected);
     return nullptr;
   }
   // Check that the input and output sizes are compatible.
   if (input.tensor->shape().num_elements() != output_shape.num_elements()) {
+    CHECK(!forward_expected);
     return nullptr;
   }
   // Check that input and output memory types match, i.e.
   // that they either both live in host or both live in device memory.
   if (input_memory_type(input_index) != output_memory_type) {
+    CHECK(!forward_expected);
     return nullptr;
   }
-  // Check that output allocator attributes are not more restrictive than
-  // input allocator attributes.
-  const auto input_attr = params_->input_alloc_attrs == nullptr
-                              ? AllocatorAttributes()
-                              : input_alloc_attr(input_index);
-  if (!output_attr.IsEqualOrLessRestrictiveThan(input_attr)) {
-    return nullptr;
+  if (!forward_expected) {
+    if (!input->RefCountIsOne()) {
+      return nullptr;
+    }
+    // Check that output allocator attributes are not more restrictive than
+    // input allocator attributes.
+    const auto input_attr = params_->input_alloc_attrs == nullptr
+                                ? AllocatorAttributes()
+                                : input_alloc_attr(input_index);
+    if (!output_attr.IsEqualOrLessRestrictiveThan(input_attr)) {
+      return nullptr;
+    }
   }
   // TODO(rmlarsen): Use MakeUnique here. There is already a copy in
   // tensorflow/compiler/xla/ptr_util.h. Perhaps this should be part of
@@ -505,7 +536,8 @@ Status OpKernelContext::forward_input_or_allocate_temp(
     Tensor* out_temp) {
   for (int input_index : candidate_input_indices) {
     std::unique_ptr<Tensor> new_tensor =
-        forward_input(input_index, type, shape, DEVICE_MEMORY, allocator_attr);
+        forward_input(input_index, Params::kNoReservation /*output_index*/,
+                      type, shape, DEVICE_MEMORY, allocator_attr);
     if (new_tensor != nullptr) {
       *out_temp = std::move(*new_tensor);
       return Status::OK();
@@ -595,6 +627,14 @@ Status OpKernelContext::allocate_output(int index, const TensorShape& shape,
                                         Tensor** output) {
   DCHECK_GE(index, 0);
   DCHECK_LT(index, num_outputs());
+  bool forward_expected =
+      (params_->forward_from_array != nullptr && index >= 0 &&
+       params_->forward_from_array[index] >= 0);
+  if (forward_expected) {
+    return errors::Internal(
+        "Explicit allocate_output call where input forwarding required.  Try "
+        "turning off the ScopedAllocator optimizer.");
+  }
   AllocatorAttributes attr = output_alloc_attr(index);
   return allocate_output(index, shape, output, attr);
 }
diff --git a/tensorflow/core/framework/op_kernel.h b/tensorflow/core/framework/op_kernel.h
index 5ccd45efc980393aa02582595dde873be7426e26..67943377b9f5cd2dcb5f9dc347011db22fac1726 100644
--- a/tensorflow/core/framework/op_kernel.h
+++ b/tensorflow/core/framework/op_kernel.h
@@ -64,10 +64,11 @@ class AsyncOpKernel;
 class CallFrameInterface;
 class FunctionLibraryRuntime;
 class OpKernelConstruction;  // declared below
-class OpKernelContext;       // declared below
+class OpKernelContext;       // declared below,
 class OpRegistryInterface;
 class ResourceMgr;
 class ScopedStepContainer;
+class CollectiveExecutor;
 class StepStatsCollector;
 
 class OpKernel {
@@ -532,6 +533,10 @@ class OpKernelContext {
     // computations running on other devices.
     Rendezvous* rendezvous = nullptr;
 
+    // Mechanism for executing a collective op that needs to coordinate
+    // with parallel instances runing on other devices.
+    CollectiveExecutor* collective_executor = nullptr;
+
     // The session state for this op.
     SessionState* session_state = nullptr;
 
@@ -565,6 +570,12 @@ class OpKernelContext {
 
     // TensorSliceReaderCache support.
     checkpoint::TensorSliceReaderCacheWrapper* slice_reader_cache = nullptr;
+
+    // Support for forwarding reservations (used by ScopedAllocator).
+    static const int kNeverForward = -2;
+    static const int kNoReservation = -1;
+    // Values in [0,...) represent reservations for the indexed output.
+    const int* forward_from_array = nullptr;
   };
 
   // params must outlive the OpKernelContext.
@@ -707,14 +718,31 @@ class OpKernelContext {
   //     input[input_index] are compatible with those given in dtype, shape,
   //     memory_type, and attr,
   //   * refcount on the underlying buffer is one.
+  //   * Either there is no forwarding reservation for either input_index
+  //     or output_index or the specified input is reserved for the specified
+  //     output. More precisely:
+  //
+  //     These cases mean neither input nor output has a reservation:
+  //        forward_from_array = nullptr
+  //     OR (input_index is not in forward_from_array AND
+  //         (output_index == kNoReservation OR
+  //          forward_from_array[output_index] == kNoReservation))
+  //
+  //     This case means that input_index is reserved for output_index:
+  //        forward_from_array[output_index] == input_index
+  //
+  //     This case means the output is reserved to always be allocated,
+  //     never assigned a forwarded input:
+  //        forward_from_array[output_index] == kNeverForward
+  //
   // Otherwise returns nullptr.
   // NOTE: For Cuda kernels that read inputs using the __ldg() intrinsic,
   // forwarding is only safe if there are no reads via __ldg() after writes
   // to the same address.
   std::unique_ptr<Tensor> forward_input(
-      int input_index, DataType dtype, const TensorShape& shape,
-      MemoryType memory_type,
-      const AllocatorAttributes& attr) TF_MUST_USE_RESULT;
+      int input_index, int output_index, DataType output_dtype,
+      const TensorShape& output_shape, MemoryType output_memory_type,
+      const AllocatorAttributes& output_attr) TF_MUST_USE_RESULT;
 
   // Tries to forward one of the inputs given in input_indices to
   // output[output_index]. If none of the given inputs can be forwarded, calls
@@ -934,6 +962,10 @@ class OpKernelContext {
   // Rendezvous Send() and Recv().
   Rendezvous* rendezvous() const { return params_->rendezvous; }
 
+  CollectiveExecutor* collective_executor() const {
+    return params_->collective_executor;
+  }
+
   // An op kernel can access the session state it belongs to.
   SessionState* session_state() const { return params_->session_state; }
 
@@ -1101,7 +1133,8 @@ class OpKernelContext {
   void NotifyUseOfPersistentTensor(const Tensor& tensor);
 
   Status status_;
-  Params* params_;    // not owned
+  friend class CollectiveExecutor;  // for access to params_
+  Params* params_;                  // not owned
   mutable mutex mu_;  // mutable so const accessors can acquire the lock
   gtl::InlinedVector<WrappedAllocator, 4> wrapped_allocators_ GUARDED_BY(mu_);
   gtl::InlinedVector<TensorValue, 4> outputs_;
diff --git a/tensorflow/core/framework/op_kernel_test.cc b/tensorflow/core/framework/op_kernel_test.cc
index b53b877f28d2c80e969fb418aa316ad96c6e2eaa..bcd409e5c54b7d63137dd9d236d21bb3ec7b4f56 100644
--- a/tensorflow/core/framework/op_kernel_test.cc
+++ b/tensorflow/core/framework/op_kernel_test.cc
@@ -546,9 +546,9 @@ TEST_F(OpKernelBuilderTest, BuilderTypeListAttr) {
                                             {"T|list(type)|[DT_FLOAT]"}));
 
   ExpectFailure("BuildTypeListAttr", DEVICE_CPU, {}, error::INVALID_ARGUMENT);
-  EXPECT_TRUE(
-      StringPiece(GetKernelClassName("BuildTypeListAttr", DEVICE_CPU, {}))
-          .contains("Invalid argument: "));
+  EXPECT_TRUE(str_util::StrContains(
+      GetKernelClassName("BuildTypeListAttr", DEVICE_CPU, {}),
+      "Invalid argument: "));
 
   ExpectFailure("BuildTypeListAttr", DEVICE_CPU, {"T|int|7"},
                 error::INVALID_ARGUMENT);
@@ -565,8 +565,8 @@ TEST_F(OpKernelBuilderTest, DuplicateKernel) {
   DeviceTypeVector devs;
   Status status = SupportedDeviceTypesForNode(DeviceTypes(), ndef, &devs);
   ASSERT_FALSE(status.ok());
-  EXPECT_TRUE(StringPiece(status.error_message())
-                  .contains("Multiple OpKernel registrations match NodeDef"));
+  EXPECT_TRUE(str_util::StrContains(
+      status.error_message(), "Multiple OpKernel registrations match NodeDef"));
 
   ExpectFailure("DuplicateKernel", DEVICE_CPU, {}, error::INVALID_ARGUMENT);
 }
@@ -585,8 +585,8 @@ TEST_F(OpKernelBuilderTest, DuplicateKernelForT) {
   DeviceTypeVector devs;
   Status status = SupportedDeviceTypesForNode(DeviceTypes(), ndef, &devs);
   ASSERT_FALSE(status.ok());
-  EXPECT_TRUE(StringPiece(status.error_message())
-                  .contains("Multiple OpKernel registrations match NodeDef"));
+  EXPECT_TRUE(str_util::StrContains(
+      status.error_message(), "Multiple OpKernel registrations match NodeDef"));
 
   ExpectFailure("DuplicateKernelForT", DEVICE_CPU, {"T|type|DT_FLOAT"},
                 error::INVALID_ARGUMENT);
@@ -606,8 +606,9 @@ TEST_F(OpKernelBuilderTest, BadConstraint) {
   DeviceTypeVector devs;
   Status status = SupportedDeviceTypesForNode(DeviceTypes(), ndef, &devs);
   ASSERT_FALSE(status.ok());
-  EXPECT_TRUE(StringPiece(status.error_message())
-                  .contains("OpKernel 'BadConstraint' has constraint on attr "
+  EXPECT_TRUE(
+      str_util::StrContains(status.error_message(),
+                            "OpKernel 'BadConstraint' has constraint on attr "
                             "'T' not in NodeDef"));
 
   ExpectFailure("BadConstraint", DEVICE_CPU, {"dtype|type|DT_FLOAT"},
diff --git a/tensorflow/core/framework/resource_mgr.h b/tensorflow/core/framework/resource_mgr.h
index 9a458431e7c1b038c3177b2aa58e21dfa3e4e837..c84ea3b034cc20329b20af111f6b08ceebbfb80b 100644
--- a/tensorflow/core/framework/resource_mgr.h
+++ b/tensorflow/core/framework/resource_mgr.h
@@ -319,14 +319,13 @@ class IsResourceInitialized : public OpKernel {
 // specified type. The type will be a part of the generated op name.
 // TODO(apassos): figure out how to get non-cpu-allocated tensors to work
 // through constant folding so this doesn't have to be marked as stateful.
-#define REGISTER_RESOURCE_HANDLE_OP(Type)                   \
-  REGISTER_OP(#Type "HandleOp")                             \
-      .Attr("container: string = ''")                       \
-      .Attr("shared_name: string = ''")                     \
-      .Output("resource: resource")                         \
-      .SetIsStateful()                                      \
-      .SetShapeFn(tensorflow::shape_inference::ScalarShape) \
-      .Doc("Creates a handle to a " #Type)
+#define REGISTER_RESOURCE_HANDLE_OP(Type) \
+  REGISTER_OP(#Type "HandleOp")           \
+      .Attr("container: string = ''")     \
+      .Attr("shared_name: string = ''")   \
+      .Output("resource: resource")       \
+      .SetIsStateful()                    \
+      .SetShapeFn(tensorflow::shape_inference::ScalarShape)
 
 // Utility op kernel to produce a handle to a resource of type T.
 template <typename T>
diff --git a/tensorflow/core/framework/resource_mgr_test.cc b/tensorflow/core/framework/resource_mgr_test.cc
index 07272e2374cbf4fb46c5b8da5df73ef4d6858c62..798220d4c35c502df61c93b78ccd100d7c4b5ad5 100644
--- a/tensorflow/core/framework/resource_mgr_test.cc
+++ b/tensorflow/core/framework/resource_mgr_test.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/test.h"
 
@@ -71,7 +72,7 @@ string LookupOrCreate(ResourceMgr* rm, const string& container,
 }
 
 static void HasError(const Status& s, const string& substr) {
-  EXPECT_TRUE(StringPiece(s.ToString()).contains(substr))
+  EXPECT_TRUE(str_util::StrContains(s.ToString(), substr))
       << s << ", expected substring " << substr;
 }
 
diff --git a/tensorflow/core/framework/shape_inference.cc b/tensorflow/core/framework/shape_inference.cc
index 54ecaa5dd431d435fe04948223d565802d525be0..cc1ec47a83cd52f4ab9c88dd98b2962fe1c0ece6 100644
--- a/tensorflow/core/framework/shape_inference.cc
+++ b/tensorflow/core/framework/shape_inference.cc
@@ -726,6 +726,24 @@ ShapeHandle InferenceContext::Matrix(DimensionOrConstant dim1,
   return MakeShape({dim1, dim2});
 }
 
+Status InferenceContext::MakeShapeFromShapeTensorTreatScalarAsUnknownShape(
+    int input_idx, ShapeHandle* out) {
+  ShapeHandle input_shape;
+  TF_RETURN_IF_ERROR(WithRankAtMost(input(input_idx), 1, &input_shape));
+
+  requested_input_tensor_as_partial_shape_[input_idx] = true;
+  if (input_idx < input_tensors_as_shapes_.size() &&
+      input_tensors_as_shapes_[input_idx].IsSet() &&
+      RankKnown(input_tensors_as_shapes_[input_idx])) {
+    *out = input_tensors_as_shapes_[input_idx];
+    return Status::OK();
+  }
+
+  return InternalMakeShapeFromTensor(
+      true /* treat_unknown_scalar_tensor_as_unknown_shape */,
+      input_tensor(input_idx), input_shape, out);
+}
+
 Status InferenceContext::MakeShapeFromShapeTensor(int input_idx,
                                                   ShapeHandle* out) {
   ShapeHandle input_shape;
@@ -739,13 +757,31 @@ Status InferenceContext::MakeShapeFromShapeTensor(int input_idx,
     return Status::OK();
   }
 
-  return MakeShapeFromTensor(input_tensor(input_idx), input_shape, out);
+  return InternalMakeShapeFromTensor(
+      false /* treat_unknown_scalar_tensor_as_unknown_shape */,
+      input_tensor(input_idx), input_shape, out);
 }
 
 Status InferenceContext::MakeShapeFromTensor(const Tensor* t,
                                              ShapeHandle tensor_shape,
                                              ShapeHandle* out) {
+  return InternalMakeShapeFromTensor(
+      false /* treat_unknown_scalar_tensor_as_unknown_shape */, t, tensor_shape,
+      out);
+}
+
+Status InferenceContext::InternalMakeShapeFromTensor(
+    bool treat_unknown_scalar_tensor_as_unknown_shape, const Tensor* t,
+    ShapeHandle tensor_shape, ShapeHandle* out) {
+  // Only callers who have set
+  if (!treat_unknown_scalar_tensor_as_unknown_shape) {
+    TF_RETURN_IF_ERROR(WithRank(tensor_shape, 1, &tensor_shape));
+  }
   if (t == nullptr) {
+    // This is guarded by the check above.
+    if (Rank(tensor_shape) == 0) {
+      return ReturnUnknownShape(out);
+    }
     // Shape tensor is not known, but if the shape of the shape tensor is then
     // the right number of unknown dims can be created.
     DimensionHandle shape_dim = Dim(tensor_shape, 0);
@@ -759,10 +795,46 @@ Status InferenceContext::MakeShapeFromTensor(const Tensor* t,
     return ReturnCreatedShape(dims, out);
   }
 
+  if (t->shape().dims() == 0) {
+    if (t->dtype() == DataType::DT_INT32) {
+      auto flat_t = t->scalar<int32>();
+      if (flat_t() != -1) {
+        *out = nullptr;
+        return errors::InvalidArgument(
+            "Input tensor must be rank 1, or if its rank 0 it must have value "
+            "-1 "
+            "(representing an unknown shape).  Saw value: ",
+            flat_t());
+      }
+      return ReturnUnknownShape(out);
+    } else if (t->dtype() == DataType::DT_INT64) {
+      auto flat_t = t->scalar<int64>();
+      if (flat_t() != -1) {
+        *out = nullptr;
+        return errors::InvalidArgument(
+            "Input tensor must be rank 1, or if its rank 0 it must have value "
+            "-1 "
+            "(representing an unknown shape).  Saw value: ",
+            flat_t());
+      }
+      return ReturnUnknownShape(out);
+    } else {
+      *out = nullptr;
+      return errors::InvalidArgument(
+          "Input tensor must be int32 or int64, but was ",
+          DataTypeString(t->dtype()));
+    }
+  }
+
   if (t->shape().dims() != 1) {
     *out = nullptr;
-    return errors::InvalidArgument("Input tensor must be rank 1, but was rank ",
-                                   t->shape().dims());
+    return errors::InvalidArgument(
+        "Input tensor must be rank 1, but was rank ", t->shape().dims(), ".",
+        ((t->shape().dims() == 0)
+             ? "If it is rank 0 rank 0 it must have statically known value -1 "
+               "(representing an unknown shape). "
+             : " "),
+        "Saw tensor shape ", t->shape().DebugString());
   }
   std::vector<DimensionHandle> dims;
   if (t->dtype() == DataType::DT_INT32) {
diff --git a/tensorflow/core/framework/shape_inference.h b/tensorflow/core/framework/shape_inference.h
index accc587000767554f87a195e0ea33640cd696244..cdb4bd79bbb9040ad0a40eaa2f30134f5de79786 100644
--- a/tensorflow/core/framework/shape_inference.h
+++ b/tensorflow/core/framework/shape_inference.h
@@ -463,6 +463,12 @@ class InferenceContext {
   // the input tensor is NULL, then an unknown shape is returned.
   Status MakeShapeFromShapeTensor(int input_idx, ShapeHandle* out);
 
+  // Like the function above, but treats scalar values as unknown
+  // shapes.  **NOTE** If the scalar is statically known, its value
+  // must be -1 or an error is returned.
+  Status MakeShapeFromShapeTensorTreatScalarAsUnknownShape(int input_idx,
+                                                           ShapeHandle* out);
+
   // Returns in <out> a new shape corresponding to <proto>.
   Status MakeShapeFromShapeProto(const TensorShapeProto& proto,
                                  ShapeHandle* out);
@@ -708,6 +714,11 @@ class InferenceContext {
     merged_dims_.clear();
   }
 
+  // Helper method for MakeShapeFromTensor and MakeShapeFromShapeTensor.
+  Status InternalMakeShapeFromTensor(
+      bool treat_unknown_scalar_tensor_as_unknown_shape, const Tensor* t,
+      ShapeHandle tensor_shape, ShapeHandle* out);
+
   ShapeManager shape_manager_;
 
   // inputs_, outputs_, and input_tensors_as_shapes_ refer to values from
diff --git a/tensorflow/core/framework/shape_inference_test.cc b/tensorflow/core/framework/shape_inference_test.cc
index f48a7b9c47df3cfa93434ccf585dda8c5a29a2ba..586c38e43bbe75fa0710b11bb7290ee7b3f627d9 100644
--- a/tensorflow/core/framework/shape_inference_test.cc
+++ b/tensorflow/core/framework/shape_inference_test.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/framework/types.pb.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/test.h"
 
@@ -152,10 +153,9 @@ TEST_F(ShapeInferenceTest, Run) {
     };
     Status s = c.Run(fn);
     // Extra error message is attached when Run fails.
-    EXPECT_TRUE(StringPiece(s.ToString())
-                    .contains("Shape must be at most rank 0 but "
-                              "is rank 1 for 'foo' (op: "
-                              "'foo_op')"))
+    EXPECT_TRUE(str_util::StrContains(
+        s.ToString(),
+        "Shape must be at most rank 0 but is rank 1 for 'foo' (op: 'foo_op')"))
         << s;
   }
 }
@@ -367,10 +367,9 @@ TEST_F(ShapeInferenceTest, WithRankAtMost) {
 
   // WithRankAtMost on shape with known dimensionality.
   s1 = in1;
-  EXPECT_TRUE(
-      StringPiece(c.WithRankAtMost(in1, 2, &s1).ToString())
-          .contains(
-              "Invalid argument: Shape must be at most rank 2 but is rank 3"));
+  EXPECT_TRUE(str_util::StrContains(
+      c.WithRankAtMost(in1, 2, &s1).ToString(),
+      "Invalid argument: Shape must be at most rank 2 but is rank 3"));
 
   EXPECT_FALSE(IsSet(s1));
   EXPECT_TRUE(c.WithRankAtMost(in1, 3, &s1).ok());
@@ -406,10 +405,9 @@ TEST_F(ShapeInferenceTest, WithRankAtLeast) {
 
   // WithRankAtLeast on shape with known dimensionality.
   s1 = in1;
-  EXPECT_TRUE(
-      StringPiece(c.WithRankAtLeast(in1, 4, &s1).ToString())
-          .contains(
-              "Invalid argument: Shape must be at least rank 4 but is rank 3"));
+  EXPECT_TRUE(str_util::StrContains(
+      c.WithRankAtLeast(in1, 4, &s1).ToString(),
+      "Invalid argument: Shape must be at least rank 4 but is rank 3"));
 
   EXPECT_FALSE(IsSet(s1));
   EXPECT_TRUE(c.WithRankAtLeast(in1, 3, &s1).ok());
@@ -449,12 +447,14 @@ TEST_F(ShapeInferenceTest, WithValue) {
   // WithValue on dimension with known size.
   out1 = d0;
 
-  EXPECT_TRUE(StringPiece(c.WithValue(d0, 0, &out1).ToString())
-                  .contains("Invalid argument: Dimension must be 0 but is 1"));
+  EXPECT_TRUE(
+      str_util::StrContains(c.WithValue(d0, 0, &out1).ToString(),
+                            "Invalid argument: Dimension must be 0 but is 1"));
   EXPECT_FALSE(IsSet(out1));
   out1 = d0;
-  EXPECT_TRUE(StringPiece(c.WithValue(d0, 2, &out1).ToString())
-                  .contains("Invalid argument: Dimension must be 2 but is 1"));
+  EXPECT_TRUE(
+      str_util::StrContains(c.WithValue(d0, 2, &out1).ToString(),
+                            "Invalid argument: Dimension must be 2 but is 1"));
 
   EXPECT_FALSE(IsSet(out1));
   EXPECT_TRUE(c.WithValue(d0, 1, &out1).ok());
@@ -513,16 +513,14 @@ TEST_F(ShapeInferenceTest, MergeDim) {
   EXPECT_EQ(3, merged_dims.size());
 
   // Merging unequal values is an error.
-  EXPECT_TRUE(
-      StringPiece(c.Merge(d2, d1, &out).ToString())
-          .contains(
-              "Invalid argument: Dimensions must be equal, but are 2 and 1"));
+  EXPECT_TRUE(str_util::StrContains(
+      c.Merge(d2, d1, &out).ToString(),
+      "Invalid argument: Dimensions must be equal, but are 2 and 1"));
 
   EXPECT_FALSE(IsSet(out));
-  EXPECT_TRUE(
-      StringPiece(c.Merge(d1, d2, &out).ToString())
-          .contains(
-              "Invalid argument: Dimensions must be equal, but are 1 and 2"));
+  EXPECT_TRUE(str_util::StrContains(
+      c.Merge(d1, d2, &out).ToString(),
+      "Invalid argument: Dimensions must be equal, but are 1 and 2"));
 
   EXPECT_FALSE(IsSet(out));
 
@@ -729,26 +727,23 @@ TEST_F(ShapeInferenceTest, MergeShape) {
 
   // Incompatible merges give errors and set out to nullptr.
   out = s_unknown;
-  EXPECT_TRUE(
-      StringPiece(c.Merge(s_u_2, s_1_3, &out).ToString())
-          .contains(
-              "Invalid argument: Dimension 1 in both shapes must be equal, but "
-              "are 2 and 3"));
+  EXPECT_TRUE(str_util::StrContains(
+      c.Merge(s_u_2, s_1_3, &out).ToString(),
+      "Invalid argument: Dimension 1 in both shapes must be equal, but "
+      "are 2 and 3"));
 
   EXPECT_FALSE(IsSet(out));
   out = s_unknown;
-  EXPECT_TRUE(
-      StringPiece(c.Merge(s_1_3, s_u_2, &out).ToString())
-          .contains(
-              "Invalid argument: Dimension 1 in both shapes must be equal, but "
-              "are 3 and 2"));
+  EXPECT_TRUE(str_util::StrContains(
+      c.Merge(s_1_3, s_u_2, &out).ToString(),
+      "Invalid argument: Dimension 1 in both shapes must be equal, but "
+      "are 3 and 2"));
 
   EXPECT_FALSE(IsSet(out));
   out = s_unknown;
-  EXPECT_TRUE(
-      StringPiece(c.Merge(s_1, s_1_2, &out).ToString())
-          .contains(
-              "Invalid argument: Shapes must be equal rank, but are 1 and 2"));
+  EXPECT_TRUE(str_util::StrContains(
+      c.Merge(s_1, s_1_2, &out).ToString(),
+      "Invalid argument: Shapes must be equal rank, but are 1 and 2"));
 
   EXPECT_FALSE(IsSet(out));
 
@@ -795,22 +790,18 @@ TEST_F(ShapeInferenceTest, MergePrefix) {
   // Incompatible merges give errors and set outs to nullptr.
   s_out = s_unknown;
   s_prefix_out = s_unknown;
-  EXPECT_TRUE(
-      StringPiece(
-          c.MergePrefix(s_1_u_3, s_2_4, &s_out, &s_prefix_out).ToString())
-          .contains(
-              "Invalid argument: Dimensions must be equal, but are 1 and 2"));
+  EXPECT_TRUE(str_util::StrContains(
+      c.MergePrefix(s_1_u_3, s_2_4, &s_out, &s_prefix_out).ToString(),
+      "Invalid argument: Dimensions must be equal, but are 1 and 2"));
 
   EXPECT_FALSE(IsSet(s_out));
   EXPECT_FALSE(IsSet(s_prefix_out));
 
   s_out = s_unknown;
   s_prefix_out = s_unknown;
-  EXPECT_TRUE(
-      StringPiece(
-          c.MergePrefix(s_2_4, s_1_u_3, &s_out, &s_prefix_out).ToString())
-          .contains(
-              "Invalid argument: Shape must be at least rank 3 but is rank 2"));
+  EXPECT_TRUE(str_util::StrContains(
+      c.MergePrefix(s_2_4, s_1_u_3, &s_out, &s_prefix_out).ToString(),
+      "Invalid argument: Shape must be at least rank 3 but is rank 2"));
   EXPECT_FALSE(IsSet(s_out));
   EXPECT_FALSE(IsSet(s_prefix_out));
 }
@@ -868,24 +859,21 @@ TEST_F(ShapeInferenceTest, Subshape) {
 
   // Errors.
   out = unknown;
-  EXPECT_TRUE(StringPiece(c.Subshape(in0, 6, -3, &out).ToString())
-                  .contains("Invalid argument: Subshape must have computed "
-                            "start <= end, but is 5 "
-                            "and 2 (computed from start 6 and end -3 over "
-                            "shape with rank 5)"));
+  EXPECT_TRUE(str_util::StrContains(
+      c.Subshape(in0, 6, -3, &out).ToString(),
+      "Invalid argument: Subshape must have computed start <= end, but is 5 "
+      "and 2 (computed from start 6 and end -3 over shape with rank 5)"));
   EXPECT_FALSE(IsSet(out));
   out = unknown;
-  EXPECT_TRUE(StringPiece(c.Subshape(in0, -50, 100, &out).ToString())
-                  .contains("Invalid argument: Subshape start out of "
-                            "bounds: -50, for shape with "
-                            "rank 5"));
+  EXPECT_TRUE(str_util::StrContains(c.Subshape(in0, -50, 100, &out).ToString(),
+                                    "Invalid argument: Subshape start out of "
+                                    "bounds: -50, for shape with rank 5"));
 
   EXPECT_FALSE(IsSet(out));
   out = unknown;
-  EXPECT_TRUE(StringPiece(c.Subshape(in0, 0, -50, &out).ToString())
-                  .contains("Invalid argument: Subshape end out of bounds: "
-                            "-50, for shape with rank "
-                            "5"));
+  EXPECT_TRUE(str_util::StrContains(c.Subshape(in0, 0, -50, &out).ToString(),
+                                    "Invalid argument: Subshape end out of "
+                                    "bounds: -50, for shape with rank 5"));
 
   EXPECT_FALSE(IsSet(out));
 }
@@ -1093,28 +1081,36 @@ TEST_F(ShapeInferenceTest, MakeShapeFromShapeTensor) {
   t = ::tensorflow::test::AsTensor<int64>({});
   EXPECT_EQ("[]", create(&t));
 
+  // Test negative scalar
+  t = ::tensorflow::test::AsScalar<int32>(-1);
+  EXPECT_EQ("?", create(&t));
+
   t = ::tensorflow::test::AsTensor<float>({1, 2, 3});
-  EXPECT_TRUE(
-      StringPiece(create(&t))
-          .contains("Input tensor must be int32 or int64, but was float"));
+  EXPECT_TRUE(str_util::StrContains(
+      create(&t), "Input tensor must be int32 or int64, but was float"));
 
   t = ::tensorflow::test::AsScalar<int32>(1);
-  EXPECT_TRUE(StringPiece(create(&t))
-                  .contains("Input tensor must be rank 1, but was rank 0"));
+  auto s_scalar = create(&t);
+  EXPECT_TRUE(str_util::StrContains(
+      s_scalar,
+      "Input tensor must be rank 1, or if its rank 0 it must have value -1"))
+      << s_scalar;
 
   t = ::tensorflow::test::AsTensor<int32>({1, 2}, TensorShape{2, 1});
-  EXPECT_TRUE(StringPiece(create(&t))
-                  .contains("Input tensor must be rank 1, but was rank 2"));
+  auto s_matrix = create(&t);
+  EXPECT_TRUE(str_util::StrContains(
+      s_matrix, "Input tensor must be rank 1, but was rank 2"))
+      << s_matrix;
 
   // Test negative values for the dims.
   t = ::tensorflow::test::AsTensor<int64>({3, -2, 1});
-  EXPECT_TRUE(StringPiece(create(&t))
-                  .contains("Invalid value in tensor used for shape: -2"));
+  EXPECT_TRUE(str_util::StrContains(
+      create(&t), "Invalid value in tensor used for shape: -2"));
 
   // Test negative values for the dims.
   t = ::tensorflow::test::AsTensor<int32>({3, -2, 1});
-  EXPECT_TRUE(StringPiece(create(&t))
-                  .contains("Invalid value in tensor used for shape: -2"));
+  EXPECT_TRUE(str_util::StrContains(
+      create(&t), "Invalid value in tensor used for shape: -2"));
 
   // Test when the input shape is wrong.
   {
@@ -1172,9 +1168,9 @@ TEST_F(ShapeInferenceTest, MakeShapeFromShapeProto) {
   EXPECT_TRUE(c.MakeShapeFromShapeProto(proto, &out).ok());
   EXPECT_EQ("?", c.DebugString(out));
   proto.add_dim()->set_size(0);
-  EXPECT_TRUE(
-      StringPiece(c.MakeShapeFromShapeProto(proto, &out).error_message())
-          .contains("An unknown shape must not have any dimensions set."));
+  EXPECT_TRUE(str_util::StrContains(
+      c.MakeShapeFromShapeProto(proto, &out).error_message(),
+      "An unknown shape must not have any dimensions set."));
   EXPECT_FALSE(IsSet(out));
 
   // With known rank.
@@ -1188,10 +1184,10 @@ TEST_F(ShapeInferenceTest, MakeShapeFromShapeProto) {
 
   // With invalid dimension value.
   proto.add_dim()->set_size(-2);
-  EXPECT_TRUE(
-      StringPiece(c.MakeShapeFromShapeProto(proto, &out).error_message())
-          .contains("Shape [0,?,1000,-2] has dimensions with values below -1 "
-                    "(where -1 means unknown)"));
+  EXPECT_TRUE(str_util::StrContains(
+      c.MakeShapeFromShapeProto(proto, &out).error_message(),
+      "Shape [0,?,1000,-2] has dimensions with values below -1 "
+      "(where -1 means unknown)"));
 
   EXPECT_FALSE(IsSet(out));
 }
@@ -1257,9 +1253,10 @@ TEST_F(ShapeInferenceTest, MakeDimForScalarInput) {
   EXPECT_TRUE(c.MakeDimForScalarInput(0, &d).ok());
   EXPECT_EQ("20", c.DebugString(d));
 
-  EXPECT_TRUE(StringPiece(c.MakeDimForScalarInput(1, &d).error_message())
-                  .contains("Dimension size, given by scalar input 1, must "
-                            "be non-negative but is -1"));
+  EXPECT_TRUE(
+      str_util::StrContains(c.MakeDimForScalarInput(1, &d).error_message(),
+                            "Dimension size, given by scalar input 1, must be "
+                            "non-negative but is -1"));
 
   // Same tests, with int64 values.
   t1 = tensorflow::test::AsScalar<int64>(20);
@@ -1267,9 +1264,10 @@ TEST_F(ShapeInferenceTest, MakeDimForScalarInput) {
   EXPECT_TRUE(c.MakeDimForScalarInput(0, &d).ok());
   EXPECT_EQ("20", c.DebugString(d));
 
-  EXPECT_TRUE(StringPiece(c.MakeDimForScalarInput(1, &d).error_message())
-                  .contains("Dimension size, given by scalar input 1, must "
-                            "be non-negative but is -1"));
+  EXPECT_TRUE(
+      str_util::StrContains(c.MakeDimForScalarInput(1, &d).error_message(),
+                            "Dimension size, given by scalar input 1, must be "
+                            "non-negative but is -1"));
 }
 
 TEST_F(ShapeInferenceTest, GetAttr) {
@@ -1322,33 +1320,33 @@ TEST_F(ShapeInferenceTest, Divide) {
   EXPECT_TRUE(c.Divide(d_6, d_2, evenly_divisible, &out).ok());
   EXPECT_EQ("3", c.DebugString(out));
 
-  EXPECT_TRUE(
-      StringPiece(c.Divide(d_6, 5, evenly_divisible, &out).error_message())
-          .contains("Dimension size must be evenly divisible by 5 but is 6"));
+  EXPECT_TRUE(str_util::StrContains(
+      c.Divide(d_6, 5, evenly_divisible, &out).error_message(),
+      "Dimension size must be evenly divisible by 5 but is 6"));
 
-  EXPECT_TRUE(
-      StringPiece(c.Divide(d_6, 0, evenly_divisible, &out).error_message())
-          .contains("Divisor must be positive but is 0"));
-  EXPECT_TRUE(
-      StringPiece(c.Divide(d_6, d_0, evenly_divisible, &out).error_message())
-          .contains("Divisor must be positive but is 0"));
+  EXPECT_TRUE(str_util::StrContains(
+      c.Divide(d_6, 0, evenly_divisible, &out).error_message(),
+      "Divisor must be positive but is 0"));
+  EXPECT_TRUE(str_util::StrContains(
+      c.Divide(d_6, d_0, evenly_divisible, &out).error_message(),
+      "Divisor must be positive but is 0"));
 
-  EXPECT_TRUE(
-      StringPiece(c.Divide(d_6, -1, evenly_divisible, &out).error_message())
-          .contains("Divisor must be positive but is -1"));
+  EXPECT_TRUE(str_util::StrContains(
+      c.Divide(d_6, -1, evenly_divisible, &out).error_message(),
+      "Divisor must be positive but is -1"));
 
   // Repeat error cases above with evenly_divisible=false.
   evenly_divisible = false;
   EXPECT_TRUE(c.Divide(d_6, 5, evenly_divisible, &out).ok());
   EXPECT_EQ("1", c.DebugString(out));
 
-  EXPECT_TRUE(
-      StringPiece(c.Divide(d_6, 0, evenly_divisible, &out).error_message())
-          .contains("Divisor must be positive but is 0"));
+  EXPECT_TRUE(str_util::StrContains(
+      c.Divide(d_6, 0, evenly_divisible, &out).error_message(),
+      "Divisor must be positive but is 0"));
 
-  EXPECT_TRUE(
-      StringPiece(c.Divide(d_6, -1, evenly_divisible, &out).error_message())
-          .contains("Divisor must be positive but is -1"));
+  EXPECT_TRUE(str_util::StrContains(
+      c.Divide(d_6, -1, evenly_divisible, &out).error_message(),
+      "Divisor must be positive but is -1"));
 }
 
 TEST_F(ShapeInferenceTest, Add) {
@@ -1396,11 +1394,9 @@ TEST_F(ShapeInferenceTest, Add) {
   EXPECT_TRUE(c.Add(d_0, d_6, &out).ok());
   EXPECT_TRUE(SameHandle(out, d_6));
 
-  EXPECT_TRUE(
-      StringPiece(c.Add(d_6, std::numeric_limits<int64>::max() - 5, &out)
-                      .error_message())
-          .contains(
-              "Dimension size overflow from adding 6 and 9223372036854775802"));
+  EXPECT_TRUE(str_util::StrContains(
+      c.Add(d_6, std::numeric_limits<int64>::max() - 5, &out).error_message(),
+      "Dimension size overflow from adding 6 and 9223372036854775802"));
 }
 
 TEST_F(ShapeInferenceTest, Subtract) {
@@ -1448,9 +1444,9 @@ TEST_F(ShapeInferenceTest, Subtract) {
   EXPECT_TRUE(c.Subtract(d_6, d_0, &out).ok());
   EXPECT_TRUE(SameHandle(out, d_6));
 
-  EXPECT_TRUE(
-      StringPiece(c.Subtract(d_5, d_6, &out).error_message())
-          .contains("Negative dimension size caused by subtracting 6 from 5"));
+  EXPECT_TRUE(str_util::StrContains(
+      c.Subtract(d_5, d_6, &out).error_message(),
+      "Negative dimension size caused by subtracting 6 from 5"));
 }
 
 TEST_F(ShapeInferenceTest, Multiply) {
diff --git a/tensorflow/core/framework/shape_inference_testutil.cc b/tensorflow/core/framework/shape_inference_testutil.cc
index b4765ab0b2c41a1b510364d755984b6ae68dd07a..b54dd220ab919a640c9cd58e112459999762e4d1 100644
--- a/tensorflow/core/framework/shape_inference_testutil.cc
+++ b/tensorflow/core/framework/shape_inference_testutil.cc
@@ -100,7 +100,7 @@ Status ShapeInferenceTestutil::InferShapes(ShapeInferenceTestOp op,
       }
     }
 
-    if (expected.starts_with("in")) {
+    if (str_util::StartsWith(expected, "in")) {
       if (in_index == -1) {
         return Unknown(err_prefix,
                        " should have matched an input shape by "
@@ -135,7 +135,9 @@ Status ShapeInferenceTestutil::InferShapes(ShapeInferenceTestOp op,
     }
 
     // Verify the dimensions.
-    CHECK(expected.starts_with("[") && expected.ends_with("]")) << expected;
+    CHECK(str_util::StartsWith(expected, "[") &&
+          str_util::EndsWith(expected, "]"))
+        << expected;
     expected.remove_prefix(1);
     expected.remove_suffix(1);
 
@@ -176,7 +178,7 @@ Status ShapeInferenceTestutil::InferShapes(ShapeInferenceTestOp op,
           return Unknown(err_prefix, " expected to be unknown but was ",
                          c.Value(out_dim), err_suffix);
         }
-      } else if (expected_dim.starts_with("d")) {
+      } else if (str_util::StartsWith(expected_dim, "d")) {
         // Compare the dimension values.
         auto v = str_util::Split(expected_dim, '|');
         if (in_dim_idx.first == -1) {
diff --git a/tensorflow/core/framework/shape_inference_testutil.h b/tensorflow/core/framework/shape_inference_testutil.h
index 7977841482efa396c8e0797d8c80a40c11b4df56..2a99af7659d9be0dbab505fc7147e7fcc15d67c9 100644
--- a/tensorflow/core/framework/shape_inference_testutil.h
+++ b/tensorflow/core/framework/shape_inference_testutil.h
@@ -20,6 +20,7 @@ limitations under the License.
 #include "tensorflow/core/framework/shape_inference.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/public/version.h"
 
@@ -83,17 +84,17 @@ class ShapeInferenceTestutil {
       "", ::tensorflow::shape_inference::ShapeInferenceTestutil::InferShapes( \
               op, i, o)                                                       \
               .error_message())
-#define INFER_ERROR(error_substring, op, i)                                 \
-  {                                                                         \
-    string error_message =                                                  \
-        ::tensorflow::shape_inference::ShapeInferenceTestutil::InferShapes( \
-            op, i, "e")                                                     \
-            .error_message();                                               \
-    const string& substring = error_substring;                              \
-    EXPECT_NE("", error_message);                                           \
-    EXPECT_TRUE(StringPiece(error_message).contains(substring))             \
-        << "Expected to see '" << substring << "' in '" << error_message    \
-        << "'";                                                             \
+#define INFER_ERROR(error_substring, op, i)                                    \
+  {                                                                            \
+    string error_message =                                                     \
+        ::tensorflow::shape_inference::ShapeInferenceTestutil::InferShapes(    \
+            op, i, "e")                                                        \
+            .error_message();                                                  \
+    const string& substring = error_substring;                                 \
+    EXPECT_NE("", error_message);                                              \
+    EXPECT_TRUE(::tensorflow::str_util::StrContains(error_message, substring)) \
+        << "Expected to see '" << substring << "' in '" << error_message       \
+        << "'";                                                                \
   }
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/framework/shape_inference_testutil_test.cc b/tensorflow/core/framework/shape_inference_testutil_test.cc
index 20a6807064bea96f41cbd6035327d7a6db2f73b8..a4405b502cb68444fd43ad21af5922e3bd42ec42 100644
--- a/tensorflow/core/framework/shape_inference_testutil_test.cc
+++ b/tensorflow/core/framework/shape_inference_testutil_test.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include "tensorflow/core/framework/node_def_builder.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/shape_inference.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
@@ -25,10 +26,11 @@ namespace shape_inference {
 
 namespace {
 
-#define EXPECT_CONTAINS(str, substr)                                 \
-  do {                                                               \
-    string s = (str);                                                \
-    EXPECT_TRUE(StringPiece(s).contains(substr)) << "String: " << s; \
+#define EXPECT_CONTAINS(str, substr)                            \
+  do {                                                          \
+    string s = (str);                                           \
+    EXPECT_TRUE(::tensorflow::str_util::StrContains(s, substr)) \
+        << "String: " << s;                                     \
   } while (false)
 
 static OpShapeInferenceFn* global_fn_ptr = nullptr;
@@ -97,8 +99,8 @@ TEST(ShapeInferenceTestutilTest, Failures) {
   auto error_message = ShapeInferenceTestutil::InferShapes(
                            ShapeInferenceTestOp("NoSuchOp"), "", "")
                            .error_message();
-  EXPECT_TRUE(StringPiece(error_message)
-                  .starts_with("Op type not registered 'NoSuchOp'"));
+  EXPECT_TRUE(
+      str_util::StartsWith(error_message, "Op type not registered 'NoSuchOp'"));
 
   // Wrong shape error messages.
   EXPECT_CONTAINS(RunInferShapes(op, "[1];[2];[1]", "?", fn_copy_input_0),
diff --git a/tensorflow/core/framework/tensor.cc b/tensorflow/core/framework/tensor.cc
index e2111d60389d51702463f377602067ddc1bade08..d5a45c73c37bf0807e9437a4c886ca0d96dc5c67 100644
--- a/tensorflow/core/framework/tensor.cc
+++ b/tensorflow/core/framework/tensor.cc
@@ -610,11 +610,15 @@ bool Tensor::IsInitialized() const {
 }
 
 void Tensor::CheckType(DataType expected_dtype) const {
-  CHECK_EQ(dtype(), expected_dtype);
+  CHECK_EQ(dtype(), expected_dtype)
+      << DataTypeString(expected_dtype) << " expected, got "
+      << DataTypeString(dtype());
 }
 
 void Tensor::CheckTypeAndIsAligned(DataType expected_dtype) const {
-  CHECK_EQ(dtype(), expected_dtype);
+  CHECK_EQ(dtype(), expected_dtype)
+      << DataTypeString(expected_dtype) << " expected, got "
+      << DataTypeString(dtype());
   CHECK(IsAligned()) << "CheckTypeAndIsAligned";
 }
 
diff --git a/tensorflow/core/framework/types.cc b/tensorflow/core/framework/types.cc
index adf4e1bae307d81d91e7e597fc882caf4c87601f..2280114de5110630a0b64742e1f050e589d00bd0 100644
--- a/tensorflow/core/framework/types.cc
+++ b/tensorflow/core/framework/types.cc
@@ -114,7 +114,7 @@ string DataTypeString(DataType dtype) {
 }
 
 bool DataTypeFromString(StringPiece sp, DataType* dt) {
-  if (sp.ends_with("_ref")) {
+  if (str_util::EndsWith(sp, "_ref")) {
     sp.remove_suffix(4);
     DataType non_ref;
     if (DataTypeFromString(sp, &non_ref) && !IsRefType(non_ref)) {
diff --git a/tensorflow/core/framework/types_test.cc b/tensorflow/core/framework/types_test.cc
index 60f2b4135a68c4eed618e3efb07758fbab85fa07..16b069c70a7640b4859680a630920990dea087ce 100644
--- a/tensorflow/core/framework/types_test.cc
+++ b/tensorflow/core/framework/types_test.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/core/framework/types.h"
 
 #include "tensorflow/core/framework/type_traits.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/platform/test.h"
 
@@ -140,9 +141,8 @@ TEST(TypesTest, ComplexTypes) {
 TEST(TypesTest, IntegerTypes) {
   for (auto dt : AllTypes()) {
     const string name = DataTypeString(dt);
-    const StringPiece n = name;
-    EXPECT_EQ(DataTypeIsInteger(dt),
-              n.starts_with("int") || n.starts_with("uint"))
+    EXPECT_EQ(DataTypeIsInteger(dt), str_util::StartsWith(name, "int") ||
+                                         str_util::StartsWith(name, "uint"))
         << "DataTypeInteger failed for " << name;
   }
 }
diff --git a/tensorflow/core/framework/variant_op_copy_test.cc b/tensorflow/core/framework/variant_op_copy_test.cc
index 85e014f80434d2a2de2851d2cb361f4b0a0c9433..60fa7bd55937b81555d18dab455640326d98a73d 100644
--- a/tensorflow/core/framework/variant_op_copy_test.cc
+++ b/tensorflow/core/framework/variant_op_copy_test.cc
@@ -34,6 +34,7 @@ limitations under the License.
 #include "tensorflow/core/framework/variant_tensor_data.h"
 #include "tensorflow/core/graph/node_builder.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/util/port.h"
 
@@ -259,8 +260,8 @@ TEST(VariantOpCopyTest, CreateConstOnGPUFailsGracefully) {
   ClientSession session(root);
   std::vector<Tensor> outputs;
   Status s = session.Run({create_const}, &outputs);
-  EXPECT_TRUE(StringPiece(s.error_message())
-                  .contains("GPU copy from non-DMA string tensor"))
+  EXPECT_TRUE(str_util::StrContains(s.error_message(),
+                                    "GPU copy from non-DMA string tensor"))
       << s.ToString();
 }
 
@@ -365,8 +366,9 @@ TEST(VariantOpCopyTest, CreateCopyCPUToGPUStringFailsSafely) {
   std::vector<Tensor> outputs;
   Status err = session.Run({create_op, identity}, &outputs);
   EXPECT_EQ(err.code(), errors::Code::INVALID_ARGUMENT);
-  EXPECT_TRUE(StringPiece(err.error_message())
-                  .contains("During Variant Host->Device Copy: non-DMA-copy "
+  EXPECT_TRUE(
+      str_util::StrContains(err.error_message(),
+                            "During Variant Host->Device Copy: non-DMA-copy "
                             "attempted of tensor type: string"))
       << err.error_message();
 }
diff --git a/tensorflow/core/framework/variant_op_registry_test.cc b/tensorflow/core/framework/variant_op_registry_test.cc
index 06ca211c762748b1dacd4eb9623ffd2d72762cca..7055e62c0e745f61e072914ff2af4d4ff582963a 100644
--- a/tensorflow/core/framework/variant_op_registry_test.cc
+++ b/tensorflow/core/framework/variant_op_registry_test.cc
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 
 #include <memory>
+#include "tensorflow/core/lib/strings/str_util.h"
 
 #define EIGEN_USE_THREADS
 
@@ -130,7 +131,7 @@ TEST(VariantOpShapeRegistryTest, TestBasic) {
   Variant v = vv_early_exit;
   Status s0 = (*shape_fn)(v, &shape);
   EXPECT_FALSE(s0.ok());
-  EXPECT_TRUE(StringPiece(s0.error_message()).contains("early exit!"));
+  EXPECT_TRUE(str_util::StrContains(s0.error_message(), "early exit!"));
 
   VariantValue vv_ok{false /* early_exit */};
   v = vv_ok;
@@ -229,7 +230,7 @@ TEST(VariantOpZerosLikeRegistryTest, TestBasicCPU) {
                                         ZEROS_LIKE_VARIANT_UNARY_OP, v, &v_out);
   EXPECT_FALSE(s0.ok());
   EXPECT_TRUE(
-      StringPiece(s0.error_message()).contains("early exit zeros_like"));
+      str_util::StrContains(s0.error_message(), "early exit zeros_like"));
 
   VariantValue vv_ok{false /* early_exit */, 0 /* value */};
   v = vv_ok;
@@ -254,7 +255,7 @@ TEST(VariantOpUnaryOpRegistryTest, TestBasicGPU) {
                                         ZEROS_LIKE_VARIANT_UNARY_OP, v, &v_out);
   EXPECT_FALSE(s0.ok());
   EXPECT_TRUE(
-      StringPiece(s0.error_message()).contains("early exit zeros_like"));
+      str_util::StrContains(s0.error_message(), "early exit zeros_like"));
 
   VariantValue vv_ok{false /* early_exit */, 0 /* value */};
   v = vv_ok;
@@ -299,7 +300,7 @@ TEST(VariantOpAddRegistryTest, TestBasicCPU) {
   Status s0 = BinaryOpVariants<CPUDevice>(
       null_context_pointer, ADD_VARIANT_BINARY_OP, v_a, v_b, &v_out);
   EXPECT_FALSE(s0.ok());
-  EXPECT_TRUE(StringPiece(s0.error_message()).contains("early exit add"));
+  EXPECT_TRUE(str_util::StrContains(s0.error_message(), "early exit add"));
 
   VariantValue vv_ok{false /* early_exit */, 3 /* value */};
   v_a = vv_ok;
@@ -325,7 +326,7 @@ TEST(VariantOpAddRegistryTest, TestBasicGPU) {
   Status s0 = BinaryOpVariants<GPUDevice>(
       null_context_pointer, ADD_VARIANT_BINARY_OP, v_a, v_b, &v_out);
   EXPECT_FALSE(s0.ok());
-  EXPECT_TRUE(StringPiece(s0.error_message()).contains("early exit add"));
+  EXPECT_TRUE(str_util::StrContains(s0.error_message(), "early exit add"));
 
   VariantValue vv_ok{false /* early_exit */, 3 /* value */};
   v_a = vv_ok;
diff --git a/tensorflow/core/graph/graph.cc b/tensorflow/core/graph/graph.cc
index a7af5e2312af716ef25cb35c8f247d6feccb6d9c..fb8a6c39e6786c9dbf3f14c68b8af66b01a20f29 100644
--- a/tensorflow/core/graph/graph.cc
+++ b/tensorflow/core/graph/graph.cc
@@ -567,6 +567,11 @@ void Graph::ToGraphDefSubRange(GraphDef* graph_def, int from_node_id) const {
         inputs[edge->dst_input()] = edge;
       }
     }
+    // Sort the control inputs for more predictable serialization.
+    std::sort(inputs.begin() + node->num_inputs(), inputs.end(),
+              [](const Edge* a, const Edge* b) -> bool {
+                return a->src()->name() < b->src()->name();
+              });
     node_def->clear_input();
     node_def->mutable_input()->Reserve(inputs.size());
 
diff --git a/tensorflow/core/graph/graph_constructor.cc b/tensorflow/core/graph/graph_constructor.cc
index 76ee88e684dcbf2b687dcf8ea6433225b0293c54..250992fb7a8738127ee90bd676509bccc6b78c51 100644
--- a/tensorflow/core/graph/graph_constructor.cc
+++ b/tensorflow/core/graph/graph_constructor.cc
@@ -37,6 +37,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
 #include "tensorflow/core/lib/strings/scanner.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/public/version.h"
 
@@ -73,7 +74,7 @@ class GraphConstructor {
     Options(const ImportGraphDefOptions& in)  // NOLINT(runtime/explicit)
         : allow_internal_ops(false),
           expect_device_spec(false),
-          prefix(in.prefix.empty() || StringPiece(in.prefix).ends_with("/")
+          prefix(in.prefix.empty() || str_util::EndsWith(in.prefix, "/")
                      ? in.prefix
                      : in.prefix + "/"),
           uniquify_names(in.uniquify_names),
@@ -436,7 +437,7 @@ Status GraphConstructor::BuildNodeIndex() {
     bool in_control_dependence = false;
     for (int i = 0; i < node_def.input_size(); ++i) {
       StringPiece input_name = node_def.input(i);
-      if (!input_name.empty() && input_name.starts_with("^")) {
+      if (!input_name.empty() && str_util::StartsWith(input_name, "^")) {
         in_control_dependence = true;
       } else if (in_control_dependence) {
         return errors::InvalidArgument(
@@ -484,7 +485,7 @@ Status GraphConstructor::InitFromEdges() {
       bool has_loop_back_edge = false;
       for (int i = 0; i < node_def.input_size(); ++i) {
         StringPiece input_name(node_def.input(i));
-        if (input_name.starts_with("^")) {
+        if (str_util::StartsWith(input_name, "^")) {
           num_control_edges++;
         } else {
           TensorId id(ParseTensorName(input_name));
@@ -534,7 +535,7 @@ Status GraphConstructor::ValidateColocationConstraints(
   if (iter == node_def.attr().end()) return Status::OK();
   for (const string& c : iter->second.list().s()) {
     StringPiece s(c);
-    if (s.Consume(kColocationGroupPrefix) &&
+    if (str_util::ConsumePrefix(&s, kColocationGroupPrefix) &&
         gdef_nodes_.find(s) == gdef_nodes_.end()) {
       return errors::InvalidArgument(
           "Node '", node_def.name(),
@@ -764,7 +765,7 @@ void GraphConstructor::AddPrefixToNodeDef(
     // Skip remapped inputs (which already exist in g_ and are not being
     // imported).
     if (input_already_exists[i]) continue;
-    if (input.Consume("^")) {
+    if (str_util::ConsumePrefix(&input, "^")) {
       node_def->set_input(i, strings::StrCat("^", prefix_, input));
     } else {
       node_def->set_input(i, strings::StrCat(prefix_, input));
@@ -776,7 +777,7 @@ void GraphConstructor::AddPrefixToNodeDef(
         node_def->mutable_attr()->at(kColocationAttrName).mutable_list();
     for (int i = 0; i < list->s_size(); ++i) {
       StringPiece v(list->s(i));
-      if (v.Consume(kColocationGroupPrefix)) {
+      if (str_util::ConsumePrefix(&v, kColocationGroupPrefix)) {
         list->set_s(i, strings::StrCat(kColocationGroupPrefix, prefix_, v));
       }
     }
@@ -819,7 +820,7 @@ void GraphConstructor::UpdateUniquifiedColocationNames() {
     bool updated = false;
     for (int i = 0; i < coloc_values.size(); ++i) {
       StringPiece val(coloc_values[i]);
-      if (val.Consume(kColocationGroupPrefix)) {
+      if (str_util::ConsumePrefix(&val, kColocationGroupPrefix)) {
         const auto& name_pair = uniquified_names_.find(val.ToString());
         if (name_pair == uniquified_names_.end()) continue;
         updated = true;
@@ -1018,12 +1019,7 @@ Status GraphConstructor::Convert() {
       }
     }
 
-    // Function shape inference is supported on an opt-in basis per
-    // ShapeRefiner.
-    if (refiner_->function_shape_inference_supported() ||
-        g_->flib_def().Find(node_def->name()) == nullptr) {
-      TF_RETURN_IF_ERROR(ValidateShape(node));
-    }
+    TF_RETURN_IF_ERROR(ValidateShape(node));
 
     // Update pending_count_ for outputs.
     UpdatePendingCountAndReady(outputs_, o, &pending_count_, &ready_);
diff --git a/tensorflow/core/graph/graph_constructor_test.cc b/tensorflow/core/graph/graph_constructor_test.cc
index 963c1dc024b4265e14314c610399fc92331f053c..c18ccf6ce442655de6a31d28d07de619eb84d8b8 100644
--- a/tensorflow/core/graph/graph_constructor_test.cc
+++ b/tensorflow/core/graph/graph_constructor_test.cc
@@ -156,7 +156,9 @@ class GraphConstructorTest : public ::testing::Test {
       return "";
     }
     StringPiece loc(value[0]);
-    return loc.Consume(kColocationGroupPrefix) ? loc.ToString() : "";
+    return str_util::ConsumePrefix(&loc, kColocationGroupPrefix)
+               ? loc.ToString()
+               : "";
   }
 
   string GraphDebugString() const {
diff --git a/tensorflow/core/graph/graph_partition.cc b/tensorflow/core/graph/graph_partition.cc
index 17a174101b2be479bea834a407544b3a74dc08cf..877e4f1b44e005b310667f48dcc0bfd0d0a7e1d5 100644
--- a/tensorflow/core/graph/graph_partition.cc
+++ b/tensorflow/core/graph/graph_partition.cc
@@ -35,6 +35,7 @@ limitations under the License.
 #include "tensorflow/core/graph/tensor_id.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/hash/hash.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/util/device_name_utils.h"
 
@@ -372,7 +373,7 @@ string ControlLoopName(const string& name) {
 
 bool IsControlLoop(const Node* node) {
   const string& name = node->name();
-  return StringPiece(name).starts_with("_cloop");
+  return str_util::StartsWith(name, "_cloop");
 }
 
 // An enter node for control flow.
diff --git a/tensorflow/core/graph/graph_partition_test.cc b/tensorflow/core/graph/graph_partition_test.cc
index 6841f2914989b22d6aef91831ac6101b0ba6555f..83b24cafe2cb364b2afd5dcb6533bf662dc40a1b 100644
--- a/tensorflow/core/graph/graph_partition_test.cc
+++ b/tensorflow/core/graph/graph_partition_test.cc
@@ -35,6 +35,7 @@ limitations under the License.
 #include "tensorflow/core/graph/graph_def_builder.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/platform/test.h"
@@ -120,7 +121,7 @@ void CheckLoopConstruction(const GraphDef& graph_def) {
       if (ndef.op() == "_Recv") {
         bool has_control = false;
         for (const string& input_name : ndef.input()) {
-          if (StringPiece(input_name).starts_with("^")) {
+          if (str_util::StartsWith(input_name, "^")) {
             has_control = true;
             break;
           }
@@ -128,7 +129,7 @@ void CheckLoopConstruction(const GraphDef& graph_def) {
         EXPECT_TRUE(has_control);
       }
       // Must have a control loop
-      if (StringPiece(ndef.name()).starts_with("_cloop")) {
+      if (str_util::StartsWith(ndef.name(), "_cloop")) {
         if (ndef.op() == "Enter") {
           has_control_enter = true;
         }
diff --git a/tensorflow/core/graph/graph_test.cc b/tensorflow/core/graph/graph_test.cc
index e2ce0ba046f26b69bdb8f427afeb480727977844..c8c2b225fea721bd19683fbdb805601bb9be494b 100644
--- a/tensorflow/core/graph/graph_test.cc
+++ b/tensorflow/core/graph/graph_test.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/random/simple_philox.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/protobuf.h"
@@ -408,7 +409,7 @@ TEST_F(GraphTest, NewName) {
   EXPECT_NE(a1, a2);
   EXPECT_NE(a1, b1);
   EXPECT_NE(a2, b1);
-  EXPECT_TRUE(StringPiece(a1).starts_with("A")) << a1;
+  EXPECT_TRUE(str_util::StartsWith(a1, "A")) << a1;
 }
 
 TEST_F(GraphTest, IsValidNode) {
diff --git a/tensorflow/core/graph/quantize_training.cc b/tensorflow/core/graph/quantize_training.cc
index cb0fc8a1547a8498aa0bd089a2c9395119de2789..3b6e8cc2339a42285a68c6898c99b1ec4b585917 100644
--- a/tensorflow/core/graph/quantize_training.cc
+++ b/tensorflow/core/graph/quantize_training.cc
@@ -259,8 +259,14 @@ Status AddRestoreVariableSubgraphs(Graph* graph, Node* save_op,
   const string restore_op_name = strings::StrCat(name_prefix, "/RestoreV2");
   const string assign_op_name = strings::StrCat(name_prefix, "/Assign");
   for (Node* var : variables) {
-    string new_restore_op_name = graph->NewName(restore_op_name);
-    string new_assign_op_name = graph->NewName(assign_op_name);
+    // Add an extra prefix after calling graph->NewName because the "unique"
+    // name may conflict with names generated for Send nodes.
+    // TODO(b/77547936): fix this more generally and get rid of the extra prefix
+    // here.
+    string new_restore_op_name =
+        strings::StrCat(graph->NewName(restore_op_name), "_qt");
+    string new_assign_op_name =
+        strings::StrCat(graph->NewName(assign_op_name), "_qt");
     string tensor_names_op_name =
         strings::StrCat(new_restore_op_name, "/tensor_names");
     string shape_and_slices_op_name =
diff --git a/tensorflow/core/graph/quantize_training_test.cc b/tensorflow/core/graph/quantize_training_test.cc
index 2ad69dbd0c608fa79354c73e01167c3b02ff4fc2..e46f92bc24de9fc7d7923e4b9ebe0f04882beae4 100644
--- a/tensorflow/core/graph/quantize_training_test.cc
+++ b/tensorflow/core/graph/quantize_training_test.cc
@@ -32,6 +32,7 @@ limitations under the License.
 #include "tensorflow/core/graph/testlib.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/public/session.h"
@@ -215,7 +216,7 @@ TEST_F(QuantizeTrainingTest, WithBackwardNodes_QuantizeAndDequantize) {
   Node* found_node;
   Status s = FindNode(g, strings::StrCat(d->name(), "/QuantizeAndDequantizeV2"),
                       &found_node);
-  EXPECT_TRUE(StringPiece(s.ToString()).contains("not found")) << s;
+  EXPECT_TRUE(str_util::StrContains(s.ToString(), "not found")) << s;
 
   // Ensure that m1 and m2's inputs were quantized.
   TF_ASSERT_OK(
@@ -269,7 +270,7 @@ TEST_F(QuantizeTrainingTest, WithBackwardNodes_FakeQuant) {
   Node* found_node;
   Status s = FindNode(g, strings::StrCat(d->name(), "/FakeQuantWithMinMaxVars"),
                       &found_node);
-  EXPECT_TRUE(StringPiece(s.ToString()).contains("not found")) << s;
+  EXPECT_TRUE(str_util::StrContains(s.ToString(), "not found")) << s;
 
   // Ensure that m1 and m2's inputs were quantized.
   TF_ASSERT_OK(
diff --git a/tensorflow/core/graph/subgraph_test.cc b/tensorflow/core/graph/subgraph_test.cc
index 7219d9812f3e4a01cffa4b6b17d38781f7d5e2b0..6c014a8d44388eaeff80fb0850ac1575d3ec023a 100644
--- a/tensorflow/core/graph/subgraph_test.cc
+++ b/tensorflow/core/graph/subgraph_test.cc
@@ -312,8 +312,8 @@ TEST_F(SubgraphTest, ChainOfFools) {
   EXPECT_TRUE(HasEdge("e", 0, "_send_e_0", 0));
 }
 
-static bool HasSubstr(const string& base, const string& substr) {
-  bool ok = StringPiece(base).contains(substr);
+static bool HasSubstr(StringPiece base, StringPiece substr) {
+  bool ok = str_util::StrContains(base, substr);
   EXPECT_TRUE(ok) << base << ", expected substring " << substr;
   return ok;
 }
diff --git a/tensorflow/core/graph/tensor_id.cc b/tensorflow/core/graph/tensor_id.cc
index 089ea5e527ab18322af01b6e80154cd759b9e980..8af1936d64e503d0cdcf10b7a492847b494c8664 100644
--- a/tensorflow/core/graph/tensor_id.cc
+++ b/tensorflow/core/graph/tensor_id.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <string>
 
 #include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 
 namespace tensorflow {
 
@@ -45,7 +46,7 @@ TensorId ParseTensorName(StringPiece name) {
   if (p > base && *p == ':' && mul > 1) {
     id.first = StringPiece(base, p - base);
     id.second = index;
-  } else if (name.starts_with("^")) {
+  } else if (str_util::StartsWith(name, "^")) {
     // Control edge
     id.first = StringPiece(base + 1);
     id.second = Graph::kControlSlot;
diff --git a/tensorflow/core/graph/validate_test.cc b/tensorflow/core/graph/validate_test.cc
index cb6d107cadc153930d99ea13cad985dd60c8b393..d58cdc3c5baf02f89cff52ef0396816cb00b48a3 100644
--- a/tensorflow/core/graph/validate_test.cc
+++ b/tensorflow/core/graph/validate_test.cc
@@ -26,6 +26,7 @@ limitations under the License.
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
@@ -60,7 +61,7 @@ TEST(ValidateGraphDefTest, GraphWithUnspecifiedDefaultAttr) {
   CHECK(parser.MergeFromString(graph_def_str, &graph_def)) << graph_def_str;
   Status s = graph::ValidateGraphDef(graph_def, *OpRegistry::Global());
   EXPECT_FALSE(s.ok());
-  EXPECT_TRUE(StringPiece(s.ToString()).contains("NodeDef missing attr"));
+  EXPECT_TRUE(str_util::StrContains(s.ToString(), "NodeDef missing attr"));
 
   // Add the defaults.
   TF_ASSERT_OK(AddDefaultAttrsToGraphDef(&graph_def, *OpRegistry::Global(), 0));
@@ -83,7 +84,7 @@ TEST(ValidateGraphDefTest, GraphWithUnspecifiedRequiredAttr) {
   CHECK(parser.MergeFromString(graph_def_str, &graph_def)) << graph_def_str;
   Status s = graph::ValidateGraphDef(graph_def, *OpRegistry::Global());
   EXPECT_FALSE(s.ok());
-  EXPECT_TRUE(StringPiece(s.ToString()).contains("NodeDef missing attr"));
+  EXPECT_TRUE(str_util::StrContains(s.ToString(), "NodeDef missing attr"));
 
   // Add the defaults.
   TF_ASSERT_OK(AddDefaultAttrsToGraphDef(&graph_def, *OpRegistry::Global(), 0));
@@ -91,7 +92,7 @@ TEST(ValidateGraphDefTest, GraphWithUnspecifiedRequiredAttr) {
   // Validation should still fail.
   s = graph::ValidateGraphDef(graph_def, *OpRegistry::Global());
   EXPECT_FALSE(s.ok());
-  EXPECT_TRUE(StringPiece(s.ToString()).contains("NodeDef missing attr"));
+  EXPECT_TRUE(str_util::StrContains(s.ToString(), "NodeDef missing attr"));
 }
 
 TEST(ValidateGraphDefAgainstOpListTest, GraphWithOpOnlyInOpList) {
diff --git a/tensorflow/core/grappler/BUILD b/tensorflow/core/grappler/BUILD
index 2ca9b720ee127b892c06230efb3517f5afabea45..9dcc6765f5b356438c325f84c4891d70e0089efd 100644
--- a/tensorflow/core/grappler/BUILD
+++ b/tensorflow/core/grappler/BUILD
@@ -3,18 +3,6 @@ licenses(["notice"])  # Apache 2.0
 load("//tensorflow:tensorflow.bzl", "tf_cc_test")
 load("//tensorflow:tensorflow.bzl", "tf_cuda_library")
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 cc_library(
     name = "op_types",
     srcs = ["op_types.cc"],
diff --git a/tensorflow/core/grappler/clusters/BUILD b/tensorflow/core/grappler/clusters/BUILD
index b653f902e857ce804f797a016ebde551bf3b6695..9ecf5a6cf789fed2c44508e5b53d352b73e1fdea 100644
--- a/tensorflow/core/grappler/clusters/BUILD
+++ b/tensorflow/core/grappler/clusters/BUILD
@@ -8,18 +8,6 @@ load(
     "tf_cuda_tests_tags",
 )
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 config_setting(
     name = "xsmm",
     licenses = ["notice"],
diff --git a/tensorflow/core/grappler/clusters/cluster.cc b/tensorflow/core/grappler/clusters/cluster.cc
index 39bfca244ed2d40544dd2a17a019dadbe50f6d29..8d8c6084ec9743dea4b45820a6d4a5b2d938979b 100644
--- a/tensorflow/core/grappler/clusters/cluster.cc
+++ b/tensorflow/core/grappler/clusters/cluster.cc
@@ -62,6 +62,10 @@ void Cluster::DisableOptimizer(bool disable) {
         options_.config.mutable_graph_options()->mutable_rewrite_options();
     rewriter_config->set_layout_optimizer(RewriterConfig::OFF);
     rewriter_config->set_disable_model_pruning(true);
+    rewriter_config->set_function_optimization(RewriterConfig::OFF);
+    rewriter_config->set_arithmetic_optimization(RewriterConfig::OFF);
+    rewriter_config->set_loop_optimization(RewriterConfig::OFF);
+    rewriter_config->set_dependency_optimization(RewriterConfig::OFF);
     rewriter_config->set_constant_folding(RewriterConfig::OFF);
     rewriter_config->set_memory_optimization(RewriterConfig::NO_MEM_OPT);
     rewriter_config->mutable_auto_parallel()->set_enable(false);
diff --git a/tensorflow/core/grappler/clusters/single_machine_test.cc b/tensorflow/core/grappler/clusters/single_machine_test.cc
index c6352c1448bb38ece78530007e2534d475ef7fb6..352f08fedecd426c06c8668ff8f3910286e6900a 100644
--- a/tensorflow/core/grappler/clusters/single_machine_test.cc
+++ b/tensorflow/core/grappler/clusters/single_machine_test.cc
@@ -196,10 +196,19 @@ TEST_F(SingleMachineTest, GraphOptimizations) {
   TF_CHECK_OK(cluster_->Run(item.graph, item.feed, item.fetch, &metadata));
   std::set<string> cost_nodes;
   for (const auto& node : metadata.cost_graph().node()) {
+#ifdef INTEL_MKL
+    // Skip the special nodes inserted by TF (and MKL): these are either
+    // prefixed with an underscore or contain "/_".
+    if (node.name()[0] == '_' || node.name().find("/_") != string::npos) {
+      continue;
+    }
+    cost_nodes.insert(node.name());
+#else
     // Skip nodes added by TF internally.
     if (node.name()[0] != '_') {
       cost_nodes.insert(node.name());
     }
+#endif
   }
   const std::set<string> expected_cost_nodes = {
       "zero",      "one",      "add",         "square",
diff --git a/tensorflow/core/grappler/clusters/utils.cc b/tensorflow/core/grappler/clusters/utils.cc
index b54b34959a53b56022a449ca286ff0ba823f2aa5..50d6e6468faac01de1a06b0a9dea5aa0aa7f3dd2 100644
--- a/tensorflow/core/grappler/clusters/utils.cc
+++ b/tensorflow/core/grappler/clusters/utils.cc
@@ -54,7 +54,7 @@ DeviceProperties GetLocalCPUInfo() {
 
   int64 free_mem = port::AvailableRam();
   if (free_mem < INT64_MAX) {
-    device.set_memory_size(free_mem * 1024);
+    device.set_memory_size(free_mem);
   }
 
   (*device.mutable_environment())["cpu_instruction_set"] =
diff --git a/tensorflow/core/grappler/clusters/virtual_cluster.cc b/tensorflow/core/grappler/clusters/virtual_cluster.cc
index ae70c9860823dae1a85ba20e00afe15b218cd2b4..abfa7bc48e6e2484acee59a1e2b0c2bfc4e60fb7 100644
--- a/tensorflow/core/grappler/clusters/virtual_cluster.cc
+++ b/tensorflow/core/grappler/clusters/virtual_cluster.cc
@@ -66,6 +66,7 @@ Status VirtualCluster::Run(const GraphDef& graph,
   }
 
   Costs node_costs;
+  int node_id = 0;
   do {
     OpContext op_context = scheduler.GetCurrNode();
     node_costs = node_estimator_->PredictCosts(op_context);
@@ -73,6 +74,7 @@ Status VirtualCluster::Run(const GraphDef& graph,
       CostGraphDef::Node* cost_node =
           metadata->mutable_cost_graph()->add_node();
       const string& op_name = op_context.name;
+      cost_node->set_id(node_id++);
       cost_node->set_name(op_name);
       cost_node->set_device(op_context.device_name);
       cost_node->set_compute_cost(
diff --git a/tensorflow/core/grappler/costs/BUILD b/tensorflow/core/grappler/costs/BUILD
index 5336df1f51dbb5dd5f48857a088ece1b1a04dbb5..33949319d5f050100d6b58e7ee324370e4232bec 100644
--- a/tensorflow/core/grappler/costs/BUILD
+++ b/tensorflow/core/grappler/costs/BUILD
@@ -6,18 +6,6 @@ load(
     "tf_protos_grappler",
 )
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 filegroup(
     name = "graph_properties_testdata",
     srcs = glob([
@@ -55,6 +43,7 @@ cc_library(
         ":utils",
         "//tensorflow/core:core_cpu_base",
         "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:utils",
diff --git a/tensorflow/core/grappler/costs/graph_properties.cc b/tensorflow/core/grappler/costs/graph_properties.cc
index 817247e3794ca3e165b2f0445ab164938577336f..9fa2b7a259b1e96e2f9cea1326f3b41aed4533e0 100644
--- a/tensorflow/core/grappler/costs/graph_properties.cc
+++ b/tensorflow/core/grappler/costs/graph_properties.cc
@@ -23,6 +23,7 @@ limitations under the License.
 #include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/grappler/costs/utils.h"
 #include "tensorflow/core/grappler/utils.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 
 namespace tensorflow {
 namespace grappler {
@@ -251,8 +252,7 @@ typename DisjointSet<Handle>::Rep* DisjointSet<Handle>::Find(Handle value) {
 }
 
 bool IsQueue(const Node& node) {
-  StringPiece type(node.type_string());
-  return type.ends_with("QueueV2");
+  return str_util::EndsWith(node.type_string(), "QueueV2");
 }
 
 // Returns true if the node is an Enter op AND its input is a Queue.
@@ -920,13 +920,12 @@ Status GraphProperties::UpdateResource(
 }
 
 Status GraphProperties::InferStatically(bool assume_valid_feeds) {
-  Graph graph(OpRegistry::Global());
-  FunctionLibraryDefinition function_library(graph.op_registry(),
+  FunctionLibraryDefinition function_library(OpRegistry::Global(),
                                              item_.graph.library());
+  Graph graph(function_library);
   ShapeRefiner shape_refiner(graph.versions(), graph.op_registry());
   shape_refiner.set_require_shape_inference_fns(false);
   shape_refiner.set_disable_constant_propagation(true);
-  shape_refiner.set_function_library_for_shape_inference(&function_library);
   ImportGraphDefOptions options;
   // Graph optimization happens at the late stage of graph execution,
   // when colocation constraints are already validated previously and
@@ -1012,6 +1011,7 @@ Status GraphProperties::InferStatically(bool assume_valid_feeds) {
     }
     // Skip any information that comes from fed nodes.
     if (fed_ports.find(node->name()) != fed_ports.end()) {
+      VLOG(2) << "Skipping feed node shape: " << node->name();
       continue;
     }
     for (const auto& merged_shapes : node_ctx->MergedShapes()) {
diff --git a/tensorflow/core/grappler/costs/graph_properties_test.cc b/tensorflow/core/grappler/costs/graph_properties_test.cc
index 284d9d409bb4d9439cf007e1692838667caff26a..d3d89b59af7db2d834cdf7476b106b181352b657 100644
--- a/tensorflow/core/grappler/costs/graph_properties_test.cc
+++ b/tensorflow/core/grappler/costs/graph_properties_test.cc
@@ -755,25 +755,25 @@ TEST_F(GraphPropertiesTest, FunctionStaticShapeInference) {
       z = MyAdd(x, y)
       z = MyAdd(x, z)
   */
-  // Check that the shape of the second MyAdd node propagates
-  // correctly.
+  // Check that the shape inference code infers what it can.
   GrapplerItem item;
   string filename = io::JoinPath(testing::TensorFlowSrcRoot(), kTestDataPath,
                                  "simple_function.pbtxt");
   TF_CHECK_OK(ReadGraphDefFromFile(filename, &item.graph));
   GraphProperties properties(item);
   TF_CHECK_OK(properties.InferStatically(false));
-  const auto props = properties.GetOutputProperties("MyAdd_55e046a8_1");
-  const OpInfo::TensorProperties& prop = props[0];
-  EXPECT_EQ(DT_FLOAT, prop.dtype());
-  EXPECT_FALSE(prop.shape().unknown_rank());
-  EXPECT_EQ(2, prop.shape().dim_size());
-  EXPECT_EQ(1, prop.shape().dim(0).size());
-  EXPECT_EQ(2, prop.shape().dim(1).size());
-
-  PartialTensorShape shape(prop.shape());
-  EXPECT_TRUE(shape.IsFullyDefined());
-  EXPECT_FALSE(shape.unknown_rank());
+  const auto out_props = properties.GetOutputProperties("MyAdd_55e046a8");
+  const OpInfo::TensorProperties& out_prop = out_props[0];
+  EXPECT_EQ(DT_FLOAT, out_prop.dtype());
+  EXPECT_TRUE(out_prop.shape().unknown_rank());
+
+  const auto in_props = properties.GetInputProperties("MyAdd_55e046a8");
+  const OpInfo::TensorProperties& in_prop = in_props[0];
+  EXPECT_EQ(DT_FLOAT, in_prop.dtype());
+  EXPECT_FALSE(in_prop.shape().unknown_rank());
+  EXPECT_EQ(2, in_prop.shape().dim_size());
+  EXPECT_EQ(1, in_prop.shape().dim(0).size());
+  EXPECT_EQ(2, in_prop.shape().dim(1).size());
 }
 
 TEST_F(GraphPropertiesTest, SymbolicShapes) {
diff --git a/tensorflow/core/grappler/costs/measuring_cost_estimator.cc b/tensorflow/core/grappler/costs/measuring_cost_estimator.cc
index ea4320687af366ccdd82e46cf28adf4ee9c100c0..833205ac6f12a73d96c93455bb355ee511d6700a 100644
--- a/tensorflow/core/grappler/costs/measuring_cost_estimator.cc
+++ b/tensorflow/core/grappler/costs/measuring_cost_estimator.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <limits>
 
 #include "tensorflow/core/framework/cost_graph.pb.h"
+#include "tensorflow/core/framework/step_stats.pb.h"
 #include "tensorflow/core/grappler/clusters/cluster.h"
 #include "tensorflow/core/grappler/costs/robust_stats.h"
 #include "tensorflow/core/grappler/grappler_item.h"
@@ -52,6 +53,8 @@ Status MeasuringCostEstimator::Initialize(const GrapplerItem& item) {
 Status MeasuringCostEstimator::PredictCosts(const GraphDef& optimized_graph,
                                             CostGraphDef* cost_graph,
                                             Costs* costs) const {
+  const bool running_simulation = (cluster_->type() == "virtual");
+
   std::vector<double> times(measurement_steps_);
   BlockingCounter barrier(measurement_steps_);
 
@@ -80,9 +83,23 @@ Status MeasuringCostEstimator::PredictCosts(const GraphDef& optimized_graph,
     }
 
     const Costs::MicroSeconds finish = Env::Default()->NowMicros();
-    const double time = (finish - start).count() * 1e3;
-    times[step] = time;
-
+    if (running_simulation) {
+      // When running simulation, return the estimated runtime, not the time it
+      // takes to run the simulation.
+      double time = 0.0;
+      for (const DeviceStepStats& stepstats :
+           metadata.step_stats().dev_stats()) {
+        for (const NodeExecStats& node_stats : stepstats.node_stats()) {
+          const double completion_time =
+              node_stats.all_end_rel_micros() + node_stats.all_start_micros();
+          time = std::max(time, completion_time * 1e3);
+        }
+      }
+      times[step] = time;
+    } else {
+      const double time = (finish - start).count() * 1e3;
+      times[step] = time;
+    }
     if (cost_graph && (step + 1 == measurement_steps_)) {
       metadata.mutable_cost_graph()->Swap(cost_graph);
     }
diff --git a/tensorflow/core/grappler/costs/op_level_cost_estimator.cc b/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
index fdbc61f3f18087c40bf20716b503d3a53d37a47d..087190ad2a659a9e3314ef0b49737764888f43be 100644
--- a/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
+++ b/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
@@ -30,6 +30,7 @@ constexpr char kConst[] = "Const";
 constexpr char kConv2d[] = "Conv2D";
 constexpr char kConv2dBackpropFilter[] = "Conv2DBackpropFilter";
 constexpr char kConv2dBackpropInput[] = "Conv2DBackpropInput";
+constexpr char kFusedConv2dBiasActivation[] = "FusedConv2DBiasActivation";
 constexpr char kMatMul[] = "MatMul";
 constexpr char kSparseMatMul[] = "SparseMatMul";
 constexpr char kPlaceholder[] = "Placeholder";
@@ -50,6 +51,12 @@ constexpr char kPreventGradient[] = "PreventGradient";
 constexpr char kGather[] = "Gather";
 constexpr char kGatherV2[] = "GatherV2";
 constexpr char kSlice[] = "Slice";
+constexpr char kMaxPool[] = "MaxPool";
+constexpr char kMaxPoolGrad[] = "MaxPoolGrad";
+constexpr char kAvgPool[] = "AvgPool";
+constexpr char kAvgPoolGrad[] = "AvgPoolGrad";
+constexpr char kFusedBatchNorm[] = "FusedBatchNorm";
+constexpr char kFusedBatchNormGrad[] = "FusedBatchNormGrad";
 
 static const Costs::Duration kMinComputeTime(1);
 
@@ -71,14 +78,39 @@ Padding GetPadding(const OpInfo& op_features) {
   return Padding::SAME;  // Default padding.
 }
 
+bool IsTraining(const OpInfo& op_info) {
+  if (op_info.attr().find("is_training") != op_info.attr().end() &&
+      op_info.attr().at("is_training").b()) {
+    return true;
+  }
+  return false;
+}
+
+// TODO(dyoon): support non-4D tensors in the c ost functions of convolution
+// related ops (Conv, Pool, BatchNorm, and their backprops) and the related
+// helper functions.
 std::vector<int64> GetStrides(const OpInfo& op_features) {
   if (op_features.attr().find("strides") != op_features.attr().end()) {
     const auto strides = op_features.attr().at("strides").list().i();
+    CHECK(strides.size() == 4) << "Attr strides is not a length-4 vector: "
+                               << op_features.DebugString();
     return {strides[0], strides[1], strides[2], strides[3]};
   }
   return {1, 1, 1, 1};
 }
 
+std::vector<int64> GetKernelSize(const OpInfo& op_info) {
+  if (op_info.attr().find("ksize") != op_info.attr().end()) {
+    const auto ksize = op_info.attr().at("ksize").list().i();
+    CHECK(ksize.size() == 4)
+        << "Attr ksize is not a length-4 vector: " << op_info.DebugString();
+    return {ksize[0], ksize[1], ksize[2], ksize[3]};
+  }
+  // Note that FusedBatchNorm doesn't have ksize attr, but GetKernelSize returns
+  // {1, 1, 1, 1} in that case.
+  return {1, 1, 1, 1};
+}
+
 int64 GetOutputSize(const int64 input, const int64 filter, const int64 stride,
                     const Padding& padding) {
   // Logic for calculating output shape is from GetWindowedOutputSizeVerbose()
@@ -165,6 +197,8 @@ OpLevelCostEstimator::OpLevelCostEstimator() {
        wrap(&OpLevelCostEstimator::PredictConv2DBackpropFilter)},
       {kConv2dBackpropInput,
        wrap(&OpLevelCostEstimator::PredictConv2DBackpropInput)},
+      {kFusedConv2dBiasActivation,
+       wrap(&OpLevelCostEstimator::PredictFusedConv2DBiasActivation)},
       {kMatMul, wrap(&OpLevelCostEstimator::PredictMatMul)},
       {kSparseMatMul, wrap(&OpLevelCostEstimator::PredictMatMul)},
       {kBatchMatMul, wrap(&OpLevelCostEstimator::PredictBatchMatMul)},
@@ -190,7 +224,15 @@ OpLevelCostEstimator::OpLevelCostEstimator() {
 
       {kRank, wrap(&OpLevelCostEstimator::PredictMetadata)},
       {kShape, wrap(&OpLevelCostEstimator::PredictMetadata)},
-      {kSize, wrap(&OpLevelCostEstimator::PredictMetadata)}};
+      {kSize, wrap(&OpLevelCostEstimator::PredictMetadata)},
+      {kMaxPool, wrap(&OpLevelCostEstimator::PredictMaxPool)},
+      {kMaxPoolGrad, wrap(&OpLevelCostEstimator::PredictMaxPoolGrad)},
+      {kAvgPool, wrap(&OpLevelCostEstimator::PredictAvgPool)},
+      {kAvgPoolGrad, wrap(&OpLevelCostEstimator::PredictAvgPoolGrad)},
+      {kFusedBatchNorm, wrap(&OpLevelCostEstimator::PredictFusedBatchNorm)},
+      {kFusedBatchNormGrad,
+       wrap(&OpLevelCostEstimator::PredictFusedBatchNormGrad)},
+  };
 
 #define EIGEN_COST(X) Eigen::internal::functor_traits<Eigen::internal::X>::Cost
 
@@ -255,6 +297,7 @@ OpLevelCostEstimator::OpLevelCostEstimator() {
                       {"QuantizedAdd", EIGEN_COST(scalar_sum_op<float>)},
                       {"QuantizedMul", EIGEN_COST(scalar_product_op<float>)},
                       {"RealDiv", EIGEN_COST(scalar_quotient_op<float>)},
+                      {"ReluGrad", EIGEN_COST(scalar_max_op<float>)},
                       {"SquareDifference", 1},
                       {"Sub", EIGEN_COST(scalar_difference_op<float>)},
                       {"TruncateDiv", EIGEN_COST(scalar_quotient_op<float>)},
@@ -419,11 +462,7 @@ Costs OpLevelCostEstimator::PredictOpCountBasedCost(
   Costs costs;
   costs.compute_time = compute_cost;
   costs.memory_time = memory_cost;
-  if (compute_memory_overlap_) {
-    costs.execution_time = std::max(compute_cost, memory_cost);
-  } else {
-    costs.execution_time = compute_cost + memory_cost;
-  }
+  CombineCostsAndUpdateExecutionTime(&costs);
   return costs;
 }
 
@@ -509,7 +548,6 @@ int64 OpLevelCostEstimator::CountConv2DOperations(
   ops *= conv_dims.kx * conv_dims.ky;
   ops *= conv_dims.iz * conv_dims.oz;
   ops *= kOpsPerMac;
-  VLOG(1) << "Operations for Conv2D " << ops;
 
   if (conv_info != nullptr) {
     *conv_info = conv_dims;
@@ -774,6 +812,7 @@ int64 OpLevelCostEstimator::CountConv2DBackpropInputOperations(
   }
   if (!shape_found) {
     // Set the minimum filter size that's feasible.
+    input_shape.Clear();
     for (int i = 0; i < 4; ++i) {
       input_shape.add_dim()->set_size(1);
     }
@@ -816,6 +855,7 @@ int64 OpLevelCostEstimator::CountConv2DBackpropFilterOperations(
   }
   if (!shape_found) {
     // Set the minimum filter size that's feasible.
+    filter_shape.Clear();
     for (int i = 0; i < 4; ++i) {
       filter_shape.add_dim()->set_size(1);
     }
@@ -945,6 +985,91 @@ Costs OpLevelCostEstimator::PredictConv2DBackpropFilter(
   return costs;
 }
 
+Costs OpLevelCostEstimator::PredictFusedConv2DBiasActivation(
+    const OpContext& op_context) const {
+  // FusedConv2DBiasActivation computes a fused kernel which implements:
+  // 2D convolution, adds side input with separate scaling on convolution and
+  // side inputs, then adds bias, and finally applies the ReLU activation
+  // function to the result:
+  //
+  // Input -> Conv2D  ->  Add  -> BiasAdd  -> ReLU
+  //            ^          ^         ^
+  //          Filter   Side Input   Bias
+  //
+  // Note that when adding the side input, the operation multiplies the output
+  // of Conv2D by conv_input_scale, confusingly, and the side_input by
+  // side_input_scale.
+  //
+  // Note that in the special case that side_input_scale is 0, which we infer
+  // from side_input having dimensions [], we skip that addition operation.
+  //
+  // For more information, see
+  // contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc
+  auto& conv_input = op_context.op_info.inputs(0);
+  auto& filter = op_context.op_info.inputs(1);
+  auto& bias = op_context.op_info.inputs(2);
+  auto& side_input = op_context.op_info.inputs(3);
+  auto& conv_input_scale = op_context.op_info.inputs(4);
+  auto& side_input_scale = op_context.op_info.inputs(5);
+
+  // Manually compute our convolution dimensions.
+  bool found_unknown_shapes = false;
+  auto dims = ConvolutionDimensionsFromInputs(
+      conv_input.shape(), filter.shape(), op_context.op_info,
+      &found_unknown_shapes);
+
+  // Construct the shape of our output tensor from our convolution dimensions
+  // and format, as it may not be available yet.
+  //
+  // TODO(varomodt): should we centralize the Conv2D input/output shapes?
+  bool unknown_conv_format = false;
+  OpInfo::TensorProperties output;
+  switch (GetConvolutionFormat(op_context)) {
+    case NCHW:
+      output =
+          DescribeTensor(DT_FLOAT, {dims.batch, dims.oz, dims.ox, dims.oy});
+      break;
+    case NHWC:
+      output =
+          DescribeTensor(DT_FLOAT, {dims.batch, dims.ox, dims.oy, dims.oz});
+      break;
+    default:
+      // TODO(b/77722245): support cost estimation for NCHW_VECT_C.
+      LOG(WARNING) << "unsupported data format: "
+                   << GetDataFormat(op_context.op_info)
+                   << " Defaulting to NHWC.";
+      output =
+          DescribeTensor(DT_FLOAT, {dims.batch, dims.ox, dims.oy, dims.oz});
+      unknown_conv_format = true;
+      break;
+  }
+
+  // Add the operations the fused op always computes.
+  std::vector<OpContext> component_ops = {
+      FusedChildContext(op_context, "Conv2D", output, {conv_input, filter}),
+      FusedChildContext(op_context, "Mul", output, {output, conv_input_scale}),
+      FusedChildContext(op_context, "BiasAdd", output, {output, bias}),
+      FusedChildContext(op_context, "Relu", output, {output})};
+
+  // Add our side_input iff it's non-empty.
+  if (side_input.shape().dim_size() > 0) {
+    component_ops.push_back(FusedChildContext(op_context, "Mul", side_input,
+                                              {side_input, side_input_scale}));
+    component_ops.push_back(
+        FusedChildContext(op_context, "Add", output, {side_input, output}));
+  }
+
+  // Construct an op_context which definitely has our output shape.
+  auto op_context_with_output = op_context;
+  op_context_with_output.op_info.mutable_outputs()->Clear();
+  *op_context_with_output.op_info.mutable_outputs()->Add() = output;
+
+  // Construct component operations and run the cost computation.
+  auto costs = PredictFusedOp(op_context_with_output, component_ops);
+  costs.inaccurate |= found_unknown_shapes || unknown_conv_format;
+  return costs;
+}
+
 Costs OpLevelCostEstimator::PredictMatMul(const OpContext& op_context) const {
   const auto& op_features = op_context.op_info;
   bool found_unknown_shapes = false;
@@ -1013,6 +1138,13 @@ Costs OpLevelCostEstimator::PredictGatherOrSlice(
   // part of it. For these op the size of the output determines the memory cost.
   const auto& op_info = op_context.op_info;
 
+  const int inputs_needed = op_info.op() == "Slice" ? 3 : 2;
+  if (op_info.outputs_size() == 0 || op_info.inputs_size() < inputs_needed) {
+    Costs costs = Costs::ZeroCosts();
+    costs.inaccurate = true;
+    return costs;
+  }
+
   bool unknown_shapes = false;
 
   // Each output element is a copy of some element from input.
@@ -1041,5 +1173,373 @@ Costs OpLevelCostEstimator::PredictGatherOrSlice(
   return costs;
 }
 
+Costs OpLevelCostEstimator::PredictFusedOp(
+    const OpContext& op_context,
+    const std::vector<OpContext>& fused_op_contexts) const {
+  // Note that PredictOpCountBasedCost will get the correct memory_time from
+  // the node's inputs and outputs; but we don't want to have to re-implement
+  // the logic for computing the operation count of each of our component
+  // operations here; so we simply add the compute times of each component
+  // operation, then update the execution time.
+  Costs fused_cost = PredictOpCountBasedCost(0, op_context.op_info);
+  fused_cost.compute_time = 0;
+  fused_cost.inaccurate = false;
+  for (auto& fused_op : fused_op_contexts) {
+    auto op_cost = PredictCosts(fused_op);
+    fused_cost.compute_time += op_cost.compute_time;
+    fused_cost.inaccurate |= op_cost.inaccurate;
+  }
+
+  CombineCostsAndUpdateExecutionTime(&fused_cost);
+  return fused_cost;
+}
+
+/* static */
+OpContext OpLevelCostEstimator::FusedChildContext(
+    const OpContext& parent, const string& op_name,
+    const OpInfo::TensorProperties& output,
+    const std::vector<OpInfo::TensorProperties>& inputs) {
+  // Setup the base parameters of our new context.
+  OpContext new_context;
+  new_context.name = op_name;
+  new_context.device_name = parent.device_name;
+  new_context.op_info = parent.op_info;
+  new_context.op_info.set_op(op_name);
+
+  // Setup the inputs of our new context.
+  new_context.op_info.mutable_inputs()->Clear();
+  for (const auto& input : inputs) {
+    *new_context.op_info.mutable_inputs()->Add() = input;
+  }
+
+  // Setup the output of our new context.
+  new_context.op_info.mutable_outputs()->Clear();
+  *new_context.op_info.mutable_outputs()->Add() = output;
+
+  return new_context;
+}
+
+/* static */
+OpInfo::TensorProperties OpLevelCostEstimator::DescribeTensor(
+    DataType type, const std::vector<int64>& dims) {
+  OpInfo::TensorProperties ret;
+  ret.set_dtype(type);
+
+  auto shape = ret.mutable_shape();
+  for (const int dim : dims) {
+    shape->add_dim()->set_size(dim);
+  }
+
+  return ret;
+}
+
+/* static */
+OpLevelCostEstimator::ConvolutionDimensions
+OpLevelCostEstimator::OpDimensionsFromInputs(
+    const TensorShapeProto& original_image_shape, const OpInfo& op_info,
+    bool* found_unknown_shapes) {
+  VLOG(2) << "op features: " << op_info.DebugString();
+  VLOG(2) << "Original image shape: " << original_image_shape.DebugString();
+  auto image_shape =
+      MaybeGetMinimumShape(original_image_shape, 4, found_unknown_shapes);
+  VLOG(2) << "Image shape: " << image_shape.DebugString();
+
+  int x_index, y_index, channel_index;
+  const string& data_format = GetDataFormat(op_info);
+  if (data_format == "NCHW") {
+    x_index = 2;
+    y_index = 3;
+    channel_index = 1;
+  } else {
+    x_index = 1;
+    y_index = 2;
+    channel_index = 3;
+  }
+  int64 batch = image_shape.dim(0).size();
+  int64 ix = image_shape.dim(x_index).size();
+  int64 iy = image_shape.dim(y_index).size();
+  int64 iz = image_shape.dim(channel_index).size();
+
+  // Note that FusedBatchNorm doesn't have ksize attr, but GetKernelSize returns
+  // {1, 1, 1, 1} in that case.
+  std::vector<int64> ksize = GetKernelSize(op_info);
+  int64 kx = ksize[x_index];
+  int64 ky = ksize[y_index];
+
+  std::vector<int64> strides = GetStrides(op_info);
+  int64 sx = strides[x_index];
+  int64 sy = strides[y_index];
+  const auto padding = GetPadding(op_info);
+
+  int64 ox = GetOutputSize(ix, kx, sx, padding);
+  int64 oy = GetOutputSize(iy, ky, sy, padding);
+  int64 oz = iz;
+
+  OpLevelCostEstimator::ConvolutionDimensions conv_dims = {
+      batch, ix, iy, iz, kx, ky, oz, ox, oy, sx, sy, padding};
+  return conv_dims;
+}
+
+Costs OpLevelCostEstimator::PredictMaxPool(const OpContext& op_context) const {
+  bool found_unknown_shapes = false;
+  const auto& op_info = op_context.op_info;
+  // x: op_info.inputs(0)
+  ConvolutionDimensions dims = OpDimensionsFromInputs(
+      op_info.inputs(0).shape(), op_info, &found_unknown_shapes);
+  // kx * ky - 1 comparisons per output (kx * xy > 1)
+  // or 1 copy per output (kx * k1 = 1).
+  int per_output_ops = dims.kx * dims.ky == 1 ? 1 : dims.kx * dims.ky - 1;
+  int64 ops = dims.batch * dims.ox * dims.oy * dims.oz * per_output_ops;
+
+  double total_input_size = 0;
+  if (dims.ky >= dims.sy) {
+    total_input_size =
+        CalculateTensorSize(op_info.inputs(0), &found_unknown_shapes);
+  } else {  // dims.ky < dims.sy
+    // Vertical stride is larger than vertical kernel; assuming row-major
+    // format, skip unnecessary rows (or read every kx rows per sy rows, as the
+    // others are not used for output).
+    const auto data_size = DataTypeSize(BaseType(op_info.inputs(0).dtype()));
+    total_input_size =
+        data_size * dims.batch * dims.ix * dims.ky * dims.oy * dims.iz;
+  }
+  const double total_output_size =
+      CalculateOutputSize(op_info, &found_unknown_shapes);
+
+  Costs costs = PredictOpCountBasedCost(
+      ops, total_input_size + total_output_size, op_info);
+  costs.inaccurate = found_unknown_shapes;
+  costs.max_memory = total_output_size;
+  return costs;
+}
+
+Costs OpLevelCostEstimator::PredictMaxPoolGrad(
+    const OpContext& op_context) const {
+  bool found_unknown_shapes = false;
+  const auto& op_info = op_context.op_info;
+  // x: op_info.inputs(0)
+  // y: op_info.inputs(1)
+  // y_grad: op_info.inputs(2)
+  ConvolutionDimensions dims = OpDimensionsFromInputs(
+      op_info.inputs(0).shape(), op_info, &found_unknown_shapes);
+
+  int64 ops = 0;
+  if (dims.kx == 1 && dims.ky == 1) {
+    // 1x1 window. No need to know which input was max.
+    ops = dims.batch * dims.ix * dims.iy * dims.iz;
+  } else if (dims.kx <= dims.sx && dims.ky <= dims.sy) {
+    // Non-overlapping window: re-run maxpool, then assign zero or y_grad.
+    ops = dims.batch * dims.iz *
+          (dims.ox * dims.oy * (dims.kx * dims.ky - 1) + dims.ix * dims.iy);
+  } else {
+    // Overlapping window: initialize with zeros, re-run maxpool, then
+    // accumulate y_gad to proper x_grad locations.
+    ops = dims.batch * dims.iz *
+          (dims.ox * dims.oy * (dims.kx * dims.ky - 1) + dims.ix * dims.iy * 2);
+  }
+
+  // Just read x and y_grad; no need to read y as we assume MaxPoolGrad re-run
+  // MaxPool internally.
+  double total_input_size =
+      CalculateTensorSize(op_info.inputs(0), &found_unknown_shapes);
+  total_input_size +=
+      CalculateTensorSize(op_info.inputs(2), &found_unknown_shapes);
+  // Write x_grad; size equal to x.
+  const double total_output_size =
+      CalculateTensorSize(op_info.inputs(0), &found_unknown_shapes);
+
+  Costs costs = PredictOpCountBasedCost(
+      ops, total_input_size + total_output_size, op_info);
+  costs.inaccurate = found_unknown_shapes;
+  costs.max_memory = total_output_size;
+  return costs;
+}
+
+Costs OpLevelCostEstimator::PredictAvgPool(const OpContext& op_context) const {
+  bool found_unknown_shapes = false;
+  const auto& op_info = op_context.op_info;
+  // x: op_info.inputs(0)
+  ConvolutionDimensions dims = OpDimensionsFromInputs(
+      op_info.inputs(0).shape(), op_info, &found_unknown_shapes);
+
+  // kx * ky - 1 additions and 1 multiplication per output.
+  int64 ops = dims.batch * dims.ox * dims.oy * dims.oz * dims.kx * dims.ky;
+
+  double total_input_size = 0;
+  if (dims.ky >= dims.sy) {
+    total_input_size =
+        CalculateTensorSize(op_info.inputs(0), &found_unknown_shapes);
+  } else {  // dims.ky < dims.sy
+    // vertical stride is larger than vertical kernel; assuming row-major
+    // format, skip unnecessary rows (or read every kx rows per sy rows, as the
+    // others are not used for output).
+    const auto data_size = DataTypeSize(BaseType(op_info.inputs(0).dtype()));
+    total_input_size =
+        data_size * dims.batch * dims.ix * dims.ky * dims.oy * dims.iz;
+  }
+  const double total_output_size =
+      CalculateOutputSize(op_info, &found_unknown_shapes);
+
+  Costs costs = PredictOpCountBasedCost(
+      ops, total_input_size + total_output_size, op_info);
+  costs.inaccurate = found_unknown_shapes;
+  costs.max_memory = total_output_size;
+  return costs;
+}
+
+Costs OpLevelCostEstimator::PredictAvgPoolGrad(
+    const OpContext& op_context) const {
+  bool found_unknown_shapes = false;
+  const auto& op_info = op_context.op_info;
+  // x's shape: op_info.inputs(0)
+  // y_grad: op_info.inputs(1)
+
+  // Extract x_shape from op_info.inputs(0).value() or op_info.outputs(0).
+  bool shape_found = false;
+  TensorShapeProto x_shape;
+  if (op_info.inputs_size() >= 1 && op_info.inputs(0).has_value()) {
+    const TensorProto& value = op_info.inputs(0).value();
+    shape_found = GetTensorShapeProtoFromTensorProto(value, &x_shape);
+  }
+  if (!shape_found && op_info.outputs_size() > 0) {
+    x_shape = op_info.outputs(0).shape();
+    shape_found = true;
+  }
+  if (!shape_found) {
+    // Set the minimum shape that's feasible.
+    x_shape.Clear();
+    for (int i = 0; i < 4; ++i) {
+      x_shape.add_dim()->set_size(1);
+    }
+    found_unknown_shapes = true;
+  }
+
+  ConvolutionDimensions dims =
+      OpDimensionsFromInputs(x_shape, op_info, &found_unknown_shapes);
+
+  int64 ops = 0;
+  if (dims.kx <= dims.sx && dims.ky <= dims.sy) {
+    // Non-overlapping window.
+    ops = dims.batch * dims.iz * (dims.ix * dims.iy + dims.ox * dims.oy);
+  } else {
+    // Overlapping window.
+    ops = dims.batch * dims.iz *
+          (dims.ix * dims.iy + dims.ox * dims.oy * (dims.kx * dims.ky + 1));
+  }
+
+  const double total_input_size =
+      CalculateInputSize(op_info, &found_unknown_shapes);
+  const double total_output_size =
+      CalculateOutputSize(op_info, &found_unknown_shapes);
+
+  Costs costs = PredictOpCountBasedCost(
+      ops, total_input_size + total_output_size, op_info);
+  costs.inaccurate = found_unknown_shapes;
+  costs.max_memory = total_output_size;
+  return costs;
+}
+
+Costs OpLevelCostEstimator::PredictFusedBatchNorm(
+    const OpContext& op_context) const {
+  bool found_unknown_shapes = false;
+  const auto& op_info = op_context.op_info;
+  // x: op_info.inputs(0)
+  // scale: op_info.inputs(1)
+  // offset: op_info.inputs(2)
+  // mean: op_info.inputs(3)  --> only for inference
+  // variance: op_info.inputs(4) --> only for inference
+  ConvolutionDimensions dims = OpDimensionsFromInputs(
+      op_info.inputs(0).shape(), op_info, &found_unknown_shapes);
+  const bool is_training = IsTraining(op_info);
+
+  int64 ops = 0;
+  const auto rsqrt_cost = Eigen::internal::functor_traits<
+      Eigen::internal::scalar_rsqrt_op<float>>::Cost;
+  if (is_training) {
+    ops = dims.iz * (dims.batch * dims.ix * dims.iy * 4 + 6 + rsqrt_cost);
+  } else {
+    ops = dims.batch * dims.ix * dims.iy * dims.iz * 2;
+  }
+
+  const double size_nhwc =
+      CalculateTensorSize(op_info.inputs(0), &found_unknown_shapes);
+  const double size_c =
+      CalculateTensorSize(op_info.inputs(1), &found_unknown_shapes);
+  double total_input_size = 0.0;
+  double total_internal_read_size = 0.0;
+  double total_output_size = 0.0;
+  if (is_training) {
+    total_input_size = size_nhwc + size_c * 2;
+    total_output_size = size_nhwc + size_c * 4;
+    total_internal_read_size = size_nhwc;
+  } else {
+    total_input_size = size_nhwc + size_c * 4;
+    total_output_size = size_nhwc;
+  }
+
+  Costs costs = PredictOpCountBasedCost(
+      ops, total_input_size + total_output_size + total_internal_read_size,
+      op_info);
+  costs.inaccurate = found_unknown_shapes;
+  costs.max_memory = total_output_size;
+  return costs;
+}
+
+Costs OpLevelCostEstimator::PredictFusedBatchNormGrad(
+    const OpContext& op_context) const {
+  bool found_unknown_shapes = false;
+  const auto& op_info = op_context.op_info;
+  // y_backprop: op_info.inputs(0)
+  // x: op_info.inputs(1)
+  // scale: op_info.inputs(2)
+  // mean: op_info.inputs(3)
+  // variance or inverse of variance: op_info.inputs(4)
+  ConvolutionDimensions dims = OpDimensionsFromInputs(
+      op_info.inputs(1).shape(), op_info, &found_unknown_shapes);
+
+  int64 ops = 0;
+  const auto rsqrt_cost = Eigen::internal::functor_traits<
+      Eigen::internal::scalar_rsqrt_op<float>>::Cost;
+  ops = dims.iz * (dims.batch * dims.ix * dims.iy * 11 + 5 + rsqrt_cost);
+
+  const double size_nhwc =
+      CalculateTensorSize(op_info.inputs(1), &found_unknown_shapes);
+  const double size_c =
+      CalculateTensorSize(op_info.inputs(2), &found_unknown_shapes);
+  double total_input_size = size_nhwc * 2 + size_c * 2;
+  double total_internal_read_size = size_nhwc;
+  double total_output_size = size_nhwc * 1 + size_c * 2;
+
+  Costs costs = PredictOpCountBasedCost(
+      ops, total_input_size + total_output_size + total_internal_read_size,
+      op_info);
+  costs.inaccurate = found_unknown_shapes;
+  costs.max_memory = total_output_size;
+  return costs;
+}
+
+/* static */
+OpLevelCostEstimator::ConvolutionFormat
+OpLevelCostEstimator::GetConvolutionFormat(const OpContext& op_context) {
+  auto data_format = GetDataFormat(op_context.op_info);
+  if (data_format == "NCHW") {
+    return NCHW;
+  } else if (data_format == "NHWC") {
+    return NHWC;
+  } else if (data_format == "NCHW_VECT_C") {
+    return NCHW_VECT_C;
+  }
+
+  return UNKNOWN_CONVOLUTION_FORMAT;
+}
+
+void OpLevelCostEstimator::CombineCostsAndUpdateExecutionTime(
+    Costs* costs) const {
+  if (compute_memory_overlap_) {
+    costs->execution_time = std::max(costs->compute_time, costs->memory_time);
+  } else {
+    costs->execution_time = costs->compute_time + costs->memory_time;
+  }
+}
 }  // end namespace grappler
 }  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/costs/op_level_cost_estimator.h b/tensorflow/core/grappler/costs/op_level_cost_estimator.h
index 1b3babb2066afc2b794ff268929ebdd01ad61e89..35649f7ee959a292dbf68246221bc98c52f2db37 100644
--- a/tensorflow/core/grappler/costs/op_level_cost_estimator.h
+++ b/tensorflow/core/grappler/costs/op_level_cost_estimator.h
@@ -82,6 +82,13 @@ class OpLevelCostEstimator {
     int64 sy;         // Stride y.
     Padding padding;  // SAME or VALID.
   };
+  enum ConvolutionFormat {
+    UNKNOWN_CONVOLUTION_FORMAT,
+    NHWC,
+    NCHW,
+    NCHW_VECT_C,
+    NCHW_VECT_W,
+  };
   int64 CountConv2DOperations(const OpInfo& op_features,
                               bool* found_unknown_shapes) const;
   int64 CountConv2DOperations(const OpInfo& op_features,
@@ -138,6 +145,7 @@ class OpLevelCostEstimator {
   Costs PredictCwiseOp(const OpContext& op_context) const;
   Costs PredictConv2DBackpropInput(const OpContext& op_context) const;
   Costs PredictConv2DBackpropFilter(const OpContext& op_context) const;
+  Costs PredictFusedConv2DBiasActivation(const OpContext& op_context) const;
   Costs PredictMatMul(const OpContext& op_context) const;
   Costs PredictNoOp(const OpContext& op_context) const;
   Costs PredictIdentity(const OpContext& op_context) const;
@@ -145,6 +153,16 @@ class OpLevelCostEstimator {
   Costs PredictBatchMatMul(const OpContext& op_context) const;
   Costs PredictMetadata(const OpContext& op_context) const;
   Costs PredictGatherOrSlice(const OpContext& op_context) const;
+  Costs PredictMaxPool(const OpContext& op_context) const;
+  Costs PredictMaxPoolGrad(const OpContext& op_context) const;
+  Costs PredictAvgPool(const OpContext& op_context) const;
+  Costs PredictAvgPoolGrad(const OpContext& op_context) const;
+  Costs PredictFusedBatchNorm(const OpContext& op_context) const;
+  Costs PredictFusedBatchNormGrad(const OpContext& op_context) const;
+
+  // Generic cost prediction method for fused operations.
+  Costs PredictFusedOp(const OpContext& op_context,
+                       const std::vector<OpContext>& fused_op_contexts) const;
 
   // Utility function for safe division. Returns 0
   // if rhs is 0 or negative.
@@ -156,11 +174,36 @@ class OpLevelCostEstimator {
     }
   }
 
+  // For convolution and its grad ops.
   static ConvolutionDimensions ConvolutionDimensionsFromInputs(
       const TensorShapeProto& original_image_shape,
-      const TensorShapeProto& original_filter_shape, const OpInfo& op_features,
+      const TensorShapeProto& original_filter_shape, const OpInfo& op_info,
+      bool* found_unknown_shapes);
+
+  // For Pooling, FusedBatchNorm, and their grad ops.
+  static ConvolutionDimensions OpDimensionsFromInputs(
+      const TensorShapeProto& original_image_shape, const OpInfo& op_info,
       bool* found_unknown_shapes);
 
+  // Helper to construct child operation contexts for the component operations
+  // of fused ops.
+  static OpContext FusedChildContext(
+      const OpContext& parent, const string& op_name,
+      const OpInfo::TensorProperties& output,
+      const std::vector<OpInfo::TensorProperties>& inputs);
+
+  // Helper to construct tensor shapes.
+  static OpInfo::TensorProperties DescribeTensor(
+      DataType type, const std::vector<int64>& dims);
+
+  // Returns the Conv2D format for this operation.
+  static ConvolutionFormat GetConvolutionFormat(const OpContext& op_context);
+
+  // This method calculates the execution time depending on whether IO can
+  // overlap with computation. It assumes the memory and the compute times have
+  // already been calculated.
+  void CombineCostsAndUpdateExecutionTime(Costs* costs) const;
+
  protected:
   std::map<string, int> elementwise_ops_;
   typedef std::function<Costs(const OpContext& op_context)> CostImpl;
diff --git a/tensorflow/core/grappler/costs/op_level_cost_estimator_test.cc b/tensorflow/core/grappler/costs/op_level_cost_estimator_test.cc
index f2a9615dfb7df328c983de4dd37f3d1f4ec7d704..13ea43bed692828f00e89b7f964c3abcdcdb6483 100644
--- a/tensorflow/core/grappler/costs/op_level_cost_estimator_test.cc
+++ b/tensorflow/core/grappler/costs/op_level_cost_estimator_test.cc
@@ -14,6 +14,8 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/grappler/costs/op_level_cost_estimator.h"
+#include "tensorflow/core/framework/attr_value.pb.h"
+#include "tensorflow/core/framework/attr_value_util.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/tensor_shape.pb.h"
@@ -91,6 +93,14 @@ OpContext DescribeBatchMatMul(const std::vector<int>& dims_a,
   return op_context;
 }
 
+// Wrangles the minimum number of proto fields to set up a 1D Tensor for cost
+// estimation purposes.
+void DescribeTensor1D(int dim0, OpInfo::TensorProperties* tensor) {
+  auto shape = tensor->mutable_shape();
+  shape->add_dim()->set_size(dim0);
+  tensor->set_dtype(DT_FLOAT);
+}
+
 // Wrangles the minimum number of proto fields to set up a 4D Tensor for cost
 // estimation purposes.
 void DescribeTensor4D(int dim0, int dim1, int dim2, int dim3,
@@ -118,6 +128,38 @@ OpContext DescribeConvolution(int batch, int ix, int iy, int iz1, int iz2,
   return op_context;
 }
 
+// DescribeFusedConv2DBiasActivation constructs an OpContext for a
+// FusedConv2DBiasActivation applied to a convolution input tensor with shape
+// (batch, ix, iy, iz1), a kernel tensor with shape (kx, ky, iz2, oz), a
+// bias tensor with shape (oz), a side input tensor with shape
+// (batch, ox, oy, oz) if has_side_input is set, and two scaling tensors with
+// shape (1).
+//
+// Note that this assumes the NHWC data format.
+OpContext DescribeFusedConv2DBiasActivation(int batch, int ix, int iy, int iz1,
+                                            int iz2, int kx, int ky, int ox,
+                                            int oy, int oz,
+                                            bool has_side_input) {
+  OpContext op_context;
+  SetCpuDevice(&op_context.op_info);
+  op_context.op_info.set_op("FusedConv2DBiasActivation");
+  DescribeTensor4D(batch, ix, iy, iz1, op_context.op_info.add_inputs());
+  DescribeTensor4D(kx, ky, iz2, oz, op_context.op_info.add_inputs());
+  DescribeTensor1D(oz, op_context.op_info.add_inputs());
+
+  // Add the side_input, if any.
+  auto side_input = op_context.op_info.add_inputs();
+  if (has_side_input) {
+    DescribeTensor4D(batch, ox, oy, oz, side_input);
+  }
+
+  // Add the scaling tensors.
+  DescribeTensor1D(1, op_context.op_info.add_inputs());
+  DescribeTensor1D(1, op_context.op_info.add_inputs());
+
+  return op_context;
+}
+
 // DescribeUnaryOp constructs an OpContext for the given operation applied to
 // a 4-tensor with shape (size1, 1, 1, 1).
 OpContext DescribeUnaryOp(const string& op, int size1) {
@@ -160,15 +202,172 @@ OpContext DescribeBiasAdd(int size1, int size2) {
   op_context.op_info.set_op("BiasAdd");
 
   DescribeTensor4D(1, 1, size2, size1, op_context.op_info.add_inputs());
+  DescribeTensor1D(size1, op_context.op_info.add_inputs());
   DescribeTensor4D(1, 1, size2, size1, op_context.op_info.add_outputs());
 
-  auto bias = op_context.op_info.add_inputs();
-  bias->mutable_shape()->add_dim()->set_size(size1);
-  bias->set_dtype(DT_FLOAT);
+  return op_context;
+}
 
+int GetOutputSize(const int x, const int k, const int s,
+                  const string& padding) {
+  if (padding == "SAME") {
+    return (x + s - 1) / s;
+  } else {
+    return (x - k + s) / s;
+  }
+}
+
+std::vector<int> GetPoolingOutputSize(const std::vector<int>& input,
+                                      const std::vector<int>& ksize,
+                                      const std::vector<int>& strides,
+                                      const string& data_format,
+                                      const string& padding) {
+  // h, w, and c indices: default with NHWC.
+  int h_index = 1;
+  int w_index = 2;
+  int c_index = 3;
+  if (data_format == "NCHW") {
+    h_index = 2;
+    w_index = 3;
+    c_index = 1;
+  }
+  // Extract parameters.
+  int n = input[0];
+  int h = input[h_index];
+  int w = input[w_index];
+  int c = input[c_index];
+  int sx = strides[h_index];
+  int sy = strides[w_index];
+  int kx = ksize[h_index];
+  int ky = ksize[w_index];
+
+  // Output activation size: default with VALID padding.
+  int ho = GetOutputSize(h, kx, sx, padding);
+  int wo = GetOutputSize(w, ky, sy, padding);
+
+  std::vector<int> output;
+  if (data_format == "NHWC") {
+    output = {n, ho, wo, c};
+  } else {
+    output = {n, c, ho, wo};
+  }
+  return output;
+}
+
+// Helper functions for testing GetTensorShapeProtoFromTensorProto().
+void GetTensorProto(const DataType dtype, const std::vector<int64>& shape,
+                    const std::vector<int64> values, const bool tensor_content,
+                    TensorProto* tensor_proto) {
+  tensor_proto->Clear();
+  TensorProto temp_tensor_proto;
+  temp_tensor_proto.set_dtype(dtype);
+  for (const auto& x : shape) {
+    temp_tensor_proto.mutable_tensor_shape()->add_dim()->set_size(x);
+  }
+  for (const auto& x : values) {
+    if (dtype == DT_INT64) {
+      temp_tensor_proto.add_int64_val(x);
+    } else if (dtype == DT_INT32 || dtype == DT_INT16 || dtype == DT_INT8 ||
+               dtype == DT_UINT8) {
+      temp_tensor_proto.add_int_val(x);
+    } else if (dtype == DT_UINT32) {
+      temp_tensor_proto.add_uint32_val(x);
+    } else if (dtype == DT_UINT64) {
+      temp_tensor_proto.add_uint64_val(x);
+    } else {
+      CHECK(false) << "Unsupported dtype: " << dtype;
+    }
+  }
+  Tensor tensor(dtype);
+  CHECK(tensor.FromProto(temp_tensor_proto));
+  if (tensor_content) {
+    tensor.AsProtoTensorContent(tensor_proto);
+  } else {
+    tensor.AsProtoField(tensor_proto);
+  }
+}
+
+OpContext DescribePoolingOp(const string& op_name, const std::vector<int>& x,
+                            const std::vector<int>& ksize,
+                            const std::vector<int>& strides,
+                            const string& data_format, const string& padding) {
+  OpContext op_context;
+  auto& op_info = op_context.op_info;
+  SetCpuDevice(&op_info);
+  op_info.set_op(op_name);
+
+  const std::vector<int> y =
+      GetPoolingOutputSize(x, ksize, strides, data_format, padding);
+  if (op_name == "AvgPool" || op_name == "MaxPool") {
+    // input: x, output: y.
+    DescribeTensor4D(x[0], x[1], x[2], x[3], op_info.add_inputs());
+    DescribeTensor4D(y[0], y[1], y[2], y[3], op_info.add_outputs());
+  } else if (op_name == "AvgPoolGrad") {
+    // input: x's shape, y_grad, output: x_grad.
+    DescribeArbitraryRankInput({4}, DT_INT32, &op_info);
+    auto* tensor_proto = op_info.mutable_inputs(0)->mutable_value();
+    GetTensorProto(DT_INT32, {4}, {x[0], x[1], x[2], x[3]},
+                   /*tensor_content=*/false, tensor_proto);
+    DescribeTensor4D(y[0], y[1], y[2], y[3], op_info.add_inputs());
+    DescribeTensor4D(x[0], x[1], x[2], x[3], op_info.add_outputs());
+  } else if (op_name == "MaxPoolGrad") {
+    // input: x, y, y_grad, output: x_grad.
+    DescribeTensor4D(x[0], x[1], x[2], x[3], op_info.add_inputs());
+    DescribeTensor4D(y[0], y[1], y[2], y[3], op_info.add_inputs());
+    DescribeTensor4D(y[0], y[1], y[2], y[3], op_info.add_inputs());
+    DescribeTensor4D(x[0], x[1], x[2], x[3], op_info.add_outputs());
+  }
+  auto* attr = op_info.mutable_attr();
+  SetAttrValue(data_format, &(*attr)["data_format"]);
+  SetAttrValue(padding, &(*attr)["padding"]);
+  SetAttrValue(strides, &(*attr)["strides"]);
+  SetAttrValue(ksize, &(*attr)["ksize"]);
   return op_context;
 }
 
+OpContext DescribeFusedBatchNorm(const bool is_training, const bool is_grad,
+                                 const std::vector<int>& x,
+                                 const string& data_format) {
+  // First, get MaxPool op info with unit stride and unit window.
+  OpContext op_context = DescribePoolingOp("MaxPool", x, {1, 1, 1, 1},
+                                           {1, 1, 1, 1}, data_format, "SAME");
+  auto& op_info = op_context.op_info;
+  // Override op name.
+  if (is_grad) {
+    op_info.set_op("FusedBatchNormGrad");
+  } else {
+    op_info.set_op("FusedBatchNorm");
+  }
+
+  // Add additional input output tensors.
+  if (is_grad) {
+    DescribeTensor4D(x[0], x[1], x[2], x[3], op_info.add_inputs());
+  }
+  int num_1d_inputs = is_grad ? 3 : 4;
+  for (int i = 0; i < num_1d_inputs; i++) {
+    auto* tensor = op_info.add_inputs();
+    auto* shape = tensor->mutable_shape();
+    shape->add_dim()->set_size(x[3]);
+    tensor->set_dtype(DT_FLOAT);
+  }
+  for (int i = 0; i < 4; i++) {
+    auto* tensor = op_info.add_outputs();
+    auto* shape = tensor->mutable_shape();
+    shape->add_dim()->set_size(x[3]);
+    tensor->set_dtype(DT_FLOAT);
+  }
+
+  // Delete unnecessary attr.
+  auto* attr = op_context.op_info.mutable_attr();
+  attr->erase("ksize");
+  attr->erase("strides");
+  attr->erase("padding");
+
+  // Additional attrs for FusedBatchNorm.
+  SetAttrValue(is_training, &(*attr)["is_training"]);
+
+  return op_context;
+}
 }  // namespace
 
 class OpLevelCostEstimatorTest : public ::testing::Test {
@@ -192,6 +391,50 @@ class OpLevelCostEstimatorTest : public ::testing::Test {
     estimator_.compute_memory_overlap_ = value;
   }
 
+  void ValidateOpDimensionsFromImputs(const int n, const int h, const int w,
+                                      const int c, const int kx, const int ky,
+                                      const int sx, const int sy,
+                                      const string& data_format,
+                                      const string& padding) {
+    OpContext op_context;
+    int ho;
+    int wo;
+    if (data_format == "NHWC") {
+      op_context = DescribePoolingOp("MaxPool", {n, h, w, c}, {1, kx, ky, 1},
+                                     {1, sx, sy, 1}, "NHWC", padding);
+      ho = op_context.op_info.outputs(0).shape().dim(1).size();
+      wo = op_context.op_info.outputs(0).shape().dim(2).size();
+    } else {
+      op_context = DescribePoolingOp("MaxPool", {n, c, h, w}, {1, 1, kx, ky},
+                                     {1, 1, sx, sy}, "NCHW", padding);
+      ho = op_context.op_info.outputs(0).shape().dim(2).size();
+      wo = op_context.op_info.outputs(0).shape().dim(3).size();
+    }
+
+    bool found_unknown_shapes;
+    auto dims = OpLevelCostEstimator::OpDimensionsFromInputs(
+        op_context.op_info.inputs(0).shape(), op_context.op_info,
+        &found_unknown_shapes);
+    Padding padding_enum;
+    if (padding == "VALID") {
+      padding_enum = Padding::VALID;
+    } else {
+      padding_enum = Padding::SAME;
+    }
+    EXPECT_EQ(n, dims.batch);
+    EXPECT_EQ(h, dims.ix);
+    EXPECT_EQ(w, dims.iy);
+    EXPECT_EQ(c, dims.iz);
+    EXPECT_EQ(kx, dims.kx);
+    EXPECT_EQ(ky, dims.ky);
+    EXPECT_EQ(sx, dims.sx);
+    EXPECT_EQ(sy, dims.sy);
+    EXPECT_EQ(ho, dims.ox);
+    EXPECT_EQ(wo, dims.oy);
+    EXPECT_EQ(c, dims.oz);
+    EXPECT_EQ(padding_enum, dims.padding);
+  }
+
   OpLevelCostEstimator estimator_;
 };
 
@@ -212,6 +455,22 @@ TEST_F(OpLevelCostEstimatorTest, TestGatherCosts) {
   EXPECT_FALSE(cost.inaccurate);
 }
 
+TEST_F(OpLevelCostEstimatorTest, TestGatherCostsWithoutOutput) {
+  OpContext op_context;
+  SetCpuDevice(&op_context.op_info);
+  op_context.op_info.set_op("Gather");
+
+  // Huge first input shouldn't affect Gather execution and memory costs.
+  DescribeArbitraryRankInput({10000000, 10}, DT_FLOAT, &op_context.op_info);
+  DescribeArbitraryRankInput({16}, DT_INT64, &op_context.op_info);
+
+  auto cost = estimator_.PredictCosts(op_context);
+  EXPECT_EQ(Costs::Duration(0), cost.memory_time);
+  EXPECT_EQ(Costs::Duration(0), cost.compute_time);
+  EXPECT_EQ(Costs::Duration(0), cost.execution_time);
+  EXPECT_TRUE(cost.inaccurate);
+}
+
 TEST_F(OpLevelCostEstimatorTest, TestSliceCosts) {
   OpContext op_context;
   SetCpuDevice(&op_context.op_info);
@@ -264,6 +523,25 @@ TEST_F(OpLevelCostEstimatorTest, ExecutionTimeSumOrMax) {
   SetComputeMemoryOverlap(false);  // Set it back to default.
 }
 
+TEST_F(OpLevelCostEstimatorTest, FusedConv2DBiasActivationExecutionTime) {
+  auto cost = PredictCosts(DescribeFusedConv2DBiasActivation(
+      16, 19, 19, 48, 48, 5, 5, 19, 19, 256, /* has_side_input = */ true));
+  EXPECT_EQ(Costs::Duration(1416808), cost.memory_time);
+  EXPECT_EQ(Costs::Duration(355616770), cost.compute_time);
+  EXPECT_EQ(Costs::Duration(357033578), cost.execution_time);
+  EXPECT_FALSE(cost.inaccurate);
+}
+
+TEST_F(OpLevelCostEstimatorTest,
+       FusedConv2DBiasActivationNoSideInputExecutionTime) {
+  auto cost = PredictCosts(DescribeFusedConv2DBiasActivation(
+      16, 19, 19, 48, 48, 5, 5, 19, 19, 256, /* has_side_input = */ false));
+  EXPECT_EQ(Costs::Duration(825345), cost.memory_time);
+  EXPECT_EQ(Costs::Duration(355321038), cost.compute_time);
+  EXPECT_EQ(Costs::Duration(356146383), cost.execution_time);
+  EXPECT_FALSE(cost.inaccurate);
+}
+
 TEST_F(OpLevelCostEstimatorTest, MulExecutionTime) {
   auto cost = PredictCosts(DescribeBinaryOp("Mul", 1000, 1));
   EXPECT_EQ(Costs::Duration(2000), cost.memory_time);
@@ -337,39 +615,6 @@ TEST_F(OpLevelCostEstimatorTest, BatchMatMul) {
   EXPECT_NE(matmul_inaccurate, batch_matmul_inaccurate);
 }
 
-// Helper functions for testing GetTensorShapeProtoFromTensorProto().
-void GetTensorProto(const DataType dtype, const std::vector<int64>& shape,
-                    const std::vector<int64> values, const bool tensor_content,
-                    TensorProto* tensor_proto) {
-  tensor_proto->Clear();
-  TensorProto temp_tensor_proto;
-  temp_tensor_proto.set_dtype(dtype);
-  for (const auto& x : shape) {
-    temp_tensor_proto.mutable_tensor_shape()->add_dim()->set_size(x);
-  }
-  for (const auto& x : values) {
-    if (dtype == DT_INT64) {
-      temp_tensor_proto.add_int64_val(x);
-    } else if (dtype == DT_INT32 || dtype == DT_INT16 || dtype == DT_INT8 ||
-               dtype == DT_UINT8) {
-      temp_tensor_proto.add_int_val(x);
-    } else if (dtype == DT_UINT32) {
-      temp_tensor_proto.add_uint32_val(x);
-    } else if (dtype == DT_UINT64) {
-      temp_tensor_proto.add_uint64_val(x);
-    } else {
-      CHECK(false) << "Unsupported dtype: " << dtype;
-    }
-  }
-  Tensor tensor(dtype);
-  CHECK(tensor.FromProto(temp_tensor_proto));
-  if (tensor_content) {
-    tensor.AsProtoTensorContent(tensor_proto);
-  } else {
-    tensor.AsProtoField(tensor_proto);
-  }
-}
-
 void ExpectTensorShape(const std::vector<int64>& expected,
                        const TensorShapeProto& tensor_shape_proto) {
   TensorShape tensor_shape_expected(expected);
@@ -440,5 +685,226 @@ TEST_F(OpLevelCostEstimatorTest, GetTensorShapeProtoFromTensorProto) {
   }
 }
 
+TEST_F(OpLevelCostEstimatorTest, OpDimensionsFromInputs) {
+  std::vector<string> paddings = {"VALID", "SAME"};
+  std::vector<string> formats = {"NHWC", "NCHW"};
+  for (const auto& p : paddings) {
+    for (const auto& f : formats) {
+      // n, h, w, c, kx, ky, sx, sy, data_format, padding.
+      ValidateOpDimensionsFromImputs(10, 20, 20, 100, 3, 3, 2, 2, f, p);
+      ValidateOpDimensionsFromImputs(10, 20, 20, 100, 1, 1, 3, 3, f, p);
+      ValidateOpDimensionsFromImputs(10, 200, 200, 100, 5, 5, 3, 3, f, p);
+      ValidateOpDimensionsFromImputs(10, 14, 14, 3840, 3, 3, 2, 2, f, p);
+    }
+  }
+}
+
+TEST_F(OpLevelCostEstimatorTest, PredictMaxPool) {
+  auto predict_max_pool = [this](const int n, const int in, const int c,
+                                 const int k, const int s,
+                                 const string& padding) -> Costs {
+    OpContext op_context = DescribePoolingOp(
+        "MaxPool", {n, in, in, c}, {1, k, k, 1}, {1, s, s, 1}, "NHWC", padding);
+    return estimator_.PredictCosts(op_context);
+  };
+
+  {
+    // Typical 3xz3 window with 2x2 stride.
+    auto costs = predict_max_pool(10, 20, 384, 3, 2, "SAME");
+    EXPECT_EQ(Costs::Duration(1075200), costs.execution_time);
+    EXPECT_EQ(Costs::Duration(307200), costs.compute_time);
+    EXPECT_EQ(Costs::Duration(768000), costs.memory_time);
+    EXPECT_FALSE(costs.inaccurate);
+  }
+  {
+    // 1x1 window with 2x2 stride: used for shortcut in resnet-50.
+    auto costs = predict_max_pool(10, 20, 384, 1, 2, "SAME");
+    EXPECT_EQ(Costs::Duration(499200), costs.execution_time);
+    EXPECT_EQ(Costs::Duration(38400), costs.compute_time);
+    EXPECT_EQ(Costs::Duration(460800), costs.memory_time);
+    EXPECT_FALSE(costs.inaccurate);
+  }
+  {
+    // 2x2 window with 3x3 stride.
+    auto costs = predict_max_pool(10, 20, 384, 2, 3, "VALID");
+    EXPECT_EQ(Costs::Duration(561792), costs.execution_time);
+    EXPECT_EQ(Costs::Duration(56448), costs.compute_time);
+    EXPECT_EQ(Costs::Duration(505344), costs.memory_time);
+    EXPECT_FALSE(costs.inaccurate);
+  }
+}
+
+TEST_F(OpLevelCostEstimatorTest, PredictMaxPoolGrad) {
+  auto predict_max_pool_grad = [this](const int n, const int in, const int c,
+                                      const int k, const int s,
+                                      const string& padding) -> Costs {
+    OpContext op_context =
+        DescribePoolingOp("MaxPoolGrad", {n, in, in, c}, {1, k, k, 1},
+                          {1, s, s, 1}, "NHWC", padding);
+    return estimator_.PredictCosts(op_context);
+  };
+
+  {
+    // Typical 3xz3 window with 2x2 stride.
+    auto costs = predict_max_pool_grad(10, 20, 384, 3, 2, "SAME");
+    EXPECT_EQ(Costs::Duration(1996800), costs.execution_time);
+    EXPECT_EQ(Costs::Duration(614400), costs.compute_time);
+    EXPECT_EQ(Costs::Duration(1382400), costs.memory_time);
+    EXPECT_FALSE(costs.inaccurate);
+  }
+  {
+    // 1x1 window with 2x2 stride: used for shortcut in resnet-50.
+    auto costs = predict_max_pool_grad(10, 20, 384, 1, 2, "SAME");
+    EXPECT_EQ(Costs::Duration(1536000), costs.execution_time);
+    EXPECT_EQ(Costs::Duration(153600), costs.compute_time);
+    EXPECT_EQ(Costs::Duration(1382400), costs.memory_time);
+    EXPECT_FALSE(costs.inaccurate);
+  }
+  {
+    // 2x2 window with 3x3 stride.
+    auto costs = predict_max_pool_grad(10, 20, 384, 2, 3, "VALID");
+    EXPECT_EQ(Costs::Duration(1514112), costs.execution_time);
+    EXPECT_EQ(Costs::Duration(210048), costs.compute_time);
+    EXPECT_EQ(Costs::Duration(1304064), costs.memory_time);
+    EXPECT_FALSE(costs.inaccurate);
+  }
+}
+
+TEST_F(OpLevelCostEstimatorTest, PredictAvgPool) {
+  auto predict_avg_pool = [this](const int n, const int in, const int c,
+                                 const int k, const int s,
+                                 const string& padding) -> Costs {
+    OpContext op_context = DescribePoolingOp(
+        "AvgPool", {n, in, in, c}, {1, k, k, 1}, {1, s, s, 1}, "NHWC", padding);
+    return estimator_.PredictCosts(op_context);
+  };
+
+  {
+    // Typical 3xz3 window with 2x2 stride.
+    auto costs = predict_avg_pool(10, 20, 384, 3, 2, "SAME");
+    EXPECT_EQ(Costs::Duration(1113600), costs.execution_time);
+    EXPECT_EQ(Costs::Duration(345600), costs.compute_time);
+    EXPECT_EQ(Costs::Duration(768000), costs.memory_time);
+    EXPECT_FALSE(costs.inaccurate);
+  }
+  {
+    // 1x1 window with 2x2 stride: used for shortcut in resnet-50.
+    auto costs = predict_avg_pool(10, 20, 384, 1, 2, "SAME");
+    EXPECT_EQ(Costs::Duration(499200), costs.execution_time);
+    EXPECT_EQ(Costs::Duration(38400), costs.compute_time);
+    EXPECT_EQ(Costs::Duration(460800), costs.memory_time);
+    EXPECT_FALSE(costs.inaccurate);
+  }
+  {
+    // 2x2 window with 3x3 stride.
+    auto costs = predict_avg_pool(10, 20, 384, 2, 3, "VALID");
+    EXPECT_EQ(Costs::Duration(580608), costs.execution_time);
+    EXPECT_EQ(Costs::Duration(75264), costs.compute_time);
+    EXPECT_EQ(Costs::Duration(505344), costs.memory_time);
+    EXPECT_FALSE(costs.inaccurate);
+  }
+}
+
+TEST_F(OpLevelCostEstimatorTest, PredictAvgPoolGrad) {
+  auto predict_avg_pool_grad = [this](const int n, const int in, const int c,
+                                      const int k, const int s,
+                                      const string& padding) -> Costs {
+    OpContext op_context =
+        DescribePoolingOp("AvgPoolGrad", {n, in, in, c}, {1, k, k, 1},
+                          {1, s, s, 1}, "NHWC", padding);
+    return estimator_.PredictCosts(op_context);
+  };
+
+  {
+    // Typical 3xz3 window with 2x2 stride.
+    auto costs = predict_avg_pool_grad(10, 20, 384, 3, 2, "SAME");
+    EXPECT_EQ(Costs::Duration(1305602), costs.execution_time);
+    EXPECT_EQ(Costs::Duration(537600), costs.compute_time);
+    EXPECT_EQ(Costs::Duration(768002), costs.memory_time);
+    EXPECT_FALSE(costs.inaccurate);
+  }
+  {
+    // 1x1 window with 2x2 stride: used for shortcut in resnet-50.
+    auto costs = predict_avg_pool_grad(10, 20, 384, 1, 2, "SAME");
+    EXPECT_EQ(Costs::Duration(960002), costs.execution_time);
+    EXPECT_EQ(Costs::Duration(192000), costs.compute_time);
+    EXPECT_EQ(Costs::Duration(768002), costs.memory_time);
+    EXPECT_FALSE(costs.inaccurate);
+  }
+  {
+    // 2x2 window with 3x3 stride.
+    auto costs = predict_avg_pool_grad(10, 20, 384, 2, 3, "VALID");
+    EXPECT_EQ(Costs::Duration(862082), costs.execution_time);
+    EXPECT_EQ(Costs::Duration(172416), costs.compute_time);
+    EXPECT_EQ(Costs::Duration(689666), costs.memory_time);
+    EXPECT_FALSE(costs.inaccurate);
+  }
+}
+
+TEST_F(OpLevelCostEstimatorTest, PredictFusedBatchNorm) {
+  auto predict_fused_bn = [this](const int n, const int in, const int c,
+                                 const bool is_training) -> Costs {
+    OpContext op_context = DescribeFusedBatchNorm(
+        is_training, /*is_grad=*/false, {n, in, in, c}, "NHWC");
+    return estimator_.PredictCosts(op_context);
+  };
+
+  {
+    auto costs = predict_fused_bn(10, 20, 96, /*is_training=*/true);
+    EXPECT_EQ(Costs::Duration(614737), costs.execution_time);
+    EXPECT_EQ(Costs::Duration(153706), costs.compute_time);
+    EXPECT_EQ(Costs::Duration(461031), costs.memory_time);
+    EXPECT_FALSE(costs.inaccurate);
+  }
+
+  {
+    auto costs = predict_fused_bn(10, 20, 32, /*is_training=*/true);
+    EXPECT_EQ(Costs::Duration(204913), costs.execution_time);
+    EXPECT_EQ(Costs::Duration(51236), costs.compute_time);
+    EXPECT_EQ(Costs::Duration(153677), costs.memory_time);
+    EXPECT_FALSE(costs.inaccurate);
+  }
+
+  {
+    auto costs = predict_fused_bn(10, 20, 96, /*is_training=*/false);
+    EXPECT_EQ(Costs::Duration(384154), costs.execution_time);
+    EXPECT_EQ(Costs::Duration(76800), costs.compute_time);
+    EXPECT_EQ(Costs::Duration(307354), costs.memory_time);
+    EXPECT_FALSE(costs.inaccurate);
+  }
+
+  {
+    auto costs = predict_fused_bn(10, 20, 32, /*is_training=*/false);
+    EXPECT_EQ(Costs::Duration(128052), costs.execution_time);
+    EXPECT_EQ(Costs::Duration(25600), costs.compute_time);
+    EXPECT_EQ(Costs::Duration(102452), costs.memory_time);
+    EXPECT_FALSE(costs.inaccurate);
+  }
+}
+
+TEST_F(OpLevelCostEstimatorTest, PredictFusedBatchNormGrad) {
+  auto predict_fused_bn_grad = [this](const int n, const int in,
+                                      const int c) -> Costs {
+    OpContext op_context = DescribeFusedBatchNorm(
+        /*is_training=*/false, /*is_grad=*/true, {n, in, in, c}, "NHWC");
+    return estimator_.PredictCosts(op_context);
+  };
+
+  {
+    auto costs = predict_fused_bn_grad(10, 20, 96);
+    EXPECT_EQ(Costs::Duration(1037050), costs.execution_time);
+    EXPECT_EQ(Costs::Duration(422496), costs.compute_time);
+    EXPECT_EQ(Costs::Duration(614554), costs.memory_time);
+    EXPECT_FALSE(costs.inaccurate);
+  }
+
+  {
+    auto costs = predict_fused_bn_grad(128, 7, 384);
+    EXPECT_EQ(Costs::Duration(6503809), costs.execution_time);
+    EXPECT_EQ(Costs::Duration(2649677), costs.compute_time);
+    EXPECT_EQ(Costs::Duration(3854132), costs.memory_time);
+    EXPECT_FALSE(costs.inaccurate);
+  }
+}
 }  // end namespace grappler
 }  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/costs/robust_stats.cc b/tensorflow/core/grappler/costs/robust_stats.cc
index 9866bc86887e2fa1a1fcfe95e3e9673b7df1a8f3..5151b87c59cc09934871b225c70e785c8f9093dd 100644
--- a/tensorflow/core/grappler/costs/robust_stats.cc
+++ b/tensorflow/core/grappler/costs/robust_stats.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/core/grappler/costs/robust_stats.h"
 #include <algorithm>
 #include <cmath>
+#include <utility>
 
 namespace tensorflow {
 namespace grappler {
diff --git a/tensorflow/core/grappler/costs/utils.cc b/tensorflow/core/grappler/costs/utils.cc
index 076945d5c626b9609448e339fcbd96de3e9d137f..f318e3911c2654412fc315e3a0356181e2b49333 100644
--- a/tensorflow/core/grappler/costs/utils.cc
+++ b/tensorflow/core/grappler/costs/utils.cc
@@ -212,8 +212,8 @@ DeviceProperties GetDeviceInfo(const string& device_str) {
       CudaGpuId cuda_gpu_id;
       Status s = GpuIdManager::TfToCudaGpuId(tf_gpu_id, &cuda_gpu_id);
       if (!s.ok()) {
-        LOG(ERROR) << s;
-        return unknown;
+        // We are probably running simulation without linking cuda libraries.
+        cuda_gpu_id = CudaGpuId(parsed.id);
       }
       return GetLocalGPUInfo(cuda_gpu_id);
     } else if (parsed.type == "CPU") {
diff --git a/tensorflow/core/grappler/costs/virtual_scheduler.cc b/tensorflow/core/grappler/costs/virtual_scheduler.cc
index 3ac3ae0f8f835226bbc3ec5d6cec6cb890a6998f..0e5c654acfa59a3c1cba5d853dcf9fb87554bbaf 100644
--- a/tensorflow/core/grappler/costs/virtual_scheduler.cc
+++ b/tensorflow/core/grappler/costs/virtual_scheduler.cc
@@ -44,6 +44,8 @@ Costs CombineCosts(const Costs& left, const Costs& right) {
 
   Costs result = left;
   result.execution_time += right.execution_time;
+  result.compute_time += right.compute_time;
+  result.memory_time += right.memory_time;
   if (right.inaccurate) {
     result.inaccurate = true;
   }
@@ -841,6 +843,8 @@ bool VirtualScheduler::MarkCurrNodeExecuted(const Costs& node_costs) {
 Costs VirtualScheduler::Summary() const {
   // Print out basic execution summary.
   VLOG(1) << "Expected execution time: " << graph_costs_.execution_time.count();
+  VLOG(1) << "Expected compute time: " << graph_costs_.compute_time.count();
+  VLOG(1) << "Expected memory time: " << graph_costs_.memory_time.count();
   VLOG(1) << "Expected max memory: " << graph_costs_.max_memory;
   VLOG(1) << "Expected max per-op buffers: " << graph_costs_.max_per_op_buffers;
   VLOG(1) << "Expected max per-op streaming buffers: "
diff --git a/tensorflow/core/grappler/inputs/BUILD b/tensorflow/core/grappler/inputs/BUILD
index b683216590ede887d9c65003a23e712e0d612622..ffa204028cca828147810c99277fdcd9cb05f5ee 100644
--- a/tensorflow/core/grappler/inputs/BUILD
+++ b/tensorflow/core/grappler/inputs/BUILD
@@ -2,18 +2,6 @@ licenses(["notice"])  # Apache 2.0
 
 load("//tensorflow:tensorflow.bzl", "tf_cc_test")
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 cc_library(
     name = "utils",
     srcs = [
diff --git a/tensorflow/core/grappler/op_types.cc b/tensorflow/core/grappler/op_types.cc
index 1a6751befceffa372e6e39ba40ea75c0da61f1a1..9c45aed62ffa9e34463e05c3bdfb292ba0d1a3a3 100644
--- a/tensorflow/core/grappler/op_types.cc
+++ b/tensorflow/core/grappler/op_types.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include "tensorflow/core/grappler/op_types.h"
 #include "tensorflow/core/grappler/utils.h"
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 
 namespace tensorflow {
 namespace grappler {
@@ -68,6 +69,10 @@ bool IsBitcast(const NodeDef& node) { return node.op() == "Bitcast"; }
 
 bool IsCast(const NodeDef& node) { return node.op() == "Cast"; }
 
+bool IsCheckNumerics(const NodeDef& node) {
+  return node.op() == "CheckNumerics";
+}
+
 bool IsComplex(const NodeDef& node) { return node.op() == "Complex"; }
 
 bool IsComplexAbs(const NodeDef& node) { return node.op() == "ComplexAbs"; }
@@ -240,6 +245,8 @@ bool IsPolygamma(const NodeDef& node) { return node.op() == "Polygamma"; }
 
 bool IsPow(const NodeDef& node) { return node.op() == "Pow"; }
 
+bool IsPrint(const NodeDef& node) { return node.op() == "Print"; }
+
 bool IsProd(const NodeDef& node) { return node.op() == "Prod"; }
 
 bool IsReal(const NodeDef& node) { return node.op() == "Real"; }
@@ -309,6 +316,8 @@ bool IsSplitV(const NodeDef& node) { return node.op() == "SplitV"; }
 
 bool IsSqrtGrad(const NodeDef& node) { return node.op() == "SqrtGrad"; }
 
+bool IsSquare(const NodeDef& node) { return node.op() == "Square"; }
+
 bool IsSquaredDifference(const NodeDef& node) {
   return node.op() == "SquaredDifference";
 }
@@ -358,6 +367,8 @@ bool IsTruncateDiv(const NodeDef& node) { return node.op() == "TruncateDiv"; }
 
 bool IsTruncateMod(const NodeDef& node) { return node.op() == "TruncateMod"; }
 
+bool IsUnpack(const NodeDef& node) { return node.op() == "Unpack"; }
+
 bool IsVariable(const NodeDef& node) {
   const auto& op = node.op();
   return op == "Variable" || op == "VariableV2" || op == "AutoReloadVariable" ||
@@ -402,8 +413,18 @@ bool IsFreeOfSideEffect(const NodeDef& node) {
 bool ModifiesInputsInPlace(const NodeDef& node) {
   // Some nodes do in-place updates on regular tensor inputs.
   string op_name = node.op();
+
+  // Ops that modify resource variables effectively modify one of their inputs.
+  if (op_name == "AssignVariableOp" || op_name == "AssignAddVariableOp" ||
+      op_name == "AssignSubVariableOp" || op_name == "ResourceScatterUpdate" ||
+      op_name == "ResourceScatterAdd" || op_name == "ResourceScatterSub" ||
+      op_name == "ResourceScatterMul" || op_name == "ResourceScatterDiv" ||
+      op_name == "ResourceScatterMin" || op_name == "ResourceScatterMax") {
+    return false;
+  }
+
   std::transform(op_name.begin(), op_name.end(), op_name.begin(), ::tolower);
-  if (StringPiece(op_name).contains("inplace")) {
+  if (str_util::StrContains(op_name, "inplace")) {
     return true;
   }
   return GetBoolAttr(node, "in_place") || GetBoolAttr(node, "inplace");
@@ -435,15 +456,38 @@ bool IsInvolution(const NodeDef& node) {
   return involution_ops.count(node.op()) > 0;
 }
 
-bool IsValuePreserving(const NodeDef& node) {
+bool IsValueAndOrderPreserving(const NodeDef& node) {
   if (NumNonControlInputs(node) == 1 && IsAggregate(node)) {
     return true;
   }
+  const std::unordered_set<string> value_and_order_preserving_ops{
+      "CheckNumerics",
+      "DebugGradientIdentity",
+      "DeepCopy"
+      "Enter",
+      "Exit",
+      "ExpandDims",
+      "Identity",
+      "IdentityN",
+      "PreventGradient",
+      "Print",
+      "Reshape",
+      "Snapshot",
+      "Squeeze",
+      "StopGradient",
+  };
+  return value_and_order_preserving_ops.count(node.op()) > 0;
+}
+
+bool IsValuePreserving(const NodeDef& node) {
   const std::unordered_set<string> value_preserving_ops{
-      "Transpose",  "Reshape",      "Identity",        "InvertPermutation",
-      "Reverse",    "StopGradient", "PreventGradient", "CheckNumerics",
-      "ExpandDims", "Squeeze"};
-  return value_preserving_ops.count(node.op()) > 0;
+      "InvertPermutation",
+      "Reverse",
+      "Roll",
+      "Transpose",
+  };
+  return IsValueAndOrderPreserving(node) ||
+         value_preserving_ops.count(node.op()) > 0;
 }
 
 bool HasOpDef(const NodeDef& node) {
diff --git a/tensorflow/core/grappler/op_types.h b/tensorflow/core/grappler/op_types.h
index 1ec1cd46e362502776a3edcd66bb4a35fe54fd31..79fd05e1870684c0c7ab598a90999890b6e24cf6 100644
--- a/tensorflow/core/grappler/op_types.h
+++ b/tensorflow/core/grappler/op_types.h
@@ -37,6 +37,7 @@ bool IsBiasAdd(const NodeDef& node);
 bool IsBiasAddGrad(const NodeDef& node);
 bool IsBitcast(const NodeDef& node);
 bool IsCast(const NodeDef& node);
+bool IsCheckNumerics(const NodeDef& node);
 bool IsComplex(const NodeDef& node);
 bool IsComplexAbs(const NodeDef& node);
 bool IsConj(const NodeDef& node);
@@ -94,6 +95,7 @@ bool IsNoOp(const NodeDef& node);
 bool IsNotEqual(const NodeDef& node);
 bool IsPlaceholder(const NodeDef& node);
 bool IsPolygamma(const NodeDef& node);
+bool IsPrint(const NodeDef& node);
 bool IsProd(const NodeDef& node);
 bool IsPow(const NodeDef& node);
 bool IsReal(const NodeDef& node);
@@ -121,6 +123,7 @@ bool IsSoftsignGrad(const NodeDef& node);
 bool IsSplit(const NodeDef& node);
 bool IsSplitV(const NodeDef& node);
 bool IsSqrtGrad(const NodeDef& node);
+bool IsSquare(const NodeDef& node);
 bool IsSquaredDifference(const NodeDef& node);
 bool IsSqueeze(const NodeDef& node);
 bool IsStackOp(const NodeDef& node);
@@ -138,6 +141,7 @@ bool IsTile(const NodeDef& node);
 bool IsTranspose(const NodeDef& node);
 bool IsTruncateDiv(const NodeDef& node);
 bool IsTruncateMod(const NodeDef& node);
+bool IsUnpack(const NodeDef& node);
 bool IsVariable(const NodeDef& node);
 bool IsZeta(const NodeDef& node);
 
@@ -164,6 +168,10 @@ bool ModifiesInputsInPlace(const NodeDef& node);
 // own inverse such that f(f(x)) == x.
 bool IsInvolution(const NodeDef& node);
 
+// Returns true if the op preserves the order and value of elements in its
+// first input tensor and possible changes its shape.
+bool IsValueAndOrderPreserving(const NodeDef& node);
+
 // Returns true if the op in node only rearranges the order of elements in its
 // first input tensor and possible changes its shape. More precisely, this
 // function returns true if the op commutes with all element-wise operations.
diff --git a/tensorflow/core/grappler/optimizers/BUILD b/tensorflow/core/grappler/optimizers/BUILD
index 601984fcfd45963ad0a06b4a5a0ca14649e52fad..35442c362204ff73b8e30f0888aaf959259c38f9 100644
--- a/tensorflow/core/grappler/optimizers/BUILD
+++ b/tensorflow/core/grappler/optimizers/BUILD
@@ -11,17 +11,9 @@ load(
     "//tensorflow/core:platform/default/build_config.bzl",
     "tf_protos_grappler",
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
+load(
+    "//tensorflow/core:platform/default/build_config_root.bzl",
+    "if_static",
 )
 
 cc_library(
@@ -263,6 +255,7 @@ cc_library(
         ":constant_folding",
         ":graph_optimizer",
         ":graph_optimizer_stage",
+        ":symbolic_shapes",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
@@ -272,6 +265,7 @@ cc_library(
         "//tensorflow/core/grappler:utils",
         "//tensorflow/core/grappler/costs:graph_properties",
         "//tensorflow/core/grappler/utils:frame",
+        "//tensorflow/core/grappler/utils:topological_sort",
     ],
 )
 
@@ -284,6 +278,11 @@ tf_cuda_cc_test(
         ":constant_folding",
         ":model_pruner",
         "//tensorflow/cc:cc_ops",
+        "//tensorflow/cc:cc_ops_internal",
+        "//tensorflow/core:all_kernels",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
@@ -362,9 +361,11 @@ tf_cuda_cc_test(
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:utils",
         "//tensorflow/core/grappler/inputs:trivial_test_graph_input_yielder",
+        "//tensorflow/core/grappler/utils:grappler_test",
     ],
 )
 
@@ -373,6 +374,7 @@ tf_kernel_library(
     srcs = [
         "gpu_swapping_kernels.cc",
     ],
+    visibility = ["//tensorflow:__subpackages__"],
     deps = [
         "//tensorflow/core:core_cpu_base",
         "//tensorflow/core:framework",
@@ -385,6 +387,7 @@ cc_library(
     srcs = [
         "gpu_swapping_ops.cc",
     ],
+    visibility = ["//tensorflow:__subpackages__"],
     deps = [
         "//tensorflow/core:core_cpu_base",
         "//tensorflow/core:framework",
@@ -418,10 +421,7 @@ cc_library(
         "//tensorflow/core/grappler/costs:graph_properties",
         "//tensorflow/core/grappler/utils:topological_sort",
         "//tensorflow/core/grappler/utils:traversal",
-    ] + if_cuda([
-        ":gpu_swapping_kernels",
-        ":gpu_swapping_ops",
-    ]),
+    ],
 )
 
 tf_cuda_only_cc_test(
@@ -429,6 +429,8 @@ tf_cuda_only_cc_test(
     srcs = ["memory_optimizer_test.cc"],
     tags = ["no_cuda_on_cpu_tap"],  # Do not re-enable again without actually testing.
     deps = [
+        ":gpu_swapping_kernels",
+        ":gpu_swapping_ops",
         ":memory_optimizer",
         "//tensorflow/cc:cc_ops",
         "//tensorflow/core:ops",
@@ -484,6 +486,7 @@ tf_cuda_cc_test(
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:utils",
         "//tensorflow/core/grappler/clusters:virtual_cluster",
+        "//tensorflow/core/grappler/costs:virtual_placer",
     ],
 )
 
@@ -512,6 +515,7 @@ cc_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/grappler:grappler_item",
+        "//tensorflow/core/grappler/utils:colocation",
         "//tensorflow/core/grappler/utils:topological_sort",
     ],
 )
@@ -534,11 +538,28 @@ tf_cuda_cc_test(
     ],
 )
 
+# This rule is header-only unless the build is static (--config=monolithic). Its
+# implementation is included directly in the framework shared object.
 cc_library(
     name = "custom_graph_optimizer_registry",
-    srcs = ["custom_graph_optimizer_registry.cc"],
     hdrs = ["custom_graph_optimizer_registry.h"],
     visibility = ["//visibility:public"],
+    deps = [
+        ":custom_graph_optimizer",
+        "//tensorflow/core:lib",
+    ] + if_static(
+        [":custom_graph_optimizer_registry_impl"],
+    ),
+)
+
+# This rule contains static variables for the optimizer registry. Do not depend
+# on it directly; use :custom_graph_optimizer_registry, and link against
+# libtensorflow_framework.so for the registry symbols.
+cc_library(
+    name = "custom_graph_optimizer_registry_impl",
+    srcs = ["custom_graph_optimizer_registry.cc"],
+    hdrs = ["custom_graph_optimizer_registry.h"],
+    visibility = ["//tensorflow:__subpackages__"],
     deps = [
         ":custom_graph_optimizer",
         "//tensorflow/core:lib",
@@ -630,7 +651,10 @@ cc_library(
     deps = [
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/grappler:grappler_item",
+        "//tensorflow/core/grappler:op_types",
+        "//tensorflow/core/grappler:utils",
         "//tensorflow/core/grappler/clusters:cluster",
         "//tensorflow/core/grappler/optimizers:graph_optimizer",
     ],
@@ -646,6 +670,7 @@ tf_cuda_cc_test(
         "//tensorflow/core:tensorflow",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler/utils:grappler_test",
     ],
diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
index 23e21855c8d1cb51d6087dd0e9b1abbc86a02fbc..463c33285893dce8ef2e76d8022009284b8af2c6 100644
--- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/core/grappler/optimizers/arithmetic_optimizer.h"
 
 #include <algorithm>
+#include <deque>
 #include <limits>
 #include <unordered_map>
 #include <unordered_set>
@@ -31,10 +32,12 @@ limitations under the License.
 #include "tensorflow/core/grappler/op_types.h"
 #include "tensorflow/core/grappler/optimizers/constant_folding.h"
 #include "tensorflow/core/grappler/optimizers/graph_optimizer_stage.h"
+#include "tensorflow/core/grappler/optimizers/symbolic_shapes.h"
 #include "tensorflow/core/grappler/utils.h"
-#include "tensorflow/core/grappler/utils/frame.h"
+#include "tensorflow/core/grappler/utils/topological_sort.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/tensor_coding.h"
 #include "tensorflow/core/util/device_name_utils.h"
@@ -196,54 +199,23 @@ void SetSourceDataType(DataType dtype, NodeDef* node) {
 
 bool IsNumberType(DataType dtype) { return kNumberTypes.Contains(dtype); }
 
-const char kOutputShapesAttr[] = "_output_shapes";
-
-// Shape is symbolically defined if it has a known rank, and each dimension is
-// defined, or is an unknown symbol (dim.size <= -2).
-bool ShapeIsSymbolicallyDefined(const TensorShapeProto& shape) {
-  return !shape.unknown_rank() &&
-         std::all_of(
-             shape.dim().begin(), shape.dim().end(),
-             [](const TensorShapeProto::Dim& dim) { return dim.size() != -1; });
-}
-
-bool ShapeIsSymbolicallyDefined(const OpInfo::TensorProperties& properties) {
-  return ShapeIsSymbolicallyDefined(properties.shape());
-}
-
-bool ShapesSymbolicallyEqual(const TensorShapeProto& left,
-                             const TensorShapeProto& right) {
-  if (left.unknown_rank() || right.unknown_rank() ||
-      left.dim_size() != right.dim_size()) {
-    return false;
-  }
-  for (int i = 0; i < left.dim_size(); ++i) {
-    if (left.dim(i).size() == -1 || right.dim(i).size() == -1 ||
-        left.dim(i).size() != right.dim(i).size()) {
-      return false;
-    }
-  }
-  return true;
-}
-
-bool ShapesSymbolicallyEqual(const OpInfo::TensorProperties& left,
-                             const OpInfo::TensorProperties& right) {
-  return ShapesSymbolicallyEqual(left.shape(), right.shape());
-}
-
 // Returns whether `reshape` is an identity op. The tensor that `reshape`
 // reshapes is the `output_pos`-th output of node `input`.
 bool ReshapeIsIdentity(const NodeDef& reshape, const NodeDef& input,
-                       const int output_pos) {
-  if (!reshape.attr().count(kOutputShapesAttr) ||
-      !input.attr().count(kOutputShapesAttr)) {
+                       const int output_pos,
+                       const GraphProperties& graph_properties) {
+  const std::vector<OpInfo::TensorProperties>& reshape_props =
+      graph_properties.GetOutputProperties(reshape.name());
+  const std::vector<OpInfo::TensorProperties>& input_props =
+      graph_properties.GetOutputProperties(input.name());
+  if (reshape_props.empty() || input_props.empty() ||
+      input_props.size() <= output_pos) {
     return false;
   }
 
-  PartialTensorShape src_shape(
-      input.attr().at(kOutputShapesAttr).list().shape(output_pos));
-  PartialTensorShape dst_shape(
-      reshape.attr().at(kOutputShapesAttr).list().shape(0));
+  const PartialTensorShape& src_shape = input_props[output_pos].shape();
+  const PartialTensorShape& dst_shape = reshape_props[0].shape();
+
   if (src_shape.unknown_rank() || dst_shape.unknown_rank()) {
     return false;
   }
@@ -256,7 +228,8 @@ bool ReshapeIsIdentity(const NodeDef& reshape, const NodeDef& input,
   // sizes.
   auto num_unknown_dim_sizes = [](const PartialTensorShape& partial_shape) {
     auto dim_sizes = partial_shape.dim_sizes();
-    return std::count(dim_sizes.begin(), dim_sizes.end(), -1);
+    return std::count_if(dim_sizes.begin(), dim_sizes.end(),
+                         [](int dim) { return dim < 0; });
   };
   int src_num_unknown_dim_sizes = num_unknown_dim_sizes(src_shape);
   int dst_num_unknown_dim_sizes = num_unknown_dim_sizes(dst_shape);
@@ -264,17 +237,16 @@ bool ReshapeIsIdentity(const NodeDef& reshape, const NodeDef& input,
     return false;
   }
 
-  // Now, src_shape and dst_shape have at most one dimension with unknown
-  // sizes, and are compatible. Therefore, the reshape is a no-op when
-  //
-  // 1. at least one of them is fully-defined, or
-  // 2. both are partially defined and the -1 appears on the same dimension,
-  //    i.e., IsIdenticalTo returns true.
-  if (src_num_unknown_dim_sizes == 1 && dst_num_unknown_dim_sizes == 1) {
-    return dst_shape.IsIdenticalTo(src_shape);
+  // If dst_num_unknown_dim_sizes != src_num_unknown_dim_sizes we would weaken
+  // shape inference in subsequent passes if we removed this reshape.
+  if (src_num_unknown_dim_sizes != dst_num_unknown_dim_sizes) {
+    return false;
   }
 
-  return true;
+  // Remove the reshape if both are fully defined or partially defined and the
+  // unknown or symbolic shape appears on the same dimension, i.e., if
+  // IsIdenticalTo returns true.
+  return dst_shape.IsIdenticalTo(src_shape);
 }
 
 NodeDef* GetTailOfValuePreservingChain(
@@ -307,6 +279,7 @@ class ArithmeticOptimizerStage : public GraphOptimizerStage<string> {
         ctx_ext_(ctx_ext) {}
   virtual ~ArithmeticOptimizerStage() = default;
 
+ protected:
   // Simplification graph rewrite can create additional nodes that are inputs
   // to final simplified node, they can be also added to the arithmetic
   // optimizer queue for further optimization.
@@ -316,118 +289,148 @@ class ArithmeticOptimizerStage : public GraphOptimizerStage<string> {
 
   // TODO(ezhulenev): remove this method from ArithmeticOptimizer when all
   // optimizations will be migrated to stages
-  void AddFrameControlDeps(const NodeDef* old_node,
-                           const std::vector<NodeDef*>& new_nodes,
-                           const string& source_for_ctrl_dep,
-                           const std::vector<NodeDef*>& sinks_for_control_dep) {
-    const auto frame_it = ctx_.frame_map->find(old_node);
-    if (frame_it != ctx_.frame_map->end()) {
-      for (auto node : new_nodes) {
-        ctx_.frame_map->emplace(node, frame_it->second);
-      }
-      if (!source_for_ctrl_dep.empty() && !sinks_for_control_dep.empty()) {
-        const string ctrl_dep = ConstantFolding::AddControlDependency(
-            source_for_ctrl_dep, ctx_.optimized_graph, ctx_.node_map);
-        for (auto node : sinks_for_control_dep) {
-          MaybeAddControlInput(ctrl_dep, node, ctx_.optimized_graph,
-                               ctx_.node_map);
+  void ForwardControlDependencies(
+      NodeDef* target_node, const std::vector<const NodeDef*>& src_nodes) {
+    for (const auto& src : src_nodes) {
+      for (int i = src->input_size() - 1; i >= 0; --i) {
+        if (IsControlInput(src->input(i))) {
+          *target_node->add_input() = src->input(i);
+          ctx_.node_map->AddOutput(NodeName(src->input(i)),
+                                   target_node->name());
+        } else {
+          break;
         }
       }
     }
   }
 
  private:
-  // extened context required for ArithmeticOptimizer
+  // Extended context required for ArithmeticOptimizer.
   const ArithmeticOptimizerContext ctx_ext_;
 };
 
-// Rewrite a tree of Add/AddN with a single AddN operation, consuming all the
-// original inputs of absorbed nodes.
+// Subtype of ArithmeticOptimizerStage that does optimization by rewriting a
+// group of nodes from the optimized graph.
 //
-// All nodes in a Add/AddN subgraph must have symbolically equal shape. All
-// nodes must have the same device placement.
+// * AddOpsRewrite:
+//   Rewrite a group of Add/AddN with compact Add/AddN tree
 //
-// Example:
-//                AddN_1
-//             /    |    \
-//          Add_1   z   Add_2       -> AddN(z, y, z, w, q, e)
-//          /  \        /  \
-//         x    y      w    Add_3
-//                          / \
-//                         q   e
-class AddOpsRewriteStage : public ArithmeticOptimizerStage {
+// * MinimizeBroadcasts:
+//   Rewrite a group of binary associative ops, reordering
+//   inputs, to minimize the cost of broadcast
+class ArithmeticNodesGroupOptimizerStage : public ArithmeticOptimizerStage {
  public:
-  explicit AddOpsRewriteStage(const GraphOptimizerContext& ctx,
-                              const ArithmeticOptimizerContext& ctx_ext)
-      : ArithmeticOptimizerStage("AddOpsRewrite", ctx, ctx_ext),
-        rewritten_nodes_() {}
+  explicit ArithmeticNodesGroupOptimizerStage(
+      const string& name, const GraphOptimizerContext& ctx,
+      const ArithmeticOptimizerContext ctx_ext)
+      : ArithmeticOptimizerStage(name, ctx, ctx_ext), optimized_nodes_{} {}
+  ~ArithmeticNodesGroupOptimizerStage() override = default;
+
+  // Input name with a statically inferred shape from GraphProperties
+  struct InputAndShape {
+    InputAndShape(const string& input, const TensorShapeProto& shape)
+        : input(input), shape(shape) {}
+    string input;
+    TensorShapeProto shape;
+  };
 
-  ~AddOpsRewriteStage() override = default;
+  // Subgraph (subtree) of nodes, that we want to optimize in "one shot" (e.g.
+  // all the Add nodes that we plan to rewrite with a single AddN). Subgraph is
+  // obtained by graph traversal, starting from a root node.
+  struct OptimizedNodesGroup {
+    NodeDef* root_node;
+    TensorShapeProto root_shape;
+    // Optimized nodes that will be updated or removed by rewrite
+    std::vector<NodeDef*> optimized_nodes;
+    // Inputs to optimized nodes
+    std::vector<InputAndShape> inputs;
+  };
 
-  // Check if a node can become a root of AddOpsGroup
-  bool IsSupported(const NodeDef* node) const override {
-    // check basic preconditions
-    if (!IsRewritable(node)) {
-      return false;
+  Status TrySimplify(NodeDef* node, string* simplified_node_name) override {
+    TF_RETURN_IF_ERROR(EnsureNodeIsSupported(node));
+
+    OptimizedNodesGroup group;
+    TF_RETURN_IF_ERROR(CreateOptimizedNodesGroup(node, &group));
+
+    if (!group.optimized_nodes.empty()) {
+      *simplified_node_name = RewriteOptimizedNodesGroup(group);
     }
 
-    // shape must be symbolically defined and all inputs compatible with it
-    OpInfo::TensorProperties properties;
-    Status has_properties = GetTensorProperties(node->name(), &properties);
-    return has_properties.ok() && ShapeIsSymbolicallyDefined(properties) &&
-           HasAllInputsOfSymbolicallyEqualShape(*node, properties);
+    return Status::OK();
   }
 
-  Status TrySimplify(NodeDef* node, string* simplified_node_name) override {
-    CHECK(IsSupported(node));
-    AddOpsGroup group;
-    TF_RETURN_IF_ERROR(CreateAddOpsGroup(node, &group));
+ protected:
+  // Modify the optimized graph after nodes group was successfully identified
+  virtual string RewriteOptimizedNodesGroup(
+      const OptimizedNodesGroup& group) = 0;
 
-    if (!group.absorbed_nodes.empty() && !IsRewritten(group)) {
-      *simplified_node_name = RewriteAddOpsGroup(group);
-    }
+  // Check if input can become a part of current optimized nodes group.
+  virtual bool IsAbsorbableByOptimizedNodesGroup(
+      const OptimizedNodesGroup& group, const string& input) const = 0;
 
+  Status AbsorbInputByOptimizedNodesGroup(const string& input,
+                                          OptimizedNodesGroup* group) const {
+    NodeDef* node;
+    TF_RETURN_IF_ERROR(GetInputNode(input, &node));
+
+    if (IsAbsorbableByOptimizedNodesGroup(*group, input)) {
+      for (int i = 0; i < node->input_size(); ++i) {
+        const string& input_i = node->input(i);
+        if (!IsControlInput(input)) {
+          TF_RETURN_IF_ERROR(AbsorbInputByOptimizedNodesGroup(input_i, group));
+        }
+      }
+      group->optimized_nodes.push_back(node);
+    } else {
+      // If node can't be absorbed, add it to OptimizedNodesGroup input
+      OpInfo::TensorProperties properties;
+      TF_RETURN_IF_ERROR(GetTensorProperties(input, &properties));
+      group->inputs.emplace_back(input, properties.shape());
+    }
     return Status::OK();
   }
 
- private:
-  // Holds together an add ops subgraph that we want to rewrite together.
-  //
-  // For the graph above the AddOpsGroup will be:
-  //   root_node: AddN_1
-  //   absorbed_nodes: [Add_1, Add_2]
-  //   input_nodes: [x, y, z, w, q, e]
-  struct AddOpsGroup {
-    const NodeDef* root_node;
-    TensorShapeProto root_shape;
-    // Add/AddN operations below the root level that were absorbed by this group
-    std::vector<NodeDef*> absorbed_nodes;
-    // Inputs of absorbed nodes that will be forwarded to rewritten AddN node
-    std::vector<string> inputs;
-  };
+  Status CreateOptimizedNodesGroup(NodeDef* root_node,
+                                   OptimizedNodesGroup* group) const {
+    OpInfo::TensorProperties root_node_output_properties;
+    TF_RETURN_IF_ERROR(
+        GetTensorProperties(root_node->name(), &root_node_output_properties));
 
-  // Check if all inputs have symbolically equal shapes
-  bool HasAllInputsOfSymbolicallyEqualShape(
+    group->root_node = root_node;
+    group->root_shape = root_node_output_properties.shape();
+
+    group->optimized_nodes.reserve(root_node->input_size());
+    for (int i = 0; i < root_node->input_size(); ++i) {
+      const string& input_i = root_node->input(i);
+      if (!IsControlInput(input_i)) {
+        TF_RETURN_IF_ERROR(AbsorbInputByOptimizedNodesGroup(input_i, group));
+      }
+    }
+
+    return Status::OK();
+  }
+
+  // Check if all inputs can be broadcasted to the same shape
+  // TODO(ezhulenev): move to GraphOptimizerStage?
+  bool HasAllInputsBroadcastableToShape(
       const NodeDef& node, const OpInfo::TensorProperties& properties) const {
-    const AddOpsRewriteStage* self = this;
-    return std::all_of(
-        node.input().begin(), node.input().end(),
-        [self, &properties](const string& input) {
-          OpInfo::TensorProperties input_properties;
-          Status has_input_properties =
-              self->GetTensorProperties(input, &input_properties);
-          return has_input_properties.ok() &&
-                 ShapesSymbolicallyEqual(properties, input_properties);
-        });
-  }
-
-  // TODO(ezhulenev): use GraphRewriter?
+    auto is_broadcastable = [this, &properties](const string& input) {
+      OpInfo::TensorProperties input_props;
+      Status has_input_properties = GetTensorProperties(input, &input_props);
+      return has_input_properties.ok() &&
+             ShapesBroadcastable(properties, input_props);
+    };
+    return std::all_of(node.input().begin(), node.input().end(),
+                       is_broadcastable);
+  }
+
+  // TODO(ezhulenev): move to GraphOptimizerStage?
   bool IsDrivenByControlDependency(const NodeDef& node) const {
     return std::any_of(node.input().begin(), node.input().end(),
                        IsControlInput);
   }
 
-  // TODO(ezhulenev): use GraphRewriter?
+  // TODO(ezhulenev): move to GraphOptimizerStage?
   bool DrivesControlDependency(const NodeDef& node) const {
     int position;
     for (const NodeDef* output : ctx_.node_map->GetOutputs(node.name())) {
@@ -442,15 +445,90 @@ class AddOpsRewriteStage : public ArithmeticOptimizerStage {
     return false;
   }
 
-  // Check if a node can be absorbed by current AddOpsGroup
-  bool IsAbsorbableByAddOpsGroup(const string& name, const AddOpsGroup& group) {
+  string ShapeSignature(const TensorShapeProto& shape) const {
+    string signature = strings::StrCat("rank:", shape.dim_size(), ":dim");
+    for (int i = 0; i < shape.dim_size(); ++i)
+      strings::StrAppend(&signature, ":", shape.dim(i).size());
+    return signature;
+  }
+
+  void AddToOptimizedNodes(const NodeDef* node) {
+    optimized_nodes_.insert(node->name());
+  }
+
+  bool IsOnTheSameDevice(const OptimizedNodesGroup& group,
+                         const NodeDef& node) const {
+    return group.root_node->device() == node.device();
+  }
+
+  bool IsInPreserveSet(const NodeDef& node) const {
+    return ctx_.nodes_to_preserve->find(node.name()) !=
+           ctx_.nodes_to_preserve->end();
+  }
+
+  bool IsAlreadyOptimized(const NodeDef& node) const {
+    return optimized_nodes_.find(node.name()) != optimized_nodes_.end();
+  }
+
+ private:
+  // set of nodes already processed by this optimizer stage
+  std::unordered_set<string> optimized_nodes_;
+};
+
+// Rewrite a tree of Add/AddN with a single AddN operation, consuming all the
+// original inputs of absorbed nodes.
+//
+// 1) All nodes must have the same device placement.
+//
+// 2) If All nodes in a Add/AddN subgraph have symbolically equal shape, tree is
+//    optimized to a single AddN node.
+//
+//                AddN_1
+//             /    |    \
+//          Add_1   z   Add_2       -> AddN(x, y, z, w, q, e)
+//          /  \        /  \
+//         x    y      w    Add_3
+//                          / \
+//                         q   e
+//
+// 3) If some nodes have different shape (it needs to be broadcastable to the
+//    shape of a "root), tree is optimized to AddNs for symbolically equal
+//    shapes, and a tree of Add ops, that minimize broadcasts.
+//
+//                AddN_1                                 Add
+//             /    |    \                              /  \
+//          Add_1   z   Add_2       ->               Add    w
+//          /  \        /  \                        /   \
+//         x    y      w    Add_3      AddN(x, y, q, e)  z
+//                          / \
+//                         q   e
+class AddOpsRewriteStage : public ArithmeticNodesGroupOptimizerStage {
+ public:
+  explicit AddOpsRewriteStage(const GraphOptimizerContext& ctx,
+                              const ArithmeticOptimizerContext& ctx_ext)
+      : ArithmeticNodesGroupOptimizerStage("AddOpsRewrite", ctx, ctx_ext) {}
+  ~AddOpsRewriteStage() override = default;
+
+  // Check if a node can become a root of AddOpsGroup
+  bool IsSupported(const NodeDef* node) const override {
+    if (!CanOptimize(node)) return false;
+
+    // shape must be symbolically defined and all inputs compatible with it
+    OpInfo::TensorProperties properties;
+    Status has_properties = GetTensorProperties(node->name(), &properties);
+    return has_properties.ok() && ShapeIsSymbolicallyDefined(properties) &&
+           HasAllInputsBroadcastableToShape(*node, properties);
+  }
+
+ protected:
+  // Check if a node can be absorbed by current OptimizedNodesGroup
+  bool IsAbsorbableByOptimizedNodesGroup(const OptimizedNodesGroup& group,
+                                         const string& input) const override {
     NodeDef* node;
-    Status node_status = GetInputNode(name, &node);
-    if (!node_status.ok()) {
-      return false;
-    }
-    // check basic preconditions
-    if (!IsRewritable(node)) {
+    Status node_status = GetInputNode(input, &node);
+    if (!node_status.ok() || !CanOptimize(node)) return false;
+
+    if (!IsOnTheSameDevice(group, *node)) {
       return false;
     }
     // with a single output data consumer (presumably if we reach this node from
@@ -459,149 +537,178 @@ class AddOpsRewriteStage : public ArithmeticOptimizerStage {
     if (NumNonControlDataOutputs(*node, *ctx_.node_map) != 1) {
       return false;
     }
-    // must be on the same device as a root node
-    if (node->device() != group.root_node->device()) {
-      return false;
-    }
-    // All input shapes must be symbolically defined and equal to the node shape
+    // All input shapes must be broadcastable to the node shape
     OpInfo::TensorProperties properties;
-    Status has_properties = GetTensorProperties(name, &properties);
+    Status has_properties = GetTensorProperties(input, &properties);
     return has_properties.ok() &&
-           HasAllInputsOfSymbolicallyEqualShape(*node, properties);
+           HasAllInputsBroadcastableToShape(*node, properties);
   }
 
   // Node requirements both for a root node and an absorbed node
-  bool IsRewritable(const NodeDef* node) const {
-    // only Add or AddN can be a root node
+  bool CanOptimize(const NodeDef* node) const {
     // TODO(ezhulenev): check if AccumulateNV2 can be supported too
     if (!IsAdd(*node) && !IsAddN(*node)) {
       return false;
     }
-    // it must not be in a preserve set
-    if (ctx_.nodes_to_preserve->find(node->name()) !=
-        ctx_.nodes_to_preserve->end()) {
+    if (IsInPreserveSet(*node) || IsAlreadyOptimized(*node)) {
       return false;
     }
-    // it must not be a node created or absorbed by previous iteration
-    if (rewritten_nodes_.find(node->name()) != rewritten_nodes_.end()) {
+    // it must not be created by this stage at any of previous optimization runs
+    if (str_util::StrContains(node->name(), stage_name_)) {
       return false;
     }
-    // should not drive or be driven by control dependency
     // TODO(ezhulenev): relax this condition for root node
     return !(IsDrivenByControlDependency(*node) ||
              DrivesControlDependency(*node));
   }
 
-  // Check that optimized group node name doesn't exists. It might happen if
-  // graph optimized multiple times without pruning between invocations.
-  bool IsRewritten(const AddOpsGroup& group) const {
-    return ctx_.node_map->NodeExists(AddOpsGroupName(group));
-  }
-
-  // Create an AddOpsGroup with a root in a given node
-  Status CreateAddOpsGroup(const NodeDef* root_node, AddOpsGroup* group) {
-    OpInfo::TensorProperties root_node_output_properties;
-    TF_RETURN_IF_ERROR(
-        GetTensorProperties(root_node->name(), &root_node_output_properties));
-
-    group->root_node = root_node;
-    group->root_shape = root_node_output_properties.shape();
+  // Rewrite a group of add ops into a single AddN if all input shapes are
+  // symbolically equal. If not, create AddN for equal shapes first, and then
+  // build an Add tree, minimizing the cost of broadcasts.
+  string RewriteOptimizedNodesGroup(const OptimizedNodesGroup& group) override {
+    // all new nodes will be placed under the scope of a root node
+    auto root_scope_and_name = ParseNodeScopeAndName(group.root_node->name());
 
-    group->absorbed_nodes.reserve(root_node->input_size());
-    for (int i = 0; i < root_node->input_size(); ++i) {
-      TF_RETURN_IF_ERROR(AbsorbInputByAddOpsGroup(root_node->input(i), group));
+    // Find what shapes are present in the inputs of absorbed nodes
+    std::unordered_map<string, std::vector<InputAndShape>> shape_sig_to_inputs;
+    for (const auto& input : group.inputs) {
+      shape_sig_to_inputs[ShapeSignature(input.shape)].push_back(input);
     }
 
-    return Status::OK();
-  }
+    // Collect all the shapes from representative elements
+    std::vector<TensorShapeProto> shapes;
+    shapes.reserve(shape_sig_to_inputs.size());
+    for (const auto& el : shape_sig_to_inputs)
+      shapes.push_back(el.second[0].shape);
+
+    // If all inputs have the same shape, rewrite whole group with a single AddN
+    if (shapes.size() == 1) {
+      string node_name = OptimizedNodeName(root_scope_and_name);
+      AddInputsOfSymbolicallyEqualShape(*group.root_node, node_name,
+                                        group.inputs);
+      return node_name;
+    }
 
-  Status AbsorbInputByAddOpsGroup(const string& input, AddOpsGroup* group) {
-    NodeDef* node;
-    TF_RETURN_IF_ERROR(GetInputNode(input, &node));
+    // For inputs of different shapes:
+    // 1. Rewrite inputs of the same shape using AddN (leaf nodes)
+    // 2. Build a tree of Add nodes, minimizing cost of broadcast
+    std::sort(shapes.begin(), shapes.end(),
+              [](const TensorShapeProto& left, const TensorShapeProto& right) {
+                return CompareSymbolicallyShapedTensorSizes(left, right);
+              });
+
+    // optimized name for leaf AddN nodes
+    auto leaf_node_name = [&root_scope_and_name, this](int i) {
+      return OptimizedNodeName(root_scope_and_name,
+                               strings::StrCat("Leaf_", i));
+    };
+    // optimized name for internal nodes of a tree built up from AddN leaves
+    auto internal_node_name = [&root_scope_and_name, this](int i) {
+      return OptimizedNodeName(root_scope_and_name,
+                               strings::StrCat("Internal_", i));
+    };
+
+    // Add/AddN nodes that must be added to the tree
+    std::deque<InputAndShape> add_ops;
+
+    // Prepare leaf AddN nodes for inputs of equal shape
+    for (int i = 0; i < shapes.size(); ++i) {
+      const auto node_name = leaf_node_name(i);
+      const auto& inputs = shape_sig_to_inputs[ShapeSignature(shapes[i])];
+      add_ops.push_back(AddInputsOfSymbolicallyEqualShape(*group.root_node,
+                                                          node_name, inputs));
+    }
 
-    if (IsAbsorbableByAddOpsGroup(input, *group)) {
-      group->absorbed_nodes.push_back(node);
-      for (int i = 0; i < node->input_size(); ++i) {
-        TF_RETURN_IF_ERROR(AbsorbInputByAddOpsGroup(node->input(i), group));
-      }
-    } else {
-      // If node can't be absorbed, add it to AddOpsGroup input
-      group->inputs.push_back(input);
+    // Build up a tree of Add ops
+    int internal_nodes = 0;
+    do {
+      const InputAndShape lhs = add_ops.front();
+      add_ops.pop_front();
+      const InputAndShape rhs = add_ops.front();
+      add_ops.pop_front();
+      string name = add_ops.empty() ? OptimizedNodeName(root_scope_and_name)
+                                    : internal_node_name(internal_nodes++);
+      InputAndShape add = AddAggregatedInputs(*group.root_node, name, lhs, rhs);
+      add_ops.push_front(add);
+    } while (add_ops.size() > 1);
+
+    InputAndShape optimized_root_node = add_ops.front();
+    return optimized_root_node.input;
+  }
+
+  // Add 'AddN' node to aggregate inputs of symbolically equal shape
+  InputAndShape AddInputsOfSymbolicallyEqualShape(
+      const NodeDef& root_node, const string& node_name,
+      const std::vector<InputAndShape>& inputs) {
+    CHECK(!inputs.empty()) << "Inputs must be non-empty";
+
+    // Do not create redundant AddN nodes
+    if (inputs.size() == 1) {
+      return inputs[0];
     }
-    return Status::OK();
-  }
 
-  // New node for AddOpsGroup is added to the same scope as a root_node. All
-  // absorbed nodes are stripped of their scope, and only names are used in a
-  // new node name.
-  //
-  // Example: AddOpsGroup(root="a/b/c/Add_2", absorbed=["d/Add_1", "e/Add"])
-  //          node_name="a/b/c/AddOpsGroup_Add_2_Add_1_Add
-  string AddOpsGroupName(const AddOpsGroup& group) const {
-    CHECK_NOTNULL(group.root_node);
+    // get shape from representative element
+    auto shape = inputs[0].shape;
 
-    auto root = ParseNodeScopeAndName(group.root_node->name());
+    // copy attributes from a root node
+    DataType dtype = root_node.attr().at("T").type();
 
-    std::vector<string> absorbed_node_names(group.absorbed_nodes.size());
-    std::transform(group.absorbed_nodes.begin(), group.absorbed_nodes.end(),
-                   absorbed_node_names.begin(),
-                   [](const NodeDef* node) { return node->name(); });
+    // add new AddN node
+    NodeDef* node = AddEmptyNode(node_name);
+    node->set_op("AddN");
+    node->set_device(root_node.device());
+    (*node->mutable_attr())["T"].set_type(dtype);
+    (*node->mutable_attr())["N"].set_i(inputs.size());
+
+    for (const auto& inputAndShape : inputs) {
+      ctx_.node_map->AddOutput(inputAndShape.input, node_name);
+      node->add_input(inputAndShape.input);
+    }
 
-    return OptimizedNodeName(root, absorbed_node_names);
+    AddToOptimizedNodes(node);
+    return InputAndShape(node_name, shape);
   }
 
-  // Create a new node for a AddOpsGroup and return it's name.
-  string RewriteAddOpsGroup(const AddOpsGroup& group) {
-    CHECK_GT(group.absorbed_nodes.size(), 0)
-        << "AddOpsGroup must have non empty absorbed nodes";
-
-    // name for a new node constructed from AddOpsGroup
-    string node_name = AddOpsGroupName(group);
-
+  // Add a single 'Add' node to sum two inputs
+  InputAndShape AddAggregatedInputs(const NodeDef& root_node,
+                                    const string& node_name,
+                                    const InputAndShape& left,
+                                    const InputAndShape& right) {
     // copy attributes from a root node
-    DataType dtype = group.root_node->attr().at("T").type();
-
-    // add new AddN node
-    NodeDef* added_node = AddEmptyNode(node_name);
-    added_node->set_op("AddN");
-    added_node->set_device(group.root_node->device());
-    (*added_node->mutable_attr())["T"].set_type(dtype);
-    (*added_node->mutable_attr())["N"].set_i(group.inputs.size());
+    DataType dtype = root_node.attr().at("T").type();
 
-    // all inputs of absorbed nodes are added to the new node
-    for (const string& input : group.inputs) {
-      ctx_.node_map->AddOutput(input, node_name);
-      added_node->add_input(input);
-    }
-
-    // Add frame dependencies that the original node might have had.
-    AddFrameControlDeps(group.root_node, {added_node}, "", {});
+    // add new Add node
+    NodeDef* node = AddEmptyNode(node_name);
+    node->set_op("Add");
+    node->set_device(root_node.device());
+    (*node->mutable_attr())["T"].set_type(dtype);
 
-    VLOG(1) << "Absorbed " << group.absorbed_nodes.size()
-            << " Add/AddN nodes from the graph";
+    ctx_.node_map->AddOutput(left.input, node_name);
+    ctx_.node_map->AddOutput(right.input, node_name);
 
-    // keep track of nodes that were created or absorbed as a part of rewrite
-    rewritten_nodes_.insert(node_name);
-    for (const NodeDef* absorbed : group.absorbed_nodes) {
-      rewritten_nodes_.insert(absorbed->name());
-    }
+    node->add_input(left.input);
+    node->add_input(right.input);
 
-    return node_name;
+    AddToOptimizedNodes(node);
+    return InputAndShape(
+        node_name, TensorShapeProto());  // shape is not important at this point
   }
-
-  // keep nodes that were added or absorbed as a part of AddOpsGroup rewrite
-  std::unordered_set<string> rewritten_nodes_;
 };
 
-// Use the commutativity and (left- and right-) distributive property of
-// multiplication over addition to hoist common factors out of aggregate nodes
-// where all the inputs are Mul nodes. This pattern occurs frequently in
-// regularization terms for the gradients during training.
+// Use the distributive property of multiplication and division over addition,
+// along with commutativity of the former, to hoist common factors/denominators
+// out of aggregate nodes where ALL the inputs are Mul/Div nodes.
+// This pattern occurs frequently in regularization terms for the gradients
+// during training.
 //
 // For example, we can rewrite an expression of the form:
 //   AddN(Mul(x, y1), Mul(y2, x), Mul(x, y3), ... Mul(x, yn))
 // to the following:
 //   Mul(x, AddN(y1, y2, y3, ... yn))
+// For division, we can rewrite
+//   AddN(Div(y1, x), Div(y2, x), Div(y3, x), ... Div(yn, x))
+// to:
+//   Div(AddN(y1, y2, y3, ... yn), x)
 class HoistCommonFactorOutOfAggregation : public ArithmeticOptimizerStage {
  public:
   explicit HoistCommonFactorOutOfAggregation(
@@ -616,10 +723,13 @@ class HoistCommonFactorOutOfAggregation : public ArithmeticOptimizerStage {
   }
 
   Status TrySimplify(NodeDef* node, string* simplified_node_name) override {
-    CHECK(IsSupported(node));
+    TF_RETURN_IF_ERROR(EnsureNodeIsSupported(node));
 
+    bool common_factor_is_denominator = false;
     std::set<string> common_factors;
-    TF_RETURN_IF_ERROR(GetCommonFactors(node, &common_factors));
+    std::vector<string> ctrl_deps;
+    TF_RETURN_IF_ERROR(GetCommonFactors(
+        node, &common_factors, &common_factor_is_denominator, &ctrl_deps));
 
     if (common_factors.size() == 1) {
       const string& common_factor = *common_factors.begin();
@@ -627,49 +737,61 @@ class HoistCommonFactorOutOfAggregation : public ArithmeticOptimizerStage {
       // Gather up the non-shared factors
       bool shapes_match = true;
       std::vector<string> unique_factors;
-      TF_RETURN_IF_ERROR(GetUniqueFactors(node, common_factor, &shapes_match,
-                                          &unique_factors));
+      TF_RETURN_IF_ERROR(GetUniqueFactors(node, common_factor,
+                                          common_factor_is_denominator,
+                                          &shapes_match, &unique_factors));
 
       if (shapes_match) {
         NodeDef* input_0;
         TF_RETURN_IF_ERROR(GetInputNode(node->input(0), &input_0));
 
-        // Use a copy of the first Mul node for the outer multiplication.
-        NodeDef* new_mul_node = AddCopyNode(OuterMulNodeName(node), input_0);
+        // Use a copy of the first node for the outer multiplication/division.
+        NodeDef* new_outer_node = AddCopyNode(
+            OuterNodeName(node, common_factor_is_denominator), input_0);
         // And a copy of aggregation node as one of the inner operands
         NodeDef* new_add_node = AddCopyNode(InnerAddNodeName(node), node);
 
-        new_mul_node->set_device(node->device());
-        new_mul_node->set_input(0, common_factor);
-        new_mul_node->set_input(1, new_add_node->name());
+        new_outer_node->set_device(node->device());
+        if (common_factor_is_denominator) {
+          new_outer_node->set_input(0, new_add_node->name());
+          new_outer_node->set_input(1, common_factor);
+        } else {
+          new_outer_node->set_input(0, common_factor);
+          new_outer_node->set_input(1, new_add_node->name());
+        }
 
-        ctx_.node_map->AddOutput(common_factor, new_mul_node->name());
-        ctx_.node_map->AddOutput(new_add_node->name(), new_mul_node->name());
+        ctx_.node_map->AddOutput(common_factor, new_outer_node->name());
+        ctx_.node_map->AddOutput(new_add_node->name(), new_outer_node->name());
 
         // Hoist non-shared factors up into the new AddN node.
         for (int i = 0; i < unique_factors.size(); ++i) {
-          new_add_node->set_input(i, unique_factors[i]);
+          const string& unique_factor_i = unique_factors[i];
+          new_add_node->set_input(i, unique_factor_i);
+          ctx_.node_map->AddOutput(unique_factor_i, new_add_node->name());
         }
 
-        // Add frame dependencies that the original node might have had.
-        AddFrameControlDeps(node, {new_add_node, new_mul_node}, common_factor,
-                            {new_add_node});
+        // Add control deps on add node
+        for (const string& ctrl_dep : ctrl_deps) {
+          *new_add_node->add_input() = ctrl_dep;
+          ctx_.node_map->AddOutput(NodeName(ctrl_dep), new_add_node->name());
+        }
 
         // optimize new inner aggregation node
         AddToOptimizationQueue(new_add_node);
         // do not optimize the same node twice
         rewritten_nodes_.insert(node->name());
-        *simplified_node_name = new_mul_node->name();
+        *simplified_node_name = new_outer_node->name();
       }
     }
     return Status::OK();
   }
 
  private:
-  // Get a name for new outer Mul node
-  string OuterMulNodeName(const NodeDef* node) const {
+  // Get a name for new outer node
+  string OuterNodeName(const NodeDef* node, bool is_div) const {
     auto scope_and_name = ParseNodeScopeAndName(node->name());
-    return OptimizedNodeName(scope_and_name, "Mul");
+    return is_div ? OptimizedNodeName(scope_and_name, "Div")
+                  : OptimizedNodeName(scope_and_name, "Mul");
   }
 
   // Get a name new inner Add node
@@ -678,24 +800,56 @@ class HoistCommonFactorOutOfAggregation : public ArithmeticOptimizerStage {
     return OptimizedNodeName(scope_and_name, "Add");
   }
 
-  // Determine the set of common factors if the input nodes are all Mul nodes.
-  Status GetCommonFactors(const NodeDef* node,
-                          std::set<string>* common_factors) const {
+  // Determine the set of common factors if the input nodes are all Mul or
+  // Div nodes.
+  Status GetCommonFactors(const NodeDef* node, std::set<string>* common_factors,
+                          bool* common_factor_is_denominator,
+                          std::vector<string>* ctrl_deps) const {
     CHECK(common_factors->empty());
+    CHECK_NOTNULL(common_factor_is_denominator);
+    *common_factor_is_denominator = false;
 
+    bool has_mul = false;
+    bool has_div = false;
     for (int i = 0; i < node->input_size(); ++i) {
       if (i > 0 && common_factors->empty()) break;
-      if (IsControlInput(node->input(i))) break;
-
+      if (IsControlInput(node->input(i))) {
+        ctrl_deps->push_back(node->input(i));
+        continue;
+      }
       NodeDef* input;
       TF_RETURN_IF_ERROR(GetInputNode(node->input(i), &input));
 
-      if (!IsMul(*input)) {
+      if ((!IsMul(*input) && !IsAnyDiv(*input)) || (IsMul(*input) && has_div) ||
+          (IsAnyDiv(*input) && has_mul)) {
+        // Break if input is neither a Mul or Div, or if there are both Mul &
+        // Div Ops.
         common_factors->clear();
         break;
+      } else if (IsAnyDiv(*input)) {
+        has_div = true;
+        // In case of possible common dividers, we avoid hoisting out if any
+        // input is not float/double, since integer division is not distributive
+        // over addition.
+        OpInfo::TensorProperties properties0, properties1;
+        TF_RETURN_IF_ERROR(GetTensorProperties(input->input(0), &properties0));
+        TF_RETURN_IF_ERROR(GetTensorProperties(input->input(1), &properties1));
+        if (properties0.dtype() != DT_FLOAT &&
+            properties0.dtype() != DT_DOUBLE &&
+            properties1.dtype() != DT_FLOAT &&
+            properties1.dtype() != DT_DOUBLE) {
+          common_factors->clear();
+          break;
+        }
+      } else if (IsMul(*input)) {
+        has_mul = true;
       }
 
-      std::set<string> factors_i{input->input(0), input->input(1)};
+      // We only focus on common factors from denominators if any Op is a
+      // Div.
+      std::set<string> factors_i =
+          has_mul ? std::set<string>{input->input(0), input->input(1)}
+                  : std::set<string>{input->input(1)};
       if (i == 0) {
         std::swap(*common_factors, factors_i);
       } else {
@@ -706,7 +860,12 @@ class HoistCommonFactorOutOfAggregation : public ArithmeticOptimizerStage {
             std::inserter(intersection, intersection.begin()));
         std::swap(*common_factors, intersection);
       }
+      for (int i = 2; i < input->input_size(); ++i) {
+        ctrl_deps->push_back(input->input(i));
+      }
     }
+
+    *common_factor_is_denominator = has_div;
     return Status::OK();
   }
 
@@ -715,6 +874,7 @@ class HoistCommonFactorOutOfAggregation : public ArithmeticOptimizerStage {
   // have the same shape since the other aggregation ops do not support
   // broadcasting.
   Status GetUniqueFactors(const NodeDef* node, const string& common_factor,
+                          const bool common_factor_is_denominator,
                           bool* shapes_match,
                           std::vector<string>* unique_factors) const {
     *shapes_match = true;
@@ -725,11 +885,13 @@ class HoistCommonFactorOutOfAggregation : public ArithmeticOptimizerStage {
       if (IsControlInput(input)) {
         break;
       }
-      NodeDef* mul_node;
-      TF_RETURN_IF_ERROR(GetInputNode(input, &mul_node));
+      NodeDef* inner_node;
+      TF_RETURN_IF_ERROR(GetInputNode(input, &inner_node));
       const int unique_factor_index =
-          mul_node->input(0) == common_factor ? 1 : 0;
-      unique_factors->push_back(mul_node->input(unique_factor_index));
+          common_factor_is_denominator
+              ? 0
+              : (inner_node->input(0) == common_factor ? 1 : 0);
+      unique_factors->push_back(inner_node->input(unique_factor_index));
       if (i > 0 && !IsAdd(*node)) {
         OpInfo::TensorProperties lhs;
         OpInfo::TensorProperties rhs;
@@ -745,13 +907,209 @@ class HoistCommonFactorOutOfAggregation : public ArithmeticOptimizerStage {
     // if graph rewrite happens in multiple passes without graph pruning between
     // them, it's possible that rewritten node already exists in a graph
     return rewritten_nodes_.find(node->name()) != rewritten_nodes_.end() ||
-           ctx_.node_map->NodeExists(OuterMulNodeName(node));
+           ctx_.node_map->NodeExists(OuterNodeName(node, false)) ||
+           ctx_.node_map->NodeExists(OuterNodeName(node, true));
   }
 
   // keep names of the nodes that were optimized by this stage
   std::unordered_set<string> rewritten_nodes_;
 };
 
+// Binary associative ops can be re-ordered to minimize the number of broadcasts
+// and the size of a temporary tensors.
+//
+// Example: [a, c] - scalars, [b, d] - matrices
+//   @ - binary associative op (Add or Mul)
+//   @* - broadcast
+//
+//           @                      @*
+//        /     \                /      \
+//      @*       @*      ->     @        @
+//    /   \    /   \          /   \    /   \
+//   a     b  c     d        a     c  b     d
+class MinimizeBroadcasts : public ArithmeticNodesGroupOptimizerStage {
+ public:
+  explicit MinimizeBroadcasts(const GraphOptimizerContext& ctx,
+                              const ArithmeticOptimizerContext& ctx_ext)
+      : ArithmeticNodesGroupOptimizerStage("MinimizeBroadcasts", ctx, ctx_ext) {
+  }
+  ~MinimizeBroadcasts() override = default;
+
+  bool IsSupported(const NodeDef* node) const override {
+    if (!IsBinaryAssociative(*node)) return false;
+
+    // has a symbolically defined shape with broadcastable inputs
+    OpInfo::TensorProperties properties;
+    Status has_properties = GetTensorProperties(node->name(), &properties);
+    return has_properties.ok() && ShapeIsSymbolicallyDefined(properties) &&
+           HasAllInputsBroadcastableToShape(*node, properties);
+  }
+
+ protected:
+  bool IsBinaryAssociative(const NodeDef& node) const {
+    return IsMul(node) || IsAdd(node);
+  }
+
+  bool IsSameOp(const OptimizedNodesGroup& group, const NodeDef& node) const {
+    return group.root_node->op() == node.op();
+  }
+
+  // Check if a node can be absorbed by current OptimizedNodesGroup
+  bool IsAbsorbableByOptimizedNodesGroup(const OptimizedNodesGroup& group,
+                                         const string& input) const override {
+    NodeDef* node;
+    Status node_status = GetInputNode(input, &node);
+    if (!node_status.ok()) return false;
+
+    if (!IsSameOp(group, *node)) {
+      return false;
+    }
+    if (IsInPreserveSet(*node) || IsAlreadyOptimized(*node)) {
+      return false;
+    }
+    if (IsDrivenByControlDependency(*node) || DrivesControlDependency(*node)) {
+      return false;
+    }
+    if (!IsOnTheSameDevice(group, *node)) {
+      return false;
+    }
+    // Optimized nodes updated in place, and that would break the graph, if the
+    // node has multiple output consumers
+    if (NumNonControlOutputs(*node, *ctx_.node_map) != 1) {
+      return false;
+    }
+    // All input shapes must be broadcastable to the node shape
+    OpInfo::TensorProperties properties;
+    Status has_properties = GetTensorProperties(input, &properties);
+    return has_properties.ok() &&
+           HasAllInputsBroadcastableToShape(*node, properties);
+  }
+
+  std::size_t CountUniqueShapes(const std::vector<InputAndShape>& inputs) {
+    std::set<string> sigs;
+    for (const auto& ias : inputs) {
+      sigs.insert(ShapeSignature(ias.shape));
+    }
+    return sigs.size();
+  }
+
+  string RewriteOptimizedNodesGroup(const OptimizedNodesGroup& group) override {
+    if (CountUniqueShapes(group.inputs) <= 1) {
+      // nothing to optimize when all shapes are the same
+      return group.root_node->name();
+    }
+
+    auto num_nodes = /*root*/ 1 + group.optimized_nodes.size();
+    auto num_inputs = group.inputs.size();
+    CHECK_EQ(num_nodes, num_inputs - 1)
+        << "Can't build a tree with " << num_inputs << " inputs, using "
+        << num_nodes << "binary op nodes.";
+
+    std::deque<InputAndShape> add_ops(group.inputs.begin(), group.inputs.end());
+    std::deque<NodeDef*> optimized_nodes(group.optimized_nodes.begin(),
+                                         group.optimized_nodes.end());
+
+    // sort inputs by it's shape from smallest to largest
+    std::stable_sort(add_ops.begin(), add_ops.end(),
+                     [](const InputAndShape& lhs, const InputAndShape& rhs) {
+                       return CompareSymbolicallyShapedTensorSizes(lhs.shape,
+                                                                   rhs.shape);
+                     });
+
+    // If there is an odd number of inputs, last one is the largest, and we want
+    // to attach it to the root node, to build a well balanced tree.
+    std::deque<InputAndShape> add_ops_leftover;
+    if (add_ops.size() % 2 != 0) {
+      add_ops_leftover.push_back(add_ops.back());
+      add_ops.pop_back();
+    }
+
+    // At this point it's guaranteed that add_ops have even number of inputs.
+    do {
+      const InputAndShape lhs = add_ops.front();
+      add_ops.pop_front();
+      const InputAndShape rhs = add_ops.front();
+      add_ops.pop_front();
+
+      NodeDef* node;
+      if (!optimized_nodes.empty()) {
+        // re-purpose optimized nodes to build a new tree
+        node = optimized_nodes.front();
+        optimized_nodes.pop_front();
+      } else {
+        // or use root node if none optimized nodes left
+        node = group.root_node;
+      }
+      InputAndShape updated_node = UpdateInputs(lhs.input, rhs.input, node);
+
+      // Pushing updated node to the back of a deque will create a wide and
+      // short tree, pushing to the front will create a tall tree. We prefer to
+      // get a wide tree, it minimizes the potential number of temporary tensors
+      // required to keep in memory, though sometimes we can go up to prevent
+      // propagating a brodcast from leaves to the root. Example:
+      //
+      // inputs: [s, s, s, M] (s - scalar, M - matrix)
+      // @* - op with broadcast
+      //
+      //  (only push_back)           @*     (push_front first op)
+      //                            /  \
+      //       @*                  @    M
+      //     /   \                / \
+      //    @     @*      ->     @   s
+      //   / \   / \            / \
+      //  s   s s   M          s   s
+      if (add_ops.size() >= 2 &&
+          CompareSymbolicallyShapedTensorSizes(add_ops.at(0).shape,
+                                               add_ops.at(1).shape)) {
+        add_ops.push_front(updated_node);
+      } else {
+        add_ops.push_back(updated_node);
+      }
+    } while (add_ops.size() > 1);
+    CHECK_EQ(1, add_ops.size());
+
+    // attach the largest tensor to the root op
+    if (!add_ops_leftover.empty()) {
+      const InputAndShape lhs = add_ops.front();
+      add_ops.pop_front();
+      const InputAndShape rhs = add_ops_leftover.front();
+      InputAndShape updated_node =
+          UpdateInputs(lhs.input, rhs.input, group.root_node);
+      add_ops.push_back(updated_node);
+    }
+
+    return add_ops.front().input;
+  }
+
+  InputAndShape UpdateInputs(const string& input_0, const string& input_1,
+                             NodeDef* node) {
+    string old_input_0 = node->input(0);
+    string old_input_1 = node->input(1);
+
+    // Update inputs only if they changed
+    if (old_input_0 != input_0 || old_input_1 != input_1) {
+      node->set_input(0, input_0);
+      node->set_input(1, input_1);
+      // Invalidate node properties (shape)
+      ctx_.graph_properties->ClearOutputProperties(node->name());
+      ctx_.graph_properties->ClearInputProperties(node->name());
+      // Update the node map
+      ctx_.node_map->RemoveOutput(NodeName(old_input_0), node->name());
+      ctx_.node_map->RemoveOutput(NodeName(old_input_1), node->name());
+      ctx_.node_map->AddOutput(NodeName(input_0), node->name());
+      ctx_.node_map->AddOutput(NodeName(input_1), node->name());
+      // Add updated node to optimization queue
+      AddToOptimizationQueue(node);
+    }
+
+    // Do not add updated node to any other group
+    AddToOptimizedNodes(node);
+
+    TensorShapeProto shape;  // shape is not important at this point
+    return InputAndShape(node->name(), shape);
+  }
+};
+
 // Removes inverse transpose nodes
 class RemoveIdentityTranspose : public ArithmeticOptimizerStage {
  public:
@@ -767,19 +1125,24 @@ class RemoveIdentityTranspose : public ArithmeticOptimizerStage {
   // TODO(rmlarsen): Forward control dependencies on the bypassed
   // transpose nodes.
   Status TrySimplify(NodeDef* node, string* simplified_node_name) override {
-    CHECK(IsSupported(node));
+    TF_RETURN_IF_ERROR(EnsureNodeIsSupported(node));
 
     NodeDef* input;
     TF_RETURN_IF_ERROR(GetInputNode(node->input(0), &input));
     NodeDef* node_perm;
     TF_RETURN_IF_ERROR(GetInputNode(node->input(1), &node_perm));
+    if (!IsConstant(*node_perm)) {
+      return Status::OK();
+    }
     std::vector<int64> node_perm_values;
     TF_RETURN_IF_ERROR(GetPermutation(*node_perm, &node_perm_values));
-
     if (input->op() == node->op()) {
       // Remove pairs of transposes that cancel each other.
       NodeDef* input_perm;
       TF_RETURN_IF_ERROR(GetInputNode(input->input(1), &input_perm));
+      if (!IsConstant(*input_perm)) {
+        return Status::OK();
+      }
       std::vector<int64> input_perm_values;
       TF_RETURN_IF_ERROR(GetPermutation(*input_perm, &input_perm_values));
       if (AreInversePermutations(node_perm_values, input_perm_values)) {
@@ -851,7 +1214,7 @@ class RemoveRedundantBitcastStage : public ArithmeticOptimizerStage {
   }
 
   Status TrySimplify(NodeDef* node, string* simplified_node_name) override {
-    CHECK(IsSupported(node));
+    TF_RETURN_IF_ERROR(EnsureNodeIsSupported(node));
 
     // Bypass Bitcast whose source type and destination type are equal.
     if (GetSourceDataType(*node) == GetDestinationDataType(*node)) {
@@ -889,7 +1252,8 @@ class RemoveRedundantCastStage : public ArithmeticOptimizerStage {
   bool IsSupported(const NodeDef* node) const override { return IsCast(*node); }
 
   Status TrySimplify(NodeDef* node, string* simplified_node_name) override {
-    CHECK(IsSupported(node));
+    TF_RETURN_IF_ERROR(EnsureNodeIsSupported(node));
+
     // Bypass Cast whose source type and destination type are equal.
     if (GetSourceDataType(*node) == GetDestinationDataType(*node)) {
       *simplified_node_name = node->input(0);
@@ -1089,7 +1453,8 @@ namespace {
 
 bool FeedsInPlaceOp(const SimpleGraphView& graph_view, const NodeDef& node) {
   const std::unordered_set<string> op_types_to_traverse = {
-      node.op(), "Identity", "IdentityN", "Reshape"};
+      node.op(),    "Identity", "IdentityN", "Reshape",
+      "ExpandDims", "Enter",    "Switch",    "Merge"};
   int node_idx = graph_view.index(node.name());
   std::set<int> node_fanout;
   graph_view.DepthFirstSearch(op_types_to_traverse, node_idx, &node_fanout);
@@ -1190,20 +1555,15 @@ void ArithmeticOptimizer::DedupComputations() {
   }
 }
 
-void ArithmeticOptimizer::AddFrameControlDeps(
-    const NodeDef* old_node, const std::vector<NodeDef*>& new_nodes,
-    const string& source_for_ctrl_dep,
-    const std::vector<NodeDef*>& sinks_for_control_dep) {
-  const auto frame_it = frame_map_.find(old_node);
-  if (frame_it != frame_map_.end()) {
-    for (auto node : new_nodes) {
-      frame_map_.emplace(node, frame_it->second);
-    }
-    if (!source_for_ctrl_dep.empty() && !sinks_for_control_dep.empty()) {
-      const string ctrl_dep = ConstantFolding::AddControlDependency(
-          source_for_ctrl_dep, optimized_graph_, node_map_.get());
-      for (auto node : sinks_for_control_dep) {
-        MaybeAddControlInput(ctrl_dep, node, optimized_graph_, node_map_.get());
+void ArithmeticOptimizer::ForwardControlDependencies(
+    NodeDef* target_node, const std::vector<const NodeDef*>& src_nodes) {
+  for (const auto& src : src_nodes) {
+    for (int i = src->input_size() - 1; i >= 0; --i) {
+      if (IsControlInput(src->input(i))) {
+        *target_node->add_input() = src->input(i);
+        node_map_->AddOutput(NodeName(src->input(i)), target_node->name());
+      } else {
+        break;
       }
     }
   }
@@ -1255,23 +1615,22 @@ string ArithmeticOptimizer::TrySimplifyAndReplaceUses(
     //      ^      |
     //      |      |
     //    input ---+
-    NodeDef* reshape = node_map_->GetNode(node->name());
+    NodeDef* reshape = const_cast<NodeDef*>(node);
     int output_pos = 0;
-    string input_node_name = ParseNodeName(node->input(0), &output_pos);
+    string input_node_name = ParseNodeName(reshape->input(0), &output_pos);
     const NodeDef* input = node_map_->GetNode(input_node_name);
-    if (input->op() == "Reshape") {
+    if (input->op() == "Reshape" && !HasControlInputs(*input)) {
       reshape->set_input(0, input->input(0));
       node_map_->UpdateInput(reshape->name(), input->name(), input->input(0));
       nodes_to_simplify->PushBack(reshape);
       return reshape->name();
     }
 
-    // If the reshape is a no-op, forward its input to its consumers. This is
-    // considered aggressive, because users may state that the placeholder
-    // outputs tensors of shape [M, N] while feeding it with tensors of shape
-    // [M*N] (or worse). The reshape nodes are then necessary to update the
-    // tensor metadata to the required shape.
-    if (ReshapeIsIdentity(*reshape, *input, output_pos)) {
+    // If the reshape is a no-op, forward its input to its consumers, unless it
+    // anchors a control dependency since we want to make sure that control
+    // dependency is triggered.
+    if (ReshapeIsIdentity(*reshape, *input, output_pos, *graph_properties_) &&
+        !HasControlInputs(*reshape)) {
       return reshape->input(0);
     }
   }
@@ -1299,8 +1658,8 @@ string ArithmeticOptimizer::TrySimplifyAndReplaceUses(
     // with image.type than with dst_type.
     if (DeviceNameUtils::SplitDeviceName(transpose->device(), &dontcare,
                                          &device) &&
-        (StringPiece(device).contains(DEVICE_CPU) ||
-         StringPiece(device).contains(DEVICE_GPU))) {
+        (str_util::StrContains(device, DEVICE_CPU) ||
+         str_util::StrContains(device, DEVICE_GPU))) {
       const NodeDef* cast = node_map_->GetNode(transpose->input(0));
       if (cast->op() == "Cast") {
         const NodeDef* input = node_map_->GetNode(cast->input(0));
@@ -1324,10 +1683,7 @@ string ArithmeticOptimizer::TrySimplifyAndReplaceUses(
           node_map_->AddOutput(new_transpose->name(), new_cast->name());
 
           nodes_to_simplify->PushBack(new_transpose);
-          //  Add frame dependencies that the original node might have had.
-          AddFrameControlDeps(node, {new_transpose, new_cast},
-                              new_transpose->input(0), {new_transpose});
-
+          ForwardControlDependencies(new_transpose, {cast, node});
           return new_cast->name();
         }
       }
@@ -1401,7 +1757,7 @@ string ArithmeticOptimizer::TrySimplifyAndReplaceUses(
             node_map_->AddOutput(weights->name(), scaled_weights->name());
             scaled_weights->add_input(mul->input(1));
             node_map_->AddOutput(scale->name(), scaled_weights->name());
-            AddFrameControlDeps(node, {scaled_weights}, "", {});
+            ForwardControlDependencies(scaled_weights, {source});
 
             // Update `conv`'s weights to `scaled_weights`.
             conv->set_input(1, scaled_weights->name());
@@ -1437,7 +1793,7 @@ string ArithmeticOptimizer::TrySimplifyAndReplaceUses(
   }
 
   if (IsAggregate(*node) && NumNonControlInputs(*node) > 0) {
-    // Discard aggregate nodes with a single input.
+    // Discard aggregate nodes with a single input and no control dependencies.
     if (node->input_size() == 1) {
       return node->input(0);
     }
@@ -1483,6 +1839,8 @@ string ArithmeticOptimizer::TrySimplifyAndReplaceUses(
         return "";
       }
       new_const_node->set_device(node->device());
+      MaybeAddControlInput(NodeName(node->input(0)), new_const_node,
+                           optimized_graph_, node_map_.get());
       nodes_to_simplify->PushBack(new_const_node);
 
       // 2. Replace the aggregate node with Mul(Const(N), x).
@@ -1495,9 +1853,7 @@ string ArithmeticOptimizer::TrySimplifyAndReplaceUses(
       new_mul_node->add_input(node->input(0));
       node_map_->AddOutput(node->input(0), new_mul_node->name());
 
-      CopyControlInputs(*node, new_mul_node, optimized_graph_, node_map_.get());
-      AddFrameControlDeps(node, {new_const_node, new_mul_node}, node->input(0),
-                          {new_const_node});
+      ForwardControlDependencies(new_mul_node, {node});
       return new_mul_node->name();
     }
   }
@@ -1530,7 +1886,6 @@ string ArithmeticOptimizer::TrySimplifyAndReplaceUses(
         FlipBooleanAttr(attr_a, new_op);
         new_op->set_input(0, a->input(0));
         node_map_->UpdateInput(new_op->name(), a->name(), a->input(0));
-        AddFrameControlDeps(node, {new_op}, a->input(0), {new_op});
       }
       if (b_is_foldable) {
         const string attr_b =
@@ -1538,10 +1893,15 @@ string ArithmeticOptimizer::TrySimplifyAndReplaceUses(
         FlipBooleanAttr(attr_b, new_op);
         new_op->set_input(1, b->input(0));
         node_map_->UpdateInput(new_op->name(), b->name(), b->input(0));
-        if (!a_is_foldable) {
-          AddFrameControlDeps(node, {new_op}, b->input(0), {new_op});
-        }
       }
+      std::vector<const NodeDef*> deps_to_forward({node});
+      if (a_is_foldable) {
+        deps_to_forward.push_back(a);
+      }
+      if (b_is_foldable) {
+        deps_to_forward.push_back(b);
+      }
+      ForwardControlDependencies(new_op, deps_to_forward);
     }
   }
 
@@ -1563,7 +1923,7 @@ string ArithmeticOptimizer::TrySimplifyAndReplaceUses(
                                                        : "Transpose");
       new_op->set_input(0, input->input(0));
       node_map_->UpdateInput(new_op->name(), node->name(), input->input(0));
-      AddFrameControlDeps(node, {new_op}, "", {});
+      ForwardControlDependencies(new_op, {node, input});
       return new_op->name();
     }
   }
@@ -1571,7 +1931,7 @@ string ArithmeticOptimizer::TrySimplifyAndReplaceUses(
   return "";
 }
 
-Status ArithmeticOptimizer::SimplifyArithmeticOps() {
+Status ArithmeticOptimizer::SimplifyArithmeticOps(bool can_use_shapes) {
   SetVector<NodeDef*> nodes_to_simplify;
   nodes_to_simplify.Reserve(optimized_graph_->node_size());
   for (int i = 0; i < optimized_graph_->node_size(); ++i) {
@@ -1579,38 +1939,29 @@ Status ArithmeticOptimizer::SimplifyArithmeticOps() {
   }
 
   const GraphOptimizerContext ctx(&nodes_to_preserve_, optimized_graph_,
-                                  graph_properties_.get(), node_map_.get(),
-                                  &frame_map_);
+                                  graph_properties_.get(), node_map_.get());
   const ArithmeticOptimizerContext ctx_ext(&nodes_to_simplify);
 
-  std::vector<std::unique_ptr<ArithmeticOptimizerStage>> stages;
-
-  if (options_.combine_add_to_addn) {
-    stages.push_back(std::unique_ptr<ArithmeticOptimizerStage>(
-        new AddOpsRewriteStage(ctx, ctx_ext)));
-  }
-  if (options_.hoist_common_factor_out_of_aggregation) {
-    stages.push_back(std::unique_ptr<ArithmeticOptimizerStage>(
-        new HoistCommonFactorOutOfAggregation(ctx, ctx_ext)));
-  }
-  if (options_.remove_identity_transpose) {
-    stages.push_back(std::unique_ptr<ArithmeticOptimizerStage>(
-        new RemoveIdentityTranspose(ctx, ctx_ext)));
-  }
-  if (options_.remove_redundant_bitcast) {
-    stages.push_back(std::unique_ptr<ArithmeticOptimizerStage>(
-        new RemoveRedundantBitcastStage(ctx, ctx_ext)));
-  }
-  if (options_.remove_redundant_cast) {
-    stages.push_back(std::unique_ptr<ArithmeticOptimizerStage>(
-        new RemoveRedundantCastStage(ctx, ctx_ext)));
-  }
-  if (options_.remove_negation) {
-    stages.push_back(std::unique_ptr<ArithmeticOptimizerStage>(
-        new RemoveNegationStage(ctx, ctx_ext)));
-  }
-
-  VLOG(1) << "Simplify arithmetic ops using " << stages.size()
+  // Stop pipeline after first stage returning non-empty simplified tensor name.
+  const auto stop = [](const string& result) { return !result.empty(); };
+  GraphOptimizerStagePipeline<string> pipeline(stop);
+
+  if (options_.combine_add_to_addn && can_use_shapes)
+    pipeline.AddStage<AddOpsRewriteStage>(ctx, ctx_ext);
+  if (options_.hoist_common_factor_out_of_aggregation && can_use_shapes)
+    pipeline.AddStage<HoistCommonFactorOutOfAggregation>(ctx, ctx_ext);
+  if (options_.minimize_broadcasts && can_use_shapes)
+    pipeline.AddStage<MinimizeBroadcasts>(ctx, ctx_ext);
+  if (options_.remove_identity_transpose && can_use_shapes)
+    pipeline.AddStage<RemoveIdentityTranspose>(ctx, ctx_ext);
+  if (options_.remove_redundant_bitcast)
+    pipeline.AddStage<RemoveRedundantBitcastStage>(ctx, ctx_ext);
+  if (options_.remove_redundant_cast)
+    pipeline.AddStage<RemoveRedundantCastStage>(ctx, ctx_ext);
+  if (options_.remove_negation)
+    pipeline.AddStage<RemoveNegationStage>(ctx, ctx_ext);
+
+  VLOG(1) << "Simplify arithmetic ops using " << pipeline.NumStages()
           << " arithmetic optimization stages";
 
   while (!nodes_to_simplify.Empty()) {
@@ -1623,22 +1974,13 @@ Status ArithmeticOptimizer::SimplifyArithmeticOps() {
     }
 
     // if it was not simplified try to run it through all configured stages
-    if (simplified_tensor.empty()) {
-      for (auto& stage : stages) {
-        if (stage->IsSupported(node)) {
-          TF_RETURN_IF_ERROR(stage->TrySimplify(node, &simplified_tensor));
-          if (!simplified_tensor.empty()) {
-            break;
-          }
-        }
+    if (!stop(simplified_tensor)) {
+      bool optimized = pipeline.PassThroughAllStages(node, &simplified_tensor);
+      if (!optimized) {
+        continue;
       }
     }
 
-    // if it's still empty go to the next Node
-    if (simplified_tensor.empty()) {
-      continue;
-    }
-
     // re-wire consumers of an old node to the new one
     if (NodeName(simplified_tensor) != node->name()) {
       // Always consider simplified_tensor for further optimizations.
@@ -1681,31 +2023,32 @@ Status ArithmeticOptimizer::SimplifyArithmeticOps() {
 Status ArithmeticOptimizer::Optimize(Cluster* /*cluster*/,
                                      const GrapplerItem& item,
                                      GraphDef* optimized_graph) {
-  optimized_graph_ = optimized_graph;
-  *optimized_graph_ = item.graph;
+  GrapplerItem optimized_item(item);
+  optimized_graph_ = &optimized_item.graph;
 
   // Set up helper data structures.
   nodes_to_preserve_ = item.NodesToPreserve();
   fetch_nodes_known_ = !item.fetch.empty();
   node_map_.reset(new NodeMap(optimized_graph_));
-  int num_frames;
-  TF_RETURN_IF_ERROR(IdentifyFramesWithNodeMap(*optimized_graph_, *node_map_,
-                                               &frame_map_, &num_frames));
-  // Shapes are only needed in aggressive mode.
-  graph_properties_.reset(new GraphProperties(item));
-  TF_RETURN_IF_ERROR(graph_properties_->InferStatically(false));
-  // TODO(ezhulenev): Use GraphProperties to lookup tensor shapes directly
-  TF_RETURN_IF_ERROR(graph_properties_->AnnotateOutputShapes(optimized_graph_));
 
-  // Perform the optimizations.
   DedupComputations();
-  TF_RETURN_IF_ERROR(SimplifyArithmeticOps());
 
-  // Clear output shapes.
-  for (int i = 0; i < optimized_graph->node_size(); ++i) {
-    optimized_graph_->mutable_node(i)->mutable_attr()->erase(kOutputShapesAttr);
+  // Perform topological sort on the graph in order to help AddOpsRewrite to
+  // optimize larger subgraphs starting from the roots with more inputs.
+  TF_RETURN_IF_ERROR(TopologicalSort(optimized_graph_));
+
+  // Shapes are only needed in aggressive mode.
+  graph_properties_.reset(new GraphProperties(item));
+  const Status status = graph_properties_->InferStatically(false);
+  const bool can_use_shapes = status.ok();
+  if (!can_use_shapes) {
+    VLOG(1) << "Shape inference failed." << status.error_message();
   }
 
+  // Perform the optimizations.
+  TF_RETURN_IF_ERROR(SimplifyArithmeticOps(can_use_shapes));
+
+  optimized_graph->Swap(optimized_graph_);
   return Status::OK();
 }
 
diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h
index 965f0e9ea25e1aac9ff266949bf6e20b5f0a30c9..c0fe8839ca7bd111eaf95cf742a8f1de841dac14 100644
--- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h
@@ -20,7 +20,6 @@ limitations under the License.
 #include "tensorflow/core/grappler/costs/graph_properties.h"
 #include "tensorflow/core/grappler/optimizers/graph_optimizer.h"
 #include "tensorflow/core/grappler/utils.h"
-#include "tensorflow/core/grappler/utils/frame.h"
 #include "tensorflow/core/protobuf/rewriter_config.pb.h"
 
 namespace tensorflow {
@@ -60,6 +59,7 @@ class ArithmeticOptimizer : public GraphOptimizer {
     bool enable_try_simplify_and_replace = true;
     bool combine_add_to_addn = false;
     bool hoist_common_factor_out_of_aggregation = true;
+    bool minimize_broadcasts = false;
     bool remove_identity_transpose = true;
     bool remove_redundant_bitcast = true;
     bool remove_redundant_cast = true;
@@ -69,7 +69,13 @@ class ArithmeticOptimizer : public GraphOptimizer {
     // optimization level by default.
     static ArithmeticOptimizerOptions Default(
         RewriterConfig::Toggle opt_level) {
-      return ArithmeticOptimizerOptions();
+      ArithmeticOptimizerOptions options;
+      // TODO(ezhulenev): enable by default after 1.8 release cut
+      if (opt_level == RewriterConfig::AGGRESSIVE) {
+        options.combine_add_to_addn = true;
+        options.minimize_broadcasts = true;
+      }
+      return options;
     }
   };
 
@@ -94,17 +100,13 @@ class ArithmeticOptimizer : public GraphOptimizer {
   // Dedup redundant nodes in the graph.
   void DedupComputations();
 
-  // Fix frame dependencies by adding control dependencies from old_input to
-  // nodes in new_nodes_for_control_dep, and update frame_map for all nodes in
-  // new_nodes.
-  void AddFrameControlDeps(const NodeDef* old_node,
-                           const std::vector<NodeDef*>& new_nodes,
-                           const string& source_for_ctrl_dep,
-                           const std::vector<NodeDef*>& sinks_for_control_dep);
+  // Forward the control dependencies anchored on src_nodes to the target_nodes.
+  void ForwardControlDependencies(NodeDef* target_node,
+                                  const std::vector<const NodeDef*>& src_nodes);
 
   // Runs peep-hole optimizations on `optimized_graph`, e.g., removing inverse
   // transposes.
-  Status SimplifyArithmeticOps();
+  Status SimplifyArithmeticOps(bool can_use_shapes);
   // Tries to simplify the expression that roots at `node` and replaces the uses
   // of `node` to the simplified expression. Returns the name of the simplified
   // tensor (e.g. "split:1") or an emtpy string if no simplification is
@@ -129,7 +131,6 @@ class ArithmeticOptimizer : public GraphOptimizer {
   bool fetch_nodes_known_ = false;
   std::unordered_set<string> nodes_to_preserve_;
   std::unique_ptr<NodeMap> node_map_;
-  FrameMap frame_map_;
   std::unique_ptr<GraphProperties> graph_properties_;
   GraphDef* optimized_graph_ = nullptr;  // Not owned.
 };
diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc b/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc
index 792f67504387102b22438cc7a3c0dc35a8d7f2f7..e63981285810a24fb6d97e09533290fb64dce05d 100644
--- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc
@@ -31,6 +31,9 @@ namespace grappler {
 
 namespace {
 
+constexpr char kHoistFactorOptimizerDiv[] =
+    "ArithmeticOptimizer/HoistCommonFactor_Div_";
+
 constexpr char kHoistFactorOptimizerMul[] =
     "ArithmeticOptimizer/HoistCommonFactor_Mul_";
 
@@ -42,6 +45,11 @@ string HoistMulName(const string& name) {
   return AddPrefixToNodeName(name, kHoistFactorOptimizerMul, "");
 }
 
+// Optimized name of outer Div node by HoistCommonFactorOutOfAggregation
+string HoistDivName(const string& name) {
+  return AddPrefixToNodeName(name, kHoistFactorOptimizerDiv, "");
+}
+
 // Optimized name of inner Add node by HoistCommonFactorOutOfAggregation
 string HoistAddName(const string& name) {
   return AddPrefixToNodeName(name, kHoistFactorOptimizerAdd, "");
@@ -93,6 +101,7 @@ class ArithmeticOptimizerTest : public GrapplerTest {
     options.enable_try_simplify_and_replace = false;
     options.combine_add_to_addn = false;
     options.hoist_common_factor_out_of_aggregation = false;
+    options.minimize_broadcasts = false;
     options.remove_identity_transpose = false;
     options.remove_redundant_bitcast = false;
     options.remove_redundant_cast = false;
@@ -113,6 +122,11 @@ class ArithmeticOptimizerTest : public GrapplerTest {
     optimizer->options_.hoist_common_factor_out_of_aggregation = true;
   }
 
+  void EnableOnlyMinimizeBroadcasts(ArithmeticOptimizer* optimizer) {
+    DisableAllStages(optimizer);
+    optimizer->options_.minimize_broadcasts = true;
+  }
+
   void EnableOnlyRemoveIdentityTranspose(ArithmeticOptimizer* optimizer) {
     DisableAllStages(optimizer);
     optimizer->options_.remove_identity_transpose = true;
@@ -156,25 +170,22 @@ TEST_F(ArithmeticOptimizerTest, OpDedupping) {
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
   item.fetch = {"div"};
 
-  ArithmeticOptimizer optimizer;
-  GraphDef output;
   auto tensors_expected = EvaluateNodes(item.graph, item.fetch);
   EXPECT_EQ(1, tensors_expected.size());
-  Status status = optimizer.Optimize(nullptr, item, &output);
-  TF_EXPECT_OK(status);
-  // Run the optimizer twice to make sure the rewrite is idempotent.
-  item.graph.Swap(&output);
-  status = optimizer.Optimize(nullptr, item, &output);
-  TF_EXPECT_OK(status);
 
+  ArithmeticOptimizer optimizer;
+  GraphDef output;
+  OptimizeTwice(&optimizer, &item, &output);
+  NodeMap node_map(&output);
   EXPECT_EQ(2, output.node_size());
-  const NodeDef& new_c1 = output.node(0);
-  EXPECT_EQ("c1", new_c1.name());
-  const NodeDef& new_div = output.node(1);
-  EXPECT_EQ("div", new_div.name());
-  EXPECT_EQ(2, new_div.input_size());
-  EXPECT_EQ("c1", new_div.input(0));
-  EXPECT_EQ("c1", new_div.input(1));
+  const NodeDef* new_c1 = node_map.GetNode("c1");
+  ASSERT_NE(new_c1, nullptr);
+
+  const NodeDef* new_div = node_map.GetNode("div");
+  ASSERT_NE(new_div, nullptr);
+  EXPECT_EQ(2, new_div->input_size());
+  EXPECT_EQ("c1", new_div->input(0));
+  EXPECT_EQ("c1", new_div->input(1));
 
   auto tensors = EvaluateNodes(output, item.fetch);
   EXPECT_EQ(1, tensors.size());
@@ -195,23 +206,30 @@ TEST_F(ArithmeticOptimizerTest, OpDeduppingAssertAndCheckNumerics) {
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
   item.fetch = {"div"};
+  Tensor bool_t(DT_BOOL, TensorShape({}));
+  bool_t.scalar<bool>().setConstant(true);
+  auto tensors_expected =
+      EvaluateNodes(item.graph, item.fetch, {{"Placeholder", bool_t}});
+  EXPECT_EQ(1, tensors_expected.size());
 
   ArithmeticOptimizer optimizer;
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
-  TF_EXPECT_OK(status);
-  // Run the optimizer twice to make sure the rewrite is idempotent.
-  item.graph.Swap(&output);
-  status = optimizer.Optimize(nullptr, item, &output);
-  TF_EXPECT_OK(status);
+
+  OptimizeTwice(&optimizer, &item, &output);
+  NodeMap node_map(&output);
 
   EXPECT_EQ(5, output.node_size());
-  const NodeDef& new_div = output.node(3);
-  EXPECT_EQ(4, new_div.input_size());
-  EXPECT_EQ("check1", new_div.input(0));
-  EXPECT_EQ("check1", new_div.input(1));
-  EXPECT_EQ("^assert1", new_div.input(2));
-  EXPECT_EQ("^assert1", new_div.input(3));
+  const NodeDef* new_div = node_map.GetNode("div");
+  ASSERT_NE(new_div, nullptr);
+  EXPECT_EQ(4, new_div->input_size());
+  EXPECT_EQ("check1", new_div->input(0));
+  EXPECT_EQ("check1", new_div->input(1));
+  EXPECT_EQ("^assert1", new_div->input(2));
+  EXPECT_EQ("^assert1", new_div->input(3));
+
+  auto tensors = EvaluateNodes(output, item.fetch, {{"Placeholder", bool_t}});
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorNear<double>(tensors_expected[0], tensors[0], 1e-6);
 }
 
 TEST_F(ArithmeticOptimizerTest, OpDedupCommutative) {
@@ -223,32 +241,34 @@ TEST_F(ArithmeticOptimizerTest, OpDedupCommutative) {
   Output div1 = ops::Div(s.WithOpName("div1"), mul1, mul2);
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
-  item.fetch = {"div"};
+  item.fetch = {"div1"};
+  auto tensors_expected = EvaluateNodes(item.graph, item.fetch);
+  EXPECT_EQ(1, tensors_expected.size());
 
   ArithmeticOptimizer optimizer;
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
-  TF_EXPECT_OK(status);
-  // Run the optimizer twice to make sure the rewrite is idempotent.
-  item.graph.Swap(&output);
-  status = optimizer.Optimize(nullptr, item, &output);
-  TF_EXPECT_OK(status);
+  OptimizeTwice(&optimizer, &item, &output);
+  NodeMap node_map(&output);
 
   EXPECT_EQ(4, output.node_size());
-  const NodeDef& new_c1 = output.node(0);
-  EXPECT_EQ("c1", new_c1.name());
-  const NodeDef& new_c2 = output.node(1);
-  EXPECT_EQ("c2", new_c2.name());
-  const NodeDef& new_mul1 = output.node(2);
-  EXPECT_EQ("mul1", new_mul1.name());
-  EXPECT_EQ(2, new_mul1.input_size());
-  EXPECT_EQ("c1", new_mul1.input(0));
-  EXPECT_EQ("c2", new_mul1.input(1));
-  const NodeDef& new_div1 = output.node(3);
-  EXPECT_EQ("div1", new_div1.name());
-  EXPECT_EQ(2, new_div1.input_size());
-  EXPECT_EQ("mul1", new_div1.input(0));
-  EXPECT_EQ("mul1", new_div1.input(1));
+  const NodeDef* new_c1 = node_map.GetNode("c1");
+  ASSERT_NE(new_c1, nullptr);
+  const NodeDef* new_c2 = node_map.GetNode("c2");
+  ASSERT_NE(new_c2, nullptr);
+  const NodeDef* new_mul1 = node_map.GetNode("mul1");
+  ASSERT_NE(new_mul1, nullptr);
+  EXPECT_EQ(2, new_mul1->input_size());
+  EXPECT_EQ("c1", new_mul1->input(0));
+  EXPECT_EQ("c2", new_mul1->input(1));
+  const NodeDef* new_div1 = node_map.GetNode("div1");
+  ASSERT_NE(new_div1, nullptr);
+  EXPECT_EQ(2, new_div1->input_size());
+  EXPECT_EQ("mul1", new_div1->input(0));
+  EXPECT_EQ("mul1", new_div1->input(1));
+
+  auto tensors = EvaluateNodes(output, item.fetch);
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-6);
 }
 
 TEST_F(ArithmeticOptimizerTest, MulToSquare) {
@@ -259,6 +279,9 @@ TEST_F(ArithmeticOptimizerTest, MulToSquare) {
   Output id = ops::Identity(s.WithOpName("id"), mul);
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  std::vector<string> fetch = {"id"};
+  auto tensors_expected = EvaluateNodes(item.graph, fetch);
+  EXPECT_EQ(1, tensors_expected.size());
 
   ArithmeticOptimizer optimizer;
   GraphDef output;
@@ -273,6 +296,10 @@ TEST_F(ArithmeticOptimizerTest, MulToSquare) {
   EXPECT_EQ(2, output.node(4).input_size());
   EXPECT_EQ("c", output.node(4).input(0));
   EXPECT_EQ("^d", output.node(4).input(1));
+
+  auto tensors = EvaluateNodes(output, fetch);
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-6);
 }
 
 TEST_F(ArithmeticOptimizerTest, SimplifyInvolutionsReal) {
@@ -285,6 +312,9 @@ TEST_F(ArithmeticOptimizerTest, SimplifyInvolutionsReal) {
   Output id = ops::Identity(s.WithOpName("id"), recip2);
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  std::vector<string> fetch = {"id"};
+  auto tensors_expected = EvaluateNodes(item.graph, fetch);
+  EXPECT_EQ(1, tensors_expected.size());
 
   ArithmeticOptimizer optimizer;
   GraphDef output;
@@ -295,6 +325,10 @@ TEST_F(ArithmeticOptimizerTest, SimplifyInvolutionsReal) {
   EXPECT_EQ("c", output.node(1).input(0));
   EXPECT_EQ("c", output.node(3).input(0));
   EXPECT_EQ("c", output.node(5).input(0));
+
+  auto tensors = EvaluateNodes(output, fetch);
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-6);
 }
 
 TEST_F(ArithmeticOptimizerTest, SimplifyInvolutionsWithChain) {
@@ -307,6 +341,9 @@ TEST_F(ArithmeticOptimizerTest, SimplifyInvolutionsWithChain) {
   Output id2 = ops::Identity(s.WithOpName("id2"), recip2);
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  std::vector<string> fetch = {"id2"};
+  auto tensors_expected = EvaluateNodes(item.graph, fetch);
+  EXPECT_EQ(1, tensors_expected.size());
 
   ArithmeticOptimizer optimizer;
   GraphDef output;
@@ -320,6 +357,10 @@ TEST_F(ArithmeticOptimizerTest, SimplifyInvolutionsWithChain) {
   EXPECT_EQ(6, output.node_size());
   EXPECT_EQ("squeeze", output.node(5).input(0));
   EXPECT_EQ("c", output.node(2).input(0));
+
+  auto tensors = EvaluateNodes(output, fetch);
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-6);
 }
 
 TEST_F(ArithmeticOptimizerTest, SimplifyInvolutionsWithControlChain) {
@@ -334,6 +375,10 @@ TEST_F(ArithmeticOptimizerTest, SimplifyInvolutionsWithControlChain) {
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
 
+  std::vector<string> fetch = {"id2"};
+  auto tensors_expected = EvaluateNodes(item.graph, fetch);
+  EXPECT_EQ(1, tensors_expected.size());
+
   ArithmeticOptimizer optimizer;
   GraphDef output;
   Status status = optimizer.Optimize(nullptr, item, &output);
@@ -351,6 +396,10 @@ TEST_F(ArithmeticOptimizerTest, SimplifyInvolutionsWithControlChain) {
       EXPECT_EQ(original.input(j), optimized.input(j));
     }
   }
+
+  auto tensors = EvaluateNodes(output, fetch);
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-6);
 }
 
 TEST_F(ArithmeticOptimizerTest, TrivialSumsSimple) {
@@ -362,28 +411,35 @@ TEST_F(ArithmeticOptimizerTest, TrivialSumsSimple) {
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
 
+  std::vector<string> fetch = {"id"};
+  auto tensors_expected = EvaluateNodes(item.graph, fetch);
+  EXPECT_EQ(1, tensors_expected.size());
+
   ArithmeticOptimizer optimizer;
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
-  TF_EXPECT_OK(status);
-  // Run the optimizer twice to make sure the rewrite is idempotent.
-  item.graph.Swap(&output);
-  status = optimizer.Optimize(nullptr, item, &output);
-  TF_EXPECT_OK(status);
+  OptimizeTwice(&optimizer, &item, &output);
+  NodeMap node_map(&output);
 
   EXPECT_EQ(5, output.node_size());
-  const NodeDef& new_const = output.node(3);
-  EXPECT_EQ(OptimizedName("add_const"), new_const.name());
-  EXPECT_EQ("^x", new_const.input(0));
+
+  const NodeDef* new_const = node_map.GetNode(OptimizedName("add_const"));
+  ASSERT_NE(new_const, nullptr);
+  EXPECT_EQ("^x", new_const->input(0));
   EXPECT_EQ(std::string("\0\0\0@", 4),
-            new_const.attr().at("value").tensor().tensor_content());
-  const NodeDef& new_mul = output.node(4);
-  EXPECT_EQ(OptimizedName("add_mul"), new_mul.name());
-  EXPECT_EQ(OptimizedName("add_const"), new_mul.input(0));
-  EXPECT_EQ("x", new_mul.input(1));
-  const NodeDef& new_id = output.node(2);
-  EXPECT_EQ("id", new_id.name());
-  EXPECT_EQ(OptimizedName("add_mul"), new_id.input(0));
+            new_const->attr().at("value").tensor().tensor_content());
+
+  const NodeDef* new_mul = node_map.GetNode(OptimizedName("add_mul"));
+  ASSERT_NE(new_mul, nullptr);
+  EXPECT_EQ(OptimizedName("add_const"), new_mul->input(0));
+  EXPECT_EQ("x", new_mul->input(1));
+
+  const NodeDef* new_id = node_map.GetNode("id");
+  ASSERT_NE(new_id, nullptr);
+  EXPECT_EQ(OptimizedName("add_mul"), new_id->input(0));
+
+  auto tensors = EvaluateNodes(output, fetch);
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-6);
 }
 
 TEST_F(ArithmeticOptimizerTest, TrivialSumsSimpleWithControlDep) {
@@ -396,29 +452,36 @@ TEST_F(ArithmeticOptimizerTest, TrivialSumsSimpleWithControlDep) {
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
 
+  std::vector<string> fetch = {"id"};
+  auto tensors_expected = EvaluateNodes(item.graph, fetch);
+  EXPECT_EQ(1, tensors_expected.size());
+
   ArithmeticOptimizer optimizer;
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
-  TF_EXPECT_OK(status);
-  // Run the optimizer twice to make sure the rewrite is idempotent.
-  item.graph.Swap(&output);
-  status = optimizer.Optimize(nullptr, item, &output);
-  TF_EXPECT_OK(status);
+  OptimizeTwice(&optimizer, &item, &output);
+  NodeMap node_map(&output);
 
   EXPECT_EQ(6, output.node_size());
-  const NodeDef& new_const = output.node(4);
-  EXPECT_EQ(OptimizedName("add_const"), new_const.name());
-  EXPECT_EQ("^x", new_const.input(0));
+
+  const NodeDef* new_const = node_map.GetNode(OptimizedName("add_const"));
+  ASSERT_NE(new_const, nullptr);
+  EXPECT_EQ("^x", new_const->input(0));
   EXPECT_EQ(std::string("\0\0\0@", 4),
-            new_const.attr().at("value").tensor().tensor_content());
-  const NodeDef& new_mul = output.node(5);
-  EXPECT_EQ(OptimizedName("add_mul"), new_mul.name());
-  EXPECT_EQ(OptimizedName("add_const"), new_mul.input(0));
-  EXPECT_EQ("x", new_mul.input(1));
-  EXPECT_EQ("^y", new_mul.input(2));
-  const NodeDef& new_id = output.node(3);
-  EXPECT_EQ("id", new_id.name());
-  EXPECT_EQ(OptimizedName("add_mul"), new_id.input(0));
+            new_const->attr().at("value").tensor().tensor_content());
+
+  const NodeDef* new_mul = node_map.GetNode(OptimizedName("add_mul"));
+  ASSERT_NE(new_mul, nullptr);
+  EXPECT_EQ(OptimizedName("add_const"), new_mul->input(0));
+  EXPECT_EQ("x", new_mul->input(1));
+  EXPECT_EQ("^y", new_mul->input(2));
+
+  const NodeDef* new_id = node_map.GetNode("id");
+  ASSERT_NE(new_id, nullptr);
+  EXPECT_EQ(OptimizedName("add_mul"), new_id->input(0));
+
+  auto tensors = EvaluateNodes(output, fetch);
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-6);
 }
 
 TEST_F(ArithmeticOptimizerTest, TrivialSumsRepeatedAdd) {
@@ -434,6 +497,7 @@ TEST_F(ArithmeticOptimizerTest, TrivialSumsRepeatedAdd) {
 
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
   const std::vector<string> devices{
       "/device:CPU:0", "/device:GPU:0", "/device:CPU:0", "/device:GPU:1",
       "/device:CPU:0", "/device:CPU:0", "/device:CPU:0",
@@ -458,54 +522,51 @@ TEST_F(ArithmeticOptimizerTest, TrivialSumsRepeatedAdd) {
   EXPECT_EQ(17, output.node_size());
 
   const NodeDef* id_node = node_map.GetNode("id");
-  ASSERT_TRUE(id_node != nullptr);
+  ASSERT_NE(id_node, nullptr);
   EXPECT_EQ(1, id_node->input_size());
   EXPECT_EQ(HoistMulName("Add_6"), id_node->input(0));
 
   const NodeDef* mul_node = node_map.GetNode(HoistMulName("Add_6"));
-  ASSERT_TRUE(mul_node != nullptr);
+  ASSERT_NE(mul_node, nullptr);
   EXPECT_EQ(2, mul_node->input_size());
   EXPECT_EQ("Placeholder", mul_node->input(0));
   EXPECT_EQ(HoistAddName("Add_6"), mul_node->input(1));
 
   const NodeDef* add_6_node = node_map.GetNode(HoistAddName("Add_6"));
-  ASSERT_TRUE(add_6_node != nullptr);
-  EXPECT_EQ(3, add_6_node->input_size());
+  ASSERT_NE(add_6_node, nullptr);
+  EXPECT_EQ(2, add_6_node->input_size());
   EXPECT_EQ(HoistAddName("Add_4"), add_6_node->input(0));
   EXPECT_EQ(HoistAddName("Add_5"), add_6_node->input(1));
-  EXPECT_EQ("^Placeholder", add_6_node->input(2));
 
   const NodeDef* add_4_node = node_map.GetNode(HoistAddName("Add_4"));
-  ASSERT_TRUE(add_4_node != nullptr);
+  ASSERT_NE(add_4_node, nullptr);
   EXPECT_EQ("Add", add_4_node->op());
-  EXPECT_EQ(3, add_4_node->input_size());
+  EXPECT_EQ(2, add_4_node->input_size());
   EXPECT_EQ(OptimizedName("Add_const"), add_4_node->input(0));
   EXPECT_EQ(OptimizedName("Add_1_const"), add_4_node->input(1));
-  EXPECT_EQ("^Placeholder", add_4_node->input(2));
 
   const NodeDef* add_5_node = node_map.GetNode(HoistAddName("Add_5"));
-  ASSERT_TRUE(add_5_node != nullptr);
+  ASSERT_NE(add_5_node, nullptr);
   EXPECT_EQ("Add", add_5_node->op());
-  EXPECT_EQ(3, add_5_node->input_size());
+  EXPECT_EQ(2, add_5_node->input_size());
   EXPECT_EQ(OptimizedName("Add_const"), add_5_node->input(0));
   EXPECT_EQ(OptimizedName("Add_1_const"), add_5_node->input(1));
-  EXPECT_EQ("^Placeholder", add_5_node->input(2));
 
   const NodeDef* add_const_node = node_map.GetNode(OptimizedName("Add_const"));
-  ASSERT_TRUE(add_const_node != nullptr);
+  ASSERT_NE(add_const_node, nullptr);
   EXPECT_EQ("Const", add_const_node->op());
   EXPECT_EQ(1, add_const_node->input_size());
   EXPECT_EQ("^Placeholder", add_const_node->input(0));
 
   const NodeDef* add_1_const_node =
       node_map.GetNode(OptimizedName("Add_1_const"));
-  ASSERT_TRUE(add_1_const_node != nullptr);
+  ASSERT_NE(add_1_const_node, nullptr);
   EXPECT_EQ("Const", add_1_const_node->op());
   EXPECT_EQ(1, add_1_const_node->input_size());
   EXPECT_EQ("^Placeholder", add_1_const_node->input(0));
 }
 
-TEST_F(ArithmeticOptimizerTest, HoistFactor) {
+TEST_F(ArithmeticOptimizerTest, HoistFactorMul) {
   for (bool matching_shapes : {true, false}) {
     for (bool use_addn : {true, false}) {
       tensorflow::Scope s = tensorflow::Scope::NewRootScope();
@@ -525,7 +586,8 @@ TEST_F(ArithmeticOptimizerTest, HoistFactor) {
       GrapplerItem item;
       item.fetch = {"id"};
       TF_CHECK_OK(s.ToGraphDef(&item.graph));
-
+      auto tensors_expected = EvaluateNodes(item.graph, item.fetch);
+      EXPECT_EQ(1, tensors_expected.size());
       ArithmeticOptimizer optimizer;
       EnableOnlyHoistCommonFactor(&optimizer);
 
@@ -550,55 +612,138 @@ TEST_F(ArithmeticOptimizerTest, HoistFactor) {
         EXPECT_EQ(9, output.node_size());
 
         const NodeDef* new_add_node = node_map.GetNode(HoistAddName("add"));
-        ASSERT_TRUE(new_add_node != nullptr) << "Hoisted Add node not found";
+        ASSERT_NE(new_add_node, nullptr) << "Hoisted Add node not found";
         EXPECT_EQ("y1", new_add_node->input(0));
         EXPECT_EQ("y2", new_add_node->input(1));
 
         const NodeDef* new_mul_node = node_map.GetNode(HoistMulName("add"));
-        ASSERT_TRUE(new_mul_node != nullptr) << "Hoisted Mul node not found";
+        ASSERT_NE(new_mul_node, nullptr) << "Hoisted Mul node not found";
         EXPECT_EQ("x", new_mul_node->input(0));
         EXPECT_EQ(new_add_node->name(), new_mul_node->input(1));
 
         const NodeDef* id_node = node_map.GetNode("id");
-        ASSERT_TRUE(id_node != nullptr) << "Id node not found";
+        ASSERT_NE(id_node, nullptr) << "Id node not found";
         EXPECT_EQ("id", id_node->name());
         EXPECT_EQ(HoistMulName("add"), id_node->input(0));
       }
+      auto tensors = EvaluateNodes(output, item.fetch);
+      EXPECT_EQ(1, tensors.size());
+      test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-6);
+    }
+  }
+}
+
+TEST_F(ArithmeticOptimizerTest, HoistFactorDiv) {
+  for (bool matching_shapes : {true, false}) {
+    for (bool use_addn : {true, false}) {
+      for (bool use_ints : {true, false}) {
+        tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+        Output x = use_ints
+                       ? ops::Const(s.WithOpName("x"), {1, 2}, {1, 2})
+                       : ops::Const(s.WithOpName("x"), {1.0f, 2.0f}, {1, 2});
+        Output y1 = use_ints
+                        ? ops::Const(s.WithOpName("y1"), {3, 4}, {1, 2})
+                        : ops::Const(s.WithOpName("y1"), {3.0f, 4.0f}, {1, 2});
+        Output y2;
+        if (matching_shapes) {
+          y2 = use_ints ? ops::Const(s.WithOpName("y2"), {5, 6}, {1, 2})
+                        : ops::Const(s.WithOpName("y2"), {5.0f, 6.0f}, {1, 2});
+        } else {
+          y2 = use_ints ? ops::Const(s.WithOpName("y2"), {5}, {1, 1})
+                        : ops::Const(s.WithOpName("y2"), {5.0f}, {1, 1});
+        }
+        Output div1 = ops::Div(s.WithOpName("div1"), y1, x);
+        Output div2 = ops::Div(s.WithOpName("div2"), y2, x);
+        Output id =
+            use_addn
+                ? ops::Identity(s.WithOpName("id"),
+                                ops::AddN(s.WithOpName("add"), {div1, div2}))
+                : ops::Identity(s.WithOpName("id"),
+                                ops::Add(s.WithOpName("add"), div1, div2));
+
+        GrapplerItem item;
+        item.fetch = {"id"};
+        TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+        ArithmeticOptimizer optimizer;
+        EnableOnlyHoistCommonFactor(&optimizer);
+
+        GraphDef output;
+        OptimizeTwice(&optimizer, &item, &output);
+
+        // We expect the following rewrite(s) to occur:
+        //
+        //        Add                 Div
+        //      /    \               /   \
+        //    Div    Div       ->  Add    x
+        //    / \    / \           / \
+        //   y1  x  y2  x         y1  y2
+        //
+        // If "root" op is AddN and shapes does not match, this rewrite is not
+        // possible and graph should stay intact.
+        NodeMap node_map(&output);
+
+        if ((use_addn && !matching_shapes) || use_ints) {
+          VerifyGraphsMatch(item.graph, output, __LINE__);
+        } else {
+          EXPECT_EQ(9, output.node_size());
+
+          const NodeDef* new_add_node = node_map.GetNode(HoistAddName("add"));
+          ASSERT_TRUE(new_add_node != nullptr) << "Hoisted Add node not found";
+          EXPECT_EQ("y1", new_add_node->input(0));
+          EXPECT_EQ("y2", new_add_node->input(1));
+
+          const NodeDef* new_div_node = node_map.GetNode(HoistDivName("add"));
+          ASSERT_TRUE(new_div_node != nullptr) << "Hoisted Div node not found";
+          EXPECT_EQ(new_add_node->name(), new_div_node->input(0));
+          EXPECT_EQ("x", new_div_node->input(1));
+
+          const NodeDef* id_node = node_map.GetNode("id");
+          ASSERT_TRUE(id_node != nullptr) << "Id node not found";
+          EXPECT_EQ("id", id_node->name());
+          EXPECT_EQ(HoistDivName("add"), id_node->input(0));
+        }
+      }
     }
   }
 }
 
 TEST_F(ArithmeticOptimizerTest, FuseConjAndTranspose) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
-  Output re = ops::Const(s.WithOpName("re"), {1.0, 2.0, 3.0, 4.0}, {2, 2});
-  Output im = ops::Const(s.WithOpName("im"), {5.0, 6.0, 7.0, 8.0}, {2, 2});
+  Output re = ops::Const(s.WithOpName("re"), {1.0f, 2.0f, 3.0f, 4.0f}, {2, 2});
+  Output im = ops::Const(s.WithOpName("im"), {5.0f, 6.0f, 7.0f, 8.0f}, {2, 2});
   Output z = ops::Complex(s.WithOpName("z"), re, im);
   Output perm = ops::Const(s.WithOpName("perm"), {1, 0}, {2});
   Output conj = ops::Conj(s.WithOpName("conj"), z);
   Output transp = ops::Transpose(s.WithOpName("trans"), conj, perm);
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
-
+  std::vector<string> fetch = {"trans"};
+  auto tensors_expected = EvaluateNodes(item.graph, fetch);
+  EXPECT_EQ(1, tensors_expected.size());
   ArithmeticOptimizer optimizer;
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
-  TF_EXPECT_OK(status);
-  // Run the optimizer twice to make sure the rewrite is idempotent.
-  item.graph.Swap(&output);
-  status = optimizer.Optimize(nullptr, item, &output);
-  TF_EXPECT_OK(status);
+  OptimizeTwice(&optimizer, &item, &output);
+  NodeMap node_map(&output);
 
   EXPECT_EQ(7, output.node_size());
-  EXPECT_EQ(OptimizedName("trans_fused"), output.node(6).name());
-  EXPECT_EQ("ConjugateTranspose", output.node(6).op());
-  EXPECT_EQ("z", output.node(6).input(0));
-  EXPECT_EQ("perm", output.node(6).input(1));
+
+  const NodeDef* trans_fused_node =
+      node_map.GetNode(OptimizedName("trans_fused"));
+  ASSERT_NE(trans_fused_node, nullptr);
+  EXPECT_EQ("ConjugateTranspose", trans_fused_node->op());
+  EXPECT_EQ("z", trans_fused_node->input(0));
+  EXPECT_EQ("perm", trans_fused_node->input(1));
+
+  auto tensors = EvaluateNodes(output, fetch);
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorEqual<complex64>(tensors_expected[0], tensors[0]);
 }
 
 TEST_F(ArithmeticOptimizerTest, FuseConjAndConjugateTranspose) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
-  Output re = ops::Const(s.WithOpName("re"), {1.0, 2.0, 3.0, 4.0}, {2, 2});
-  Output im = ops::Const(s.WithOpName("im"), {5.0, 6.0, 7.0, 8.0}, {2, 2});
+  Output re = ops::Const(s.WithOpName("re"), {1.0f, 2.0f, 3.0f, 4.0f}, {2, 2});
+  Output im = ops::Const(s.WithOpName("im"), {5.0f, 6.0f, 7.0f, 8.0f}, {2, 2});
   Output z = ops::Complex(s.WithOpName("z"), re, im);
   Output perm = ops::Const(s.WithOpName("perm"), {1, 0}, {2});
   Output conj = ops::Conj(s.WithOpName("conj"), z);
@@ -606,44 +751,56 @@ TEST_F(ArithmeticOptimizerTest, FuseConjAndConjugateTranspose) {
       ops::ConjugateTranspose(s.WithOpName("conjugate_trans"), conj, perm);
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  std::vector<string> fetch = {"conjugate_trans"};
+  auto tensors_expected = EvaluateNodes(item.graph, fetch);
+  EXPECT_EQ(1, tensors_expected.size());
 
   ArithmeticOptimizer optimizer;
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
-  TF_EXPECT_OK(status);
+  OptimizeTwice(&optimizer, &item, &output);
+  NodeMap node_map(&output);
 
   EXPECT_EQ(7, output.node_size());
-  EXPECT_EQ(OptimizedName("conjugate_trans_fused"), output.node(6).name());
-  EXPECT_EQ("Transpose", output.node(6).op());
-  EXPECT_EQ("z", output.node(6).input(0));
-  EXPECT_EQ("perm", output.node(6).input(1));
+
+  const NodeDef* conjugate_trans_fused_node =
+      node_map.GetNode(OptimizedName("conjugate_trans_fused"));
+  EXPECT_EQ("Transpose", conjugate_trans_fused_node->op());
+  EXPECT_EQ("z", conjugate_trans_fused_node->input(0));
+  EXPECT_EQ("perm", conjugate_trans_fused_node->input(1));
+  auto tensors = EvaluateNodes(output, fetch);
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorEqual<complex64>(tensors_expected[0], tensors[0]);
 }
 
 TEST_F(ArithmeticOptimizerTest, FuseTransposeAndConj) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
-  Output re = ops::Const(s.WithOpName("re"), {1.0, 2.0, 3.0, 4.0}, {2, 2});
-  Output im = ops::Const(s.WithOpName("im"), {5.0, 6.0, 7.0, 8.0}, {2, 2});
+  Output re = ops::Const(s.WithOpName("re"), {1.0f, 2.0f, 3.0f, 4.0f}, {2, 2});
+  Output im = ops::Const(s.WithOpName("im"), {5.0f, 6.0f, 7.0f, 8.0f}, {2, 2});
   Output z = ops::Complex(s.WithOpName("z"), re, im);
   Output perm = ops::Const(s.WithOpName("perm"), {1, 0}, {2});
   Output trans = ops::Transpose(s.WithOpName("trans"), z, perm);
   Output conj = ops::Conj(s.WithOpName("conj"), trans);
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  std::vector<string> fetch = {"conj"};
+  auto tensors_expected = EvaluateNodes(item.graph, fetch);
+  EXPECT_EQ(1, tensors_expected.size());
 
   ArithmeticOptimizer optimizer;
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
-  TF_EXPECT_OK(status);
-  // Run the optimizer twice to make sure the rewrite is idempotent.
-  item.graph.Swap(&output);
-  status = optimizer.Optimize(nullptr, item, &output);
-  TF_EXPECT_OK(status);
+  OptimizeTwice(&optimizer, &item, &output);
+  NodeMap node_map(&output);
 
   EXPECT_EQ(7, output.node_size());
-  EXPECT_EQ(OptimizedName("conj_fused"), output.node(6).name());
-  EXPECT_EQ("ConjugateTranspose", output.node(6).op());
-  EXPECT_EQ("z", output.node(6).input(0));
-  EXPECT_EQ("perm", output.node(6).input(1));
+
+  const NodeDef* conj_fused_node =
+      node_map.GetNode(OptimizedName("conj_fused"));
+  EXPECT_EQ("ConjugateTranspose", conj_fused_node->op());
+  EXPECT_EQ("z", conj_fused_node->input(0));
+  EXPECT_EQ("perm", conj_fused_node->input(1));
+  auto tensors = EvaluateNodes(output, fetch);
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorEqual<complex64>(tensors_expected[0], tensors[0]);
 }
 
 TEST_F(ArithmeticOptimizerTest, FoldTransposeIntoMatMul) {
@@ -665,27 +822,32 @@ TEST_F(ArithmeticOptimizerTest, FoldTransposeIntoMatMul) {
     }
     GrapplerItem item;
     TF_CHECK_OK(s.ToGraphDef(&item.graph));
+    std::vector<string> fetch = {"matmul"};
+    auto tensors_expected = EvaluateNodes(item.graph, fetch);
+    EXPECT_EQ(1, tensors_expected.size());
 
     ArithmeticOptimizer optimizer;
     GraphDef output;
-    Status status = optimizer.Optimize(nullptr, item, &output);
-    TF_EXPECT_OK(status);
-    // Run the optimizer twice to make sure the rewrite is idempotent.
-    item.graph.Swap(&output);
-    status = optimizer.Optimize(nullptr, item, &output);
-    TF_EXPECT_OK(status);
+    OptimizeTwice(&optimizer, &item, &output);
+    NodeMap node_map(&output);
 
     EXPECT_EQ(7, output.node_size());
-    EXPECT_EQ(OptimizedName("matmul_fused"), output.node(6).name());
-    EXPECT_EQ("a", output.node(6).input(0));
-    EXPECT_EQ("b", output.node(6).input(1));
+
+    const NodeDef* matmul_fused_node =
+        node_map.GetNode(OptimizedName("matmul_fused"));
+    ASSERT_NE(matmul_fused_node, nullptr);
+    EXPECT_EQ("a", matmul_fused_node->input(0));
+    EXPECT_EQ("b", matmul_fused_node->input(1));
     if (matmul_type == "BatchMatMul") {
-      EXPECT_TRUE(output.node(6).attr().at("adj_x").b());
-      EXPECT_TRUE(output.node(6).attr().at("adj_y").b());
+      EXPECT_TRUE(matmul_fused_node->attr().at("adj_x").b());
+      EXPECT_TRUE(matmul_fused_node->attr().at("adj_y").b());
     } else {
-      EXPECT_TRUE(output.node(6).attr().at("transpose_a").b());
-      EXPECT_TRUE(output.node(6).attr().at("transpose_b").b());
+      EXPECT_TRUE(matmul_fused_node->attr().at("transpose_a").b());
+      EXPECT_TRUE(matmul_fused_node->attr().at("transpose_b").b());
     }
+    auto tensors = EvaluateNodes(output, fetch);
+    EXPECT_EQ(1, tensors.size());
+    test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-6);
   }
 }
 
@@ -707,6 +869,9 @@ TEST_F(ArithmeticOptimizerTest, FoldConjugateTransposeIntoBatchMatMul) {
   Output matmul = ops::BatchMatMul(s.WithOpName("matmul"), trans_a, trans_b);
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  std::vector<string> fetch = {"matmul"};
+  auto tensors_expected = EvaluateNodes(item.graph, fetch);
+  EXPECT_EQ(1, tensors_expected.size());
 
   ArithmeticOptimizer optimizer;
   GraphDef output;
@@ -719,6 +884,9 @@ TEST_F(ArithmeticOptimizerTest, FoldConjugateTransposeIntoBatchMatMul) {
   EXPECT_EQ("b", output.node(10).input(1));
   EXPECT_TRUE(output.node(10).attr().at("adj_x").b());
   EXPECT_TRUE(output.node(10).attr().at("adj_y").b());
+  auto tensors = EvaluateNodes(output, fetch);
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorNear<complex64>(tensors_expected[0], tensors[0], 1e-6);
 }
 
 TEST_F(ArithmeticOptimizerTest, IdentityReshape) {
@@ -739,7 +907,10 @@ TEST_F(ArithmeticOptimizerTest, IdentityReshape) {
   GrapplerItem item;
   item.fetch = {"outputs"};
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
-
+  auto x_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({3, 3, 28, 28}));
+  auto tensors_expected =
+      EvaluateNodes(item.graph, item.fetch, {{"Placeholder", x_t}});
+  EXPECT_EQ(1, tensors_expected.size());
   GraphDef output;
   TF_EXPECT_OK(ArithmeticOptimizer().Optimize(nullptr, item, &output));
 
@@ -747,6 +918,9 @@ TEST_F(ArithmeticOptimizerTest, IdentityReshape) {
   TF_EXPECT_OK(ModelPruner().Optimize(nullptr, item, &output));
 
   EXPECT_EQ(0, CountOpNodes(output, "Reshape"));
+  auto tensors = EvaluateNodes(output, item.fetch, {{"Placeholder", x_t}});
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-6);
 }
 
 TEST_F(ArithmeticOptimizerTest, NotIdentityReshape) {
@@ -761,7 +935,10 @@ TEST_F(ArithmeticOptimizerTest, NotIdentityReshape) {
   GrapplerItem item;
   item.fetch = {"outputs"};
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
-
+  auto x_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({8, 3, 28, 28}));
+  item.feed = {{"Placeholder", x_t}};
+  auto tensors_expected = EvaluateNodes(item.graph, item.fetch, item.feed);
+  EXPECT_EQ(1, tensors_expected.size());
   GraphDef output;
   TF_EXPECT_OK(ArithmeticOptimizer().Optimize(nullptr, item, &output));
 
@@ -769,6 +946,9 @@ TEST_F(ArithmeticOptimizerTest, NotIdentityReshape) {
   TF_EXPECT_OK(ModelPruner().Optimize(nullptr, item, &output));
 
   EXPECT_EQ(1, CountOpNodes(output, "Reshape"));
+  auto tensors = EvaluateNodes(output, item.fetch, item.feed);
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-6);
 }
 
 TEST_F(ArithmeticOptimizerTest, NotIdentityReshapeTooManyUnknownDimSizes) {
@@ -781,7 +961,6 @@ TEST_F(ArithmeticOptimizerTest, NotIdentityReshapeTooManyUnknownDimSizes) {
   GrapplerItem item;
   item.fetch = {"outputs"};
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
-
   GraphDef output;
   TF_EXPECT_OK(ArithmeticOptimizer().Optimize(nullptr, item, &output));
 
@@ -812,7 +991,10 @@ TEST_F(ArithmeticOptimizerTest, CombineReshapes) {
   GrapplerItem item;
   item.fetch = {"outputs"};
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
-
+  auto x_t = GenerateRandomTensor<DT_INT8>(TensorShape({8, 3, 28, 28, 4}));
+  item.feed = {{"nchw_vect_c", x_t}};
+  auto tensors_expected = EvaluateNodes(item.graph, item.fetch, item.feed);
+  EXPECT_EQ(1, tensors_expected.size());
   GraphDef output;
   TF_EXPECT_OK(ArithmeticOptimizer().Optimize(nullptr, item, &output));
 
@@ -820,6 +1002,9 @@ TEST_F(ArithmeticOptimizerTest, CombineReshapes) {
   TF_EXPECT_OK(ModelPruner().Optimize(nullptr, item, &output));
 
   EXPECT_EQ(1, CountOpNodes(output, "Reshape"));
+  auto tensors = EvaluateNodes(output, item.fetch, item.feed);
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorEqual<int8>(tensors_expected[0], tensors[0]);
 }
 
 TEST_F(ArithmeticOptimizerTest, ReorderTransposeCast) {
@@ -1322,8 +1507,8 @@ TEST_F(ArithmeticOptimizerTest, AddOpsRewrite_AddOpsOfIdenticalShape) {
 
   // check add tree was replaced with AddN
   const NodeDef* collapsed_add =
-      node_map.GetNode("y/ArithmeticOptimizer/AddOpsRewrite_Add_abc_Add_ab");
-  ASSERT_TRUE(collapsed_add != nullptr);
+      node_map.GetNode("y/ArithmeticOptimizer/AddOpsRewrite_Add_abc");
+  ASSERT_NE(collapsed_add, nullptr);
 
   EXPECT_EQ("AddN", collapsed_add->op());
   EXPECT_EQ(3, collapsed_add->input_size());
@@ -1333,7 +1518,7 @@ TEST_F(ArithmeticOptimizerTest, AddOpsRewrite_AddOpsOfIdenticalShape) {
 
   // check output was re-wired to new node
   const NodeDef* updated_outputs = node_map.GetNode("outputs");
-  ASSERT_TRUE(updated_outputs != nullptr);
+  ASSERT_NE(updated_outputs, nullptr);
 
   EXPECT_EQ(collapsed_add->name(), updated_outputs->input(0));
 }
@@ -1381,8 +1566,8 @@ TEST_F(ArithmeticOptimizerTest, AddOpsRewrite_MultiplePasses) {
 
   // check left Add subtree replaced with AddN
   const NodeDef* collapsed_left =
-      node_map.GetNode("ArithmeticOptimizer/AddOpsRewrite_Add_abc_Add_ab");
-  ASSERT_TRUE(collapsed_left != nullptr);
+      node_map.GetNode("ArithmeticOptimizer/AddOpsRewrite_Add_abc");
+  ASSERT_NE(collapsed_left, nullptr);
 
   EXPECT_EQ("AddN", collapsed_left->op());
   EXPECT_EQ(3, collapsed_left->input_size());
@@ -1392,8 +1577,8 @@ TEST_F(ArithmeticOptimizerTest, AddOpsRewrite_MultiplePasses) {
 
   // check right Add subtree replaced with AddN
   const NodeDef* collapsed_right =
-      node_map.GetNode("ArithmeticOptimizer/AddOpsRewrite_Add_xyz_Add_xy");
-  ASSERT_TRUE(collapsed_right != nullptr);
+      node_map.GetNode("ArithmeticOptimizer/AddOpsRewrite_Add_xyz");
+  ASSERT_NE(collapsed_right, nullptr);
 
   EXPECT_EQ("AddN", collapsed_right->op());
   EXPECT_EQ(3, collapsed_right->input_size());
@@ -1403,7 +1588,7 @@ TEST_F(ArithmeticOptimizerTest, AddOpsRewrite_MultiplePasses) {
 
   // check that Mul inputs re-wired to new Nodes
   const NodeDef* updated_mul = node_map.GetNode("Mul");
-  ASSERT_TRUE(updated_mul != nullptr);
+  ASSERT_NE(updated_mul, nullptr);
 
   EXPECT_EQ("Mul", updated_mul->op());
   EXPECT_EQ(2, updated_mul->input_size());
@@ -1444,9 +1629,9 @@ TEST_F(ArithmeticOptimizerTest, AddOpsRewrite_AddInputMultipleTimes) {
   NodeMap node_map(&output);
 
   // check Add tree replaced with AddN
-  const NodeDef* collapsed_add = node_map.GetNode(
-      "ArithmeticOptimizer/AddOpsRewrite_Add_all_Add_ab_Add_bc");
-  ASSERT_TRUE(collapsed_add != nullptr);
+  const NodeDef* collapsed_add =
+      node_map.GetNode("ArithmeticOptimizer/AddOpsRewrite_Add_all");
+  ASSERT_NE(collapsed_add, nullptr);
 
   EXPECT_EQ("AddN", collapsed_add->op());
   EXPECT_EQ(4, collapsed_add->input_size());
@@ -1496,8 +1681,8 @@ TEST_F(ArithmeticOptimizerTest, AddOpsRewrite_AddOpsOfSymbolicallyEqualShape) {
 
   // check add tree was replaced with AddN
   const NodeDef* collapsed_add =
-      node_map.GetNode("ArithmeticOptimizer/AddOpsRewrite_Add_abc_Add_ab");
-  ASSERT_TRUE(collapsed_add != nullptr);
+      node_map.GetNode("ArithmeticOptimizer/AddOpsRewrite_Add_abc");
+  ASSERT_NE(collapsed_add, nullptr);
   EXPECT_EQ("AddN", collapsed_add->op());
   EXPECT_EQ(3, collapsed_add->input_size());
   EXPECT_EQ("a", collapsed_add->input(0));
@@ -1506,10 +1691,173 @@ TEST_F(ArithmeticOptimizerTest, AddOpsRewrite_AddOpsOfSymbolicallyEqualShape) {
 
   // check output was re-wired to new node
   const NodeDef* updated_outputs = node_map.GetNode("outputs");
-  ASSERT_TRUE(updated_outputs != nullptr);
+  ASSERT_NE(updated_outputs, nullptr);
   EXPECT_EQ(collapsed_add->name(), updated_outputs->input(0));
 }
 
+TEST_F(ArithmeticOptimizerTest, AddOpsRewrite_MinimizeBCast) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+
+  auto a = ops::Variable(s.WithOpName("a"), {32}, DT_FLOAT);
+  auto b = ops::Variable(s.WithOpName("b"), {32, 32}, DT_FLOAT);
+  auto c = ops::Variable(s.WithOpName("c"), {32, 32, 32}, DT_FLOAT);
+  auto add_ab = ops::Add(s.WithOpName("Add_ab"), a, b);
+  auto add_abc = ops::Add(s.WithOpName("Add_abc"), add_ab, c);
+
+  auto x = ops::Variable(s.WithOpName("x"), {32}, DT_FLOAT);
+  auto y = ops::Variable(s.WithOpName("y"), {32, 32}, DT_FLOAT);
+  auto z = ops::Variable(s.WithOpName("z"), {32, 32, 32}, DT_FLOAT);
+  auto add_xy = ops::Add(s.WithOpName("Add_xy"), x, y);
+  auto add_xyz = ops::Add(s.WithOpName("Add_xyz"), add_xy, z);
+
+  auto add_all = ops::Add(s.WithOpName("AddAll"), add_abc, add_xyz);
+  auto outputs = ops::Identity(s.WithOpName("outputs"), add_all);
+
+  GrapplerItem item;
+  item.fetch = {"outputs"};
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  GraphDef output;
+  ArithmeticOptimizer optimizer;
+  EnableOnlyAddToAddNCombining(&optimizer);
+
+  OptimizeAndPrune(&optimizer, &item, &output);
+
+  // We expect the following rewrite(s) to occur:
+  //  1) [a, x], [b, y], [c, z] - aggregate same shapes first
+  //  2) Build an aggregation tree minimizing cost of broadcast
+  //
+  //         +                              +
+  //      /     \                       /       \
+  //     +       +                     +       AddN(c, z)
+  //    / \     / \                 /     \
+  //   +   c   x   + -->    AddN(a, x)  AddN(b, y)
+  //  / \         / \
+  // a   b       y   z
+  EXPECT_EQ(12, output.node_size());
+  NodeMap node_map(&output);
+
+  // expected names of outer and inner nodes
+  string outer_add_name = "ArithmeticOptimizer/AddOpsRewrite_AddAll";
+  string outer_0_add_name =
+      "ArithmeticOptimizer/AddOpsRewrite_Internal_0_AddAll";
+  string inner_0_add_name = "ArithmeticOptimizer/AddOpsRewrite_Leaf_0_AddAll";
+  string inner_1_add_name = "ArithmeticOptimizer/AddOpsRewrite_Leaf_1_AddAll";
+  string inner_2_add_name = "ArithmeticOptimizer/AddOpsRewrite_Leaf_2_AddAll";
+
+  // Add [a, x] first
+  const NodeDef* add_ax_node = node_map.GetNode(inner_0_add_name);
+  ASSERT_NE(add_ax_node, nullptr);
+  EXPECT_EQ("AddN", add_ax_node->op());
+  EXPECT_EQ(2, add_ax_node->input_size());
+  EXPECT_EQ("a", add_ax_node->input(0));
+  EXPECT_EQ("x", add_ax_node->input(1));
+
+  // Then add [b, y]
+  const NodeDef* add_by_node = node_map.GetNode(inner_1_add_name);
+  ASSERT_NE(add_by_node, nullptr);
+  EXPECT_EQ("AddN", add_by_node->op());
+  EXPECT_EQ(2, add_by_node->input_size());
+  EXPECT_EQ("b", add_by_node->input(0));
+  EXPECT_EQ("y", add_by_node->input(1));
+
+  // Then add [c, z]
+  const NodeDef* add_cz_node = node_map.GetNode(inner_2_add_name);
+  ASSERT_NE(add_cz_node, nullptr);
+  EXPECT_EQ("AddN", add_cz_node->op());
+  EXPECT_EQ(2, add_cz_node->input_size());
+  EXPECT_EQ("c", add_cz_node->input(0));
+  EXPECT_EQ("z", add_cz_node->input(1));
+
+  // Then add results together starting from smaller shapes [a, x] + [b, y]
+  const NodeDef* outer_0_node = node_map.GetNode(outer_0_add_name);
+  ASSERT_NE(outer_0_node, nullptr);
+  EXPECT_EQ("Add", outer_0_node->op());
+  EXPECT_EQ(2, outer_0_node->input_size());
+  EXPECT_EQ(inner_0_add_name, outer_0_node->input(0));
+  EXPECT_EQ(inner_1_add_name, outer_0_node->input(1));
+
+  // And finally top level Add node
+  const NodeDef* outer_node = node_map.GetNode(outer_add_name);
+  ASSERT_NE(outer_node, nullptr);
+  EXPECT_EQ("Add", outer_node->op());
+  EXPECT_EQ(2, outer_node->input_size());
+  EXPECT_EQ(outer_0_add_name, outer_node->input(0));
+  EXPECT_EQ(inner_2_add_name, outer_node->input(1));
+
+  // And outputs reading new top level Add node
+  const NodeDef* updated_outputs = node_map.GetNode("outputs");
+  ASSERT_NE(updated_outputs, nullptr);
+  EXPECT_EQ(outer_add_name, updated_outputs->input(0));
+}
+
+TEST_F(ArithmeticOptimizerTest, AddOpsRewrite_MinimizeBCastWithSymbolicShapes) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+
+  // We have a small input with one unknown dimension
+  auto small = ops::Variable(s.WithOpName("small"), {-1, 1, 1}, DT_FLOAT);
+
+  // And second input which is larger, but has the same unknown dimension
+  // device spec prevents this node from rewriting
+  auto d = "/job:do_not_rewrite_me";
+  auto v = ops::Variable(s.WithOpName("v"), {1, 32, 32}, DT_FLOAT);
+  auto large = ops::Add(s.WithOpName("large").WithDevice(d), small, v);
+
+  // [a, c] have {?, 1, 1} shape, [b] has {?, 32, 32}
+  auto a = ops::Sqrt(s.WithOpName("a"), small);
+  auto b = ops::Square(s.WithOpName("b"), large);
+  auto c = ops::Round(s.WithOpName("c"), small);
+
+  // [add_ab, add_abc] shape must be inferred from inputs
+  auto add_ab = ops::Add(s.WithOpName("Add_ab"), a, b);
+  auto add_abc = ops::Add(s.WithOpName("Add_abc"), add_ab, c);
+
+  auto outputs = ops::Identity(s.WithOpName("outputs"), add_abc);
+
+  GrapplerItem item;
+  item.fetch = {"outputs"};
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  GraphDef output;
+  ArithmeticOptimizer optimizer;
+  EnableOnlyAddToAddNCombining(&optimizer);
+  OptimizeAndPrune(&optimizer, &item, &output);
+
+  // We expect the following rewrite(s) to occur: it's much cheaper to add small
+  // tensors, and do the broadcast just once
+  //
+  //     +                  +
+  //    / \                / \
+  //   +   c      -->     +   b
+  //  / \                / \
+  // a   b              a   c
+  EXPECT_EQ(9, output.node_size());
+  NodeMap node_map(&output);
+
+  // expected names of outer and inner nodes
+  string outer_add_name = "ArithmeticOptimizer/AddOpsRewrite_Add_abc";
+  string inner_add_name = "ArithmeticOptimizer/AddOpsRewrite_Leaf_0_Add_abc";
+
+  // outer Add node
+  const NodeDef* outer_add = node_map.GetNode(outer_add_name);
+  ASSERT_NE(outer_add, nullptr);
+  EXPECT_EQ("Add", outer_add->op());
+  EXPECT_EQ(inner_add_name, outer_add->input(0));
+  EXPECT_EQ("b", outer_add->input(1));
+
+  // inner AddN node
+  const NodeDef* inner_add = node_map.GetNode(inner_add_name);
+  ASSERT_NE(inner_add, nullptr);
+  EXPECT_EQ(2, inner_add->input_size());
+  EXPECT_EQ("a", inner_add->input(0));
+  EXPECT_EQ("c", inner_add->input(1));
+
+  // check output was re-wired to new node
+  const NodeDef* updated_outputs = node_map.GetNode("outputs");
+  ASSERT_NE(updated_outputs, nullptr);
+  EXPECT_EQ(outer_add_name, updated_outputs->input(0));
+}
+
 TEST_F(ArithmeticOptimizerTest, RemoveNegation) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
   auto x = ops::Variable(s.WithOpName("x"), {2, 2}, DT_FLOAT);
@@ -1582,5 +1930,160 @@ TEST_F(ArithmeticOptimizerTest, RemoveNegation) {
   EXPECT_EQ(5, found);
 }
 
+TEST_F(ArithmeticOptimizerTest, MinimizeBroadcasts_SimpleSwap) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+
+  auto a = ops::Variable(s.WithOpName("a"), {32}, DT_FLOAT);
+  auto b = ops::Variable(s.WithOpName("b"), {32, 32}, DT_FLOAT);
+  auto c = ops::Variable(s.WithOpName("c"), {32}, DT_FLOAT);
+
+  auto mul1 = ops::Mul(s.WithOpName("mul1"), a, b);
+  auto mul2 = ops::Mul(s.WithOpName("mul2"), mul1, c);
+
+  auto outputs = ops::Identity(s.WithOpName("outputs"), mul2);
+
+  GrapplerItem item;
+  item.fetch = {"outputs"};
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  GraphDef output;
+  ArithmeticOptimizer optimizer;
+  EnableOnlyMinimizeBroadcasts(&optimizer);
+
+  OptimizeAndPrune(&optimizer, &item, &output);
+
+  // We expect the following rewrite(s) to occur:
+  //
+  //     *                  *
+  //    / \                / \
+  //   *   c      -->     *   b
+  //  / \                / \
+  // a   b              a   c
+  NodeMap node_map(&output);
+
+  const NodeDef* mul1_node = node_map.GetNode("mul1");
+  ASSERT_NE(mul1_node, nullptr);
+  EXPECT_EQ("a", mul1_node->input(0));
+  EXPECT_EQ("c", mul1_node->input(1));
+
+  const NodeDef* mul2_node = node_map.GetNode("mul2");
+  ASSERT_NE(mul2_node, nullptr);
+  EXPECT_EQ("mul1", mul2_node->input(0));
+  EXPECT_EQ("b", mul2_node->input(1));
+}
+
+TEST_F(ArithmeticOptimizerTest, MinimizeBroadcasts_FlattenTallGraph) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+
+  auto a = ops::Variable(s.WithOpName("a"), {32}, DT_FLOAT);
+  auto b = ops::Variable(s.WithOpName("b"), {32, 32}, DT_FLOAT);
+  auto c = ops::Variable(s.WithOpName("c"), {32}, DT_FLOAT);
+  auto d = ops::Variable(s.WithOpName("d"), {32}, DT_FLOAT);
+  auto e = ops::Variable(s.WithOpName("e"), {32}, DT_FLOAT);
+
+  auto mul1 = ops::Mul(s.WithOpName("mul1"), a, b);
+  auto mul2 = ops::Mul(s.WithOpName("mul2"), mul1, c);
+  auto mul3 = ops::Mul(s.WithOpName("mul3"), mul2, d);
+  auto mul4 = ops::Mul(s.WithOpName("mul4"), mul3, e);
+
+  auto outputs = ops::Identity(s.WithOpName("outputs"), mul4);
+
+  GrapplerItem item;
+  item.fetch = {"outputs"};
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  GraphDef output;
+  ArithmeticOptimizer optimizer;
+  EnableOnlyMinimizeBroadcasts(&optimizer);
+
+  OptimizeAndPrune(&optimizer, &item, &output);
+
+  // We expect the following rewrite(s) to occur: Graph is "flattened" and
+  // largest shape pushed to the top.
+  //
+  //          *
+  //        /   \
+  //       *     e                *
+  //      /  \                  /   \
+  //     *    d               *      b
+  //    / \                 /  \
+  //   *   c      -->     *      *
+  //  / \                / \    / \
+  // a   b              a   c  d   e
+  NodeMap node_map(&output);
+
+  const NodeDef* mul1_node = node_map.GetNode("mul1");
+  ASSERT_NE(mul1_node, nullptr);
+  EXPECT_EQ("a", mul1_node->input(0));
+  EXPECT_EQ("c", mul1_node->input(1));
+
+  const NodeDef* mul2_node = node_map.GetNode("mul2");
+  ASSERT_NE(mul2_node, nullptr);
+  EXPECT_EQ("d", mul2_node->input(0));
+  EXPECT_EQ("e", mul2_node->input(1));
+
+  const NodeDef* mul3_node = node_map.GetNode("mul3");
+  ASSERT_NE(mul3_node, nullptr);
+  EXPECT_EQ("mul1", mul3_node->input(0));
+  EXPECT_EQ("mul2", mul3_node->input(1));
+
+  const NodeDef* mul4_node = node_map.GetNode("mul4");
+  ASSERT_NE(mul4_node, nullptr);
+  EXPECT_EQ("mul3", mul4_node->input(0));
+  EXPECT_EQ("b", mul4_node->input(1));
+}
+
+TEST_F(ArithmeticOptimizerTest, MinimizeBroadcasts_BuildTreeUp) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+
+  // [a, b, c] - scalars, [d] - matrix
+  auto a = ops::Variable(s.WithOpName("a"), {32}, DT_FLOAT);
+  auto b = ops::Variable(s.WithOpName("b"), {32}, DT_FLOAT);
+  auto c = ops::Variable(s.WithOpName("c"), {32}, DT_FLOAT);
+  auto d = ops::Variable(s.WithOpName("D"), {32, 32}, DT_FLOAT);
+
+  auto mul1 = ops::Mul(s.WithOpName("mul1"), a, b);
+  auto mul2 = ops::Mul(s.WithOpName("mul2"), c, d);
+  auto mul3 = ops::Mul(s.WithOpName("mul3"), mul1, mul2);
+
+  auto outputs = ops::Identity(s.WithOpName("outputs"), mul3);
+
+  GrapplerItem item;
+  item.fetch = {"outputs"};
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  GraphDef output;
+  ArithmeticOptimizer optimizer;
+  EnableOnlyMinimizeBroadcasts(&optimizer);
+
+  OptimizeAndPrune(&optimizer, &item, &output);
+
+  // We expect the following rewrite(s) to occur:
+  //
+  //                              *
+  //                            /  \
+  //       *                   *    D
+  //     /   \                / \
+  //    *     *      ->      *   c
+  //   / \   / \            / \
+  //  a   b c   D          a   b
+  NodeMap node_map(&output);
+
+  const NodeDef* mul1_node = node_map.GetNode("mul1");
+  ASSERT_NE(mul1_node, nullptr);
+  EXPECT_EQ("a", mul1_node->input(0));
+  EXPECT_EQ("b", mul1_node->input(1));
+
+  const NodeDef* mul2_node = node_map.GetNode("mul2");
+  ASSERT_NE(mul2_node, nullptr);
+  EXPECT_EQ("mul1", mul2_node->input(0));
+  EXPECT_EQ("c", mul2_node->input(1));
+
+  const NodeDef* mul3_node = node_map.GetNode("mul3");
+  ASSERT_NE(mul3_node, nullptr);
+  EXPECT_EQ("D", mul3_node->input(0));
+  EXPECT_EQ("mul2", mul3_node->input(1));
+}
+
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/constant_folding.cc b/tensorflow/core/grappler/optimizers/constant_folding.cc
index bdec73e69ecb3b1f2c3a9fc73bb0a7e5293efdd5..b2a1ce6ab6746d02f729f380ff9ca3fbdea8e781 100644
--- a/tensorflow/core/grappler/optimizers/constant_folding.cc
+++ b/tensorflow/core/grappler/optimizers/constant_folding.cc
@@ -109,33 +109,18 @@ class DeviceSimple : public DeviceBase {
 };
 
 template <typename T>
-bool AllValuesAre(const TensorProto& tensor, const T& value) {
-  // TensorProto represents the content of the tensor in either <type>_val or
-  // tensor_content.
-  typename checkpoint::SaveTypeTraits<T>::RepeatedField* tensor_values =
-      checkpoint::MutableTensorProtoData<T>(const_cast<TensorProto*>(&tensor));
-  if (!tensor_values->empty()) {
-    for (const T& tensor_value : *tensor_values) {
-      if (tensor_value != value) {
-        return false;
-      }
-    }
-    return true;
+bool AllValuesAre(const TensorProto& proto, const T& value) {
+  Tensor tensor;
+  if (!tensor.FromProto(proto)) {
+    return false;
   }
-  const auto tensor_content_size = tensor.tensor_content().size();
-  if (tensor_content_size > 0) {
-    CHECK_EQ(0, tensor_content_size % sizeof(T));
-    std::vector<T> raw_values(tensor_content_size / sizeof(T));
-    port::CopyToArray(tensor.tensor_content(),
-                      reinterpret_cast<char*>(raw_values.data()));
-    for (int i = 0; i < tensor_content_size / sizeof(T); ++i) {
-      if (raw_values[i] != value) {
-        return false;
-      }
+  auto values = tensor.flat<T>();
+  for (int i = 0; i < tensor.NumElements(); ++i) {
+    if (values(i) != value) {
+      return false;
     }
-    return true;
   }
-  return false;
+  return true;
 }
 
 // Add new_input as a control input to node if it does not already depend on it.
@@ -313,7 +298,8 @@ Status ConstantFolding::MaterializeShapes(const GraphProperties& properties) {
   for (int node_idx = 0; node_idx < node_count; ++node_idx) {
     NodeDef* node = graph_->mutable_node(node_idx);
     const string op = node->op();
-    if (op != "Shape" && op != "Size" && op != "Rank" && op != "ShapeN") {
+    if (op != "Shape" && op != "Size" && op != "Rank" && op != "ShapeN" &&
+        op != "TensorArraySizeV3") {
       continue;
     }
 
@@ -364,6 +350,36 @@ Status ConstantFolding::MaterializeShapes(const GraphProperties& properties) {
       continue;
     }
 
+    if (op == "TensorArraySizeV3") {
+      const NodeDef* array = node_map_->GetNode(node->input(0));
+      if (array->attr().count("dynamic_size") != 0 &&
+          array->attr().at("dynamic_size").b()) {
+        continue;
+      }
+      const NodeDef* array_size = node_map_->GetNode(array->input(0));
+      if (IsReallyConstant(*array_size)) {
+        // Don't materialize 0 sizes to avoid triggering incorrect static
+        // checks. A 0 sized array that can't grow isn't useful anyway.
+        const TensorProto& raw_val = array_size->attr().at("value").tensor();
+        if (raw_val.dtype() != DT_INT32) {
+          continue;
+        }
+        Tensor value(raw_val.dtype(), raw_val.tensor_shape());
+        if (!value.FromProto(raw_val)) {
+          continue;
+        }
+        if (value.flat<int32>()(0) == 0) {
+          continue;
+        }
+        node->set_op("Const");
+        *node->mutable_attr() = array_size->attr();
+        node->set_input(0, AsControlDependency(NodeName(node->input(0))));
+        node->set_input(1, AddControlDependency(NodeName(node->input(1)),
+                                                graph_, node_map_.get()));
+      }
+      continue;
+    }
+
     // Handle ShapeN materialization case.
     // It's possible that not all input tensors have known shapes.
     CHECK_EQ(op, "ShapeN");
@@ -498,6 +514,11 @@ Status ConstantFolding::MaterializeBroadcastGradientArgs(
     return Status::OK();
   }
 
+  // Don't optimize this again if it was already optimized and folded.
+  if (OptimizedNodeExists(node, "-folded-1") ||
+      OptimizedNodeExists(node, "-folded-2")) {
+    return Status::OK();
+  }
   int64 min_id = 0;
   BCast::Vec shape1;
   if (!ExtractShape(*shape_node1, properties, &shape1, &min_id)) {
@@ -753,10 +774,6 @@ bool ConstantFolding::IsFoldable(const NodeDef& node) const {
   if (op.find("Quantized") != string::npos || op.find("Sparse") == 0) {
     return false;
   }
-  if (node.attr().count("_XlaCompile") > 0 &&
-      node.attr().at("_XlaCompile").b()) {
-    return false;
-  }
 
   const OpDef* op_def = nullptr;
   Status status = OpRegistry::Global()->LookUpOpDef(node.op(), &op_def);
@@ -783,7 +800,7 @@ bool ConstantFolding::IsFoldable(const NodeDef& node) const {
   // the case of a merge node that propagate the first inputs that becomes
   // available, and therefore only requires a single constant input to be
   // foldable.
-  bool has_constant_input = false;
+  bool merge_has_constant_input = false;
   const bool is_merge = IsMerge(node);
   for (const auto& input : node.input()) {
     if (IsControlInput(input)) {
@@ -794,21 +811,20 @@ bool ConstantFolding::IsFoldable(const NodeDef& node) const {
       return false;
     }
     bool is_const = IsReallyConstant(*input_node);
-    if (!is_const && !is_merge) {
-      return false;
-    }
-    // Don't fold strings constants for now since this causes problems with
-    // checkpointing.
-    if (is_const && input_node->attr().at("dtype").type() == DT_STRING) {
+    if (is_const) {
+      // Don't fold strings constants for now since this causes problems with
+      // checkpointing.
+      if (input_node->attr().at("dtype").type() == DT_STRING) {
+        return false;
+      }
+      // Special case: If a Merge node has at least one constant input that
+      // does not depend on a control input, we can fold it.
+      merge_has_constant_input |= !HasControlInputs(*input_node);
+    } else if (!is_merge) {
       return false;
     }
-    has_constant_input |= is_const;
-  }
-  if (is_merge) {
-    return has_constant_input;
   }
-
-  return true;
+  return !is_merge || merge_has_constant_input;
 }
 
 namespace {
@@ -825,17 +841,23 @@ Status CreateConstantTensorAttrValue(DataType type, double value,
   t->set_dtype(type);
   *t->mutable_tensor_shape() = shape;
   switch (type) {
-    SET_TENSOR_VAL_CASE(DT_FLOAT, float, float);
-    SET_TENSOR_VAL_CASE(DT_DOUBLE, double, double);
-    SET_TENSOR_VAL_CASE(DT_INT64, int64, int64);
-    SET_TENSOR_VAL_CASE(DT_UINT64, int64, int64);
-    SET_TENSOR_VAL_CASE(DT_INT32, int32, int);
-    SET_TENSOR_VAL_CASE(DT_UINT32, int32, int);
-    SET_TENSOR_VAL_CASE(DT_INT16, int32, int);
-    SET_TENSOR_VAL_CASE(DT_UINT16, int32, int);
-    SET_TENSOR_VAL_CASE(DT_INT8, int32, int);
-    SET_TENSOR_VAL_CASE(DT_UINT8, int32, int);
-    SET_TENSOR_VAL_CASE(DT_BOOL, bool, bool);
+    case DT_HALF:
+      t->add_half_val(static_cast<Eigen::half>(value).x);
+      break;
+    case DT_BFLOAT16:
+      t->add_half_val(static_cast<bfloat16>(value).value);
+      break;
+      SET_TENSOR_VAL_CASE(DT_FLOAT, float, float);
+      SET_TENSOR_VAL_CASE(DT_DOUBLE, double, double);
+      SET_TENSOR_VAL_CASE(DT_INT64, int64, int64);
+      SET_TENSOR_VAL_CASE(DT_UINT64, int64, int64);
+      SET_TENSOR_VAL_CASE(DT_INT32, int32, int);
+      SET_TENSOR_VAL_CASE(DT_UINT32, int32, int);
+      SET_TENSOR_VAL_CASE(DT_INT16, int32, int);
+      SET_TENSOR_VAL_CASE(DT_UINT16, int32, int);
+      SET_TENSOR_VAL_CASE(DT_INT8, int32, int);
+      SET_TENSOR_VAL_CASE(DT_UINT8, int32, int);
+      SET_TENSOR_VAL_CASE(DT_BOOL, bool, bool);
     default:
       return errors::InvalidArgument("Unsupported type: ", type);
   }
@@ -1246,7 +1268,8 @@ Status ConstantFolding::FoldGraph(GraphDef* output) {
     Status s = FoldNode(node, output);
     processed_nodes.insert(node->name());
     if (!s.ok()) {
-      VLOG(1) << "Failed to fold node " << node->name() << ": " << s;
+      VLOG(1) << "Failed to fold node " << node->DebugString()
+              << "\nError message: " << s;
     } else {
       for (auto& output : fanout) {
         if (IsFoldable(*output)) {
@@ -1388,8 +1411,8 @@ bool ConstantFolding::IsOnes(const NodeDef& node) const {
   }
   const auto dtype = node.attr().at("dtype").type();
   switch (dtype) {
-    // TODO(rmlarsen): Make DT_HALF case compile.
-    //    IS_ONES_CASE(DT_HALF);
+    IS_ONES_CASE(DT_HALF);
+    IS_ONES_CASE(DT_BFLOAT16);
     IS_ONES_CASE(DT_FLOAT);
     IS_ONES_CASE(DT_DOUBLE);
     IS_ONES_CASE(DT_COMPLEX64);
@@ -1423,8 +1446,8 @@ bool ConstantFolding::IsZeros(const NodeDef& node) const {
   }
   const auto dtype = node.attr().at("dtype").type();
   switch (dtype) {
-    // TODO(rmlarsen): Make DT_HALF case compile.
-    //    IS_ZEROS_CASE(DT_HALF);
+    IS_ZEROS_CASE(DT_HALF);
+    IS_ZEROS_CASE(DT_BFLOAT16);
     IS_ZEROS_CASE(DT_FLOAT);
     IS_ZEROS_CASE(DT_DOUBLE);
     IS_ZEROS_CASE(DT_COMPLEX64);
@@ -1511,9 +1534,8 @@ void ConstantFolding::ReplaceSubtractionFromZeroByNegation(NodeDef* node,
 }
 
 Status ConstantFolding::ReplaceOperationWithConstant(
-    double value, const TensorShapeProto& shape, NodeDef* node,
-    GraphDef* graph) {
-  AttrValue dtype_attr = node->attr().at("T");
+    double value, const AttrValue& dtype_attr, const TensorShapeProto& shape,
+    NodeDef* node, GraphDef* graph) {
   AttrValue tensor_attr;
   TF_RETURN_IF_ERROR(CreateConstantTensorAttrValue(dtype_attr.type(), value,
                                                    shape, &tensor_attr));
@@ -1542,8 +1564,19 @@ Status ConstantFolding::SimplifyGraph(GraphDef* optimized_graph,
   for (int i = 0; i < optimized_graph->node_size(); ++i) {
     NodeDef* node = optimized_graph->mutable_node(i);
 
+    if (IsSplit(*node) && node->attr().at("num_split").i() == 1) {
+      ReplaceOperationWithIdentity(1, node, optimized_graph);
+      continue;
+    }
+
+    if (IsSplitV(*node) && node->attr().at("num_split").i() == 1) {
+      ReplaceOperationWithIdentity(0, node, optimized_graph);
+      continue;
+    }
+
     // Remove Shuffle or Reverse op over scalar values.
     if (use_shape_info &&
+        !properties->GetInputProperties(node->name()).empty() &&
         (IsShuffle(*node) || IsReverse(*node) || IsTranspose(*node))) {
       const auto& shape =
           properties->GetInputProperties(node->name())[0].shape();
@@ -1707,9 +1740,11 @@ Status ConstantFolding::SimplifyGraph(GraphDef* optimized_graph,
     }
 
     // Move constants past Enter.
-    // TODO(rmlarsen): Reenable when we fix the root cause of b/76008022
-    if (opt_level_ == RewriterConfig::AGGRESSIVE && IsEnter(*node) &&
-        node->input_size() > 0) {
+    if (IsEnter(*node) && node->input_size() > 0) {
+      if (node->attr().count("is_constant") == 0 ||
+          !node->attr().at("is_constant").b()) {
+        continue;
+      }
       const string& node_name = node->name();
       const NodeDef* input = node_map_->GetNode(node->input(0));
       if (input != nullptr && IsReallyConstant(*input) &&
@@ -1738,7 +1773,7 @@ Status ConstantFolding::SimplifyGraph(GraphDef* optimized_graph,
           node_map_->AddOutput(node_name, new_node->name());
           for (NodeDef* consumer : consumers) {
             for (int i = 0; i < consumer->input_size(); ++i) {
-              if (consumer->input(i) == node_name) {
+              if (NodeName(consumer->input(i)) == node_name) {
                 node_map_->UpdateInput(consumer->name(), node_name,
                                        new_node->name());
                 consumer->set_input(i, new_node->name());
@@ -1947,8 +1982,14 @@ Status ConstantFolding::SimplifyGraph(GraphDef* optimized_graph,
           (is_mul || is_matmul || optimize_zeros_divided_by_y)) {
         const PartialTensorShape shp(output_shape);
         if (shp.IsFullyDefined()) {
-          TF_RETURN_IF_ERROR(ReplaceOperationWithConstant(0, output_shape, node,
-                                                          optimized_graph));
+          AttrValue dtype_attr;
+          if (node->op() == "SparseMatMul") {
+            dtype_attr.set_type(DT_FLOAT);
+          } else {
+            dtype_attr = node->attr().at("T");
+          }
+          TF_RETURN_IF_ERROR(ReplaceOperationWithConstant(
+              0, dtype_attr, output_shape, node, optimized_graph));
           continue;
         }
         // Even if an input shape is only partially known, we may known that it
diff --git a/tensorflow/core/grappler/optimizers/constant_folding.h b/tensorflow/core/grappler/optimizers/constant_folding.h
index b6645d335e3074f188e5ab67a27fb11975fa14ed..f8a9e90d62111e516496450edb873d0e32e40464 100644
--- a/tensorflow/core/grappler/optimizers/constant_folding.h
+++ b/tensorflow/core/grappler/optimizers/constant_folding.h
@@ -83,7 +83,7 @@ class ConstantFolding : public GraphOptimizer {
   void ReplaceOperationWithSnapshot(int input_to_forward, NodeDef* node,
                                     GraphDef* graph);
   void ReplaceSubtractionFromZeroByNegation(NodeDef* node, GraphDef* graph);
-  Status ReplaceOperationWithConstant(double value,
+  Status ReplaceOperationWithConstant(double value, const AttrValue& dtype_attr,
                                       const TensorShapeProto& shape,
                                       NodeDef* node, GraphDef* graph);
   void ReplaceDivisionOfOnesByReciprocal(NodeDef* node, GraphDef* graph);
diff --git a/tensorflow/core/grappler/optimizers/constant_folding_test.cc b/tensorflow/core/grappler/optimizers/constant_folding_test.cc
index 6340565bcdfc480d00c648318e98bab20eb60622..31abe438469e69e548e40cb39c97b6cd9db56cdb 100644
--- a/tensorflow/core/grappler/optimizers/constant_folding_test.cc
+++ b/tensorflow/core/grappler/optimizers/constant_folding_test.cc
@@ -22,13 +22,66 @@ limitations under the License.
 #include "tensorflow/core/grappler/utils.h"
 #include "tensorflow/core/grappler/utils/grappler_test.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 
 namespace tensorflow {
 namespace grappler {
 namespace {
 
-class ConstantFoldingTest : public GrapplerTest {};
+class ConstantFoldingTest : public GrapplerTest {
+ protected:
+  template <DataType DTYPE>
+  void SimpleNeutralElementTest() {
+    typedef typename EnumToDataType<DTYPE>::Type T;
+    tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+    Output x = ops::Placeholder(s.WithOpName("x"), DTYPE,
+                                ops::Placeholder::Shape(TensorShape({2, 2})));
+    Tensor zeros_t(DTYPE, TensorShape({2, 2}));
+    Tensor ones_t(DTYPE, TensorShape({2, 2}));
+    Tensor x_t(DTYPE, TensorShape({2, 2}));
+    for (int i = 0; i < 4; ++i) {
+      zeros_t.flat<T>()(i) = T(0);
+      ones_t.flat<T>()(i) = T(1);
+      x_t.flat<T>()(i) = T(i + 1);
+    }
+    Output zeros = ops::Const(s.WithOpName("zeros"), zeros_t);
+    Output ones = ops::Const(s.WithOpName("ones"), ones_t);
+    Output mul1 = ops::Mul(s.WithOpName("mul1"), x, zeros);
+    Output mul2 = ops::Mul(s.WithOpName("mul2"), x, ones);
+
+    GrapplerItem item;
+    TF_CHECK_OK(s.ToGraphDef(&item.graph));
+    item.fetch = {"mul1", "mul2"};
+    ConstantFolding optimizer(nullptr /* cpu_device */);
+    GraphDef output;
+    Status status = optimizer.Optimize(nullptr, item, &output);
+    TF_EXPECT_OK(status);
+    LOG(INFO) << output.DebugString();
+    EXPECT_EQ(5, output.node_size());
+    for (int i = 0; i < output.node_size(); ++i) {
+      const NodeDef& node = output.node(i);
+      const string& name = node.name();
+      if (name == "mul1") {
+        EXPECT_EQ("Const", node.op());
+        EXPECT_EQ("^x", node.input(0));
+        EXPECT_EQ("^zeros", node.input(1));
+      } else if (name == "mul2") {
+        EXPECT_EQ("Snapshot", node.op());
+        EXPECT_EQ("x", node.input(0));
+        EXPECT_EQ("^ones", node.input(1));
+      }
+    }
+    auto tensors_expected =
+        EvaluateNodes(item.graph, {"mul1", "mul2"}, {{"x", x_t}});
+    auto tensors = EvaluateNodes(output, {"mul1", "mul2"}, {{"x", x_t}});
+    EXPECT_EQ(2, tensors_expected.size());
+    EXPECT_EQ(2, tensors.size());
+    for (int i = 0; i < 2; ++i) {
+      test::ExpectTensorEqual<T>(tensors_expected[i], tensors[i]);
+    }
+  }
+};
 
 TEST_F(ConstantFoldingTest, SimpleFolding) {
   // Build a simple graph with a few trivially prunable ops.
@@ -319,9 +372,31 @@ TEST_F(ConstantFoldingTest, NeutralElement) {
         EXPECT_EQ(2, t.tensor_shape().dim(1).size());
       }
     }
+    auto a_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({3, 2}));
+    auto b_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({2, 3}));
+    auto x_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({2, 2}));
+    auto y_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({2, 2}));
+    auto bias_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({2}));
+
+    auto tensors_expected = EvaluateNodes(
+        item.graph, item.fetch,
+        {{"x", x_t}, {"y", y_t}, {"a", a_t}, {"b", b_t}, {"bias", bias_t}});
+    EXPECT_EQ(item.fetch.size(), tensors_expected.size());
+    auto tensors = EvaluateNodes(
+        output, item.fetch,
+        {{"x", x_t}, {"y", y_t}, {"a", a_t}, {"b", b_t}, {"bias", bias_t}});
+    EXPECT_EQ(item.fetch.size(), tensors.size());
+    for (int i = 0; i < item.fetch.size(); ++i) {
+      test::ExpectTensorNear<float>(tensors_expected[i], tensors[i], 1e-6);
+    }
   }
 }
 
+TEST_F(ConstantFoldingTest, NeutralElement_ShortFloats) {
+  SimpleNeutralElementTest<DT_HALF>();
+  SimpleNeutralElementTest<DT_BFLOAT16>();
+}
+
 TEST_F(ConstantFoldingTest, StrengthReduce_Reciprocal) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
   Output cf_half = ops::Const(s.WithOpName("cf_half"), 0.5f, {1});
@@ -614,7 +689,8 @@ TEST_F(ConstantFoldingTest, ControlDependencies) {
   GrapplerItem item;
   item.fetch.push_back("e");
   TF_CHECK_OK(scope.ToGraphDef(&item.graph));
-
+  auto tensors_expected = EvaluateNodes(item.graph, item.fetch);
+  EXPECT_EQ(1, tensors_expected.size());
   ConstantFolding optimizer(nullptr /* cpu_device */);
   GraphDef output;
   Status status = optimizer.Optimize(nullptr, item, &output);
@@ -641,6 +717,9 @@ TEST_F(ConstantFoldingTest, ControlDependencies) {
     }
   }
   EXPECT_EQ(1, found);
+  auto tensors = EvaluateNodes(output, item.fetch);
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorEqual<int>(tensors_expected[0], tensors[0]);
 }
 
 TEST_F(ConstantFoldingTest, ControlDependenciesEmptyFetch) {
@@ -714,7 +793,8 @@ TEST_F(ConstantFoldingTest, ControlDependenciesDeduplicate) {
   GrapplerItem item;
   item.fetch.push_back("i2");
   TF_CHECK_OK(scope.ToGraphDef(&item.graph));
-
+  auto tensors_expected = EvaluateNodes(item.graph, item.fetch);
+  EXPECT_EQ(1, tensors_expected.size());
   ConstantFolding optimizer(nullptr /* cpu_device */);
   GraphDef output;
   Status status = optimizer.Optimize(nullptr, item, &output);
@@ -733,6 +813,9 @@ TEST_F(ConstantFoldingTest, ControlDependenciesDeduplicate) {
       EXPECT_EQ("^p2", node.input(1));
     }
   }
+  auto tensors = EvaluateNodes(output, item.fetch);
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorEqual<int>(tensors_expected[0], tensors[0]);
 }
 
 TEST_F(ConstantFoldingTest, VariableNumberOfOutputs) {
@@ -1173,6 +1256,10 @@ TEST_F(ConstantFoldingTest, MergeNodes) {
   ops::Merge m1(scope.WithOpName("m1"), {x, const1, const2});
   ops::Merge m2(scope.WithOpName("m2"), {const1, const3});
   ops::Merge m3(scope.WithOpName("m3"), {x, y});
+  // m4 is not foldable because the only constant input
+  // has a control input, so we cannot know if it will be
+  // triggered.
+  ops::Merge m4(scope.WithOpName("m4"), {x, const1});
 
   ops::Identity out1(scope.WithOpName("out1"), m1.output);
   ops::Identity idx1(scope.WithOpName("idx1"), m1.value_index);
@@ -1180,9 +1267,11 @@ TEST_F(ConstantFoldingTest, MergeNodes) {
   ops::Identity idx2(scope.WithOpName("idx2"), m2.value_index);
   ops::Identity out3(scope.WithOpName("out3"), m3.output);
   ops::Identity idx3(scope.WithOpName("idx3"), m3.value_index);
+  ops::Identity out4(scope.WithOpName("out4"), m4.output);
+  ops::Identity idx4(scope.WithOpName("idx4"), m4.value_index);
 
   GrapplerItem item;
-  item.fetch = {"out1", "idx1", "out2", "idx2", "out3", "idx3"};
+  item.fetch = {"out1", "idx1", "out2", "idx2", "out3", "idx3", "out4", "idx4"};
   TF_CHECK_OK(scope.ToGraphDef(&item.graph));
 
   ConstantFolding optimizer(nullptr /* cpu_device */);
@@ -1190,6 +1279,7 @@ TEST_F(ConstantFoldingTest, MergeNodes) {
   Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
 
+  EXPECT_EQ(19, output.node_size());
   int found_nodes = 0;
   for (const auto& node : output.node()) {
     if (node.name() == "out1") {
@@ -1226,10 +1316,18 @@ TEST_F(ConstantFoldingTest, MergeNodes) {
       EXPECT_EQ(1, node.input_size());
       EXPECT_EQ("m3:1", node.input(0));
       ++found_nodes;
+    } else if (node.name() == "out4") {
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("m4", node.input(0));
+      ++found_nodes;
+    } else if (node.name() == "idx4") {
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("m4:1", node.input(0));
+      ++found_nodes;
     }
   }
   // Make sure the graph contains all the nodes we're expecting.
-  EXPECT_EQ(6, found_nodes);
+  EXPECT_EQ(8, found_nodes);
 
   std::vector<string> fetch = {"out1", "idx1"};
   auto tensors = EvaluateNodes(output, fetch);
@@ -1244,6 +1342,82 @@ TEST_F(ConstantFoldingTest, MergeNodes) {
   EXPECT_EQ(2, out_idx.flat<int32>()(0));
 }
 
+TEST_F(ConstantFoldingTest, SplitRemoval) {
+  tensorflow::Scope scope = tensorflow::Scope::NewRootScope();
+
+  Output in1 =
+      ops::Variable(scope.WithOpName("in1"), TensorShape({2}), DT_FLOAT);
+  Output in2 =
+      ops::Variable(scope.WithOpName("in2"), TensorShape({4}), DT_FLOAT);
+  auto split_dim = ops::Const(scope.WithOpName("split_dim"), {0}, {});
+  ops::Split s1(scope.WithOpName("s1"), split_dim, in1, 1);
+  ops::Split s2(scope.WithOpName("s2"), split_dim, in2, 2);
+
+  ops::Add out(scope.WithOpName("out"), s1[0], s2[0]);
+
+  GrapplerItem item;
+  item.fetch = {"out"};
+  TF_CHECK_OK(scope.ToGraphDef(&item.graph));
+
+  ConstantFolding optimizer(nullptr /* cpu_device */);
+  GraphDef got;
+  Status status = optimizer.Optimize(nullptr, item, &got);
+  TF_EXPECT_OK(status);
+
+  GraphDef want;
+  AddNode("in1", "VariableV2", {}, {}, &want);
+  AddNode("in2", "VariableV2", {}, {}, &want);
+  AddNode("split_dim", "Const", {}, {}, &want);
+  AddNode("s1", "Identity", {"in1", AsControlDependency("split_dim")}, {},
+          &want);
+  AddNode("s2", "Split", {"in2", "split_dim"}, {}, &want);
+  AddNode("out", "Add", {"s1", "s2"}, {}, &want);
+
+  CompareGraphs(want, got);
+}
+
+TEST_F(ConstantFoldingTest, SplitVRemoval) {
+  tensorflow::Scope scope = tensorflow::Scope::NewRootScope();
+
+  Output in1 =
+      ops::Variable(scope.WithOpName("in1"), TensorShape({2}), DT_FLOAT);
+  Output in2 =
+      ops::Variable(scope.WithOpName("in2"), TensorShape({5}), DT_FLOAT);
+  auto split_dim = ops::Const(scope.WithOpName("split_dim"), {0}, {});
+  auto size_splits1 = ops::Const(scope.WithOpName("size_splits1"), {2}, {1});
+  auto size_splits2 = ops::Const(scope.WithOpName("size_splits2"), {2, 3}, {2});
+  ops::SplitV s1(scope.WithOpName("s1"), in1, size_splits1, split_dim, 1);
+  ops::SplitV s2(scope.WithOpName("s2"), in2, size_splits2, split_dim, 2);
+
+  LOG(INFO) << s1.output.size();
+  LOG(INFO) << s2.output.size();
+  ops::Add out(scope.WithOpName("out"), s1[0], s2[0]);
+
+  GrapplerItem item;
+  item.fetch = {"out"};
+  TF_CHECK_OK(scope.ToGraphDef(&item.graph));
+
+  ConstantFolding optimizer(nullptr /* cpu_device */);
+  GraphDef got;
+  Status status = optimizer.Optimize(nullptr, item, &got);
+  TF_EXPECT_OK(status);
+
+  GraphDef want;
+  AddNode("in1", "VariableV2", {}, {}, &want);
+  AddNode("in2", "VariableV2", {}, {}, &want);
+  AddNode("split_dim", "Const", {}, {}, &want);
+  AddNode("size_splits1", "Const", {}, {}, &want);
+  AddNode("size_splits2", "Const", {}, {}, &want);
+  AddNode("s1", "Identity",
+          {"in1", AsControlDependency("size_splits1"),
+           AsControlDependency("split_dim")},
+          {}, &want);
+  AddNode("s2", "SplitV", {"in2", "size_splits2", "split_dim"}, {}, &want);
+  AddNode("out", "Add", {"s1", "s2"}, {}, &want);
+
+  CompareGraphs(want, got);
+}
+
 TEST_F(ConstantFoldingTest, ShuffleReverseOnScalarRemoval) {
   tensorflow::Scope scope = tensorflow::Scope::NewRootScope();
 
@@ -1664,6 +1838,79 @@ TEST_F(ConstantFoldingTest, MaterializeBroadcastGradientArgs) {
   EXPECT_EQ(6, found);
 }
 
+TEST_F(ConstantFoldingTest, MaterializeBroadcastGradientArgs_InfiniteLoop) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output a =
+      ops::Placeholder(s.WithOpName("a"), DT_FLOAT,
+                       ops::Placeholder::Shape(PartialTensorShape({2, 2})));
+  Output b = ops::Square(s.WithOpName("b"), a);
+  Output c = ops::Mul(s.WithOpName("c"), a, b);
+  Output d = ops::Shape(s.WithOpName("d"), a);
+  Output e = ops::Shape(s.WithOpName("e"), b);
+
+  auto f = ops::internal::BroadcastGradientArgs(s.WithOpName("f"), d, e);
+  Output o1 = ops::Identity(s.WithOpName("o1"), f.r0);
+  Output o2 = ops::Identity(s.WithOpName("o2"), f.r1);
+
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  ConstantFolding optimizer(nullptr /* cpu_device */);
+  GraphDef output;
+  Status status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+
+  // Run a second time to make sure the optimization is idempotent.
+  item.graph.Swap(&output);
+  status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+
+  EXPECT_EQ(11, output.node_size());
+  int found = 0;
+  for (const auto& node : output.node()) {
+    if (node.name() == "ConstantFolding/f-folded-1") {
+      ++found;
+      EXPECT_EQ("Const", node.op());
+      EXPECT_EQ(2, node.input_size());
+      EXPECT_EQ("^a", node.input(0));
+      EXPECT_EQ("^b", node.input(1));
+    } else if (node.name() == "d") {
+      ++found;
+      EXPECT_EQ("Const", node.op());
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("^a", node.input(0));
+    } else if (node.name() == "e") {
+      ++found;
+      EXPECT_EQ("Const", node.op());
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("^b", node.input(0));
+    } else if (node.name() == "o1") {
+      ++found;
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("ConstantFolding/f-bcastargs-0", node.input(0));
+    } else if (node.name() == "o2") {
+      ++found;
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("ConstantFolding/f-bcastargs-1", node.input(0));
+    } else if (node.name() == "ConstantFolding/f-bcastargs-0") {
+      ++found;
+      EXPECT_EQ("Const", node.op());
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("^ConstantFolding/f-folded-1", node.input(0));
+      EXPECT_EQ(0, TensorShape(node.attr().at("value").tensor().tensor_shape())
+                       .num_elements());
+    } else if (node.name() == "ConstantFolding/f-bcastargs-1") {
+      ++found;
+      EXPECT_EQ("Const", node.op());
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("^ConstantFolding/f-folded-1", node.input(0));
+      EXPECT_EQ(0, TensorShape(node.attr().at("value").tensor().tensor_shape())
+                       .num_elements());
+    }
+  }
+  EXPECT_EQ(7, found);
+}
+
 TEST_F(ConstantFoldingTest, MaterializeReductionIndices) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
   Output input =
@@ -1678,6 +1925,14 @@ TEST_F(ConstantFoldingTest, MaterializeReductionIndices) {
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
   item.fetch.push_back("reshape");
 
+  auto input_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({3, 4}));
+  Tensor indices_t(DT_INT32, TensorShape({2}));
+  indices_t.flat<int>()(0) = 0;
+  indices_t.flat<int>()(1) = 1;
+  auto tensors_expected = EvaluateNodes(
+      item.graph, item.fetch, {{"input", input_t}, {"indices", indices_t}});
+  EXPECT_EQ(1, tensors_expected.size());
+
   ConstantFolding optimizer(nullptr /* cpu_device */);
   GraphDef output;
   Status status = optimizer.Optimize(nullptr, item, &output);
@@ -1704,6 +1959,11 @@ TEST_F(ConstantFoldingTest, MaterializeReductionIndices) {
     }
   }
   EXPECT_EQ(3, found);
+
+  auto tensors = EvaluateNodes(output, item.fetch,
+                               {{"input", input_t}, {"indices", indices_t}});
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-5);
 }
 
 TEST_F(ConstantFoldingTest, LargeConstant) {
@@ -1742,6 +2002,12 @@ TEST_F(ConstantFoldingTest, LargeConstant) {
   EXPECT_EQ(2, found);
 
   EXPECT_GT(1024 * 1024, output.ByteSizeLong());
+
+  auto tensors_expected = EvaluateNodes(item.graph, item.fetch);
+  EXPECT_EQ(1, tensors_expected.size());
+  auto tensors = EvaluateNodes(output, item.fetch);
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorEqual<float>(tensors_expected[0], tensors[0]);
 }
 
 TEST_F(ConstantFoldingTest, SwitchIdenticalInputs) {
@@ -1794,6 +2060,23 @@ TEST_F(ConstantFoldingTest, SwitchIdenticalInputs) {
     }
   }
   EXPECT_EQ(6, found);
+
+  // Evaluate id_true when input tensor x is true.
+  Tensor x_t(DT_BOOL, TensorShape({}));
+  x_t.flat<bool>()(0) = true;
+  auto tensors_expected = EvaluateNodes(item.graph, {"id_true"}, {{"x", x_t}});
+  EXPECT_EQ(1, tensors_expected.size());
+  auto tensors = EvaluateNodes(output, {"id_true"}, {{"x", x_t}});
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorEqual<bool>(tensors_expected[0], tensors[0]);
+
+  // Evalute id_false when input tensor is false.
+  x_t.flat<bool>()(0) = false;
+  tensors_expected = EvaluateNodes(item.graph, {"id_false"}, {{"x", x_t}});
+  EXPECT_EQ(1, tensors_expected.size());
+  tensors = EvaluateNodes(output, {"id_false"}, {{"x", x_t}});
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorEqual<bool>(tensors_expected[0], tensors[0]);
 }
 
 TEST_F(ConstantFoldingTest, PartialFolding_AssociativeAndCommutative) {
@@ -1881,7 +2164,7 @@ TEST_F(ConstantFoldingTest, PartialFolding_AssociativeAndCommutative) {
         EXPECT_EQ("ConstantFolding/acc6_partial_split_2", node.input(1));
         EXPECT_EQ("y", node.input(2));
       }
-      if (StringPiece(node.name()).starts_with("ConstantFolding/")) {
+      if (str_util::StartsWith(node.name(), "ConstantFolding/")) {
         EXPECT_EQ("Const", node.op());
       }
     }
@@ -1966,7 +2249,7 @@ TEST_F(ConstantFoldingTest, PartialFolding_Concat) {
       EXPECT_EQ("x", node.input(1));
       EXPECT_EQ("y", node.input(2));
       EXPECT_EQ("axis", node.input(3));
-    } else if (StringPiece(node.name()).starts_with("ConstantFolding/")) {
+    } else if (str_util::StartsWith(node.name(), "ConstantFolding/")) {
       EXPECT_EQ("Const", node.op());
     } else {
       EXPECT_EQ(item.graph.node(i).DebugString(), node.DebugString());
@@ -2035,6 +2318,15 @@ TEST_F(ConstantFoldingTest, PartialFolding_IdentityN) {
       EXPECT_EQ("^id_n", node.input(0));
     }
   }
+
+  auto x_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({}));
+  auto tensors_expected = EvaluateNodes(item.graph, item.fetch, {{"x", x_t}});
+  EXPECT_EQ(4, tensors_expected.size());
+  auto tensors = EvaluateNodes(output, item.fetch, {{"x", x_t}});
+  EXPECT_EQ(4, tensors.size());
+  for (int i = 0; i < tensors.size(); i++) {
+    test::ExpectTensorNear<float>(tensors_expected[i], tensors[i], 1e-5);
+  }
 }
 
 TEST_F(ConstantFoldingTest, TrivialPack) {
@@ -2082,6 +2374,10 @@ TEST_F(ConstantFoldingTest, Enter) {
   GrapplerItem item;
   AttrValue frame_name;
   frame_name.set_s("foo");
+  AttrValue is_constant_true;
+  is_constant_true.set_b(true);
+  AttrValue is_constant_false;
+  is_constant_false.set_b(false);
   AttrValue type;
   type.set_type(DT_FLOAT);
   AttrValue value;
@@ -2092,19 +2388,31 @@ TEST_F(ConstantFoldingTest, Enter) {
   GraphDef& graph = item.graph;
   AddNode("x", "Placeholder", {}, {{"T", type}}, &graph);
   AddNode("c1", "Const", {"^x"}, {{"value", value}, {"dtype", type}}, &graph);
-  AddNode("enter1", "Enter", {"x"}, {{"T", type}, {"frame_name", frame_name}},
+  AddNode("enter1", "Enter", {"x"},
+          {{"T", type},
+           {"frame_name", frame_name},
+           {"is_constant", is_constant_true}},
           &graph);
-  AddNode("enter2", "Enter", {"c1"}, {{"T", type}, {"frame_name", frame_name}},
+  AddNode("enter2", "Enter", {"c1"},
+          {{"T", type},
+           {"frame_name", frame_name},
+           {"is_constant", is_constant_true}},
+          &graph);
+  AddNode("enter3", "Enter", {"c1"},
+          {{"T", type},
+           {"frame_name", frame_name},
+           {"is_constant", is_constant_false}},
           &graph);
   AddNode("id1", "Identity", {"enter1"}, {{"T", type}}, &graph);
   AddNode("id2", "Identity", {"enter2"}, {{"T", type}}, &graph);
   AddNode("id3", "Identity", {"enter2"}, {{"T", type}}, &graph);
+  AddNode("id4", "Identity", {"enter3"}, {{"T", type}}, &graph);
   item.fetch.push_back("id1");
   item.fetch.push_back("id2");
   item.fetch.push_back("id3");
+  item.fetch.push_back("id4");
 
-  ConstantFolding optimizer(RewriterConfig::AGGRESSIVE,
-                            nullptr /* cpu_device */);
+  ConstantFolding optimizer(nullptr /* cpu_device */);
   GraphDef output;
   Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
@@ -2113,7 +2421,7 @@ TEST_F(ConstantFoldingTest, Enter) {
   status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
 
-  EXPECT_EQ(7, output.node_size());
+  EXPECT_EQ(9, output.node_size());
   for (const NodeDef& node : output.node()) {
     if (node.name() == "id1") {
       EXPECT_EQ("Identity", node.op());
@@ -2125,9 +2433,56 @@ TEST_F(ConstantFoldingTest, Enter) {
       EXPECT_EQ(1, node.input_size());
       EXPECT_EQ("^enter2", node.input(0));
     }
+    if (node.name() == "id4") {
+      EXPECT_EQ("Identity", node.op());
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("enter3", node.input(0));
+    }
   }
 }
 
+TEST_F(ConstantFoldingTest, TensorArraySize) {
+  tensorflow::Scope scope = tensorflow::Scope::NewRootScope();
+  Output size = ops::Const(scope.WithOpName("size"), 5, TensorShape({}));
+  auto dynamic_array =
+      ops::TensorArray(scope.WithOpName("dynamic"), size, DT_FLOAT,
+                       ops::TensorArray::DynamicSize(true));
+  auto static_array =
+      ops::TensorArray(scope.WithOpName("static"), size, DT_FLOAT,
+                       ops::TensorArray::DynamicSize(false));
+  auto dynamic_sz = ops::TensorArraySize(
+      scope.WithOpName("dynamic_sz"), dynamic_array.handle, dynamic_array.flow);
+  auto static_sz = ops::TensorArraySize(scope.WithOpName("static_sz"),
+                                        static_array.handle, static_array.flow);
+
+  GrapplerItem item;
+  TF_CHECK_OK(scope.ToGraphDef(&item.graph));
+
+  auto tensors_expected =
+      EvaluateNodes(item.graph, {"dynamic_sz", "static_sz"});
+
+  ConstantFolding optimizer(nullptr /* cpu_device */);
+  GraphDef output;
+  Status status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+  // Run the optimizer twice to make sure the rewrite is idempotent.
+  item.graph.Swap(&output);
+  status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+
+  EXPECT_EQ(5, output.node_size());
+  EXPECT_EQ("dynamic_sz", output.node(3).name());
+  EXPECT_EQ("TensorArraySizeV3", output.node(3).op());
+  EXPECT_EQ("static_sz", output.node(4).name());
+  EXPECT_EQ("Const", output.node(4).op());
+
+  auto tensors_actual = EvaluateNodes(output, {"dynamic_sz", "static_sz"});
+  EXPECT_EQ(2, tensors_expected.size());
+  EXPECT_EQ(2, tensors_actual.size());
+  test::ExpectTensorEqual<int32>(tensors_expected[0], tensors_actual[0]);
+  test::ExpectTensorEqual<int32>(tensors_expected[1], tensors_actual[1]);
+}
+
 }  // namespace
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/debug_stripper.cc b/tensorflow/core/grappler/optimizers/debug_stripper.cc
index 461f1aa2fb2d7b7ecfd02c83b73410df3e749a89..9701a038d0287db7745b9181b429bd81b1cdd854 100644
--- a/tensorflow/core/grappler/optimizers/debug_stripper.cc
+++ b/tensorflow/core/grappler/optimizers/debug_stripper.cc
@@ -14,16 +14,51 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/grappler/optimizers/debug_stripper.h"
+
+#include "tensorflow/core/framework/attr_value.pb.h"
+#include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/grappler/clusters/cluster.h"
 #include "tensorflow/core/grappler/grappler_item.h"
+#include "tensorflow/core/grappler/op_types.h"
+#include "tensorflow/core/grappler/utils.h"
+#include "tensorflow/core/platform/protobuf.h"
 
 namespace tensorflow {
 namespace grappler {
 
 Status DebugStripper::Optimize(Cluster* cluster, const GrapplerItem& item,
                                GraphDef* output) {
-  // TODO(haoliang): Let's remove assertions here.
   *output = item.graph;
+  for (NodeDef& node : *output->mutable_node()) {
+    if (IsAssert(node)) {
+      // Convert this node into a no-op.
+      node.set_op("NoOp");
+      node.clear_attr();
+      // Convert all its inputs into control dependency, which will then
+      // be optimized away by dependency optimizer.
+      for (string& inp : *node.mutable_input()) {
+        if (!IsControlInput(inp)) {
+          inp = AsControlDependency(inp);
+        }
+      }
+    } else if (IsCheckNumerics(node) || IsPrint(node)) {
+      // Replace with Identity op which will be pruned later.
+      node.set_op("Identity");
+      // Only preserve T attribute.
+      protobuf::Map<string, AttrValue> new_attr;
+      if (node.attr().find("T") != node.attr().end()) {
+        new_attr.insert({"T", node.attr().at("T")});
+      }
+      node.mutable_attr()->swap(new_attr);
+      // As Identity op only takes one input, mark redundant inputs as control
+      // input.
+      for (size_t i = 1; i < node.input_size(); ++i) {
+        if (!IsControlInput(node.input(i))) {
+          *node.mutable_input(i) = AsControlDependency(node.input(i));
+        }
+      }
+    }
+  }
   return Status::OK();
 }
 
diff --git a/tensorflow/core/grappler/optimizers/debug_stripper_test.cc b/tensorflow/core/grappler/optimizers/debug_stripper_test.cc
index d2cabc0798389e2396c2172387d8b76810098611..96ceee791f8f15c3cba1d8a6a5ae5e1f1106597c 100644
--- a/tensorflow/core/grappler/optimizers/debug_stripper_test.cc
+++ b/tensorflow/core/grappler/optimizers/debug_stripper_test.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/core/grappler/optimizers/debug_stripper.h"
 
 #include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/utils/grappler_test.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
@@ -27,16 +28,176 @@ namespace {
 
 class DebugStripperTest : public GrapplerTest {};
 
-// TODO(haoliang): Add tests for different removal operations.
 TEST_F(DebugStripperTest, OutputEqualToInput) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
-  auto c = ops::Const(s.WithOpName("c"), 0, {});
+  Output x = ops::Placeholder(s, DT_FLOAT, ops::Placeholder::Shape({}));
+  Output y = ops::Placeholder(s, DT_FLOAT, ops::Placeholder::Shape({}));
+  Output add = ops::Add(s, x, y);
+  Output result = ops::Identity(s, add);
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
 
   DebugStripper optimizer;
   GraphDef output;
   TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output));
+  CompareGraphs(item.graph, output);
+}
+
+TEST_F(DebugStripperTest, StripAssertFromGraph) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output x = ops::Placeholder(s.WithOpName("x"), DT_FLOAT,
+                              ops::Placeholder::Shape({}));
+  Output y = ops::Placeholder(s.WithOpName("y"), DT_FLOAT,
+                              ops::Placeholder::Shape({}));
+  auto greaterequal = ops::GreaterEqual(s.WithOpName("GreaterEqual"), x, y);
+  auto assert = ops::Assert(s.WithOpName("Assert"), greaterequal, {x, y});
+  Output add = ops::Add(
+      s.WithOpName("z").WithControlDependencies({assert.operation}), x, y);
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  DebugStripper optimizer;
+  GraphDef output;
+  TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output));
+
+  int count = 0;
+  for (const NodeDef& node : output.node()) {
+    if (node.name() == "x") {
+      count++;
+      EXPECT_EQ("Placeholder", node.op());
+      EXPECT_EQ(0, node.input_size());
+    } else if (node.name() == "y") {
+      count++;
+      EXPECT_EQ("Placeholder", node.op());
+      EXPECT_EQ(0, node.input_size());
+    } else if (node.name() == "GreaterEqual") {
+      count++;
+      EXPECT_EQ("GreaterEqual", node.op());
+      EXPECT_EQ(2, node.input_size());
+      EXPECT_EQ("x", node.input(0));
+      EXPECT_EQ("y", node.input(1));
+    } else if (node.name() == "Assert") {
+      count++;
+      EXPECT_EQ("NoOp", node.op());
+      EXPECT_EQ(3, node.input_size());
+      EXPECT_EQ("^GreaterEqual", node.input(0));
+      EXPECT_EQ("^x", node.input(1));
+      EXPECT_EQ("^y", node.input(2));
+    } else if (node.name() == "z") {
+      count++;
+      EXPECT_EQ("Add", node.op());
+      EXPECT_EQ(3, node.input_size());
+      EXPECT_EQ("x", node.input(0));
+      EXPECT_EQ("y", node.input(1));
+      EXPECT_EQ("^Assert", node.input(2));
+    }
+  }
+  EXPECT_EQ(5, count);
+
+  Tensor x_t(DT_FLOAT, TensorShape({}));
+  Tensor y_t(DT_FLOAT, TensorShape({}));
+  x_t.flat<float>()(0) = 1.0f;
+  y_t.flat<float>()(0) = 0.5f;
+  std::vector<Tensor> expected =
+      EvaluateNodes(item.graph, {"z"}, {{"x", x_t}, {"y", y_t}});
+  std::vector<Tensor> optimized =
+      EvaluateNodes(output, {"z"}, {{"x", x_t}, {"y", y_t}});
+  test::ExpectTensorEqual<float>(expected[0], optimized[0]);
+}
+
+TEST_F(DebugStripperTest, StripCheckNumericsFromGraph) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output x = ops::Placeholder(s.WithOpName("x"), DT_FLOAT,
+                              ops::Placeholder::Shape({}));
+  Output y = ops::Placeholder(s.WithOpName("y"), DT_FLOAT,
+                              ops::Placeholder::Shape({}));
+  auto check1 = ops::CheckNumerics(s.WithOpName("CheckNumerics1"), x, "foo");
+  auto check2 = ops::CheckNumerics(s.WithOpName("CheckNumerics2"), y, "foo");
+  Output add = ops::Add(s.WithOpName("z"), check1, check2);
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  DebugStripper optimizer;
+  GraphDef output;
+  TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output));
+
+  int count = 0;
+  for (const NodeDef& node : output.node()) {
+    if (node.name() == "x") {
+      count++;
+      EXPECT_EQ("Placeholder", node.op());
+      EXPECT_EQ(0, node.input_size());
+    } else if (node.name() == "y") {
+      count++;
+      EXPECT_EQ("Placeholder", node.op());
+      EXPECT_EQ(0, node.input_size());
+    } else if (node.name() == "CheckNumerics1") {
+      count++;
+      EXPECT_EQ("Identity", node.op());
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("x", node.input(0));
+      EXPECT_EQ(1, node.attr_size());
+    } else if (node.name() == "CheckNumerics2") {
+      count++;
+      EXPECT_EQ("Identity", node.op());
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("y", node.input(0));
+      EXPECT_EQ(1, node.attr_size());
+    } else if (node.name() == "z") {
+      count++;
+      EXPECT_EQ("Add", node.op());
+      EXPECT_EQ(2, node.input_size());
+      EXPECT_EQ("CheckNumerics1", node.input(0));
+      EXPECT_EQ("CheckNumerics2", node.input(1));
+    }
+  }
+  EXPECT_EQ(5, count);
+
+  Tensor x_t(DT_FLOAT, TensorShape({}));
+  Tensor y_t(DT_FLOAT, TensorShape({}));
+  x_t.flat<float>()(0) = 1.0f;
+  y_t.flat<float>()(0) = 0.5f;
+  std::vector<Tensor> expected =
+      EvaluateNodes(item.graph, {"z"}, {{"x", x_t}, {"y", y_t}});
+  std::vector<Tensor> optimized =
+      EvaluateNodes(output, {"z"}, {{"x", x_t}, {"y", y_t}});
+  test::ExpectTensorEqual<float>(expected[0], optimized[0]);
+}
+
+TEST_F(DebugStripperTest, StripPrintFromGraph) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output x = ops::Placeholder(s.WithOpName("x"), DT_FLOAT,
+                              ops::Placeholder::Shape({}));
+  Output print = ops::Print(s.WithOpName("Print"), x, {x});
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  DebugStripper optimizer;
+  GraphDef output;
+  TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output));
+
+  for (const NodeDef& node : output.node()) {
+    if (node.name() == "x") {
+      EXPECT_EQ("Placeholder", node.op());
+      EXPECT_EQ(0, node.input_size());
+    } else if (node.name() == "Print") {
+      EXPECT_EQ("Identity", node.op());
+      EXPECT_EQ(2, node.input_size());
+      EXPECT_EQ("x", node.input(0));
+      EXPECT_EQ("^x", node.input(1));
+      EXPECT_EQ(1, node.attr_size());
+    }
+  }
+
+  EXPECT_EQ(2, output.node_size());
+
+  Tensor x_t(DT_FLOAT, TensorShape({}));
+  x_t.flat<float>()(0) = 1.0f;
+  std::vector<Tensor> expected =
+      EvaluateNodes(item.graph, {"Print"}, {{"x", x_t}});
+  std::vector<Tensor> optimized =
+      EvaluateNodes(output, {"Print"}, {{"x", x_t}});
+  test::ExpectTensorEqual<float>(expected[0], optimized[0]);
 }
 
 }  // namespace
diff --git a/tensorflow/core/grappler/optimizers/dependency_optimizer.cc b/tensorflow/core/grappler/optimizers/dependency_optimizer.cc
index 63bc19630de98a651e193558de8e99e11051e37c..ed9bce439c6d6a5c09e3af53718fa49e191549ab 100644
--- a/tensorflow/core/grappler/optimizers/dependency_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/dependency_optimizer.cc
@@ -28,6 +28,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/util/device_name_utils.h"
 
@@ -87,7 +88,7 @@ bool DependencyOptimizer::SafeToRemoveIdentity(const NodeDef& node) {
     // Don't turn Identity nodes following Switch into NoOp or remove them
     // if it requires anchoring a control dependencies the Switch node, which
     // is not valid.
-    if (StringPiece(node.name()).starts_with(kConstantFoldingCtrl)) {
+    if (str_util::StartsWith(node.name(), kConstantFoldingCtrl)) {
       // TODO(rmlarsen): Try to remove this artificial contraint.
       return false;
     }
@@ -298,6 +299,15 @@ void DependencyOptimizer::OptimizeNode(int node_idx,
       input_nodes.push_back(input_node);
     }
 
+    // Make sure that we don't increase the number of edges that cross
+    // device boundaries.
+    if ((num_inputs == 1 && num_outputs > 1 &&
+         input_nodes[0]->device() != node->device()) ||
+        (num_inputs > 1 && num_outputs == 1 &&
+         output_nodes[0]->device() != node->device())) {
+      return;
+    }
+
     // TODO(rmlarsen): Not all device crossings are equally expensive.
     // Assign a cost to each based on device affinity and compute a
     // cost before and after.
@@ -316,6 +326,8 @@ void DependencyOptimizer::OptimizeNode(int node_idx,
       // unless they only have consumers on the same device as themselves.
       return;
     }
+
+    // Make sure we do not increase the number of device crossings.
     const int num_cross_before = num_cross_in + num_cross_out;
     int num_cross_after = 0;
     for (NodeDef* input_node : input_nodes) {
@@ -325,7 +337,6 @@ void DependencyOptimizer::OptimizeNode(int node_idx,
       }
     }
     if (num_cross_after > num_cross_before) {
-      // Avoid increasing the number of device crossings.
       return;
     }
 
@@ -518,10 +529,6 @@ Status DependencyOptimizer::TransitiveReduction() {
       if (longest_distance[target] > 1) {
         const int input_slot = control_output.second;
         control_edges_to_remove[target].emplace(input_slot, source);
-        //        VLOG(1) << "Removing edge from:\n"
-        //                << optimized_graph_->node(source).DebugString() <<
-        //                "\n\nto:\n\n"
-        //                << optimized_graph_->node(target).DebugString();
       }
     }
   }
diff --git a/tensorflow/core/grappler/optimizers/dependency_optimizer_test.cc b/tensorflow/core/grappler/optimizers/dependency_optimizer_test.cc
index cc1e142041c36fb645267c13b306d86639b2541e..6a297da52d075ea9bdae4584b7646ee44b950012 100644
--- a/tensorflow/core/grappler/optimizers/dependency_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/dependency_optimizer_test.cc
@@ -285,6 +285,38 @@ TEST_F(DependencyOptimizerTest, RemoveNoOps_DeviceBoundaries) {
   VerifyGraphsEqual(item.graph, output, __FUNCTION__);
 }
 
+TEST_F(DependencyOptimizerTest, RemoveIdentityOps_DeviceBoundaries) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output x = ops::RandomUniform(s.WithOpName("x").WithDevice("/CPU:0"), {1, 2},
+                                DT_FLOAT);
+  Output y = ops::RandomUniform(s.WithOpName("y").WithDevice("/CPU:0"), {1, 2},
+                                DT_FLOAT);
+  // Identity with a single input- and two output dependencies.
+  auto id_a = ops::Identity(s.WithOpName("id_a").WithDevice("/CPU:1"), x);
+  // Identity with a two input- and a single output dependency.
+  auto id_b = ops::Identity(
+      s.WithOpName("id_b").WithControlDependencies(y).WithDevice("/CPU:0"), x);
+
+  Output id =
+      ops::Identity(s.WithControlDependencies(id_a).WithDevice("/CPU:1"), id_b);
+  Output id_1 = ops::Identity(s.WithDevice("/CPU:1"), id_a);
+
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  item.fetch.push_back("Identity");
+  item.fetch.push_back("Identity_1");
+
+  DependencyOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+
+  // The optimization should be disabled to prevent increasing the number of
+  // nodes crossing device boundaries.
+  TF_CHECK_OK(TopologicalSort(&item.graph));
+  VerifyGraphsEqual(item.graph, output, __FUNCTION__);
+}
+
 TEST_F(DependencyOptimizerTest, RemoveNoOps_SingleInputOrOutput) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
   Output x = ops::RandomUniform(s.WithOpName("x"), {1, 2}, DT_FLOAT);
@@ -646,6 +678,50 @@ TEST_F(DependencyOptimizerTest, Identity_DeviceCrossing_ConsumerOnSameDevice) {
   }
 }
 
+TEST_F(DependencyOptimizerTest, RemoveGreaterEqualWithNoOp) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output x = ops::Placeholder(s.WithOpName("x"), DT_FLOAT,
+                              ops::Placeholder::Shape({}));
+  Output y = ops::Placeholder(s.WithOpName("y"), DT_FLOAT,
+                              ops::Placeholder::Shape({}));
+  auto greaterequal = ops::GreaterEqual(s.WithOpName("GreaterEqual"), x, y);
+  auto noop =
+      ops::NoOp(s.WithOpName("NoOp").WithControlDependencies(greaterequal));
+  Output add = ops::Add(
+      s.WithOpName("z").WithControlDependencies({noop.operation}), x, y);
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  DependencyOptimizer optimizer;
+  GraphDef output;
+  item.fetch.push_back("z");
+  TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output));
+
+  int count = 0;
+  for (const NodeDef& node : output.node()) {
+    if (node.name() == "x") {
+      count++;
+      EXPECT_EQ("Placeholder", node.op());
+      EXPECT_EQ(0, node.input_size());
+    } else if (node.name() == "y") {
+      count++;
+      EXPECT_EQ("Placeholder", node.op());
+      EXPECT_EQ(0, node.input_size());
+    } else if (node.name() == "GreaterEqual") {
+      count++;
+    } else if (node.name() == "NoOp") {
+      count++;
+    } else if (node.name() == "z") {
+      count++;
+      EXPECT_EQ("Add", node.op());
+      EXPECT_EQ(2, node.input_size());
+      EXPECT_EQ("x", node.input(0));
+      EXPECT_EQ("y", node.input(1));
+    }
+  }
+  EXPECT_EQ(3, count);
+}
+
 }  // namespace
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/function_optimizer.cc b/tensorflow/core/grappler/optimizers/function_optimizer.cc
index 2a6b8a325f223f1202203c880786d0cf015a496d..343c89a9da8fc32a985dc36b23280f097ba3f48e 100644
--- a/tensorflow/core/grappler/optimizers/function_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/function_optimizer.cc
@@ -32,16 +32,129 @@ limitations under the License.
 
 namespace tensorflow {
 namespace grappler {
+namespace {
+
+class FunctionInliningContext {
+ public:
+  explicit FunctionInliningContext(const GrapplerItem& item,
+                                   RewriterConfig::Toggle opt_level)
+      : library_(&item.graph.library()),
+        opt_level_(opt_level),
+        functions_(InliningCandidates(item)) {}
+
+  const FunctionDefLibrary& Library() const { return *library_; }
+
+  bool HasInlinedFunctions() const { return !functions_.empty(); }
+
+  // Find inlining candidate by name. Return nullptr if not found.
+  const FunctionDef* FindInlinedFunction(const string& name) const {
+    auto it = functions_.find(name);
+    if (it != functions_.end()) {
+      return it->second;
+    } else {
+      return nullptr;
+    }
+  }
+
+ private:
+  std::unordered_map<string, const FunctionDef*> InliningCandidates(
+      const GrapplerItem& item) const {
+    std::unordered_map<string, const FunctionDef*> functions;
+    for (const FunctionDef& func : item.graph.library().function()) {
+      // Don't inline functions marked as noinline
+      if (func.attr().count("_noinline") != 0 &&
+          func.attr().at("_noinline").b() &&
+          opt_level_ != RewriterConfig::AGGRESSIVE) {
+        continue;
+      }
+      // Can't create IdentityN nodes with no input or output: skip these
+      // functions for now.
+      if (func.signature().input_arg_size() == 0 ||
+          func.signature().output_arg_size() == 0) {
+        continue;
+      }
+      functions[func.signature().name()] = &func;
+    }
+    return functions;
+  }
+
+  const FunctionDefLibrary* library_;
+  RewriterConfig::Toggle opt_level_;
+  std::unordered_map<string, const FunctionDef*> functions_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(FunctionInliningContext);
+};
+
+// Copy input/output argument type to the type_list. Return error if argument
+// type is not explicitly defined, and not specified in function attributes.
+Status CopyArgType(const NodeDef& func_node,
+                   const std::unordered_map<string, AttrValue>& func_attr,
+                   const string& arg_kind, const OpDef::ArgDef& arg,
+                   AttrValue::ListValue* type_list) {
+  if (arg.type() != DT_INVALID) {
+    type_list->add_type(arg.type());
+  } else {
+    auto it = func_attr.find(arg.type_attr());
+    if (it == func_attr.end() || it->second.type() == DT_INVALID) {
+      return errors::InvalidArgument(
+          "Invalid ", arg_kind, " argument ", arg.name(), " for function ",
+          func_node.op(), " instantiated by ", func_node.name());
+    }
+    type_list->add_type(it->second.type());
+  }
+  return Status::OK();
+}
+
+// Add an IdentityN op to hook the function inputs to: this ensures that
+// they're all evaluated before the evaluation of the function body starts.
+Status HookInlinedFunctionInputs(
+    const NodeDef& func_node, const FunctionDef& func,
+    const std::unordered_map<string, AttrValue>& func_attr, NodeDef* inputs) {
+  inputs->set_name(strings::StrCat(func_node.name(), "/", "inlined_inputs"));
+  inputs->set_op("IdentityN");
+  inputs->set_device(func_node.device());
+  *inputs->mutable_input() = func_node.input();
+  AttrValue::ListValue* type_list =
+      (*inputs->mutable_attr())["T"].mutable_list();
+  for (const OpDef::ArgDef& arg : func.signature().input_arg()) {
+    TF_RETURN_IF_ERROR(
+        CopyArgType(func_node, func_attr, "input", arg, type_list));
+  }
+  return Status::OK();
+}
+
+// Add an IdentityN op to hook the function outputs to: this ensures that the
+// function body is fully evaluated before its fanout gets scheduled.
+Status HookInlinedFunctionOutputs(
+    const NodeDef& func_node, const FunctionDef& func,
+    const std::unordered_map<string, AttrValue>& func_attr,
+    const gtl::ArraySlice<string> fetch, NodeDef* outputs) {
+  outputs->set_name(func_node.name());
+  outputs->set_op("IdentityN");
+  outputs->set_device(func_node.device());
+  AttrValue::ListValue* type_list =
+      (*outputs->mutable_attr())["T"].mutable_list();
+  for (int i = 0; i < func.signature().output_arg_size(); ++i) {
+    const OpDef::ArgDef& arg = func.signature().output_arg(i);
+    TF_RETURN_IF_ERROR(
+        CopyArgType(func_node, func_attr, "output", arg, type_list));
+    // Use the fetch names since they take into account the output mapping.
+    outputs->add_input(strings::StrCat(func_node.name(), "/", fetch[i]));
+  }
+  return Status::OK();
+}
+
+Status InlineFunction(const NodeDef& func_node, const FunctionDef& func,
+                      const FunctionInliningContext& ctx,
+                      GraphDef* optimized_graph) {
+  const std::unordered_map<string, AttrValue> func_attr(
+      func_node.attr().begin(), func_node.attr().end());
 
-Status InlineFunction(const NodeDef& node, const FunctionDef& func,
-                      const FunctionDefLibrary& library, GraphDef* graph) {
-  const std::unordered_map<string, AttrValue> attr(node.attr().begin(),
-                                                   node.attr().end());
   std::unique_ptr<GrapplerItem> item =
-      GrapplerItemFromFunctionDef(func, attr, library);
+      GrapplerItemFromFunctionDef(func, func_attr, ctx.Library());
   if (!item) {
-    return errors::InvalidArgument("Failed to inline function ", node.op(),
-                                   " instantiated by ", node.name());
+    return errors::InvalidArgument("Failed to inline function ", func_node.op(),
+                                   " instantiated by ", func_node.name());
   }
 
   std::unordered_map<string, int> input_nodes;
@@ -50,43 +163,25 @@ Status InlineFunction(const NodeDef& node, const FunctionDef& func,
     input_nodes[arg.name()] = i;
   }
 
-  // Add an IdentityN op to hook the function inputs to: this ensures that
-  // they're all evaluated before the evaluation of the function body starts.
-  NodeDef* func_inputs = graph->add_node();
-  func_inputs->set_name(strings::StrCat(node.name(), "/", "inlined_inputs"));
-  func_inputs->set_op("IdentityN");
-  func_inputs->set_device(node.device());
-  *func_inputs->mutable_input() = node.input();
-  AttrValue::ListValue* type_list =
-      (*func_inputs->mutable_attr())["T"].mutable_list();
-  for (const OpDef::ArgDef& arg : func.signature().input_arg()) {
-    if (arg.type() != DT_INVALID) {
-      type_list->add_type(arg.type());
-    } else {
-      auto it = attr.find(arg.type_attr());
-      if (it == attr.end()) {
-        return errors::InvalidArgument("Invalid input argument ", arg.name(),
-                                       " for function ", node.op(),
-                                       " instantiated by ", node.name());
-      }
-      type_list->add_type(it->second.type());
-    }
-  }
+  // Hook inlined function inputs to IdentityN node
+  NodeDef* func_inputs = optimized_graph->add_node();
+  TF_RETURN_IF_ERROR(
+      HookInlinedFunctionInputs(func_node, func, func_attr, func_inputs));
 
   for (NodeDef& func_body_node : *item->graph.mutable_node()) {
     if (input_nodes.find(func_body_node.name()) != input_nodes.end()) {
+      CHECK_EQ(0, func_body_node.input_size());
       // Turn input placeholders into identity nodes
       if (IsPlaceholder(func_body_node)) {
         func_body_node.set_op("Identity");
       }
-      CHECK_EQ(0, func_body_node.input_size());
       int input_id = input_nodes[func_body_node.name()];
       func_body_node.add_input(
           strings::StrCat(func_inputs->name(), ":", input_id));
     } else {
       // Update the input names if any.
       for (string& input : *func_body_node.mutable_input()) {
-        input = AddPrefixToNodeName(input, node.name());
+        input = AddPrefixToNodeName(input, /*prefix=*/func_node.name());
       }
       // If the node has no input, make hook it up to the func_inputs node to
       // ensure it runs in the same frame as the other nodes of the function
@@ -98,39 +193,33 @@ Status InlineFunction(const NodeDef& node, const FunctionDef& func,
 
     // Add the node name as a prefix to avoid collisions after inlining
     func_body_node.set_name(
-        strings::StrCat(node.name(), "/", func_body_node.name()));
+        strings::StrCat(func_node.name(), "/", func_body_node.name()));
 
     // Make sure the node is placed
-    func_body_node.set_device(node.device());
-
-    // Move the node to the main graph
-    graph->add_node()->Swap(&func_body_node);
-  }
-
-  // Add an IdentityN op to hook the function outputs to: this ensures that the
-  // function body is fully evaluated before its fanout gets scheduled.
-  NodeDef* func_outputs = graph->add_node();
-  func_outputs->set_name(node.name());
-  func_outputs->set_op("IdentityN");
-  func_outputs->set_device(node.device());
-  type_list = (*func_outputs->mutable_attr())["T"].mutable_list();
-  for (int i = 0; i < func.signature().output_arg_size(); ++i) {
-    const OpDef::ArgDef& arg = func.signature().output_arg(i);
-    if (arg.type() != DT_INVALID) {
-      type_list->add_type(arg.type());
+    func_body_node.set_device(func_node.device());
+
+    // Check if a body node is itself a function
+    const FunctionDef* func_body_node_func =
+        ctx.FindInlinedFunction(func_body_node.op());
+    if (func_body_node_func != nullptr) {
+      // Recursively inline function calls
+      TF_RETURN_IF_ERROR(InlineFunction(func_body_node, *func_body_node_func,
+                                        ctx, optimized_graph));
     } else {
-      auto it = attr.find(arg.type_attr());
-      if (it == attr.end()) {
-        return errors::InvalidArgument("Invalid output argument ", arg.name(),
-                                       " for function ", node.op(),
-                                       " instantiated by ", node.name());
+      // Annotate the node with the function attributes.
+      for (const auto& attr : func.attr()) {
+        func_body_node.mutable_attr()->insert(attr);
       }
-      type_list->add_type(it->second.type());
+      // Move the node to the main graph
+      optimized_graph->add_node()->Swap(&func_body_node);
     }
-    // Use the fetch names since they take into account the output mapping.
-    func_outputs->add_input(strings::StrCat(node.name(), "/", item->fetch[i]));
   }
 
+  // Hook inlined function outputs to IdentityN node
+  NodeDef* func_outputs = optimized_graph->add_node();
+  TF_RETURN_IF_ERROR(HookInlinedFunctionOutputs(func_node, func, func_attr,
+                                                item->fetch, func_outputs));
+
   return Status::OK();
 }
 
@@ -278,31 +367,14 @@ Status InlineSymbolicGradient(const NodeDef& node, SymbolicGradientEnv* env,
   return Status::OK();
 }
 
+}  // namespace
+
 Status FunctionOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
                                    GraphDef* optimized_graph) {
-  std::unordered_map<string, const FunctionDef*> functions;
-  for (const FunctionDef& func : item.graph.library().function()) {
-    // Don't inline functions marked as noinline
-    if (func.attr().count("_noinline") != 0) {
-      continue;
-    }
-    // Don't touch anything marked XLA to prevent XLA failures further down the
-    // road.
-    if (func.attr().count("_XlaCompile") > 0 &&
-        func.attr().at("_XlaCompile").b()) {
-      continue;
-    }
-    // Can't create IdentityN nodes with no input or output: skip these
-    // functions for now.
-    if (func.signature().input_arg_size() == 0 ||
-        func.signature().output_arg_size() == 0) {
-      continue;
-    }
-    functions[func.signature().name()] = &func;
-  }
+  FunctionInliningContext function_inlining_ctx(item, opt_level_);
 
-  // Nothing to do.
-  if (functions.empty()) {
+  // Nothing to do here.
+  if (!function_inlining_ctx.HasInlinedFunctions()) {
     *optimized_graph = item.graph;
     return Status::OK();
   }
@@ -315,12 +387,14 @@ Status FunctionOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
       TF_RETURN_IF_ERROR(InlineSymbolicGradient(node, &env, optimized_graph));
       continue;
     }
-    auto it = functions.find(node.op());
-    if (it == functions.end()) {
-      *optimized_graph->add_node() = node;
+
+    const FunctionDef* func =
+        function_inlining_ctx.FindInlinedFunction(node.op());
+    if (func != nullptr) {
+      TF_RETURN_IF_ERROR(
+          InlineFunction(node, *func, function_inlining_ctx, optimized_graph));
     } else {
-      TF_RETURN_IF_ERROR(InlineFunction(node, *it->second, item.graph.library(),
-                                        optimized_graph));
+      *optimized_graph->add_node() = node;
     }
   }
 
diff --git a/tensorflow/core/grappler/optimizers/function_optimizer.h b/tensorflow/core/grappler/optimizers/function_optimizer.h
index 41444e467364f83e7627477a7651203100e47d8a..b124efe01dbc697b84a164fb33c57a9f5071db5c 100644
--- a/tensorflow/core/grappler/optimizers/function_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/function_optimizer.h
@@ -26,7 +26,7 @@ namespace grappler {
 // operations to make the overall graph more efficient.
 class FunctionOptimizer : public GraphOptimizer {
  public:
-  FunctionOptimizer(RewriterConfig::Toggle opt_level) {}
+  FunctionOptimizer(RewriterConfig::Toggle opt_level) : opt_level_(opt_level) {}
   ~FunctionOptimizer() override {}
 
   string name() const override { return "function_optimizer"; };
@@ -36,6 +36,9 @@ class FunctionOptimizer : public GraphOptimizer {
 
   void Feedback(Cluster* cluster, const GrapplerItem& item,
                 const GraphDef& optimized_graph, double result) override;
+
+ private:
+  RewriterConfig::Toggle opt_level_;
 };
 
 }  // end namespace grappler
diff --git a/tensorflow/core/grappler/optimizers/function_optimizer_test.cc b/tensorflow/core/grappler/optimizers/function_optimizer_test.cc
index 52a1118080ab4e0529330514258ba1b498f4dbbb..fe26a56fc214d65a617310b158e8ead55b37469f 100644
--- a/tensorflow/core/grappler/optimizers/function_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/function_optimizer_test.cc
@@ -26,7 +26,22 @@ namespace tensorflow {
 namespace grappler {
 namespace {
 
-class FunctionOptimizerTest : public GrapplerTest {};
+constexpr char kDevice[] = "/device:CPU:0";
+
+class FunctionOptimizerTest : public GrapplerTest {
+ protected:
+  Tensor MakeScalarTensor(float value) {
+    Tensor tensor(DT_FLOAT, {});
+    tensor.scalar<float>()() = value;
+    return tensor;
+  }
+
+  Tensor MakeScalarTensor(int value) {
+    Tensor tensor(DT_INT32, {});
+    tensor.scalar<int>()() = value;
+    return tensor;
+  }
+};
 
 TEST_F(FunctionOptimizerTest, SimpleFunction) {
   // Build a graph to compute y = XTimesTwo(x)
@@ -94,9 +109,8 @@ TEST_F(FunctionOptimizerTest, SimpleFunction) {
   }
   EXPECT_EQ(7, count);
 
+  Tensor pi = MakeScalarTensor(3.14f);
   item.fetch = {"z"};
-  Tensor pi(DT_FLOAT, {});
-  pi.flat<float>()(0) = 3.14f;
   item.feed.emplace_back("x", pi);
   auto tensors_expected = EvaluateFetchNodes(item);
   GrapplerItem optimized(item, std::move(output));
@@ -183,9 +197,8 @@ TEST_F(FunctionOptimizerTest, FixedTypeFunction) {
   }
   EXPECT_EQ(6, count);
 
+  Tensor pi = MakeScalarTensor(3.14f);
   item.fetch = {"z"};
-  Tensor pi(DT_FLOAT, {});
-  pi.flat<float>()(0) = 3.14f;
   item.feed.emplace_back("x", pi);
   auto tensors_expected = EvaluateFetchNodes(item);
   GrapplerItem optimized(item, std::move(output));
@@ -268,9 +281,8 @@ TEST_F(FunctionOptimizerTest, FunctionWithOutputMapping) {
   }
   EXPECT_EQ(6, count);
 
+  Tensor pi = MakeScalarTensor(3.14f);
   item.fetch = {"z"};
-  Tensor pi(DT_FLOAT, {});
-  pi.flat<float>()(0) = 3.14f;
   item.feed.emplace_back("x", pi);
   auto tensors_expected = EvaluateFetchNodes(item);
   GrapplerItem optimized(item, std::move(output));
@@ -325,18 +337,11 @@ TEST_F(FunctionOptimizerTest, FunctionWithInputForwarding) {
   TF_EXPECT_OK(status);
 
   item.fetch = {"z0", "z1", "z2"};
-  Tensor in(DT_FLOAT, {});
-  in.flat<float>()(0) = 3.14f;
-  item.feed.emplace_back("x0", in);
-  in.flat<float>()(0) = 2.7f;
-  item.feed.emplace_back("x1", in);
-  in.flat<float>()(0) = 1.0f;
-  item.feed.emplace_back("x2", in);
-  in.flat<float>()(0) = -1.0f;
-  item.feed.emplace_back("x4", in);
-  Tensor in_int(DT_INT32, {});
-  in_int.flat<int>()(0) = 1234;
-  item.feed.emplace_back("x3", in_int);
+  item.feed.emplace_back("x0", MakeScalarTensor(3.14f));
+  item.feed.emplace_back("x1", MakeScalarTensor(2.7f));
+  item.feed.emplace_back("x2", MakeScalarTensor(1.0f));
+  item.feed.emplace_back("x4", MakeScalarTensor(-1.0f));
+  item.feed.emplace_back("x3", MakeScalarTensor(1234));
   auto tensors_expected = EvaluateFetchNodes(item);
   GrapplerItem optimized(item, std::move(output));
   auto tensors = EvaluateFetchNodes(optimized);
@@ -379,6 +384,100 @@ TEST_F(FunctionOptimizerTest, FunctionWithoutInput) {
   EXPECT_EQ(item.graph.DebugString(), output.DebugString());
 }
 
+TEST_F(FunctionOptimizerTest, InlineFunctionWithNestedFunctionCall) {
+  // Define square via function library:
+  //   MySquare(x) = MyMul(x, x)
+
+  FunctionDef mul_func = FunctionDefHelper::Create(
+      "MyMul", {"x:T", "y:T"}, {"z:T"}, {"T: {float, double}"},
+      {{{"output"}, "Mul", {"x", "y"}, {{"T", "$T"}}}},
+      /* Mapping between function returns and function node outputs. */
+      {{"z", "output:z:0"}});
+
+  FunctionDef square_func = FunctionDefHelper::Create(
+      "MySquare", {"x:T"}, {"z:T"}, {"T: {float, double}"},
+      {{{"output"}, "MyMul", {"x", "x"}, {{"T", "$T"}}}},
+      /* Mapping between function returns and function node outputs. */
+      {{"z", "output:z:0"}});
+
+  GrapplerItem item;
+  item.graph = test::function::GDef(
+      {test::function::NDef("a", "Placeholder", {}, {{"dtype", DT_FLOAT}},
+                            kDevice),
+       test::function::NDef("square", "MySquare", {"a"}, {{"T", DT_FLOAT}},
+                            kDevice),
+       test::function::NDef("outputs", "Identity", {"square:0"},
+                            {{"T", DT_FLOAT}}, kDevice)},
+      // FunctionLib
+      {mul_func, square_func});
+
+  GraphDef output;
+  FunctionOptimizer optimizer(RewriterConfig::ON);
+  TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output));
+
+  int count = 0;
+  for (const NodeDef& node : output.node()) {
+    if (node.name() == "square/inlined_inputs" && count++) {
+      EXPECT_EQ("IdentityN", node.op());
+      EXPECT_EQ(kDevice, node.device());
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("a", node.input(0));
+    } else if (node.name() == "square/x" && count++) {
+      EXPECT_EQ("Identity", node.op());
+      EXPECT_EQ(kDevice, node.device());
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("square/inlined_inputs:0", node.input(0));
+    } else if (node.name() == "square/output/inlined_inputs" && count++) {
+      EXPECT_EQ("IdentityN", node.op());
+      EXPECT_EQ(kDevice, node.device());
+      EXPECT_EQ(2, node.input_size());
+      EXPECT_EQ("square/x", node.input(0));
+      EXPECT_EQ("square/x", node.input(1));
+    } else if (node.name() == "square/output/x" && count++) {
+      EXPECT_EQ("Identity", node.op());
+      EXPECT_EQ(kDevice, node.device());
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("square/output/inlined_inputs:0", node.input(0));
+    } else if (node.name() == "square/output/y" && count++) {
+      EXPECT_EQ("Identity", node.op());
+      EXPECT_EQ(kDevice, node.device());
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("square/output/inlined_inputs:1", node.input(0));
+    } else if (node.name() == "square/output/output" && count++) {
+      EXPECT_EQ("Mul", node.op());
+      EXPECT_EQ(kDevice, node.device());
+      EXPECT_EQ(2, node.input_size());
+      EXPECT_EQ("square/output/x", node.input(0));
+      EXPECT_EQ("square/output/y", node.input(1));
+    } else if (node.name() == "square/output" && count++) {
+      EXPECT_EQ("IdentityN", node.op());
+      EXPECT_EQ(kDevice, node.device());
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("square/output/output:0", node.input(0));
+    } else if (node.name() == "square" && count++) {
+      EXPECT_EQ("IdentityN", node.op());
+      EXPECT_EQ(kDevice, node.device());
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("square/output:0", node.input(0));
+    } else if (node.name() == "outputs" && count++) {
+      EXPECT_EQ("Identity", node.op());
+      EXPECT_EQ(kDevice, node.device());
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("square:0", node.input(0));
+    }
+  }
+  EXPECT_EQ(9, count);
+
+  item.fetch = {"outputs"};
+  item.feed.emplace_back("a", MakeScalarTensor(2.0f));
+  auto tensors_expected = EvaluateFetchNodes(item);
+
+  GrapplerItem optimized(item, std::move(output));
+  auto tensors = EvaluateFetchNodes(optimized);
+
+  test::ExpectTensorEqual<float>(tensors_expected[0], tensors[0]);
+}
+
 TEST_F(FunctionOptimizerTest, SymbolicGradients) {
   tensorflow::Scope scope = tensorflow::Scope::NewRootScope();
 
@@ -409,13 +508,14 @@ TEST_F(FunctionOptimizerTest, SymbolicGradients) {
   TF_EXPECT_OK(scope.ToGraphDef(&item.graph));
   *item.graph.mutable_library()->add_function() = func;
 
-  FunctionOptimizer optimizer(RewriterConfig::AGGRESSIVE);
+  FunctionOptimizer optimizer(RewriterConfig::ON);
   GraphDef output;
   Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
 
-  std::vector<Tensor> expected = EvaluateNodes(item.graph, {"out1", "out2"});
-  std::vector<Tensor> optimized = EvaluateNodes(output, {"out1", "out2"});
+  std::vector<Tensor> expected =
+      EvaluateNodes(item.graph, {"out1", "out2"}, {});
+  std::vector<Tensor> optimized = EvaluateNodes(output, {"out1", "out2"}, {});
   test::ExpectTensorEqual<float>(expected[0], optimized[0]);
   test::ExpectTensorEqual<float>(expected[1], optimized[1]);
 }
@@ -450,7 +550,7 @@ TEST_F(FunctionOptimizerTest, SymbolicGradientsIdentity) {
   TF_EXPECT_OK(scope.ToGraphDef(&item.graph));
   *item.graph.mutable_library()->add_function() = func;
 
-  FunctionOptimizer optimizer(RewriterConfig::AGGRESSIVE);
+  FunctionOptimizer optimizer(RewriterConfig::ON);
   GraphDef output;
   Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
@@ -478,8 +578,8 @@ TEST_F(FunctionOptimizerTest, SymbolicGradientsIdentity) {
     EXPECT_EQ("Identity", output.node(i).op());
   }
 
-  std::vector<Tensor> expected = EvaluateNodes(item.graph, {"out"});
-  std::vector<Tensor> optimized = EvaluateNodes(output, {"out"});
+  std::vector<Tensor> expected = EvaluateNodes(item.graph, {"out"}, {});
+  std::vector<Tensor> optimized = EvaluateNodes(output, {"out"}, {});
   test::ExpectTensorEqual<float>(expected[0], optimized[0]);
 }
 
@@ -513,7 +613,7 @@ TEST_F(FunctionOptimizerTest, SymbolicGradientsNoInlineFunc) {
   TF_EXPECT_OK(scope.ToGraphDef(&item.graph));
   *item.graph.mutable_library()->add_function() = func;
 
-  FunctionOptimizer optimizer(RewriterConfig::AGGRESSIVE);
+  FunctionOptimizer optimizer(RewriterConfig::ON);
   GraphDef output;
   Status status = optimizer.Optimize(nullptr, item, &output);
   // The optimizer should succeed but the graphs should be the same.
diff --git a/tensorflow/core/grappler/optimizers/graph_optimizer_stage.cc b/tensorflow/core/grappler/optimizers/graph_optimizer_stage.cc
index 7044705adee7c1be52b04e6556066546b17f944f..1ea57f7b4f003e8a98fe187f6325e39ebe30e9e7 100644
--- a/tensorflow/core/grappler/optimizers/graph_optimizer_stage.cc
+++ b/tensorflow/core/grappler/optimizers/graph_optimizer_stage.cc
@@ -42,6 +42,10 @@ Status GetInputNode(const GraphOptimizerContext& ctx, const string& input,
 Status GetTensorProperties(const GraphOptimizerContext& ctx,
                            const string& tensor,
                            OpInfo::TensorProperties* properties) {
+  if (ctx.graph_properties == nullptr) {
+    return errors::InvalidArgument("Graph properties are unknown.");
+  }
+
   int port;
   string tensor_node_name = ParseNodeName(tensor, &port);
   if (port < 0) {
diff --git a/tensorflow/core/grappler/optimizers/graph_optimizer_stage.h b/tensorflow/core/grappler/optimizers/graph_optimizer_stage.h
index be95c00d2dae30d4419cc45eec9f2f4855daeadd..072f7729466ddcee68dc2c9ca0b27ee4b97b18c7 100644
--- a/tensorflow/core/grappler/optimizers/graph_optimizer_stage.h
+++ b/tensorflow/core/grappler/optimizers/graph_optimizer_stage.h
@@ -21,7 +21,6 @@ limitations under the License.
 #include "tensorflow/core/grappler/costs/graph_properties.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/utils.h"
-#include "tensorflow/core/grappler/utils/frame.h"
 
 namespace tensorflow {
 namespace grappler {
@@ -45,21 +44,16 @@ const NodeScopeAndName ParseNodeScopeAndName(const string& node_name);
 struct GraphOptimizerContext {
   GraphOptimizerContext(const std::unordered_set<string>* nodes_to_preserve,
                         GraphDef* optimized_graph,
-                        GraphProperties* graph_properties, NodeMap* node_map,
-                        FrameMap* frame_map)
+                        GraphProperties* graph_properties, NodeMap* node_map)
       : nodes_to_preserve(nodes_to_preserve),
         optimized_graph(optimized_graph),
         graph_properties(graph_properties),
-        node_map(node_map),
-        frame_map(frame_map) {}
+        node_map(node_map) {}
 
   const std::unordered_set<string>* nodes_to_preserve;
   GraphDef* optimized_graph;
   GraphProperties* graph_properties;
   NodeMap* node_map;
-  // TODO(ezhulenev): it seems that frame_map is only relevant for loop
-  // optimizer? Move it to loop-optimizer specific context extension.
-  FrameMap* frame_map;
 };
 
 Status GetInputNode(const GraphOptimizerContext& ctx, const string& input,
@@ -117,6 +111,9 @@ class GraphOptimizerStage {
       : optimizer_name_(optimizer_name), stage_name_(stage_name), ctx_(ctx) {}
   virtual ~GraphOptimizerStage() = default;
 
+  const string& stage_name() const { return stage_name_; }
+  const string& optimizer_name() const { return optimizer_name_; }
+
   // Check if we should try to simplify node. Returning true doesn't
   // guarantee that node will be simplified.
   //
@@ -137,6 +134,18 @@ class GraphOptimizerStage {
   // and remove template parameter.
   virtual Status TrySimplify(NodeDef* node, Result* result) = 0;
 
+  // Return InvalidArgumentError if node is not supported by the optimizer
+  // stage.
+  // TODO(ezhulenev): make this check part of non-virtual public API
+  // (TrySimplify), and make virtual implementation protected.
+  Status EnsureNodeIsSupported(const NodeDef* node) const {
+    return IsSupported(node)
+               ? Status::OK()
+               : errors::InvalidArgument(
+                     "Node ", node->name(), " is not supported by optimizer ",
+                     optimizer_name_, " and stage ", stage_name_);
+  }
+
   // Get a name for a new node, created by this stage, based on one or multiple
   // nodes of an original graph.
   const string OptimizedNodeName(const NodeScopeAndName& node) const {
@@ -179,6 +188,64 @@ class GraphOptimizerStage {
   const GraphOptimizerContext ctx_;
 };
 
+template <typename Result>
+class GraphOptimizerStagePipeline {
+ public:
+  // Break predicate specifies if a pipeline should stop early, and not pass
+  // a node to the next registered optimizer stage, typically that should be the
+  // case when a stage successfully optimized a node, and it wants to yield
+  // control to the optimizer.
+  explicit GraphOptimizerStagePipeline(
+      const std::function<bool(const Result&)> break_predicate)
+      : break_predicate_(break_predicate) {}
+
+  // Add a stage to the pipeline. It should be called with the arguments for the
+  // stage constructor:
+  //
+  //   pipeline.AddStage<FooStage>(constructor_arg1, constructor_arg2);
+  //
+  // Returns a reference to the added stage.
+  template <typename T, typename... Args>
+  T& AddStage(Args&&... args) {
+    auto stage = new T(std::forward<Args>(args)...);
+    stages_.push_back(std::unique_ptr<T>(stage));
+    return *stage;
+  }
+
+  // Pass a node through all registered optimizer stages, until break predicate
+  // is true.
+  //
+  // Return true, if pipeline exited after a break predicate was evaluated as
+  // 'true', which typically means that a node was optimized by one of the
+  // registered stages.
+  //
+  // Return false, if node was not optimized by any of registered stages.
+  bool PassThroughAllStages(NodeDef* node, Result* result) {
+    for (auto& stage : stages_) {
+      if (stage->IsSupported(node)) {
+        const Status stage_status = stage->TrySimplify(node, result);
+        // Each stage must be "error safe" (just like exception safe). In
+        // case of any error it must leave optimized graph unmodified.
+        if (!stage_status.ok()) {
+          LOG(WARNING) << "Failed to run optimizer " << stage->optimizer_name()
+                       << ", stage " << stage->stage_name()
+                       << ". Error: " << stage_status.error_message();
+        }
+        if (break_predicate_(*result)) return true;
+      }
+    }
+    return false;
+  }
+
+  std::size_t NumStages() { return stages_.size(); }
+
+ private:
+  std::vector<std::unique_ptr<GraphOptimizerStage<Result>>> stages_;
+  std::function<bool(const Result&)> break_predicate_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(GraphOptimizerStagePipeline);
+};
+
 }  // end namespace grappler
 }  // end namespace tensorflow
 
diff --git a/tensorflow/core/grappler/optimizers/graph_optimizer_stage_test.cc b/tensorflow/core/grappler/optimizers/graph_optimizer_stage_test.cc
index 416327e6228431edbf0389f6135cdd028fda45dc..3f5ab87a5a372a0dc954aa5a9ae57241635d5594 100644
--- a/tensorflow/core/grappler/optimizers/graph_optimizer_stage_test.cc
+++ b/tensorflow/core/grappler/optimizers/graph_optimizer_stage_test.cc
@@ -58,8 +58,8 @@ TEST_F(GraphOptimizerStageTest, ParseNodeNameAndScope_InScope) {
 TEST_F(GraphOptimizerStageTest, OptimizedNodeName) {
   GraphOptimizerContext ctx(/*nodes_to_preserve*/ nullptr,
                             /*optimized_graph*/ nullptr,
-                            /*graph_properties*/ nullptr, /*node_name*/ nullptr,
-                            /*frame_map*/ nullptr);
+                            /*graph_properties*/ nullptr,
+                            /*node_name*/ nullptr);
   FakeOptimizerStage stage("my_opt", "my_stg", ctx);
 
   const auto node = ParseNodeScopeAndName("a/b/c/Add");
@@ -94,8 +94,7 @@ TEST_F(GraphOptimizerStageTest, GetInputNodeAndProperties) {
   GraphOptimizerContext ctx(/*nodes_to_preserve*/ nullptr,
                             /*optimized_graph*/ &item.graph,
                             /*graph_properties*/ &properties,
-                            /*node_name*/ &node_map,
-                            /*frame_map*/ nullptr);
+                            /*node_name*/ &node_map);
   FakeOptimizerStage stage("my_opt", "my_stg", ctx);
 
   NodeDef* add_node;
@@ -134,8 +133,7 @@ TEST_F(GraphOptimizerStageTest, AddNodes) {
   GraphOptimizerContext ctx(/*nodes_to_preserve*/ nullptr,
                             /*optimized_graph*/ &item.graph,
                             /*graph_properties*/ &properties,
-                            /*node_name*/ &node_map,
-                            /*frame_map*/ nullptr);
+                            /*node_name*/ &node_map);
   FakeOptimizerStage stage("my_opt", "my_stg", ctx);
 
   NodeDef* add_node;
@@ -165,4 +163,4 @@ TEST_F(GraphOptimizerStageTest, AddNodes) {
 
 }  // namespace
 }  // end namespace grappler
-}  // end namespace tensorflow
\ No newline at end of file
+}  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/layout_optimizer.cc b/tensorflow/core/grappler/optimizers/layout_optimizer.cc
index 18e63f823b0ca6ee1011e8e98c43e46ad07c2995..8fb30d116de8f2784a7f58baea6c77d4c76e0ae9 100644
--- a/tensorflow/core/grappler/optimizers/layout_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/layout_optimizer.cc
@@ -17,9 +17,13 @@ limitations under the License.
 #include <unordered_set>
 
 #include "tensorflow/core/framework/attr_value.pb.h"
+#include "tensorflow/core/framework/memory_types.h"
 #include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor.pb.h"
 #include "tensorflow/core/framework/tensor_shape.pb.h"
+#include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/grappler/clusters/cluster.h"
 #include "tensorflow/core/grappler/devices.h"
 #include "tensorflow/core/grappler/grappler_item.h"
@@ -363,6 +367,28 @@ std::vector<int> DataInputPos(const NodeDef& node) {
   return {};
 }
 
+bool IsHostMemory(const NodeDef& node, int output_port) {
+  DeviceNameUtils::ParsedName parsed_name;
+  if (DeviceNameUtils::ParseFullName(node.device(), &parsed_name)) {
+    DeviceType device_type(parsed_name.type);
+    Status s = FindKernelDef(device_type, node, nullptr, nullptr);
+    if (s.ok()) {
+      tensorflow::MemoryTypeVector in_mtypes;
+      tensorflow::MemoryTypeVector out_mtypes;
+      s = tensorflow::MemoryTypesForNode(OpRegistry::Global(), device_type,
+                                         node, &in_mtypes, &out_mtypes);
+      if (s.ok()) {
+        if (out_mtypes[output_port] == HOST_MEMORY) {
+          return true;
+        }
+      }
+    } else {
+      return true;
+    }
+  }
+  return false;
+}
+
 class GraphProcessor {
  public:
   GraphProcessor(const GraphProperties& graph_properties,
@@ -547,8 +573,8 @@ class NodeProcessor : public GraphProcessor {
     string device;
     string not_used;
     if (DeviceNameUtils::SplitDeviceName(device_name, &not_used, &device) &&
-        (StringPiece(str_util::Lowercase(device)))
-            .contains(str_util::Lowercase(DEVICE_GPU))) {
+        str_util::StrContains(str_util::Lowercase(device),
+                              str_util::Lowercase(DEVICE_GPU))) {
       return true;
     }
     return false;
@@ -883,6 +909,24 @@ class NodeProcessor : public GraphProcessor {
     list->set_i(3, w);
   }
 
+  string MaybeGetHostDevice(const string& input_name) const {
+    string device = node_->device();
+    DeviceNameUtils::ParsedName parsed_name;
+    if (DeviceNameUtils::ParseFullName(device, &parsed_name)) {
+      if (parsed_name.type != "CPU") {
+        NodeDef* input = node_map_->GetNode(input_name);
+        int port;
+        ParseNodeName(input_name, &port);
+        if (IsHostMemory(*input, port)) {
+          parsed_name.type = "CPU";
+          parsed_name.id = 0;
+          device = DeviceNameUtils::ParsedNameToString(parsed_name);
+        }
+      }
+    }
+    return device;
+  }
+
   NodeDef* AddNodeDataFormatOp(const string& name, const string& input_name,
                                const string& op, DataType dtype,
                                bool nhwc_to_nchw) {
@@ -890,7 +934,9 @@ class NodeProcessor : public GraphProcessor {
     added_node->set_name(name);
     added_node->set_op(op);
     node_map_->AddNode(added_node->name(), added_node);
-    added_node->set_device(node_->device());
+    // The inputs of a DataFormat op could be in host memory for ops such as
+    // Reshape.
+    added_node->set_device(MaybeGetHostDevice(input_name));
     AttrValue attr_data_type;
     attr_data_type.set_type(dtype);
     added_node->mutable_attr()->insert({"T", attr_data_type});
@@ -2119,6 +2165,10 @@ Status LayoutOptimizer::Tune(const GrapplerItem& item,
 
 Status LayoutOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
                                  GraphDef* output) {
+  if (cluster == nullptr) {
+    return errors::InvalidArgument("cluster == nullptr");
+  }
+
   if (GetNumGPUs(*cluster) < 1) {
     // LayoutOptimizer is currently only tuned for GPU.
     *output = item.graph;
diff --git a/tensorflow/core/grappler/optimizers/layout_optimizer_test.cc b/tensorflow/core/grappler/optimizers/layout_optimizer_test.cc
index 1c912fcaa251c576308a983ef351319053423a85..b913f2b00413e0609c136850f6dfd9c85fbccd20 100644
--- a/tensorflow/core/grappler/optimizers/layout_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/layout_optimizer_test.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/grappler/clusters/virtual_cluster.h"
+#include "tensorflow/core/grappler/costs/virtual_placer.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/utils.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
@@ -35,7 +36,7 @@ class LayoutOptimizerTest : public ::testing::Test {
     DeviceProperties device_properties;
     device_properties.set_type("GPU");
     device_properties.mutable_environment()->insert({"architecture", "6"});
-    virtual_cluster_.reset(new VirtualCluster({{"/GPU:0", device_properties}}));
+    virtual_cluster_.reset(new VirtualCluster({{"/GPU:1", device_properties}}));
   }
 
   Output SimpleConv2D(tensorflow::Scope* s, int input_size, int filter_size,
@@ -158,7 +159,7 @@ class LayoutOptimizerTest : public ::testing::Test {
     return output.x_backprop;
   }
 
-  std::unique_ptr<VirtualCluster> virtual_cluster_;
+  std::unique_ptr<Cluster> virtual_cluster_;
 };
 
 TEST_F(LayoutOptimizerTest, Conv2DBackpropInput) {
@@ -1130,6 +1131,27 @@ TEST_F(LayoutOptimizerTest, LoopNoLiveLock) {
   EXPECT_EQ(mul_node->input(0),
             "Conv2D-0-0-TransposeNCHWToNHWC-LayoutOptimizer");
 }
+
+TEST_F(LayoutOptimizerTest, DevicePlacement) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto conv = SimpleConv2D(&s, 4, 2, "VALID");
+  auto shape = ops::Shape(s.WithOpName("s"), conv);
+  auto i = ops::Identity(s.WithOpName("i"), shape);
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  VirtualPlacer virtual_placer(virtual_cluster_.get());
+  for (auto& node : *item.graph.mutable_node()) {
+    string device = virtual_placer.get_canonical_device_name(node);
+    node.set_device(device);
+  }
+  LayoutOptimizer optimizer;
+  GraphDef output;
+  Status status = optimizer.Optimize(virtual_cluster_.get(), item, &output);
+  NodeMap node_map(&output);
+  auto vec_permute =
+      node_map.GetNode("s-0-0-VecPermuteNCHWToNHWC-LayoutOptimizer");
+  EXPECT_EQ(vec_permute->device(), "/device:CPU:0");
+}
 }  // namespace
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/loop_optimizer.cc b/tensorflow/core/grappler/optimizers/loop_optimizer.cc
index a063dc33816e25c560a385e188203c9ad9bfe4cd..fff06dd2acefe424b75b21904fa21494af020be8 100644
--- a/tensorflow/core/grappler/optimizers/loop_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/loop_optimizer.cc
@@ -16,18 +16,17 @@ limitations under the License.
 #include "tensorflow/core/grappler/optimizers/loop_optimizer.h"
 
 #include <algorithm>
+#include <deque>
 #include <limits>
 #include <unordered_map>
 #include <unordered_set>
 #include <vector>
-#include <deque>
 
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/grappler/costs/graph_properties.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/op_types.h"
 #include "tensorflow/core/grappler/optimizers/constant_folding.h"
@@ -46,74 +45,36 @@ namespace tensorflow {
 namespace grappler {
 namespace {
 
-std::vector<int> GetStackPushNodesToConvert(
-    const SimpleGraphView& graph_view,
-    const std::unordered_set<string>& nodes_to_preserve, int stack_node_idx) {
-  VLOG(1) << "Stack node: " << graph_view.graph()->node(stack_node_idx).name();
-  const std::unordered_set<string> op_types_to_traverse(
-      {"Stack", "StackV2", "Enter", "RefEnter", "Switch", "RefSwitch",
-       "Identity", "RefIdentity"});
-  std::vector<int> nodes_to_convert;
-  std::set<int> fanout;
-  graph_view.DepthFirstSearch(op_types_to_traverse, stack_node_idx, &fanout);
-  for (int fanout_idx : fanout) {
-    const NodeDef& fanout_node = graph_view.graph()->node(fanout_idx);
-    VLOG(1) << "Fanout " << fanout_idx << " : " << fanout_node.name();
-    if (IsStackPushOp(fanout_node)) {
-      nodes_to_convert.push_back(fanout_idx);
-    } else if (IsStackOp(fanout_node) || IsStackCloseOp(fanout_node) ||
-               op_types_to_traverse.find(fanout_node.op()) !=
-                   op_types_to_traverse.end()) {
-      continue;
-    } else if (!IsStackPopOp(fanout_node) ||
-               (!graph_view.outputs(fanout_idx).empty() ||
-                nodes_to_preserve.find(fanout_node.name()) !=
-                    nodes_to_preserve.end())) {
-      // The node is either a stack pop with consumers or something unexpected
-      // so we leave the graph alone.
-      nodes_to_convert.clear();
-      break;
-    }
-  }
-  return nodes_to_convert;
-}
+class LoopInvariantNodeMotionOptimizer {
+ public:
+  explicit LoopInvariantNodeMotionOptimizer(GraphDef* optimized_graph)
+      : optimized_graph_(optimized_graph) {}
+  virtual ~LoopInvariantNodeMotionOptimizer() = default;
+  Status Optimize();
 
-Status RemoveStackOps(const GrapplerItem& item, GraphDef* optimized_graph) {
-  const std::unordered_set<string> nodes_to_preserve = item.NodesToPreserve();
-  const GraphDef& graph = item.graph;
-  *optimized_graph = graph;
-  NodeMap node_map(optimized_graph);
-  SimpleGraphView graph_view;
-  TF_RETURN_IF_ERROR(graph_view.Initialize(graph));
-  for (int node_idx = 0; node_idx < graph.node_size(); ++node_idx) {
-    if (IsStackOp(graph.node(node_idx))) {
-      for (int push_node_idx : GetStackPushNodesToConvert(
-               graph_view, nodes_to_preserve, node_idx)) {
-        // We found push nodes without corresponding pops. Convert them to
-        // Identity passing the data through and add a control dependency from
-        // the op supplying the stack handle.
-        NodeDef* push_node = optimized_graph->mutable_node(push_node_idx);
-        VLOG(1) << "Converting " << push_node_idx << " : "
-                << push_node->DebugString();
-        if (push_node->attr().count("swap_memory") != 0) {
-          push_node->mutable_attr()->erase("swap_memory");
-        }
-        push_node->set_op("Identity");
-        push_node->mutable_input()->SwapElements(0, 1);
-        const string ctrl_dep = ConstantFolding::AddControlDependency(
-            push_node->input(1), optimized_graph, &node_map);
-        push_node->set_input(1, ctrl_dep);
-        VLOG(1) << "After converting: " << push_node->DebugString();
-      }
-    }
-  }
-  return Status::OK();
-}
+ private:
+  Status FindInvariantNodes(NodeDef* node);
+  Status RevertInvariantNodes();
+  Status MoveInvariantNodes(const int frame_id);
+  Status HandleInvariantNode(NodeDef* node, const int num_outputs,
+                             const int frame_id);
+  Status HandleConst(NodeDef* node, const int num_outputs, const int frame_id);
+  Status HandleInvariantEnter(NodeDef* node, const int num_outputs);
 
-}  // namespace
+  GraphDef* optimized_graph_;  // Not owned.
+  std::unique_ptr<NodeMap> node_map_;
+  std::map<NodeDef*, int> invariant_nodes_;
+  std::set<int> empty_set_;
+  // TODO(rmlarsen): Use vector instead of map, since frames ids are dense.
+  std::map<int, std::set<int>> frame_children_;
+  std::map<int, int> frame_parent_;
+  std::map<int, const NodeDef*> loop_cond_;
+  std::map<int, std::vector<NodeDef*>> invariant_enters_;
+  int new_enter_id_;
+};
 
-Status LoopOptimizer::LINMHandleInvariantEnter(NodeDef* node,
-                                               const int num_outputs) {
+Status LoopInvariantNodeMotionOptimizer::HandleInvariantEnter(
+    NodeDef* node, const int num_outputs) {
   auto consumers = node_map_->GetOutputs(node->name());
   std::vector<string> enter_control_inputs;
   string enter_input;
@@ -142,9 +103,10 @@ Status LoopOptimizer::LINMHandleInvariantEnter(NodeDef* node,
   return Status::OK();
 }
 
-Status LoopOptimizer::LINMHandleConst(NodeDef* node,
-    const int num_outputs, const int frame_id) {
-  NodeDef* const_node;
+Status LoopInvariantNodeMotionOptimizer::HandleConst(NodeDef* node,
+                                                     const int num_outputs,
+                                                     const int frame_id) {
+  NodeDef* const_node = nullptr;
   if (num_outputs == 0) {
     // all successor nodes are invariant
     // Remove the control inputs from this frame to the const node,
@@ -156,12 +118,17 @@ Status LoopOptimizer::LINMHandleConst(NodeDef* node,
     // some successor nodes are variant
     // Have to keep the const node in the frame,
     // so create a new one outside the frame (in parent frame)
-    const_node = optimized_graph_->add_node();
-    const_node->set_name(AddPrefixToNodeName(node->name(), kLoopOptimizer));
-    const_node->set_op("Const");
-    const_node->set_device(node->device());
-    *const_node->mutable_attr() = node->attr();
-    node_map_->AddNode(const_node->name(), const_node);
+    const string const_node_name =
+        AddPrefixToNodeName(node->name(), kLoopOptimizer);
+    const_node = node_map_->GetNode(const_node_name);
+    if (const_node == nullptr) {
+      const_node = optimized_graph_->add_node();
+      const_node->set_name(const_node_name);
+      const_node->set_op("Const");
+      const_node->set_device(node->device());
+      *const_node->mutable_attr() = node->attr();
+      node_map_->AddNode(const_node->name(), const_node);
+    }
     auto consumers = node_map_->GetOutputs(node->name());
     for (auto* consumer : consumers) {
       if (invariant_nodes_.count(consumer)) {
@@ -185,8 +152,8 @@ Status LoopOptimizer::LINMHandleConst(NodeDef* node,
     int parent_id = parent_it->second;
     auto loop_cond_it = loop_cond_.find(parent_id);
     if (loop_cond_it == loop_cond_.end()) {
-      return errors::InvalidArgument(
-          "Frame ", frame_id, " doesn't have a LoopCond node");
+      return errors::InvalidArgument("Frame ", frame_id,
+                                     " doesn't have a LoopCond node");
     }
     auto& loop_cond_name = loop_cond_it->second->name();
     NodeDef* switch_node = nullptr;
@@ -197,9 +164,8 @@ Status LoopOptimizer::LINMHandleConst(NodeDef* node,
       }
     }
     if (!switch_node) {
-      return errors::InvalidArgument(
-          "LoopCond node of Frame ", frame_id,
-          " doesn't connect to any Switch node");
+      return errors::InvalidArgument("LoopCond node of Frame ", frame_id,
+                                     " doesn't connect to any Switch node");
     }
     string switch_output = StrCat(switch_node->name(), ":1");
     const string ctrl_dep = ConstantFolding::AddControlDependency(
@@ -210,8 +176,8 @@ Status LoopOptimizer::LINMHandleConst(NodeDef* node,
   return Status::OK();
 }
 
-Status LoopOptimizer::LINMHandleInvariantNode(NodeDef* node,
-    const int num_outputs, const int frame_id) {
+Status LoopInvariantNodeMotionOptimizer::HandleInvariantNode(
+    NodeDef* node, const int num_outputs, const int frame_id) {
   // have to remove control inputs to the invariant node from the same frame
   // when moving this node out of this frame
   for (int i = 0; i < node->input_size(); ++i) {
@@ -228,16 +194,14 @@ Status LoopOptimizer::LINMHandleInvariantNode(NodeDef* node,
   DataTypeVector output_types;
   OpRegistryInterface* op_registry = OpRegistry::Global();
   const OpRegistrationData* op_reg_data = nullptr;
-  TF_RETURN_IF_ERROR(
-      op_registry->LookUp(node->op(), &op_reg_data));
-  TF_RETURN_IF_ERROR(
-      InOutTypesForNode(*node, op_reg_data->op_def,
-                        &input_types, &output_types));
+  TF_RETURN_IF_ERROR(op_registry->LookUp(node->op(), &op_reg_data));
+  TF_RETURN_IF_ERROR(InOutTypesForNode(*node, op_reg_data->op_def, &input_types,
+                                       &output_types));
 
   auto consumers = node_map_->GetOutputs(node->name());
   string fname = invariant_enters_[frame_id][0]->attr().at("frame_name").s();
-  int piterations = invariant_enters_[frame_id][0]
-                    ->attr().at("parallel_iterations").i();
+  int piterations =
+      invariant_enters_[frame_id][0]->attr().at("parallel_iterations").i();
   for (auto* consumer : consumers) {
     if (!invariant_nodes_.count(consumer)) {
       for (int i = 0; i < consumer->input_size(); ++i) {
@@ -281,28 +245,27 @@ Status LoopOptimizer::LINMHandleInvariantNode(NodeDef* node,
   return Status::OK();
 }
 
-Status LoopOptimizer::MoveInvariantNodes(const int frame_id) {
-  for (auto iter = invariant_nodes_.begin();
-       iter != invariant_nodes_.end(); ++iter) {
+Status LoopInvariantNodeMotionOptimizer::MoveInvariantNodes(
+    const int frame_id) {
+  for (auto iter = invariant_nodes_.begin(); iter != invariant_nodes_.end();
+       ++iter) {
     auto* invariant_node = iter->first;
     const int num_outputs = iter->second;
     if (IsEnter(*invariant_node)) {
-      TF_RETURN_IF_ERROR(
-          LINMHandleInvariantEnter(invariant_node, num_outputs));
+      TF_RETURN_IF_ERROR(HandleInvariantEnter(invariant_node, num_outputs));
     } else if (IsConstant(*invariant_node)) {
-      TF_RETURN_IF_ERROR(
-          LINMHandleConst(invariant_node, num_outputs, frame_id));
+      TF_RETURN_IF_ERROR(HandleConst(invariant_node, num_outputs, frame_id));
     } else {
       TF_RETURN_IF_ERROR(
-          LINMHandleInvariantNode(invariant_node, num_outputs, frame_id));
+          HandleInvariantNode(invariant_node, num_outputs, frame_id));
     }
   }
   return Status::OK();
 }
 
-Status LoopOptimizer::RevertInvariantNodes() {
+Status LoopInvariantNodeMotionOptimizer::RevertInvariantNodes() {
   std::deque<const NodeDef*> reverted_nodes;
-  for (auto iter=invariant_nodes_.begin(); iter != invariant_nodes_.end();) {
+  for (auto iter = invariant_nodes_.begin(); iter != invariant_nodes_.end();) {
     bool erased = false;
     const auto* node = iter->first;
     if (!IsConstant(*node) && !IsEnter(*node) && iter->second > 0) {
@@ -331,8 +294,8 @@ Status LoopOptimizer::RevertInvariantNodes() {
       auto* producer = node_map_->GetNode(input);
       auto iter = invariant_nodes_.find(producer);
       if (iter != invariant_nodes_.end()) {
-        if (IsControlInput(input) &&
-            !IsConstant(*producer) && !IsEnter(*producer)) {
+        if (IsControlInput(input) && !IsConstant(*producer) &&
+            !IsEnter(*producer)) {
           reverted_nodes.push_back(producer);
           invariant_nodes_.erase(iter);
         } else {
@@ -357,12 +320,11 @@ Status LoopOptimizer::RevertInvariantNodes() {
   return Status::OK();
 }
 
-Status LoopOptimizer::FindInvariantNodes(NodeDef* node) {
+Status LoopInvariantNodeMotionOptimizer::FindInvariantNodes(NodeDef* node) {
   auto consumers = node_map_->GetOutputs(node->name());
   invariant_nodes_.insert(std::make_pair(node, consumers.size()));
   for (auto* consumer : consumers) {
-    if (invariant_nodes_.count(consumer) ||
-        ModifiesFrameInfo(*consumer)) {
+    if (invariant_nodes_.count(consumer) || ModifiesFrameInfo(*consumer)) {
       continue;
     }
     bool is_invariant = true;
@@ -399,9 +361,14 @@ Status LoopOptimizer::FindInvariantNodes(NodeDef* node) {
   return Status::OK();
 }
 
-Status LoopOptimizer::LoopInvariantNodeMotion() {
+Status LoopInvariantNodeMotionOptimizer::Optimize() {
+  node_map_.reset(new NodeMap(optimized_graph_));
+  FrameMap frame_map;
+  int num_frames;
+  TF_RETURN_IF_ERROR(IdentifyFramesWithNodeMap(*optimized_graph_, *node_map_,
+                                               &frame_map, &num_frames));
   std::deque<int> worklist;
-  for (auto iter = frame_map_.begin(); iter != frame_map_.end(); ++iter) {
+  for (auto iter = frame_map.begin(); iter != frame_map.end(); ++iter) {
     auto* node = iter->first;
     auto& frame_ids = iter->second;
     if (frame_ids.size() >= 3) {
@@ -467,19 +434,82 @@ Status LoopOptimizer::LoopInvariantNodeMotion() {
   return Status::OK();
 }
 
-Status LoopOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
-                               GraphDef* optimized_graph) {
+std::vector<int> GetStackPushNodesToConvert(
+    const SimpleGraphView& graph_view,
+    const std::unordered_set<string>& nodes_to_preserve, int stack_node_idx) {
+  VLOG(1) << "Stack node: " << graph_view.graph()->node(stack_node_idx).name();
+  const std::unordered_set<string> op_types_to_traverse(
+      {"Stack", "StackV2", "Enter", "RefEnter", "Switch", "RefSwitch",
+       "Identity", "RefIdentity"});
+  std::vector<int> nodes_to_convert;
+  std::set<int> fanout;
+  graph_view.DepthFirstSearch(op_types_to_traverse, stack_node_idx, &fanout);
+  for (int fanout_idx : fanout) {
+    const NodeDef& fanout_node = graph_view.graph()->node(fanout_idx);
+    VLOG(1) << "Fanout " << fanout_idx << " : " << fanout_node.name();
+    if (IsStackPushOp(fanout_node)) {
+      nodes_to_convert.push_back(fanout_idx);
+    } else if (IsStackOp(fanout_node) || IsStackCloseOp(fanout_node) ||
+               op_types_to_traverse.find(fanout_node.op()) !=
+                   op_types_to_traverse.end()) {
+      continue;
+    } else if (!IsStackPopOp(fanout_node) ||
+               (!graph_view.outputs(fanout_idx).empty() ||
+                nodes_to_preserve.find(fanout_node.name()) !=
+                    nodes_to_preserve.end())) {
+      // The node is either a stack pop with consumers or something unexpected
+      // so we leave the graph alone.
+      nodes_to_convert.clear();
+      break;
+    }
+  }
+  return nodes_to_convert;
+}
+
+Status RemoveStackOps(const GrapplerItem& item, GraphDef* optimized_graph) {
+  const std::unordered_set<string> nodes_to_preserve = item.NodesToPreserve();
+  const GraphDef& graph = item.graph;
+  *optimized_graph = graph;
+  NodeMap node_map(optimized_graph);
+  SimpleGraphView graph_view;
+  TF_RETURN_IF_ERROR(graph_view.Initialize(graph));
+  for (int node_idx = 0; node_idx < graph.node_size(); ++node_idx) {
+    if (IsStackOp(graph.node(node_idx))) {
+      for (int push_node_idx : GetStackPushNodesToConvert(
+               graph_view, nodes_to_preserve, node_idx)) {
+        // We found push nodes without corresponding pops. Convert them to
+        // Identity passing the data through and add a control dependency from
+        // the op supplying the stack handle.
+        NodeDef* push_node = optimized_graph->mutable_node(push_node_idx);
+        VLOG(1) << "Converting " << push_node_idx << " : "
+                << push_node->DebugString();
+        if (push_node->attr().count("swap_memory") != 0) {
+          push_node->mutable_attr()->erase("swap_memory");
+        }
+        push_node->set_op("Identity");
+        push_node->mutable_input()->SwapElements(0, 1);
+        const string ctrl_dep = ConstantFolding::AddControlDependency(
+            push_node->input(1), optimized_graph, &node_map);
+        push_node->set_input(1, ctrl_dep);
+        VLOG(1) << "After converting: " << push_node->DebugString();
+      }
+    }
+  }
+  return Status::OK();
+}
 
-  TF_RETURN_IF_ERROR(RemoveStackOps(item, optimized_graph));
+}  // namespace
 
-  if (opt_level_ == RewriterConfig::AGGRESSIVE) {
-    optimized_graph_ = optimized_graph;
-    // Set up helper data structures.
-    node_map_.reset(new NodeMap(optimized_graph_));
-    int num_frames;
-    TF_RETURN_IF_ERROR(IdentifyFramesWithNodeMap(*optimized_graph_, *node_map_,
-                                                 &frame_map_, &num_frames));
-    TF_RETURN_IF_ERROR(LoopInvariantNodeMotion());
+Status LoopOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
+                               GraphDef* optimized_graph) {
+  *optimized_graph = item.graph;
+  // Set up helper data structures.
+  if (options_.enable_loop_invariant_node_motion) {
+    LoopInvariantNodeMotionOptimizer linm_optimizer(optimized_graph);
+    TF_RETURN_IF_ERROR(linm_optimizer.Optimize());
+  }
+  if (options_.enable_stack_push_removal) {
+    TF_RETURN_IF_ERROR(RemoveStackOps(item, optimized_graph));
   }
 
   return Status::OK();
diff --git a/tensorflow/core/grappler/optimizers/loop_optimizer.h b/tensorflow/core/grappler/optimizers/loop_optimizer.h
index c1b0321e4e16f2c34a8016fe51068a79634a9617..a422505d23c197a6fd677c97e326b529a4bd57b2 100644
--- a/tensorflow/core/grappler/optimizers/loop_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/loop_optimizer.h
@@ -30,9 +30,13 @@ constexpr char kLoopOptimizer[] = "LoopOptimizer";
 
 class LoopOptimizer : public GraphOptimizer {
  public:
-  LoopOptimizer() : opt_level_(RewriterConfig::ON) {}
+  LoopOptimizer()
+      : opt_level_(RewriterConfig::ON),
+        options_(LoopOptimizerOptions::Default(RewriterConfig::ON)) {}
   explicit LoopOptimizer(RewriterConfig::Toggle opt_level)
-      : opt_level_(opt_level) {}
+      : opt_level_(opt_level),
+        options_(LoopOptimizerOptions::Default(RewriterConfig::ON)) {}
+
   ~LoopOptimizer() override {}
 
   string name() const override { return "loop_optimizer"; };
@@ -44,29 +48,21 @@ class LoopOptimizer : public GraphOptimizer {
                 const GraphDef& optimized_graph, double result) override;
 
  private:
-  Status LoopInvariantNodeMotion();
-  Status FindInvariantNodes(NodeDef* node);
-  Status RevertInvariantNodes();
-  Status MoveInvariantNodes(const int frame_id);
-  Status LINMHandleInvariantNode(NodeDef* node, const int num_outputs,
-      const int frame_id);
-  Status LINMHandleConst(NodeDef* node, const int num_outputs,
-      const int frame_id);
-  Status LINMHandleInvariantEnter(NodeDef* node, const int num_outputs);
-
-  std::map<NodeDef*, int> invariant_nodes_;
-  std::set<int> empty_set_;
-  std::map<int, std::set<int>> frame_children_;
-  std::map<int, int> frame_parent_;
-  std::map<int, const NodeDef*> loop_cond_;
-  std::map<int, std::vector<NodeDef*>> invariant_enters_;
-  int new_enter_id_;
-  RewriterConfig::Toggle opt_level_;
+  friend class LoopOptimizerTest;
+
+  // Granular control for loop optimizer stages.
+  struct LoopOptimizerOptions {
+    bool enable_loop_invariant_node_motion = true;
+    bool enable_stack_push_removal = true;
 
-  std::unique_ptr<NodeMap> node_map_;
-  FrameMap frame_map_;
-  std::unique_ptr<GraphProperties> graph_properties_;
-  GraphDef* optimized_graph_;  // Not owned.
+    static LoopOptimizerOptions Default(RewriterConfig::Toggle opt_level) {
+      LoopOptimizerOptions options;
+      return options;
+    }
+  };
+
+  RewriterConfig::Toggle opt_level_;
+  LoopOptimizerOptions options_;
 };
 
 }  // end namespace grappler
diff --git a/tensorflow/core/grappler/optimizers/loop_optimizer_test.cc b/tensorflow/core/grappler/optimizers/loop_optimizer_test.cc
index a0bd3351976ccbeddd8778281dbdc0c07bbd6455..10ec544424e651a1c0d39ef6af9a8f824de6c99e 100644
--- a/tensorflow/core/grappler/optimizers/loop_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/loop_optimizer_test.cc
@@ -25,7 +25,6 @@ limitations under the License.
 
 namespace tensorflow {
 namespace grappler {
-namespace {
 
 class LoopOptimizerTest : public GrapplerTest {
  protected:
@@ -57,6 +56,23 @@ class LoopOptimizerTest : public GrapplerTest {
     attributes.emplace_back("T", type);
     AddNode(name, op, inputs, attributes, graph);
   }
+
+  void DisableAllStages(LoopOptimizer* optimizer) {
+    LoopOptimizer::LoopOptimizerOptions options;
+    options.enable_loop_invariant_node_motion = false;
+    options.enable_stack_push_removal = false;
+    optimizer->options_ = options;
+  }
+
+  void EnableOnlyLoopInvariantNodeMotion(LoopOptimizer* optimizer) {
+    DisableAllStages(optimizer);
+    optimizer->options_.enable_loop_invariant_node_motion = true;
+  }
+
+  void EnableOnlyStackPushRemoval(LoopOptimizer* optimizer) {
+    DisableAllStages(optimizer);
+    optimizer->options_.enable_stack_push_removal = true;
+  }
 };
 
 TEST_F(LoopOptimizerTest, Basic) {
@@ -81,7 +97,8 @@ TEST_F(LoopOptimizerTest, Basic) {
   GrapplerItem item;
   item.graph = graph;
 
-  LoopOptimizer optimizer(RewriterConfig::AGGRESSIVE);
+  LoopOptimizer optimizer;
+  EnableOnlyLoopInvariantNodeMotion(&optimizer);
   GraphDef output;
   Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
@@ -128,7 +145,8 @@ TEST_F(LoopOptimizerTest, Const) {
   GrapplerItem item;
   item.graph = graph;
 
-  LoopOptimizer optimizer(RewriterConfig::AGGRESSIVE);
+  LoopOptimizer optimizer;
+  EnableOnlyLoopInvariantNodeMotion(&optimizer);
   GraphDef output;
   Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
@@ -175,7 +193,8 @@ TEST_F(LoopOptimizerTest, ControlOutput) {
   GrapplerItem item;
   item.graph = graph;
 
-  LoopOptimizer optimizer(RewriterConfig::AGGRESSIVE);
+  LoopOptimizer optimizer;
+  EnableOnlyLoopInvariantNodeMotion(&optimizer);
   GraphDef output;
   Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
@@ -235,7 +254,8 @@ TEST_F(LoopOptimizerTest, NestedLoop1) {
   GrapplerItem item;
   item.graph = graph;
 
-  LoopOptimizer optimizer(RewriterConfig::AGGRESSIVE);
+  LoopOptimizer optimizer;
+  EnableOnlyLoopInvariantNodeMotion(&optimizer);
   GraphDef output;
   Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
@@ -302,7 +322,8 @@ TEST_F(LoopOptimizerTest, NestedLoop2) {
   GrapplerItem item;
   item.graph = graph;
 
-  LoopOptimizer optimizer(RewriterConfig::AGGRESSIVE);
+  LoopOptimizer optimizer;
+  EnableOnlyLoopInvariantNodeMotion(&optimizer);
   GraphDef output;
   Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
@@ -365,7 +386,8 @@ TEST_F(LoopOptimizerTest, NestedLoopConst1) {
   GrapplerItem item;
   item.graph = graph;
 
-  LoopOptimizer optimizer(RewriterConfig::AGGRESSIVE);
+  LoopOptimizer optimizer;
+  EnableOnlyLoopInvariantNodeMotion(&optimizer);
   GraphDef output;
   Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
@@ -429,7 +451,8 @@ TEST_F(LoopOptimizerTest, NestedLoopConst2) {
   GrapplerItem item;
   item.graph = graph;
 
-  LoopOptimizer optimizer(RewriterConfig::AGGRESSIVE);
+  LoopOptimizer optimizer;
+  EnableOnlyLoopInvariantNodeMotion(&optimizer);
   GraphDef output;
   Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
@@ -475,6 +498,7 @@ TEST_F(LoopOptimizerTest, NoOp) {
   CHECK(fake_input.NextItem(&item));
 
   LoopOptimizer optimizer;
+  EnableOnlyStackPushRemoval(&optimizer);
   GraphDef output;
   Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
@@ -504,6 +528,7 @@ TEST_F(LoopOptimizerTest, RemovePush_NoOp) {
   AddSimpleNode("stop", "StopGradient", {"stack3"}, &graph);
 
   LoopOptimizer optimizer;
+  EnableOnlyStackPushRemoval(&optimizer);
   GraphDef output;
   Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
@@ -534,6 +559,7 @@ TEST_F(LoopOptimizerTest, RemovePushWithoutMatchingPop) {
   item.fetch.push_back("pop4");
 
   LoopOptimizer optimizer;
+  EnableOnlyStackPushRemoval(&optimizer);
   GraphDef output;
   Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
@@ -563,6 +589,5 @@ TEST_F(LoopOptimizerTest, RemovePushWithoutMatchingPop) {
   }
 }
 
-}  // namespace
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/memory_optimizer_test.cc b/tensorflow/core/grappler/optimizers/memory_optimizer_test.cc
index 9595936e9e6158045a13ebede95d63b9291ca434..a1f80802ddc2b3c959a74e010f6b45cb421864cf 100644
--- a/tensorflow/core/grappler/optimizers/memory_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/memory_optimizer_test.cc
@@ -426,7 +426,7 @@ TEST_F(MemoryOptimizerTest, AccumulationRewrites) {
   EXPECT_EQ(4, count);
 
   std::vector<string> fetch = {"a", "b", "c", "e"};
-  auto tensors = EvaluateNodes(output, fetch);
+  auto tensors = EvaluateNodes(output, fetch, {});
   EXPECT_EQ(4, tensors.size());
 
   for (int i = 0; i < tensors[0].NumElements(); ++i) {
diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer.cc b/tensorflow/core/grappler/optimizers/meta_optimizer.cc
index 47ec16226bb906100ee741787af4022c8ae521a9..5723e397abe2348bec82fb939ea8bfca1df72eb7 100644
--- a/tensorflow/core/grappler/optimizers/meta_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/meta_optimizer.cc
@@ -1,4 +1,4 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -28,6 +28,7 @@ limitations under the License.
 #include "tensorflow/core/grappler/optimizers/loop_optimizer.h"
 #include "tensorflow/core/grappler/optimizers/memory_optimizer.h"
 #include "tensorflow/core/grappler/optimizers/model_pruner.h"
+#include "tensorflow/core/grappler/utils/colocation.h"
 #include "tensorflow/core/grappler/utils/topological_sort.h"
 #include "tensorflow/core/lib/core/status.h"
 
@@ -44,16 +45,15 @@ int64 NumEdges(const GraphDef& graph) {
 }
 
 string PrintSizesBeforeAfter(const GraphDef& before, const GraphDef& after) {
-  return strings::StrCat("Graph size before: ", before.node_size(), " nodes, ",
-                         NumEdges(before),
-                         " edges. Graph size after: ", after.node_size(),
-                         " nodes, ", NumEdges(after), " edges.");
+  return strings::StrCat("Graph size after: ", after.node_size(), " nodes (",
+                         after.node_size() - before.node_size(), "), ",
+                         NumEdges(after), " edges (",
+                         NumEdges(after) - NumEdges(before), ")");
 }
 }  // namespace
 
 std::unique_ptr<GraphOptimizer> MetaOptimizer::NewOptimizer(
     const string& optimizer) {
-  VLOG(1) << "Adding graph optimization pass: " << optimizer;
   std::unique_ptr<GraphOptimizer> graph_optimizer;
   if (optimizer == "pruning") {
     graph_optimizer.reset(new ModelPruner());
@@ -102,6 +102,10 @@ Status MetaOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
       optimizers.push_back(std::unique_ptr<GraphOptimizer>(
           new FunctionOptimizer(cfg_.function_optimization())));
     }
+    if (cfg_.debug_stripper() == RewriterConfig::ON) {
+      optimizers.push_back(
+          std::unique_ptr<GraphOptimizer>(new DebugStripper()));
+    }
     if (cfg_.constant_folding() != RewriterConfig::OFF) {
       optimizers.push_back(std::unique_ptr<GraphOptimizer>(
           new ConstantFolding(cfg_.constant_folding(), cpu_device_)));
@@ -138,10 +142,6 @@ Status MetaOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
       optimizers.push_back(std::unique_ptr<GraphOptimizer>(
           new AutoParallel(cfg_.auto_parallel().num_replicas())));
     }
-    if (cfg_.debug_stripper() == RewriterConfig::ON) {
-      optimizers.push_back(
-          std::unique_ptr<GraphOptimizer>(new DebugStripper()));
-    }
   } else {
     const std::set<string> available_optimizers = {
         "pruning",    "function",      "constfold",  "layout",
@@ -171,46 +171,58 @@ Status MetaOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
     return Status::OK();
   }
 
+  // Some optimizers should be run only once.
+  const std::set<string> run_once_optimizers = {"layout"};
   bool already_optimized = false;
-  for (const auto& optimizer : optimizers) {
-    if (!already_optimized) {
-      Status status = optimizer->Optimize(cluster, item, optimized_graph);
-      string result;
-      if (!status.ok()) {
-        VLOG(1) << "Not able to apply optimizer " << optimizer->name()
-                << ". Return status: " << status.ToString();
-        result = status.ToString();
-      } else {
-        already_optimized = true;
-        result = strings::StrCat(
-            "OK. ", PrintSizesBeforeAfter(item.graph, *optimized_graph));
+  const int num_iterations =
+      cfg_.meta_optimizer_iterations() == RewriterConfig::DEFAULT_NUM_ITERS
+          ? 1
+          : cfg_.meta_optimizer_iterations();
+  for (int iteration = 0; iteration < num_iterations; ++iteration) {
+    VLOG(1) << "Starting optimization iteration " << iteration + 1;
+    for (const auto& optimizer : optimizers) {
+      if (iteration > 0 && run_once_optimizers.count(optimizer->name())) {
+        continue;
       }
-      result_.push_back(std::make_pair(optimizer->name(), result));
-      VLOG(1) << "Optimizer " << optimizer->name()
-              << " return status: " << result;
-    } else {
-      GrapplerItem optimized_item(item, std::move(*optimized_graph));
-      Status status =
-          optimizer->Optimize(cluster, optimized_item, optimized_graph);
-      string result;
-      if (!status.ok()) {
-        VLOG(1) << "Not able to apply optimizer " << optimizer->name()
-                << ". Return status: " << status.ToString();
-        optimized_graph->Swap(&optimized_item.graph);
-        result = status.ToString();
+      if (!already_optimized) {
+        Status status = optimizer->Optimize(cluster, item, optimized_graph);
+        string result;
+        if (!status.ok()) {
+          VLOG(1) << "Not able to apply optimizer " << optimizer->name()
+                  << ". Return status: " << status.ToString();
+          result = status.ToString();
+        } else {
+          already_optimized = true;
+          result = strings::StrCat(
+              "OK. ", PrintSizesBeforeAfter(item.graph, *optimized_graph));
+        }
+        result_.push_back(std::make_pair(optimizer->name(), result));
+        VLOG(1) << "Optimizer " << optimizer->name()
+                << " return status: " << result;
       } else {
-        result = strings::StrCat(
-            "OK. ",
-            PrintSizesBeforeAfter(optimized_item.graph, *optimized_graph));
+        GrapplerItem optimized_item(item, std::move(*optimized_graph));
+        Status status =
+            optimizer->Optimize(cluster, optimized_item, optimized_graph);
+        string result;
+        if (!status.ok()) {
+          VLOG(1) << "Not able to apply optimizer " << optimizer->name() << ": "
+                  << status.ToString();
+          optimized_graph->Swap(&optimized_item.graph);
+          result = status.ToString();
+        } else {
+          result = strings::StrCat(
+              optimizer->name(), ": ",
+              PrintSizesBeforeAfter(optimized_item.graph, *optimized_graph));
+        }
+        result_.push_back(std::make_pair(optimizer->name(), result));
+        VLOG(1) << result;
       }
-      result_.push_back(std::make_pair(optimizer->name(), result));
-      VLOG(1) << "Optimizer " << optimizer->name()
-              << " return status: " << result;
     }
   }
 
   if (already_optimized) {
     TF_RETURN_IF_ERROR(TopologicalSort(optimized_graph));
+    ReassignColocation(optimized_graph);
     // Make sure that the optimizers preserved the graph version and library.
     DCHECK_GE(optimized_graph->library().function_size(),
               item.graph.library().function_size());
diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer_test.cc b/tensorflow/core/grappler/optimizers/meta_optimizer_test.cc
index 536347d8348738e1755e920f3f08c2d4858cb256..d9a386b9be2cecbbd7562c38bc24626ff966fa46 100644
--- a/tensorflow/core/grappler/optimizers/meta_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/meta_optimizer_test.cc
@@ -72,6 +72,20 @@ TEST(MetaOptimizerTest, RunsCustomOptimizer) {
   EXPECT_TRUE(TestOptimizer::IsOptimized());
 }
 
+TEST(MetaOptimizerTest, RunOptimizersTwice) {
+  TrivialTestGraphInputYielder fake_input(4, 1, 10, false, {"CPU:0"});
+  GrapplerItem item;
+  CHECK(fake_input.NextItem(&item));
+
+  RewriterConfig rewriter_config;
+  rewriter_config.set_meta_optimizer_iterations(RewriterConfig::TWO);
+
+  MetaOptimizer optimizer(nullptr, rewriter_config);
+  GraphDef output;
+  const Status status = optimizer.Optimize(nullptr, item, &output);
+  TF_EXPECT_OK(status);
+}
+
 }  // namespace
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/model_pruner_test.cc b/tensorflow/core/grappler/optimizers/model_pruner_test.cc
index 8480a74572883a4657e11606b4cb8dcd5532ea3a..2b12eadec96fa93d576ebf9036a2b2b1af9acc59 100644
--- a/tensorflow/core/grappler/optimizers/model_pruner_test.cc
+++ b/tensorflow/core/grappler/optimizers/model_pruner_test.cc
@@ -16,9 +16,11 @@ limitations under the License.
 #include "tensorflow/core/grappler/optimizers/model_pruner.h"
 #include "tensorflow/cc/ops/standard_ops.h"
 #include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.h"
 #include "tensorflow/core/grappler/utils.h"
+#include "tensorflow/core/grappler/utils/grappler_test.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/test.h"
 
@@ -26,7 +28,7 @@ namespace tensorflow {
 namespace grappler {
 namespace {
 
-class ModelPrunerTest : public ::testing::Test {};
+class ModelPrunerTest : public GrapplerTest {};
 
 TEST_F(ModelPrunerTest, NoPruning) {
   // This trivial graph is so basic there's nothing to prune.
@@ -86,6 +88,13 @@ TEST_F(ModelPrunerTest, StopGradientPruning) {
   EXPECT_EQ(NodeName(b.name()), new_e.input(0));
   EXPECT_EQ(1, new_d.input_size());
   EXPECT_EQ(NodeName(b.name()), new_d.input(0));
+
+  std::vector<string> fetch = {"e"};
+  auto expected_tensors = EvaluateNodes(item.graph, fetch);
+  auto actual_tensors = EvaluateNodes(output, fetch);
+  EXPECT_EQ(1, expected_tensors.size());
+  EXPECT_EQ(1, actual_tensors.size());
+  test::ExpectTensorEqual<float>(expected_tensors[0], actual_tensors[0]);
 }
 
 TEST_F(ModelPrunerTest, IdentityPruning) {
diff --git a/tensorflow/core/grappler/optimizers/static_schedule.cc b/tensorflow/core/grappler/optimizers/static_schedule.cc
index 450e85340796fdde9afdfebbd0eb9a724cb9440a..5206e9957dc75c13a03dfcb060b8b3b3dc732ad8 100644
--- a/tensorflow/core/grappler/optimizers/static_schedule.cc
+++ b/tensorflow/core/grappler/optimizers/static_schedule.cc
@@ -40,6 +40,12 @@ static Costs::NanoSeconds PredictExecutionTime(
     op_context.op_info.add_inputs()->Swap(&input);
   }
 
+  std::vector<OpInfo::TensorProperties> outputs =
+      properties.GetOutputProperties(node.name());
+  for (auto& output : outputs) {
+    op_context.op_info.add_outputs()->Swap(&output);
+  }
+
   DeviceProperties device = placer.get_device(node);
   op_context.op_info.mutable_device()->Swap(&device);
 
diff --git a/tensorflow/core/grappler/optimizers/static_schedule_test.cc b/tensorflow/core/grappler/optimizers/static_schedule_test.cc
index 08580d92842377c2dd999950b2e01bef01e2fee6..d632e460e7ccfc092945805a9a0b6b4b4c2215d1 100644
--- a/tensorflow/core/grappler/optimizers/static_schedule_test.cc
+++ b/tensorflow/core/grappler/optimizers/static_schedule_test.cc
@@ -64,17 +64,17 @@ TEST_F(StaticScheduleTest, BasicGraph) {
     if (time.first->name() == "Const/Const") {
       EXPECT_EQ(Costs::NanoSeconds(1), time.second);
     } else if (time.first->name() == "x") {
-      EXPECT_EQ(Costs::NanoSeconds(250001), time.second);
+      EXPECT_EQ(Costs::NanoSeconds(1500001), time.second);
     } else if (time.first->name() == "Square") {
-      EXPECT_EQ(Costs::NanoSeconds(1500004), time.second);
+      EXPECT_EQ(Costs::NanoSeconds(4000004), time.second);
     } else if (time.first->name() == "Square_1") {
-      EXPECT_EQ(Costs::NanoSeconds(2750007), time.second);
+      EXPECT_EQ(Costs::NanoSeconds(6500007), time.second);
     } else if (time.first->name() == "Square_2") {
-      EXPECT_EQ(Costs::NanoSeconds(4000010), time.second);
+      EXPECT_EQ(Costs::NanoSeconds(9000010), time.second);
     } else if (time.first->name() == "Square_3") {
-      EXPECT_EQ(Costs::NanoSeconds(5250013), time.second);
+      EXPECT_EQ(Costs::NanoSeconds(11500013), time.second);
     } else if (time.first->name() == "y") {
-      EXPECT_EQ(Costs::NanoSeconds(6500013), time.second);
+      EXPECT_EQ(Costs::NanoSeconds(14000013), time.second);
     }
   }
 }
@@ -110,13 +110,13 @@ TEST_F(StaticScheduleTest, BasicGraphWithCtrlDependencies) {
     if (time.first->name() == "a") {
       EXPECT_EQ(Costs::NanoSeconds(1), time.second);
     } else if (time.first->name() == "b") {
-      EXPECT_EQ(Costs::NanoSeconds(12500001), time.second);
+      EXPECT_EQ(Costs::NanoSeconds(25000001), time.second);
     } else if (time.first->name() == "c") {
-      EXPECT_EQ(Costs::NanoSeconds(12500002), time.second);
+      EXPECT_EQ(Costs::NanoSeconds(25000002), time.second);
     } else if (time.first->name() == "d") {
-      EXPECT_EQ(Costs::NanoSeconds(12500003), time.second);
-    } else if (time.first->name() == "e") {
       EXPECT_EQ(Costs::NanoSeconds(25000003), time.second);
+    } else if (time.first->name() == "e") {
+      EXPECT_EQ(Costs::NanoSeconds(50000003), time.second);
     }
   }
 }
@@ -142,17 +142,17 @@ TEST_F(StaticScheduleTest, RequiredTimes) {
 
   for (auto time : required_times) {
     if (time.first->name() == "Const/Const") {
-      EXPECT_EQ(Costs::NanoSeconds(-6500012), time.second);
+      EXPECT_EQ(Costs::NanoSeconds(-14000012), time.second);
     } else if (time.first->name() == "x") {
-      EXPECT_EQ(Costs::NanoSeconds(-6250012), time.second);
+      EXPECT_EQ(Costs::NanoSeconds(-12500012), time.second);
     } else if (time.first->name() == "Square") {
-      EXPECT_EQ(Costs::NanoSeconds(-5000009), time.second);
+      EXPECT_EQ(Costs::NanoSeconds(-10000009), time.second);
     } else if (time.first->name() == "Square_1") {
-      EXPECT_EQ(Costs::NanoSeconds(-3750006), time.second);
+      EXPECT_EQ(Costs::NanoSeconds(-7500006), time.second);
     } else if (time.first->name() == "Square_2") {
-      EXPECT_EQ(Costs::NanoSeconds(-2500003), time.second);
+      EXPECT_EQ(Costs::NanoSeconds(-5000003), time.second);
     } else if (time.first->name() == "Square_3") {
-      EXPECT_EQ(Costs::NanoSeconds(-1250000), time.second);
+      EXPECT_EQ(Costs::NanoSeconds(-2500000), time.second);
     } else if (time.first->name() == "y") {
       EXPECT_EQ(Costs::NanoSeconds(0), time.second);
     }
diff --git a/tensorflow/core/grappler/optimizers/symbolic_shapes.h b/tensorflow/core/grappler/optimizers/symbolic_shapes.h
index a9dcf44e236281badfabaf5213ef09fd98bf0820..eb79bab3141579132ea2e2d2afc5733f0013a0d5 100644
--- a/tensorflow/core/grappler/optimizers/symbolic_shapes.h
+++ b/tensorflow/core/grappler/optimizers/symbolic_shapes.h
@@ -31,8 +31,8 @@ bool IsUnknown(const TensorShapeProto::Dim& dim);
 bool ShapeIsSymbolicallyDefined(const TensorShapeProto& shape);
 bool ShapeIsSymbolicallyDefined(const OpInfo::TensorProperties& properties);
 
-// Shapes are symbolically equal, if they have the same rank, they are
-// they are known or symbolically defined, and have matching dimensions.
+// Shapes are symbolically equal, if they have the same rank, they are known or
+// symbolically defined, and have matching dimensions.
 bool ShapesSymbolicallyEqual(const TensorShapeProto& left,
                              const TensorShapeProto& right);
 bool ShapesSymbolicallyEqual(const OpInfo::TensorProperties& left,
diff --git a/tensorflow/core/grappler/utils.cc b/tensorflow/core/grappler/utils.cc
index 829bfe9e3104273cbf9d3d0ddb0bef4a233708e7..534fe670e0fea6111ac5128c618e1b85c9b08109 100644
--- a/tensorflow/core/grappler/utils.cc
+++ b/tensorflow/core/grappler/utils.cc
@@ -33,8 +33,8 @@ namespace {
 template <typename T>
 bool SafeSetScalarTensorValue(double value, Tensor* tensor) {
   using RealType = typename Eigen::NumTraits<T>::Real;
-  if (value > std::numeric_limits<RealType>::max() ||
-      value < std::numeric_limits<RealType>::min()) {
+  if (value > static_cast<double>(std::numeric_limits<RealType>::max()) ||
+      value < static_cast<double>(std::numeric_limits<RealType>::min())) {
     return false;
   }
   tensor->flat<T>()(0) = static_cast<T>(value);
@@ -255,6 +255,14 @@ int NumOutputs(const NodeDef& node, GraphDef* graph) {
   return num_outputs;
 }
 
+bool HasControlInputs(const NodeDef& node) {
+  int num_inputs = node.input_size();
+  if (num_inputs > 0 && IsControlInput(node.input(num_inputs - 1))) {
+    return true;
+  }
+  return false;
+}
+
 int NumNonControlInputs(const NodeDef& node) {
   int num_inputs = node.input_size();
   for (const string& input : node.input()) {
@@ -422,18 +430,28 @@ Status SimpleGraphView::Initialize(const GraphDef& graph, bool dedup_inputs,
 }
 
 void SimpleGraphView::DepthFirstSearch(
-    const std::unordered_set<string>& op_types_to_traverse, int node_idx,
+    const std::unordered_set<string>& op_types_to_traverse, int root_node,
     std::set<int>* nodes_found) const {
-  if (nodes_found->find(node_idx) != nodes_found->end()) {
-    return;
-  }
-  nodes_found->insert(node_idx);
-  const string& op_type = graph_->node(node_idx).op();
+  nodes_found->clear();
+  const string& op_type = graph_->node(root_node).op();
   if (op_types_to_traverse.find(op_type) == op_types_to_traverse.end()) {
     return;
   }
-  for (auto output_idx : this->outputs(node_idx)) {
-    DepthFirstSearch(op_types_to_traverse, output_idx, nodes_found);
+  std::vector<int> stack;
+  stack.reserve(32);
+  stack.push_back(root_node);
+  while (!stack.empty()) {
+    const int node_idx = stack.back();
+    stack.pop_back();
+    nodes_found->insert(node_idx);
+    const string& op_type = graph_->node(node_idx).op();
+    if (op_types_to_traverse.find(op_type) != op_types_to_traverse.end()) {
+      for (auto output_idx : this->outputs(node_idx)) {
+        if (nodes_found->find(output_idx) == nodes_found->end()) {
+          stack.push_back(output_idx);
+        }
+      }
+    }
   }
 }
 
@@ -473,8 +491,8 @@ Status SetTensorValue(DataType dtype, int value, Tensor* tensor) {
         "Expected scalar tensor, got num_elements = ", tensor->NumElements());
   }
   switch (dtype) {
-    // TODO(rmlarsen): Handle DT_HALF.
-    //    HANDLE_CASE(DT_HALF);
+    HANDLE_CASE(DT_HALF);
+    HANDLE_CASE(DT_BFLOAT16);
     HANDLE_CASE(DT_BOOL);
     HANDLE_CASE(DT_FLOAT);
     HANDLE_CASE(DT_DOUBLE);
diff --git a/tensorflow/core/grappler/utils.h b/tensorflow/core/grappler/utils.h
index 7aa31939f58ae2556fc7ffa59ca337a3c162ca2e..11555d712abd1de538aa8526f1574f249f630cbf 100644
--- a/tensorflow/core/grappler/utils.h
+++ b/tensorflow/core/grappler/utils.h
@@ -138,6 +138,9 @@ string AsControlDependency(const string& node);
 // some of the outputs may be unconnected.
 int NumOutputs(const NodeDef& node, GraphDef* graph);
 
+// Returns true iff the node has at least one control input.
+bool HasControlInputs(const NodeDef& node);
+
 // Number of connected non-control inputs.
 int NumNonControlInputs(const NodeDef& node);
 
diff --git a/tensorflow/core/grappler/utils/BUILD b/tensorflow/core/grappler/utils/BUILD
index 939031c44b57e930b80fc7897be8e9f5e7906688..7419c26dff58067856c5e5280edcecd77a41c6c7 100644
--- a/tensorflow/core/grappler/utils/BUILD
+++ b/tensorflow/core/grappler/utils/BUILD
@@ -2,18 +2,6 @@ licenses(["notice"])  # Apache 2.0
 
 load("//tensorflow:tensorflow.bzl", "tf_cc_test")
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 cc_library(
     name = "scc",
     srcs = ["scc.cc"],
@@ -193,3 +181,28 @@ tf_cc_test(
         "//tensorflow/core:testlib",
     ],
 )
+
+cc_library(
+    name = "colocation",
+    srcs = ["colocation.cc"],
+    hdrs = ["colocation.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core/grappler:utils",
+    ],
+)
+
+tf_cc_test(
+    name = "colocation_test",
+    size = "small",
+    srcs = ["colocation_test.cc"],
+    deps = [
+        ":colocation",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+    ],
+)
diff --git a/tensorflow/core/grappler/utils/colocation.cc b/tensorflow/core/grappler/utils/colocation.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0573e0a8309d7525733013d27befce53a0ecc44b
--- /dev/null
+++ b/tensorflow/core/grappler/utils/colocation.cc
@@ -0,0 +1,122 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/utils/colocation.h"
+
+#include <cstring>
+#include "tensorflow/core/framework/attr_value.pb.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/grappler/utils.h"
+
+namespace tensorflow {
+namespace grappler {
+
+namespace {
+
+// Find root node of the colocation group.
+// The map is mapping from one node name to its parent. node_name is the
+// starting node to search. By iteratively following the path from child to
+// parent, we can find the root node for the colocation group that node_name
+// belongs to.
+string GetColocationGroupRoot(std::unordered_map<string, string>* map,
+                              const string& node_name) {
+  if (map->find(node_name) == map->end()) {
+    // If node_name is not in the map, we create a new root node which points
+    // to itself.
+    map->insert({node_name, node_name});
+    return node_name;
+  }
+  string cur = node_name;
+  while ((*map)[cur] != cur) {
+    // Backtracing the map until we reach the root node.
+    cur = (*map)[cur];
+  }
+  return cur;
+}
+
+// Merge two colocation groups into one.
+// left and right is the root node of two colocation groups respectively.
+void MergeColocationGroup(std::unordered_map<string, string>* map,
+                          const string& left, const string& right) {
+  // Do nothing if left or right node is not in the map.
+  if (map->find(left) == map->end() || map->find(right) == map->end()) {
+    return;
+  }
+  if (left != right) {
+    // Make the right node a child of the left node, which merges the two
+    // groups.
+    map->at(right) = left;
+  }
+}
+}  // namespace
+
+// Use of disjoint set algorithm to build the colocation groups from the input
+// graph. The core data structure in use is a hash map from one node to its
+// parent node. Whenever we see two nodes colocate with each other, we merge
+// their colocation groups together. After we traverse all colocation pairs
+// in the graph, we will have several disjoint sets. Then we pick the root node
+// of each disjoint set as the representative node, and let all other nodes in
+// the group colocate with the representative node.
+void ReassignColocation(GraphDef* graph) {
+  constexpr char kClassAttr[] = "_class";
+  constexpr char kColocPrefix[] = "loc:@";
+
+  // A hashmap that maps from a node name to its parent node name.
+  std::unordered_map<string, string> coloc_groups;
+  NodeMap node_map(graph);
+  for (const auto& node : graph->node()) {
+    auto iter = node.attr().find(kClassAttr);
+    if (iter != node.attr().end() && iter->second.has_list()) {
+      for (const auto& str : iter->second.list().s()) {
+        size_t pos = str.find(kColocPrefix);
+        if (pos == 0) {
+          // After we find a colocation, update the colocation groups.
+          string colocate_node = str.substr(pos + strlen(kColocPrefix));
+          MergeColocationGroup(
+              &coloc_groups, GetColocationGroupRoot(&coloc_groups, node.name()),
+              GetColocationGroupRoot(&coloc_groups, colocate_node));
+        }
+      }
+    }
+  }
+
+  // We use the root node of each colocation groups as its representative
+  // node. For each node in one group, colocate with the representative node
+  // if the node is in the graph.
+  for (const auto& pair : coloc_groups) {
+    if (pair.first != pair.second) {
+      // This is a child node.
+      NodeDef* node = node_map.GetNode(pair.first);
+      if (node) {
+        // Colocate this node with the root node.
+        AttrValue new_value;
+        new_value.mutable_list()->add_s(
+            kColocPrefix + GetColocationGroupRoot(&coloc_groups, pair.first));
+        node->mutable_attr()->erase(kClassAttr);
+        node->mutable_attr()->insert({kClassAttr, new_value});
+      }
+    } else {
+      // This is a root node. Clear the _class attribute.
+      NodeDef* node = node_map.GetNode(pair.first);
+      if (node) {  // root node should always exist in the graph as guaranteed
+                   // by order of merging. Just put check here to ensure safety.
+        node->mutable_attr()->erase(kClassAttr);
+      }
+    }
+  }
+}
+
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/utils/colocation.h b/tensorflow/core/grappler/utils/colocation.h
new file mode 100644
index 0000000000000000000000000000000000000000..6062db6102c50853145b15dae08994e971cca83d
--- /dev/null
+++ b/tensorflow/core/grappler/utils/colocation.h
@@ -0,0 +1,39 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_GRAPPLER_UTILS_COLOCATION_H_
+#define TENSORFLOW_CORE_GRAPPLER_UTILS_COLOCATION_H_
+
+#include <unordered_map>
+#include "tensorflow/core/framework/graph.pb.h"
+
+namespace tensorflow {
+namespace grappler {
+
+// Evaluates the colocation relation in the graph and rewrites the new
+// colocation relation in the graph. We scan the graph nodes sequentially, and
+// builds a disjoint-sets of nodes (within each disjoint-set the nodes are
+// colocated with each other). We then select the root node of each set as a
+// representative node, and then colocate each node within the set (should also
+// exist in graph) with the representative node.
+// Note that there is current one situation this function can't handle:
+// Node A colocates with X, node B colocates with Y, X colocates with Y but
+// X, Y are removed from graph. In this case we can't know A colocates with B.
+void ReassignColocation(GraphDef* graph);
+
+}  // namespace grappler
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_GRAPPLER_UTILS_COLOCATION_H_
diff --git a/tensorflow/core/grappler/utils/colocation_test.cc b/tensorflow/core/grappler/utils/colocation_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6638364240fe4a85a9bb066b812b087fe407db6e
--- /dev/null
+++ b/tensorflow/core/grappler/utils/colocation_test.cc
@@ -0,0 +1,183 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/utils/colocation.h"
+
+#include "tensorflow/core/framework/function_testlib.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace grappler {
+
+class ColocationTest : public ::testing::Test {};
+
+bool VerifyNodeHasColocation(const NodeDef& ndef, const string& coloc) {
+  if (ndef.attr().empty()) {
+    return false;
+  }
+  if (ndef.attr().find("_class") == ndef.attr().end()) {
+    return false;
+  }
+  return ndef.attr().at("_class").list().s(0) == coloc;
+}
+
+TEST(ColocationTest, ReassignColocation_SingleNode) {
+  // Node A colocates with B, but node B is not in the graph.
+  //   A
+  //   |
+  //   |
+  //  [B]
+
+  NodeDef ndef;
+  const Status status =
+      NodeDefBuilder("A", "Const").Attr("_class", {"loc:@B"}).Finalize(&ndef);
+  TF_EXPECT_OK(status);
+  GraphDef gdef = test::function::GDef({ndef});
+
+  EXPECT_EQ(1, gdef.node_size());
+  EXPECT_EQ(1, gdef.node(0).attr_size());
+
+  ReassignColocation(&gdef);
+
+  // Validates that node A's colocation info is cleared.
+  EXPECT_EQ(1, gdef.node_size());
+  EXPECT_EQ(0, gdef.node(0).attr_size());
+}
+
+TEST(ColocationTest, ReassignColocation_MultiNode_SingleGroup) {
+  // Node A, B, C colocate with X. D colocates with C. E colocates with D.
+  // Node X is not in the graph.
+  //  A   B   C---D---E
+  //  |   |   |
+  //  |   |   |
+  //  +--[X]--+
+  // After re-assign of colocation, A, B, C, D should colocate with E.
+  // A   B   C   D
+  // |   |   |   |
+  // |   |   |   |
+  // +---+-E-+---+
+
+  NodeDef ndef_a, ndef_b, ndef_c, ndef_d, ndef_e;
+  Status status =
+      NodeDefBuilder("A", "Const").Attr("_class", {"loc:@X"}).Finalize(&ndef_a);
+  TF_EXPECT_OK(status);
+  status =
+      NodeDefBuilder("B", "Const").Attr("_class", {"loc:@X"}).Finalize(&ndef_b);
+  TF_EXPECT_OK(status);
+  status =
+      NodeDefBuilder("C", "Const").Attr("_class", {"loc:@X"}).Finalize(&ndef_c);
+  TF_EXPECT_OK(status);
+  status =
+      NodeDefBuilder("D", "Const").Attr("_class", {"loc:@C"}).Finalize(&ndef_d);
+  TF_EXPECT_OK(status);
+  status =
+      NodeDefBuilder("E", "Const").Attr("_class", {"loc:@D"}).Finalize(&ndef_e);
+  TF_EXPECT_OK(status);
+  GraphDef gdef =
+      test::function::GDef({ndef_a, ndef_b, ndef_c, ndef_d, ndef_e});
+
+  EXPECT_EQ(5, gdef.node_size());
+  EXPECT_TRUE(VerifyNodeHasColocation(gdef.node(0), "loc:@X"));  // A
+  EXPECT_TRUE(VerifyNodeHasColocation(gdef.node(1), "loc:@X"));  // B
+  EXPECT_TRUE(VerifyNodeHasColocation(gdef.node(2), "loc:@X"));  // C
+  EXPECT_TRUE(VerifyNodeHasColocation(gdef.node(3), "loc:@C"));  // D
+  EXPECT_TRUE(VerifyNodeHasColocation(gdef.node(4), "loc:@D"));  // E
+
+  ReassignColocation(&gdef);
+
+  EXPECT_EQ(5, gdef.node_size());
+  EXPECT_TRUE(VerifyNodeHasColocation(gdef.node(0), "loc:@E"));  // A
+  EXPECT_TRUE(VerifyNodeHasColocation(gdef.node(1), "loc:@E"));  // B
+  EXPECT_TRUE(VerifyNodeHasColocation(gdef.node(2), "loc:@E"));  // C
+  EXPECT_TRUE(VerifyNodeHasColocation(gdef.node(3), "loc:@E"));  // D
+  EXPECT_EQ(0, gdef.node(4).attr_size());                        // E
+}
+
+TEST(ColocationTest, ReassignColocation_MultiNode_MultiGroup) {
+  // Before re-assign:
+  // Node A, B, C colocate with X. D colocates with C. E colocates with D.
+  // Node U, V colocates with W. Node X, W are not in the graph:
+  //  A   B   C---D---E
+  //  |   |   |
+  //  |   |   |
+  //  +--[X]--+
+  //
+  //  U       V
+  //  |       |
+  //  |       |
+  //  +--[W]--+
+  //
+  // After re-assign:
+  // A, B, C, D should colocate with E. U should colocate with V.
+  // A   B   C   D
+  // |   |   |   |
+  // |   |   |   |
+  // +---+-E-+---+
+  //
+  // U
+  // |
+  // |
+  // V
+
+  NodeDef ndef_a, ndef_b, ndef_c, ndef_d, ndef_e, ndef_u, ndef_v;
+  Status status =
+      NodeDefBuilder("A", "Const").Attr("_class", {"loc:@X"}).Finalize(&ndef_a);
+  TF_EXPECT_OK(status);
+  status =
+      NodeDefBuilder("B", "Const").Attr("_class", {"loc:@X"}).Finalize(&ndef_b);
+  TF_EXPECT_OK(status);
+  status =
+      NodeDefBuilder("C", "Const").Attr("_class", {"loc:@X"}).Finalize(&ndef_c);
+  TF_EXPECT_OK(status);
+  status =
+      NodeDefBuilder("D", "Const").Attr("_class", {"loc:@C"}).Finalize(&ndef_d);
+  TF_EXPECT_OK(status);
+  status =
+      NodeDefBuilder("E", "Const").Attr("_class", {"loc:@D"}).Finalize(&ndef_e);
+  TF_EXPECT_OK(status);
+  status =
+      NodeDefBuilder("U", "Const").Attr("_class", {"loc:@W"}).Finalize(&ndef_u);
+  TF_EXPECT_OK(status);
+  status =
+      NodeDefBuilder("V", "Const").Attr("_class", {"loc:@W"}).Finalize(&ndef_v);
+  TF_EXPECT_OK(status);
+  GraphDef gdef = test::function::GDef(
+      {ndef_a, ndef_b, ndef_c, ndef_d, ndef_e, ndef_u, ndef_v});
+
+  EXPECT_EQ(7, gdef.node_size());
+  EXPECT_TRUE(VerifyNodeHasColocation(gdef.node(0), "loc:@X"));  // A
+  EXPECT_TRUE(VerifyNodeHasColocation(gdef.node(1), "loc:@X"));  // B
+  EXPECT_TRUE(VerifyNodeHasColocation(gdef.node(2), "loc:@X"));  // C
+  EXPECT_TRUE(VerifyNodeHasColocation(gdef.node(3), "loc:@C"));  // D
+  EXPECT_TRUE(VerifyNodeHasColocation(gdef.node(4), "loc:@D"));  // E
+  EXPECT_TRUE(VerifyNodeHasColocation(gdef.node(5), "loc:@W"));  // U
+  EXPECT_TRUE(VerifyNodeHasColocation(gdef.node(6), "loc:@W"));  // V
+
+  ReassignColocation(&gdef);
+
+  EXPECT_EQ(7, gdef.node_size());
+  EXPECT_TRUE(VerifyNodeHasColocation(gdef.node(0), "loc:@E"));  // A
+  EXPECT_TRUE(VerifyNodeHasColocation(gdef.node(1), "loc:@E"));  // B
+  EXPECT_TRUE(VerifyNodeHasColocation(gdef.node(2), "loc:@E"));  // C
+  EXPECT_TRUE(VerifyNodeHasColocation(gdef.node(3), "loc:@E"));  // D
+  EXPECT_EQ(0, gdef.node(4).attr_size());                        // E
+  EXPECT_TRUE(VerifyNodeHasColocation(gdef.node(5), "loc:@V"));  // U
+  EXPECT_EQ(0, gdef.node(6).attr_size());                        // V
+}
+
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/utils/grappler_test.cc b/tensorflow/core/grappler/utils/grappler_test.cc
index ee126f49557643a96926bc6dfc00629e27fb79da..910b0acaefbb10e1da24ab9ec5bfa95b1b5710d4 100644
--- a/tensorflow/core/grappler/utils/grappler_test.cc
+++ b/tensorflow/core/grappler/utils/grappler_test.cc
@@ -41,11 +41,17 @@ GrapplerTest::GrapplerTest() {
 
 std::vector<Tensor> GrapplerTest::EvaluateNodes(
     const GraphDef& graph, const std::vector<string>& node_names) const {
+  return EvaluateNodes(graph, node_names, {});
+}
+
+std::vector<Tensor> GrapplerTest::EvaluateNodes(
+    const GraphDef& graph, const std::vector<string>& node_names,
+    const std::vector<std::pair<string, Tensor>>& inputs) const {
   std::unique_ptr<tensorflow::Session> session(NewSession(options_));
   TF_CHECK_OK(session->Create(graph));
   RunOptions run_options;
   std::vector<Tensor> output_tensors;
-  TF_CHECK_OK(session->Run(run_options, {}, node_names, node_names,
+  TF_CHECK_OK(session->Run(run_options, inputs, node_names, node_names,
                            &output_tensors, nullptr));
   TF_CHECK_OK(session->Close());
   return output_tensors;
diff --git a/tensorflow/core/grappler/utils/grappler_test.h b/tensorflow/core/grappler/utils/grappler_test.h
index e0c67381a454a7a6219ffe596831f203856b71ab..e1394b9c35f2cd5ec55db19dc165352399faf568 100644
--- a/tensorflow/core/grappler/utils/grappler_test.h
+++ b/tensorflow/core/grappler/utils/grappler_test.h
@@ -37,6 +37,10 @@ class GrapplerTest : public ::testing::Test {
   std::vector<Tensor> EvaluateNodes(
       const GraphDef& graph, const std::vector<string>& node_names) const;
 
+  std::vector<Tensor> EvaluateNodes(
+      const GraphDef& graph, const std::vector<string>& node_names,
+      const std::vector<std::pair<string, Tensor>>& inputs) const;
+
   std::vector<Tensor> EvaluateFetchNodes(const GrapplerItem& item) const;
 
   NodeDef* AddNode(const string& name, const string& op,
@@ -53,6 +57,15 @@ class GrapplerTest : public ::testing::Test {
   // Count nodes of the given op-type in a graph.
   int CountOpNodes(const GraphDef& graph, const string& op);
 
+  // Get a random tansor with given shape.
+  template <DataType DTYPE>
+  Tensor GenerateRandomTensor(const TensorShape& shape) const {
+    typedef typename EnumToDataType<DTYPE>::Type T;
+    Tensor tensor(DTYPE, shape);
+    tensor.flat<T>() = tensor.flat<T>().random();
+    return tensor;
+  }
+
  private:
   SessionOptions options_;
 };
diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD
index b469c01881f4a6ef3debc730c3e69152608d0e2a..2bbedfff73a1c261e5ef0ade24848d112a769cbe 100644
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@@ -131,6 +131,17 @@ tf_kernel_library(
     ],
 )
 
+tf_kernel_library(
+    name = "collective_ops",
+    prefix = "collective_ops",
+    deps = [
+        "//tensorflow/core:collective_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+    ],
+)
+
 tf_kernel_library(
     name = "concat_lib",
     srcs = [
@@ -1395,6 +1406,7 @@ tf_kernel_library(
     visibility = [":friends"],
     deps = [
         ":bounds_check",
+        ":dense_update_functor",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//third_party/eigen3",
@@ -2265,6 +2277,7 @@ tf_cc_tests(
         ":ops_util",
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:test",
@@ -5120,6 +5133,9 @@ filegroup(
             "summary_interface.*",
             "summary_kernels.*",
             "spectrogram_convert_test_data.cc",
+            "decode_proto_op.cc",
+            "encode_proto_op.cc",
+            "rpc_op.cc",
             # Excluded due to experimental status:
             "debug_ops.*",
             "scatter_nd_op*",
@@ -5905,6 +5921,7 @@ tf_cc_test(
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
         "//tensorflow/core:tensorflow",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
@@ -5926,8 +5943,7 @@ tf_mkl_kernel_library(
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:nn_ops_op_lib",
         "//third_party/mkl:intel_binary_blob",
-        "@mkl_dnn",
-    ],
+    ] + if_mkl(["@mkl_dnn"]),
 )
 
 tf_mkl_kernel_library(
@@ -5942,8 +5958,7 @@ tf_mkl_kernel_library(
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:nn_ops_op_lib",
         "//third_party/mkl:intel_binary_blob",
-        "@mkl_dnn",
-    ],
+    ] + if_mkl(["@mkl_dnn"]),
 )
 
 tf_mkl_kernel_library(
@@ -5959,8 +5974,7 @@ tf_mkl_kernel_library(
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:nn_ops_op_lib",
         "//third_party/mkl:intel_binary_blob",
-        "@mkl_dnn",
-    ],
+    ] + if_mkl(["@mkl_dnn"]),
 )
 
 tf_mkl_kernel_library(
@@ -5980,8 +5994,7 @@ tf_mkl_kernel_library(
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:nn_ops_op_lib",
         "//third_party/mkl:intel_binary_blob",
-        "@mkl_dnn",
-    ],
+    ] + if_mkl(["@mkl_dnn"]),
 )
 
 tf_mkl_kernel_library(
@@ -5997,8 +6010,7 @@ tf_mkl_kernel_library(
         "//tensorflow/core:nn_ops_op_lib",
         "//third_party/eigen3",
         "//third_party/mkl:intel_binary_blob",
-        "@mkl_dnn",
-    ],
+    ] + if_mkl(["@mkl_dnn"]),
 )
 
 tf_mkl_kernel_library(
@@ -6014,8 +6026,7 @@ tf_mkl_kernel_library(
         "//tensorflow/core:nn_ops_op_lib",
         "//third_party/eigen3",
         "//third_party/mkl:intel_binary_blob",
-        "@mkl_dnn",
-    ],
+    ] + if_mkl(["@mkl_dnn"]),
 )
 
 tf_mkl_kernel_library(
@@ -6023,8 +6034,7 @@ tf_mkl_kernel_library(
     srcs = ["mkl_fused_batch_norm_op.cc"],
     deps = NN_DEPS + [
         "//third_party/mkl:intel_binary_blob",
-        "@mkl_dnn",
-    ],
+    ] + if_mkl(["@mkl_dnn"]),
 )
 
 tf_mkl_kernel_library(
@@ -6032,8 +6042,7 @@ tf_mkl_kernel_library(
     prefix = "mkl_aggregate_ops",
     deps = MATH_DEPS + [
         "//third_party/mkl:intel_binary_blob",
-        "@mkl_dnn",
-    ],
+    ] + if_mkl(["@mkl_dnn"]),
 )
 
 tf_mkl_kernel_library(
@@ -6041,8 +6050,7 @@ tf_mkl_kernel_library(
     prefix = "mkl_concat_op",
     deps = ARRAY_DEPS + [
         "//third_party/mkl:intel_binary_blob",
-        "@mkl_dnn",
-    ],
+    ] + if_mkl(["@mkl_dnn"]),
 )
 
 tf_mkl_kernel_library(
@@ -6050,8 +6058,7 @@ tf_mkl_kernel_library(
     prefix = "mkl_reshape_op",
     deps = ARRAY_DEPS + [
         "//third_party/mkl:intel_binary_blob",
-        "@mkl_dnn",
-    ],
+    ] + if_mkl(["@mkl_dnn"]),
 )
 
 tf_mkl_kernel_library(
@@ -6059,8 +6066,7 @@ tf_mkl_kernel_library(
     prefix = "mkl_identity_op",
     deps = ARRAY_DEPS + [
         "//third_party/mkl:intel_binary_blob",
-        "@mkl_dnn",
-    ],
+    ] + if_mkl(["@mkl_dnn"]),
 )
 
 tf_mkl_kernel_library(
@@ -6068,8 +6074,7 @@ tf_mkl_kernel_library(
     prefix = "mkl_lrn_op",
     deps = NN_DEPS + [
         "//third_party/mkl:intel_binary_blob",
-        "@mkl_dnn",
-    ],
+    ] + if_mkl(["@mkl_dnn"]),
 )
 
 tf_mkl_kernel_library(
@@ -6096,6 +6101,13 @@ cc_library(
     ],
 )
 
+tf_kernel_library(
+    name = "boosted_trees_ops",
+    deps = [
+        "//tensorflow/core/kernels/boosted_trees:boosted_trees_ops",
+    ],
+)
+
 cc_library(
     name = "captured_function",
     hdrs = ["captured_function.h"],
@@ -6144,21 +6156,53 @@ tf_kernel_library(
     ],
 )
 
-# -----------------------------------------------------------------------------
-# Google-internal targets.  These must be at the end for syncrepo.
+tf_kernel_library(
+    name = "decode_proto_op",
+    srcs = [
+        "decode_proto_op.cc",
+    ],
+    deps = [
+        "//tensorflow/core:decode_proto_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core/util/proto:decode",
+        "//tensorflow/core/util/proto:descriptors",
+        "//third_party/eigen3",
+    ],
+)
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
+tf_kernel_library(
+    name = "encode_proto_op",
+    srcs = ["encode_proto_op.cc"],
+    deps = [
+        "//tensorflow/core:encode_proto_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core/util/proto:descriptors",
+        "//third_party/eigen3",
+    ],
+)
+
+tf_kernel_library(
+    name = "rpc_op",
+    srcs = [
+        "rpc_op.cc",
+    ],
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:rpc_ops_op_lib",
+        "//tensorflow/core/util/rpc:call_container",
+        "//tensorflow/core/util/rpc:rpc_factory",
+        "//tensorflow/core/util/rpc:rpc_factory_registry",
+        "//third_party/eigen3",
+    ],
 )
 
+# -----------------------------------------------------------------------------
+# Google-internal targets.  These must be at the end for syncrepo.
+
 # Library to link with when compiling the cwise_op kernels directly,
 # e.g. for selective registration.
 # should not be linked by projects that also link the cwise_op library.
@@ -6185,3 +6229,12 @@ cc_library(
         "@gemmlowp",
     ],
 )
+
+# Header-only version of cwise_lib for clients that want to use the cwise_ops
+# functionality in their own custom ops.
+cc_header_only_library(
+    name = "cwise_lib_hdrs",
+    deps = [
+        ":cwise_lib",
+    ],
+)
diff --git a/tensorflow/core/kernels/assign_op.h b/tensorflow/core/kernels/assign_op.h
index a312e8e8a420f7f909b20b28f84bf55597a58aba..2ed1628bf1a84bf8729a949ca1b6d66ce58bdcdc 100644
--- a/tensorflow/core/kernels/assign_op.h
+++ b/tensorflow/core/kernels/assign_op.h
@@ -77,7 +77,8 @@ class AssignOp : public OpKernel {
 
       // 1. Try to reuse the rhs.
       std::unique_ptr<Tensor> input_alias = context->forward_input(
-          1, old_lhs.dtype(), old_lhs.shape(), DEVICE_MEMORY, attr);
+          1, OpKernelContext::Params::kNoReservation /*output_index*/,
+          old_lhs.dtype(), old_lhs.shape(), DEVICE_MEMORY, attr);
       if (input_alias != nullptr) {
         // Transfer ownership to the ref.
         context->replace_ref_input(0, *input_alias.release(),
diff --git a/tensorflow/core/kernels/batching_util/BUILD b/tensorflow/core/kernels/batching_util/BUILD
index 4397410a5cee839a70bde69f34ca72e31530565f..de05c647d6bfc80a0368ee3edba8f31bccff33f9 100644
--- a/tensorflow/core/kernels/batching_util/BUILD
+++ b/tensorflow/core/kernels/batching_util/BUILD
@@ -8,18 +8,6 @@ licenses(["notice"])  # Apache 2.0
 
 load("//tensorflow:tensorflow.bzl", "tf_cc_test")
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-            "**/google_*",
-        ],
-    ),
-)
-
 cc_library(
     name = "periodic_function_dynamic",
     srcs = ["periodic_function.cc"],
diff --git a/tensorflow/core/kernels/boosted_trees/BUILD b/tensorflow/core/kernels/boosted_trees/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..62327dfe1d044bd05966d420e557fc39edd84afd
--- /dev/null
+++ b/tensorflow/core/kernels/boosted_trees/BUILD
@@ -0,0 +1,89 @@
+# Description:
+#   OpKernels for boosted trees ops.
+
+package(
+    default_visibility = ["//tensorflow:internal"],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow:tensorflow.bzl", "tf_kernel_library")
+load(
+    "//tensorflow/core:platform/default/build_config.bzl",
+    "tf_proto_library",
+)
+
+tf_proto_library(
+    name = "boosted_trees_proto",
+    srcs = ["boosted_trees.proto"],
+    cc_api_version = 2,
+    visibility = ["//visibility:public"],
+)
+
+tf_kernel_library(
+    name = "prediction_ops",
+    srcs = ["prediction_ops.cc"],
+    deps = [
+        ":resource_ops",
+        ":resources",
+        "//tensorflow/core:boosted_trees_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
+cc_library(
+    name = "resources",
+    srcs = ["resources.cc"],
+    hdrs = ["resources.h"],
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core/kernels/boosted_trees:boosted_trees_proto_cc",
+    ],
+)
+
+tf_kernel_library(
+    name = "resource_ops",
+    srcs = ["resource_ops.cc"],
+    deps = [
+        ":resources",
+        "//tensorflow/core:boosted_trees_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core/kernels/boosted_trees:boosted_trees_proto_cc",
+    ],
+)
+
+tf_kernel_library(
+    name = "stats_ops",
+    srcs = ["stats_ops.cc"],
+    deps = [
+        "//tensorflow/core:boosted_trees_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+    ],
+)
+
+tf_kernel_library(
+    name = "training_ops",
+    srcs = ["training_ops.cc"],
+    deps = [
+        ":resources",
+        "//tensorflow/core:boosted_trees_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core/kernels/boosted_trees:boosted_trees_proto_cc",
+    ],
+)
+
+tf_kernel_library(
+    name = "boosted_trees_ops",
+    deps = [
+        ":prediction_ops",
+        ":resource_ops",
+        ":stats_ops",
+        ":training_ops",
+    ],
+)
diff --git a/tensorflow/core/kernels/boosted_trees/boosted_trees.proto b/tensorflow/core/kernels/boosted_trees/boosted_trees.proto
new file mode 100644
index 0000000000000000000000000000000000000000..106ceedc00721f51468639a1c9e235728db8dbae
--- /dev/null
+++ b/tensorflow/core/kernels/boosted_trees/boosted_trees.proto
@@ -0,0 +1,113 @@
+syntax = "proto3";
+
+package tensorflow.boosted_trees;
+option cc_enable_arenas = true;
+option java_outer_classname = "BoostedTreesProtos";
+option java_multiple_files = true;
+option java_package = "org.tensorflow.framework";
+
+// Node describes a node in a tree.
+message Node {
+  oneof node {
+    Leaf leaf = 1;
+    BucketizedSplit bucketized_split = 2;
+  }
+  NodeMetadata metadata = 777;
+}
+
+// NodeMetadata encodes metadata associated with each node in a tree.
+message NodeMetadata {
+  // The gain associated with this node.
+  float gain = 1;
+
+  // The original leaf node before this node was split.
+  Leaf original_leaf = 2;
+}
+
+// Leaves can either hold dense or sparse information.
+message Leaf {
+  oneof leaf {
+    // See third_party/tensorflow/contrib/decision_trees/
+    // proto/generic_tree_model.proto
+    // for a description of how vector and sparse_vector might be used.
+    Vector vector = 1;
+    SparseVector sparse_vector = 2;
+  }
+  float scalar = 3;
+}
+
+message Vector {
+  repeated float value = 1;
+}
+
+message SparseVector {
+  repeated int32 index = 1;
+  repeated float value = 2;
+}
+
+message BucketizedSplit {
+  // Float feature column and split threshold describing
+  // the rule feature <= threshold.
+  int32 feature_id = 1;
+  int32 threshold = 2;
+
+  // Node children indexing into a contiguous
+  // vector of nodes starting from the root.
+  int32 left_id = 3;
+  int32 right_id = 4;
+}
+
+// Tree describes a list of connected nodes.
+// Node 0 must be the root and can carry any payload including a leaf
+// in the case of representing the bias.
+// Note that each node id is implicitly its index in the list of nodes.
+message Tree {
+  repeated Node nodes = 1;
+}
+
+message TreeMetadata {
+  // Number of layers grown for this tree.
+  int32 num_layers_grown = 2;
+
+  // Whether the tree is finalized in that no more layers can be grown.
+  bool is_finalized = 3;
+
+  // If tree was finalized and post pruning happened, it is possible that cache
+  // still refers to some nodes that were deleted or that the node ids changed
+  // (e.g. node id 5 became node id 2 due to pruning of the other branch).
+  // The mapping below allows us to understand where the old ids now map to and
+  // how the values should be adjusted due to post-pruning.
+  // The size of the list should be equal to the number of nodes in the tree
+  // before post-pruning happened.
+  // If the node was pruned, it will have new_node_id equal to the id of a node
+  // that this node was collapsed into. For a node that didn't get pruned, it is
+  // possible that its id still changed, so new_node_id will have the
+  // corresponding id in the pruned tree.
+  // If post-pruning didn't happen, or it did and it had no effect (e.g. no
+  // nodes got pruned), this list will be empty.
+  repeated PostPruneNodeUpdate post_pruned_nodes_meta = 4;
+
+  message PostPruneNodeUpdate {
+    int32 new_node_id = 1;
+    float logit_change = 2;
+  }
+}
+
+message GrowingMetadata {
+  // Number of trees that we have attempted to build. After pruning, these
+  // trees might have been removed.
+  int64 num_trees_attempted = 1;
+  // Number of layers that we have attempted to build. After pruning, these
+  // layers might have been removed.
+  int64 num_layers_attempted = 2;
+}
+
+// TreeEnsemble describes an ensemble of decision trees.
+message TreeEnsemble {
+  repeated Tree trees = 1;
+  repeated float tree_weights = 2;
+
+  repeated TreeMetadata tree_metadata = 3;
+  // Metadata that is used during the training.
+  GrowingMetadata growing_metadata = 4;
+}
diff --git a/tensorflow/core/kernels/boosted_trees/prediction_ops.cc b/tensorflow/core/kernels/boosted_trees/prediction_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b13a4505464cf74fef82eb0151f23cb6b56a434e
--- /dev/null
+++ b/tensorflow/core/kernels/boosted_trees/prediction_ops.cc
@@ -0,0 +1,263 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <algorithm>
+#include <string>
+#include <vector>
+
+#include "tensorflow/core/framework/device_base.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/kernels/boosted_trees/resources.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/refcount.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/protobuf.h"
+#include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/util/work_sharder.h"
+
+namespace tensorflow {
+
+// The Op used during training time to get the predictions so far with the
+// current ensemble being built.
+// Expect some logits are cached from the previous step and passed through
+// to be reused.
+class BoostedTreesTrainingPredictOp : public OpKernel {
+ public:
+  explicit BoostedTreesTrainingPredictOp(OpKernelConstruction* const context)
+      : OpKernel(context) {
+    OP_REQUIRES_OK(context, context->GetAttr("num_bucketized_features",
+                                             &num_bucketized_features_));
+    OP_REQUIRES_OK(context,
+                   context->GetAttr("logits_dimension", &logits_dimension_));
+    OP_REQUIRES(context, logits_dimension_ == 1,
+                errors::InvalidArgument(
+                    "Currently only one dimensional outputs are supported."));
+    OP_REQUIRES_OK(context, context->GetAttr("max_depth", &max_depth_));
+  }
+
+  void Compute(OpKernelContext* const context) override {
+    BoostedTreesEnsembleResource* resource;
+    // Get the resource.
+    OP_REQUIRES_OK(context, LookupResource(context, HandleFromInput(context, 0),
+                                           &resource));
+    // Release the reference to the resource once we're done using it.
+    core::ScopedUnref unref_me(resource);
+
+    // Get the inputs.
+    OpInputList bucketized_features_list;
+    OP_REQUIRES_OK(context, context->input_list("bucketized_features",
+                                                &bucketized_features_list));
+    std::vector<tensorflow::TTypes<int32>::ConstVec> batch_bucketized_features;
+    batch_bucketized_features.reserve(bucketized_features_list.size());
+    for (const Tensor& tensor : bucketized_features_list) {
+      batch_bucketized_features.emplace_back(tensor.vec<int32>());
+    }
+    const int batch_size = batch_bucketized_features[0].size();
+
+    const Tensor* cached_tree_ids_t;
+    OP_REQUIRES_OK(context,
+                   context->input("cached_tree_ids", &cached_tree_ids_t));
+    const auto cached_tree_ids = cached_tree_ids_t->vec<int32>();
+
+    const Tensor* cached_node_ids_t;
+    OP_REQUIRES_OK(context,
+                   context->input("cached_node_ids", &cached_node_ids_t));
+    const auto cached_node_ids = cached_node_ids_t->vec<int32>();
+
+    // Allocate outputs.
+    Tensor* output_partial_logits_t = nullptr;
+    OP_REQUIRES_OK(context,
+                   context->allocate_output("partial_logits",
+                                            {batch_size, logits_dimension_},
+                                            &output_partial_logits_t));
+    auto output_partial_logits = output_partial_logits_t->matrix<float>();
+
+    Tensor* output_tree_ids_t = nullptr;
+    OP_REQUIRES_OK(context, context->allocate_output("tree_ids", {batch_size},
+                                                     &output_tree_ids_t));
+    auto output_tree_ids = output_tree_ids_t->vec<int32>();
+
+    Tensor* output_node_ids_t = nullptr;
+    OP_REQUIRES_OK(context, context->allocate_output("node_ids", {batch_size},
+                                                     &output_node_ids_t));
+    auto output_node_ids = output_node_ids_t->vec<int32>();
+
+    // Indicate that the latest tree was used.
+    const int32 latest_tree = resource->num_trees() - 1;
+
+    if (latest_tree < 0) {
+      // Ensemble was empty. Nothing changes.
+      output_node_ids = cached_node_ids;
+      output_tree_ids = cached_tree_ids;
+      // All the predictions are zeros.
+      output_partial_logits.setZero();
+    } else {
+      output_tree_ids.setConstant(latest_tree);
+      auto do_work = [&resource, &batch_bucketized_features, &cached_tree_ids,
+                      &cached_node_ids, &output_partial_logits,
+                      &output_node_ids, batch_size,
+                      latest_tree](int32 start, int32 end) {
+        for (int32 i = start; i < end; ++i) {
+          int32 tree_id = cached_tree_ids(i);
+          int32 node_id = cached_node_ids(i);
+          float partial_tree_logit = 0.0;
+
+          // If the tree was pruned, returns the node id into which the
+          // current_node_id was pruned, as well the correction of the cached
+          // logit prediction.
+          resource->GetPostPruneCorrection(tree_id, node_id, &node_id,
+                                           &partial_tree_logit);
+
+          // Logic in the loop adds the cached node value again if it is a leaf.
+          // If it is not a leaf anymore we need to subtract the old node's
+          // value. The following logic handles both of these cases.
+          partial_tree_logit -= resource->node_value(tree_id, node_id);
+          float partial_all_logit = 0.0;
+          while (true) {
+            if (resource->is_leaf(tree_id, node_id)) {
+              partial_tree_logit += resource->node_value(tree_id, node_id);
+
+              // Tree is done
+              partial_all_logit +=
+                  resource->GetTreeWeight(tree_id) * partial_tree_logit;
+              partial_tree_logit = 0.0;
+              // Stop if it was the latest tree.
+              if (tree_id == latest_tree) {
+                break;
+              }
+              // Move onto other trees.
+              ++tree_id;
+              node_id = 0;
+            } else {
+              node_id = resource->next_node(tree_id, node_id, i,
+                                            batch_bucketized_features);
+            }
+          }
+          output_node_ids(i) = node_id;
+          output_partial_logits(i, 0) = partial_all_logit;
+        }
+      };
+      // Assume we will not go over more than one full tree. 4 is a magic
+      // number.
+      const int64 cost = 4 * max_depth_;
+      thread::ThreadPool* const worker_threads =
+          context->device()->tensorflow_cpu_worker_threads()->workers;
+      Shard(worker_threads->NumThreads(), worker_threads, batch_size,
+            /*cost_per_unit=*/cost, do_work);
+    }
+  }
+
+ private:
+  int32 logits_dimension_;         // the size of the output prediction vector.
+  int32 num_bucketized_features_;  // Indicates the number of features.
+  int32 max_depth_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("BoostedTreesTrainingPredict").Device(DEVICE_CPU),
+                        BoostedTreesTrainingPredictOp);
+
+// The Op to get the predictions at the evaluation/inference time.
+class BoostedTreesPredictOp : public OpKernel {
+ public:
+  explicit BoostedTreesPredictOp(OpKernelConstruction* const context)
+      : OpKernel(context) {
+    OP_REQUIRES_OK(context, context->GetAttr("num_bucketized_features",
+                                             &num_bucketized_features_));
+    OP_REQUIRES_OK(context,
+                   context->GetAttr("logits_dimension", &logits_dimension_));
+    OP_REQUIRES(context, logits_dimension_ == 1,
+                errors::InvalidArgument(
+                    "Currently only one dimensional outputs are supported."));
+    OP_REQUIRES_OK(context, context->GetAttr("max_depth", &max_depth_));
+  }
+
+  void Compute(OpKernelContext* const context) override {
+    BoostedTreesEnsembleResource* resource;
+    // Get the resource.
+    OP_REQUIRES_OK(context, LookupResource(context, HandleFromInput(context, 0),
+                                           &resource));
+    // Release the reference to the resource once we're done using it.
+    core::ScopedUnref unref_me(resource);
+
+    // Get the inputs.
+    OpInputList bucketized_features_list;
+    OP_REQUIRES_OK(context, context->input_list("bucketized_features",
+                                                &bucketized_features_list));
+    std::vector<tensorflow::TTypes<int32>::ConstVec> batch_bucketized_features;
+    batch_bucketized_features.reserve(bucketized_features_list.size());
+    for (const Tensor& tensor : bucketized_features_list) {
+      batch_bucketized_features.emplace_back(tensor.vec<int32>());
+    }
+    const int batch_size = batch_bucketized_features[0].size();
+
+    // Allocate outputs.
+    Tensor* output_logits_t = nullptr;
+    OP_REQUIRES_OK(context, context->allocate_output(
+                                "logits", {batch_size, logits_dimension_},
+                                &output_logits_t));
+    auto output_logits = output_logits_t->matrix<float>();
+
+    const int32 latest_tree = resource->num_trees() - 1;
+
+    auto do_work = [&resource, &batch_bucketized_features, &output_logits,
+                    batch_size, latest_tree](int32 start, int32 end) {
+      for (int32 i = start; i < end; ++i) {
+        float tree_logit = 0.0;
+        int32 tree_id = 0;
+        int32 node_id = 0;
+        while (true) {
+          if (resource->is_leaf(tree_id, node_id)) {
+            tree_logit += resource->GetTreeWeight(tree_id) *
+                          resource->node_value(tree_id, node_id);
+
+            // Stop if it was the latest tree.
+            if (tree_id == latest_tree) {
+              break;
+            }
+            // Move onto other trees.
+            ++tree_id;
+            node_id = 0;
+          } else {
+            node_id = resource->next_node(tree_id, node_id, i,
+                                          batch_bucketized_features);
+          }
+        }
+        output_logits(i, 0) = tree_logit;
+      }
+    };
+    const int64 cost = (latest_tree + 1) * max_depth_;
+    thread::ThreadPool* const worker_threads =
+        context->device()->tensorflow_cpu_worker_threads()->workers;
+    Shard(worker_threads->NumThreads(), worker_threads, batch_size,
+          /*cost_per_unit=*/cost, do_work);
+  }
+
+ private:
+  int32
+      logits_dimension_;  // Indicates the size of the output prediction vector.
+  int32 num_bucketized_features_;  // Indicates the number of features.
+  int32 max_depth_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("BoostedTreesPredict").Device(DEVICE_CPU),
+                        BoostedTreesPredictOp);
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/boosted_trees/resource_ops.cc b/tensorflow/core/kernels/boosted_trees/resource_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f49242d8566f09d34088131b7f74ea4362a86860
--- /dev/null
+++ b/tensorflow/core/kernels/boosted_trees/resource_ops.cc
@@ -0,0 +1,189 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <memory>
+#include <string>
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/kernels/boosted_trees/resources.h"
+
+namespace tensorflow {
+
+REGISTER_RESOURCE_HANDLE_KERNEL(BoostedTreesEnsembleResource);
+
+REGISTER_KERNEL_BUILDER(
+    Name("IsBoostedTreesEnsembleInitialized").Device(DEVICE_CPU),
+    IsResourceInitialized<BoostedTreesEnsembleResource>);
+
+// Creates a tree ensemble resource.
+class BoostedTreesCreateEnsembleOp : public OpKernel {
+ public:
+  explicit BoostedTreesCreateEnsembleOp(OpKernelConstruction* context)
+      : OpKernel(context) {}
+
+  void Compute(OpKernelContext* context) override {
+    // Get the stamp token.
+    const Tensor* stamp_token_t;
+    OP_REQUIRES_OK(context, context->input("stamp_token", &stamp_token_t));
+    int64 stamp_token = stamp_token_t->scalar<int64>()();
+
+    // Get the tree ensemble proto.
+    const Tensor* tree_ensemble_serialized_t;
+    OP_REQUIRES_OK(context, context->input("tree_ensemble_serialized",
+                                           &tree_ensemble_serialized_t));
+    std::unique_ptr<BoostedTreesEnsembleResource> result(
+        new BoostedTreesEnsembleResource());
+    if (!result->InitFromSerialized(
+            tree_ensemble_serialized_t->scalar<string>()(), stamp_token)) {
+      result->Unref();
+      OP_REQUIRES(
+          context, false,
+          errors::InvalidArgument("Unable to parse tree ensemble proto."));
+    }
+
+    // Only create one, if one does not exist already. Report status for all
+    // other exceptions.
+    auto status =
+        CreateResource(context, HandleFromInput(context, 0), result.release());
+    if (status.code() != tensorflow::error::ALREADY_EXISTS) {
+      OP_REQUIRES_OK(context, status);
+    }
+  }
+};
+
+REGISTER_KERNEL_BUILDER(Name("BoostedTreesCreateEnsemble").Device(DEVICE_CPU),
+                        BoostedTreesCreateEnsembleOp);
+
+// Op for retrieving some model states (needed for training).
+class BoostedTreesGetEnsembleStatesOp : public OpKernel {
+ public:
+  explicit BoostedTreesGetEnsembleStatesOp(OpKernelConstruction* context)
+      : OpKernel(context) {}
+
+  void Compute(OpKernelContext* context) override {
+    // Looks up the resource.
+    BoostedTreesEnsembleResource* tree_ensemble_resource;
+    OP_REQUIRES_OK(context, LookupResource(context, HandleFromInput(context, 0),
+                                           &tree_ensemble_resource));
+    tf_shared_lock l(*tree_ensemble_resource->get_mutex());
+    core::ScopedUnref unref_me(tree_ensemble_resource);
+
+    // Sets the outputs.
+    const int num_trees = tree_ensemble_resource->num_trees();
+    const int num_finalized_trees =
+        (num_trees <= 0 ||
+         tree_ensemble_resource->IsTreeFinalized(num_trees - 1))
+            ? num_trees
+            : num_trees - 1;
+    const int num_attempted_layers =
+        tree_ensemble_resource->GetNumLayersAttempted();
+
+    // growing_metadata
+    Tensor* output_stamp_token_t = nullptr;
+    Tensor* output_num_trees_t = nullptr;
+    Tensor* output_num_finalized_trees_t = nullptr;
+    Tensor* output_num_attempted_layers_t = nullptr;
+
+    OP_REQUIRES_OK(context, context->allocate_output(0, TensorShape(),
+                                                     &output_stamp_token_t));
+    OP_REQUIRES_OK(context, context->allocate_output(1, TensorShape(),
+                                                     &output_num_trees_t));
+    OP_REQUIRES_OK(context,
+                   context->allocate_output(2, TensorShape(),
+                                            &output_num_finalized_trees_t));
+    OP_REQUIRES_OK(context,
+                   context->allocate_output(3, TensorShape(),
+                                            &output_num_attempted_layers_t));
+
+    output_stamp_token_t->scalar<int64>()() = tree_ensemble_resource->stamp();
+    output_num_trees_t->scalar<int32>()() = num_trees;
+    output_num_finalized_trees_t->scalar<int32>()() = num_finalized_trees;
+    output_num_attempted_layers_t->scalar<int32>()() = num_attempted_layers;
+  }
+};
+
+REGISTER_KERNEL_BUILDER(
+    Name("BoostedTreesGetEnsembleStates").Device(DEVICE_CPU),
+    BoostedTreesGetEnsembleStatesOp);
+
+// Op for serializing a model.
+class BoostedTreesSerializeEnsembleOp : public OpKernel {
+ public:
+  explicit BoostedTreesSerializeEnsembleOp(OpKernelConstruction* context)
+      : OpKernel(context) {}
+
+  void Compute(OpKernelContext* context) override {
+    BoostedTreesEnsembleResource* tree_ensemble_resource;
+    OP_REQUIRES_OK(context, LookupResource(context, HandleFromInput(context, 0),
+                                           &tree_ensemble_resource));
+    tf_shared_lock l(*tree_ensemble_resource->get_mutex());
+    core::ScopedUnref unref_me(tree_ensemble_resource);
+    Tensor* output_stamp_token_t = nullptr;
+    OP_REQUIRES_OK(context, context->allocate_output(0, TensorShape(),
+                                                     &output_stamp_token_t));
+    output_stamp_token_t->scalar<int64>()() = tree_ensemble_resource->stamp();
+    Tensor* output_proto_t = nullptr;
+    OP_REQUIRES_OK(context,
+                   context->allocate_output(1, TensorShape(), &output_proto_t));
+    output_proto_t->scalar<string>()() =
+        tree_ensemble_resource->SerializeAsString();
+  }
+};
+
+REGISTER_KERNEL_BUILDER(
+    Name("BoostedTreesSerializeEnsemble").Device(DEVICE_CPU),
+    BoostedTreesSerializeEnsembleOp);
+
+// Op for deserializing a tree ensemble variable from a checkpoint.
+class BoostedTreesDeserializeEnsembleOp : public OpKernel {
+ public:
+  explicit BoostedTreesDeserializeEnsembleOp(OpKernelConstruction* context)
+      : OpKernel(context) {}
+
+  void Compute(OpKernelContext* context) override {
+    BoostedTreesEnsembleResource* tree_ensemble_resource;
+    OP_REQUIRES_OK(context, LookupResource(context, HandleFromInput(context, 0),
+                                           &tree_ensemble_resource));
+    mutex_lock l(*tree_ensemble_resource->get_mutex());
+    core::ScopedUnref unref_me(tree_ensemble_resource);
+
+    // Get the stamp token.
+    const Tensor* stamp_token_t;
+    OP_REQUIRES_OK(context, context->input("stamp_token", &stamp_token_t));
+    int64 stamp_token = stamp_token_t->scalar<int64>()();
+
+    // Get the tree ensemble proto.
+    const Tensor* tree_ensemble_serialized_t;
+    OP_REQUIRES_OK(context, context->input("tree_ensemble_serialized",
+                                           &tree_ensemble_serialized_t));
+    // Deallocate all the previous objects on the resource.
+    tree_ensemble_resource->Reset();
+    OP_REQUIRES(
+        context,
+        tree_ensemble_resource->InitFromSerialized(
+            tree_ensemble_serialized_t->scalar<string>()(), stamp_token),
+        errors::InvalidArgument("Unable to parse tree ensemble proto."));
+  }
+};
+
+REGISTER_KERNEL_BUILDER(
+    Name("BoostedTreesDeserializeEnsemble").Device(DEVICE_CPU),
+    BoostedTreesDeserializeEnsembleOp);
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/boosted_trees/resources.cc b/tensorflow/core/kernels/boosted_trees/resources.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2ea12c522c8bb73d64715e3a75c481ccf6d054dc
--- /dev/null
+++ b/tensorflow/core/kernels/boosted_trees/resources.cc
@@ -0,0 +1,301 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/kernels/boosted_trees/resources.h"
+#include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/kernels/boosted_trees/boosted_trees.pb.h"
+#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/protobuf.h"
+
+namespace tensorflow {
+
+int32 BoostedTreesEnsembleResource::next_node(
+    const int32 tree_id, const int32 node_id, const int32 index_in_batch,
+    const std::vector<TTypes<int32>::ConstVec>& bucketized_features) const {
+  DCHECK_LT(tree_id, tree_ensemble_->trees_size());
+  DCHECK_LT(node_id, tree_ensemble_->trees(tree_id).nodes_size());
+  const auto& node = tree_ensemble_->trees(tree_id).nodes(node_id);
+  DCHECK_EQ(node.node_case(), boosted_trees::Node::kBucketizedSplit);
+  const auto& split = node.bucketized_split();
+  if (bucketized_features[split.feature_id()](index_in_batch) <=
+      split.threshold()) {
+    return split.left_id();
+  } else {
+    return split.right_id();
+  }
+}
+
+float BoostedTreesEnsembleResource::node_value(const int32 tree_id,
+                                               const int32 node_id) const {
+  DCHECK_LT(tree_id, tree_ensemble_->trees_size());
+  DCHECK_LT(node_id, tree_ensemble_->trees(tree_id).nodes_size());
+  const auto& node = tree_ensemble_->trees(tree_id).nodes(node_id);
+  if (node.node_case() == boosted_trees::Node::kLeaf) {
+    return node.leaf().scalar();
+  } else {
+    return node.metadata().original_leaf().scalar();
+  }
+}
+
+void BoostedTreesEnsembleResource::UpdateGrowingMetadata() const {
+  tree_ensemble_->mutable_growing_metadata()->set_num_layers_attempted(
+      tree_ensemble_->growing_metadata().num_layers_attempted() + 1);
+
+  const int n_trees = num_trees();
+
+  if (n_trees <= 0 ||
+      // Checks if we are building the first layer of the dummy empty tree
+      ((n_trees == 1 || IsTreeFinalized(n_trees - 2)) &&
+       (tree_ensemble_->trees(n_trees - 1).nodes_size() == 1))) {
+    tree_ensemble_->mutable_growing_metadata()->set_num_trees_attempted(
+        tree_ensemble_->growing_metadata().num_trees_attempted() + 1);
+  }
+}
+
+// Add a tree to the ensemble and returns a new tree_id.
+int32 BoostedTreesEnsembleResource::AddNewTree(const float weight) {
+  const int32 new_tree_id = tree_ensemble_->trees_size();
+  auto* node = tree_ensemble_->add_trees()->add_nodes();
+  node->mutable_leaf()->set_scalar(0.0);
+  tree_ensemble_->add_tree_weights(weight);
+  tree_ensemble_->add_tree_metadata();
+
+  return new_tree_id;
+}
+
+void BoostedTreesEnsembleResource::AddBucketizedSplitNode(
+    const int32 tree_id, const int32 node_id, const int32 feature_id,
+    const int32 threshold, const float gain, const float left_contrib,
+    const float right_contrib, int32* left_node_id, int32* right_node_id) {
+  auto* tree = tree_ensemble_->mutable_trees(tree_id);
+  auto* node = tree->mutable_nodes(node_id);
+  DCHECK_EQ(node->node_case(), boosted_trees::Node::kLeaf);
+  float prev_node_value = node->leaf().scalar();
+  *left_node_id = tree->nodes_size();
+  *right_node_id = *left_node_id + 1;
+  auto* left_node = tree->add_nodes();
+  auto* right_node = tree->add_nodes();
+  if (node_id != 0) {
+    // Save previous leaf value if it is not the first leaf in the tree.
+    node->mutable_metadata()->mutable_original_leaf()->Swap(
+        node->mutable_leaf());
+  }
+  node->mutable_metadata()->set_gain(gain);
+  auto* new_split = node->mutable_bucketized_split();
+  new_split->set_feature_id(feature_id);
+  new_split->set_threshold(threshold);
+  new_split->set_left_id(*left_node_id);
+  new_split->set_right_id(*right_node_id);
+  // TODO(npononareva): this is LAYER-BY-LAYER boosting; add WHOLE-TREE.
+  left_node->mutable_leaf()->set_scalar(prev_node_value + left_contrib);
+  right_node->mutable_leaf()->set_scalar(prev_node_value + right_contrib);
+}
+
+void BoostedTreesEnsembleResource::Reset() {
+  // Reset stamp.
+  set_stamp(-1);
+
+  // Clear tree ensemle.
+  arena_.Reset();
+  CHECK_EQ(0, arena_.SpaceAllocated());
+  tree_ensemble_ =
+      protobuf::Arena::CreateMessage<boosted_trees::TreeEnsemble>(&arena_);
+}
+
+void BoostedTreesEnsembleResource::PostPruneTree(const int32 current_tree) {
+  // No-op if tree is empty.
+  auto* tree = tree_ensemble_->mutable_trees(current_tree);
+  int32 num_nodes = tree->nodes_size();
+  if (num_nodes == 0) {
+    return;
+  }
+
+  std::vector<int32> nodes_to_delete;
+  // If a node was pruned, we need to save the change of the prediction from
+  // this node to its parent, as well as the parent id.
+  std::vector<std::pair<int32, float>> nodes_changes;
+  nodes_changes.reserve(num_nodes);
+  for (int32 i = 0; i < num_nodes; ++i) {
+    nodes_changes.emplace_back(i, 0.0);
+  }
+  // Prune the tree recursively starting from the root. Each node that has
+  // negative gain and only leaf children will be pruned recursively up from
+  // the bottom of the tree. This method returns the list of nodes pruned, and
+  // updates the nodes in the tree not to refer to those pruned nodes.
+  RecursivelyDoPostPrunePreparation(current_tree, 0, &nodes_to_delete,
+                                    &nodes_changes);
+
+  if (nodes_to_delete.empty()) {
+    // No pruning happened, and no post-processing needed.
+    return;
+  }
+
+  // Sort node ids so they are in asc order.
+  std::sort(nodes_to_delete.begin(), nodes_to_delete.end());
+
+  // We need to
+  // - update split left and right children ids with new indices
+  // - actually remove the nodes that need to be removed
+  // - save the information about pruned node so we could recover the
+  // predictions from cache. Build a map for old node index=>new node index.
+  // nodes_to_delete contains nodes who's indices should be skipped, in
+  // ascending order. Save the information about new indices into meta.
+  std::map<int32, int32> old_to_new_ids;
+  int32 new_index = 0;
+  int32 index_for_deleted = 0;
+  auto* post_prune_meta = tree_ensemble_->mutable_tree_metadata(current_tree)
+                              ->mutable_post_pruned_nodes_meta();
+
+  for (int32 i = 0; i < num_nodes; ++i) {
+    if (index_for_deleted < nodes_to_delete.size() &&
+        i == nodes_to_delete[index_for_deleted]) {
+      // Node i will get removed,
+      ++index_for_deleted;
+      // Update meta info that will allow us to use cached predictions from
+      // those nodes.
+      int32 new_id;
+      float logit_change;
+      CalculateParentAndLogitUpdate(i, nodes_changes, &new_id, &logit_change);
+      auto* meta = post_prune_meta->Add();
+      meta->set_new_node_id(old_to_new_ids[new_id]);
+      meta->set_logit_change(logit_change);
+    } else {
+      old_to_new_ids[i] = new_index++;
+      auto* meta = post_prune_meta->Add();
+      // Update meta info that will allow us to use cached predictions from
+      // those nodes.
+      meta->set_new_node_id(old_to_new_ids[i]);
+      meta->set_logit_change(0.0);
+    }
+  }
+  index_for_deleted = 0;
+  int32 i = 0;
+  protobuf::RepeatedPtrField<boosted_trees::Node> new_nodes;
+  new_nodes.Reserve(old_to_new_ids.size());
+  for (auto node : *(tree->mutable_nodes())) {
+    if (index_for_deleted < nodes_to_delete.size() &&
+        i == nodes_to_delete[index_for_deleted]) {
+      ++index_for_deleted;
+      ++i;
+      continue;
+    } else {
+      if (node.node_case() == boosted_trees::Node::kBucketizedSplit) {
+        node.mutable_bucketized_split()->set_left_id(
+            old_to_new_ids[node.bucketized_split().left_id()]);
+        node.mutable_bucketized_split()->set_right_id(
+            old_to_new_ids[node.bucketized_split().right_id()]);
+      }
+      *new_nodes.Add() = std::move(node);
+    }
+    ++i;
+  }
+  // Replace all the nodes in a tree with the ones we keep.
+  *tree->mutable_nodes() = std::move(new_nodes);
+
+  // Note that if the whole tree got pruned, we will end up with one node.
+  // We can't remove that tree because it will cause problems with cache.
+}
+
+void BoostedTreesEnsembleResource::GetPostPruneCorrection(
+    const int32 tree_id, const int32 initial_node_id, int32* current_node_id,
+    float* logit_update) const {
+  DCHECK_LT(tree_id, tree_ensemble_->trees_size());
+  if (IsTreeFinalized(tree_id) && IsTreePostPruned(tree_id)) {
+    DCHECK_LT(
+        initial_node_id,
+        tree_ensemble_->tree_metadata(tree_id).post_pruned_nodes_meta_size());
+    const auto& meta =
+        tree_ensemble_->tree_metadata(tree_id).post_pruned_nodes_meta(
+            initial_node_id);
+    *current_node_id = meta.new_node_id();
+    *logit_update += meta.logit_change();
+  }
+}
+
+bool BoostedTreesEnsembleResource::IsTerminalSplitNode(
+    const int32 tree_id, const int32 node_id) const {
+  const auto& node = tree_ensemble_->trees(tree_id).nodes(node_id);
+  DCHECK_EQ(node.node_case(), boosted_trees::Node::kBucketizedSplit);
+  const int32 left_id = node.bucketized_split().left_id();
+  const int32 right_id = node.bucketized_split().right_id();
+  return is_leaf(tree_id, left_id) && is_leaf(tree_id, right_id);
+}
+
+// For each pruned node, finds the leaf where it finally ended up and
+// calculates the total update from that pruned node prediction.
+void BoostedTreesEnsembleResource::CalculateParentAndLogitUpdate(
+    const int32 start_node_id,
+    const std::vector<std::pair<int32, float>>& nodes_change, int32* parent_id,
+    float* change) const {
+  *change = 0.0;
+  int32 node_id = start_node_id;
+  int32 parent = nodes_change[node_id].first;
+
+  while (parent != node_id) {
+    (*change) += nodes_change[node_id].second;
+    node_id = parent;
+    parent = nodes_change[node_id].first;
+  }
+  *parent_id = parent;
+}
+
+void BoostedTreesEnsembleResource::RecursivelyDoPostPrunePreparation(
+    const int32 tree_id, const int32 node_id,
+    std::vector<int32>* nodes_to_delete,
+    std::vector<std::pair<int32, float>>* nodes_meta) {
+  auto* node = tree_ensemble_->mutable_trees(tree_id)->mutable_nodes(node_id);
+  DCHECK_NE(node->node_case(), boosted_trees::Node::NODE_NOT_SET);
+  // Base case when we reach a leaf.
+  if (node->node_case() == boosted_trees::Node::kLeaf) {
+    return;
+  }
+
+  // Traverse node children first and recursively prune their sub-trees.
+  RecursivelyDoPostPrunePreparation(tree_id, node->bucketized_split().left_id(),
+                                    nodes_to_delete, nodes_meta);
+  RecursivelyDoPostPrunePreparation(tree_id,
+                                    node->bucketized_split().right_id(),
+                                    nodes_to_delete, nodes_meta);
+
+  // Two conditions must be satisfied to prune the node:
+  // 1- The split gain is negative.
+  // 2- After depth-first pruning, the node only has leaf children.
+  const auto& node_metadata = node->metadata();
+  if (node_metadata.gain() < 0 && IsTerminalSplitNode(tree_id, node_id)) {
+    const int32 left_id = node->bucketized_split().left_id();
+    const int32 right_id = node->bucketized_split().right_id();
+
+    // Save children that need to be deleted.
+    nodes_to_delete->push_back(left_id);
+    nodes_to_delete->push_back(right_id);
+
+    // Change node back into leaf.
+    *node->mutable_leaf() = node_metadata.original_leaf();
+    const float parent_value = node_value(tree_id, node_id);
+
+    // Save the old values of weights of children.
+    (*nodes_meta)[left_id].first = node_id;
+    (*nodes_meta)[left_id].second = parent_value - node_value(tree_id, left_id);
+
+    (*nodes_meta)[right_id].first = node_id;
+    (*nodes_meta)[right_id].second =
+        parent_value - node_value(tree_id, right_id);
+
+    // Clear gain for leaf node.
+    node->clear_metadata();
+  }
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/boosted_trees/resources.h b/tensorflow/core/kernels/boosted_trees/resources.h
new file mode 100644
index 0000000000000000000000000000000000000000..c82588b9507800a860e6fc7af4a51541f09cad5b
--- /dev/null
+++ b/tensorflow/core/kernels/boosted_trees/resources.h
@@ -0,0 +1,221 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_KERNELS_BOOSTED_TREES_RESOURCES_H_
+#define TENSORFLOW_CORE_KERNELS_BOOSTED_TREES_RESOURCES_H_
+
+#include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/kernels/boosted_trees/boosted_trees.pb.h"
+#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/protobuf.h"
+
+namespace tensorflow {
+
+// A StampedResource is a resource that has a stamp token associated with it.
+// Before reading from or applying updates to the resource, the stamp should
+// be checked to verify that the update is not stale.
+class StampedResource : public ResourceBase {
+ public:
+  StampedResource() : stamp_(-1) {}
+
+  bool is_stamp_valid(int64 stamp) const { return stamp_ == stamp; }
+
+  int64 stamp() const { return stamp_; }
+  void set_stamp(int64 stamp) { stamp_ = stamp; }
+
+ private:
+  int64 stamp_;
+};
+
+// Keep a tree ensemble in memory for efficient evaluation and mutation.
+class BoostedTreesEnsembleResource : public StampedResource {
+ public:
+  // Constructor.
+  BoostedTreesEnsembleResource()
+      : tree_ensemble_(
+            protobuf::Arena::CreateMessage<boosted_trees::TreeEnsemble>(
+                &arena_)) {}
+
+  string DebugString() override {
+    return strings::StrCat("TreeEnsemble[size=", tree_ensemble_->trees_size(),
+                           "]");
+  }
+
+  bool InitFromSerialized(const string& serialized, const int64 stamp_token) {
+    CHECK_EQ(stamp(), -1) << "Must Reset before Init.";
+    if (ParseProtoUnlimited(tree_ensemble_, serialized)) {
+      set_stamp(stamp_token);
+      return true;
+    }
+    return false;
+  }
+
+  string SerializeAsString() const {
+    return tree_ensemble_->SerializeAsString();
+  }
+
+  int32 num_trees() const { return tree_ensemble_->trees_size(); }
+
+  // Find the next node to which the example (specified by index_in_batch)
+  // traverses down from the current node indicated by tree_id and node_id.
+  // Args:
+  //   tree_id: the index of the tree in the ensemble.
+  //   node_id: the index of the node within the tree.
+  //   index_in_batch: the index of the example within the batch (relevant to
+  //       the index of the row to read in each bucketized_features).
+  //   bucketized_features: vector of feature Vectors.
+  int32 next_node(
+      const int32 tree_id, const int32 node_id, const int32 index_in_batch,
+      const std::vector<TTypes<int32>::ConstVec>& bucketized_features) const;
+
+  float node_value(const int32 tree_id, const int32 node_id) const;
+
+  int32 GetNumLayersGrown(const int32 tree_id) const {
+    DCHECK_LT(tree_id, tree_ensemble_->trees_size());
+    return tree_ensemble_->tree_metadata(tree_id).num_layers_grown();
+  }
+
+  void SetNumLayersGrown(const int32 tree_id, int32 new_num_layers) const {
+    DCHECK_LT(tree_id, tree_ensemble_->trees_size());
+    tree_ensemble_->mutable_tree_metadata(tree_id)->set_num_layers_grown(
+        new_num_layers);
+  }
+
+  void UpdateGrowingMetadata() const;
+
+  int32 GetNumLayersAttempted() {
+    return tree_ensemble_->growing_metadata().num_layers_attempted();
+  }
+
+  bool is_leaf(const int32 tree_id, const int32 node_id) const {
+    DCHECK_LT(tree_id, tree_ensemble_->trees_size());
+    DCHECK_LT(node_id, tree_ensemble_->trees(tree_id).nodes_size());
+    const auto& node = tree_ensemble_->trees(tree_id).nodes(node_id);
+    return node.node_case() == boosted_trees::Node::kLeaf;
+  }
+
+  int32 feature_id(const int32 tree_id, const int32 node_id) const {
+    const auto node = tree_ensemble_->trees(tree_id).nodes(node_id);
+    DCHECK_EQ(node.node_case(), boosted_trees::Node::kBucketizedSplit);
+    return node.bucketized_split().feature_id();
+  }
+
+  int32 bucket_threshold(const int32 tree_id, const int32 node_id) const {
+    const auto node = tree_ensemble_->trees(tree_id).nodes(node_id);
+    DCHECK_EQ(node.node_case(), boosted_trees::Node::kBucketizedSplit);
+    return node.bucketized_split().threshold();
+  }
+
+  int32 left_id(const int32 tree_id, const int32 node_id) const {
+    const auto node = tree_ensemble_->trees(tree_id).nodes(node_id);
+    DCHECK_EQ(node.node_case(), boosted_trees::Node::kBucketizedSplit);
+    return node.bucketized_split().left_id();
+  }
+
+  int32 right_id(const int32 tree_id, const int32 node_id) const {
+    const auto node = tree_ensemble_->trees(tree_id).nodes(node_id);
+    DCHECK_EQ(node.node_case(), boosted_trees::Node::kBucketizedSplit);
+    return node.bucketized_split().right_id();
+  }
+
+  // Add a tree to the ensemble and returns a new tree_id.
+  int32 AddNewTree(const float weight);
+
+  // Grows the tree by adding a split and leaves.
+  void AddBucketizedSplitNode(const int32 tree_id, const int32 node_id,
+                              const int32 feature_id, const int32 threshold,
+                              const float gain, const float left_contrib,
+                              const float right_contrib, int32* left_node_id,
+                              int32* right_node_id);
+
+  // Retrieves tree weights and returns as a vector.
+  // It involves a copy, so should be called only sparingly (like once per
+  // iteration, not per example).
+  std::vector<float> GetTreeWeights() const {
+    return {tree_ensemble_->tree_weights().begin(),
+            tree_ensemble_->tree_weights().end()};
+  }
+
+  float GetTreeWeight(const int32 tree_id) const {
+    return tree_ensemble_->tree_weights(tree_id);
+  }
+
+  float IsTreeFinalized(const int32 tree_id) const {
+    DCHECK_LT(tree_id, tree_ensemble_->trees_size());
+    return tree_ensemble_->tree_metadata(tree_id).is_finalized();
+  }
+
+  float IsTreePostPruned(const int32 tree_id) const {
+    DCHECK_LT(tree_id, tree_ensemble_->trees_size());
+    return tree_ensemble_->tree_metadata(tree_id)
+               .post_pruned_nodes_meta_size() > 0;
+  }
+
+  void SetIsFinalized(const int32 tree_id, const bool is_finalized) {
+    DCHECK_LT(tree_id, tree_ensemble_->trees_size());
+    return tree_ensemble_->mutable_tree_metadata(tree_id)->set_is_finalized(
+        is_finalized);
+  }
+
+  // Sets the weight of i'th tree.
+  void SetTreeWeight(const int32 tree_id, const float weight) {
+    DCHECK_GE(tree_id, 0);
+    DCHECK_LT(tree_id, num_trees());
+    tree_ensemble_->set_tree_weights(tree_id, weight);
+  }
+
+  // Resets the resource and frees the protos in arena.
+  // Caller needs to hold the mutex lock while calling this.
+  virtual void Reset();
+
+  void PostPruneTree(const int32 current_tree);
+
+  // For a given node, returns the id in a pruned tree, as well as correction
+  // to the cached prediction that should be applied. If tree was not
+  // post-pruned, current_node_id will be equal to initial_node_id and logit
+  // update will be equal to zero.
+  void GetPostPruneCorrection(const int32 tree_id, const int32 initial_node_id,
+                              int32* current_node_id,
+                              float* logit_update) const;
+  mutex* get_mutex() { return &mu_; }
+
+ private:
+  // Helper method to check whether a node is a terminal node in that it
+  // only has leaf nodes as children.
+  bool IsTerminalSplitNode(const int32 tree_id, const int32 node_id) const;
+
+  // For each pruned node, finds the leaf where it finally ended up and
+  // calculates the total update from that pruned node prediction.
+  void CalculateParentAndLogitUpdate(
+      const int32 start_node_id,
+      const std::vector<std::pair<int32, float>>& nodes_change,
+      int32* parent_id, float* change) const;
+
+  // Helper method to collect the information to be used to prune some nodes in
+  // the tree.
+  void RecursivelyDoPostPrunePreparation(
+      const int32 tree_id, const int32 node_id,
+      std::vector<int32>* nodes_to_delete,
+      std::vector<std::pair<int32, float>>* nodes_meta);
+
+ protected:
+  protobuf::Arena arena_;
+  mutex mu_;
+  boosted_trees::TreeEnsemble* tree_ensemble_;
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_KERNELS_BOOSTED_TREES_RESOURCES_H_
diff --git a/tensorflow/core/kernels/boosted_trees/stats_ops.cc b/tensorflow/core/kernels/boosted_trees/stats_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..33fdab6a860358fab05abbb361bf004174e85658
--- /dev/null
+++ b/tensorflow/core/kernels/boosted_trees/stats_ops.cc
@@ -0,0 +1,296 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <vector>
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
+
+namespace tensorflow {
+
+namespace {
+const float kEps = 1e-15;
+}  // namespace
+
+class BoostedTreesCalculateBestGainsPerFeatureOp : public OpKernel {
+ public:
+  explicit BoostedTreesCalculateBestGainsPerFeatureOp(
+      OpKernelConstruction* const context)
+      : OpKernel(context) {
+    OP_REQUIRES_OK(context, context->GetAttr("l1", &l1_));
+    OP_REQUIRES_OK(context, context->GetAttr("l2", &l2_));
+    OP_REQUIRES_OK(context,
+                   context->GetAttr("tree_complexity", &tree_complexity_));
+    OP_REQUIRES_OK(context, context->GetAttr("max_splits", &max_splits_));
+    OP_REQUIRES_OK(context, context->GetAttr("num_features", &num_features_));
+  }
+
+  void Compute(OpKernelContext* const context) override {
+    // node_id_range
+    const Tensor* node_id_range_t;
+    OP_REQUIRES_OK(context, context->input("node_id_range", &node_id_range_t));
+    const auto node_id_range = node_id_range_t->vec<int32>();
+    int32 node_id_first = node_id_range(0);
+    int32 node_id_last = node_id_range(1);  // inclusive.
+    // stats_summary_list
+    OpInputList stats_summary_list;
+    OP_REQUIRES_OK(context, context->input_list("stats_summary_list",
+                                                &stats_summary_list));
+    const int64 num_buckets = stats_summary_list[0].dim_size(1);
+    std::vector<TTypes<float, 3>::ConstTensor> stats_summary;
+    stats_summary.reserve(stats_summary_list.size());
+    for (const auto& tensor : stats_summary_list) {
+      stats_summary.emplace_back(tensor.tensor<float, 3>());
+    }
+
+    // Allocate output lists of tensors:
+    OpOutputList output_node_ids_list;
+    OP_REQUIRES_OK(
+        context, context->output_list("node_ids_list", &output_node_ids_list));
+    OpOutputList output_gains_list;
+    OP_REQUIRES_OK(context,
+                   context->output_list("gains_list", &output_gains_list));
+    OpOutputList output_thresholds_list;
+    OP_REQUIRES_OK(context, context->output_list("thresholds_list",
+                                                 &output_thresholds_list));
+    OpOutputList output_left_node_contribs_list;
+    OP_REQUIRES_OK(context,
+                   context->output_list("left_node_contribs_list",
+                                        &output_left_node_contribs_list));
+    OpOutputList output_right_node_contribs_list;
+    OP_REQUIRES_OK(context,
+                   context->output_list("right_node_contribs_list",
+                                        &output_right_node_contribs_list));
+
+    // Get the best split info per node for each feature.
+    for (int feature_idx = 0; feature_idx < num_features_; ++feature_idx) {
+      std::vector<float> cum_grad;
+      std::vector<float> cum_hess;
+      cum_grad.reserve(num_buckets);
+      cum_hess.reserve(num_buckets);
+
+      std::vector<int32> output_node_ids;
+      std::vector<float> output_gains;
+      std::vector<int32> output_thresholds;
+      std::vector<float> output_left_node_contribs;
+      std::vector<float> output_right_node_contribs;
+      for (int node_id = node_id_first; node_id <= node_id_last; ++node_id) {
+        // Calculate gains.
+        cum_grad.clear();
+        cum_hess.clear();
+        float total_grad = 0.0;
+        float total_hess = 0.0;
+        for (int bucket = 0; bucket < num_buckets; ++bucket) {
+          // TODO(nponomareva): Consider multi-dimensional gradients/hessians.
+          total_grad += stats_summary[feature_idx](node_id, bucket, 0);
+          total_hess += stats_summary[feature_idx](node_id, bucket, 1);
+          cum_grad.push_back(total_grad);
+          cum_hess.push_back(total_hess);
+        }
+        float best_gain = std::numeric_limits<float>::lowest();
+        float best_bucket = 0;
+        float best_contrib_for_left = 0.0;
+        float best_contrib_for_right = 0.0;
+        // Parent gain.
+        float parent_gain;
+        float unused;
+        CalculateWeightsAndGains(total_grad, total_hess, &unused, &parent_gain);
+
+        for (int bucket = 0; bucket < num_buckets; ++bucket) {
+          const float cum_grad_bucket = cum_grad[bucket];
+          const float cum_hess_bucket = cum_hess[bucket];
+          // Left child.
+          float contrib_for_left;
+          float gain_for_left;
+          CalculateWeightsAndGains(cum_grad_bucket, cum_hess_bucket,
+                                   &contrib_for_left, &gain_for_left);
+          // Right child.
+          float contrib_for_right;
+          float gain_for_right;
+          CalculateWeightsAndGains(total_grad - cum_grad_bucket,
+                                   total_hess - cum_hess_bucket,
+                                   &contrib_for_right, &gain_for_right);
+
+          if (gain_for_left + gain_for_right > best_gain) {
+            best_gain = gain_for_left + gain_for_right;
+            best_bucket = bucket;
+            best_contrib_for_left = contrib_for_left;
+            best_contrib_for_right = contrib_for_right;
+          }
+        }  // for bucket
+        output_node_ids.push_back(node_id);
+        // Remove the parent gain for the parent node.
+        output_gains.push_back(best_gain - parent_gain);
+        output_thresholds.push_back(best_bucket);
+        output_left_node_contribs.push_back(best_contrib_for_left);
+        output_right_node_contribs.push_back(best_contrib_for_right);
+      }  // for node_id
+      const int num_nodes = output_node_ids.size();
+      // output_node_ids
+      Tensor* output_node_ids_t;
+      OP_REQUIRES_OK(context,
+                     output_node_ids_list.allocate(feature_idx, {num_nodes},
+                                                   &output_node_ids_t));
+      auto output_node_ids_vec = output_node_ids_t->vec<int32>();
+      // output_gains
+      Tensor* output_gains_t;
+      OP_REQUIRES_OK(context, output_gains_list.allocate(
+                                  feature_idx, {num_nodes}, &output_gains_t));
+      auto output_gains_vec = output_gains_t->vec<float>();
+      // output_thresholds
+      Tensor* output_thresholds_t;
+      OP_REQUIRES_OK(context,
+                     output_thresholds_list.allocate(feature_idx, {num_nodes},
+                                                     &output_thresholds_t));
+      auto output_thresholds_vec = output_thresholds_t->vec<int32>();
+      // output_left_node_contribs
+      Tensor* output_left_node_contribs_t;
+      OP_REQUIRES_OK(context, output_left_node_contribs_list.allocate(
+                                  feature_idx, {num_nodes, 1},
+                                  &output_left_node_contribs_t));
+      auto output_left_node_contribs_matrix =
+          output_left_node_contribs_t->matrix<float>();
+      // output_right_node_contribs
+      Tensor* output_right_node_contribs_t;
+      OP_REQUIRES_OK(context, output_right_node_contribs_list.allocate(
+                                  feature_idx, {num_nodes, 1},
+                                  &output_right_node_contribs_t));
+      auto output_right_node_contribs_matrix =
+          output_right_node_contribs_t->matrix<float>();
+      // Sets output tensors from vectors.
+      for (int i = 0; i < num_nodes; ++i) {
+        output_node_ids_vec(i) = output_node_ids[i];
+        // Adjust the gains to penalize by tree complexity.
+        output_gains_vec(i) = output_gains[i] - tree_complexity_;
+        output_thresholds_vec(i) = output_thresholds[i];
+        // Logits are 1-dimensional for now.
+        // TODO(nponomareva): Consider multi-dimensional logits.
+        output_left_node_contribs_matrix(i, 0) = output_left_node_contribs[i];
+        output_right_node_contribs_matrix(i, 0) = output_right_node_contribs[i];
+      }
+    }  // for f
+  }
+
+ private:
+  void CalculateWeightsAndGains(const float g, const float h, float* weight,
+                                float* gain) {
+    //
+    // The formula for weight is -(g+l1*sgn(w))/(H+l2), for gain it is
+    // (g+l1*sgn(w))^2/(h+l2).
+    // This is because for each leaf we optimize
+    // 1/2(h+l2)*w^2+g*w+l1*abs(w)
+    float g_with_l1 = g;
+    // Apply L1 regularization.
+    // 1) Assume w>0 => w=-(g+l1)/(h+l2)=> g+l1 < 0 => g < -l1
+    // 2) Assume w<0 => w=-(g-l1)/(h+l2)=> g-l1 > 0 => g > l1
+    // For g from (-l1, l1), thus there is no solution => set to 0.
+    if (l1_ > 0) {
+      if (g > l1_) {
+        g_with_l1 -= l1_;
+      } else if (g < -l1_) {
+        g_with_l1 += l1_;
+      } else {
+        *weight = 0.0;
+        *gain = 0.0;
+        return;
+      }
+    }
+    // Apply L2 regularization.
+    if (h + l2_ <= kEps) {
+      // Avoid division by 0 or infinitesimal.
+      *weight = 0;
+      *gain = 0;
+    } else {
+      *weight = -g_with_l1 / (h + l2_);
+      *gain = -g_with_l1 * (*weight);
+    }
+  }
+
+  float l1_;
+  float l2_;
+  float tree_complexity_;
+  int max_splits_;
+  int num_features_;
+};
+
+REGISTER_KERNEL_BUILDER(
+    Name("BoostedTreesCalculateBestGainsPerFeature").Device(DEVICE_CPU),
+    BoostedTreesCalculateBestGainsPerFeatureOp);
+
+class BoostedTreesMakeStatsSummaryOp : public OpKernel {
+ public:
+  explicit BoostedTreesMakeStatsSummaryOp(OpKernelConstruction* const context)
+      : OpKernel(context) {
+    OP_REQUIRES_OK(context, context->GetAttr("max_splits", &max_splits_));
+    OP_REQUIRES_OK(context, context->GetAttr("num_buckets", &num_buckets_));
+    OP_REQUIRES_OK(context, context->GetAttr("num_features", &num_features_));
+  }
+
+  void Compute(OpKernelContext* const context) override {
+    // node_ids
+    const Tensor* node_ids_t;
+    OP_REQUIRES_OK(context, context->input("node_ids", &node_ids_t));
+    const auto node_ids = node_ids_t->vec<int32>();
+    // gradients
+    const Tensor* gradients_t;
+    OP_REQUIRES_OK(context, context->input("gradients", &gradients_t));
+    const auto gradients = gradients_t->matrix<float>();
+    // hessians
+    const Tensor* hessians_t;
+    OP_REQUIRES_OK(context, context->input("hessians", &hessians_t));
+    const auto hessians = hessians_t->matrix<float>();
+    // bucketized_features
+    OpInputList bucketized_features_list;
+    OP_REQUIRES_OK(context, context->input_list("bucketized_features_list",
+                                                &bucketized_features_list));
+    std::vector<tensorflow::TTypes<int32>::ConstVec> bucketized_features;
+    bucketized_features.reserve(num_features_);
+    for (const Tensor& tensor : bucketized_features_list) {
+      bucketized_features.emplace_back(tensor.vec<int32>());
+    }
+
+    // Infer batch size.
+    const int64 batch_size = node_ids_t->dim_size(0);
+    // Allocate output stats tensor (Rank 4).
+    Tensor* output_stats_summary_t = nullptr;
+    OP_REQUIRES_OK(context, context->allocate_output(
+                                "stats_summary",
+                                {num_features_, max_splits_, num_buckets_, 2},
+                                &output_stats_summary_t));
+    auto output_stats_summary = output_stats_summary_t->tensor<float, 4>();
+    output_stats_summary.setZero();
+
+    // Partition by node, and then bucketize.
+    for (int feature_idx = 0; feature_idx < num_features_; ++feature_idx) {
+      const auto& features = bucketized_features[feature_idx];
+      for (int i = 0; i < batch_size; ++i) {
+        const int32 node = node_ids(i);
+        const int32 bucket = features(i);
+        output_stats_summary(feature_idx, node, bucket, 0) += gradients(i, 0);
+        output_stats_summary(feature_idx, node, bucket, 1) += hessians(i, 0);
+      }
+    }
+  }
+
+ private:
+  int max_splits_;
+  int num_buckets_;
+  int num_features_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("BoostedTreesMakeStatsSummary").Device(DEVICE_CPU),
+                        BoostedTreesMakeStatsSummaryOp);
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/boosted_trees/training_ops.cc b/tensorflow/core/kernels/boosted_trees/training_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b9ded4054aced4bcb27b0590a44e1f86f6b0a1c2
--- /dev/null
+++ b/tensorflow/core/kernels/boosted_trees/training_ops.cc
@@ -0,0 +1,219 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/kernels/boosted_trees/resources.h"
+
+namespace tensorflow {
+
+namespace {
+constexpr float kLayerByLayerTreeWeight = 1.0;
+
+// TODO(nponomareva, youngheek): consider using vector.
+struct SplitCandidate {
+  SplitCandidate() {}
+
+  // Index in the list of the feature ids.
+  int64 feature_idx;
+
+  // Index in the tensor of node_ids for the feature with idx feature_idx.
+  int64 candidate_idx;
+
+  float gain;
+};
+
+enum PruningMode { kNoPruning = 0, kPrePruning = 1, kPostPruning = 2 };
+
+}  // namespace
+
+class BoostedTreesUpdateEnsembleOp : public OpKernel {
+ public:
+  explicit BoostedTreesUpdateEnsembleOp(OpKernelConstruction* const context)
+      : OpKernel(context) {
+    OP_REQUIRES_OK(context, context->GetAttr("max_depth", &max_depth_));
+    OP_REQUIRES_OK(context, context->GetAttr("learning_rate", &learning_rate_));
+    OP_REQUIRES_OK(context, context->GetAttr("num_features", &num_features_));
+
+    int32 pruning_index;
+    OP_REQUIRES_OK(context, context->GetAttr("pruning_mode", &pruning_index));
+    pruning_mode_ = static_cast<PruningMode>(pruning_index);
+  }
+
+  void Compute(OpKernelContext* const context) override {
+    // Get decision tree ensemble.
+    BoostedTreesEnsembleResource* ensemble_resource;
+    OP_REQUIRES_OK(context, LookupResource(context, HandleFromInput(context, 0),
+                                           &ensemble_resource));
+    core::ScopedUnref unref_me(ensemble_resource);
+    mutex_lock l(*ensemble_resource->get_mutex());
+    // Increase the ensemble stamp.
+    ensemble_resource->set_stamp(ensemble_resource->stamp() + 1);
+
+    // Read node ids, gains, thresholds and node contribs.
+    OpInputList node_ids_list;
+    OpInputList gains_list;
+    OpInputList thresholds_list;
+    OpInputList left_node_contribs;
+    OpInputList right_node_contribs;
+    OP_REQUIRES_OK(context, context->input_list("node_ids", &node_ids_list));
+    OP_REQUIRES_OK(context, context->input_list("gains", &gains_list));
+    OP_REQUIRES_OK(context,
+                   context->input_list("thresholds", &thresholds_list));
+    OP_REQUIRES_OK(context, context->input_list("left_node_contribs",
+                                                &left_node_contribs));
+    OP_REQUIRES_OK(context, context->input_list("right_node_contribs",
+                                                &right_node_contribs));
+
+    const Tensor* feature_ids_t;
+    OP_REQUIRES_OK(context, context->input("feature_ids", &feature_ids_t));
+
+    auto feature_ids = feature_ids_t->vec<int32>();
+
+    // Find best splits for each active node.
+    std::map<int32, SplitCandidate> best_splits;
+    FindBestSplitsPerNode(context, node_ids_list, gains_list, &best_splits);
+
+    int32 current_tree =
+        UpdateGlobalAttemptsAndRetrieveGrowableTree(ensemble_resource);
+
+    // No-op if no new splits can be considered.
+    if (best_splits.empty()) {
+      LOG(WARNING) << "Not growing tree ensemble as no good splits were found.";
+      return;
+    }
+
+    const int32 new_num_layers =
+        ensemble_resource->GetNumLayersGrown(current_tree) + 1;
+    VLOG(1) << "Adding layer #" << new_num_layers - 1 << " to tree #"
+            << current_tree << " of ensemble of " << current_tree + 1
+            << " trees.";
+    bool split_happened = false;
+    // Add the splits to the tree.
+    for (auto& split_entry : best_splits) {
+      const int32 node_id = split_entry.first;
+      const SplitCandidate& candidate = split_entry.second;
+
+      const int64 feature_idx = candidate.feature_idx;
+      const int64 candidate_idx = candidate.candidate_idx;
+
+      const int32 feature_id = feature_ids(feature_idx);
+      const int32 threshold =
+          thresholds_list[feature_idx].vec<int32>()(candidate_idx);
+      const float gain = gains_list[feature_idx].vec<float>()(candidate_idx);
+
+      if (pruning_mode_ == kPrePruning) {
+        // Don't consider negative splits if we're pre-pruning the tree.
+        // Note that zero-gain splits are acceptable.
+        if (gain < 0) {
+          continue;
+        }
+      }
+      // For now assume that the weights vectors are one dimensional.
+      // TODO(nponomareva): change here for multiclass.
+      const float left_contrib =
+          learning_rate_ *
+          left_node_contribs[feature_idx].matrix<float>()(candidate_idx, 0);
+      const float right_contrib =
+          learning_rate_ *
+          right_node_contribs[feature_idx].matrix<float>()(candidate_idx, 0);
+
+      // unused.
+      int32 left_node_id;
+      int32 right_node_id;
+
+      ensemble_resource->AddBucketizedSplitNode(
+          current_tree, node_id, feature_id, threshold, gain, left_contrib,
+          right_contrib, &left_node_id, &right_node_id);
+      split_happened = true;
+    }
+    if (split_happened) {
+      // Update growable tree metadata.
+      ensemble_resource->SetNumLayersGrown(current_tree, new_num_layers);
+      // Finalize the tree if needed.
+      if (ensemble_resource->GetNumLayersGrown(current_tree) >= max_depth_) {
+        ensemble_resource->SetIsFinalized(current_tree, true);
+        if (pruning_mode_ == kPostPruning) {
+          ensemble_resource->PostPruneTree(current_tree);
+        }
+        if (ensemble_resource->num_trees() > 0) {
+          // Create a dummy new tree with an empty node.
+          ensemble_resource->AddNewTree(kLayerByLayerTreeWeight);
+        }
+      }
+    }
+  }
+
+ private:
+  int32 UpdateGlobalAttemptsAndRetrieveGrowableTree(
+      BoostedTreesEnsembleResource* const ensemble_resource) {
+    int32 num_trees = ensemble_resource->num_trees();
+    int32 current_tree = num_trees - 1;
+
+    // Increment global attempt stats.
+    ensemble_resource->UpdateGrowingMetadata();
+
+    // Note we don't set tree weight to be equal to learning rate, since we
+    // apply learning rate to leaf weights instead, when doing layer-by-layer
+    // boosting.
+    if (num_trees <= 0) {
+      // Create a new tree with a no-op leaf.
+      current_tree = ensemble_resource->AddNewTree(kLayerByLayerTreeWeight);
+    }
+    return current_tree;
+  }
+
+  // Helper method which effectively does a reduce over all split candidates
+  // and finds the best split for each node.
+  void FindBestSplitsPerNode(
+      OpKernelContext* const context, const OpInputList& node_ids_list,
+      const OpInputList& gains_list,
+      std::map<int32, SplitCandidate>* best_split_per_node) {
+    // Find best split per node going through every feature candidate.
+    for (int64 feature_idx = 0; feature_idx < num_features_; ++feature_idx) {
+      const auto& node_ids = node_ids_list[feature_idx].vec<int32>();
+      const auto& gains = gains_list[feature_idx].vec<float>();
+
+      for (size_t candidate_idx = 0; candidate_idx < node_ids.size();
+           ++candidate_idx) {
+        // Get current split candidate.
+        const auto& node_id = node_ids(candidate_idx);
+        const auto& gain = gains(candidate_idx);
+
+        auto best_split_it = best_split_per_node->find(node_id);
+        SplitCandidate candidate;
+        candidate.feature_idx = feature_idx;
+        candidate.candidate_idx = candidate_idx;
+        candidate.gain = gain;
+
+        if (best_split_it == best_split_per_node->end() ||
+            gain > best_split_it->second.gain) {
+          (*best_split_per_node)[node_id] = candidate;
+        }
+      }
+    }
+  }
+
+ private:
+  int32 num_features_;
+  float learning_rate_;
+  int32 max_depth_;
+  PruningMode pruning_mode_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("BoostedTreesUpdateEnsemble").Device(DEVICE_CPU),
+                        BoostedTreesUpdateEnsembleOp);
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/collective_ops.cc b/tensorflow/core/kernels/collective_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5de41bac723ce2e62258c521a34d4775426643bd
--- /dev/null
+++ b/tensorflow/core/kernels/collective_ops.cc
@@ -0,0 +1,266 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/framework/attr_value.pb.h"
+#include "tensorflow/core/framework/collective.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/lib/core/errors.h"
+
+namespace tensorflow {
+
+namespace {
+class CollectiveOpKernel : public AsyncOpKernel {
+ public:
+  explicit CollectiveOpKernel(OpKernelConstruction* c) : AsyncOpKernel(c) {}
+
+  // A string encoding instance, frame and iter to be handed off to
+  // the implementation for use in generating RecvBuf keys.
+  string GetCollectiveKey(OpKernelContext* c) {
+    return strings::StrCat(col_params_.instance.instance_key, ":",
+                           c->frame_iter().frame_id, ":",
+                           c->frame_iter().iter_id);
+  }
+
+  // Returns false if calling invocation of ComputeAsync should return
+  // immediately.
+  bool CanProceedWithCompute(OpKernelContext* c, CollectiveExecutor* col_exec,
+                             const DoneCallback& done) {
+    if (col_params_.group.group_size >
+        col_params_.instance.device_names.size()) {
+      // This is the first invocation: Finish initializing col_params_.
+      // Call in a blockable thread because it's not guaranteed that
+      // this call cannot block.
+      c->env()->SchedClosure([this, c, done, col_exec]() {
+        col_exec->CompleteParamsAsync(c->device()->name(), &col_params_,
+                                      c->cancellation_manager(),
+                                      [this, c, done](const Status& s) {
+                                        if (s.ok()) {
+                                          ComputeAsync(c, done);
+                                        } else {
+                                          c->SetStatus(s);
+                                          done();
+                                        }
+                                      });
+      });
+      return false;
+    }
+    return true;
+  }
+
+  CollectiveParams col_params_;
+};
+
+class CollectiveReduceOpKernel : public CollectiveOpKernel {
+ public:
+  explicit CollectiveReduceOpKernel(OpKernelConstruction* c)
+      : CollectiveOpKernel(c) {
+    col_params_.instance.type = REDUCTION_COLLECTIVE;
+    OP_REQUIRES_OK(c, c->GetAttr("group_size", &col_params_.group.group_size));
+    OP_REQUIRES_OK(c, c->GetAttr("group_key", &col_params_.group.group_key));
+    OP_REQUIRES_OK(
+        c, c->GetAttr("instance_key", &col_params_.instance.instance_key));
+    OP_REQUIRES_OK(
+        c, c->GetAttr("subdiv_offsets",
+                      &col_params_.instance.impl_details.subdiv_offsets));
+    string merge_op_name;
+    OP_REQUIRES_OK(c, c->GetAttr("merge_op", &merge_op_name));
+    OP_REQUIRES(c, merge_op_name == "Add" || merge_op_name == "Mul",
+                errors::InvalidArgument(
+                    "merge_op must be one of {\"Add\", \"Mul\"} but got ",
+                    merge_op_name));
+    string final_op_name;
+    OP_REQUIRES_OK(c, c->GetAttr("final_op", &final_op_name));
+    OP_REQUIRES(c, final_op_name == "Id" || final_op_name == "Div",
+                errors::InvalidArgument(
+                    "final_op must be one of {\"Id\", \"Div\"} but got ",
+                    final_op_name));
+    OP_REQUIRES_OK(c, c->GetAttr("T", &col_params_.instance.data_type));
+
+    const NodeDef& real_node = c->def();
+    col_params_.name = strings::StrCat(real_node.name(), ": Reduce(",
+                                       merge_op_name, ",", final_op_name, ")");
+    col_params_.group.device_type = c->device_type();
+
+    // Find the OpKernels by name, type and device type.
+    NodeDef sub_node;
+    // The merge_op takes two inputs
+    sub_node.add_input(real_node.input(0));
+    sub_node.add_input(real_node.input(0));
+    sub_node.set_device(real_node.device());
+    SetAttrValue(col_params_.instance.data_type,
+                 &(*sub_node.mutable_attr())["T"]);
+    col_params_.merge_op = BuildOpKernel(c, merge_op_name, &sub_node);
+    col_params_.final_op = BuildOpKernel(c, final_op_name, &sub_node);
+  }
+
+  std::unique_ptr<OpKernel> BuildOpKernel(OpKernelConstruction* c,
+                                          const string& name,
+                                          NodeDef* sub_node) {
+    std::unique_ptr<OpKernel> k;
+    if (name.empty() || name == "Id") return k;
+    sub_node->set_name(name);
+    sub_node->set_op(name);
+    Status status;
+    k = CreateOpKernel(c->device_type(), c->device(),
+                       c->device()->GetAllocator(AllocatorAttributes()),
+                       *sub_node, c->graph_def_version(), &status);
+    if (!status.ok()) {
+      c->CtxFailureWithWarning(errors::Internal("Failed to build OpKernel for ",
+                                                name, " : ",
+                                                status.error_message()));
+    }
+    return k;
+  }
+
+  void ComputeAsync(OpKernelContext* c, DoneCallback done) override {
+    CollectiveExecutor* col_exec = c->collective_executor();
+    OP_REQUIRES_ASYNC(
+        c, col_exec,
+        errors::Internal(
+            "Failed to get CollectiveExecutor from OpKernelContext for Op ",
+            col_params_.name),
+        done);
+    if (!CanProceedWithCompute(c, col_exec, done)) return;
+    // Allocate the output tensor, trying to reuse the input.
+    Tensor* output = nullptr;
+    OP_REQUIRES_OK_ASYNC(c,
+                         c->forward_input_or_allocate_output(
+                             {0}, 0, c->input(0).shape(), &output),
+                         done);
+
+    auto actual_done = [c, col_exec, done](const Status& s) {
+      OP_REQUIRES_OK_ASYNC(c, s, done);
+      done();
+    };
+    col_exec->ExecuteAsync(c, col_params_, GetCollectiveKey(c), actual_done);
+  }
+
+ private:
+  TF_DISALLOW_COPY_AND_ASSIGN(CollectiveReduceOpKernel);
+};
+
+REGISTER_KERNEL_BUILDER(Name("CollectiveReduce").Device(DEVICE_CPU),
+                        CollectiveReduceOpKernel);
+REGISTER_KERNEL_BUILDER(Name("CollectiveReduce").Device(DEVICE_GPU),
+                        CollectiveReduceOpKernel);
+
+class CollectiveBcastSendOpKernel : public CollectiveOpKernel {
+ public:
+  explicit CollectiveBcastSendOpKernel(OpKernelConstruction* c)
+      : CollectiveOpKernel(c) {
+    col_params_.instance.type = BROADCAST_COLLECTIVE;
+    OP_REQUIRES_OK(c, c->GetAttr("group_size", &col_params_.group.group_size));
+    OP_REQUIRES_OK(c, c->GetAttr("group_key", &col_params_.group.group_key));
+    OP_REQUIRES_OK(
+        c, c->GetAttr("instance_key", &col_params_.instance.instance_key));
+    OP_REQUIRES_OK(c, c->GetAttr("T", &col_params_.instance.data_type));
+    OP_REQUIRES_OK(c, c->GetAttr("shape", &shape_));
+    col_params_.is_source = true;
+    col_params_.instance.impl_details.subdiv_offsets = {0};
+
+    col_params_.name =
+        strings::StrCat(name(), ": Broadcast(", col_params_.is_source, ")");
+    col_params_.group.device_type = c->device_type();
+  }
+
+  void ComputeAsync(OpKernelContext* c, DoneCallback done) override {
+    CollectiveExecutor* col_exec = c->collective_executor();
+    OP_REQUIRES_ASYNC(
+        c, col_exec,
+        errors::Internal(
+            "Failed to get CollectiveExecutor from OpKernelContext for Op ",
+            col_params_.name),
+        done);
+    if (!CanProceedWithCompute(c, col_exec, done)) return;
+    OP_REQUIRES_ASYNC(
+        c, shape_.IsSameSize(c->input(0).shape()),
+        errors::Internal("Declared shape of op ", col_params_.name,
+                         " does not match shape of input"),
+        done);
+    // Allocate the output Tensor, trying to reuse the input.
+    Tensor* output = nullptr;
+    OP_REQUIRES_OK_ASYNC(
+        c, c->forward_input_or_allocate_output({0}, 0, shape_, &output), done);
+
+    auto actual_done = [c, col_exec, done](const Status& s) {
+      OP_REQUIRES_OK_ASYNC(c, s, done);
+      done();
+    };
+    col_exec->ExecuteAsync(c, col_params_, GetCollectiveKey(c), actual_done);
+  }
+
+ private:
+  TensorShape shape_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(CollectiveBcastSendOpKernel);
+};
+
+REGISTER_KERNEL_BUILDER(Name("CollectiveBcastSend").Device(DEVICE_CPU),
+                        CollectiveBcastSendOpKernel);
+REGISTER_KERNEL_BUILDER(Name("CollectiveBcastSend").Device(DEVICE_GPU),
+                        CollectiveBcastSendOpKernel);
+
+class CollectiveBcastRecvOpKernel : public CollectiveOpKernel {
+ public:
+  explicit CollectiveBcastRecvOpKernel(OpKernelConstruction* c)
+      : CollectiveOpKernel(c) {
+    col_params_.instance.type = BROADCAST_COLLECTIVE;
+    OP_REQUIRES_OK(c, c->GetAttr("group_size", &col_params_.group.group_size));
+    OP_REQUIRES_OK(c, c->GetAttr("group_key", &col_params_.group.group_key));
+    OP_REQUIRES_OK(
+        c, c->GetAttr("instance_key", &col_params_.instance.instance_key));
+    OP_REQUIRES_OK(c, c->GetAttr("T", &col_params_.instance.data_type));
+    OP_REQUIRES_OK(c, c->GetAttr("shape", &shape_));
+    col_params_.is_source = false;
+    col_params_.instance.impl_details.subdiv_offsets = {0};
+
+    col_params_.name =
+        strings::StrCat(name(), ": Broadcast(", col_params_.is_source, ")");
+    col_params_.group.device_type = c->device_type();
+  }
+
+  void ComputeAsync(OpKernelContext* c, DoneCallback done) override {
+    CollectiveExecutor* col_exec = c->collective_executor();
+    OP_REQUIRES_ASYNC(
+        c, col_exec,
+        errors::Internal(
+            "Failed to get CollectiveExecutor from OpKernelContext for Op ",
+            col_params_.name),
+        done);
+    if (!CanProceedWithCompute(c, col_exec, done)) return;
+    // No input, so must allocate output.
+    Tensor* output = nullptr;
+    OP_REQUIRES_OK_ASYNC(c, c->allocate_output(0, shape_, &output), done);
+
+    auto actual_done = [c, col_exec, done](const Status& s) {
+      OP_REQUIRES_OK_ASYNC(c, s, done);
+      done();
+    };
+    col_exec->ExecuteAsync(c, col_params_, GetCollectiveKey(c), actual_done);
+  }
+
+ private:
+  TensorShape shape_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(CollectiveBcastRecvOpKernel);
+};
+
+REGISTER_KERNEL_BUILDER(Name("CollectiveBcastRecv").Device(DEVICE_CPU),
+                        CollectiveBcastRecvOpKernel);
+REGISTER_KERNEL_BUILDER(Name("CollectiveBcastRecv").Device(DEVICE_GPU),
+                        CollectiveBcastRecvOpKernel);
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/concat_lib_gpu.cc b/tensorflow/core/kernels/concat_lib_gpu.cc
index d8643c0b2fb2633f6b640b4f54dc2f8c92da654d..93e392d3032405ea848bd2f147653c9a5c7a1818 100644
--- a/tensorflow/core/kernels/concat_lib_gpu.cc
+++ b/tensorflow/core/kernels/concat_lib_gpu.cc
@@ -118,6 +118,7 @@ TF_CALL_complex128(REGISTER);
 TF_CALL_int64(REGISTER);
 TF_CALL_bfloat16(REGISTER);
 TF_CALL_bool(REGISTER);
+TF_CALL_uint8(REGISTER);
 
 #undef REGISTER
 
diff --git a/tensorflow/core/kernels/concat_lib_gpu_impl.cu.cc b/tensorflow/core/kernels/concat_lib_gpu_impl.cu.cc
index 0f7adaf24a8eff76c27109eb91389dffdca31380..a561d918bd36f711d1b813dfb533ec6d690af8ee 100644
--- a/tensorflow/core/kernels/concat_lib_gpu_impl.cu.cc
+++ b/tensorflow/core/kernels/concat_lib_gpu_impl.cu.cc
@@ -202,6 +202,7 @@ TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPUCONCAT32);
 TF_CALL_complex64(REGISTER_GPUCONCAT32);
 TF_CALL_complex128(REGISTER_GPUCONCAT32);
 TF_CALL_int64(REGISTER_GPUCONCAT32);
+TF_CALL_uint8(REGISTER_GPUCONCAT32);
 REGISTER_GPUCONCAT32(bfloat16);
 REGISTER_GPUCONCAT32(bool);
 
@@ -209,6 +210,7 @@ TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPUCONCAT64);
 TF_CALL_complex64(REGISTER_GPUCONCAT64);
 TF_CALL_complex128(REGISTER_GPUCONCAT64);
 TF_CALL_int64(REGISTER_GPUCONCAT64);
+TF_CALL_uint8(REGISTER_GPUCONCAT64);
 REGISTER_GPUCONCAT64(bfloat16);
 REGISTER_GPUCONCAT64(bool);
 
@@ -216,6 +218,7 @@ TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU32);
 TF_CALL_complex64(REGISTER_GPU32);
 TF_CALL_complex128(REGISTER_GPU32);
 TF_CALL_int64(REGISTER_GPU32);
+TF_CALL_uint8(REGISTER_GPU32);
 REGISTER_GPU32(bfloat16);
 REGISTER_GPU32(bool);
 
@@ -223,6 +226,7 @@ TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU64);
 TF_CALL_complex64(REGISTER_GPU64);
 TF_CALL_complex128(REGISTER_GPU64);
 TF_CALL_int64(REGISTER_GPU64);
+TF_CALL_uint8(REGISTER_GPU64);
 REGISTER_GPU64(bfloat16);
 REGISTER_GPU64(bool);
 
diff --git a/tensorflow/core/kernels/concat_op.cc b/tensorflow/core/kernels/concat_op.cc
index f16766315f2640ab7c42c077fc5156a3a825fbf9..a87b63f913c279d35f625b096bb7ac947cb9230b 100644
--- a/tensorflow/core/kernels/concat_op.cc
+++ b/tensorflow/core/kernels/concat_op.cc
@@ -212,6 +212,7 @@ REGISTER_CONCAT(qint32);
 
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU);
 REGISTER_GPU(bfloat16);
+TF_CALL_uint8(REGISTER_GPU);
 TF_CALL_complex64(REGISTER_GPU);
 TF_CALL_complex128(REGISTER_GPU);
 TF_CALL_int64(REGISTER_GPU);
diff --git a/tensorflow/core/kernels/concat_op_test.cc b/tensorflow/core/kernels/concat_op_test.cc
index e3ba8ae9f691c8ec9be79952d7f97801552b2a56..39b44b2fcc8eb8336bfcf721919201536ed56133 100644
--- a/tensorflow/core/kernels/concat_op_test.cc
+++ b/tensorflow/core/kernels/concat_op_test.cc
@@ -78,6 +78,9 @@ static void BM_ConcatDim1Float(int iters, int dim2) {
 BENCHMARK(BM_ConcatDim0Float)->Arg(1000)->Arg(100000)->Arg(1000000);
 BENCHMARK(BM_ConcatDim1Float)->Arg(1000)->Arg(100000)->Arg(1000000);
 
+static void BM_ConcatDim1uint8(int iters, int dim2) {
+  ConcatHelper<uint8>(iters, 1, dim2);
+}
 static void BM_ConcatDim1int16(int iters, int dim2) {
   ConcatHelper<int16>(iters, 1, dim2);
 }
@@ -85,6 +88,7 @@ static void BM_ConcatDim1bfloat16(int iters, int dim2) {
   ConcatHelper<bfloat16>(iters, 1, dim2);
 }
 
+BENCHMARK(BM_ConcatDim1uint8)->Arg(1000)->Arg(100000)->Arg(1000000);
 BENCHMARK(BM_ConcatDim1int16)->Arg(1000)->Arg(100000)->Arg(1000000);
 BENCHMARK(BM_ConcatDim1bfloat16)->Arg(1000)->Arg(100000)->Arg(1000000);
 
diff --git a/tensorflow/core/kernels/constant_op.cc b/tensorflow/core/kernels/constant_op.cc
index 312c1a41d36245ae3ca5a09d2e76a430bc464953..fe1a1ba5a306422d410a7b4646078b7b5e4c31eb 100644
--- a/tensorflow/core/kernels/constant_op.cc
+++ b/tensorflow/core/kernels/constant_op.cc
@@ -258,13 +258,15 @@ REGISTER_KERNEL(GPU, Eigen::half);
 REGISTER_KERNEL(GPU, bfloat16);
 REGISTER_KERNEL(GPU, float);
 REGISTER_KERNEL(GPU, double);
+REGISTER_KERNEL(GPU, complex64);
+REGISTER_KERNEL(GPU, complex128);
 REGISTER_KERNEL(GPU, uint8);
 REGISTER_KERNEL(GPU, int8);
 REGISTER_KERNEL(GPU, uint16);
 REGISTER_KERNEL(GPU, int16);
 REGISTER_KERNEL(GPU, int64);
 REGISTER_KERNEL(GPU, bool);
-// Currently we do not support filling strings and complex64 on GPU
+// Currently we do not support filling strings on GPU
 
 // A special GPU kernel for int32.
 // TODO(b/25387198): Also enable int32 in device memory. This kernel
diff --git a/tensorflow/core/kernels/crop_and_resize_op_test.cc b/tensorflow/core/kernels/crop_and_resize_op_test.cc
index a35e1b0788dbc60d6609faf1dfb97d5e7e4f515b..709082e79903d041771cc19235d4dee76fce66b3 100644
--- a/tensorflow/core/kernels/crop_and_resize_op_test.cc
+++ b/tensorflow/core/kernels/crop_and_resize_op_test.cc
@@ -26,6 +26,7 @@ limitations under the License.
 #include "tensorflow/core/kernels/ops_testutil.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
@@ -242,7 +243,7 @@ TEST_F(CropAndResizeOpTest, TestInvalidInputShape) {
   AddInputFromArray<int32>(TensorShape({2}), {4, 4});
   Status s = RunOpKernel();
   ASSERT_FALSE(s.ok());
-  EXPECT_TRUE(StringPiece(s.ToString()).contains("input image must be 4-D"))
+  EXPECT_TRUE(str_util::StrContains(s.ToString(), "input image must be 4-D"))
       << s;
 }
 
@@ -255,7 +256,7 @@ TEST_F(CropAndResizeOpTest, TestInvalidBoxIndexShape) {
   Status s = RunOpKernel();
   ASSERT_FALSE(s.ok());
   EXPECT_TRUE(
-      StringPiece(s.ToString()).contains("box_index has incompatible shape"))
+      str_util::StrContains(s.ToString(), "box_index has incompatible shape"))
       << s;
 }
 
@@ -267,8 +268,8 @@ TEST_F(CropAndResizeOpTest, TestInvalidBoxIndex) {
   AddInputFromArray<int32>(TensorShape({2}), {3, 3});
   Status s = RunOpKernel();
   ASSERT_FALSE(s.ok());
-  EXPECT_TRUE(StringPiece(s.ToString())
-                  .contains("box_index has values outside [0, batch_size)"))
+  EXPECT_TRUE(str_util::StrContains(
+      s.ToString(), "box_index has values outside [0, batch_size)"))
       << s;
 }
 
diff --git a/tensorflow/core/kernels/ctc_decoder_ops.cc b/tensorflow/core/kernels/ctc_decoder_ops.cc
index 96bdb6a241b1d88c7b14f22fc618ea9c95fb7642..8cadeac68d7907443d860e67b26bdedaf3634e5e 100644
--- a/tensorflow/core/kernels/ctc_decoder_ops.cc
+++ b/tensorflow/core/kernels/ctc_decoder_ops.cc
@@ -27,6 +27,7 @@ limitations under the License.
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/util/ctc/ctc_beam_search.h"
 #include "tensorflow/core/util/sparse/sparse_tensor.h"
+#include "tensorflow/core/util/work_sharder.h"
 
 namespace tensorflow {
 
@@ -213,20 +214,29 @@ class CTCGreedyDecoderOp : public OpKernel {
 
     // Perform best path decoding
     std::vector<std::vector<std::vector<int> > > sequences(batch_size);
-    for (int b = 0; b < batch_size; ++b) {
-      sequences[b].resize(1);
-      auto& sequence = sequences[b][0];
-      int prev_indices = -1;
-      for (int t = 0; t < seq_len_t(b); ++t) {
-        int max_class_indices;
-        log_prob_t(b, 0) += -RowMax(input_list_t[t], b, &max_class_indices);
-        if (max_class_indices != blank_index &&
-            !(merge_repeated_ && max_class_indices == prev_indices)) {
-          sequence.push_back(max_class_indices);
+    auto decode = [&](const int64 begin, const int64 end) {
+      for (int b = begin; b < end; ++b) {
+        sequences[b].resize(1);
+        auto &sequence = sequences[b][0];
+        int prev_indices = -1;
+        for (int t = 0; t < seq_len_t(b); ++t) {
+          int max_class_indices;
+          log_prob_t(b, 0) += -RowMax(input_list_t[t], b, &max_class_indices);
+          if (max_class_indices != blank_index &&
+              !(merge_repeated_ && max_class_indices == prev_indices)) {
+            sequence.push_back(max_class_indices);
+          }
+          prev_indices = max_class_indices;
         }
-        prev_indices = max_class_indices;
       }
-    }
+    };
+
+    const int64 kCostPerUnit = 50 * max_time * num_classes;
+    const int64 total = batch_size;
+    const DeviceBase::CpuWorkerThreads& worker_threads =
+        *ctx->device()->tensorflow_cpu_worker_threads();
+    Shard(worker_threads.num_threads, worker_threads.workers, total,
+          kCostPerUnit, decode);
 
     OP_REQUIRES_OK(
         ctx, decode_helper_.StoreAllDecodedSequences(
diff --git a/tensorflow/core/kernels/cudnn_rnn_ops.cc b/tensorflow/core/kernels/cudnn_rnn_ops.cc
index ba9686e94ee7072cc485c955decb2287bd4a56f3..e4036ddaa9b25c0cc462f928e1e56e6dc12bfb8d 100644
--- a/tensorflow/core/kernels/cudnn_rnn_ops.cc
+++ b/tensorflow/core/kernels/cudnn_rnn_ops.cc
@@ -104,6 +104,7 @@ namespace {
 using perftools::gputools::DeviceMemory;
 using perftools::gputools::DeviceMemoryBase;
 using perftools::gputools::ScratchAllocator;
+using perftools::gputools::dnn::AlgorithmConfig;
 using perftools::gputools::dnn::RnnDirectionMode;
 using perftools::gputools::dnn::RnnInputMode;
 using perftools::gputools::dnn::RnnMode;
@@ -226,22 +227,43 @@ inline perftools::gputools::port::Status ToExecutorStatus(const Status& s) {
                       s.error_message());
 }
 
-// A helper to allocate temporary scratch memory for Cudnn RNN models. It takes
-// the ownership of the underlying memory. The expectation is that the memory
-// should be alive for the span of the Cudnn RNN itself.
-class CudnnRNNWorkspaceAllocator : public ScratchAllocator {
+template <typename>
+struct ToTFDataType;
+
+template <>
+struct ToTFDataType<Eigen::half> : std::integral_constant<DataType, DT_HALF> {};
+
+template <>
+struct ToTFDataType<float> : std::integral_constant<DataType, DT_FLOAT> {};
+
+template <>
+struct ToTFDataType<double> : std::integral_constant<DataType, DT_DOUBLE> {};
+
+template <>
+struct ToTFDataType<uint8> : std::integral_constant<DataType, DT_UINT8> {};
+
+// A helper to allocate temporary scratch memory for Cudnn RNN models. It
+// takes the ownership of the underlying memory. The expectation is that the
+// memory should be alive for the span of the Cudnn RNN itself.
+template <typename T>
+class CudnnRnnAllocatorInTemp : public ScratchAllocator {
  public:
-  ~CudnnRNNWorkspaceAllocator() override {}
-  explicit CudnnRNNWorkspaceAllocator(OpKernelContext* context)
+  ~CudnnRnnAllocatorInTemp() = default;
+
+  explicit CudnnRnnAllocatorInTemp(OpKernelContext* context)
       : context_(context) {}
   int64 GetMemoryLimitInBytes(perftools::gputools::Stream* stream) override {
     return std::numeric_limits<int64>::max();
   }
+
   StatusOr<DeviceMemory<uint8>> AllocateBytes(
       perftools::gputools::Stream* stream, int64 byte_size) override {
     Tensor temporary_memory;
+    const DataType tf_data_type = ToTFDataType<T>::value;
+    int64 allocate_count =
+        Eigen::divup(byte_size, static_cast<int64>(sizeof(T)));
     Status allocation_status(context_->allocate_temp(
-        DT_UINT8, TensorShape({byte_size}), &temporary_memory));
+        tf_data_type, TensorShape({allocate_count}), &temporary_memory));
     if (!allocation_status.ok()) {
       return ToExecutorStatus(allocation_status);
     }
@@ -249,10 +271,16 @@ class CudnnRNNWorkspaceAllocator : public ScratchAllocator {
     // allocator.
     allocated_tensors_.push_back(temporary_memory);
     total_byte_size_ += byte_size;
-    return StatusOr<DeviceMemory<uint8>>(
-        AsDeviceMemory<uint8>(&temporary_memory));
+    return DeviceMemory<uint8>::MakeFromByteSize(
+        temporary_memory.template flat<T>().data(),
+        temporary_memory.template flat<T>().size() * sizeof(T));
+  }
+
+  int64 TotalByteSize() const { return total_byte_size_; }
+
+  Tensor get_allocated_tensor(int index) const {
+    return allocated_tensors_[index];
   }
-  int64 TotalByteSize() { return total_byte_size_; }
 
  private:
   int64 total_byte_size_ = 0;
@@ -260,15 +288,15 @@ class CudnnRNNWorkspaceAllocator : public ScratchAllocator {
   std::vector<Tensor> allocated_tensors_;
 };
 
-// A helper to allocate reserve-space memory for Cudnn RNN models. The tensors
-// are allocated as a kernel output, and will be fed into the backward pass.
+// A helper to allocate memory for Cudnn RNN models as a kernel output. It is
+// used by forward pass kernel to feed the output to the backward pass.
 // The memory is expected to live long enough after the backward pass is
 // finished.
 template <typename T>
-class CudnnRNNReserveSpaceAllocator : public ScratchAllocator {
+class CudnnRnnAllocatorInOutput : public ScratchAllocator {
  public:
-  ~CudnnRNNReserveSpaceAllocator() override {}
-  CudnnRNNReserveSpaceAllocator(OpKernelContext* context, int output_index)
+  ~CudnnRnnAllocatorInOutput() override {}
+  CudnnRnnAllocatorInOutput(OpKernelContext* context, int output_index)
       : context_(context), output_index_(output_index) {}
   int64 GetMemoryLimitInBytes(perftools::gputools::Stream* stream) override {
     return std::numeric_limits<int64>::max();
@@ -342,13 +370,14 @@ struct CudnnModelTypes {
   TFRNNInputMode rnn_input_mode;
   RnnDirectionMode rnn_direction_mode;
   bool HasInputC() const {
-    // For Cudnn 5.0, only LSTM has input-c. All other models use only input-h.
+    // For Cudnn 5.0, only LSTM has input-c. All other models use only
+    // input-h.
     return rnn_mode == RnnMode::kRnnLstm;
   }
 };
 
 // A helper class that collects the shapes to describe a RNN model.
-struct CudnnModelShapes {
+struct CudnnRnnModelShapes {
   int num_layers;
   int input_size;
   int num_units;
@@ -359,7 +388,7 @@ struct CudnnModelShapes {
   TensorShape output_shape;
   TensorShape hidden_state_shape;
   // At present only fields related to cached RnnDescriptor are concerned.
-  bool IsCompatibleWith(const CudnnModelShapes& rhs) const {
+  bool IsCompatibleWith(const CudnnRnnModelShapes& rhs) const {
     return num_layers == rhs.num_layers && input_size == rhs.input_size &&
            num_units == rhs.num_units && dir_count == rhs.dir_count;
   }
@@ -370,9 +399,9 @@ struct CudnnModelShapes {
   }
 };
 
-// Utility class for using CudnnModelShapes as a hash table key.
-struct CudnnModelShapesHasher {
-  uint64 operator()(const CudnnModelShapes& to_hash) const {
+// Utility class for using CudnnRnnModelShapes as a hash table key.
+struct CudnnRnnModelShapesHasher {
+  uint64 operator()(const CudnnRnnModelShapes& to_hash) const {
     uint64 hash = static_cast<uint64>(to_hash.num_layers);
     hash = tensorflow::FingerprintCat64(
         hash, static_cast<uint64>(to_hash.input_size));
@@ -383,21 +412,21 @@ struct CudnnModelShapesHasher {
   }
 };
 
-// Utility class for using CudnnModelShapes as a hash table key.
-struct CudnnModelShapesComparator {
-  bool operator()(const CudnnModelShapes& first,
-                  const CudnnModelShapes& second) const {
+// Utility class for using CudnnRnnModelShapes as a hash table key.
+struct CudnnRnnModelShapesComparator {
+  bool operator()(const CudnnRnnModelShapes& first,
+                  const CudnnRnnModelShapes& second) const {
     return first.IsCompatibleWith(second);
   }
 };
 
-// Extract and checks the forward input tensors, parameters, and shapes from the
-// OpKernelContext.
+// Extract and checks the forward input tensors, parameters, and shapes from
+// the OpKernelContext.
 Status ExtractForwardInput(OpKernelContext* context,
                            const CudnnModelTypes& model_types,
                            const Tensor** input, const Tensor** input_h,
                            const Tensor** input_c, const Tensor** params,
-                           CudnnModelShapes* model_shapes) {
+                           CudnnRnnModelShapes* model_shapes) {
   TF_RETURN_IF_ERROR(context->input("input", input));
   TF_RETURN_IF_ERROR(context->input("input_h", input_h));
   if (model_types.HasInputC()) {
@@ -544,9 +573,10 @@ class CudnnRNNKernelCommon : public OpKernel {
     auto* stream = context->op_device_context()->stream();
     // ExtracCudnnRNNParamsInfo is only called by op_kernels that do not require
     // random number generator, therefore set state_allocator to nullptr.
+    const AlgorithmConfig algo_config;
     auto rnn_desc_s = stream->parent()->createRnnDescriptor(
         num_layers, num_units, input_size, input_mode, rnn_direction_mode(),
-        rnn_mode(), ToDataType<T>::value, dropout(), seed(),
+        rnn_mode(), ToDataType<T>::value, algo_config, dropout(), seed(),
         nullptr /* state_allocator */);
     if (!rnn_desc_s.ok()) {
       return FromExecutorStatus(rnn_desc_s);
@@ -808,7 +838,7 @@ class CudnnRNNForwardOp<GPUDevice, T> : public CudnnRNNKernelCommon {
     const Tensor* input_h = nullptr;
     const Tensor* input_c = nullptr;
     const Tensor* params = nullptr;
-    CudnnModelShapes model_shapes;
+    CudnnRnnModelShapes model_shapes;
     OP_REQUIRES_OK(context,
                    ExtractForwardInput(context, model_types(), &input, &input_h,
                                        &input_c, &params, &model_shapes));
@@ -874,7 +904,7 @@ class CudnnRNNForwardOp<GPUDevice, T> : public CudnnRNNKernelCommon {
     // Creates a memory callback for the reserve_space. The memory lives in the
     // output of this kernel. And it will be fed into the backward pass when
     // needed.
-    CudnnRNNReserveSpaceAllocator<T> reserve_space_allocator(context, 3);
+    CudnnRnnAllocatorInOutput<T> reserve_space_allocator(context, 3);
     if (!is_training_) {
       Tensor* dummy_reserve_space = nullptr;
       OP_REQUIRES_OK(context,
@@ -882,7 +912,7 @@ class CudnnRNNForwardOp<GPUDevice, T> : public CudnnRNNKernelCommon {
     }
     // Creates a memory callback for the workspace. The memory lives to the end
     // of this kernel calls.
-    CudnnRNNWorkspaceAllocator workspace_allocator(context);
+    CudnnRnnAllocatorInTemp<uint8> workspace_allocator(context);
     bool launch_status = false;
     {
       mutex_lock l(mu_);
@@ -891,22 +921,24 @@ class CudnnRNNForwardOp<GPUDevice, T> : public CudnnRNNKernelCommon {
         CudnnRNNPersistentSpaceAllocator* dropout_state_allocator =
             new CudnnRNNPersistentSpaceAllocator(context);
         rnn_state.dropout_state_allocator.reset(dropout_state_allocator);
+        const AlgorithmConfig algo_config;
         auto rnn_desc_s = executor->createRnnDescriptor(
             model_shapes.num_layers, model_shapes.num_units,
             model_shapes.input_size, input_mode, rnn_direction_mode(),
-            rnn_mode(), data_type, dropout(), seed(), dropout_state_allocator);
+            rnn_mode(), data_type, algo_config, dropout(), seed(),
+            dropout_state_allocator);
         OP_REQUIRES_OK(context, FromExecutorStatus(rnn_desc_s));
         rnn_state.rnn_desc = std::move(rnn_desc_s.ConsumeValueOrDie());
       }
       launch_status =
           stream
-              ->ThenRnnForward(*rnn_state.rnn_desc, *input_desc, input_data,
-                               *hidden_state_desc, input_h_data,
-                               *hidden_state_desc, input_c_data, params_data,
-                               *output_desc, &output_data, *hidden_state_desc,
-                               &output_h_data, *hidden_state_desc,
-                               &output_c_data, is_training_,
-                               &reserve_space_allocator, &workspace_allocator)
+              ->ThenRnnForward(
+                  *rnn_state.rnn_desc, *input_desc, input_data,
+                  *hidden_state_desc, input_h_data, *hidden_state_desc,
+                  input_c_data, params_data, *output_desc, &output_data,
+                  *hidden_state_desc, &output_h_data, *hidden_state_desc,
+                  &output_c_data, is_training_, &reserve_space_allocator,
+                  &workspace_allocator, /*output_result_profile=*/nullptr)
               .ok();
     }
     OP_REQUIRES(context, launch_status,
@@ -916,8 +948,8 @@ class CudnnRNNForwardOp<GPUDevice, T> : public CudnnRNNKernelCommon {
  private:
   mutex mu_;
   bool is_training_;
-  std::unordered_map<CudnnModelShapes, RnnScratchSpace, CudnnModelShapesHasher,
-                     CudnnModelShapesComparator>
+  std::unordered_map<CudnnRnnModelShapes, RnnScratchSpace,
+                     CudnnRnnModelShapesHasher, CudnnRnnModelShapesComparator>
       rnn_state_cache_ GUARDED_BY(mu_);
 };
 
@@ -945,7 +977,7 @@ class CudnnRNNBackwardOp<GPUDevice, T> : public CudnnRNNKernelCommon {
     const Tensor* input_h = nullptr;
     const Tensor* input_c = nullptr;
     const Tensor* params = nullptr;
-    CudnnModelShapes model_shapes;
+    CudnnRnnModelShapes model_shapes;
     OP_REQUIRES_OK(context,
                    ExtractForwardInput(context, model_types(), &input, &input_h,
                                        &input_c, &params, &model_shapes));
@@ -1086,7 +1118,7 @@ class CudnnRNNBackwardOp<GPUDevice, T> : public CudnnRNNKernelCommon {
     auto reserve_space_uint8 = CastDeviceMemory<uint8, T>(reserve_space);
     // Creates a memory callback for the workspace. The memory lives to the end
     // of this kernel calls.
-    CudnnRNNWorkspaceAllocator workspace_allocator(context);
+    CudnnRnnAllocatorInTemp<uint8> workspace_allocator(context);
     bool launch_status = false;
     {
       mutex_lock l(mu_);
@@ -1095,25 +1127,27 @@ class CudnnRNNBackwardOp<GPUDevice, T> : public CudnnRNNKernelCommon {
         CudnnRNNPersistentSpaceAllocator* dropout_state_allocator =
             new CudnnRNNPersistentSpaceAllocator(context);
         rnn_state.dropout_state_allocator.reset(dropout_state_allocator);
+        const AlgorithmConfig algo_config;
         auto rnn_desc_s = executor->createRnnDescriptor(
             model_shapes.num_layers, model_shapes.num_units,
             model_shapes.input_size, input_mode, rnn_direction_mode(),
-            rnn_mode(), data_type, dropout(), seed(), dropout_state_allocator);
+            rnn_mode(), data_type, algo_config, dropout(), seed(),
+            dropout_state_allocator);
         OP_REQUIRES_OK(context, FromExecutorStatus(rnn_desc_s));
         rnn_state.rnn_desc = std::move(rnn_desc_s.ConsumeValueOrDie());
       }
       launch_status =
           stream
-              ->ThenRnnBackward(*rnn_state.rnn_desc, *input_desc, input_data,
-                                *hidden_state_desc, input_h_data,
-                                *hidden_state_desc, input_c_data, params_data,
-                                *output_desc, output_data, *hidden_state_desc,
-                                output_h_data, *hidden_state_desc,
-                                output_c_data, output_backprop_data,
-                                output_h_backprop_data, output_c_backprop_data,
-                                &input_backprop_data, &input_h_backprop_data,
-                                &input_c_backprop_data, &params_backprop_data,
-                                &reserve_space_uint8, &workspace_allocator)
+              ->ThenRnnBackward(
+                  *rnn_state.rnn_desc, *input_desc, input_data,
+                  *hidden_state_desc, input_h_data, *hidden_state_desc,
+                  input_c_data, params_data, *output_desc, output_data,
+                  *hidden_state_desc, output_h_data, *hidden_state_desc,
+                  output_c_data, output_backprop_data, output_h_backprop_data,
+                  output_c_backprop_data, &input_backprop_data,
+                  &input_h_backprop_data, &input_c_backprop_data,
+                  &params_backprop_data, &reserve_space_uint8,
+                  &workspace_allocator, /*output_result_profile=*/nullptr)
               .ok();
     }
     OP_REQUIRES(context, launch_status,
@@ -1122,8 +1156,8 @@ class CudnnRNNBackwardOp<GPUDevice, T> : public CudnnRNNKernelCommon {
 
  private:
   mutex mu_;
-  std::unordered_map<CudnnModelShapes, RnnScratchSpace, CudnnModelShapesHasher,
-                     CudnnModelShapesComparator>
+  std::unordered_map<CudnnRnnModelShapes, RnnScratchSpace,
+                     CudnnRnnModelShapesHasher, CudnnRnnModelShapesComparator>
       rnn_state_cache_ GUARDED_BY(mu_);
 };
 
diff --git a/tensorflow/core/kernels/cwise_op_abs.cc b/tensorflow/core/kernels/cwise_op_abs.cc
index 1466f24202fea4200f752985d620f1fbea61d35a..1920c54e80759735686d8ac3e17feb4fb4310337 100644
--- a/tensorflow/core/kernels/cwise_op_abs.cc
+++ b/tensorflow/core/kernels/cwise_op_abs.cc
@@ -18,9 +18,7 @@ limitations under the License.
 namespace tensorflow {
 REGISTER5(UnaryOp, CPU, "Abs", functor::abs, float, Eigen::half, double, int32,
           int64);
-#if !defined(IS_MOBILE_PLATFORM)
 REGISTER2(UnaryOp, CPU, "ComplexAbs", functor::abs, complex64, complex128);
-#endif
 
 #if GOOGLE_CUDA
 REGISTER4(UnaryOp, GPU, "Abs", functor::abs, float, Eigen::half, double, int64);
diff --git a/tensorflow/core/kernels/cwise_op_clip.cc b/tensorflow/core/kernels/cwise_op_clip.cc
new file mode 100644
index 0000000000000000000000000000000000000000..bd22f5777c27bba4a1694443206855380b226921
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_clip.cc
@@ -0,0 +1,224 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/kernels/cwise_op_clip.h"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
+// Basic coefficient-wise tenary operations.
+// This is the case for example of the clip_by_value.
+//   Device: E.g., CPUDevice, GPUDevice.
+//   Functor: defined above. E.g., functor::clip.
+template <typename Device, typename T>
+class ClipOp : public OpKernel {
+ public:
+  explicit ClipOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+  void Compute(OpKernelContext* ctx) override {
+    const Tensor& in0 = ctx->input(0);
+    const Tensor& in1 = ctx->input(1);
+    const Tensor& in2 = ctx->input(2);
+
+    auto in0_flat = in0.flat<T>();
+    auto in1_flat = in1.flat<T>();
+    auto in2_flat = in2.flat<T>();
+    const Device& d = ctx->eigen_device<Device>();
+
+    Tensor* out = nullptr;
+    OP_REQUIRES_OK(
+        ctx, ctx->forward_input_or_allocate_output({0}, 0, in0.shape(), &out));
+    auto out_flat = out->flat<T>();
+    if (in1.shape() == in2.shape()) {
+      if (in0.shape() == in1.shape()) {
+        functor::TernaryClipOp<Device, T>()(d, in0_flat, in1_flat, in2_flat,
+                                            out_flat);
+      } else {
+        OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(in1.shape()),
+                    errors::InvalidArgument(
+                        "clip_value_min and clip_value_max must be either of "
+                        "the same shape as input, or a scalar. ",
+                        "input shape: ", in0.shape().DebugString(),
+                        "clip_value_min shape: ", in1.shape().DebugString(),
+                        "clip_value_max shape: ", in2.shape().DebugString()));
+        functor::UnaryClipOp<Device, T>()(d, in0_flat, in1_flat, in2_flat,
+                                          out_flat);
+      }
+    } else {
+      if (in0.shape() == in1.shape()) {
+        OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(in2.shape()),
+                    errors::InvalidArgument(
+                        "clip_value_min and clip_value_max must be either of "
+                        "the same shape as input, or a scalar. ",
+                        "input shape: ", in0.shape().DebugString(),
+                        "clip_value_min shape: ", in1.shape().DebugString(),
+                        "clip_value_max shape: ", in2.shape().DebugString()));
+        functor::BinaryLeftClipOp<Device, T>()(d, in0_flat, in1_flat, in2_flat,
+                                               out_flat);
+      } else {
+        OP_REQUIRES(ctx, (in0.shape() == in2.shape() &&
+                          TensorShapeUtils::IsScalar(in1.shape())),
+                    errors::InvalidArgument(
+                        "clip_value_min and clip_value_max must be either of "
+                        "the same shape as input, or a scalar. ",
+                        "input shape: ", in0.shape().DebugString(),
+                        "clip_value_min shape: ", in1.shape().DebugString(),
+                        "clip_value_max shape: ", in2.shape().DebugString()));
+        functor::BinaryRightClipOp<Device, T>()(d, in0_flat, in1_flat, in2_flat,
+                                                out_flat);
+      }
+    }
+  }
+};
+
+namespace functor {
+// Unary functor for clip [Tensor, Scalar, Scalar]
+template <typename T>
+struct UnaryClipFunc {
+  UnaryClipFunc(const T& value_min, const T& value_max)
+      : value_min_(value_min), value_max_(value_max) {}
+  const T operator()(const T& value) const {
+    return std::max(std::min(value, value_max_), value_min_);
+  }
+  T value_min_;
+  T value_max_;
+};
+template <typename T>
+struct UnaryClipOp<CPUDevice, T> {
+  void operator()(const CPUDevice& d, typename TTypes<T>::ConstFlat& in0_flat,
+                  typename TTypes<T>::ConstFlat& in1_flat,
+                  typename TTypes<T>::ConstFlat& in2_flat,
+                  typename TTypes<T>::Flat& out_flat) const {
+    out_flat = in0_flat.unaryExpr(UnaryClipFunc<T>(in1_flat(0), in2_flat(0)));
+  }
+};
+
+// Binary functor for clip [Tensor, Scalar, Tensor]
+template <typename T>
+struct BinaryRightClipFunc {
+  BinaryRightClipFunc(const T& value_min) : value_min_(value_min) {}
+  const T operator()(const T& value, const T& value_max) const {
+    return std::max(std::min(value, value_max), value_min_);
+  }
+  T value_min_;
+};
+template <typename T>
+struct BinaryRightClipOp<CPUDevice, T> {
+  void operator()(const CPUDevice& d, typename TTypes<T>::ConstFlat& in0_flat,
+                  typename TTypes<T>::ConstFlat& in1_flat,
+                  typename TTypes<T>::ConstFlat& in2_flat,
+                  typename TTypes<T>::Flat& out_flat) const {
+    out_flat =
+        in0_flat.binaryExpr(in2_flat, BinaryRightClipFunc<T>(in1_flat(0)));
+  }
+};
+
+// Binary functor for clip [Tensor, Tensor, Scalar]
+template <typename T>
+struct BinaryLeftClipFunc {
+  BinaryLeftClipFunc(const T& value_max) : value_max_(value_max) {}
+  const T operator()(const T& value, const T& value_min) const {
+    return std::max(std::min(value, value_max_), value_min);
+  }
+  T value_max_;
+};
+template <typename T>
+struct BinaryLeftClipOp<CPUDevice, T> {
+  void operator()(const CPUDevice& d, typename TTypes<T>::ConstFlat& in0_flat,
+                  typename TTypes<T>::ConstFlat& in1_flat,
+                  typename TTypes<T>::ConstFlat& in2_flat,
+                  typename TTypes<T>::Flat& out_flat) const {
+    out_flat =
+        in0_flat.binaryExpr(in1_flat, BinaryLeftClipFunc<T>(in2_flat(0)));
+  }
+};
+
+// Ternary functor for clip [Tensor, Tensor, Tensor]
+template <typename T>
+struct TernaryClipOp<CPUDevice, T> {
+  void operator()(const CPUDevice& d, typename TTypes<T>::ConstFlat& in0_flat,
+                  typename TTypes<T>::ConstFlat& in1_flat,
+                  typename TTypes<T>::ConstFlat& in2_flat,
+                  typename TTypes<T>::Flat& out_flat) const {
+    out_flat.device(d) = in0_flat.cwiseMin(in2_flat).cwiseMax(in1_flat);
+  }
+};
+
+#define INSTANTIATE_CPU(T)                         \
+  template struct UnaryClipOp<CPUDevice, T>;       \
+  template struct BinaryRightClipOp<CPUDevice, T>; \
+  template struct BinaryLeftClipOp<CPUDevice, T>;  \
+  template struct TernaryClipOp<CPUDevice, T>;
+INSTANTIATE_CPU(Eigen::half);
+INSTANTIATE_CPU(float);
+INSTANTIATE_CPU(double);
+INSTANTIATE_CPU(int8);
+INSTANTIATE_CPU(int16);
+INSTANTIATE_CPU(int32);
+INSTANTIATE_CPU(int64);
+INSTANTIATE_CPU(uint8);
+INSTANTIATE_CPU(uint16);
+#undef INSTANTIATE_CPU
+}  // namespace functor
+
+#define REGISTER_CPU_KERNEL(type)                                       \
+  REGISTER_KERNEL_BUILDER(                                              \
+      Name("ClipByValue").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
+      ClipOp<CPUDevice, type>);
+
+REGISTER_CPU_KERNEL(Eigen::half);
+REGISTER_CPU_KERNEL(float);
+REGISTER_CPU_KERNEL(double);
+REGISTER_CPU_KERNEL(int8);
+REGISTER_CPU_KERNEL(int16);
+REGISTER_CPU_KERNEL(int32);
+REGISTER_CPU_KERNEL(int64);
+REGISTER_CPU_KERNEL(uint8);
+REGISTER_CPU_KERNEL(uint16);
+#undef REGISTER_CPU_KERNEL
+
+#if GOOGLE_CUDA
+
+#define REGISTER_GPU_KERNEL(type)                                       \
+  REGISTER_KERNEL_BUILDER(                                              \
+      Name("ClipByValue").Device(DEVICE_GPU).TypeConstraint<type>("T"), \
+      ClipOp<GPUDevice, type>);
+REGISTER_GPU_KERNEL(Eigen::half);
+REGISTER_GPU_KERNEL(float);
+REGISTER_GPU_KERNEL(double);
+REGISTER_GPU_KERNEL(int8);
+REGISTER_GPU_KERNEL(int16);
+REGISTER_GPU_KERNEL(int64);
+REGISTER_GPU_KERNEL(uint8);
+REGISTER_GPU_KERNEL(uint16);
+
+// A special GPU kernel for int32.
+// TODO(b/25387198): Also enable int32 in device memory. This kernel
+// registration requires all int32 inputs and outputs to be in host memory.
+REGISTER_KERNEL_BUILDER(Name("ClipByValue")
+                            .Device(DEVICE_GPU)
+                            .HostMemory("t")
+                            .HostMemory("clip_value_min")
+                            .HostMemory("clip_value_max")
+                            .HostMemory("output")
+                            .TypeConstraint<int32>("T"),
+                        ClipOp<CPUDevice, int32>);
+
+#undef REGISTER_GPU_KERNEL
+#endif
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_clip.h b/tensorflow/core/kernels/cwise_op_clip.h
new file mode 100644
index 0000000000000000000000000000000000000000..1a4bf8cf1d63736cce079fc7823c16d585007ca0
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_clip.h
@@ -0,0 +1,61 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_KERNELS_CWISE_OP_CLIP_H_
+#define TENSORFLOW_KERNELS_CWISE_OP_CLIP_H_
+
+#include "tensorflow/core/kernels/cwise_ops_common.h"
+
+namespace tensorflow {
+namespace functor {
+// Unary functor for clip [Tensor, Scalar, Scalar]
+template <typename Device, typename T>
+struct UnaryClipOp {
+  void operator()(const Device &d, typename TTypes<T>::ConstFlat &in0_flat,
+                  typename TTypes<T>::ConstFlat &in1_flat,
+                  typename TTypes<T>::ConstFlat &in2_flat,
+                  typename TTypes<T>::Flat &out_flat) const;
+};
+
+// Binary functor for clip [Tensor, Scalar, Tensor]
+template <typename Device, typename T>
+struct BinaryRightClipOp {
+  void operator()(const Device &d, typename TTypes<T>::ConstFlat &in0_flat,
+                  typename TTypes<T>::ConstFlat &in1_flat,
+                  typename TTypes<T>::ConstFlat &in2_flat,
+                  typename TTypes<T>::Flat &out_flat) const;
+};
+
+// Binary functor for clip [Tensor, Tensor, Scalar]
+template <typename Device, typename T>
+struct BinaryLeftClipOp {
+  void operator()(const Device &d, typename TTypes<T>::ConstFlat &in0_flat,
+                  typename TTypes<T>::ConstFlat &in1_flat,
+                  typename TTypes<T>::ConstFlat &in2_flat,
+                  typename TTypes<T>::Flat &out_flat) const;
+};
+
+// Ternary functor for clip [Tensor, Tensor, Tensor]
+template <typename Device, typename T>
+struct TernaryClipOp {
+  void operator()(const Device &d, typename TTypes<T>::ConstFlat &in0_flat,
+                  typename TTypes<T>::ConstFlat &in1_flat,
+                  typename TTypes<T>::ConstFlat &in2_flat,
+                  typename TTypes<T>::Flat &out_flat) const;
+};
+}
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_KERNELS_CWISE_OP_CLIP_H_
diff --git a/tensorflow/core/kernels/cwise_op_clip_gpu.cu.cc b/tensorflow/core/kernels/cwise_op_clip_gpu.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5c0784754817bf4534b3137b9a1a39f02c287ec4
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_clip_gpu.cu.cc
@@ -0,0 +1,134 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/cwise_op_clip.h"
+#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
+#include "tensorflow/core/util/cuda_kernel_helper.h"
+
+namespace tensorflow {
+
+template <typename T>
+__global__ void UnaryClipCustomKernel(const int32 size_in, const T *in0,
+                                      const T *in1, const T *in2, T *out) {
+  CUDA_1D_KERNEL_LOOP(i, size_in) {
+    T value = in2[0] < in0[i] ? in2[0] : in0[i];
+    out[i] = value < in1[0] ? in1[0] : value;
+  }
+}
+
+template <typename T>
+__global__ void BinaryRightClipCustomKernel(const int32 size_in, const T *in0,
+                                            const T *in1, const T *in2,
+                                            T *out) {
+  CUDA_1D_KERNEL_LOOP(i, size_in) {
+    T value = in2[i] < in0[i] ? in2[i] : in0[i];
+    out[i] = value < in1[0] ? in1[0] : value;
+  }
+}
+
+template <typename T>
+__global__ void BinaryLeftClipCustomKernel(const int32 size_in, const T *in0,
+                                           const T *in1, const T *in2, T *out) {
+  CUDA_1D_KERNEL_LOOP(i, size_in) {
+    T value = in2[0] < in0[i] ? in2[0] : in0[i];
+    out[i] = value < in1[i] ? in1[i] : value;
+  }
+}
+
+namespace functor {
+
+// Unary functor for clip [Tensor, Scalar, Scalar]
+template <typename T>
+struct UnaryClipOp<GPUDevice, T> {
+  void operator()(const GPUDevice &d, typename TTypes<T>::ConstFlat &in0_flat,
+                  typename TTypes<T>::ConstFlat &in1_flat,
+                  typename TTypes<T>::ConstFlat &in2_flat,
+                  typename TTypes<T>::Flat &out_flat) const {
+    CudaLaunchConfig config = GetCudaLaunchConfig(in0_flat.size(), d);
+
+    UnaryClipCustomKernel<
+        T><<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+        in0_flat.size(), in0_flat.data(), in1_flat.data(), in2_flat.data(),
+        out_flat.data());
+  }
+};
+
+// Binary functor for clip [Tensor, Scalar, Tensor]
+template <typename T>
+struct BinaryRightClipOp<GPUDevice, T> {
+  void operator()(const GPUDevice &d, typename TTypes<T>::ConstFlat &in0_flat,
+                  typename TTypes<T>::ConstFlat &in1_flat,
+                  typename TTypes<T>::ConstFlat &in2_flat,
+                  typename TTypes<T>::Flat &out_flat) const {
+    CudaLaunchConfig config = GetCudaLaunchConfig(in0_flat.size(), d);
+
+    BinaryRightClipCustomKernel<
+        T><<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+        in0_flat.size(), in0_flat.data(), in1_flat.data(), in2_flat.data(),
+        out_flat.data());
+  }
+};
+
+// Binary functor for clip [Tensor, Tensor, Scalar]
+template <typename T>
+struct BinaryLeftClipOp<GPUDevice, T> {
+  void operator()(const GPUDevice &d, typename TTypes<T>::ConstFlat &in0_flat,
+                  typename TTypes<T>::ConstFlat &in1_flat,
+                  typename TTypes<T>::ConstFlat &in2_flat,
+                  typename TTypes<T>::Flat &out_flat) const {
+    CudaLaunchConfig config = GetCudaLaunchConfig(in0_flat.size(), d);
+
+    BinaryLeftClipCustomKernel<
+        T><<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+        in0_flat.size(), in0_flat.data(), in1_flat.data(), in2_flat.data(),
+        out_flat.data());
+  }
+};
+
+// Ternary functor for clip [Tensor, Tensor, Tensor]
+template <typename T>
+struct TernaryClipOp<GPUDevice, T> {
+  void operator()(const GPUDevice &d, typename TTypes<T>::ConstFlat &in0_flat,
+                  typename TTypes<T>::ConstFlat &in1_flat,
+                  typename TTypes<T>::ConstFlat &in2_flat,
+                  typename TTypes<T>::Flat &out_flat) const {
+    out_flat.device(d) = in0_flat.cwiseMin(in2_flat).cwiseMax(in1_flat);
+  }
+};
+
+#define INSTANTIATE_GPU(T)                         \
+  template struct UnaryClipOp<GPUDevice, T>;       \
+  template struct BinaryRightClipOp<GPUDevice, T>; \
+  template struct BinaryLeftClipOp<GPUDevice, T>;  \
+  template struct TernaryClipOp<GPUDevice, T>;
+INSTANTIATE_GPU(Eigen::half);
+INSTANTIATE_GPU(float);
+INSTANTIATE_GPU(double);
+INSTANTIATE_GPU(int8);
+INSTANTIATE_GPU(int16);
+INSTANTIATE_GPU(int32);
+INSTANTIATE_GPU(int64);
+INSTANTIATE_GPU(uint8);
+INSTANTIATE_GPU(uint16);
+#undef INSTANTIATE_GPU
+
+}  // namespace functor
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cwise_op_log.cc b/tensorflow/core/kernels/cwise_op_log.cc
index 98936e0f960f1f407c2187746ca80d3db0a93412..5d17c890cfec77cd3f50ee649adf4af6e20b5ed7 100644
--- a/tensorflow/core/kernels/cwise_op_log.cc
+++ b/tensorflow/core/kernels/cwise_op_log.cc
@@ -16,8 +16,8 @@ limitations under the License.
 #include "tensorflow/core/kernels/cwise_ops_common.h"
 
 namespace tensorflow {
-REGISTER5(UnaryOp, CPU, "Log", functor::log, float, Eigen::half, double,
-          complex64, complex128);
+REGISTER6(UnaryOp, CPU, "Log", functor::log, float, Eigen::half, double,
+          bfloat16, complex64, complex128);
 
 #if GOOGLE_CUDA
 REGISTER3(UnaryOp, GPU, "Log", functor::log, float, Eigen::half, double);
diff --git a/tensorflow/core/kernels/cwise_ops.h b/tensorflow/core/kernels/cwise_ops.h
index 06918075a42648a3cf7135376d728fa466e7c469..a80905d1450cc38619bb27c2e27eda58b3cf169d 100644
--- a/tensorflow/core/kernels/cwise_ops.h
+++ b/tensorflow/core/kernels/cwise_ops.h
@@ -27,27 +27,6 @@ limitations under the License.
 #include "tensorflow/core/kernels/bounds_check.h"
 
 namespace Eigen {
-namespace numext {
-#if GOOGLE_CUDA
-template <>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::complex<float> exp(
-    const std::complex<float>& x) {
-  auto com = ::expf(x.real());
-  auto res_real = com * ::cosf(x.imag());
-  auto res_imag = com * ::sinf(x.imag());
-  return std::complex<float>(res_real, res_imag);
-}
-template <>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::complex<double> exp(
-    const std::complex<double>& x) {
-  auto com = ::exp(x.real());
-  auto res_real = com * ::cos(x.imag());
-  auto res_imag = com * ::sin(x.imag());
-  return std::complex<double>(res_real, res_imag);
-}
-#endif
-}  // namespace numext
-
 namespace internal {
 
 template <typename T>
diff --git a/tensorflow/core/kernels/data/BUILD b/tensorflow/core/kernels/data/BUILD
index 01754ec21acd2196dd907747da45071022bcebc9..8c4f0218ee174328ec5c2484d08d639251047728 100644
--- a/tensorflow/core/kernels/data/BUILD
+++ b/tensorflow/core/kernels/data/BUILD
@@ -10,18 +10,7 @@ licenses(["notice"])  # Apache 2.0
 load(
     "//tensorflow:tensorflow.bzl",
     "tf_kernel_library",
-)
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
+    "tf_cc_test",
 )
 
 cc_library(
@@ -295,11 +284,31 @@ tf_kernel_library(
     ],
 )
 
+cc_library(
+    name = "prefetch_autotuner",
+    srcs = ["prefetch_autotuner.cc"],
+    hdrs = ["prefetch_autotuner.h"],
+    deps = [
+        "//tensorflow/core:lib",
+    ],
+)
+
+tf_cc_test(
+    name = "prefetch_autotuner_test",
+    srcs = ["prefetch_autotuner_test.cc"],
+    deps = [
+        ":prefetch_autotuner",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ],
+)
+
 tf_kernel_library(
     name = "prefetch_dataset_op",
     srcs = ["prefetch_dataset_op.cc"],
     deps = [
         ":dataset",
+        ":prefetch_autotuner",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:dataset_ops_op_lib",
         "//tensorflow/core:framework",
diff --git a/tensorflow/core/kernels/data/group_by_window_dataset_op.cc b/tensorflow/core/kernels/data/group_by_window_dataset_op.cc
index 834c06bb930d1c723c5b3f880dcc13a892bb44f7..46f43dd1b1dcd79e1fc1f8fadc24858f3f7eae9f 100644
--- a/tensorflow/core/kernels/data/group_by_window_dataset_op.cc
+++ b/tensorflow/core/kernels/data/group_by_window_dataset_op.cc
@@ -263,6 +263,11 @@ class GroupByWindowDatasetOp : public UnaryDatasetOpKernel {
                 }
                 const int64 window_size =
                     window_size_func_output[0].scalar<int64>()();
+                if (window_size <= 0) {
+                  return errors::InvalidArgument(
+                      "Window size must be greater than zero, but got ",
+                      window_size, ".");
+                }
                 window_sizes_[key] = window_size;
               }
 
diff --git a/tensorflow/core/kernels/data/prefetch_autotuner.cc b/tensorflow/core/kernels/data/prefetch_autotuner.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b3272f6bcde56029d878f3e61a7809594db86b24
--- /dev/null
+++ b/tensorflow/core/kernels/data/prefetch_autotuner.cc
@@ -0,0 +1,46 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/kernels/data/prefetch_autotuner.h"
+
+namespace tensorflow {
+
+PrefetchAutotuner::PrefetchAutotuner(int64 initial_buffer_size)
+    : buffer_limit_(initial_buffer_size) {
+  if (initial_buffer_size == kAutoTune) {
+    mode_ = Mode::kUpswing;
+    buffer_limit_ = 1;
+  }
+}
+
+void PrefetchAutotuner::RecordConsumption(size_t current_buffer_size) {
+  switch (mode_) {
+    case Mode::kDisabled:
+      return;
+    case Mode::kUpswing:
+      if (current_buffer_size == buffer_limit_) {
+        mode_ = Mode::kDownswing;
+      }
+      return;
+    case Mode::kDownswing:
+      if (current_buffer_size == 0) {
+        buffer_limit_ *= 2;  // Increase the buffer size.
+        mode_ = Mode::kUpswing;
+      }
+      return;
+  }
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/prefetch_autotuner.h b/tensorflow/core/kernels/data/prefetch_autotuner.h
new file mode 100644
index 0000000000000000000000000000000000000000..fa8a1840723ec8af1d1314af1b89bed7f120abc0
--- /dev/null
+++ b/tensorflow/core/kernels/data/prefetch_autotuner.h
@@ -0,0 +1,71 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_KERNELS_DATA_PREFETCH_AUTOTUNER_H_
+#define TENSORFLOW_CORE_KERNELS_DATA_PREFETCH_AUTOTUNER_H_
+
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+
+// PrefetchAutotuner dynamically adjusts the buffer size of a prefetch iterator.
+//
+// PrefetchAutotuner attempts to find the minimum buffer size such that there is
+// always at least 1 element in the prefetch queue every time the downstream
+// iterator calls GetNext().
+//
+// One common failure mode of input pipelines is being throughput bound. No
+// amount of prefetching can address that performance mode. In order to guard
+// against this condition, PrefetchAutotuner will only increase the buffer_limit
+// if the prefetching thread is able to successfully fill the buffer at its
+// current size.
+//
+// Note: in the current implementation, we never decrease the buffer_limit().
+// This should change in the future!
+//
+// PrefetchAutotuner is NOT thread safe.
+class PrefetchAutotuner {
+ public:
+  static const int64 kAutoTune = -1;
+
+  explicit PrefetchAutotuner(int64 initial_buffer_size);
+
+  int64 buffer_limit() const { return buffer_limit_; }
+
+  void RecordConsumption(size_t current_buffer_size);
+  void RecordEmpty() { RecordConsumption(0); }
+
+ private:
+  // PrefetchAutotuner operates as a state machine.
+  enum class Mode {
+    // Disables the autotuning.
+    kDisabled,
+
+    // We have increased the size of the buffer, and will transition to
+    // kDownswing if we successfully fill the buffer.
+    kUpswing,
+
+    // We have successfully filled a buffer of this size. If we ever block the
+    // downstream iterator, we should increase the buffer size.
+    kDownswing,
+  };
+
+  int64 buffer_limit_;
+  Mode mode_ = Mode::kDisabled;
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_KERNELS_DATA_PREFETCH_AUTOTUNER_H_
diff --git a/tensorflow/core/kernels/data/prefetch_autotuner_test.cc b/tensorflow/core/kernels/data/prefetch_autotuner_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2f573dfb3555b2466d84c6341eaa77e69414d103
--- /dev/null
+++ b/tensorflow/core/kernels/data/prefetch_autotuner_test.cc
@@ -0,0 +1,82 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/kernels/data/prefetch_autotuner.h"
+
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace {
+
+TEST(PrefetchAutotuner, Disabled) {
+  PrefetchAutotuner t(2);
+  EXPECT_EQ(2, t.buffer_limit());
+  t.RecordConsumption(0);
+  t.RecordConsumption(2);
+  t.RecordConsumption(0);
+  t.RecordConsumption(2);
+  EXPECT_EQ(2, t.buffer_limit());
+}
+
+TEST(PrefetchAutotuner, Enabled) {
+  PrefetchAutotuner t(PrefetchAutotuner::kAutoTune);
+  EXPECT_EQ(1, t.buffer_limit());
+  t.RecordConsumption(0);  // Expect buffer limit to increase.
+  EXPECT_EQ(1, t.buffer_limit());
+  t.RecordConsumption(1);
+  EXPECT_EQ(1, t.buffer_limit());
+  t.RecordConsumption(0);  // Expect buffer limit to increase.
+  EXPECT_EQ(2, t.buffer_limit());
+  t.RecordConsumption(2);
+  EXPECT_EQ(2, t.buffer_limit());
+  t.RecordConsumption(1);
+  EXPECT_EQ(2, t.buffer_limit());
+  t.RecordConsumption(0);  // Expect buffer limit to increase.
+  EXPECT_EQ(4, t.buffer_limit());
+  t.RecordConsumption(4);
+  EXPECT_EQ(4, t.buffer_limit());
+  t.RecordConsumption(0);  // Expect buffer limit to increase.
+  EXPECT_EQ(8, t.buffer_limit());
+  t.RecordConsumption(0);  // Expect buffer limit to stay the same!
+  EXPECT_EQ(8, t.buffer_limit());
+  t.RecordConsumption(0);  // Expect buffer limit to stay the same!
+  EXPECT_EQ(8, t.buffer_limit());
+}
+
+TEST(PrefetchAutotuner, EnabledSteady) {
+  PrefetchAutotuner t(PrefetchAutotuner::kAutoTune);
+  EXPECT_EQ(1, t.buffer_limit());
+  t.RecordConsumption(0);  // Expect buffer limit to increase.
+  EXPECT_EQ(1, t.buffer_limit());
+  t.RecordConsumption(1);
+  EXPECT_EQ(1, t.buffer_limit());
+  t.RecordConsumption(0);  // Expect buffer limit to increase.
+  EXPECT_EQ(2, t.buffer_limit());
+  t.RecordConsumption(2);
+  EXPECT_EQ(2, t.buffer_limit());
+  t.RecordConsumption(0);  // Expect buffer limit to increase.
+  EXPECT_EQ(4, t.buffer_limit());
+
+  // Never reach zero again.
+  std::vector<size_t> consumption_values = {2, 3, 1, 4, 1, 2, 3, 1};
+  for (int i = 0; i < consumption_values.size(); ++i) {
+    t.RecordConsumption(consumption_values[i]);
+    EXPECT_EQ(4, t.buffer_limit())
+        << "Failed at index " << i << " with value: " << consumption_values[i];
+  }
+}
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/prefetch_dataset_op.cc b/tensorflow/core/kernels/data/prefetch_dataset_op.cc
index 1c548a30d2c8e7f33db85000d0f480b3151d6ecf..536de81fd891f1849cd285d6be4ddefb79fd3386 100644
--- a/tensorflow/core/kernels/data/prefetch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/prefetch_dataset_op.cc
@@ -17,6 +17,7 @@ limitations under the License.
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/kernels/data/dataset.h"
+#include "tensorflow/core/kernels/data/prefetch_autotuner.h"
 #include "tensorflow/core/lib/core/error_codes.pb.h"
 
 namespace tensorflow {
@@ -37,7 +38,8 @@ class PrefetchDatasetOp : public UnaryDatasetOpKernel {
     int64 buffer_size;
     OP_REQUIRES_OK(
         ctx, ParseScalarArgument<int64>(ctx, "buffer_size", &buffer_size));
-    OP_REQUIRES(ctx, buffer_size > 0,
+    OP_REQUIRES(ctx,
+                buffer_size > 0 || buffer_size == PrefetchAutotuner::kAutoTune,
                 errors::InvalidArgument("buffer_size must be > 0"));
 
     *output = new Dataset(ctx, input, buffer_size);
@@ -85,7 +87,8 @@ class PrefetchDatasetOp : public UnaryDatasetOpKernel {
      public:
       explicit Iterator(const Params& params)
           : DatasetIterator<Dataset>(params),
-            input_impl_(params.dataset->input_->MakeIterator(params.prefix)) {}
+            input_impl_(params.dataset->input_->MakeIterator(params.prefix)),
+            auto_tuner_(params.dataset->buffer_size_) {}
 
       ~Iterator() override {
         // Signal the prefetch thread to terminate it. We will then
@@ -113,6 +116,7 @@ class PrefetchDatasetOp : public UnaryDatasetOpKernel {
           // Wait until the next element in the buffer has been
           // produced, or we are shutting down.
           while (!cancelled_ && !prefetch_thread_finished_ && buffer_.empty()) {
+            auto_tuner_.RecordEmpty();
             cond_var_.wait(l);
           }
 
@@ -129,6 +133,7 @@ class PrefetchDatasetOp : public UnaryDatasetOpKernel {
             if (s.ok()) {
               *out_tensors = std::move(buffer_.front().value);
             }
+            auto_tuner_.RecordConsumption(buffer_.size());
             buffer_.pop_front();
             *end_of_sequence = false;
 
@@ -242,7 +247,8 @@ class PrefetchDatasetOp : public UnaryDatasetOpKernel {
           // 1. Wait for a slot in the buffer.
           {
             mutex_lock l(mu_);
-            while (!cancelled_ && buffer_.size() == dataset()->buffer_size_) {
+            while (!cancelled_ &&
+                   buffer_.size() == auto_tuner_.buffer_limit()) {
               cond_var_.wait(l);
             }
 
@@ -323,6 +329,7 @@ class PrefetchDatasetOp : public UnaryDatasetOpKernel {
       mutex parent_mu_ ACQUIRED_BEFORE(mu_);
       const std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(parent_mu_);
       condition_variable cond_var_;
+      PrefetchAutotuner auto_tuner_ GUARDED_BY(mu_);
       std::deque<BufferElement> buffer_ GUARDED_BY(mu_);
       std::unique_ptr<Thread> prefetch_thread_ GUARDED_BY(mu_);
       bool cancelled_ GUARDED_BY(mu_) = false;
diff --git a/tensorflow/core/kernels/data/sql/BUILD b/tensorflow/core/kernels/data/sql/BUILD
index f4698bdaf7ae9767e068e49dad61d2a3d9f739a8..dc591208752c52d3f53484f5a1c564666727bb16 100644
--- a/tensorflow/core/kernels/data/sql/BUILD
+++ b/tensorflow/core/kernels/data/sql/BUILD
@@ -7,18 +7,6 @@ package(
 
 licenses(["notice"])  # Apache 2.0
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        include = ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 cc_library(
     name = "sql",
     srcs = [
diff --git a/tensorflow/core/kernels/data_format_ops.cc b/tensorflow/core/kernels/data_format_ops.cc
index 39ef8ee3ac429e1db96692eb9302616ed9ba61db..4485152e96ec680ef10f1ef15608e4bbb6d09952 100644
--- a/tensorflow/core/kernels/data_format_ops.cc
+++ b/tensorflow/core/kernels/data_format_ops.cc
@@ -37,25 +37,37 @@ class DataFormatDimMapOp : public OpKernel {
     OP_REQUIRES_OK(context, context->GetAttr("src_format", &src_format));
     string dst_format;
     OP_REQUIRES_OK(context, context->GetAttr("dst_format", &dst_format));
+    OP_REQUIRES(context, src_format.size() == 4,
+                errors::InvalidArgument(strings::StrCat(
+                    "Source format must of length 4, received src_format = ",
+                    src_format)));
     OP_REQUIRES(
-        context, src_format == "NHWC",
+        context, dst_format.size() == 4,
         errors::InvalidArgument(strings::StrCat(
-            "Current implementation doesn't support source data format ",
-            src_format)));
-    OP_REQUIRES(context, dst_format == "NCHW",
-                errors::InvalidArgument(strings::StrCat(
-                    "Current implementation doesn't support dst data format ",
-                    dst_format)));
+            "Destination format must of length 4, received dst_format = ",
+            dst_format)));
+    dst_idx_ = Tensor(DT_INT32, {static_cast<int64>(src_format.size())});
+    for (int i = 0; i < src_format.size(); ++i) {
+      for (int j = 0; j < dst_format.size(); ++j) {
+        if (dst_format[j] == src_format[i]) {
+          dst_idx_.vec<int>()(i) = j;
+          break;
+        }
+      }
+    }
   }
 
   void Compute(OpKernelContext* context) override {
     const Tensor& input = context->input(0);
-    Tensor* output = nullptr;
+    Tensor* output;
     OP_REQUIRES_OK(context,
                    context->allocate_output(0, input.shape(), &output));
     functor::DataFormatDimMap<Device, T>()(context->eigen_device<Device>(),
-                                           input.flat<T>(), output->flat<T>());
+                                           input.flat<T>(), output->flat<T>(),
+                                           dst_idx_.vec<int>());
   }
+
+  Tensor dst_idx_;
 };
 
 template <typename Device, typename T>
@@ -147,11 +159,11 @@ TF_CALL_int64(REGISTER_KERNEL);
 #if GOOGLE_CUDA
 // Forward declarations of the functor specializations for GPU.
 namespace functor {
-#define DECLARE_GPU_SPEC(T)                                \
-  template <>                                              \
-  void DataFormatDimMap<GPUDevice, T>::operator()(         \
-      const GPUDevice& d, typename TTypes<T>::ConstFlat x, \
-      typename TTypes<T>::Flat y);                         \
+#define DECLARE_GPU_SPEC(T)                                    \
+  template <>                                                  \
+  void DataFormatDimMap<GPUDevice, T>::operator()(             \
+      const GPUDevice& d, typename TTypes<T>::ConstFlat x,     \
+      typename TTypes<T>::Flat y, const TTypes<int>::Vec dst); \
   extern template struct DataFormatDimMap<GPUDevice, T>;
 #define DECLARE_GPU_SPECS(T) DECLARE_GPU_SPEC(T);
 TF_CALL_int32(DECLARE_GPU_SPECS);
diff --git a/tensorflow/core/kernels/data_format_ops.h b/tensorflow/core/kernels/data_format_ops.h
index 2ccc919586551cefa887718481277d4a0e673dbb..1ca144cb400ff828d334495b57572b67f60e28ef 100644
--- a/tensorflow/core/kernels/data_format_ops.h
+++ b/tensorflow/core/kernels/data_format_ops.h
@@ -27,15 +27,25 @@ namespace functor {
 template <typename Device, typename T>
 struct DataFormatDimMap {
   void operator()(const Device& d, typename TTypes<T>::ConstFlat x,
-                  typename TTypes<T>::Flat y) {
+                  typename TTypes<T>::Flat y, const TTypes<int>::Vec dst) {
     auto zero = x.constant(0);
     auto one = x.constant(1);
-    auto three = x.constant(3);
+    auto two = x.constant(2);
+
+    auto f_zero = x.constant(dst(0));
+    auto f_one = x.constant(dst(1));
+    auto f_two = x.constant(dst(2));
+    auto f_three = x.constant(dst(3));
+
     auto four = x.constant(4);
     auto x_mod = (x + four) % 4;
+
     auto is_zero = (x_mod == zero);
-    auto is_three = (x_mod == three);
-    y.device(d) = is_zero.select(zero, is_three.select(one, x_mod + one));
+    auto is_one = (x_mod == one);
+    auto is_two = (x_mod == two);
+
+    y.device(d) = is_zero.select(
+        f_zero, is_one.select(f_one, is_two.select(f_two, f_three)));
   }
 };
 
diff --git a/tensorflow/core/kernels/decode_csv_op.cc b/tensorflow/core/kernels/decode_csv_op.cc
index 0c42f632521dd86760e791626c8978c0b1e82709..3eed847c16229f20df7495e0f17b4e5e35a64a8f 100644
--- a/tensorflow/core/kernels/decode_csv_op.cc
+++ b/tensorflow/core/kernels/decode_csv_op.cc
@@ -34,6 +34,19 @@ class DecodeCSVOp : public OpKernel {
                 errors::InvalidArgument("Out type too large"));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("field_delim", &delim));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("use_quote_delim", &use_quote_delim_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("select_cols", &select_cols_));
+    OP_REQUIRES(
+        ctx, out_type_.size() == select_cols_.size() || select_cols_.empty(),
+        errors::InvalidArgument("select_cols should match output size"));
+    select_all_cols_ = select_cols_.empty();
+    for (int i = 1; i < select_cols_.size(); i++) {
+      OP_REQUIRES(ctx, select_cols_[i - 1] < select_cols_[i],
+                  errors::InvalidArgument(
+                      "select_cols should be strictly increasing indices"));
+    }
+    OP_REQUIRES(
+        ctx, select_cols_.empty() || select_cols_.front() >= 0,
+        errors::InvalidArgument("select_cols should be non-negative indices"));
     OP_REQUIRES(ctx, delim.size() == 1,
                 errors::InvalidArgument("field_delim should be only 1 char"));
     delim_ = delim[0];
@@ -183,13 +196,18 @@ class DecodeCSVOp : public OpKernel {
 
  private:
   std::vector<DataType> out_type_;
+  std::vector<int64> select_cols_;
   char delim_;
   bool use_quote_delim_;
+  bool select_all_cols_;
   string na_value_;
 
   void ExtractFields(OpKernelContext* ctx, StringPiece input,
                      std::vector<string>* result) {
     int64 current_idx = 0;
+    int64 num_fields_parsed = 0;
+    int64 selector_idx = 0;  // Keep track of index into select_cols
+
     if (!input.empty()) {
       while (static_cast<size_t>(current_idx) < input.size()) {
         if (input[current_idx] == '\n' || input[current_idx] == '\r') {
@@ -198,6 +216,10 @@ class DecodeCSVOp : public OpKernel {
         }
 
         bool quoted = false;
+        bool include =
+            (select_all_cols_ || select_cols_[selector_idx] ==
+                                     static_cast<size_t>(num_fields_parsed));
+
         if (use_quote_delim_ && input[current_idx] == '"') {
           quoted = true;
           current_idx++;
@@ -214,7 +236,7 @@ class DecodeCSVOp : public OpKernel {
                             input[current_idx] != '\r',
                         errors::InvalidArgument(
                             "Unquoted fields cannot have quotes/CRLFs inside"));
-            field += input[current_idx];
+            if (include) field += input[current_idx];
             current_idx++;
           }
 
@@ -226,14 +248,14 @@ class DecodeCSVOp : public OpKernel {
               (static_cast<size_t>(current_idx) < input.size() - 1) &&
               (input[current_idx] != '"' || input[current_idx + 1] != delim_)) {
             if (input[current_idx] != '"') {
-              field += input[current_idx];
+              if (include) field += input[current_idx];
               current_idx++;
             } else {
               OP_REQUIRES(
                   ctx, input[current_idx + 1] == '"',
                   errors::InvalidArgument("Quote inside a string has to be "
                                           "escaped by another quote"));
-              field += '"';
+              if (include) field += '"';
               current_idx += 2;
             }
           }
@@ -250,11 +272,20 @@ class DecodeCSVOp : public OpKernel {
           current_idx += 2;
         }
 
-        result->push_back(field);
+        num_fields_parsed++;
+        if (include) {
+          result->push_back(field);
+          selector_idx++;
+          if (selector_idx == select_cols_.size()) return;
+        }
       }
 
+      bool include =
+          (select_all_cols_ || select_cols_[selector_idx] ==
+                                   static_cast<size_t>(num_fields_parsed));
       // Check if the last field is missing
-      if (input[input.size() - 1] == delim_) result->push_back(string());
+      if (include && input[input.size() - 1] == delim_)
+        result->push_back(string());
     }
   }
 };
diff --git a/tensorflow/core/kernels/decode_image_op.cc b/tensorflow/core/kernels/decode_image_op.cc
index 912d04c1536600348e8263f03709f2305607d11f..2cafa44f37a76116386f0d37d50cb66ff37f1ed8 100644
--- a/tensorflow/core/kernels/decode_image_op.cc
+++ b/tensorflow/core/kernels/decode_image_op.cc
@@ -41,9 +41,9 @@ enum FileFormat {
 // Classify the contents of a file based on starting bytes (the magic number).
 FileFormat ClassifyFileFormat(StringPiece data) {
   // The 4th byte of JPEG is '\xe0' or '\xe1', so check just the first three
-  if (data.starts_with("\xff\xd8\xff")) return kJpgFormat;
-  if (data.starts_with("\x89PNG\r\n\x1a\n")) return kPngFormat;
-  if (data.starts_with("\x47\x49\x46\x38")) return kGifFormat;
+  if (str_util::StartsWith(data, "\xff\xd8\xff")) return kJpgFormat;
+  if (str_util::StartsWith(data, "\x89PNG\r\n\x1a\n")) return kPngFormat;
+  if (str_util::StartsWith(data, "\x47\x49\x46\x38")) return kGifFormat;
   return kUnknownFormat;
 }
 
diff --git a/tensorflow/core/kernels/decode_proto_op.cc b/tensorflow/core/kernels/decode_proto_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b4e5b776ed656a16cabde5e6823cf5c21efcaa59
--- /dev/null
+++ b/tensorflow/core/kernels/decode_proto_op.cc
@@ -0,0 +1,1011 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// DecodeProto is a TensorFlow Op which extracts arbitrary fields
+// from protos serialized as strings.
+//
+// See docs in ../ops/decode_proto_op.cc.
+//
+// This implementation reads the serialized format using a handful of
+// calls from the WireFormatLite API used by generated proto code.
+// WireFormatLite is marked as an "internal" proto API but is widely
+// used in practice and highly unlikely to change.
+// This will be much faster than the previous implementation based on
+// constructing a temporary dynamic message in memory and using the
+// proto reflection api to read it.
+// It can be used with any proto whose descriptors are available at
+// runtime but should be competitive in speed with approaches that
+// compile in the proto definitions.
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "third_party/eigen3/Eigen/Core"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/protobuf.h"
+#include "tensorflow/core/util/proto/decode.h"
+#include "tensorflow/core/util/proto/descriptors.h"
+#include "tensorflow/core/util/ptr_util.h"
+
+namespace tensorflow {
+namespace {
+
+using ::tensorflow::MakeUnique;
+using ::tensorflow::protobuf::Descriptor;
+using ::tensorflow::protobuf::DescriptorPool;
+using ::tensorflow::protobuf::DynamicMessageFactory;
+using ::tensorflow::protobuf::FieldDescriptor;
+using ::tensorflow::protobuf::Message;
+using ::tensorflow::protobuf::TextFormat;
+using ::tensorflow::protobuf::internal::WireFormatLite;
+using ::tensorflow::protobuf::io::CodedInputStream;
+
+const bool kFailOnDecodeError = true;
+
+// Returns true if the proto field type can be converted to the
+// tensorflow::DataType.
+bool CheckOutputType(FieldDescriptor::Type field_type, DataType output_type) {
+  switch (field_type) {
+    case WireFormatLite::TYPE_DOUBLE:
+      return output_type == tensorflow::DT_DOUBLE;
+    case WireFormatLite::TYPE_FLOAT:
+      return output_type == tensorflow::DT_FLOAT ||
+             output_type == tensorflow::DT_DOUBLE;
+    case WireFormatLite::TYPE_INT64:
+      return output_type == tensorflow::DT_INT64;
+    case WireFormatLite::TYPE_UINT64:
+      return output_type == tensorflow::DT_INT64;
+    case WireFormatLite::TYPE_INT32:
+      return output_type == tensorflow::DT_INT32;
+    case WireFormatLite::TYPE_FIXED64:
+      return output_type == tensorflow::DT_INT64;
+    case WireFormatLite::TYPE_FIXED32:
+      return output_type == tensorflow::DT_INT32 ||
+             output_type == tensorflow::DT_INT64;
+    case WireFormatLite::TYPE_BOOL:
+      return output_type == tensorflow::DT_BOOL;
+    case WireFormatLite::TYPE_STRING:
+      return output_type == tensorflow::DT_STRING;
+    case WireFormatLite::TYPE_GROUP:
+      return output_type == tensorflow::DT_STRING;
+    case WireFormatLite::TYPE_MESSAGE:
+      return output_type == tensorflow::DT_STRING;
+    case WireFormatLite::TYPE_BYTES:
+      return output_type == tensorflow::DT_STRING;
+    case WireFormatLite::TYPE_UINT32:
+      return output_type == tensorflow::DT_INT32 ||
+             output_type == tensorflow::DT_INT64;
+    case WireFormatLite::TYPE_ENUM:
+      return output_type == tensorflow::DT_INT32;
+    case WireFormatLite::TYPE_SFIXED32:
+      return output_type == tensorflow::DT_INT32;
+    case WireFormatLite::TYPE_SFIXED64:
+      return output_type == tensorflow::DT_INT64;
+    case WireFormatLite::TYPE_SINT32:
+      return output_type == tensorflow::DT_INT32;
+    case WireFormatLite::TYPE_SINT64:
+      return output_type == tensorflow::DT_INT64;
+      // default: intentionally omitted in order to enable static checking.
+  }
+}
+
+// A FieldInfo holds a handful of information from the FieldDescriptor
+// and user attributes.
+struct FieldInfo {
+  FieldInfo(const FieldDescriptor* field_desc, int user_index)
+      : output_index(user_index) {
+    // Without this intermediate data structure, the profile had hotspots
+    // calling methods of FieldDescriptor.
+    number = field_desc->number();
+
+    // The wire format library defines the same constants used in
+    // descriptor.proto. This static_cast is safe because they
+    // are guaranteed to stay in sync.
+    // We need the field type from the FieldDescriptor here
+    // because the wire format doesn't tell us anything about
+    // what happens inside a packed repeated field: there is
+    // enough information in the wire format to skip the
+    // whole field but not enough to know how to parse what's
+    // inside. For that we go to the schema.
+    type = static_cast<WireFormatLite::FieldType>(field_desc->type());
+    is_repeated = field_desc->is_repeated();
+  }
+
+  // Disable copy and move.
+  FieldInfo(const FieldInfo&) = delete;
+  FieldInfo& operator=(const FieldInfo&) = delete;
+
+  // Internally we sort field descriptors by wire number for
+  // fast lookup. In general this is different from the order
+  // given by the user. Output_index gives the index into
+  // the field_names and output_types attributes and into
+  // the output tensor list.
+  int output_index = -1;
+
+  // This is a cache of the relevant fields from `FieldDescriptorProto`.
+  // This was added after noticing that FieldDescriptor->type() was
+  // using 6% of the cpu profile.
+  WireFormatLite::FieldType type;
+  int number;
+  bool is_repeated;
+};
+
+// A CountCollector counts sizes of repeated and optional fields in a proto.
+//
+// Each field is tracked by a single CountCollector instance. The
+// instance manages a single count, which is stored as a pointer (it
+// is intended to be a reference to the `sizes` output which is being
+// filled in). The pointer is passed in at initialization.
+//
+// Counting is done as a separate pass in order to allocate output tensors
+// all at once. This allows the TensorFlow runtime to optimize allocation
+// for the consumer, while removing the need for copying inside this op.
+// After this pass, the DenseCollector class (below) gathers the data:
+// It is more complex and provides better motivation for the API here.
+class CountCollector {
+ public:
+  // Default constructor allows the collector to be a vector element.
+  CountCollector() = default;
+
+  // The count may be stored inside an Eigen Tensor to eliminate copying.
+  explicit CountCollector(int32* count) : count_ptr_(count) {}
+
+  // Reads (in this case counts) a single value.
+  Status ReadValue(CodedInputStream* input, const FieldInfo& field) {
+    // Only repeated fields can have count > 1.
+    if (*count_ptr_ == 0 || field.is_repeated) {
+      (*count_ptr_)++;
+    }
+    // We expect a wire type based on the schema field_type, to allow
+    // a little more checking.
+    if (!SkipValue(input, field)) {
+      return errors::DataLoss("ReadValue: Failed skipping field when counting");
+    }
+    return Status::OK();
+  }
+
+  // Reads (in this case counts) a length-delimited list of values.
+  Status ReadPackedValues(CodedInputStream* input, const FieldInfo& field,
+                          size_t buf_size) {
+    if (buf_size == 0) {
+      return Status::OK();
+    }
+
+    const void* tmpbuf;
+    int unused_max_buf_size;
+
+    input->GetDirectBufferPointerInline(&tmpbuf, &unused_max_buf_size);
+    // This is safe because the underlying storage for the CodedInputStream is
+    // owned by the input tensor. If it were a Cord or file-backed stream this
+    // pointer would go stale after the bytes were skipped.
+    const uint8* buf = reinterpret_cast<const uint8*>(tmpbuf);
+
+    // Important: we skipped the input->{Push,Pop}Limit() calls for speed,
+    // so the bounds check on buf_size inside Skip() is critical, and
+    // must be done before scanning the contents.
+    if (!input->Skip(buf_size)) {
+      return errors::DataLoss("ReadPackedValues: Skipping packed field failed");
+    }
+
+    // Dispatch to the appropriately typed field reader based on the
+    // schema type.
+    Status st;
+    switch (field.type) {
+      case WireFormatLite::TYPE_DOUBLE:
+        st = CountPackedFixed<double>(buf, buf_size);
+        break;
+      case WireFormatLite::TYPE_FLOAT:
+        st = CountPackedFixed<float>(buf, buf_size);
+        break;
+      case WireFormatLite::TYPE_INT64:
+        st = CountPackedVarint(buf, buf_size);
+        break;
+      case WireFormatLite::TYPE_UINT64:
+        st = CountPackedVarint(buf, buf_size);
+        break;
+      case WireFormatLite::TYPE_INT32:
+        st = CountPackedVarint(buf, buf_size);
+        break;
+      case WireFormatLite::TYPE_FIXED64:
+        st = CountPackedFixed<uint64>(buf, buf_size);
+        break;
+      case WireFormatLite::TYPE_FIXED32:
+        st = CountPackedFixed<uint32>(buf, buf_size);
+        break;
+      case WireFormatLite::TYPE_BOOL:
+        st = CountPackedVarint(buf, buf_size);
+        break;
+      case WireFormatLite::TYPE_STRING:
+        st = errors::DataLoss("TYPE_STRING encountered as packed");
+        break;
+      case WireFormatLite::TYPE_GROUP:
+        st = errors::DataLoss("TYPE_GROUP encountered as packed");
+        break;
+      case WireFormatLite::TYPE_MESSAGE:
+        st = errors::DataLoss("TYPE_MESSAGE encountered as packed");
+        break;
+      case WireFormatLite::TYPE_BYTES:
+        st = errors::DataLoss("TYPE_BYTES encountered as packed");
+        break;
+      case WireFormatLite::TYPE_UINT32:
+        st = CountPackedVarint(buf, buf_size);
+        break;
+      case WireFormatLite::TYPE_ENUM:
+        st = CountPackedVarint(buf, buf_size);
+        break;
+      case WireFormatLite::TYPE_SFIXED32:
+        st = CountPackedFixed<int32>(buf, buf_size);
+        break;
+      case WireFormatLite::TYPE_SFIXED64:
+        st = CountPackedFixed<int64>(buf, buf_size);
+        break;
+      case WireFormatLite::TYPE_SINT32:
+        st = CountPackedVarint(buf, buf_size);
+        break;
+      case WireFormatLite::TYPE_SINT64:
+        st = CountPackedVarint(buf, buf_size);
+        break;
+        // default: intentionally omitted in order to enable static checking.
+    }
+    if (!st.ok()) {
+      return st;
+    }
+
+    if (!field.is_repeated && *count_ptr_ > 1) {
+      *count_ptr_ = 1;
+    }
+    return Status::OK();
+  }
+
+ private:
+  // Skips a length-delimited value.
+  static bool SkipBytes(CodedInputStream* input) {
+    uint32 length;
+    if (!input->ReadVarint32(&length)) {
+      return false;
+    }
+    return input->Skip(length);
+  }
+
+  // Counts the number of packed varints in an array.
+  // The end of a varint is signaled by a value < 0x80,
+  // so counting them requires parsing the bytestream.
+  // It is the caller's responsibility to ensure that len > 0.
+  Status CountPackedVarint(const uint8* buf, size_t len) {
+    const uint8* bound = buf + len;
+    int count;
+
+    // The last byte in a valid encoded varint is guaranteed to have
+    // the high bit unset. We rely on this property to prevent
+    // ReadVarint64FromArray from going out of bounds, so validate
+    // the end of the buf before scanning anything.
+    if (bound[-1] & 0x80) {
+      return errors::DataLoss("Corrupt packed varint");
+    }
+
+    // Now we can trust ReadVarint64FromArray to stay in bounds.
+    for (count = 0; buf < bound; ++count) {
+      uint64 temp;
+      bool ok;
+      buf = internal::ReadVarint64FromArray(buf, &ok, &temp);
+      if (!ok) {
+        return errors::DataLoss("Corrupt packed varint");
+      }
+    }
+
+    *count_ptr_ += count;
+    return Status::OK();
+  }
+
+  // Counts the number of fixed-size values in a packed field.
+  // This can be done without actually parsing anything.
+  template <typename T>
+  Status CountPackedFixed(const uint8* unused_buf, size_t len) {
+    int count = len / sizeof(T);
+    if (count * sizeof(T) != len) {
+      return errors::DataLoss(
+          "Illegal data length for packed fixed-size type: ", len);
+    }
+    *count_ptr_ += len / sizeof(T);
+    return Status::OK();
+  }
+
+  // Skips a single value in the input stream.
+  // Dispatches to the appropriately typed field skipper based on the
+  // schema type tag.
+  // This is not as permissive as just handling the wire type.
+  static bool SkipValue(CodedInputStream* input, const FieldInfo& field) {
+    uint32 tmp32;
+    protobuf_uint64 tmp64;
+    switch (field.type) {
+      case WireFormatLite::TYPE_DOUBLE:
+        return input->ReadLittleEndian64(&tmp64);
+      case WireFormatLite::TYPE_FLOAT:
+        return input->ReadLittleEndian32(&tmp32);
+      case WireFormatLite::TYPE_INT64:
+        return input->ReadVarint64(&tmp64);
+      case WireFormatLite::TYPE_UINT64:
+        return input->ReadVarint64(&tmp64);
+      case WireFormatLite::TYPE_INT32:
+        return input->ReadVarint32(&tmp32);
+      case WireFormatLite::TYPE_FIXED64:
+        return input->ReadLittleEndian64(&tmp64);
+      case WireFormatLite::TYPE_FIXED32:
+        return input->ReadLittleEndian32(&tmp32);
+      case WireFormatLite::TYPE_BOOL:
+        return input->ReadVarint32(&tmp32);
+      case WireFormatLite::TYPE_STRING:
+        return SkipBytes(input);
+      case WireFormatLite::TYPE_GROUP:
+        return WireFormatLite::SkipField(
+            input, WireFormatLite::MakeTag(
+                       field.number, WireFormatLite::WIRETYPE_START_GROUP));
+      case WireFormatLite::TYPE_MESSAGE:
+        return SkipBytes(input);
+      case WireFormatLite::TYPE_BYTES:
+        return SkipBytes(input);
+      case WireFormatLite::TYPE_UINT32:
+        return input->ReadVarint32(&tmp32);
+      case WireFormatLite::TYPE_ENUM:
+        return input->ReadVarint32(&tmp32);
+      case WireFormatLite::TYPE_SFIXED32:
+        return input->ReadLittleEndian32(&tmp32);
+      case WireFormatLite::TYPE_SFIXED64:
+        return input->ReadLittleEndian64(&tmp64);
+      case WireFormatLite::TYPE_SINT32:
+        return input->ReadVarint32(&tmp32);
+      case WireFormatLite::TYPE_SINT64:
+        return input->ReadVarint64(&tmp64);
+        // default: intentionally omitted in order to enable static checking.
+    }
+  }
+
+  int32* count_ptr_ = nullptr;
+};
+
+// A DenseCollector accumulates values from a proto into a tensor.
+//
+// There is an instance of DenseCollector for each field of each
+// proto. The DenseCollector deserializes the value from the wire
+// directly into the preallocated output Tensor.
+//
+// This class is named DenseCollector because in the future there should
+// be a SparseCollector that accumulates field data into sparse tensors if
+// the user requests it.
+class DenseCollector {
+ public:
+  // Default constructor allows the collector to be a vector element.
+  DenseCollector() = default;
+
+  // A DenseCollector applies to one field of a serialized message.
+  DenseCollector(uint8* datap, DataType dtype, int max_repeat_count)
+      : datap_(datap), dtype_(dtype), max_repeat_count_(max_repeat_count) {}
+
+  // Reads a value from the input stream and stores it.
+  //
+  // Always inlining gave a ~50% speedup on microbenchmarks at one point.
+  // TODO(nix): try removing it to see if that still holds.
+  // TODO(jsimsa): ABSL_ATTRIBUTE_ALWAYS_INLINE
+  Status ReadValue(CodedInputStream* input, const FieldInfo& field) {
+    // For required and optional fields, we overwrite values[0] with
+    // the latest one in the wire stream.
+    // See https://developers.google.com/protocol-buffers/docs/encoding#optional
+    // Only for repeated fields do we advance the next_repeat_index_ past 1.
+    // TODO(nix): to handle oneof we must also zero out any previous values
+    //  seen on the wire.
+    int32 index = 0;
+    if (field.is_repeated) {
+      index = next_repeat_index_;
+    }
+    next_repeat_index_ = index + 1;
+
+    return internal::ReadValue(input, field.type, field.number, dtype_, index,
+                               datap_);
+  }
+
+  // Reads and stores a length-delimited list of values.
+  Status ReadPackedValues(CodedInputStream* input, const FieldInfo& field,
+                          const size_t buf_size) {
+    const void* buf;
+    int unused_max_buf_size;
+    input->GetDirectBufferPointerInline(&buf, &unused_max_buf_size);
+    // This is safe because the underlying storage for the CodedInputStream is
+    // owned by the input tensor. If it were a Cord or file-backed stream this
+    // pointer would go stale after the bytes were skipped.
+    if (!input->Skip(buf_size)) {
+      return errors::DataLoss(
+          "ReadPackedValues: Skipping packed field failed.  Field tag: ",
+          field.number);
+    }
+
+    // Setting stride=0 causes new values to overwrite old ones for
+    // non-repeated fields.
+    const int stride = field.is_repeated ? 1 : 0;
+
+    if (next_repeat_index_ >= max_repeat_count_) {
+      return errors::DataLoss(
+          "ReadPackedValues: Tried to write more entries than allowed.  "
+          "Field tag: ",
+          field.number, ", Max entries allowed: ", max_repeat_count_);
+    } else {
+      return internal::ReadPackedFromArray(buf, buf_size, field.type,
+                                           field.number, dtype_, stride,
+                                           &next_repeat_index_, datap_);
+    }
+  }
+
+  // Fills in any missing values in the output array with defaults.
+  // Dispatches to the appropriately typed field default based on the
+  // runtime type tag.
+  Status FillWithDefaults() {
+    switch (dtype_) {
+      case DataType::DT_FLOAT:
+        return FillDefault<float>();
+      case DataType::DT_DOUBLE:
+        return FillDefault<double>();
+      case DataType::DT_INT32:
+        return FillDefault<int32>();
+      case DataType::DT_UINT8:
+        return FillDefault<uint8>();
+      case DataType::DT_INT8:
+        return FillDefault<int8>();
+      case DataType::DT_STRING:
+        return FillDefault<string>();
+      case DataType::DT_INT64:
+        return FillDefault<int64>();
+      case DataType::DT_BOOL:
+        return FillDefault<bool>();
+      default:
+        // There are many tensorflow dtypes not handled here, but they
+        // should not come up unless type casting is added to the Op.
+        // Chaining with tf.cast() should do the right thing until then.
+        return errors::DataLoss(
+            "Failed filling defaults in unknown tf::DataType");
+    }
+  }
+
+ private:
+  // Fills empty values in the dense representation with a
+  // default value. This uses next_repeat_index_ which counts the number
+  // of parsed values for the field.
+  template <class T>
+  Status FillDefault() {
+    for (int i = next_repeat_index_; i < max_repeat_count_; i++) {
+      reinterpret_cast<T*>(datap_)[i] = T();
+    }
+    return Status::OK();
+  }
+
+  int32 next_repeat_index_ = 0;
+
+  // This is a pointer to data_[message_index_].
+  // There is no bounds checking at this level: we computed the max
+  // repeat size for each field in CountCollector and use the same
+  // code to traverse it here, so we are guaranteed not to be called
+  // for more items than we have allocated space.
+  void* const datap_ = nullptr;
+
+  const DataType dtype_ = DataType::DT_INVALID;
+  const int max_repeat_count_ = 0;
+};
+
+class DecodeProtoOp : public OpKernel {
+ public:
+  explicit DecodeProtoOp(OpKernelConstruction* context) : OpKernel(context) {
+    string descriptor_source;
+    OP_REQUIRES_OK(context,
+                   context->GetAttr("descriptor_source", &descriptor_source));
+
+    // We always get back a desc_pool, but we may not own it. If we own it,
+    // owned_desc_pool_ will be filled in.
+    DescriptorPool const* desc_pool;
+    OP_REQUIRES_OK(context, GetDescriptorPool(context->env(), descriptor_source,
+                                              &desc_pool, &owned_desc_pool_));
+
+    string message_type;
+    OP_REQUIRES_OK(context, context->GetAttr("message_type", &message_type));
+
+    const Descriptor* message_desc =
+        desc_pool->FindMessageTypeByName(message_type);
+    OP_REQUIRES(context, message_desc != nullptr,
+                errors::InvalidArgument("No descriptor found for message type ",
+                                        message_type));
+
+    std::vector<string> field_names;
+    OP_REQUIRES_OK(context, context->GetAttr("field_names", &field_names));
+    std::vector<DataType> output_types;
+    OP_REQUIRES_OK(context, context->GetAttr("output_types", &output_types));
+    OP_REQUIRES(
+        context, field_names.size() == output_types.size(),
+        errors::InvalidArgument("field_names and output_types attributes must "
+                                "have the same length"));
+
+    // Gather the field descriptors and check that requested output types match.
+
+    int field_index = 0;
+    std::vector<const FieldDescriptor*> field_descs;
+    for (const string& name : field_names) {
+      auto fd = message_desc->FindFieldByName(name);
+      OP_REQUIRES(context, fd != nullptr,
+                  errors::InvalidArgument("Unknown field: ", name,
+                                          " in message type ", message_type));
+      OP_REQUIRES(context,
+                  CheckOutputType(fd->type(), output_types[field_index]),
+                  // Many TensorFlow types don't have corresponding proto types
+                  // and the user will get an error if they are requested. It
+                  // would be nice to allow conversions here, but tf.cast
+                  // already exists so we don't duplicate the functionality.
+                  // Known unhandled types:
+                  //   DT_INT16 DT_COMPLEX64 DT_QINT8 DT_QUINT8 DT_QINT32
+                  //   DT_BFLOAT16 DT_QINT16 DT_QUINT16 DT_UINT16
+                  errors::InvalidArgument("Unexpected output type for ",
+                                          fd->full_name(), ": ", fd->cpp_type(),
+                                          " to ", output_types[field_index]));
+
+      field_index++;
+      field_descs.push_back(fd);
+    }
+
+    // Internally we want the field_descs sorted by their number on the wire.
+    // But the output tensors are allocated in the order given by the caller.
+    // Build a mapping i->j, where field_descs[i] corresponds to outputs[j].
+    std::vector<int> output_indices;
+    output_indices.reserve(field_names.size());
+    for (int i = 0; i < field_names.size(); i++) {
+      output_indices.push_back(i);
+    }
+    std::sort(output_indices.begin(), output_indices.end(),
+              [field_descs](int a, int b) {
+                return field_descs[a]->number() < field_descs[b]->number();
+              });
+
+    // Now store the fields in sorted order.
+    for (int i = 0; i < field_names.size(); i++) {
+      fields_.push_back(MakeUnique<FieldInfo>(field_descs[output_indices[i]],
+                                              output_indices[i]));
+    }
+
+    message_prototype_ = message_factory_.GetPrototype(message_desc);
+    OP_REQUIRES(context, message_prototype_ != nullptr,
+                errors::InvalidArgument("Couldn't get prototype message: ",
+                                        message_desc->full_name()));
+    string format;
+    OP_REQUIRES_OK(context, context->GetAttr("message_format", &format));
+    OP_REQUIRES(
+        context, format == "binary" || format == "text",
+        errors::InvalidArgument("format must be one of binary or text"));
+    is_binary_ = format == "binary";
+
+    // Enable the initial protobuf sanitizer, which is much
+    // more expensive than the decoder.
+    // TODO(nix): Remove this once the fast decoder
+    // has passed security review.
+    OP_REQUIRES_OK(context, context->GetAttr("sanitize", &sanitize_));
+  }
+
+  void Compute(OpKernelContext* ctx) override {
+    const Tensor& buf_tensor = ctx->input(0);
+    int message_count = buf_tensor.NumElements();
+    OP_REQUIRES(ctx, message_count >= 1,
+                errors::InvalidArgument(
+                    "Bufs argument must contain at least one value"));
+
+    int field_count = fields_.size();
+
+    // Save the argument shape for later, then flatten the input
+    // Tensor since we are working componentwise. We will restore
+    // the same shape in the returned Tensor.
+    const TensorShape& shape_prefix = buf_tensor.shape();
+
+    TensorShape sizes_shape = shape_prefix;
+    sizes_shape.AddDim(field_count);
+    Tensor* sizes_tensor = nullptr;
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, sizes_shape, &sizes_tensor));
+
+    // This is used to allocate binary bufs if used. It serves only
+    // to define memory ownership.
+    std::vector<string> tmp_binary_bufs(message_count);
+
+    // These are the actual buffers to use, which may be in tmp_binary_bufs
+    // or may be pointers into the buf_tensor. Either way they are not owned
+    // here.
+    std::vector<const string*> bufs;
+
+    if (is_binary_ && !sanitize_) {
+      // Fast path.
+      for (int mi = 0; mi < message_count; ++mi) {
+        const string* buf = &buf_tensor.flat<string>()(mi);
+        bufs.push_back(buf);
+      }
+    } else {
+      // We will have to allocate a copy, either to convert from text to
+      // binary or to sanitize a binary proto.
+      for (int mi = 0; mi < message_count; ++mi) {
+        ReserializeMessage(ctx, buf_tensor.flat<string>()(mi),
+                           &tmp_binary_bufs[mi]);
+        if (!ctx->status().ok()) {
+          return;
+        }
+        bufs.push_back(&tmp_binary_bufs[mi]);
+      }
+    }
+
+    // Walk through all the strings in the input tensor, counting
+    // the number of fields in each.
+    // We can't allocate our actual output Tensor until we know the
+    // maximum repeat count, so we do a first pass through the serialized
+    // proto just counting fields.
+    // We always allocate at least one value so that optional fields
+    // are populated with default values - this avoids a TF
+    // conditional when handling the output data.
+    // The caller can distinguish between real data and defaults
+    // using the repeat count matrix that is returned by decode_proto.
+    std::vector<int32> max_sizes(field_count, 1);
+    for (int mi = 0; mi < message_count; ++mi) {
+      CountFields(ctx, mi, *bufs[mi], sizes_tensor, &max_sizes);
+      if (!ctx->status().ok()) {
+        return;
+      }
+    }
+
+    // Allocate the output tensors now that we've seen the max size.
+    // TODO(nix): Use allocate_output_or_forward_input for the largest
+    //   output tensor. This can avoid one large allocation by re-using
+    //   the memory of the input tensor.
+    std::vector<Tensor*> outputs(field_count);
+    for (int fi = 0; fi < field_count; ++fi) {
+      TensorShape flat_shape = {static_cast<int64>(message_count),
+                                max_sizes[fi]};
+      TensorShape out_shape = shape_prefix;
+      out_shape.AddDim(max_sizes[fi]);
+
+      // Surprisingly we don't specify the types from the output_types
+      // attribute: that is done for us based on the Op declaration:
+      //  REGISTER_OP(...)
+      //    .Attr("output_types: list(type) >= 0")
+      //    .Output("values: output_types")
+      OP_REQUIRES_OK(ctx,
+                     // ctx->allocate_output(output_indices_[fi] + 1,
+                     ctx->allocate_output(fields_[fi]->output_index + 1,
+                                          out_shape, &outputs[fi]));
+    }
+
+    // Make the second pass through the serialized proto, decoding
+    // into preallocated tensors.
+    AccumulateFields(ctx, bufs, outputs);
+  }
+
+ private:
+  // Copy a serialized message to binary, e.g. to handle text proto inputs.
+  void ReserializeMessage(OpKernelContext* ctx, const string& buf,
+                          string* binary_buf) {
+    // Handle text protos by translating them to binary.
+    std::unique_ptr<Message> message(message_prototype_->New());
+    OP_REQUIRES(ctx, message, errors::DataLoss("Initializing message failed"));
+
+    if (is_binary_) {
+      // If we get here we are sanitizing the input protobuf by parsing
+      // and reserializing it with a trusted (but very slow) library.
+      OP_REQUIRES(ctx, message->ParseFromString(buf),
+                  errors::DataLoss("Unable to parse binary protobuf"));
+    } else {
+      OP_REQUIRES(ctx, TextFormat::ParseFromString(buf, message.get()),
+                  errors::DataLoss("Unable to parse text protobuf"));
+    }
+
+    OP_REQUIRES(ctx, message->SerializeToString(binary_buf),
+                errors::DataLoss("Unable to reserialize text proto as binary"));
+  }
+
+  // Count the number of occurrences of each requested field in a message batch.
+  void CountFields(OpKernelContext* ctx, int message_index, const string& buf,
+                   Tensor* sizes_tensor, std::vector<int32>* max_sizes) {
+    int field_count = fields_.size();
+
+    CodedInputStream input(reinterpret_cast<const uint8*>(buf.c_str()),
+                           buf.size());
+
+    std::vector<int32> field_sizes(field_count, 0);
+    std::vector<CountCollector> counters;
+    counters.reserve(field_count);
+    for (int i = 0; i < field_count; i++) {
+      counters.emplace_back(&field_sizes[i]);
+    }
+
+    Status st = Collect(&input, &counters);
+    if (st.ok() && !input.ConsumedEntireMessage()) {
+      st = errors::DataLoss("CountFields: Failed to consume entire buffer");
+    }
+    if (kFailOnDecodeError) {
+      OP_REQUIRES_OK(ctx, st);  // NOLINT
+    }
+    if (!st.ok()) {
+      // This code suppresses the corrupt proto, treating it as empty
+      // to avoid crashing the process.
+      LOG(WARNING) << "Proto counting error for message type " << message_type_
+                   << ": " << st;
+
+      for (int fi = 0; fi < field_count; fi++) {
+        field_sizes[fi] = 0;
+      }
+      // Finished decoding this message.
+      return;
+    }
+
+    // Update the size tensor and max repeat size for each field.
+    auto sizes = sizes_tensor->flat_inner_dims<int32>();
+    for (int fi = 0; fi < field_count; fi++) {
+      int32 size = field_sizes[fi];
+      sizes(message_index, fields_[fi]->output_index) = size;
+      if ((*max_sizes)[fi] < size) {
+        (*max_sizes)[fi] = size;
+      }
+    }
+  }
+
+  // Parse fields from a serialized message into preallocated tensors.
+  void AccumulateFields(OpKernelContext* ctx,
+                        const std::vector<const string*>& bufs,
+                        std::vector<Tensor*> outputs) {
+    struct TensorInfo {
+      explicit TensorInfo(Tensor* tensor) {
+        // Note that we can decode only max_repeat_count values before overflow.
+        // No other bounds checking is done for repeated fields. For
+        // optional fields there is a check to make sure that only the last
+        // value on the wire appears in the output tensor.
+        dtype = tensor->dtype();
+        last_dim_size = tensor->dim_size(tensor->dims() - 1);
+
+        if (dtype != DT_STRING) {
+          const int element_size = DataTypeSize(dtype);
+          CHECK_GT(element_size, 0);
+          stride = last_dim_size * element_size;
+
+          const int64 flatshape[1] = {tensor->NumElements() * element_size};
+          data = tensor->bit_casted_shaped<uint8, 1>(flatshape).data();
+        } else {
+          // DataTypeSize() returns 0 for string types.
+          stride = last_dim_size * sizeof(string);
+          data = reinterpret_cast<uint8*>(tensor->flat<string>().data());
+        }
+      }
+
+      DataType dtype;
+      int last_dim_size;
+      int stride;
+      uint8* data;
+    };
+
+    int field_count = fields_.size();
+
+    std::vector<TensorInfo> tensors;
+    tensors.reserve(field_count);
+    for (int fi = 0; fi < field_count; fi++) {
+      tensors.emplace_back(outputs[fi]);
+    }
+
+    for (int message_index = 0; message_index < bufs.size(); ++message_index) {
+      const string& buf = *bufs[message_index];
+
+      std::vector<DenseCollector> collectors;
+      collectors.reserve(field_count);
+      for (const TensorInfo& info : tensors) {
+        collectors.emplace_back(info.data + message_index * info.stride,
+                                info.dtype, info.last_dim_size);
+      }
+
+      // Fill in output tensors from the wire.
+      CodedInputStream input(reinterpret_cast<const uint8*>(buf.c_str()),
+                             buf.size());
+      Status st = Collect(&input, &collectors);
+      if (st.ok() && !input.ConsumedEntireMessage()) {
+        st = errors::DataLoss(
+            "AccumulateFields: Failed to consume entire buffer");
+      }
+      if (kFailOnDecodeError) {
+        OP_REQUIRES_OK(ctx, st);  // NOLINT
+      }
+      if (!st.ok()) {
+        // This code suppresses the corrupt proto, treating it as empty
+        // to avoid crashing training.
+        LOG(WARNING) << "Proto counting error for message type "
+                     << message_type_ << ": " << st;
+      }
+
+      // Fill the remainder of the dense outputs with default values.
+      for (auto& collector : collectors) {
+        OP_REQUIRES_OK(ctx, collector.FillWithDefaults());
+      }
+    }
+  }
+
+  // Look up the FieldDescriptor for a particular field number.
+  bool LookupField(int field_number, int* field_index) {
+    // Look up the FieldDescriptor using linear search.
+    // TODO(nix): this could be sped up with binary search, but we are
+    // already way off the fastpath at this point. If you see a hotspot
+    // here, somebody is sending you very inefficient protos.
+    for (int fi = fields_.size() - 1; fi >= 0; fi--) {
+      if (field_number == fields_[fi]->number) {
+        *field_index = fi;
+        return true;
+      }
+    }
+    return false;
+  }
+
+  // Traverses a serialized protobuf, dispatching values to the collectors.
+  template <class CollectorClass>
+  Status Collect(CodedInputStream* input,
+                 std::vector<CollectorClass>* collectors) {
+    int last_good_field_index = -1;
+    bool fields_disordered = false;
+    int prev_field_number = -1;
+    int field_number = -1;
+    int last_good_field_number = -1;
+    int next_good_field_number = fields_[0]->number;
+
+    // The 'tag' variable should always be treated as tainted.
+    for (uint32 tag = input->ReadTag();
+         tag != 0 && WireFormatLite::GetTagWireType(tag) !=
+                         WireFormatLite::WIRETYPE_END_GROUP;
+         tag = input->ReadTag(), prev_field_number = field_number) {
+      field_number = WireFormatLite::GetTagFieldNumber(tag);
+      const FieldInfo* field = nullptr;
+
+      // This takes advantage of the sorted field numbers in most serialized
+      // protos: it tries the next expected field first rather than doing
+      // a lookup by field number.
+      // TODO(nix): haberman@ suggests a hybrid approach with a lookup table
+      // for small field numbers and a hash table for larger ones. This would
+      // be a simpler approach that should offer comparable speed in most
+      // cases.
+      if (field_number == last_good_field_number) {
+        field = fields_[last_good_field_index].get();
+      } else {
+        if (field_number < prev_field_number) {
+          fields_disordered = true;
+        }
+
+        // If fields are out of order, fall back to slow lookup.
+        if (fields_disordered) {
+          int field_index;
+          if (LookupField(field_number, &field_index)) {
+            field = fields_[field_index].get();
+            last_good_field_index = field_index;
+          }
+        } else {
+          // If we see a field that is past the next field we want,
+          // it was empty. Look for the one after that.
+          // Repeat until we run out of fields that we care about.
+          while (field_number >= next_good_field_number) {
+            if (field_number == next_good_field_number) {
+              last_good_field_number = field_number;
+              field = fields_[last_good_field_index + 1].get();
+            }
+
+            // Start looking for the field after the current one.
+            ++last_good_field_index;
+            if (last_good_field_index < fields_.size() - 1) {
+              next_good_field_number =
+                  fields_[last_good_field_index + 1]->number;
+            } else {
+              // Saw something past the last field we care about.
+              // Continue parsing the message just in case there
+              // are disordered fields later, but any remaining
+              // ordered fields will have no effect.
+              next_good_field_number = INT_MAX;
+            }
+          }
+        }
+      }
+
+      if (!field) {
+        // Unknown and unrequested fields are skipped.
+        if (!WireFormatLite::SkipField(input, tag)) {
+          return errors::DataLoss("Failed skipping unrequested field");
+        }
+        continue;
+      }
+
+      Status st = CollectField(*field, WireFormatLite::GetTagWireType(tag),
+                               input, &(*collectors)[last_good_field_index]);
+      if (!st.ok()) {
+        return st;
+      }
+    }
+    return Status::OK();
+  }
+
+  // Collects values for a single field.
+  template <class CollectorClass>
+  Status CollectField(const FieldInfo& field,
+                      WireFormatLite::WireType wire_type,
+                      CodedInputStream* input, CollectorClass* collector) {
+    // The wire format library defines the same constants used in
+    // descriptor.proto. This static_cast is safe because they
+    // are guaranteed to stay in sync.
+    // We need the field type from the FieldDescriptor here
+    // because the wire format doesn't tell us anything about
+    // what happens inside a packed repeated field: there is
+    // enough information in the wire format to skip the
+    // whole field but not enough to know how to parse what's
+    // inside. For that we go to the schema.
+    WireFormatLite::WireType schema_wire_type =
+        WireFormatLite::WireTypeForFieldType(field.type);
+
+    // Handle packed repeated fields. SkipField would skip the
+    // whole length-delimited blob without letting us count the
+    // values, so we have to scan them ourselves.
+    if (wire_type == WireFormatLite::WIRETYPE_LENGTH_DELIMITED &&
+        schema_wire_type != WireFormatLite::WIRETYPE_LENGTH_DELIMITED) {
+      // Handle packed repeated primitives.
+      int length;
+      if (!input->ReadVarintSizeAsInt(&length)) {
+        return errors::DataLoss("CollectField: Failed reading packed size");
+      }
+      Status st = collector->ReadPackedValues(input, field, length);
+      if (!st.ok()) {
+        return st;
+      }
+      return Status::OK();
+    }
+
+    // Read ordinary values, including strings, bytes, and messages.
+    if (wire_type != schema_wire_type) {
+      if (!WireFormatLite::SkipField(
+              input, WireFormatLite::MakeTag(field.number, wire_type))) {
+        return errors::DataLoss(
+            "CollectField: Failed skipping malformed field");
+      }
+      return Status::OK();
+    }
+    return collector->ReadValue(input, field);
+  }
+
+  string message_type_;
+  // Note that fields are sorted by increasing field number,
+  // which is not in general the order given by the user-specified
+  // field_names and output_types Op attributes.
+  std::vector<std::unique_ptr<const FieldInfo>> fields_;
+
+  // Owned_desc_pool_ is null when using descriptor_source=local.
+  std::unique_ptr<DescriptorPool> owned_desc_pool_;
+  DynamicMessageFactory message_factory_;
+  const Message* message_prototype_;
+
+  // True if decoding binary format, false if decoding text format.
+  bool is_binary_;
+
+  // True if the protos should be sanitized before parsing.
+  // Enables the initial protobuf sanitizer, which is much
+  // more expensive than the decoder. The flag defaults to true
+  // but can be set to false for trusted sources.
+  // TODO(nix): flip the default to false when the fast decoder
+  // has passed security review.
+  bool sanitize_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(DecodeProtoOp);
+};
+
+REGISTER_KERNEL_BUILDER(Name("DecodeProtoV2").Device(DEVICE_CPU),
+                        DecodeProtoOp);
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/dense_update_functor.cc b/tensorflow/core/kernels/dense_update_functor.cc
index a878fe9a97059e2d55164600923c4a2e1312161b..3ed3794e01d63d49e5be0406e3f892bfbec2c8c8 100644
--- a/tensorflow/core/kernels/dense_update_functor.cc
+++ b/tensorflow/core/kernels/dense_update_functor.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include "tensorflow/core/kernels/dense_update_functor.h"
 
 #include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/variant_op_registry.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/types.h"
@@ -70,4 +71,59 @@ struct DenseUpdate<CPUDevice, string, ASSIGN> {
 
 }  // namespace functor
 
+#define CPU_DENSE_COPY(T)                                                \
+  case DataTypeToEnum<T>::value: {                                       \
+    functor::DenseUpdate<CPUDevice, T, ASSIGN> copy_functor_;            \
+    copy_functor_(context->eigen_device<CPUDevice>(), tensor->flat<T>(), \
+                  from.flat<T>());                                       \
+    break;                                                               \
+  }
+
+#define INSTANTIATE_GET_VARIANT_COPY_FN(DEVICE, TYPE_CALLER, TYPE_DENSE_COPY) \
+  template <>                                                                 \
+  Status VariantCopyFn<DEVICE>(OpKernelContext * context, const Tensor& from, \
+                               Tensor* to) {                                  \
+    PersistentTensor tmp;                                                     \
+    Tensor* tensor;                                                           \
+    AllocatorAttributes attr;                                                 \
+    attr.set_gpu_compatible(true);                                            \
+    attr.set_nic_compatible(true);                                            \
+    TF_RETURN_IF_ERROR(context->allocate_persistent(                          \
+        from.dtype(), from.shape(), &tmp, &tensor, attr));                    \
+    switch (from.dtype()) {                                                   \
+      TYPE_CALLER(TYPE_DENSE_COPY);                                           \
+      default:                                                                \
+        return errors::InvalidArgument(                                       \
+            "VariantCopyFn: Could not perform a deep copy of variant "        \
+            "element of type: ",                                              \
+            DataTypeString(from.dtype()),                                     \
+            " using device: ", context->device()->name());                    \
+    }                                                                         \
+    *to = *tensor;                                                            \
+    return Status::OK();                                                      \
+  }
+
+INSTANTIATE_GET_VARIANT_COPY_FN(CPUDevice, TF_CALL_ALL_TYPES, CPU_DENSE_COPY);
+
+#if GOOGLE_CUDA
+#define GPU_DENSE_COPY(T)                                                \
+  case DataTypeToEnum<T>::value: {                                       \
+    functor::DenseUpdate<GPUDevice, T, ASSIGN> copy_functor_;            \
+    copy_functor_(context->eigen_device<GPUDevice>(), tensor->flat<T>(), \
+                  from.flat<T>());                                       \
+    break;                                                               \
+  }
+#define TF_CALL_GPU_AND_ADDITIONAL_TYPES(T) \
+  TF_CALL_GPU_ALL_TYPES(T);                 \
+  TF_CALL_int32(T);                         \
+  TF_CALL_int64(T);
+INSTANTIATE_GET_VARIANT_COPY_FN(GPUDevice, TF_CALL_GPU_AND_ADDITIONAL_TYPES,
+                                GPU_DENSE_COPY);
+#undef TF_CALL_GPU_AND_ADDITIONAL_TYPES
+#undef GPU_DENSE_COPY
+#endif  // GOOGLE_CUDA
+
+#undef CPU_DENSE_COPY
+#undef INSTANTIATE_GET_VARIANT_COPY_FN
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/dense_update_functor.h b/tensorflow/core/kernels/dense_update_functor.h
index 4aefe26c545ee5eaf3868b73cd9ace38fd135f53..240c13261eaf1da256a326329c8eb72cce2cbcab 100644
--- a/tensorflow/core/kernels/dense_update_functor.h
+++ b/tensorflow/core/kernels/dense_update_functor.h
@@ -19,11 +19,14 @@ limitations under the License.
 #define EIGEN_USE_THREADS
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor_types.h"
 
 namespace tensorflow {
 
 typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
 #ifdef TENSORFLOW_USE_SYCL
 typedef Eigen::SyclDevice SYCLDevice;
 #endif  // TENSORFLOW_USE_SYCL
@@ -89,6 +92,17 @@ struct DenseUpdate<SYCLDevice, T, ASSIGN> {
 #endif  // TENSORFLOW_USE_SYCL
 
 }  // end namespace functor
+
+template <typename Device>
+Status VariantCopyFn(OpKernelContext* context, const Tensor& from, Tensor* to);
+
+template <>
+Status VariantCopyFn<CPUDevice>(OpKernelContext* context, const Tensor& from,
+                                Tensor* to);
+template <>
+Status VariantCopyFn<GPUDevice>(OpKernelContext* context, const Tensor& from,
+                                Tensor* to);
+
 }  // end namespace tensorflow
 
 #endif  // TENSORFLOW_KERNELS_DENSE_UPDATE_FUNCTOR_H_
diff --git a/tensorflow/core/kernels/dynamic_partition_op_test.cc b/tensorflow/core/kernels/dynamic_partition_op_test.cc
index 9a7ed0af217b1c31fa14917f10128bb229b18dfd..17eb4e24b71043e10076731a48034e9e6289abe3 100644
--- a/tensorflow/core/kernels/dynamic_partition_op_test.cc
+++ b/tensorflow/core/kernels/dynamic_partition_op_test.cc
@@ -30,6 +30,7 @@ limitations under the License.
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/random/simple_philox.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
 
@@ -153,8 +154,8 @@ TEST_F(DynamicPartitionOpTest, Error_IndexOutOfRange) {
                            {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14});
   AddInputFromArray<int32>(TensorShape({5}), {0, 2, 99, 2, 2});
   Status s = RunOpKernel();
-  EXPECT_TRUE(
-      StringPiece(s.ToString()).contains("partitions[2] = 99 is not in [0, 4)"))
+  EXPECT_TRUE(str_util::StrContains(s.ToString(),
+                                    "partitions[2] = 99 is not in [0, 4)"))
       << s;
 }
 
diff --git a/tensorflow/core/kernels/dynamic_stitch_op_test.cc b/tensorflow/core/kernels/dynamic_stitch_op_test.cc
index 6775893ce636786218ae03b72085e9e7926b38d1..7fa6e320f5732cac68376b522e99b1d04bb9c3ac 100644
--- a/tensorflow/core/kernels/dynamic_stitch_op_test.cc
+++ b/tensorflow/core/kernels/dynamic_stitch_op_test.cc
@@ -26,6 +26,7 @@ limitations under the License.
 #include "tensorflow/core/kernels/ops_testutil.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/test.h"
@@ -88,9 +89,9 @@ TEST_F(DynamicStitchOpTest, Error_IndicesMultiDimensional) {
   AddInputFromArray<float>(TensorShape({3}), {0, 40, 70});
   AddInputFromArray<float>(TensorShape({5}), {10, 60, 20, 30, 50});
   Status s = RunOpKernel();
-  EXPECT_TRUE(StringPiece(s.ToString())
-                  .contains("data[1].shape = [5] does not start with "
-                            "indices[1].shape = [1,5]"))
+  EXPECT_TRUE(str_util::StrContains(
+      s.ToString(),
+      "data[1].shape = [5] does not start with indices[1].shape = [1,5]"))
       << s;
 }
 
@@ -103,9 +104,9 @@ TEST_F(DynamicStitchOpTest, Error_DataNumDimsMismatch) {
   AddInputFromArray<float>(TensorShape({3}), {0, 40, 70});
   AddInputFromArray<float>(TensorShape({1, 5}), {10, 60, 20, 30, 50});
   Status s = RunOpKernel();
-  EXPECT_TRUE(StringPiece(s.ToString())
-                  .contains("data[1].shape = [1,5] does not start with "
-                            "indices[1].shape = [5]"))
+  EXPECT_TRUE(str_util::StrContains(
+      s.ToString(),
+      "data[1].shape = [1,5] does not start with indices[1].shape = [5]"))
       << s;
 }
 
@@ -119,9 +120,10 @@ TEST_F(DynamicStitchOpTest, Error_DataDimSizeMismatch) {
   AddInputFromArray<float>(TensorShape({4, 2}),
                            {10, 11, 60, 61, 20, 21, 30, 31});
   Status s = RunOpKernel();
-  EXPECT_TRUE(StringPiece(s.ToString())
-                  .contains("Need data[0].shape[1:] = data[1].shape[1:], "
-                            "got data[0].shape = [3,1], data[1].shape = [4,2]"))
+  EXPECT_TRUE(
+      str_util::StrContains(s.ToString(),
+                            "Need data[0].shape[1:] = data[1].shape[1:], got "
+                            "data[0].shape = [3,1], data[1].shape = [4,2]"))
       << s;
 }
 
@@ -134,10 +136,9 @@ TEST_F(DynamicStitchOpTest, Error_DataAndIndicesSizeMismatch) {
   AddInputFromArray<float>(TensorShape({3}), {0, 40, 70});
   AddInputFromArray<float>(TensorShape({4}), {10, 60, 20, 30});
   Status s = RunOpKernel();
-  EXPECT_TRUE(
-      StringPiece(s.ToString())
-          .contains(
-              "data[1].shape = [4] does not start with indices[1].shape = [5]"))
+  EXPECT_TRUE(str_util::StrContains(
+      s.ToString(),
+      "data[1].shape = [4] does not start with indices[1].shape = [5]"))
       << s;
 }
 
diff --git a/tensorflow/core/kernels/encode_proto_op.cc b/tensorflow/core/kernels/encode_proto_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3b02ae52a23aeabe55e6233e34b15cffb2073ded
--- /dev/null
+++ b/tensorflow/core/kernels/encode_proto_op.cc
@@ -0,0 +1,591 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// EncodeProto is a TensorFlow Op which serializes tensors into
+// arbitrary protobufs.
+//
+// See the docstring in ../ops/encode_proto_op.cc for usage of the op.
+//
+// This implementation writes the serialized format using a handful of
+// calls from the WireFormatLite API.
+
+#include <memory>
+#include <vector>
+
+#include "third_party/eigen3/Eigen/Core"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/protobuf.h"
+#include "tensorflow/core/util/proto/descriptors.h"
+
+namespace tensorflow {
+namespace {
+
+using ::tensorflow::protobuf::Descriptor;
+using ::tensorflow::protobuf::DescriptorPool;
+using ::tensorflow::protobuf::FieldDescriptor;
+using ::tensorflow::protobuf::internal::WireFormatLite;
+using ::tensorflow::protobuf::io::CodedOutputStream;
+using ::tensorflow::protobuf::io::StringOutputStream;
+
+// Computes the total serialized size for a packed repeated field.
+// For fixed-size types this can just multiply, but for variable-sized
+// types it has to iterate through the values in the tensor.
+template <WireFormatLite::FieldType FieldType, typename TensorT>
+size_t TotalPackedSize(const Tensor& input, int message_index, int size);
+
+template <>
+size_t TotalPackedSize<WireFormatLite::TYPE_DOUBLE, double>(const Tensor& input,
+                                                            int message_index,
+                                                            int size) {
+  return size * WireFormatLite::kDoubleSize;
+}
+
+template <>
+size_t TotalPackedSize<WireFormatLite::TYPE_FLOAT, double>(const Tensor& input,
+                                                           int message_index,
+                                                           int size) {
+  return size * WireFormatLite::kFloatSize;
+}
+
+template <>
+size_t TotalPackedSize<WireFormatLite::TYPE_FLOAT, float>(const Tensor& input,
+                                                          int message_index,
+                                                          int size) {
+  return size * WireFormatLite::kFloatSize;
+}
+
+template <>
+size_t TotalPackedSize<WireFormatLite::TYPE_INT64, int64>(const Tensor& input,
+                                                          int message_index,
+                                                          int size) {
+  size_t data_size = 0;
+  auto input_t = input.flat_inner_dims<int64>();
+  for (int64 i = 0; i < size; i++) {
+    data_size += WireFormatLite::Int64Size(
+        input_t(static_cast<int64>(message_index), i));
+  }
+  return data_size;
+}
+
+template <>
+size_t TotalPackedSize<WireFormatLite::TYPE_UINT64, int64>(const Tensor& input,
+                                                           int message_index,
+                                                           int size) {
+  size_t data_size = 0;
+  auto input_t = input.flat_inner_dims<int64>();
+  for (int64 i = 0; i < size; i++) {
+    data_size += WireFormatLite::UInt64Size(
+        input_t(static_cast<int64>(message_index), i));
+  }
+  return data_size;
+}
+
+template <>
+size_t TotalPackedSize<WireFormatLite::TYPE_INT32, int32>(const Tensor& input,
+                                                          int message_index,
+                                                          int size) {
+  size_t data_size = 0;
+  auto input_t = input.flat_inner_dims<int32>();
+  for (int64 i = 0; i < size; i++) {
+    data_size += WireFormatLite::Int32Size(
+        input_t(static_cast<int64>(message_index), i));
+  }
+  return data_size;
+}
+
+template <>
+size_t TotalPackedSize<WireFormatLite::TYPE_FIXED64, int64>(const Tensor& input,
+                                                            int message_index,
+                                                            int size) {
+  return size * WireFormatLite::kFixed64Size;
+}
+
+template <>
+size_t TotalPackedSize<WireFormatLite::TYPE_FIXED32, int64>(const Tensor& input,
+                                                            int message_index,
+                                                            int size) {
+  return size * WireFormatLite::kFixed32Size;
+}
+
+template <>
+size_t TotalPackedSize<WireFormatLite::TYPE_FIXED32, int32>(const Tensor& input,
+                                                            int message_index,
+                                                            int size) {
+  return size * WireFormatLite::kFixed32Size;
+}
+
+template <>
+size_t TotalPackedSize<WireFormatLite::TYPE_BOOL, bool>(const Tensor& input,
+                                                        int message_index,
+                                                        int size) {
+  return size * WireFormatLite::kBoolSize;
+}
+
+template <>
+size_t TotalPackedSize<WireFormatLite::TYPE_UINT32, int64>(const Tensor& input,
+                                                           int message_index,
+                                                           int size) {
+  size_t data_size = 0;
+  auto input_t = input.flat_inner_dims<int64>();
+  for (int64 i = 0; i < size; i++) {
+    data_size += WireFormatLite::UInt32Size(
+        input_t(static_cast<int64>(message_index), i));
+  }
+  return data_size;
+}
+
+template <>
+size_t TotalPackedSize<WireFormatLite::TYPE_UINT32, int32>(const Tensor& input,
+                                                           int message_index,
+                                                           int size) {
+  size_t data_size = 0;
+  auto input_t = input.flat_inner_dims<int32>();
+  for (int64 i = 0; i < size; i++) {
+    data_size += WireFormatLite::UInt32Size(
+        input_t(static_cast<int64>(message_index), i));
+  }
+  return data_size;
+}
+
+template <>
+size_t TotalPackedSize<WireFormatLite::TYPE_ENUM, int32>(const Tensor& input,
+                                                         int message_index,
+                                                         int size) {
+  size_t data_size = 0;
+  auto input_t = input.flat_inner_dims<int32>();
+  for (int64 i = 0; i < size; i++) {
+    data_size +=
+        WireFormatLite::EnumSize(input_t(static_cast<int64>(message_index), i));
+  }
+  return data_size;
+}
+
+template <>
+size_t TotalPackedSize<WireFormatLite::TYPE_SFIXED32, int32>(
+    const Tensor& input, int message_index, int size) {
+  return size * WireFormatLite::kSFixed32Size;
+}
+
+template <>
+size_t TotalPackedSize<WireFormatLite::TYPE_SFIXED64, int64>(
+    const Tensor& input, int message_index, int size) {
+  return size * WireFormatLite::kSFixed64Size;
+}
+
+template <>
+size_t TotalPackedSize<WireFormatLite::TYPE_SINT32, int32>(const Tensor& input,
+                                                           int message_index,
+                                                           int size) {
+  size_t data_size = 0;
+  auto input_t = input.flat_inner_dims<int32>();
+  for (int64 i = 0; i < size; i++) {
+    data_size += WireFormatLite::SInt32Size(
+        input_t(static_cast<int64>(message_index), i));
+  }
+  return data_size;
+}
+
+template <>
+size_t TotalPackedSize<WireFormatLite::TYPE_SINT64, int64>(const Tensor& input,
+                                                           int message_index,
+                                                           int size) {
+  size_t data_size = 0;
+  auto input_t = input.flat_inner_dims<int64>();
+  for (int64 i = 0; i < size; i++) {
+    data_size += WireFormatLite::SInt64Size(
+        input_t(static_cast<int64>(message_index), i));
+  }
+  return data_size;
+}
+
+// Writes a possibly repeated primitive field.
+// TensorFlow does not have unsigned types, so we decode them to signed and
+// encode them back to unsigned.
+template <typename TensorT, typename ProtoT,
+          WireFormatLite::FieldType FieldType,
+          void Writer(ProtoT, CodedOutputStream*)>
+void WriteField(const FieldDescriptor& field_desc, const Tensor& input,
+                int message_index, int size, CodedOutputStream* output) {
+  auto wire_type = WireFormatLite::WireTypeForFieldType(
+      WireFormatLite::FieldType(field_desc.type()));
+
+  auto input_t = input.flat_inner_dims<TensorT>();
+  if (field_desc.options().packed()) {
+    // Write the tag for the packed field.
+    WireFormatLite::WriteTag(field_desc.number(),
+                             WireFormatLite::WIRETYPE_LENGTH_DELIMITED, output);
+
+    // Write the total packed length.
+    size_t data_size =
+        TotalPackedSize<FieldType, TensorT>(input, message_index, size);
+    output->WriteVarint32(data_size);
+
+    // Write individual values.
+    for (int64 i = 0; i < size; i++) {
+      // Note implicit cast from signed to unsigned.
+      const ProtoT& value = input_t(static_cast<int64>(message_index), i);
+      Writer(value, output);
+    }
+  } else {
+    for (int64 i = 0; i < size; i++) {
+      WireFormatLite::WriteTag(field_desc.number(), wire_type, output);
+
+      // Note implicit cast from signed to unsigned.
+      const ProtoT& value = input_t(static_cast<int64>(message_index), i);
+      Writer(value, output);
+    }
+  }
+}
+
+// Writes a possibly repeated string, bytes, or message field.
+template <typename T, void Writer(int, const T&, CodedOutputStream*)>
+void WriteVarLenField(const FieldDescriptor& field_desc, const Tensor& input,
+                      int message_index, int size, CodedOutputStream* output) {
+  auto input_t = input.flat_inner_dims<T>();
+  for (int64 i = 0; i < size; i++) {
+    const T& value = input_t(static_cast<int64>(message_index), i);
+    // TODO(nix): there doesn't seem to be an inlined version of
+    // WireFormatLite::WriteString or its relatives, which might allow a
+    // small speedup.
+    Writer(field_desc.number(), value, output);
+  }
+}
+
+// Writes a group field.
+// Groups are treated like submessages, but tag-delimited
+// instead of length-delimited. WireFormatLite handles this
+// differently so we code it ourselves.
+void WriteGroup(const FieldDescriptor& field_desc, const Tensor& input,
+                int message_index, int size, CodedOutputStream* output) {
+  auto input_t = input.flat_inner_dims<string>();
+  for (int64 i = 0; i < size; i++) {
+    const string& value = input_t(static_cast<int64>(message_index), i);
+    WireFormatLite::WriteTag(field_desc.number(),
+                             WireFormatLite::WIRETYPE_START_GROUP, output);
+    // Note the use of WriteRaw instead of WriteString to skip the length.
+    output->WriteRaw(value.data(), value.size());
+    WireFormatLite::WriteTag(field_desc.number(),
+                             WireFormatLite::WIRETYPE_END_GROUP, output);
+  }
+}
+
+// Writes a (possibly repeated) field into an output stream.
+// It is the caller's responsibility to ensure that the type of
+// the input tensor is compatible with the type of the proto
+// field descriptor, and that (message_index, size-1) is within
+// bounds.
+void WriteField(const FieldDescriptor& field_desc, const Tensor& input,
+                int message_index, int size, CodedOutputStream* output) {
+  DataType tf_type = input.dtype();
+
+  switch (field_desc.type()) {
+    case WireFormatLite::TYPE_DOUBLE:
+      return WriteField<double, double, WireFormatLite::TYPE_DOUBLE,
+                        WireFormatLite::WriteDoubleNoTag>(
+          field_desc, input, message_index, size, output);
+    case WireFormatLite::TYPE_FLOAT:
+      switch (tf_type) {
+        case DataType::DT_FLOAT:
+          return WriteField<float, float, WireFormatLite::TYPE_FLOAT,
+                            WireFormatLite::WriteFloatNoTag>(
+              field_desc, input, message_index, size, output);
+        case DataType::DT_DOUBLE:
+          return WriteField<double, float, WireFormatLite::TYPE_FLOAT,
+                            WireFormatLite::WriteFloatNoTag>(
+              field_desc, input, message_index, size, output);
+        default:
+          return;
+      }
+    case WireFormatLite::TYPE_INT64:
+      return WriteField<int64, protobuf_int64, WireFormatLite::TYPE_INT64,
+                        WireFormatLite::WriteInt64NoTag>(
+          field_desc, input, message_index, size, output);
+    case WireFormatLite::TYPE_UINT64:
+      return WriteField<int64, protobuf_uint64, WireFormatLite::TYPE_UINT64,
+                        WireFormatLite::WriteUInt64NoTag>(
+          field_desc, input, message_index, size, output);
+    case WireFormatLite::TYPE_INT32:
+      return WriteField<int32, int32, WireFormatLite::TYPE_INT32,
+                        WireFormatLite::WriteInt32NoTag>(
+          field_desc, input, message_index, size, output);
+    case WireFormatLite::TYPE_FIXED64:
+      return WriteField<int64, protobuf_uint64, WireFormatLite::TYPE_FIXED64,
+                        WireFormatLite::WriteFixed64NoTag>(
+          field_desc, input, message_index, size, output);
+    case WireFormatLite::TYPE_FIXED32:
+      switch (tf_type) {
+        case DataType::DT_INT64:
+          return WriteField<int64, uint32, WireFormatLite::TYPE_FIXED32,
+                            WireFormatLite::WriteFixed32NoTag>(
+              field_desc, input, message_index, size, output);
+        case DataType::DT_INT32:
+          return WriteField<int32, uint32, WireFormatLite::TYPE_FIXED32,
+                            WireFormatLite::WriteFixed32NoTag>(
+              field_desc, input, message_index, size, output);
+        default:
+          return;
+      }
+    case WireFormatLite::TYPE_BOOL:
+      return WriteField<bool, bool, WireFormatLite::TYPE_BOOL,
+                        WireFormatLite::WriteBoolNoTag>(
+          field_desc, input, message_index, size, output);
+    case WireFormatLite::TYPE_STRING:
+      return WriteVarLenField<string, WireFormatLite::WriteString>(
+          field_desc, input, message_index, size, output);
+    case WireFormatLite::TYPE_GROUP:
+      return WriteGroup(field_desc, input, message_index, size, output);
+    case WireFormatLite::TYPE_MESSAGE:
+      return WriteVarLenField<string, WireFormatLite::WriteBytes>(
+          field_desc, input, message_index, size, output);
+    case WireFormatLite::TYPE_BYTES:
+      return WriteVarLenField<string, WireFormatLite::WriteBytes>(
+          field_desc, input, message_index, size, output);
+    case WireFormatLite::TYPE_UINT32:
+      switch (tf_type) {
+        case DataType::DT_INT64:
+          return WriteField<int64, uint32, WireFormatLite::TYPE_UINT32,
+                            WireFormatLite::WriteUInt32NoTag>(
+              field_desc, input, message_index, size, output);
+        case DataType::DT_INT32:
+          return WriteField<int32, uint32, WireFormatLite::TYPE_UINT32,
+                            WireFormatLite::WriteUInt32NoTag>(
+              field_desc, input, message_index, size, output);
+        default:
+          return;
+      }
+    case WireFormatLite::TYPE_ENUM:
+      return WriteField<int32, int32, WireFormatLite::TYPE_ENUM,
+                        WireFormatLite::WriteEnumNoTag>(
+          field_desc, input, message_index, size, output);
+    case WireFormatLite::TYPE_SFIXED32:
+      return WriteField<int32, int32, WireFormatLite::TYPE_SFIXED32,
+                        WireFormatLite::WriteSFixed32NoTag>(
+          field_desc, input, message_index, size, output);
+    case WireFormatLite::TYPE_SFIXED64:
+      return WriteField<int64, protobuf_int64, WireFormatLite::TYPE_SFIXED64,
+                        WireFormatLite::WriteSFixed64NoTag>(
+          field_desc, input, message_index, size, output);
+    case WireFormatLite::TYPE_SINT32:
+      return WriteField<int32, int32, WireFormatLite::TYPE_SINT32,
+                        WireFormatLite::WriteSInt32NoTag>(
+          field_desc, input, message_index, size, output);
+    case WireFormatLite::TYPE_SINT64:
+      return WriteField<int64, protobuf_int64, WireFormatLite::TYPE_SINT64,
+                        WireFormatLite::WriteSInt64NoTag>(
+          field_desc, input, message_index, size, output);
+      // default: intentionally omitted in order to enable static checking.
+  }
+}
+
+// Checks that a Protobuf field is compatible with a TensorFlow datatype.
+// This is separated from WriteField to lift it out of the inner loop.
+bool IsCompatibleType(const FieldDescriptor& field_desc, DataType tf_type) {
+  switch (field_desc.type()) {
+    case WireFormatLite::TYPE_DOUBLE:
+      return tf_type == DataType::DT_DOUBLE;
+    case WireFormatLite::TYPE_FLOAT:
+      return tf_type == DataType::DT_FLOAT || tf_type == DataType::DT_DOUBLE;
+    case WireFormatLite::TYPE_INT64:
+    case WireFormatLite::TYPE_SFIXED64:
+    case WireFormatLite::TYPE_SINT64:
+      return tf_type == DataType::DT_INT64;
+    case WireFormatLite::TYPE_UINT64:
+      return tf_type == DataType::DT_INT64;
+    case WireFormatLite::TYPE_INT32:
+    case WireFormatLite::TYPE_ENUM:
+    case WireFormatLite::TYPE_SFIXED32:
+    case WireFormatLite::TYPE_SINT32:
+      return tf_type == DataType::DT_INT32;
+    case WireFormatLite::TYPE_FIXED64:
+      return tf_type == DataType::DT_INT64;
+    case WireFormatLite::TYPE_FIXED32:
+    case WireFormatLite::TYPE_UINT32:
+      return tf_type == DataType::DT_INT64 || tf_type == DataType::DT_INT32;
+    case WireFormatLite::TYPE_BOOL:
+      return tf_type == DataType::DT_BOOL;
+    case WireFormatLite::TYPE_STRING:
+    case WireFormatLite::TYPE_GROUP:
+    case WireFormatLite::TYPE_MESSAGE:
+    case WireFormatLite::TYPE_BYTES:
+      return tf_type == DataType::DT_STRING;
+      // default: intentionally omitted in order to enable static checking.
+  }
+  return false;
+}
+
+class EncodeProtoOp : public OpKernel {
+ public:
+  explicit EncodeProtoOp(OpKernelConstruction* context) : OpKernel(context) {
+    string descriptor_source;
+    OP_REQUIRES_OK(context,
+                   context->GetAttr("descriptor_source", &descriptor_source));
+    // We always get back a desc_pool, but we may not own it. If we own it,
+    // owned_desc_pool_ will be filled in.
+    DescriptorPool const* desc_pool;
+    OP_REQUIRES_OK(context, GetDescriptorPool(context->env(), descriptor_source,
+                                              &desc_pool, &owned_desc_pool_));
+
+    string message_type;
+    OP_REQUIRES_OK(context, context->GetAttr("message_type", &message_type));
+    const Descriptor* message_desc =
+        desc_pool->FindMessageTypeByName(message_type);
+    OP_REQUIRES(context, message_desc != nullptr,
+                errors::InvalidArgument("No descriptor found for message type ",
+                                        message_type));
+
+    OP_REQUIRES_OK(context, context->GetAttr("field_names", &field_names_));
+
+    // Gather the field descriptors for the given field_names.
+    field_descs_.resize(field_names_.size());
+    for (int i = 0; i < field_names_.size(); i++) {
+      const string& name = field_names_[i];
+      auto field_desc = message_desc->FindFieldByName(name);
+      OP_REQUIRES(context, field_desc != nullptr,
+                  errors::InvalidArgument("Unknown field: ", name,
+                                          " in message type ", message_type));
+
+      field_descs_[i] = field_desc;
+    }
+
+    // Build a list of indices into field_descs sorted by increasing
+    // field_number. This will be used to output fields in sorted order,
+    // which is strongly encouraged when serializing protobufs.
+    sorted_field_index_.resize(field_names_.size());
+    // Start with the fields sorted by current index.
+    for (int i = 0; i < field_names_.size(); i++) sorted_field_index_[i] = i;
+    // Then sort the field indices by their proto field number.
+    std::sort(sorted_field_index_.begin(), sorted_field_index_.end(),
+              [this](int a, int b) -> bool {
+                return field_descs_[a]->number() < field_descs_[b]->number();
+              });
+  }
+
+  void Compute(OpKernelContext* cx) override {
+    const Tensor* sizes_tensor;
+    OP_REQUIRES_OK(cx, cx->input("sizes", &sizes_tensor));
+
+    OpInputList values;
+    OP_REQUIRES_OK(cx, cx->input_list("values", &values));
+
+    OP_REQUIRES(cx, field_descs_.size() == values.size(),
+                errors::InvalidArgument(
+                    "Length of inputs list must match field_names"));
+
+    // Check the arguments for consistency.
+    TensorShape common_prefix;
+    int message_count;
+    for (int i = 0; i < field_descs_.size(); i++) {
+      const Tensor& v = values[i];
+
+      // The type of each value tensor must match the corresponding field.
+      OP_REQUIRES(cx, IsCompatibleType(*field_descs_[i], v.dtype()),
+                  errors::InvalidArgument(
+                      "Incompatible type for field " + field_names_[i] +
+                          ".  Saw dtype: ",
+                      DataTypeString(v.dtype()),
+                      " but field type is: ", field_descs_[i]->type_name()));
+
+      // All value tensors must have the same shape prefix (i.e. batch size).
+      TensorShape shape_prefix = v.shape();
+      shape_prefix.RemoveDim(shape_prefix.dims() - 1);
+
+      // Do some initialization on the first input value. The rest will
+      // have to match this one.
+      if (i == 0) {
+        OP_REQUIRES(cx, v.dims() >= 1,
+                    errors::InvalidArgument(
+                        "Expected value to be at least a vector, saw shape: ",
+                        v.shape().DebugString()));
+        common_prefix = shape_prefix;
+        message_count = common_prefix.num_elements();
+      } else {
+        OP_REQUIRES(cx, shape_prefix == common_prefix,
+                    errors::InvalidArgument(
+                        "Values must match up to the last dimension"));
+      }
+    }
+
+    TensorShape expected_sizes_shape = common_prefix;
+    expected_sizes_shape.AddDim(field_descs_.size());
+
+    OP_REQUIRES(cx, sizes_tensor->shape() == expected_sizes_shape,
+                errors::InvalidArgument(
+                    "sizes should be batch_size + [len(field_names)].  Saw: ",
+                    sizes_tensor->shape().DebugString(),
+                    " but expected: ", expected_sizes_shape.DebugString()));
+
+    auto sizes = sizes_tensor->flat_inner_dims<int32>();
+
+    for (int i = 0; i < field_descs_.size(); ++i) {
+      const Tensor& v = values[i];
+      int max_size = v.dim_size(v.dims() - 1);
+
+      // The last dimension of a value tensor must be greater than the
+      // corresponding
+      // size in the sizes tensor.
+      for (int message_index = 0; message_index < message_count;
+           message_index++) {
+        OP_REQUIRES(
+            cx, sizes(message_index, i) <= max_size,
+            errors::InvalidArgument(
+                "Size to write must not be larger than value tensor; but saw: ",
+                sizes(message_index, i), " > ", max_size, " at message ",
+                message_index, " field ", i));
+      }
+    }
+
+    // This pointer is owned by the context.
+    Tensor* output_tensor;
+    OP_REQUIRES_OK(cx, cx->allocate_output(0, common_prefix, &output_tensor));
+
+    auto bufs = output_tensor->flat<string>();
+    for (int message_index = 0; message_index < message_count;
+         message_index++) {
+      // TODO(nix): possibly optimize allocation here by calling
+      //   bufs(message_index).reserve(DEFAULT_BUF_SIZE);
+      StringOutputStream output_string(&bufs(message_index));
+      CodedOutputStream out(&output_string);
+      // Write fields in ascending field_number order.
+      for (int i : sorted_field_index_) {
+        auto& field_desc = *field_descs_[i];
+        const Tensor& v = values[i];
+        int size = sizes(message_index, i);
+        if (!size) continue;
+        WriteField(field_desc, v, message_index, size, &out);
+      }
+    }
+  }
+
+ private:
+  std::vector<string> field_names_;
+  std::vector<const FieldDescriptor*> field_descs_;
+
+  // Owned_desc_pool_ is null when using descriptor_source=local.
+  std::unique_ptr<DescriptorPool> owned_desc_pool_;
+
+  // Contains indices into field_names_, sorted by field number since
+  // that's the order of writing.
+  std::vector<int> sorted_field_index_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(EncodeProtoOp);
+};
+
+REGISTER_KERNEL_BUILDER(Name("EncodeProto").Device(DEVICE_CPU), EncodeProtoOp);
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/fill_functor.cu.cc b/tensorflow/core/kernels/fill_functor.cu.cc
index 3487606778eabde386335f8450d627b7bf74ad42..050c95cf40d4b29bde66b6b6e72b1b48a7199965 100644
--- a/tensorflow/core/kernels/fill_functor.cu.cc
+++ b/tensorflow/core/kernels/fill_functor.cu.cc
@@ -76,7 +76,7 @@ struct FillFunctor<GPUDevice, T> {
 };
 
 #define DEFINE_FILL_GPU(T) template struct FillFunctor<GPUDevice, T>;
-TF_CALL_REAL_NUMBER_TYPES(DEFINE_FILL_GPU);
+TF_CALL_NUMBER_TYPES(DEFINE_FILL_GPU);
 TF_CALL_bool(DEFINE_FILL_GPU);
 #undef DEFINE_FILL_GPU
 
diff --git a/tensorflow/core/kernels/function_ops.cc b/tensorflow/core/kernels/function_ops.cc
index 351aad72135da9c11dcef7ce4ff19cd158a50a1b..f8e0267578054b2cb8bb5371545bf8547ad38c22 100644
--- a/tensorflow/core/kernels/function_ops.cc
+++ b/tensorflow/core/kernels/function_ops.cc
@@ -144,6 +144,11 @@ TF_CALL_bool(REGISTER) REGISTER_KERNEL_BUILDER(Name(kRetOp)
                                                    .HostMemory("input")
                                                    .TypeConstraint<int32>("T"),
                                                RetvalOp);
+REGISTER_KERNEL_BUILDER(Name(kRetOp)
+                            .Device(DEVICE_GPU)
+                            .TypeConstraint<ResourceHandle>("T")
+                            .HostMemory("input"),
+                        RetvalOp);
 #undef REGISTER
 
 class PassOn : public OpKernel {
diff --git a/tensorflow/core/kernels/functional_ops.cc b/tensorflow/core/kernels/functional_ops.cc
index b687088db16a31d8ecb74a7a483c35d2c65a74f9..911aa3a78fff2f6f7272e7408388e6625df52037 100644
--- a/tensorflow/core/kernels/functional_ops.cc
+++ b/tensorflow/core/kernels/functional_ops.cc
@@ -1,4 +1,4 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-
 #define EIGEN_USE_THREADS
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
@@ -21,10 +20,12 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/lib/core/threadpool.h"
-#include "tensorflow/core/platform/mutex.h"
 
-namespace tensorflow {
+#if GOOGLE_CUDA
+#include "tensorflow/stream_executor/stream.h"
+#endif  // GOOGLE_CUDA
 
+namespace tensorflow {
 typedef Eigen::GpuDevice GPUDevice;
 typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef FunctionLibraryRuntime::Handle FHandle;
@@ -106,11 +107,9 @@ void SetRunOptions(OpKernelContext* ctx, FunctionLibraryRuntime::Options* opts,
   opts->runner = ctx->runner();
 }
 
-}  // end namespace
-
-class FunctionalIf : public AsyncOpKernel {
+class IfOp : public AsyncOpKernel {
  public:
-  explicit FunctionalIf(OpKernelConstruction* ctx) : AsyncOpKernel(ctx) {
+  explicit IfOp(OpKernelConstruction* ctx) : AsyncOpKernel(ctx) {
     auto lib = ctx->function_library();
     OP_REQUIRES(ctx, lib != nullptr, errors::Internal("No function library"));
     const NameAttrList* func;
@@ -120,7 +119,7 @@ class FunctionalIf : public AsyncOpKernel {
     OP_REQUIRES_OK(ctx, Instantiate(lib, *func, &else_handle_));
   }
 
-  ~FunctionalIf() override {}
+  ~IfOp() override {}
 
   void ComputeAsync(OpKernelContext* ctx, DoneCallback done) override {
     bool cond;
@@ -134,8 +133,7 @@ class FunctionalIf : public AsyncOpKernel {
 
   class State {
    public:
-    State(FunctionalIf* kernel, OpKernelContext* ctx, bool cond,
-          DoneCallback done)
+    State(IfOp* kernel, OpKernelContext* ctx, bool cond, DoneCallback done)
         : kernel_(kernel),
           ctx_(ctx),
           cond_(cond),
@@ -168,7 +166,7 @@ class FunctionalIf : public AsyncOpKernel {
     }
 
    private:
-    FunctionalIf* const kernel_;
+    IfOp* const kernel_;
     OpKernelContext* const ctx_;
     const bool cond_;
     const DoneCallback done_;
@@ -179,18 +177,22 @@ class FunctionalIf : public AsyncOpKernel {
   };
 };
 
-REGISTER_KERNEL_BUILDER(Name("_If").Device(DEVICE_CPU), FunctionalIf);
+// TODO(drpng): remove this.
+REGISTER_KERNEL_BUILDER(Name("_If").Device(DEVICE_CPU), IfOp);
 REGISTER_KERNEL_BUILDER(Name("_If").Device(DEVICE_GPU).HostMemory("cond"),
-                        FunctionalIf);
+                        IfOp);
+
+REGISTER_KERNEL_BUILDER(Name("If").Device(DEVICE_CPU), IfOp);
+REGISTER_KERNEL_BUILDER(Name("If").Device(DEVICE_GPU).HostMemory("cond"), IfOp);
 
-class FunctionalWhile : public AsyncOpKernel {
+class WhileOp : public AsyncOpKernel {
  public:
-  explicit FunctionalWhile(OpKernelConstruction* ctx) : AsyncOpKernel(ctx) {
+  explicit WhileOp(OpKernelConstruction* ctx) : AsyncOpKernel(ctx) {
     OP_REQUIRES_OK(ctx, ctx->GetAttr("cond", &cond_func_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("body", &body_func_));
   }
 
-  ~FunctionalWhile() override {}
+  ~WhileOp() override {}
 
   void ComputeAsync(OpKernelContext* ctx, DoneCallback done) override {
     auto lib = ctx->function_library();
@@ -234,7 +236,7 @@ class FunctionalWhile : public AsyncOpKernel {
 
   class State {
    public:
-    State(FunctionalWhile* kernel, OpKernelContext* ctx, FHandle cond_handle,
+    State(WhileOp* kernel, OpKernelContext* ctx, FHandle cond_handle,
           FHandle body_handle, DoneCallback done)
         : kernel_(kernel),
           ctx_(ctx),
@@ -253,7 +255,7 @@ class FunctionalWhile : public AsyncOpKernel {
     void Start() { EvalCond(); }
 
    private:
-    FunctionalWhile* const kernel_;
+    WhileOp* const kernel_;
     OpKernelContext* const ctx_;
     const FHandle cond_handle_;
     const FHandle body_handle_;
@@ -316,7 +318,152 @@ class FunctionalWhile : public AsyncOpKernel {
     }
   };
 };
-REGISTER_KERNEL_BUILDER(Name("_While").Device(DEVICE_CPU), FunctionalWhile);
-REGISTER_KERNEL_BUILDER(Name("_While").Device(DEVICE_GPU), FunctionalWhile);
+// TODO(drpng): remove these.
+REGISTER_KERNEL_BUILDER(Name("_While").Device(DEVICE_CPU), WhileOp);
+REGISTER_KERNEL_BUILDER(Name("_While").Device(DEVICE_GPU), WhileOp);
+
+REGISTER_KERNEL_BUILDER(Name("While").Device(DEVICE_CPU), WhileOp);
+REGISTER_KERNEL_BUILDER(Name("While").Device(DEVICE_GPU), WhileOp);
+
+Status GetScalar(OpKernelContext* ctx, int index, int32* value,
+                 const char* label) {
+  Tensor t = ctx->input(index);
+  if (!TensorShapeUtils::IsScalar(t.shape())) {
+    return errors::InvalidArgument(label, " must be a scalar, but ",
+                                   t.shape().DebugString());
+  }
+  *value = t.scalar<int32>()();
+  return Status::OK();
+}
+
+class ForOp : public AsyncOpKernel {
+ public:
+  explicit ForOp(OpKernelConstruction* ctx) : AsyncOpKernel(ctx) {
+    auto lib = ctx->function_library();
+    OP_REQUIRES(ctx, lib != nullptr, errors::Internal("No function library"));
+    const NameAttrList* func;
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("body", &func));
+    OP_REQUIRES_OK(ctx, Instantiate(lib, *func, &body_handle_));
+  }
+
+  ~ForOp() override {}
+
+  void ComputeAsync(OpKernelContext* ctx, DoneCallback done) override {
+    (new State(this, ctx, done))->Start();
+  }
+
+ private:
+  FHandle body_handle_;
+
+  class State {
+   public:
+    State(ForOp* kernel, OpKernelContext* ctx, DoneCallback done)
+        : kernel_(kernel),
+          ctx_(ctx),
+          done_(std::move(done)),
+          lib_(CHECK_NOTNULL(ctx_->function_library())),
+          args_(1 + ctx_->num_inputs() - 3) {
+      args_[0] = Tensor(DT_INT32, {});
+      iter_ = &args_[0].scalar<int32>()();
+
+      const int32 num_loop_inputs = ctx_->num_inputs() - 3;
+      rets_.reserve(num_loop_inputs);
+      for (int i = 0; i < num_loop_inputs; ++i) {
+        rets_.push_back(ctx_->input(3 + i));
+      }
+    }
+
+    ~State() {}
+
+    void Start() {
+      Status s = StartLoop();
+      if (!s.ok()) Finish(s);
+    }
+
+   private:
+    ForOp* const kernel_;
+    OpKernelContext* const ctx_;
+    const DoneCallback done_;
+    FunctionLibraryRuntime* const lib_;
+    FunctionLibraryRuntime::Options opts_;
+    TensorVec args_;
+    TensorVec rets_;
+
+    int32* iter_;  // points to args_[0].
+    int32 limit_;
+    int32 delta_;
+
+    // If an error e is returned, caller must call Finish(e).
+    // If OK is returned, the async loop execution has been started.
+    Status StartLoop() {
+      SetRunOptions(ctx_, &opts_, false /* always_collect_stats */);
+
+      TF_RETURN_IF_ERROR(GetScalar(ctx_, 0, iter_, "start"));
+      TF_RETURN_IF_ERROR(GetScalar(ctx_, 1, &limit_, "limit"));
+      TF_RETURN_IF_ERROR(GetScalar(ctx_, 2, &delta_, "delta"));
+
+      if ((delta_ > 0 && *iter_ <= limit_) ||
+          (delta_ < 0 && *iter_ >= limit_) ||
+          (delta_ == 0 && *iter_ == limit_)) {
+        RunNext();
+        return Status::OK();
+      } else {
+        return errors::InvalidArgument("Invalid start/limit/delta: ", *iter_,
+                                       " ", limit_, " ", delta_);
+      }
+    }
+
+    void RunNext() {
+      bool done_loop;
+      if (delta_ > 0) {
+        done_loop = *iter_ >= limit_;
+      } else {
+        done_loop = *iter_ <= limit_;
+      }
+      if (done_loop) {
+        Finish(Status::OK());
+        return;
+      }
+
+      if (rets_.size() >= args_.size()) {
+        Finish(errors::InvalidArgument(
+            "For loop body returned ", rets_.size(),
+            " arguments. Expected: ", args_.size() - 1));
+        return;
+      }
+      for (int i = 0; i < rets_.size(); ++i) {
+        args_[1 + i] = std::move(rets_[i]);
+      }
+      rets_.clear();
+      lib_->Run(opts_, kernel_->body_handle_, args_, &rets_,
+                [this](const Status& s) {
+                  if (s.ok()) {
+                    *iter_ += delta_;
+                    RunNext();
+                  } else {
+                    Finish(s);
+                  }
+                });
+    }
+
+    void Finish(Status s) {
+      if (s.ok()) {
+        s = SetOutputs(kernel_, ctx_, rets_);
+      }
+      ctx_->SetStatus(s);
+      done_();
+      delete this;
+    }
+  };
+};
+
+REGISTER_KERNEL_BUILDER(Name("For").Device(DEVICE_CPU), ForOp);
+REGISTER_KERNEL_BUILDER(Name("For")
+                            .Device(DEVICE_GPU)
+                            .HostMemory("start")
+                            .HostMemory("limit")
+                            .HostMemory("delta"),
+                        ForOp);
 
+}  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/fuzzing/BUILD b/tensorflow/core/kernels/fuzzing/BUILD
index 9a7eca03ce276d26321f01f80ad7f1a0a254e4db..aab4b009b505417a93238683b617f603ffc256be 100644
--- a/tensorflow/core/kernels/fuzzing/BUILD
+++ b/tensorflow/core/kernels/fuzzing/BUILD
@@ -17,18 +17,6 @@ cc_library(
     ],
 )
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 load("//tensorflow/core/kernels/fuzzing:tf_ops_fuzz_target_lib.bzl", "tf_ops_fuzz_target_lib")
 
 tf_ops_fuzz_target_lib("identity")
diff --git a/tensorflow/core/kernels/gather_functor.h b/tensorflow/core/kernels/gather_functor.h
index 16ccb03b8502dd626c0dc4f0c10fcfe50224c7b8..2c6e8bf3bcbd9270ed47d37eec6c88d7b3cfdb1c 100644
--- a/tensorflow/core/kernels/gather_functor.h
+++ b/tensorflow/core/kernels/gather_functor.h
@@ -28,6 +28,7 @@ limitations under the License.
 
 namespace tensorflow {
 typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
 
 namespace functor {
 
@@ -50,7 +51,7 @@ SliceIndex HandleCopies(OpKernelContext* ctx,
   }
   // Compute slice_bytes here so that static knowledge is available
   const size_t slice_bytes = slice_elems * sizeof(T);
-  auto worker_threads = ctx->device()->tensorflow_cpu_worker_threads();
+  auto* worker_threads = ctx->device()->tensorflow_cpu_worker_threads();
   mutex mu;
   // Store the value of invalidate index for printing error information, it's a
   // shared variable.
@@ -162,6 +163,16 @@ struct GatherFunctor<CPUDevice, T, Index> {
   }
 };
 
+template <typename Index>
+struct GatherFunctor<GPUDevice, Variant, Index> {
+  int64 operator()(OpKernelContext* ctx,
+                   typename TTypes<Variant, 3>::ConstTensor params,
+                   typename TTypes<Index>::ConstFlat indices,
+                   typename TTypes<Variant, 3>::Tensor out) {
+    return GatherFunctorCPU<Variant, Index>()(ctx, params, indices, out);
+  }
+};
+
 }  // namespace functor
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/kernels/gather_op_test.cc b/tensorflow/core/kernels/gather_op_test.cc
index 3edcb34bca3eaa28249cffaa8b0a79f90cdfb7dd..0409cadb67f38aea0ae5b9f1859b9e721a67637b 100644
--- a/tensorflow/core/kernels/gather_op_test.cc
+++ b/tensorflow/core/kernels/gather_op_test.cc
@@ -31,6 +31,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
 #include "tensorflow/core/lib/random/simple_philox.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
 
@@ -171,7 +172,7 @@ TEST_F(GatherOpTest, Error_IndexOutOfRange) {
   AddInputFromArray<int32>(TensorShape({}), {0});
   Status s = RunOpKernel();
   EXPECT_TRUE(
-      StringPiece(s.ToString()).contains("indices[2] = 99 is not in [0, 5)"))
+      str_util::StrContains(s.ToString(), "indices[2] = 99 is not in [0, 5)"))
       << s;
 }
 
diff --git a/tensorflow/core/kernels/hexagon/BUILD b/tensorflow/core/kernels/hexagon/BUILD
index 7688305019cdbca94b2094c12380dab4162353d7..4870d9ae200cd55adc4833c044e5588aa1d6aa89 100644
--- a/tensorflow/core/kernels/hexagon/BUILD
+++ b/tensorflow/core/kernels/hexagon/BUILD
@@ -13,18 +13,6 @@ load(
     "tf_kernel_library",
 )
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 tf_cc_test(
     name = "graph_transferer_test",
     size = "small",
diff --git a/tensorflow/core/kernels/initializable_lookup_table.h b/tensorflow/core/kernels/initializable_lookup_table.h
index b16c76dc7feaeb2ea1d98ed5290b3fd851339daf..edb779540fb12e97740fd54a159887c8b3170f9a 100644
--- a/tensorflow/core/kernels/initializable_lookup_table.h
+++ b/tensorflow/core/kernels/initializable_lookup_table.h
@@ -92,6 +92,8 @@ class InitializableLookupTable : public LookupInterface {
   //
   // Then the iterator is exhausted, valid returns false and status returns
   // Status::OutOfRange.
+  //
+  // This class is Thread-unsafe.
   class InitTableIterator {
    public:
     InitTableIterator() {}
diff --git a/tensorflow/core/kernels/inplace_ops.cc b/tensorflow/core/kernels/inplace_ops.cc
index a71d047ed1a381bfc0311f86987f585f51b02536..ef6ce0546b0811edda3331de69906237cca76dd4 100644
--- a/tensorflow/core/kernels/inplace_ops.cc
+++ b/tensorflow/core/kernels/inplace_ops.cc
@@ -213,13 +213,13 @@ REGISTER_KERNEL_BUILDER(Name("_ParallelConcatUpdate")
 
 typedef Eigen::GpuDevice GPUDevice;
 
-#define REGISTER_EMPTY(type)                                  \
+#define REGISTER_PARALLEL_CONCAT_START(type)                  \
   REGISTER_KERNEL_BUILDER(Name("_ParallelConcatStart")        \
                               .Device(DEVICE_GPU)             \
                               .TypeConstraint<type>("dtype"), \
                           ParallelConcatStart<GPUDevice, type>);
-TF_CALL_GPU_NUMBER_TYPES(REGISTER_EMPTY)
-#undef REGISTER_EMPTY
+TF_CALL_GPU_NUMBER_TYPES(REGISTER_PARALLEL_CONCAT_START)
+#undef REGISTER_PARALLEL_CONCAT_START
 
 #define REGISTER_PARALLEL_CONCAT(type)                                     \
   REGISTER_KERNEL_BUILDER(                                                 \
@@ -248,5 +248,295 @@ REGISTER_KERNEL_BUILDER(Name("_ParallelConcatUpdate")
                         ParallelConcatUpdate<CPUDevice>);
 #endif
 
+class InplaceOpBase : public OpKernel {
+ public:
+  explicit InplaceOpBase(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+  void Compute(OpKernelContext* ctx) override {
+    auto x = ctx->input(0);
+    auto i = ctx->input(1);
+    auto v = ctx->input(2);
+
+    OP_REQUIRES(ctx, TensorShapeUtils::IsVector(i.shape()),
+                errors::InvalidArgument("i must be a vector. ",
+                                        i.shape().DebugString()));
+    OP_REQUIRES(ctx, x.dims() == v.dims(),
+                errors::InvalidArgument(
+                    "x and v shape doesn't match (ranks differ): ",
+                    x.shape().DebugString(), " vs. ", v.shape().DebugString()));
+    for (int i = 1; i < x.dims(); ++i) {
+      OP_REQUIRES(
+          ctx, x.dim_size(i) == v.dim_size(i),
+          errors::InvalidArgument("x and v shape doesn't match at index ", i,
+                                  " : ", x.shape().DebugString(), " vs. ",
+                                  v.shape().DebugString()));
+    }
+    OP_REQUIRES(ctx, i.dim_size(0) == v.dim_size(0),
+                errors::InvalidArgument(
+                    "i and x shape doesn't match at index 0: ",
+                    i.shape().DebugString(), " vs. ", v.shape().DebugString()));
+
+    Tensor y = x;  // This creates an alias intentionally.
+    OP_REQUIRES_OK(ctx, DoCompute(ctx, i, v, &y));
+    ctx->set_output(0, y);
+  }
+
+ protected:
+  virtual Status DoCompute(OpKernelContext* ctx, const Tensor& i,
+                           const Tensor& v, Tensor* y) = 0;
+};
+
+}  // end namespace
+
+namespace functor {
+
+template <typename T>
+void DoInplaceOp(const CPUDevice& d, InplaceOpType op, const Tensor& i,
+                 const Tensor& v, Tensor* y) {
+  auto Ti = i.flat<int32>();
+  auto Tv = v.flat_outer_dims<T>();
+  auto Ty = y->flat_outer_dims<T>();
+  auto nrows = Ty.dimension(0);
+  for (int64 j = 0; j < Ti.size(); ++j) {
+    auto r = (Ti(j) % nrows + nrows) % nrows;  // Guard index range.
+    switch (op) {
+      case I_UPDATE:
+        Ty.template chip<0>(r).device(d) = Tv.template chip<0>(j);
+        break;
+      case I_ADD:
+        Ty.template chip<0>(r).device(d) += Tv.template chip<0>(j);
+        break;
+      case I_SUB:
+        Ty.template chip<0>(r).device(d) -= Tv.template chip<0>(j);
+        break;
+    }
+  }
+}
+
+// String type only supports inplace update.
+void DoInplaceStringUpdateOp(const CPUDevice& d, const Tensor& i,
+                             const Tensor& v, Tensor* y) {
+  auto Ti = i.flat<int32>();
+  auto Tv = v.flat_outer_dims<string>();
+  auto Ty = y->flat_outer_dims<string>();
+  auto nrows = Ty.dimension(0);
+  for (int64 j = 0; j < Ti.size(); ++j) {
+    auto r = (Ti(j) % nrows + nrows) % nrows;  // Guard index range.
+    Ty.template chip<0>(r).device(d) = Tv.template chip<0>(j);
+  }
+}
+
+template <>
+Status DoInplace(const CPUDevice& device, InplaceOpType op, const Tensor& i,
+                 const Tensor& v, Tensor* y) {
+  CHECK_EQ(v.dtype(), y->dtype());
+  if (op == I_UPDATE) {
+    if (v.dtype() == DT_STRING) {
+      DoInplaceStringUpdateOp(device, i, v, y);
+      return Status::OK();
+    } else if (v.dtype() == DT_BOOL) {
+      DoInplaceOp<bool>(device, op, i, v, y);
+      return Status::OK();
+    }
+  }
+  switch (v.dtype()) {
+#define CASE(type)                          \
+  case DataTypeToEnum<type>::value:         \
+    DoInplaceOp<type>(device, op, i, v, y); \
+    break;
+    TF_CALL_NUMBER_TYPES(CASE);
+#undef CASE
+    default:
+      return errors::InvalidArgument("Unsupported data type: ", v.dtype());
+  }
+  return Status::OK();
+}
+
+}  // end namespace functor
+
+namespace {
+template <typename Device, functor::InplaceOpType op>
+class InplaceOp : public InplaceOpBase {
+ public:
+  explicit InplaceOp(OpKernelConstruction* ctx) : InplaceOpBase(ctx) {}
+
+ protected:
+  Status DoCompute(OpKernelContext* ctx, const Tensor& i, const Tensor& v,
+                   Tensor* y) override {
+    const auto& d = ctx->eigen_device<Device>();
+    return ::tensorflow::functor::DoInplace(d, op, i, v, y);
+  }
+};
+
+class CopyOpBase : public OpKernel {
+ public:
+  explicit CopyOpBase(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+  void Compute(OpKernelContext* ctx) override {
+    auto x = ctx->input(0);
+    Tensor* y;
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, x.shape(), &y));
+    OP_REQUIRES_OK(ctx, DoCompute(ctx, x, y));
+  }
+
+ protected:
+  virtual Status DoCompute(OpKernelContext* ctx, const Tensor& x,
+                           Tensor* y) = 0;
+};
+
+template <typename Device>
+class CopyOp : public CopyOpBase {
+ public:
+  explicit CopyOp(OpKernelConstruction* ctx) : CopyOpBase(ctx) {}
+
+ protected:
+  Status DoCompute(OpKernelContext* ctx, const Tensor& x, Tensor* y) override {
+    const auto& d = ctx->eigen_device<Device>();
+    return ::tensorflow::functor::DoCopy(d, x, y);
+  }
+};
+
+}  // end namespace
+
+namespace functor {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+
+template <>
+Status DoCopy(const CPUDevice& device, const Tensor& x, Tensor* y) {
+  CHECK_EQ(x.dtype(), y->dtype());
+  switch (x.dtype()) {
+#define CASE(type)                                   \
+  case DataTypeToEnum<type>::value:                  \
+    y->flat<type>().device(device) = x.flat<type>(); \
+    break;
+
+    TF_CALL_NUMBER_TYPES(CASE);
+    TF_CALL_bool(CASE);
+#undef CASE
+    default:
+      return errors::InvalidArgument("Unsupported data type: ", x.dtype());
+  }
+  return Status::OK();
+}
+
+}  // end namespace functor
+
+namespace {
+template <typename Device, typename T>
+class EmptyOp : public OpKernel {
+ public:
+  explicit EmptyOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("init", &init_));
+  }
+
+  void Compute(OpKernelContext* ctx) override {
+    const Tensor& shape = ctx->input(0);
+    OP_REQUIRES(
+        ctx, TensorShapeUtils::IsVector(shape.shape()),
+        errors::InvalidArgument("shape must be a vector of int32, got shape ",
+                                shape.shape().DebugString()));
+    auto dims = shape.flat<int32>();
+    TensorShape out_shape;
+    OP_REQUIRES_OK(ctx, TensorShapeUtils::MakeShape(
+                            reinterpret_cast<const int32*>(dims.data()),
+                            dims.size(), &out_shape));
+    Tensor* out = nullptr;
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, out_shape, &out));
+
+    if (init_) {
+      functor::SetZeroFunctor<Device, T>()(ctx->eigen_device<Device>(),
+                                           out->flat<T>());
+    }
+  }
+
+ private:
+  bool init_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("InplaceUpdate").Device(DEVICE_CPU),
+                        InplaceOp<CPUDevice, functor::I_UPDATE>);
+REGISTER_KERNEL_BUILDER(Name("InplaceAdd").Device(DEVICE_CPU),
+                        InplaceOp<CPUDevice, functor::I_ADD>);
+REGISTER_KERNEL_BUILDER(Name("InplaceSub").Device(DEVICE_CPU),
+                        InplaceOp<CPUDevice, functor::I_SUB>);
+REGISTER_KERNEL_BUILDER(Name("DeepCopy").Device(DEVICE_CPU), CopyOp<CPUDevice>);
+
+#define REGISTER_EMPTY(type, dev)                             \
+  REGISTER_KERNEL_BUILDER(Name("Empty")                       \
+                              .Device(DEVICE_##dev)           \
+                              .HostMemory("shape")            \
+                              .TypeConstraint<type>("dtype"), \
+                          EmptyOp<dev##Device, type>)
+
+REGISTER_EMPTY(float, CPU)
+REGISTER_EMPTY(double, CPU)
+REGISTER_EMPTY(Eigen::half, CPU)
+REGISTER_EMPTY(string, CPU)
+REGISTER_EMPTY(int32, CPU)
+REGISTER_EMPTY(int64, CPU)
+REGISTER_EMPTY(bool, CPU)
+
+#if GOOGLE_CUDA
+
+typedef Eigen::GpuDevice GPUDevice;
+
+#define REGISTER(TYPE)                                                    \
+  REGISTER_KERNEL_BUILDER(                                                \
+      Name("InplaceUpdate").Device(DEVICE_GPU).TypeConstraint<TYPE>("T"), \
+      InplaceOp<GPUDevice, functor::I_UPDATE>);                           \
+  REGISTER_KERNEL_BUILDER(                                                \
+      Name("InplaceAdd").Device(DEVICE_GPU).TypeConstraint<TYPE>("T"),    \
+      InplaceOp<GPUDevice, functor::I_ADD>);                              \
+  REGISTER_KERNEL_BUILDER(                                                \
+      Name("InplaceSub").Device(DEVICE_GPU).TypeConstraint<TYPE>("T"),    \
+      InplaceOp<GPUDevice, functor::I_SUB>);                              \
+  REGISTER_KERNEL_BUILDER(                                                \
+      Name("DeepCopy").Device(DEVICE_GPU).TypeConstraint<TYPE>("T"),      \
+      CopyOp<GPUDevice>);
+
+REGISTER(float);
+REGISTER(double);
+REGISTER(Eigen::half);
+REGISTER(int64);
+
+REGISTER_KERNEL_BUILDER(Name("InplaceUpdate")
+                            .Device(DEVICE_GPU)
+                            .HostMemory("x")
+                            .HostMemory("i")
+                            .HostMemory("v")
+                            .HostMemory("y")
+                            .TypeConstraint<int32>("T"),
+                        InplaceOp<CPUDevice, functor::I_UPDATE>);
+REGISTER_KERNEL_BUILDER(Name("InplaceAdd")
+                            .Device(DEVICE_GPU)
+                            .HostMemory("x")
+                            .HostMemory("i")
+                            .HostMemory("v")
+                            .HostMemory("y")
+                            .TypeConstraint<int32>("T"),
+                        InplaceOp<CPUDevice, functor::I_ADD>);
+REGISTER_KERNEL_BUILDER(Name("InplaceSub")
+                            .Device(DEVICE_GPU)
+                            .HostMemory("x")
+                            .HostMemory("i")
+                            .HostMemory("v")
+                            .HostMemory("y")
+                            .TypeConstraint<int32>("T"),
+                        InplaceOp<CPUDevice, functor::I_SUB>);
+
+REGISTER_KERNEL_BUILDER(Name("DeepCopy")
+                            .Device(DEVICE_GPU)
+                            .HostMemory("x")
+                            .HostMemory("y")
+                            .TypeConstraint<int32>("T"),
+                        CopyOp<CPUDevice>);
+REGISTER_EMPTY(float, GPU);
+REGISTER_EMPTY(double, GPU);
+REGISTER_EMPTY(Eigen::half, GPU);
+REGISTER_EMPTY(int64, GPU);
+
+#endif  // GOOGLE_CUDA
+
 }  // end namespace
 }  // end namespace tensorflow
diff --git a/tensorflow/core/kernels/inplace_ops_functor.h b/tensorflow/core/kernels/inplace_ops_functor.h
index 53529f51653f35f0815bb640ec244e4acccade2a..b806787e91c39d0add8ec6bb386a56d12a3b4b24 100644
--- a/tensorflow/core/kernels/inplace_ops_functor.h
+++ b/tensorflow/core/kernels/inplace_ops_functor.h
@@ -26,6 +26,23 @@ template <typename Device>
 Status DoParallelConcat(const Device& device, const Tensor& value, int32 loc,
                         Tensor* output);
 
+// Inplace update/add/sub values in 'y'. It computes
+//   y[i, :] = v if op is I_UPDATE
+//   y[i, :] += v if op is I_ADD
+//   y[i, :] -= v if op is I_SUB
+// Returns an error if the operation fails.
+enum InplaceOpType {
+  I_UPDATE,  // x = y
+  I_ADD,     // x += y
+  I_SUB,     // x -= y
+};
+template <typename Device>
+Status DoInplace(const Device& device, InplaceOpType op, const Tensor& i,
+                 const Tensor& v, Tensor* y);
+// Copies x into y.
+template <typename Device>
+Status DoCopy(const Device& device, const Tensor& x, Tensor* y);
+
 }  // end namespace functor
 }  // end namespace tensorflow
 
diff --git a/tensorflow/core/kernels/inplace_ops_functor_gpu.cu.cc b/tensorflow/core/kernels/inplace_ops_functor_gpu.cu.cc
index 8467360435af7b2a267a14e62d432808ec39e239..f1616b1ea88c93fc8ce039c8afd0be0d13504317 100644
--- a/tensorflow/core/kernels/inplace_ops_functor_gpu.cu.cc
+++ b/tensorflow/core/kernels/inplace_ops_functor_gpu.cu.cc
@@ -77,6 +77,103 @@ Status DoParallelConcat(const Device& d, const Tensor& value, int32 loc,
   return Status::OK();
 }
 
+template <typename T, InplaceOpType op>
+__global__ void DoInplaceOpKernel(int nthreads, const int64 rows,
+                                  const int64 cols, const int64 n, const T* src,
+                                  const int32* rowids, T* dst) {
+  CUDA_1D_KERNEL_LOOP(idx, nthreads) {
+    int64 r = idx / cols;
+    int64 c = idx % cols;
+    r = (rowids[r] % rows + rows) % rows;  // Guard index range.
+    T* p = dst + r * cols + c;
+    const T* q = src + idx;
+    switch (op) {
+      case I_UPDATE:
+        *p = ldg(q);
+        break;
+      case I_ADD:
+        *p += ldg(q);
+        break;
+      case I_SUB:
+        *p -= ldg(q);
+        break;
+    }
+  }
+}
+
+template <typename T>
+void DoInplaceOp(const Device& d, InplaceOpType op, const Tensor& i,
+                 const Tensor& v, Tensor* y) {
+  const int64 nelem = v.NumElements();
+  CudaLaunchConfig cfg = GetCudaLaunchConfig(nelem, d);
+  auto Ty = y->flat_outer_dims<T>();
+  const int64 nrows = Ty.dimension(0);
+  const int64 ncols = Ty.dimension(1);
+  const int64 n = i.NumElements();
+  const T* src = v.flat<T>().data();
+  // TODO(sjhwang): Check that first dimension fits in int32 range.
+  const int32* rowids = i.flat<int32>().data();
+  T* dst = y->flat<T>().data();
+  switch (op) {
+    case I_UPDATE:
+      DoInplaceOpKernel<T, I_UPDATE>
+          <<<cfg.block_count, cfg.thread_per_block, 0, d.stream()>>>(
+              cfg.virtual_thread_count, nrows, ncols, n, src, rowids, dst);
+      break;
+    case I_ADD:
+      DoInplaceOpKernel<T, I_ADD>
+          <<<cfg.block_count, cfg.thread_per_block, 0, d.stream()>>>(
+              cfg.virtual_thread_count, nrows, ncols, n, src, rowids, dst);
+      break;
+    case I_SUB:
+      DoInplaceOpKernel<T, I_SUB>
+          <<<cfg.block_count, cfg.thread_per_block, 0, d.stream()>>>(
+              cfg.virtual_thread_count, nrows, ncols, n, src, rowids, dst);
+      break;
+  }
+}
+
+template <>
+Status DoInplace(const Device& d, InplaceOpType op, const Tensor& i,
+                 const Tensor& v, Tensor* y) {
+  CHECK_EQ(v.dtype(), y->dtype());
+  switch (v.dtype()) {
+#define CASE(type)                     \
+  case DataTypeToEnum<type>::value:    \
+    DoInplaceOp<type>(d, op, i, v, y); \
+    break;
+
+    CASE(float)
+    CASE(double)
+    CASE(Eigen::half)
+    CASE(int64)
+#undef CASE
+    default:
+      return errors::InvalidArgument("Unsupported data type: ", v.dtype());
+  }
+  return Status::OK();
+}
+
+template <>
+Status DoCopy(const Device& d, const Tensor& x, Tensor* y) {
+  CHECK_EQ(x.dtype(), y->dtype());
+  switch (x.dtype()) {
+#define CASE(type)                              \
+  case DataTypeToEnum<type>::value:             \
+    y->flat<type>().device(d) = x.flat<type>(); \
+    break;
+
+    CASE(float)
+    CASE(double)
+    CASE(Eigen::half)
+    CASE(int64)
+#undef CASE
+    default:
+      return errors::InvalidArgument("Unsupported dtype: ", x.dtype());
+  }
+  return Status::OK();
+}
+
 }  // end namespace functor
 }  // namespace tensorflow
 #endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/list_kernels.cc b/tensorflow/core/kernels/list_kernels.cc
index baf0a4abe48ea0c5a5fed5d7ef3e53925e393b10..9e7786f25e052b0113a8020e3af1e015eae41b8d 100644
--- a/tensorflow/core/kernels/list_kernels.cc
+++ b/tensorflow/core/kernels/list_kernels.cc
@@ -112,6 +112,7 @@ bool TensorList::Decode(const VariantTensorData& data) {
       dims.push_back(scratch);
     }
   }
+  element_shape = PartialTensorShape(dims);
   return true;
 }
 
diff --git a/tensorflow/core/kernels/list_kernels.h b/tensorflow/core/kernels/list_kernels.h
index 9733883001d4ce7888b4893ecb43047b621a3eba..f3bbf3b6e37d0a2852b68a018e9d32ac88f610a7 100644
--- a/tensorflow/core/kernels/list_kernels.h
+++ b/tensorflow/core/kernels/list_kernels.h
@@ -83,7 +83,8 @@ class TensorListStack : public OpKernel {
                                         DataTypeString(l->element_dtype)));
     OP_REQUIRES(c, l->element_shape.IsFullyDefined(),
                 errors::InvalidArgument("Tried to stack elements from a list "
-                                        "with non-fully-defined shape."));
+                                        "with non-fully-defined shape: ",
+                                        l->element_shape.DebugString()));
     if (num_elements_ != -1) {
       OP_REQUIRES(c, l->tensors.size() == num_elements_,
                   errors::InvalidArgument("Operation expected a list with ",
@@ -159,15 +160,13 @@ class TensorListFromTensor : public OpKernel {
       tmp_shape.RemoveDim(0);
       OP_REQUIRES(c, tmp.CopyFrom(tmp, tmp_shape),
                   errors::Unknown("Unexpected shape error."));
-      if (tmp.IsAligned() || !DataTypeCanUseMemcpy(DataTypeToEnum<T>::value)) {
-        output_list.tensors.push_back(tmp);
-      } else {
-        Tensor aligned;
-        OP_REQUIRES_OK(c, c->allocate_temp(tmp.dtype(), tmp.shape(), &aligned));
-        aligned.flat<T>().device(c->eigen_device<Device>()) =
-            tmp.unaligned_flat<T>();
-        output_list.tensors.push_back(aligned);
-      }
+      // TODO(apassos) maybe not always align; but weird compiler bugs seem to
+      // prevent this.
+      Tensor aligned;
+      OP_REQUIRES_OK(c, c->allocate_temp(tmp.dtype(), tmp.shape(), &aligned));
+      aligned.flat<T>().device(c->eigen_device<Device>()) =
+          tmp.unaligned_flat<T>();
+      output_list.tensors.push_back(aligned);
     }
     output_tensor->scalar<Variant>()() = std::move(output_list);
   }
diff --git a/tensorflow/core/kernels/lookup_table_op.cc b/tensorflow/core/kernels/lookup_table_op.cc
index e3872fee0edcae543b9193e0dcf6850d194ef067..57b7798ba04eab5d1a869d4782dfe7d0dc727df4 100644
--- a/tensorflow/core/kernels/lookup_table_op.cc
+++ b/tensorflow/core/kernels/lookup_table_op.cc
@@ -22,6 +22,7 @@ limitations under the License.
 
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/variant.h"
 #include "tensorflow/core/kernels/initializable_lookup_table.h"
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
 #include "tensorflow/core/lib/hash/hash.h"
@@ -62,8 +63,7 @@ class MutableHashTableOfScalars final : public LookupInterface {
     mutex_lock l(mu_);
     for (int64 i = 0; i < key_values.size(); ++i) {
       value_values(i) = gtl::FindWithDefault(
-          table_, SubtleMustCopyUnlessStringOrFloat(key_values(i)),
-          default_val);
+          table_, SubtleMustCopyIfIntegral(key_values(i)), default_val);
     }
 
     return Status::OK();
@@ -78,9 +78,8 @@ class MutableHashTableOfScalars final : public LookupInterface {
       table_.clear();
     }
     for (int64 i = 0; i < key_values.size(); ++i) {
-      gtl::InsertOrUpdate(&table_,
-                          SubtleMustCopyUnlessStringOrFloat(key_values(i)),
-                          SubtleMustCopyUnlessStringOrFloat(value_values(i)));
+      gtl::InsertOrUpdate(&table_, SubtleMustCopyIfIntegral(key_values(i)),
+                          SubtleMustCopyIfIntegral(value_values(i)));
     }
     return Status::OK();
   }
@@ -172,8 +171,8 @@ class MutableHashTableOfTensors final : public LookupInterface {
 
     mutex_lock l(mu_);
     for (int64 i = 0; i < key_values.size(); ++i) {
-      ValueArray* value_vec = gtl::FindOrNull(
-          table_, SubtleMustCopyUnlessStringOrFloat(key_values(i)));
+      ValueArray* value_vec =
+          gtl::FindOrNull(table_, SubtleMustCopyIfIntegral(key_values(i)));
       if (value_vec != nullptr) {
         for (int64 j = 0; j < value_dim; j++) {
           value_values(i, j) = value_vec->at(j);
@@ -203,8 +202,8 @@ class MutableHashTableOfTensors final : public LookupInterface {
         V value = value_values(i, j);
         value_vec.push_back(value);
       }
-      gtl::InsertOrUpdate(
-          &table_, SubtleMustCopyUnlessStringOrFloat(key_values(i)), value_vec);
+      gtl::InsertOrUpdate(&table_, SubtleMustCopyIfIntegral(key_values(i)),
+                          value_vec);
     }
     return Status::OK();
   }
@@ -379,15 +378,14 @@ class MutableDenseHashTable final : public LookupInterface {
           for (int64 j = 0; j < value_size; ++j) {
             // TODO(andreasst): check if we can get rid of SubtleMustCopy
             // here and elsewhere in this file.
-            value_matrix(i, j) = SubtleMustCopyUnlessStringOrFloat(
-                value_buckets_matrix(bucket_index, j));
+            value_matrix(i, j) =
+                SubtleMustCopyIfIntegral(value_buckets_matrix(bucket_index, j));
           }
           break;
         }
         if (IsEqualKey(key_buckets_matrix, bucket_index, empty_key_matrix, 0)) {
           for (int64 j = 0; j < value_size; ++j) {
-            value_matrix(i, j) =
-                SubtleMustCopyUnlessStringOrFloat(default_flat(j));
+            value_matrix(i, j) = SubtleMustCopyIfIntegral(default_flat(j));
           }
           break;
         }
@@ -531,7 +529,7 @@ class MutableDenseHashTable final : public LookupInterface {
         if (IsEqualKey(key_buckets_matrix, bucket_index, key_matrix, i)) {
           for (int64 j = 0; j < value_size; ++j) {
             value_buckets_matrix(bucket_index, j) =
-                SubtleMustCopyUnlessStringOrFloat(value_matrix(i, j));
+                SubtleMustCopyIfIntegral(value_matrix(i, j));
           }
           break;
         }
@@ -539,11 +537,11 @@ class MutableDenseHashTable final : public LookupInterface {
           ++num_entries_;
           for (int64 j = 0; j < key_size; ++j) {
             key_buckets_matrix(bucket_index, j) =
-                SubtleMustCopyUnlessStringOrFloat(key_matrix(i, j));
+                SubtleMustCopyIfIntegral(key_matrix(i, j));
           }
           for (int64 j = 0; j < value_size; ++j) {
             value_buckets_matrix(bucket_index, j) =
-                SubtleMustCopyUnlessStringOrFloat(value_matrix(i, j));
+                SubtleMustCopyIfIntegral(value_matrix(i, j));
           }
           break;
         }
@@ -849,6 +847,7 @@ REGISTER_KERNEL(string, int64);
 REGISTER_KERNEL(int64, string);
 REGISTER_KERNEL(string, bool);
 REGISTER_KERNEL(int64, float);
+REGISTER_KERNEL(int64, Variant);
 
 #undef REGISTER_KERNEL
 
@@ -899,6 +898,7 @@ REGISTER_KERNEL(int64, double);
 REGISTER_KERNEL(string, float);
 REGISTER_KERNEL(string, bool);
 REGISTER_KERNEL(int64, bool);
+REGISTER_KERNEL(int64, Variant);
 
 #undef REGISTER_KERNEL
 
diff --git a/tensorflow/core/kernels/lookup_table_op.h b/tensorflow/core/kernels/lookup_table_op.h
index 3657fd5b6a36360f6229374e66cd9002b0f24486..29a0cc91fe01e4e2fcede3f2f28c59ba7e89607d 100644
--- a/tensorflow/core/kernels/lookup_table_op.h
+++ b/tensorflow/core/kernels/lookup_table_op.h
@@ -125,19 +125,21 @@ namespace lookup {
 // integral types. However non-integer variables are not allowed and therefore
 // the local copy is unnecessary.
 template <typename T>
-T SubtleMustCopyUnlessStringOrFloat(const T& value) {
+T SubtleMustCopyIfIntegral(const T& value) {
   return internal::SubtleMustCopy(value);
 }
 
-inline const string& SubtleMustCopyUnlessStringOrFloat(const string& value) {
+inline const string& SubtleMustCopyIfIntegral(const string& value) {
   return value;
 }
 
-inline const float SubtleMustCopyUnlessStringOrFloat(const float value) {
+inline const float SubtleMustCopyIfIntegral(const float value) { return value; }
+
+inline const double SubtleMustCopyIfIntegral(const double value) {
   return value;
 }
 
-inline const double SubtleMustCopyUnlessStringOrFloat(const double value) {
+inline const Variant& SubtleMustCopyIfIntegral(const Variant& value) {
   return value;
 }
 
@@ -204,8 +206,8 @@ class HashTable : public InitializableLookupTable {
     const auto key_values = keys.flat<K>();
     const auto value_values = values.flat<V>();
     for (int64 i = 0; i < key_values.size(); ++i) {
-      const K key = SubtleMustCopyUnlessStringOrFloat(key_values(i));
-      const V value = SubtleMustCopyUnlessStringOrFloat(value_values(i));
+      const K key = SubtleMustCopyIfIntegral(key_values(i));
+      const V value = SubtleMustCopyIfIntegral(value_values(i));
       const V& previous_value = gtl::LookupOrInsert(table_.get(), key, value);
       if (previous_value != value) {
         return errors::FailedPrecondition(
@@ -224,8 +226,7 @@ class HashTable : public InitializableLookupTable {
 
     for (int64 i = 0; i < key_values.size(); ++i) {
       value_values(i) = gtl::FindWithDefault(
-          *table_, SubtleMustCopyUnlessStringOrFloat(key_values(i)),
-          default_val);
+          *table_, SubtleMustCopyIfIntegral(key_values(i)), default_val);
     }
     return Status::OK();
   }
diff --git a/tensorflow/core/kernels/lookup_util.cc b/tensorflow/core/kernels/lookup_util.cc
index 27031d9216129b842195993279f6d6c2acf7fb5f..77386a16e01352a7691c744ee882c5c6e1b0d5d9 100644
--- a/tensorflow/core/kernels/lookup_util.cc
+++ b/tensorflow/core/kernels/lookup_util.cc
@@ -101,9 +101,10 @@ class TextFileLineIterator
     string line;
     status_ = input_buffer_->ReadLine(&line);
     if (!status_.ok()) {
-      if (errors::IsOutOfRange(status_) && next_id_ != total_size()) {
+      if (errors::IsOutOfRange(status_) && vocab_size_ != -1 &&
+          next_id_ != vocab_size_) {
         status_ = errors::InvalidArgument("Invalid vocab_size in ", filename_,
-                                          ": expected ", total_size(),
+                                          ": expected ", vocab_size_,
                                           " but got ", next_id_);
       }
       valid_ = false;
diff --git a/tensorflow/core/kernels/maxpooling_op.cc b/tensorflow/core/kernels/maxpooling_op.cc
index 9be7408012bb81e80c73c29a6ee9bb6763c04490..23176b85778e60a0923a42c2cfcb46799050b022 100644
--- a/tensorflow/core/kernels/maxpooling_op.cc
+++ b/tensorflow/core/kernels/maxpooling_op.cc
@@ -38,6 +38,7 @@ limitations under the License.
 #include "tensorflow/core/util/padding.h"
 #include "tensorflow/core/util/tensor_format.h"
 #include "tensorflow/core/util/use_cudnn.h"
+#include "tensorflow/core/kernels/bounds_check.h"
 
 #if GOOGLE_CUDA
 #include "tensorflow/core/kernels/maxpooling_op_gpu.h"
@@ -56,7 +57,7 @@ template <typename Device, typename T>
 static void SpatialMaxPoolWithArgMaxHelper(
     OpKernelContext* context, Tensor* output, Tensor* output_arg_max,
     Tensor* input_backprop, const Tensor& tensor_in, const Tensor& out_backprop,
-    const PoolParameters& params, const Padding& padding) {
+    const PoolParameters& params) {
   typedef Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
       ConstEigenMatrixMap;
   typedef Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
@@ -151,7 +152,7 @@ static void SpatialMaxPoolWithArgMaxHelper(
       }
     }
 
-    {
+    if (input_backprop != nullptr) {
       auto input_backprop_flat = input_backprop->flat<T>();
       auto out_arg_max_flat = output_arg_max->flat<int64>();
       auto out_backprop_flat = out_backprop.flat<T>();
@@ -173,9 +174,8 @@ static void SpatialMaxPoolWithArgMaxHelper(
         // Although this check is in the inner loop, it is worth its value
         // so we don't end up with memory corruptions. Our benchmark shows that
         // the performance impact is quite small
-        CHECK(input_backprop_index >= in_start && input_backprop_index < in_end)
-            << "Invalid input backprop index: " << input_backprop_index << ", "
-            << in_start << ", " << in_end;
+        //CHECK(input_backprop_index >= in_start && input_backprop_index < in_end)
+        FastBoundsCheck(input_backprop_index - in_start, in_end - in_start);
         input_backprop_flat(input_backprop_index) += out_backprop_flat(index);
       }
     }
@@ -293,7 +293,7 @@ class MaxPoolingGradOp : public OpKernel {
 
     SpatialMaxPoolWithArgMaxHelper<CPUDevice, T>(
         context, &tensor_out_dup, &tensor_out_arg_max, output, tensor_in,
-        out_backprop, params, padding_);
+        out_backprop, params);
   }
 
  private:
@@ -869,6 +869,17 @@ class MaxPoolingNoMaskV2Op : public OpKernel {
 template <typename Device, typename T>
 struct LaunchMaxPoolingWithArgmax;
 
+template <typename T>
+struct LaunchMaxPoolingWithArgmax<CPUDevice, T> {
+  static void launch(OpKernelContext* context, const PoolParameters& params,
+                     const Tensor& input, Tensor* output, Tensor* argmax,
+                     bool propogate_nans) {
+    Tensor unused;
+    SpatialMaxPoolWithArgMaxHelper<CPUDevice, T>(
+        context, output, argmax, nullptr, input, unused, params);
+  }
+};
+
 template <typename Device, typename T>
 class MaxPoolingWithArgmaxOp : public OpKernel {
  public:
@@ -921,6 +932,53 @@ class MaxPoolingWithArgmaxOp : public OpKernel {
 template <typename Device, typename T>
 struct LaunchMaxPoolingGradWithArgmax;
 
+template <typename T>
+struct LaunchMaxPoolingGradWithArgmax<CPUDevice, T> {
+  typedef Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
+      EigenMatrixMap;
+
+  static void launch(OpKernelContext* context, const PoolParameters& params,
+                     const Tensor& grad_in, const Tensor& argmax,
+                     Tensor* grad_out) {
+    const DeviceBase::CpuWorkerThreads& worker_threads =
+        *(context->device()->tensorflow_cpu_worker_threads());
+
+    auto shard = [&grad_in, &argmax, &grad_out](int64 start, int64 limit) {
+      const int64 batch_size =
+          GetTensorDim(grad_out->shape(), FORMAT_NHWC, 'N');
+      const int64 output_size_per_batch = grad_out->NumElements() / batch_size;
+      const int64 input_size_per_batch = grad_in.NumElements() / batch_size;
+
+      {
+        auto grad_out_flat = grad_out->flat<T>();
+        auto argmax_flat = argmax.flat<int64>();
+        auto grad_in_flat = grad_in.flat<T>();
+
+        const int64 output_start = start * output_size_per_batch;
+        const int64 output_end = limit * output_size_per_batch;
+        EigenMatrixMap inputShard(grad_out_flat.data() + output_start, 1,
+                                  output_end - output_start);
+        inputShard.setConstant(T(0));
+
+        const int input_start = start * input_size_per_batch;
+        const int input_end = limit * input_size_per_batch;
+        for (int64 index = input_start; index < input_end; index++) {
+          const int64 grad_out_index = argmax_flat(index);
+          CHECK(grad_out_index >= output_start && grad_out_index < output_end)
+              << "Invalid output gradient index: " << grad_out_index << ", "
+              << output_start << ", " << output_end;
+          grad_out_flat(grad_out_index) += grad_in_flat(index);
+        }
+      }
+    };
+
+    const int64 batch_size = GetTensorDim(grad_out->shape(), FORMAT_NHWC, 'N');
+    const int64 shard_cost = grad_out->NumElements() / batch_size;
+    Shard(worker_threads.num_threads, worker_threads.workers, batch_size,
+          shard_cost, shard);
+  }
+};
+
 template <typename Device, typename T>
 class MaxPoolingGradWithArgmaxOp : public OpKernel {
  public:
@@ -1309,7 +1367,17 @@ struct LaunchMaxPoolingGradGradWithArgmax<Eigen::GpuDevice, T> {
                               .HostMemory("ksize")                       \
                               .HostMemory("strides")                     \
                               .TypeConstraint<T>("T"),                   \
-                          MaxPoolingGradGradOp<D##Device, T>);
+                          MaxPoolingGradGradOp<D##Device, T>)            \
+  REGISTER_KERNEL_BUILDER(Name("MaxPoolWithArgmax")                      \
+                              .Device(DEVICE_##D)                        \
+                              .TypeConstraint<int64>("Targmax")          \
+                              .TypeConstraint<T>("T"),                   \
+                          MaxPoolingWithArgmaxOp<D##Device, T>);         \
+  REGISTER_KERNEL_BUILDER(Name("MaxPoolGradWithArgmax")                  \
+                              .Device(DEVICE_##D)                        \
+                              .TypeConstraint<T>("T")                    \
+                              .TypeConstraint<int64>("Targmax"),         \
+                          MaxPoolingGradWithArgmaxOp<D##Device, T>);
 
 // Below kernels implemented only for CPU device.
 #define REGISTER_CPU_ONLY_POOL_KERNELS(T)                          \
@@ -1374,16 +1442,6 @@ TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_MAX_POOL_KERNELS);
                               .HostMemory("strides")                 \
                               .TypeConstraint<T>("T"),               \
                           MaxPoolingNoMaskV2Op<GPUDevice, T>);       \
-  REGISTER_KERNEL_BUILDER(Name("MaxPoolWithArgmax")                  \
-                              .Device(DEVICE_GPU)                    \
-                              .TypeConstraint<int64>("Targmax")      \
-                              .TypeConstraint<T>("T"),               \
-                          MaxPoolingWithArgmaxOp<GPUDevice, T>);     \
-  REGISTER_KERNEL_BUILDER(Name("MaxPoolGradWithArgmax")              \
-                              .Device(DEVICE_GPU)                    \
-                              .TypeConstraint<T>("T")                \
-                              .TypeConstraint<int64>("Targmax"),     \
-                          MaxPoolingGradWithArgmaxOp<GPUDevice, T>); \
   REGISTER_KERNEL_BUILDER(Name("MaxPoolGradGradWithArgmax")          \
                               .Device(DEVICE_GPU)                    \
                               .TypeConstraint<T>("T")                \
diff --git a/tensorflow/core/kernels/mkl_input_conversion_op.cc b/tensorflow/core/kernels/mkl_input_conversion_op.cc
index d91f7107c5b1effdfa6c4c3b95b16bcf31750f42..68d3e1c9abde59d12a66d56896cc2e262794f756 100644
--- a/tensorflow/core/kernels/mkl_input_conversion_op.cc
+++ b/tensorflow/core/kernels/mkl_input_conversion_op.cc
@@ -263,21 +263,18 @@ class MklInputConversionOp : public OpKernel {
 
  private:
   void Compute(OpKernelContext* context) override {
-    const Tensor& input_tensor_0 = MklGetInput(context, 0);
+    const int kInputIndex_0 = 0, kInputIndex_1 = 1;
+    const Tensor& input_tensor_0 = MklGetInput(context, kInputIndex_0);
     MklDnnShape input_shape_0;
-    GetMklShape(context, 0, &input_shape_0);
+    GetMklShape(context, kInputIndex_0, &input_shape_0);
 
-    const Tensor& input_tensor_1 = MklGetInput(context, 1);
+    const Tensor& input_tensor_1 = MklGetInput(context, kInputIndex_1);
     MklDnnShape input_shape_1;
-    GetMklShape(context, 1, &input_shape_1);
-
-    bool tf_shapes_are_same =
-        context->input(0).shape() == context->input(1).shape();
+    GetMklShape(context, kInputIndex_1, &input_shape_1);
 
-    VLOG(1) << "MklInputConversionOp: Input shapes are "
-            << (tf_shapes_are_same ? "*same*" : "*different*") << ": "
-            << context->input(0).shape().DebugString() << " and "
-            << context->input(1).shape().DebugString();
+    VLOG(1) << "MklInputConversionOp: Input shapes are: "
+            << context->input(kInputIndex_0).shape().DebugString() << " and "
+            << context->input(kInputIndex_1).shape().DebugString();
 
     // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
     // if both inputs are in TF format, just copy input tensors to output.
@@ -285,15 +282,19 @@ class MklInputConversionOp : public OpKernel {
       VLOG(1) << "MklInputConversionOp: No conversion needed, "
               << "copying TF inputs to output";
 
-      ForwardTfTensorInToOut(context, 0, 0);
-      ForwardTfTensorInToOut(context, 1, 1);
+      ForwardTfTensorInToOut(context, kInputIndex_0, kInputIndex_0);
+      ForwardTfTensorInToOut(context, kInputIndex_1, kInputIndex_1);
       return;
     }
 
     // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
     // If both inputs are in MKL format
     if (input_shape_0.IsMklTensor() && input_shape_1.IsMklTensor()) {
-      if (tf_shapes_are_same) {
+      // It is safer to compare the original TensorFlow shapes than to compare
+      // Mkl shapes since element wise ops are forwarded to Eigen implementation.
+      TensorShape tf_shape0 = input_shape_0.GetTfShape();
+      TensorShape tf_shape1 = input_shape_1.GetTfShape();
+      if (tf_shape0 == tf_shape1) {
         auto input0_md = input_shape_0.GetMklLayout();
         auto input1_md = input_shape_1.GetMklLayout();
 
@@ -302,8 +303,8 @@ class MklInputConversionOp : public OpKernel {
           VLOG(1) << "MklInputConversionOp: No conversion needed, "
                   << "copying MKL inputs with identical shapes to output";
 
-          ForwardMklTensorInToOut(context, 0, 0);
-          ForwardMklTensorInToOut(context, 1, 1);
+          ForwardMklTensorInToOut(context, kInputIndex_0, kInputIndex_0);
+          ForwardMklTensorInToOut(context, kInputIndex_1, kInputIndex_1);
           return;
         } else {
           VLOG(1) << "MklInputConversionOp: Shape is same, but format is "
@@ -324,7 +325,7 @@ class MklInputConversionOp : public OpKernel {
           mkl_output_mkl_shape.SetMklLayout(&input1_md);
 
           // Create output Mkl tensor for index 0
-          AllocateOutputSetMklShape(context, 0, &tensor_out,
+          AllocateOutputSetMklShape(context, kInputIndex_0, &tensor_out,
                                     input_tensor_0.shape(),
                                     mkl_output_mkl_shape);
 
@@ -342,7 +343,7 @@ class MklInputConversionOp : public OpKernel {
           stream(stream::kind::eager).submit(net).wait();
 
           // Input1 will be passed through
-          ForwardMklTensorInToOut(context, 1, 1);
+          ForwardMklTensorInToOut(context, kInputIndex_1, kInputIndex_1);
           return;
         }
       }
@@ -361,11 +362,11 @@ class MklInputConversionOp : public OpKernel {
               << "converted MKL inputs to TF format";
 
       MklToTfOp<Device, T>::ConvertMklToTf(this, context, data_format_str,
-                                           op_data_type, has_avx512f_, 0);
+                                           op_data_type, has_avx512f_, kInputIndex_0);
       MklToTfOp<Device, T>::ConvertMklToTf(this, context, data_format_str,
-                                           op_data_type, has_avx512f_, 1);
-      SetDummyMklShapeOutput(context, 0);
-      SetDummyMklShapeOutput(context, 1);
+                                           op_data_type, has_avx512f_, kInputIndex_1);
+      SetDummyMklShapeOutput(context, kInputIndex_0);
+      SetDummyMklShapeOutput(context, kInputIndex_1);
       return;
     }
 
@@ -377,7 +378,6 @@ class MklInputConversionOp : public OpKernel {
     const Tensor* mkl_tensor;
     const MklDnnShape* mkl_shape;
     const Tensor* tf_tensor;
-    MklDnnShape* tf_mkl_shape;
     uint mkl_tensor_index;
     uint tf_tensor_index;
     if (input_shape_0.IsMklTensor() && !input_shape_1.IsMklTensor()) {
@@ -385,14 +385,12 @@ class MklInputConversionOp : public OpKernel {
       mkl_shape = &input_shape_0;
       mkl_tensor_index = 0;
       tf_tensor = &input_tensor_1;
-      tf_mkl_shape = &input_shape_1;
       tf_tensor_index = 1;
     } else if (!input_shape_0.IsMklTensor() && input_shape_1.IsMklTensor()) {
       mkl_tensor = &input_tensor_1;
       mkl_shape = &input_shape_1;
       mkl_tensor_index = 1;
       tf_tensor = &input_tensor_0;
-      tf_mkl_shape = &input_shape_0;
       tf_tensor_index = 0;
     } else {
       CHECK(false) << "MklInputConversionOp: Unexpected combination of input "
@@ -466,8 +464,8 @@ class MklInputConversionOp : public OpKernel {
     }
 
     VLOG(1) << "MklInputConversionOp: Shapes (output): "
-            << context->mutable_output(0)->shape().DebugString() << " and "
-            << context->mutable_output(1)->shape().DebugString();
+            << context->mutable_output(kInputIndex_0)->shape().DebugString() << " and "
+            << context->mutable_output(kInputIndex_1)->shape().DebugString();
 
     VLOG(1) << "MklInputConversion completed successfully.";
   }
diff --git a/tensorflow/core/kernels/mkl_softmax_op.cc b/tensorflow/core/kernels/mkl_softmax_op.cc
index 170523b5b455aa3c49db7c4b93db279b37d47252..f79e18cff29de5682ac2db445160d9346425414f 100644
--- a/tensorflow/core/kernels/mkl_softmax_op.cc
+++ b/tensorflow/core/kernels/mkl_softmax_op.cc
@@ -102,7 +102,7 @@ class MklSoftmaxOp : public OpKernel {
       // Softmax MklDnn output layout is same as input layout.
       auto dst_pd = src.GetUsrMemPrimDesc();
 
-      // if input is MKL shape, ouput is also MKL shape.
+      // if input is MKL shape, output is also MKL shape.
       // if input is TF shape, output is also TF shape
       if (src_mkl_shape.IsMklTensor()) {
         output_mkl_shape.SetMklTensor(true);
diff --git a/tensorflow/core/kernels/neon/BUILD b/tensorflow/core/kernels/neon/BUILD
index c3d24e50effb3fe5184e264064393a7f339105f0..313d40c082b3e334a01ba97eaf4449e1940b013a 100644
--- a/tensorflow/core/kernels/neon/BUILD
+++ b/tensorflow/core/kernels/neon/BUILD
@@ -12,18 +12,6 @@ load(
     "tf_kernel_library",
 )
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 tf_kernel_library(
     name = "neon_depthwise_conv_op",
     hdrs = [
diff --git a/tensorflow/core/kernels/non_max_suppression_op_test.cc b/tensorflow/core/kernels/non_max_suppression_op_test.cc
index 67d9217b9502a30f5727b6a91fbf36da872ab972..9387fb13bc252418d247b25464c67b90a864901d 100644
--- a/tensorflow/core/kernels/non_max_suppression_op_test.cc
+++ b/tensorflow/core/kernels/non_max_suppression_op_test.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/core/kernels/ops_testutil.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
@@ -147,7 +148,7 @@ TEST_F(NonMaxSuppressionOpTest, TestInconsistentBoxAndScoreShapes) {
 
   ASSERT_FALSE(s.ok());
   EXPECT_TRUE(
-      StringPiece(s.ToString()).contains("scores has incompatible shape"))
+      str_util::StrContains(s.ToString(), "scores has incompatible shape"))
       << s;
 }
 
@@ -160,7 +161,7 @@ TEST_F(NonMaxSuppressionOpTest, TestInvalidIOUThreshold) {
 
   ASSERT_FALSE(s.ok());
   EXPECT_TRUE(
-      StringPiece(s.ToString()).contains("iou_threshold must be in [0, 1]"))
+      str_util::StrContains(s.ToString(), "iou_threshold must be in [0, 1]"))
       << s;
 }
 
@@ -308,7 +309,7 @@ TEST_F(NonMaxSuppressionV2OpTest, TestInconsistentBoxAndScoreShapes) {
 
   ASSERT_FALSE(s.ok());
   EXPECT_TRUE(
-      StringPiece(s.ToString()).contains("scores has incompatible shape"))
+      str_util::StrContains(s.ToString(), "scores has incompatible shape"))
       << s;
 }
 
@@ -322,7 +323,7 @@ TEST_F(NonMaxSuppressionV2OpTest, TestInvalidIOUThreshold) {
 
   ASSERT_FALSE(s.ok());
   EXPECT_TRUE(
-      StringPiece(s.ToString()).contains("iou_threshold must be in [0, 1]"))
+      str_util::StrContains(s.ToString(), "iou_threshold must be in [0, 1]"))
       << s;
 }
 
diff --git a/tensorflow/core/kernels/quantize_and_dequantize_op_test.cc b/tensorflow/core/kernels/quantize_and_dequantize_op_test.cc
index 5ffcc7d65d1c83580adafe8761a4f5d8227e4327..e41df12d914d3971c0a577eb4ff4ee1b74ac8a46 100644
--- a/tensorflow/core/kernels/quantize_and_dequantize_op_test.cc
+++ b/tensorflow/core/kernels/quantize_and_dequantize_op_test.cc
@@ -29,6 +29,7 @@ limitations under the License.
 #include "tensorflow/core/framework/types.pb.h"
 #include "tensorflow/core/kernels/ops_testutil.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test_benchmark.h"
 
 namespace tensorflow {
@@ -379,8 +380,8 @@ TEST_F(QuantizeAndDequantizeTest, Invalid_range_given) {
   AddInputFromArray<float>(TensorShape({}), {0.0});  // Max
 
   Status s = RunOpKernel();
-  EXPECT_TRUE(StringPiece(s.ToString())
-                  .contains("Invalid range: input_min 1 > input_max 0"))
+  EXPECT_TRUE(str_util::StrContains(s.ToString(),
+                                    "Invalid range: input_min 1 > input_max 0"))
       << s;
 }
 
@@ -401,8 +402,8 @@ TEST_F(QuantizeAndDequantizeTest, Invalid_range_given_V3) {
   AddInputFromArray<int32>(TensorShape({}), {8});    // num_bits
 
   Status s = RunOpKernel();
-  EXPECT_TRUE(StringPiece(s.ToString())
-                  .contains("Invalid range: input_min 1 > input_max 0"))
+  EXPECT_TRUE(str_util::StrContains(s.ToString(),
+                                    "Invalid range: input_min 1 > input_max 0"))
       << s;
 }
 
diff --git a/tensorflow/core/kernels/queue_op.h b/tensorflow/core/kernels/queue_op.h
index ad606803ee7017380b33819dca7718023daa3900..6c19f9841cdd886a614e537d75cefee4c2e892d8 100644
--- a/tensorflow/core/kernels/queue_op.h
+++ b/tensorflow/core/kernels/queue_op.h
@@ -43,6 +43,7 @@ class QueueOp : public ResourceOpKernel<QueueInterface> {
 
   void Compute(OpKernelContext* context) override {
     ResourceOpKernel<QueueInterface>::Compute(context);
+    mutex_lock l(mu_);
     if (resource_ && context->track_allocations()) {
       context->record_persistent_memory_allocation(resource_->MemoryUsed());
     }
diff --git a/tensorflow/core/kernels/reduction_gpu_kernels.cu.h b/tensorflow/core/kernels/reduction_gpu_kernels.cu.h
index 9237fa51d885c633675146191dc384dd87d8ab22..0de2ebb5907caa13e0c1b2a4e11d218bd9701bae 100644
--- a/tensorflow/core/kernels/reduction_gpu_kernels.cu.h
+++ b/tensorflow/core/kernels/reduction_gpu_kernels.cu.h
@@ -244,6 +244,33 @@ __global__ void RowReduceKernel(
   if (row < num_rows && lane == 0) out[row] = sum;
 }
 
+template <typename T1>
+struct storage_type {
+  T1 val;
+  __host__ __device__ storage_type() {}
+  __host__ __device__ operator T1() { return val; }
+  __host__ __device__ storage_type<T1>& operator=(const T1& in) {
+    val = in;
+    return *this;
+  }
+};
+
+template <typename T2>
+struct storage_type<std::complex<T2>> {
+  T2 real;
+  T2 imag;
+  __host__ __device__ storage_type() {}
+  __host__ __device__ operator std::complex<T2>() {
+    return std::complex<T2>(real, imag);
+  }
+  __host__ __device__ storage_type<std::complex<T2>>& operator=(
+      const std::complex<T2>& in) {
+    real = in.real();
+    imag = in.imag();
+    return *this;
+  }
+};
+
 // Works only if there are <= 16 columns
 // each warps sums over multiple rows at once
 template <typename T, typename outT, typename Op>
@@ -268,7 +295,7 @@ __global__ void ColumnReduceMax16ColumnsKernel(
 
   // 1D array necessary due to bug in CUDA 9 compiler.
   // TODO(nluehr) revert to 2D array when compiler is ready.
-  __shared__ value_type partial_sums[32 * 33];
+  __shared__ storage_type<value_type> partial_sums[32 * 33];
 
   row += rows_per_warp * gridDim.y * blockDim.y;
   for (; row < num_rows; row += rows_per_warp * gridDim.y * blockDim.y) {
@@ -294,7 +321,8 @@ __global__ void ColumnReduceMax16ColumnsKernel(
 
     if (blockDim.y > 1) {
       for (int row = 1; row < blockDim.y; ++row) {
-        s = op(s, partial_sums[threadIdx.x * 33 + row]);
+        value_type t = partial_sums[threadIdx.x * 33 + row];
+        s = op(s, t);
       }
     }
 
@@ -316,7 +344,7 @@ __global__ void ColumnReduceKernel(
 
   // 1D array necessary due to bug in CUDA 9 compiler.
   // TODO(nluehr) revert to 2D array when compiler is ready.
-  __shared__ value_type partial_sums[32 * 33];
+  __shared__ storage_type<value_type> partial_sums[32 * 33];
 
   row += gridDim.y * blockDim.y;
 
@@ -347,7 +375,8 @@ __global__ void ColumnReduceKernel(
         min(blockDim.y, num_rows - blockIdx.y * blockDim.y);
 
     for (int row = 1; row < numRowsThisBlock; ++row) {
-      s = op(s, partial_sums[threadIdx.x * 33 + row]);
+      value_type t = partial_sums[threadIdx.x * 33 + row];
+      s = op(s, t);
     }
 
     out[col * gridDim.y + blockIdx.y] = s;
diff --git a/tensorflow/core/kernels/remote_fused_graph_rewriter_transform_test.cc b/tensorflow/core/kernels/remote_fused_graph_rewriter_transform_test.cc
index d5b37b1ce1279ef69f103bf2968801b641dfc1f5..9217c25978c9903390f2a5d84b3731b507fcc17d 100644
--- a/tensorflow/core/kernels/remote_fused_graph_rewriter_transform_test.cc
+++ b/tensorflow/core/kernels/remote_fused_graph_rewriter_transform_test.cc
@@ -27,6 +27,7 @@ limitations under the License.
 #include "tensorflow/core/kernels/remote_fused_graph_execute_op_test_utils.h"
 #include "tensorflow/core/kernels/remote_fused_graph_execute_utils.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/public/session.h"
 #include "tensorflow/tools/graph_transforms/transform_utils.h"
@@ -181,7 +182,7 @@ class FuseRemoteGraphMultipleAddOpsRewriterTest : public ::testing::Test {
     int cluster_count = 0;
     for (const NodeDef& node_def : output_graph_def_.node()) {
       const string& name = node_def.name();
-      if (StringPiece(name).starts_with(REMOTE_FUSED_GRAPH_NODE_NAME)) {
+      if (str_util::StartsWith(name, REMOTE_FUSED_GRAPH_NODE_NAME)) {
         ++cluster_count;
         RemoteFusedGraphExecuteInfo info;
         string serialized_proto;
diff --git a/tensorflow/core/kernels/resize_bicubic_op_test.cc b/tensorflow/core/kernels/resize_bicubic_op_test.cc
index 25a37d5e1af5835d56dedb50922967704500ad46..c23570d885be2419a8e045633a31250e0b99703d 100644
--- a/tensorflow/core/kernels/resize_bicubic_op_test.cc
+++ b/tensorflow/core/kernels/resize_bicubic_op_test.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include "tensorflow/core/framework/node_def_builder.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
 
@@ -218,9 +219,8 @@ TEST_F(ResizeBicubicOpTest, TestBicubic2x2To0x0) {
   AddInputFromArray<int32>(TensorShape({2}), {0, 0});
 
   Status s = RunOpKernel();
-  EXPECT_TRUE(
-      StringPiece(s.ToString())
-          .contains("Invalid argument: output dimensions must be positive"))
+  EXPECT_TRUE(str_util::StrContains(
+      s.ToString(), "Invalid argument: output dimensions must be positive"))
       << s;
 }
 
diff --git a/tensorflow/core/kernels/resize_bilinear_op_test.cc b/tensorflow/core/kernels/resize_bilinear_op_test.cc
index a920e6028104b69799e29e1fd9526baa9d85fc08..6d57892828593e30a0da5ea90b01b6742a71019f 100644
--- a/tensorflow/core/kernels/resize_bilinear_op_test.cc
+++ b/tensorflow/core/kernels/resize_bilinear_op_test.cc
@@ -25,6 +25,7 @@ limitations under the License.
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/random/random.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
@@ -457,9 +458,8 @@ TEST_F(ResizeBilinearOpTest, TestInvalidOutputSize) {
   AddInputFromArray<float>(TensorShape({1, 2, 2, 1}), {1, 2, 3, 4});
   AddInputFromArray<int32>(TensorShape({2}), {0, 0});
   Status s = RunOpKernel();
-  EXPECT_TRUE(
-      StringPiece(s.ToString())
-          .contains("Invalid argument: output dimensions must be positive"))
+  EXPECT_TRUE(str_util::StrContains(
+      s.ToString(), "Invalid argument: output dimensions must be positive"))
       << s;
 }
 
@@ -467,8 +467,8 @@ TEST_F(ResizeBilinearOpTest, TestInvalidInputShape) {
   AddInputFromArray<float>(TensorShape({2, 2, 1}), {1, 2, 3, 4});
   AddInputFromArray<int32>(TensorShape({2}), {4, 4});
   Status s = RunOpKernel();
-  EXPECT_TRUE(StringPiece(s.ToString())
-                  .contains("Invalid argument: input must be 4-dimensional"))
+  EXPECT_TRUE(str_util::StrContains(
+      s.ToString(), "Invalid argument: input must be 4-dimensional"))
       << s;
 }
 
@@ -476,8 +476,8 @@ TEST_F(ResizeBilinearOpTest, TestInvalidSizeDim) {
   AddInputFromArray<float>(TensorShape({1, 2, 2, 1}), {1, 2, 3, 4});
   AddInputFromArray<int32>(TensorShape({2, 1}), {4, 4});
   Status s = RunOpKernel();
-  EXPECT_TRUE(StringPiece(s.ToString())
-                  .contains("Invalid argument: shape_t must be 1-dimensional"))
+  EXPECT_TRUE(str_util::StrContains(
+      s.ToString(), "Invalid argument: shape_t must be 1-dimensional"))
       << s;
 }
 
@@ -485,8 +485,8 @@ TEST_F(ResizeBilinearOpTest, TestInvalidSizeElements) {
   AddInputFromArray<float>(TensorShape({1, 2, 2, 1}), {1, 2, 3, 4});
   AddInputFromArray<int32>(TensorShape({3}), {4, 4, 1});
   Status s = RunOpKernel();
-  EXPECT_TRUE(StringPiece(s.ToString())
-                  .contains("Invalid argument: shape_t must have two elements"))
+  EXPECT_TRUE(str_util::StrContains(
+      s.ToString(), "Invalid argument: shape_t must have two elements"))
       << s;
 }
 
diff --git a/tensorflow/core/kernels/resource_variable_ops.cc b/tensorflow/core/kernels/resource_variable_ops.cc
index e134e476f60718bae538b119ad90bd884b77639c..72504200ccbb3f4236b148ea0aeb8f83e91c4e74 100644
--- a/tensorflow/core/kernels/resource_variable_ops.cc
+++ b/tensorflow/core/kernels/resource_variable_ops.cc
@@ -250,8 +250,9 @@ class AssignVariableOp : public OpKernel {
 
     // Copying is unnecessary if we are the last user of the value
     // tensor, we can just adopt the input tensor's buffer instead.
-    std::unique_ptr<Tensor> input_alias =
-        context->forward_input(1, dtype_, value.shape(), DEVICE_MEMORY, attr);
+    std::unique_ptr<Tensor> input_alias = context->forward_input(
+        1, OpKernelContext::Params::kNoReservation /*output_index*/, dtype_,
+        value.shape(), DEVICE_MEMORY, attr);
     mutex_lock ml(*variable->mu());
     variable->is_initialized = true;
     if (input_alias) {
@@ -278,64 +279,6 @@ class AssignVariableOp : public OpKernel {
   DataType dtype_;
 };
 
-template <typename Device>
-Status VariantCopyFn(OpKernelContext* context, const Tensor& from, Tensor* to);
-
-#define CPU_DENSE_COPY(T)                                                \
-  case DataTypeToEnum<T>::value: {                                       \
-    functor::DenseUpdate<CPUDevice, T, ASSIGN> copy_functor_;            \
-    copy_functor_(context->eigen_device<CPUDevice>(), tensor->flat<T>(), \
-                  from.flat<T>());                                       \
-    break;                                                               \
-  }
-
-#define INSTANTIATE_GET_VARIANT_COPY_FN(Device, TYPE_CALLER, TYPE_DENSE_COPY) \
-  template <>                                                                 \
-  Status VariantCopyFn<Device>(OpKernelContext * context, const Tensor& from, \
-                               Tensor* to) {                                  \
-    PersistentTensor tmp;                                                     \
-    Tensor* tensor;                                                           \
-    AllocatorAttributes attr;                                                 \
-    attr.set_gpu_compatible(true);                                            \
-    attr.set_nic_compatible(true);                                            \
-    TF_RETURN_IF_ERROR(context->allocate_persistent(                          \
-        from.dtype(), from.shape(), &tmp, &tensor, attr));                    \
-    switch (from.dtype()) {                                                   \
-      TYPE_CALLER(TYPE_DENSE_COPY);                                           \
-      default:                                                                \
-        return errors::InvalidArgument(                                       \
-            "VariantCopyFn: Could not perform a deep copy of variant "        \
-            "element of type: ",                                              \
-            DataTypeString(from.dtype()),                                     \
-            " using device: ", context->device()->name());                    \
-    }                                                                         \
-    *to = *tensor;                                                            \
-    return Status::OK();                                                      \
-  }
-
-INSTANTIATE_GET_VARIANT_COPY_FN(CPUDevice, TF_CALL_ALL_TYPES, CPU_DENSE_COPY);
-
-#if GOOGLE_CUDA
-#define GPU_DENSE_COPY(T)                                                \
-  case DataTypeToEnum<T>::value: {                                       \
-    functor::DenseUpdate<GPUDevice, T, ASSIGN> copy_functor_;            \
-    copy_functor_(context->eigen_device<GPUDevice>(), tensor->flat<T>(), \
-                  from.flat<T>());                                       \
-    break;                                                               \
-  }
-#define TF_CALL_GPU_AND_ADDITIONAL_TYPES(T) \
-  TF_CALL_GPU_ALL_TYPES(T);                 \
-  TF_CALL_int32(T);                         \
-  TF_CALL_int64(T);
-INSTANTIATE_GET_VARIANT_COPY_FN(GPUDevice, TF_CALL_GPU_AND_ADDITIONAL_TYPES,
-                                GPU_DENSE_COPY);
-#undef TF_CALL_GPU_AND_ADDITIONAL_TYPES
-#undef GPU_DENSE_COPY
-#endif  // GOOGLE_CUDA
-
-#undef CPU_DENSE_COPY
-#undef INSTANTIATE_GET_VARIANT_COPY_FN
-
 template <typename Device>
 class AssignVariableOp<Device, Variant> : public OpKernel {
  public:
@@ -363,17 +306,47 @@ class AssignVariableOp<Device, Variant> : public OpKernel {
                     DataTypeString(variable->tensor()->dtype()), " got ",
                     DataTypeString(DT_VARIANT)));
 
+    AllocatorAttributes attr;
+    attr.set_on_host(true);
+
+    // Copying is unnecessary if we are the last user of the value
+    // tensor, we can just adopt the input tensor's buffer instead.
+    // Note that Variant objects themselves always reside on host.
+    //
+    // We nevertheless want to signal to the runtime that the tensor
+    // should reside in memory of the associated device, as Variant
+    // tensors may be marked as sitting on either CPU or GPU.  This
+    // helps to elide one or more copies.
+    std::unique_ptr<Tensor> input_alias = context->forward_input(
+        1, OpKernelContext::Params::kNoReservation /*output_index*/, DT_VARIANT,
+        value.shape(),
+        std::is_same<Device, CPUDevice>::value ? HOST_MEMORY : DEVICE_MEMORY,
+        attr);
+
     mutex_lock ml(*variable->mu());
     variable->is_initialized = true;
     *variable->tensor() = Tensor(DT_VARIANT, value.shape());
+
+    if (input_alias) {
+      *variable->tensor() = *input_alias;
+      return;
+    }
+
+    // Need to copy, but maybe we can re-use variable's buffer?
+    if (!variable->tensor()->RefCountIsOne() ||
+        !variable->tensor()->shape().IsSameSize(value.shape())) {
+      PersistentTensor unused;
+      Tensor* tmp;
+      OP_REQUIRES_OK(context,
+                     context->allocate_persistent(DT_VARIANT, value.shape(),
+                                                  &unused, &tmp, attr));
+      *variable->tensor() = *tmp;
+    }
+
     const auto elements_in = value.flat<Variant>();
     auto elements_out = variable->tensor()->flat<Variant>();
-    auto copy_fn = std::bind(&VariantCopyFn<Device>, context,
-                             std::placeholders::_1, std::placeholders::_2);
     for (int64 i = 0; i < elements_in.size(); ++i) {
-      OP_REQUIRES_OK(context, VariantDeviceCopy(
-                                  VariantDeviceCopyDirection::DEVICE_TO_DEVICE,
-                                  elements_in(i), &elements_out(i), copy_fn));
+      elements_out(i) = elements_in(i);
     }
   }
 
@@ -503,6 +476,7 @@ class ResourceGatherOp : public OpKernel {
   void Compute(OpKernelContext* c) override {
     Var* v = nullptr;
     OP_REQUIRES_OK(c, LookupResource(c, HandleFromInput(c, 0), &v));
+    core::ScopedUnref su(v);
     // NOTE: We hold the lock for the whole gather operation instead
     // of increasing the reference count of v->tensor() to avoid a
     // situation where a write to the same variable will see a
@@ -531,7 +505,14 @@ class ResourceGatherOp : public OpKernel {
     }
 
     Tensor* out = nullptr;
-    OP_REQUIRES_OK(c, c->allocate_output(0, result_shape, &out));
+    Tensor tmp;
+    if (params.dtype() == DT_VARIANT) {
+      tmp = Tensor(DT_VARIANT, result_shape);
+      c->set_output(0, tmp);
+      out = &tmp;
+    } else {
+      OP_REQUIRES_OK(c, c->allocate_output(0, result_shape, &out));
+    }
     if (N > 0) {
       const int64 gather_dim_size = params.dim_size(0);
       int64 inner_size = 1;
@@ -576,7 +557,24 @@ TF_CALL_QUANTIZED_TYPES(REGISTER_GATHER_CPU);
 #if GOOGLE_CUDA
 #define REGISTER_GATHER_GPU(type) REGISTER_GATHER_ALL_INDICES(GPU, type)
 
-TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_GATHER_GPU);
+TF_CALL_GPU_NUMBER_TYPES(REGISTER_GATHER_GPU);
+
+// Variant objects themselves sit on CPU, even if they contain data
+// pointing to a device.
+REGISTER_KERNEL_BUILDER(Name("ResourceGather")
+                            .Device(DEVICE_GPU)
+                            .HostMemory("resource")
+                            .HostMemory("indices")
+                            .TypeConstraint<Variant>("dtype")
+                            .TypeConstraint<int32>("Tindices"),
+                        ResourceGatherOp<GPUDevice, Variant, int32>)
+REGISTER_KERNEL_BUILDER(Name("ResourceGather")
+                            .Device(DEVICE_GPU)
+                            .HostMemory("resource")
+                            .HostMemory("indices")
+                            .TypeConstraint<Variant>("dtype")
+                            .TypeConstraint<int64>("Tindices"),
+                        ResourceGatherOp<GPUDevice, Variant, int64>)
 
 #endif  // GOOGLE_CUDA
 
@@ -692,6 +690,8 @@ TF_CALL_REAL_NUMBER_TYPES(REGISTER_SCATTER_MINMAX_CPU);
 
 REGISTER_SCATTER_KERNEL(string, CPU, "ResourceScatterUpdate",
                         scatter_op::UpdateOp::ASSIGN);
+REGISTER_SCATTER_KERNEL(Variant, CPU, "ResourceScatterUpdate",
+                        scatter_op::UpdateOp::ASSIGN);
 
 // Registers GPU kernels.
 #if GOOGLE_CUDA
@@ -704,6 +704,23 @@ REGISTER_SCATTER_KERNEL(string, CPU, "ResourceScatterUpdate",
 TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SCATTER_ARITHMETIC_GPU);
 TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SCATTER_MINMAX_GPU);
 
+REGISTER_KERNEL_BUILDER(Name("ResourceScatterUpdate")
+                            .Device(DEVICE_GPU)
+                            .HostMemory("resource")
+                            .HostMemory("indices")
+                            .TypeConstraint<Variant>("dtype")
+                            .TypeConstraint<int32>("Tindices"),
+                        ResourceScatterUpdateOp<GPUDevice, Variant, int32,
+                                                scatter_op::UpdateOp::ASSIGN>)
+REGISTER_KERNEL_BUILDER(Name("ResourceScatterUpdate")
+                            .Device(DEVICE_GPU)
+                            .HostMemory("resource")
+                            .HostMemory("indices")
+                            .TypeConstraint<Variant>("dtype")
+                            .TypeConstraint<int64>("Tindices"),
+                        ResourceScatterUpdateOp<GPUDevice, Variant, int64,
+                                                scatter_op::UpdateOp::ASSIGN>)
+
 #endif  // GOOGLE_CUDA
 
 #undef REGISTER_SCATTER_ARITHMETIC
diff --git a/tensorflow/core/kernels/roll_op.cc b/tensorflow/core/kernels/roll_op.cc
index bcbdbee058b4fdb587f2099c54545b8a6aec8ca9..4b630809c5a85496dc57476c6291729f54abc5a7 100644
--- a/tensorflow/core/kernels/roll_op.cc
+++ b/tensorflow/core/kernels/roll_op.cc
@@ -254,8 +254,11 @@ class RollOp : public OpKernel {
     // total modulo sum of shifts for each dimension
     gtl::InlinedVector<int, 4> shift_mod_sum(num_dims, 0);
     for (int i = 0; i < num_shifts; i++) {
-      const int axis = axis_flat(i);
-      OP_REQUIRES(context, axis < num_dims,
+      int axis = axis_flat(i);
+      if (axis < 0) {
+        axis += num_dims;
+      }
+      OP_REQUIRES(context, 0 <= axis && axis < num_dims,
                   errors::InvalidArgument("axis ", axis, " is out of range"));
       const int ds = std::max<int>(static_cast<int>(input.dim_size(axis)), 1);
       const int sum = shift_mod_sum[axis] + static_cast<int>(shift_flat(i));
diff --git a/tensorflow/core/kernels/roll_op_test.cc b/tensorflow/core/kernels/roll_op_test.cc
index 90b6f8d0f3094224ca694b59c851c14bb424d120..e431226aa634d34a13a67785d0cec0d894d04b67 100644
--- a/tensorflow/core/kernels/roll_op_test.cc
+++ b/tensorflow/core/kernels/roll_op_test.cc
@@ -29,6 +29,7 @@ limitations under the License.
 #include "tensorflow/core/kernels/ops_testutil.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
@@ -372,7 +373,8 @@ TEST_F(RollOpTest, Error_InputMustBeVectorOrHigher) {
   AddInputFromArray<int32>(TensorShape({}), {1});
   AddInputFromArray<int32>(TensorShape({}), {0});
   Status s = RunOpKernel();
-  EXPECT_TRUE(StringPiece(s.ToString()).contains("input must be 1-D or higher"))
+  EXPECT_TRUE(
+      str_util::StrContains(s.ToString(), "input must be 1-D or higher"))
       << s;
 }
 
@@ -384,8 +386,8 @@ TEST_F(RollOpTest, Error_AxisMustBeScalarOrVector) {
   AddInputFromArray<int32>(TensorShape({}), {1});
   AddInputFromArray<int32>(TensorShape({1, 2}), {0, 1});
   Status s = RunOpKernel();
-  EXPECT_TRUE(StringPiece(s.ToString())
-                  .contains("axis must be a scalar or a 1-D vector"))
+  EXPECT_TRUE(str_util::StrContains(s.ToString(),
+                                    "axis must be a scalar or a 1-D vector"))
       << s;
 }
 
@@ -397,8 +399,8 @@ TEST_F(RollOpTest, Error_ShiftMustBeScalarOrVector) {
   AddInputFromArray<int32>(TensorShape({1, 2}), {0, 1});
   AddInputFromArray<int32>(TensorShape({}), {1});
   Status s = RunOpKernel();
-  EXPECT_TRUE(StringPiece(s.ToString())
-                  .contains("shift must be a scalar or a 1-D vector"))
+  EXPECT_TRUE(str_util::StrContains(s.ToString(),
+                                    "shift must be a scalar or a 1-D vector"))
       << s;
 }
 
@@ -410,8 +412,8 @@ TEST_F(RollOpTest, Error_ShiftAndAxisMustBeSameSize) {
   AddInputFromArray<int32>(TensorShape({1}), {1});
   AddInputFromArray<int32>(TensorShape({2}), {0, 1});
   Status s = RunOpKernel();
-  EXPECT_TRUE(StringPiece(s.ToString())
-                  .contains("shift and axis must have the same size"))
+  EXPECT_TRUE(str_util::StrContains(s.ToString(),
+                                    "shift and axis must have the same size"))
       << s;
 }
 
@@ -423,7 +425,7 @@ TEST_F(RollOpTest, Error_AxisOutOfRange) {
   AddInputFromArray<int32>(TensorShape({}), {1});
   AddInputFromArray<int32>(TensorShape({}), {1});
   Status s = RunOpKernel();
-  EXPECT_TRUE(StringPiece(s.ToString()).contains("is out of range")) << s;
+  EXPECT_TRUE(str_util::StrContains(s.ToString(), "is out of range")) << s;
 }
 
 // isd - (inner shift dimension) The inner most dimension to be shifted.
diff --git a/tensorflow/core/kernels/rpc_op.cc b/tensorflow/core/kernels/rpc_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2447ef504044e2289a99d19630112d33b0147a8a
--- /dev/null
+++ b/tensorflow/core/kernels/rpc_op.cc
@@ -0,0 +1,129 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// RpcOp is a TensorFlow op that sends and receives arbitrary messages.
+//
+// See docs in ../ops/rpc_op.cc.
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "third_party/eigen3/Eigen/Core"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/refcount.h"
+#include "tensorflow/core/lib/gtl/stl_util.h"
+#include "tensorflow/core/lib/strings/stringprintf.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/util/rpc/call_container.h"
+#include "tensorflow/core/util/rpc/rpc_factory.h"
+#include "tensorflow/core/util/rpc/rpc_factory_registry.h"
+
+namespace tensorflow {
+
+class RpcOp : public AsyncOpKernel {
+ public:
+  explicit RpcOp(OpKernelConstruction* context) : AsyncOpKernel(context) {
+    OP_REQUIRES_OK(context, context->GetAttr("protocol", &protocol_));
+    OP_REQUIRES(context, !protocol_.empty(),
+                errors::InvalidArgument("protocol must be non-empty."));
+    bool fail_fast;
+    OP_REQUIRES_OK(context, context->GetAttr("fail_fast", &fail_fast));
+    int64 timeout_in_ms;
+    OP_REQUIRES_OK(context, context->GetAttr("timeout_in_ms", &timeout_in_ms));
+
+    RPCFactoryRegistry::RPCFactoryFn* rpc_factory_fn =
+        RPCFactoryRegistry::Global()->Get(protocol_);
+    OP_REQUIRES(context, rpc_factory_fn != nullptr,
+                errors::InvalidArgument("The protocol ", protocol_,
+                                        " was not recognized."));
+
+    rpc_factory_.reset((*rpc_factory_fn)(context, fail_fast, timeout_in_ms));
+  }
+
+  ~RpcOp() override {}
+
+  void ComputeAsync(OpKernelContext* ctx, DoneCallback done) override {
+    const Tensor& address_t = ctx->input(0);
+    const Tensor& method_t = ctx->input(1);
+    const Tensor& request_t = ctx->input(2);
+
+    OP_REQUIRES_ASYNC(
+        ctx, address_t.dims() == 0 || address_t.dims() == 1,
+        errors::InvalidArgument("address must be a scalar or vector."), done);
+    OP_REQUIRES_ASYNC(
+        ctx, method_t.dims() == 0 || method_t.dims() == 1,
+        errors::InvalidArgument("method must be a scalar or vector."), done);
+    OP_REQUIRES_ASYNC(
+        ctx, request_t.dims() == 0 || request_t.dims() == 1,
+        errors::InvalidArgument("request must be a scalar or vector."), done);
+
+    TensorShape output_shape({});
+    for (const Tensor& t : {address_t, method_t, request_t}) {
+      if (t.dims() == 1) {
+        OP_REQUIRES_ASYNC(
+            ctx,
+            output_shape.dims() == 0 ||
+                output_shape.dim_size(0) == t.dim_size(0),
+            errors::InvalidArgument(
+                "Input vector shapes don't match: ", output_shape.DebugString(),
+                " vs. ", t.shape().DebugString()),
+            done);
+        output_shape = t.shape();
+      }
+    }
+
+    Tensor* response_t;
+    OP_REQUIRES_OK_ASYNC(
+        ctx, ctx->allocate_output(0, output_shape, &response_t), done);
+
+    const bool try_rpc = (ctx->num_outputs() > 1);
+
+    Tensor* status_code_t = nullptr;
+    Tensor* status_message_t = nullptr;
+    if (try_rpc) {
+      OP_REQUIRES_OK_ASYNC(
+          ctx, ctx->allocate_output(1, output_shape, &status_code_t), done);
+      OP_REQUIRES_OK_ASYNC(
+          ctx, ctx->allocate_output(2, output_shape, &status_message_t), done);
+    }
+
+    if (request_t.NumElements() == 0) {
+      // Special case, we finished early!
+      done();
+      return;
+    }
+
+    int64 num_elements = output_shape.num_elements();
+
+    rpc_factory_->Call(ctx, num_elements, address_t, method_t, request_t,
+                       try_rpc, response_t, status_code_t, status_message_t,
+                       std::move(done));
+  }
+
+ private:
+  string protocol_;
+  std::unique_ptr<RPCFactory> rpc_factory_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(RpcOp);
+};
+
+REGISTER_KERNEL_BUILDER(Name("Rpc").Device(DEVICE_CPU), RpcOp);
+REGISTER_KERNEL_BUILDER(Name("TryRpc").Device(DEVICE_CPU), RpcOp);
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/scatter_functor.h b/tensorflow/core/kernels/scatter_functor.h
index 52666645bf0bb38df3fd600c602313d7b5925b00..ebaa2bd9c6253abf975c74338125529282dd7850 100644
--- a/tensorflow/core/kernels/scatter_functor.h
+++ b/tensorflow/core/kernels/scatter_functor.h
@@ -20,8 +20,11 @@ limitations under the License.
 
 #include "third_party/eigen3/Eigen/Core"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/variant_op_registry.h"
 #include "tensorflow/core/kernels/bounds_check.h"
+#include "tensorflow/core/kernels/dense_update_functor.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
@@ -203,9 +206,9 @@ struct ScatterFunctorBase {
     const Index N = static_cast<Index>(indices.size());
     const Index limit = static_cast<Index>(params.dimension(0));
     for (Index i = 0; i < N; i++) {
-      // Grab the index and check its validity.  An earlier version of the
-      // code checked it and then grabbed it from memory a second time, which
-      // was a security risk since it could have changed in between.
+      // Grab the index and check its validity.  Do this carefully,
+      // to avoid checking the value and grabbing it again from
+      // memory a second time (a security risk since it may change in between).
       const Index index = ::tensorflow::internal::SubtleMustCopy(indices(i));
       if (!FastBoundsCheck(index, limit)) return i;
       // Copy last Ndim-1 dimensions of updates[i] to params[index]
@@ -216,6 +219,42 @@ struct ScatterFunctorBase {
   }
 };
 
+template <typename Device, typename Index>
+struct ScatterFunctorVariantAssignBase {
+  Index operator()(OpKernelContext* c, const Device& d,
+                   typename TTypes<Variant>::Matrix params,
+                   typename TTypes<Variant>::ConstMatrix updates,
+                   typename TTypes<Index>::ConstFlat indices) {
+    // indices and params sizes were validated in DoCompute().
+    const Index N = static_cast<Index>(indices.size());
+    const Index limit = static_cast<Index>(params.dimension(0));
+    const Index cols = static_cast<Index>(params.dimension(1));
+    DCHECK_EQ(N, updates.dimension(0));
+    DCHECK_EQ(cols, updates.dimension(1));
+    for (Index i = 0; i < N; i++) {
+      // Grab the index and check its validity.  Do this carefully,
+      // to avoid checking the value and grabbing it again from
+      // memory a second time (a security risk since it may change in between).
+      const Index index = ::tensorflow::internal::SubtleMustCopy(indices(i));
+      if (!FastBoundsCheck(index, limit)) return i;
+      // Copy last Ndim-1 dimensions of updates[i] to params[index]
+      for (int j = 0; j < cols; ++j) {
+        const Variant& to_scatter = updates(i, j);
+        params(index, j) = to_scatter;
+      }
+    }
+    return -1;
+  }
+};
+
+template <typename Index>
+struct ScatterFunctor<CPUDevice, Variant, Index, scatter_op::UpdateOp::ASSIGN>
+    : ScatterFunctorVariantAssignBase<CPUDevice, Index> {};
+
+template <typename Index>
+struct ScatterFunctor<GPUDevice, Variant, Index, scatter_op::UpdateOp::ASSIGN>
+    : ScatterFunctorVariantAssignBase<GPUDevice, Index> {};
+
 #ifdef TENSORFLOW_USE_SYCL
 template <typename T, typename Index, scatter_op::UpdateOp op>
 struct ScatterFunctorBase<SYCLDevice, T, Index, op> {
@@ -227,9 +266,9 @@ struct ScatterFunctorBase<SYCLDevice, T, Index, op> {
     const Index N = static_cast<Index>(indices.size());
     const Index limit = static_cast<Index>(params.dimension(0));
     for (Index i = 0; i < N; i++) {
-      // Grab the index and check its validity.  An earlier version of the
-      // code checked it and then grabbed it from memory a second time, which
-      // was a security risk since it could have changed in between.
+      // Grab the index and check its validity.  Do this carefully,
+      // to avoid checking the value and grabbing it again from
+      // memory a second time (a security risk since it may change in between).
       const Index index = ::tensorflow::internal::SubtleMustCopy(indices(i));
       if (!FastBoundsCheck(index, limit)) return i;
       // Copy last Ndim-1 dimensions of updates[i] to params[index]
@@ -252,9 +291,10 @@ struct ScatterFunctorBase<CPUDevice, T, Index, scatter_op::UpdateOp::ASSIGN> {
     const Index limit = static_cast<Index>(params.dimension(0));
     if (!std::is_same<T, string>::value) {
       for (Index i = 0; i < N; i++) {
-        // Grab the index and check its validity.  An earlier version of the
-        // code checked it and then grabbed it from memory a second time, which
-        // was a security risk since it could have changed in between.
+        // Grab the index and check its validity.  Do this carefully,
+        // to avoid checking the value and grabbing it again from
+        // memory a second time (a security risk since it may change in
+        // between).
         const Index index = ::tensorflow::internal::SubtleMustCopy(indices(i));
         if (!FastBoundsCheck(index, limit)) return i;
         memmove(params.data() + index * params.dimension(1),
@@ -263,9 +303,10 @@ struct ScatterFunctorBase<CPUDevice, T, Index, scatter_op::UpdateOp::ASSIGN> {
       }
     } else {
       for (Index i = 0; i < N; i++) {
-        // Grab the index and check its validity.  An earlier version of the
-        // code checked it and then grabbed it from memory a second time, which
-        // was a security risk since it could have changed in between.
+        // Grab the index and check its validity.  Do this carefully,
+        // to avoid checking the value and grabbing it again from
+        // memory a second time (a security risk since it may change in
+        // between).
         const Index index = ::tensorflow::internal::SubtleMustCopy(indices(i));
         if (!FastBoundsCheck(index, limit)) return i;
         // Copy last Ndim-1 dimensions of updates[i] to params[index]
@@ -321,9 +362,9 @@ struct ScatterScalarFunctorBase {
     const Index N = static_cast<Index>(indices.size());
     const Index limit = static_cast<Index>(params.dimension(0));
     for (Index i = 0; i < N; i++) {
-      // Grab the index and check its validity.  An earlier version of the
-      // code checked it and then grabbed it from memory a second time, which
-      // was a security risk since it could have changed in between.
+      // Grab the index and check its validity.  Do this carefully,
+      // to avoid checking the value and grabbing it again from
+      // memory a second time (a security risk since it may change in between).
       const Index index = ::tensorflow::internal::SubtleMustCopy(indices(i));
       if (!FastBoundsCheck(index, limit)) return i;
       // Broadcast update to params[index]
@@ -334,6 +375,41 @@ struct ScatterScalarFunctorBase {
   }
 };
 
+template <typename Device, typename Index>
+struct ScatterScalarFunctorVariantAssignBase {
+  Index operator()(OpKernelContext* c, const Device& d,
+                   typename TTypes<Variant>::Matrix params,
+                   const typename TTypes<Variant>::ConstScalar update,
+                   typename TTypes<Index>::ConstFlat indices) {
+    // indices and params sizes were validated in DoCompute().
+    const Index N = static_cast<Index>(indices.size());
+    const Index limit = static_cast<Index>(params.dimension(0));
+    const Index cols = static_cast<Index>(params.dimension(1));
+    const Variant& to_scatter = update();
+    for (Index i = 0; i < N; i++) {
+      // Grab the index and check its validity.  Do this carefully,
+      // to avoid checking the value and grabbing it again from
+      // memory a second time (a security risk since it may change in between).
+      const Index index = ::tensorflow::internal::SubtleMustCopy(indices(i));
+      if (!FastBoundsCheck(index, limit)) return i;
+      // Broadcast update to params[index]
+      for (Index j = 0; j < cols; ++j) {
+        params(index, j) = to_scatter;
+      }
+    }
+    return -1;
+  }
+};
+
+template <typename Index>
+struct ScatterScalarFunctor<CPUDevice, Variant, Index,
+                            scatter_op::UpdateOp::ASSIGN>
+    : ScatterScalarFunctorVariantAssignBase<CPUDevice, Index> {};
+template <typename Index>
+struct ScatterScalarFunctor<GPUDevice, Variant, Index,
+                            scatter_op::UpdateOp::ASSIGN>
+    : ScatterScalarFunctorVariantAssignBase<GPUDevice, Index> {};
+
 #ifdef TENSORFLOW_USE_SYCL
 template <typename T, typename Index, scatter_op::UpdateOp op>
 struct ScatterScalarFunctorBase<SYCLDevice, T, Index, op> {
@@ -345,9 +421,9 @@ struct ScatterScalarFunctorBase<SYCLDevice, T, Index, op> {
     const Index N = static_cast<Index>(indices.size());
     const Index limit = static_cast<Index>(params.dimension(0));
     for (Index i = 0; i < N; i++) {
-      // Grab the index and check its validity.  An earlier version of the
-      // code checked it and then grabbed it from memory a second time, which
-      // was a security risk since it could have changed in between.
+      // Grab the index and check its validity.  Do this carefully,
+      // to avoid checking the value and grabbing it again from
+      // memory a second time (a security risk since it may change in between).
       const Index index = ::tensorflow::internal::SubtleMustCopy(indices(i));
       if (!FastBoundsCheck(index, limit)) return i;
       // Broadcast update to params[index]
@@ -370,9 +446,9 @@ struct ScatterScalarFunctorBase<CPUDevice, T, Index,
     const Index N = static_cast<Index>(indices.size());
     const Index limit = static_cast<Index>(params.dimension(0));
     for (Index i = 0; i < N; i++) {
-      // Grab the index and check its validity.  An earlier version of the
-      // code checked it and then grabbed it from memory a second time, which
-      // was a security risk since it could have changed in between.
+      // Grab the index and check its validity.  Do this carefully,
+      // to avoid checking the value and grabbing it again from
+      // memory a second time (a security risk since it may change in between).
       const Index index = ::tensorflow::internal::SubtleMustCopy(indices(i));
       if (!FastBoundsCheck(index, limit)) return i;
       // Broadcast update to params[index]
diff --git a/tensorflow/core/kernels/scatter_nd_op_test.cc b/tensorflow/core/kernels/scatter_nd_op_test.cc
index ae81efa31d4193e6bbb173a1bd6866889037b76e..c134a8dd5bcdb06445b063d4083c18e76c5f4265 100644
--- a/tensorflow/core/kernels/scatter_nd_op_test.cc
+++ b/tensorflow/core/kernels/scatter_nd_op_test.cc
@@ -28,6 +28,7 @@ limitations under the License.
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/random/simple_philox.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
@@ -183,9 +184,8 @@ TEST_F(ScatterNdUpdateOpTest, Error_IndexOutOfRange) {
   AddInputFromArray<float>(TensorShape({3, 3}),
                            {100, 101, 102, 777, 778, 779, 10000, 10001, 10002});
   Status s = RunOpKernel();
-  EXPECT_TRUE(
-      StringPiece(s.ToString())
-          .contains("Invalid indices: [2,0] = [99] does not index into [5,3]"))
+  EXPECT_TRUE(str_util::StrContains(
+      s.ToString(), "Invalid indices: [2,0] = [99] does not index into [5,3]"))
       << s;
 }
 
@@ -198,10 +198,10 @@ TEST_F(ScatterNdUpdateOpTest, Error_WrongDimsIndices) {
   AddInputFromArray<float>(TensorShape({3, 3}),
                            {100, 101, 102, 777, 778, 779, 10000, 10001, 10002});
   Status s = RunOpKernel();
-  EXPECT_TRUE(StringPiece(s.ToString())
-                  .contains("The outermost dimension of updates and indices "
-                            "must match. Got indices.shape [1,3,1], "
-                            "updates.shape [3,3]"))
+  EXPECT_TRUE(str_util::StrContains(
+      s.ToString(),
+      "The outermost dimension of updates and indices must match. Got "
+      "indices.shape [1,3,1], updates.shape [3,3]"))
       << s;
 }
 
@@ -216,10 +216,8 @@ TEST_F(ScatterNdUpdateOpTest, Error_MismatchedParamsAndUpdateDimensions) {
       TensorShape({3, 4}),
       {100, 101, 102, 103, 777, 778, 779, 780, 10000, 10001, 10002, 10004});
   Status s = RunOpKernel();
-  EXPECT_TRUE(
-      StringPiece(s.ToString())
-          .contains("Must have updates.shape = indices.shape[:batch_dim]"))
-
+  EXPECT_TRUE(str_util::StrContains(
+      s.ToString(), "Must have updates.shape = indices.shape[:batch_dim]"))
       << s;
 }
 
@@ -233,10 +231,9 @@ TEST_F(ScatterNdUpdateOpTest, Error_MismatchedIndicesAndUpdateDimensions) {
   AddInputFromArray<float>(TensorShape({2, 3}),
                            {100, 101, 102, 10000, 10001, 10002});
   Status s = RunOpKernel();
-  EXPECT_TRUE(
-      StringPiece(s.ToString())
-          .contains(
-              "The outermost dimension of updates and indices must match."))
+  EXPECT_TRUE(str_util::StrContains(
+      s.ToString(),
+      "The outermost dimension of updates and indices must match."))
       << s;
 }
 
diff --git a/tensorflow/core/kernels/scatter_op_test.cc b/tensorflow/core/kernels/scatter_op_test.cc
index 5b3537b94c8304d7427e769eacb15784cec8c295..2ec8c422336551d79ffb800a882ac91db9d5bdf9 100644
--- a/tensorflow/core/kernels/scatter_op_test.cc
+++ b/tensorflow/core/kernels/scatter_op_test.cc
@@ -28,6 +28,7 @@ limitations under the License.
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/random/simple_philox.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
@@ -170,7 +171,7 @@ TEST_F(ScatterUpdateOpTest, Error_IndexOutOfRange) {
                            {100, 101, 102, 777, 778, 779, 10000, 10001, 10002});
   Status s = RunOpKernel();
   EXPECT_TRUE(
-      StringPiece(s.ToString()).contains("indices[2] = 99 is not in [0, 5)"))
+      str_util::StrContains(s.ToString(), "indices[2] = 99 is not in [0, 5)"))
       << s;
 }
 
@@ -183,8 +184,9 @@ TEST_F(ScatterUpdateOpTest, Error_WrongDimsIndices) {
   AddInputFromArray<float>(TensorShape({3, 3}),
                            {100, 101, 102, 777, 778, 779, 10000, 10001, 10002});
   Status s = RunOpKernel();
-  EXPECT_TRUE(StringPiece(s.ToString())
-                  .contains("Must have updates.shape = indices.shape + "
+  EXPECT_TRUE(
+      str_util::StrContains(s.ToString(),
+                            "Must have updates.shape = indices.shape + "
                             "params.shape[1:] or updates.shape = [], got "))
       << s;
 }
@@ -200,8 +202,9 @@ TEST_F(ScatterUpdateOpTest, Error_MismatchedParamsAndUpdateDimensions) {
       TensorShape({3, 4}),
       {100, 101, 102, 103, 777, 778, 779, 780, 10000, 10001, 10002, 10004});
   Status s = RunOpKernel();
-  EXPECT_TRUE(StringPiece(s.ToString())
-                  .contains("Must have updates.shape = indices.shape + "
+  EXPECT_TRUE(
+      str_util::StrContains(s.ToString(),
+                            "Must have updates.shape = indices.shape + "
                             "params.shape[1:] or updates.shape = [], got "))
 
       << s;
@@ -217,8 +220,9 @@ TEST_F(ScatterUpdateOpTest, Error_MismatchedIndicesAndUpdateDimensions) {
   AddInputFromArray<float>(TensorShape({2, 3}),
                            {100, 101, 102, 10000, 10001, 10002});
   Status s = RunOpKernel();
-  EXPECT_TRUE(StringPiece(s.ToString())
-                  .contains("Must have updates.shape = indices.shape + "
+  EXPECT_TRUE(
+      str_util::StrContains(s.ToString(),
+                            "Must have updates.shape = indices.shape + "
                             "params.shape[1:] or updates.shape = [], got "))
       << s;
 }
diff --git a/tensorflow/core/kernels/sdca_internal.cc b/tensorflow/core/kernels/sdca_internal.cc
index 5a389a6548797d4248b4255e6f3eba11d5439ab3..623de2a48249d2de9b225d766a2c55c35decd353 100644
--- a/tensorflow/core/kernels/sdca_internal.cc
+++ b/tensorflow/core/kernels/sdca_internal.cc
@@ -302,6 +302,11 @@ Status Examples::SampleAdaptiveProbabilities(
   return Status::OK();
 }
 
+void Examples::RandomShuffle() {
+  std::iota(sampled_index_.begin(), sampled_index_.end(), 0);
+  std::random_shuffle(sampled_index_.begin(), sampled_index_.end());
+}
+
 // TODO(sibyl-Aix6ihai): Refactor/shorten this function.
 Status Examples::Initialize(OpKernelContext* const context,
                             const ModelWeights& weights,
diff --git a/tensorflow/core/kernels/sdca_internal.h b/tensorflow/core/kernels/sdca_internal.h
index 1665b1210ec568acd4871292df642286e307de2b..bfdb3febdc41c4f4b058fc9e19c93f48bad9409d 100644
--- a/tensorflow/core/kernels/sdca_internal.h
+++ b/tensorflow/core/kernels/sdca_internal.h
@@ -322,10 +322,7 @@ class Examples {
     return examples_.at(example_index);
   }
 
-  int sampled_index(const int id, const bool adaptive) const {
-    if (adaptive) return sampled_index_[id];
-    return id;
-  }
+  int sampled_index(const int id) const { return sampled_index_[id]; }
 
   // Adaptive SDCA in the current implementation only works for
   // binary classification, where the input argument for num_weight_vectors
@@ -337,6 +334,8 @@ class Examples {
       const std::unique_ptr<DualLossUpdater>& loss_updater,
       const int num_weight_vectors);
 
+  void RandomShuffle();
+
   int num_examples() const { return examples_.size(); }
 
   int num_features() const { return num_features_; }
diff --git a/tensorflow/core/kernels/sdca_ops.cc b/tensorflow/core/kernels/sdca_ops.cc
index 5b63057f3f59d63ae8c7e696595c492fdbd31d6f..55e68b348b9fa499df5dd0052dfe1378bf4c1739 100644
--- a/tensorflow/core/kernels/sdca_ops.cc
+++ b/tensorflow/core/kernels/sdca_ops.cc
@@ -153,8 +153,9 @@ void DoCompute(const ComputeOptions& options, OpKernelContext* const context) {
                        options.num_loss_partitions, options.regularizations,
                        model_weights, example_state_data, options.loss_updater,
                        /*num_weight_vectors =*/1));
+  } else {
+    examples.RandomShuffle();
   }
-
   mutex mu;
   Status train_step_status GUARDED_BY(mu);
   std::atomic<std::int64_t> atomic_index(-1);
@@ -162,8 +163,7 @@ void DoCompute(const ComputeOptions& options, OpKernelContext* const context) {
     // The static_cast here is safe since begin and end can be at most
     // num_examples which is an int.
     for (int id = static_cast<int>(begin); id < end; ++id) {
-      const int64 example_index =
-          examples.sampled_index(++atomic_index, options.adaptive);
+      const int64 example_index = examples.sampled_index(++atomic_index);
       const Example& example = examples.example(example_index);
       const float dual = example_state_data(example_index, 0);
       const float example_weight = example.example_weight();
diff --git a/tensorflow/core/kernels/segment_reduction_ops.h b/tensorflow/core/kernels/segment_reduction_ops.h
index d0703d7576932c19933844ba43c6c00f357d1ba1..6d35ff2de6df5f0d3c6a9d06e4528cd355742eb7 100644
--- a/tensorflow/core/kernels/segment_reduction_ops.h
+++ b/tensorflow/core/kernels/segment_reduction_ops.h
@@ -17,6 +17,21 @@ limitations under the License.
 #define THIRD_PARTY_TENSORFLOW_CORE_KERNELS_SEGMENT_REDUCTION_OPS_H_
 
 
+// This file requires the following include because it uses CudaAtomicMax:
+// #include "tensorflow/core/util/cuda_kernel_helper.h"
+
+// Unfortunately we can't add the #include, since it breaks compilation for
+// non-GPU targets. This only breaks in clang, because it's more strict for
+// template code and CudaAtomicMax is used in template context.
+
+
+// This file requires the following include because it uses CudaAtomicMax:
+// #include "tensorflow/core/util/cuda_kernel_helper.h"
+
+// Unfortunately we can't add the #include, since it breaks compilation for
+// non-GPU targets. This only breaks in clang, because it's more strict for
+// template code and CudaAtomicMax is used in template context.
+
 // This file requires the following include because it uses CudaAtomicMax:
 // #include "tensorflow/core/util/cuda_kernel_helper.h"
 
diff --git a/tensorflow/core/kernels/shape_op_test.cc b/tensorflow/core/kernels/shape_op_test.cc
index a545fb146c94342869441dee3770e7b677fa5142..9cd590ae615a67de2d683a77bb2bc562b46b362f 100644
--- a/tensorflow/core/kernels/shape_op_test.cc
+++ b/tensorflow/core/kernels/shape_op_test.cc
@@ -26,6 +26,7 @@ limitations under the License.
 #include "tensorflow/core/framework/variant_op_registry.h"
 #include "tensorflow/core/kernels/ops_testutil.h"
 #include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/types.h"
@@ -62,8 +63,8 @@ REGISTER_UNARY_VARIANT_DECODE_FUNCTION(KnownVecSize, "KNOWN VECTOR SIZE TYPE");
 REGISTER_UNARY_VARIANT_SHAPE_FUNCTION(KnownVecSize, "KNOWN VECTOR SIZE TYPE",
                                       GetShapeFromKnownVecSize);
 
-static void ExpectHasError(const Status& s, const string& substr) {
-  EXPECT_TRUE(StringPiece(s.ToString()).contains(substr))
+static void ExpectHasError(const Status& s, StringPiece substr) {
+  EXPECT_TRUE(str_util::StrContains(s.ToString(), substr))
       << ">>" << s << "<<, expected substring >>" << substr << "<<";
 }
 
diff --git a/tensorflow/core/kernels/snapshot_op_gpu.cu.cc b/tensorflow/core/kernels/snapshot_op_gpu.cu.cc
index f1c0ed2eae0009d54053e7b3fb9d802afb3da1b6..e4e3bd5220382b50eca263d50d91d503b3a1c526 100644
--- a/tensorflow/core/kernels/snapshot_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/snapshot_op_gpu.cu.cc
@@ -25,8 +25,7 @@ namespace tensorflow {
 typedef Eigen::GpuDevice GPUDevice;
 
 // Definition of the GPU implementations declared in softsign_op.cc.
-#define DEFINE_GPU_KERNELS(T)                      \
-  template struct functor::Snapshot<GPUDevice, T>;
+#define DEFINE_GPU_KERNELS(T) template struct functor::Snapshot<GPUDevice, T>;
 
 TF_CALL_POD_TYPES(DEFINE_GPU_KERNELS);
 
diff --git a/tensorflow/core/kernels/softmax_op.cc b/tensorflow/core/kernels/softmax_op.cc
index e1712ac239d6be2d51b0c0598a799959a8b53a94..e72608945b0b4494123afb5763fe882f54717a00 100644
--- a/tensorflow/core/kernels/softmax_op.cc
+++ b/tensorflow/core/kernels/softmax_op.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 // See docs in ../ops/nn_ops.cc.
 
+#include "tensorflow/core/lib/strings/str_util.h"
 #define EIGEN_USE_THREADS
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
@@ -55,7 +56,7 @@ template <typename Device, typename T>
 class SoftmaxOp : public OpKernel {
  public:
   explicit SoftmaxOp(OpKernelConstruction* context) : OpKernel(context) {
-    log_ = StringPiece(type_string()).starts_with("Log");
+    log_ = str_util::StartsWith(type_string(), "Log");
   }
 
   void Compute(OpKernelContext* context) override {
diff --git a/tensorflow/core/kernels/softmax_op_gpu.cu.cc b/tensorflow/core/kernels/softmax_op_gpu.cu.cc
index 130d693dbdf132515a7ffcfc0bc6c9631a5aee21..b63dcbb163b1b7c1bee68571e2b43bb0a6f358a8 100644
--- a/tensorflow/core/kernels/softmax_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/softmax_op_gpu.cu.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include "tensorflow/core/lib/strings/str_util.h"
 #if GOOGLE_CUDA
 
 #define EIGEN_USE_GPU
@@ -128,7 +129,7 @@ template <typename T>
 class SoftmaxOpGPU : public OpKernel {
  public:
   explicit SoftmaxOpGPU(OpKernelConstruction* context) : OpKernel(context) {
-    log_ = StringPiece(type_string()).starts_with("Log");
+    log_ = str_util::StartsWith(type_string(), "Log");
   }
 
   void Compute(OpKernelContext* context) override {
diff --git a/tensorflow/core/kernels/sparse_cross_op.cc b/tensorflow/core/kernels/sparse_cross_op.cc
index 4b5df7aff0e9b345fb94f9f06a9906972448c048..4ebb7fbcc70e10a0d1b7a5dd063c2524b01b6dfc 100644
--- a/tensorflow/core/kernels/sparse_cross_op.cc
+++ b/tensorflow/core/kernels/sparse_cross_op.cc
@@ -419,7 +419,7 @@ class SparseCrossOp : public OpKernel {
           context, TensorShapeUtils::IsMatrix(dense_list_in[i].shape()),
           errors::InvalidArgument(
               "Dense inputs should be a matrix but received shape ",
-              indices_list_in[i].shape().DebugString(), " at position ", i));
+              dense_list_in[i].shape().DebugString(), " at position ", i));
       OP_REQUIRES(context, dense_list_in[i].dim_size(0) == batch_size,
                   errors::InvalidArgument("Expected batch size ", batch_size,
                                           " got ", dense_list_in[i].dim_size(0),
diff --git a/tensorflow/core/kernels/sparse_dense_binary_op_shared_test.cc b/tensorflow/core/kernels/sparse_dense_binary_op_shared_test.cc
index fe198af7e6c131ab19daf877063a2a6838d1f2c7..29577ebb4ed1f678768cf8af92dff48a5a901e67 100644
--- a/tensorflow/core/kernels/sparse_dense_binary_op_shared_test.cc
+++ b/tensorflow/core/kernels/sparse_dense_binary_op_shared_test.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/graph/node_builder.h"
 #include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
 
@@ -32,7 +33,7 @@ namespace tensorflow {
 namespace {
 
 static void ExpectHasSubstr(StringPiece s, StringPiece expected) {
-  EXPECT_TRUE(StringPiece(s).contains(expected))
+  EXPECT_TRUE(str_util::StrContains(s, expected))
       << "'" << s << "' does not contain '" << expected << "'";
 }
 
diff --git a/tensorflow/core/kernels/stage_op.cc b/tensorflow/core/kernels/stage_op.cc
index 03fc4467a1dcf9d70c90c19809690934b0a7c2f4..73a02a34cf231799e6a813f042757d70b4e9414a 100644
--- a/tensorflow/core/kernels/stage_op.cc
+++ b/tensorflow/core/kernels/stage_op.cc
@@ -32,53 +32,8 @@ namespace {
 
 class Buffer : public ResourceBase {
  public:
-  // public types
   using Tuple = std::vector<Tensor>;
 
- private:
-  // private variables
-  std::size_t capacity_;
-  std::size_t memory_limit_;
-  std::size_t current_bytes_;
-  std::mutex mu_;
-  std::condition_variable non_empty_cond_var_;
-  std::condition_variable full_cond_var_;
-  std::deque<Tuple> buf_;
-
- private:
-  // private methods
-
-  // If the buffer is configured for bounded capacity, notify
-  // waiting inserters that space is now available
-  void notify_inserters_if_bounded(std::unique_lock<std::mutex>* lock) {
-    if (IsBounded()) {
-      lock->unlock();
-      // Notify all inserters. The removal of an element
-      // may make memory available for many inserters
-      // to insert new elements
-      full_cond_var_.notify_all();
-    }
-  }
-
-  // Are there a limit number of elements or a memory limit
-  // configued on this buffer?
-  bool IsBounded() const { return capacity_ > 0 || memory_limit_ > 0; }
-
-  bool IsCapacityFull() const { return buf_.size() >= capacity_; }
-
-  bool WouldExceedMemoryLimit(std::size_t bytes) const {
-    return bytes + current_bytes_ > memory_limit_;
-  }
-
-  std::size_t GetTupleBytes(const Tuple& tuple) {
-    return std::accumulate(tuple.begin(), tuple.end(), 0,
-                           [](const std::size_t& lhs, const Tensor& rhs) {
-                             return lhs + rhs.TotalBytes();
-                           });
-  }
-
- public:
-  // public methods
   explicit Buffer(std::size_t capacity, std::size_t memory_limit)
       : capacity_(capacity), memory_limit_(memory_limit), current_bytes_(0) {}
 
@@ -181,6 +136,44 @@ class Buffer : public ResourceBase {
     std::unique_lock<std::mutex> lock(mu_);
     return strings::StrCat("Staging size: ", buf_.size());
   }
+
+ private:
+  // If the buffer is configured for bounded capacity, notify
+  // waiting inserters that space is now available
+  void notify_inserters_if_bounded(std::unique_lock<std::mutex>* lock) {
+    if (IsBounded()) {
+      lock->unlock();
+      // Notify all inserters. The removal of an element
+      // may make memory available for many inserters
+      // to insert new elements
+      full_cond_var_.notify_all();
+    }
+  }
+
+  // Are there a limit number of elements or a memory limit
+  // configued on this buffer?
+  bool IsBounded() const { return capacity_ > 0 || memory_limit_ > 0; }
+
+  bool IsCapacityFull() const { return buf_.size() >= capacity_; }
+
+  bool WouldExceedMemoryLimit(std::size_t bytes) const {
+    return bytes + current_bytes_ > memory_limit_;
+  }
+
+  std::size_t GetTupleBytes(const Tuple& tuple) {
+    return std::accumulate(tuple.begin(), tuple.end(), 0,
+                           [](const std::size_t& lhs, const Tensor& rhs) {
+                             return lhs + rhs.TotalBytes();
+                           });
+  }
+
+  std::size_t capacity_;
+  std::size_t memory_limit_;
+  std::size_t current_bytes_;
+  std::mutex mu_;
+  std::condition_variable non_empty_cond_var_;
+  std::condition_variable full_cond_var_;
+  std::deque<Tuple> buf_;
 };
 
 Status GetBuffer(OpKernelContext* ctx, const NodeDef& ndef, Buffer** buf) {
diff --git a/tensorflow/core/kernels/summary_op_test.cc b/tensorflow/core/kernels/summary_op_test.cc
index 3c46abb8ab1ab2aafeff140ae07f987a3eba7db5..9dcabcc5843a9db13d93723e4364959cc74ea820 100644
--- a/tensorflow/core/kernels/summary_op_test.cc
+++ b/tensorflow/core/kernels/summary_op_test.cc
@@ -28,6 +28,7 @@ limitations under the License.
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/histogram/histogram.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/logging.h"
@@ -122,7 +123,7 @@ TEST_F(SummaryScalarOpTest, Error_MismatchedSize) {
   AddInputFromArray<string>(TensorShape({2}), {"tag1", "tag2"});
   AddInputFromArray<float>(TensorShape({3}), {1.0f, -0.73f, 10000.0f});
   Status s = RunOpKernel();
-  EXPECT_TRUE(StringPiece(s.ToString()).contains("not the same shape")) << s;
+  EXPECT_TRUE(str_util::StrContains(s.ToString(), "not the same shape")) << s;
 }
 
 TEST_F(SummaryScalarOpTest, Error_WrongDimsTags) {
@@ -133,7 +134,7 @@ TEST_F(SummaryScalarOpTest, Error_WrongDimsTags) {
   AddInputFromArray<float>(TensorShape({2}), {1.0f, -0.73f});
   Status s = RunOpKernel();
   EXPECT_TRUE(
-      StringPiece(s.ToString()).contains("tags and values not the same shape"))
+      str_util::StrContains(s.ToString(), "tags and values not the same shape"))
       << s;
 }
 
@@ -145,7 +146,7 @@ TEST_F(SummaryScalarOpTest, Error_WrongDimsValues) {
   AddInputFromArray<float>(TensorShape({2, 1}), {1.0f, -0.73f});
   Status s = RunOpKernel();
   EXPECT_TRUE(
-      StringPiece(s.ToString()).contains("tags and values not the same shape"))
+      str_util::StrContains(s.ToString(), "tags and values not the same shape"))
       << s;
 }
 
@@ -256,7 +257,7 @@ TEST_F(SummaryHistoOpTest, Error_WrongDimsTags) {
   AddInputFromArray<string>(TensorShape({2, 1}), {"tag1", "tag2"});
   AddInputFromArray<float>(TensorShape({2}), {1.0f, -0.73f});
   Status s = RunOpKernel();
-  EXPECT_TRUE(StringPiece(s.ToString()).contains("tags must be scalar")) << s;
+  EXPECT_TRUE(str_util::StrContains(s.ToString(), "tags must be scalar")) << s;
 }
 
 TEST_F(SummaryHistoOpTest, Error_TooManyTagValues) {
@@ -266,7 +267,7 @@ TEST_F(SummaryHistoOpTest, Error_TooManyTagValues) {
   AddInputFromArray<string>(TensorShape({2}), {"tag1", "tag2"});
   AddInputFromArray<float>(TensorShape({2, 1}), {1.0f, -0.73f});
   Status s = RunOpKernel();
-  EXPECT_TRUE(StringPiece(s.ToString()).contains("tags must be scalar")) << s;
+  EXPECT_TRUE(str_util::StrContains(s.ToString(), "tags must be scalar")) << s;
 }
 
 // --------------------------------------------------------------------------
@@ -365,7 +366,7 @@ TEST_F(SummaryMergeOpTest, Error_MismatchedSize) {
   AddInputFromArray<string>(TensorShape({2}),
                             {s1.SerializeAsString(), s2.SerializeAsString()});
   Status s = RunOpKernel();
-  EXPECT_TRUE(StringPiece(s.ToString()).contains("Duplicate tag")) << s;
+  EXPECT_TRUE(str_util::StrContains(s.ToString(), "Duplicate tag")) << s;
 }
 
 }  // namespace
diff --git a/tensorflow/core/kernels/training_op_helpers.h b/tensorflow/core/kernels/training_op_helpers.h
index f6e2a5ae251f57eed99b5c968d806310be11440e..7e56e15450aba23e6625b27da34a29b1ad2ecce2 100644
--- a/tensorflow/core/kernels/training_op_helpers.h
+++ b/tensorflow/core/kernels/training_op_helpers.h
@@ -17,6 +17,7 @@ limitations under the License.
 #define TENSORFLOW_KERNELS_TRAINING_OP_HELPERS_H_
 
 #include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/variant_op_registry.h"
 #include "tensorflow/core/kernels/dense_update_functor.h"
 #include "tensorflow/core/kernels/variable_ops.h"
 
@@ -40,14 +41,27 @@ Status PrepareToUpdateVariable(OpKernelContext* ctx, Tensor* tensor) {
     // updating.
     PersistentTensor unused;
     Tensor* tmp;
-    AllocatorAttributes attr;
-    attr.set_gpu_compatible(true);
-    attr.set_nic_compatible(true);
-    TF_RETURN_IF_ERROR(ctx->allocate_persistent(
-        tensor->dtype(), tensor->shape(), &unused, &tmp, attr));
-    functor::DenseUpdate<Device, T, ASSIGN> copy_functor;
-    copy_functor(ctx->eigen_device<Device>(), tmp->flat<T>(),
-                 const_cast<const Tensor*>(tensor)->flat<T>());
+    if (std::is_same<T, Variant>::value) {
+      AllocatorAttributes attr;
+      attr.set_on_host(true);
+      TF_RETURN_IF_ERROR(ctx->allocate_persistent(
+          tensor->dtype(), tensor->shape(), &unused, &tmp, attr));
+
+      const auto elements_in = tensor->flat<Variant>();
+      auto elements_out = tmp->flat<Variant>();
+      for (int64 i = 0; i < elements_in.size(); ++i) {
+        elements_out(i) = elements_in(i);
+      }
+    } else {
+      AllocatorAttributes attr;
+      attr.set_gpu_compatible(true);
+      attr.set_nic_compatible(true);
+      TF_RETURN_IF_ERROR(ctx->allocate_persistent(
+          tensor->dtype(), tensor->shape(), &unused, &tmp, attr));
+      functor::DenseUpdate<Device, T, ASSIGN> copy_functor;
+      copy_functor(ctx->eigen_device<Device>(), tmp->flat<T>(),
+                   const_cast<const Tensor*>(tensor)->flat<T>());
+    }
     *tensor = *tmp;
   }
   return Status::OK();
@@ -64,24 +78,21 @@ Status GetInputTensorFromVariable(OpKernelContext* ctx, int input,
                                   bool lock_held, bool sparse, Tensor* out) {
   if (ctx->input_dtype(input) == DT_RESOURCE) {
     Var* var;
-    if (LookupResource(ctx, HandleFromInput(ctx, input), &var).ok()) {
-      core::ScopedUnref unref_var(var);
-      if (lock_held) {
+    TF_RETURN_IF_ERROR(LookupResource(ctx, HandleFromInput(ctx, input), &var));
+    core::ScopedUnref unref_var(var);
+    if (lock_held) {
+      TF_RETURN_IF_ERROR(
+          PrepareToUpdateVariable<Device, T>(ctx, var->tensor()));
+      *out = *var->tensor();
+    } else {
+      mutex_lock ml(*var->mu());
+      if (!sparse) {
         TF_RETURN_IF_ERROR(
             PrepareToUpdateVariable<Device, T>(ctx, var->tensor()));
-        *out = *var->tensor();
-      } else {
-        mutex_lock ml(*var->mu());
-        if (!sparse) {
-          TF_RETURN_IF_ERROR(
-              PrepareToUpdateVariable<Device, T>(ctx, var->tensor()));
-        }
-        *out = *var->tensor();
       }
-      return Status::OK();
-    } else {
-      return errors::Internal("Invalid variable reference.");
+      *out = *var->tensor();
     }
+    return Status::OK();
   }
   *out = ctx->mutable_input(input, lock_held);
   return Status::OK();
diff --git a/tensorflow/core/kernels/xent_op.cc b/tensorflow/core/kernels/xent_op.cc
index ebd19c3d35125e8d1884ad39ea490b632f479ef0..9a3612bd72cdc2bc1c3c471beed6616816072a71 100644
--- a/tensorflow/core/kernels/xent_op.cc
+++ b/tensorflow/core/kernels/xent_op.cc
@@ -52,8 +52,8 @@ class SoftmaxXentWithLogitsOp : public OpKernel {
       OP_REQUIRES(context, bcast.IsValid(),
                   errors::InvalidArgument(
                       "logits and labels must be broadcastable: logits_size=",
-                      logits_in.shape().DebugString(), " labels_size=",
-                      labels_in.shape().DebugString()));
+                      logits_in.shape().DebugString(),
+                      " labels_size=", labels_in.shape().DebugString()));
       shape_in = BCast::ToShape(bcast.output_shape());
     }
     OP_REQUIRES(context, TensorShapeUtils::IsMatrix(shape_in),
diff --git a/tensorflow/core/lib/core/stringpiece.cc b/tensorflow/core/lib/core/stringpiece.cc
index 5bd79778a66f65fbe3963b664a639a7c1b028237..0b006fa2b46e57baac3a5b5ba24c5ddfb414ef81 100644
--- a/tensorflow/core/lib/core/stringpiece.cc
+++ b/tensorflow/core/lib/core/stringpiece.cc
@@ -55,6 +55,4 @@ StringPiece StringPiece::substr(size_t pos, size_t n) const {
   return StringPiece(data_ + pos, n);
 }
 
-const StringPiece::size_type StringPiece::npos = size_type(-1);
-
 }  // namespace tensorflow
diff --git a/tensorflow/core/lib/core/stringpiece.h b/tensorflow/core/lib/core/stringpiece.h
index 79409cce4b492939c7a0758e1dc0c0f0d06cace8..835b938cbfdf35ae5f1a040ad793544e3a7eb101 100644
--- a/tensorflow/core/lib/core/stringpiece.h
+++ b/tensorflow/core/lib/core/stringpiece.h
@@ -65,7 +65,7 @@ class StringPiece {
   iterator begin() const { return data_; }
   iterator end() const { return data_ + size_; }
 
-  static const size_t npos;
+  static const size_t npos = size_type(-1);
 
   // Return the ith byte in the referenced data.
   // REQUIRES: n < size()
diff --git a/tensorflow/core/lib/db/BUILD b/tensorflow/core/lib/db/BUILD
index 9ff87e8d66d2575966c703a896ac9ff0bc51661a..ce09c2009ac81b5cd2736800852a148bfefff6a9 100644
--- a/tensorflow/core/lib/db/BUILD
+++ b/tensorflow/core/lib/db/BUILD
@@ -42,9 +42,3 @@ tf_cc_test(
         "//tensorflow/core:test_main",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(["*"]),
-    visibility = ["//tensorflow:__pkg__"],
-)
diff --git a/tensorflow/core/lib/gtl/flatmap_test.cc b/tensorflow/core/lib/gtl/flatmap_test.cc
index bb65e5357a845ebc132a8518fc28fec94b669bde..0901eba9265a48351d108a73a620dd753f4ec92f 100644
--- a/tensorflow/core/lib/gtl/flatmap_test.cc
+++ b/tensorflow/core/lib/gtl/flatmap_test.cc
@@ -321,7 +321,7 @@ TEST(FlatMap, Copy) {
     NumMap copy2;
     copy2 = src;
     EXPECT_EQ(Contents(src), Contents(copy2));
-    copy2 = copy2;  // Self-assignment
+    copy2 = *&copy2;  // Self-assignment, avoiding -Wself-assign.
     EXPECT_EQ(Contents(src), Contents(copy2));
   }
 }
diff --git a/tensorflow/core/lib/gtl/flatset_test.cc b/tensorflow/core/lib/gtl/flatset_test.cc
index 09fbbb1fb6c6670d24345c0043c56df0ed2c7bb0..010b4bb5df3337ad814caa3a8767796074be1d18 100644
--- a/tensorflow/core/lib/gtl/flatset_test.cc
+++ b/tensorflow/core/lib/gtl/flatset_test.cc
@@ -252,7 +252,7 @@ TEST(FlatSet, Copy) {
     NumSet copy2;
     copy2 = src;
     EXPECT_EQ(Contents(src), Contents(copy2));
-    copy2 = copy2;  // Self-assignment
+    copy2 = *&copy2;  // Self-assignment, avoiding -Wself-assign.
     EXPECT_EQ(Contents(src), Contents(copy2));
   }
 }
diff --git a/tensorflow/core/lib/io/format.cc b/tensorflow/core/lib/io/format.cc
index 64852943ad560edeede640fbd882e2984a4afee5..0c24c660a246eacde9fe0a0368a66eb511b1786d 100644
--- a/tensorflow/core/lib/io/format.cc
+++ b/tensorflow/core/lib/io/format.cc
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include <limits>
+
 #include "tensorflow/core/lib/io/format.h"
 
 #include "tensorflow/core/lib/core/coding.h"
@@ -84,6 +86,11 @@ Status ReadBlock(RandomAccessFile* file, const BlockHandle& handle,
   // Read the block contents as well as the type/crc footer.
   // See table_builder.cc for the code that built this structure.
   size_t n = static_cast<size_t>(handle.size());
+
+  if (kBlockTrailerSize > std::numeric_limits<size_t>::max() - n) {
+    return errors::DataLoss("handle.size() too big");
+  }
+
   char* buf = new char[n + kBlockTrailerSize];
   StringPiece contents;
   Status s = file->Read(handle.offset(), n + kBlockTrailerSize, &contents, buf);
diff --git a/tensorflow/core/lib/io/inputbuffer_test.cc b/tensorflow/core/lib/io/inputbuffer_test.cc
index 6be1f819c2081dd4cc73853276d1cd94399614ff..3608008b30181ca5025644437740f1cd0fe1a156 100644
--- a/tensorflow/core/lib/io/inputbuffer_test.cc
+++ b/tensorflow/core/lib/io/inputbuffer_test.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/test.h"
@@ -287,7 +288,7 @@ TEST(InputBuffer, Seek) {
     EXPECT_TRUE(errors::IsOutOfRange(in.ReadNBytes(1, &read)));
 
     EXPECT_TRUE(
-        StringPiece(in.Seek(-1).ToString()).contains("negative position"));
+        str_util::StrContains(in.Seek(-1).ToString(), "negative position"));
   }
 }
 
diff --git a/tensorflow/core/lib/io/recordio_test.cc b/tensorflow/core/lib/io/recordio_test.cc
index b7e51256a22b0d84e734e2a036a184b3adc3e547..63235761d92a90b1dc48029abe5aaceaf22f1f2e 100644
--- a/tensorflow/core/lib/io/recordio_test.cc
+++ b/tensorflow/core/lib/io/recordio_test.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include "tensorflow/core/lib/io/record_reader.h"
 #include "tensorflow/core/lib/io/record_writer.h"
 #include "tensorflow/core/lib/random/simple_philox.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/test.h"
 
@@ -218,7 +219,7 @@ TEST_F(RecordioTest, RandomRead) {
 
 // Tests of all the error paths in log_reader.cc follow:
 static void AssertHasSubstr(StringPiece s, StringPiece expected) {
-  EXPECT_TRUE(StringPiece(s).contains(expected))
+  EXPECT_TRUE(str_util::StrContains(s, expected))
       << s << " does not contain " << expected;
 }
 
diff --git a/tensorflow/core/lib/strings/numbers.cc b/tensorflow/core/lib/strings/numbers.cc
index 516decc3c01742b9a9fbbef1239037b3a4005b2f..8f34baa7def340f320793660f10b6e0958dade70 100644
--- a/tensorflow/core/lib/strings/numbers.cc
+++ b/tensorflow/core/lib/strings/numbers.cc
@@ -23,6 +23,7 @@ limitations under the License.
 #include <locale>
 #include <unordered_map>
 
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/macros.h"
@@ -203,7 +204,7 @@ bool safe_strto64(StringPiece str, int64* value) {
 
   int64 vlimit = kint64max;
   int sign = 1;
-  if (str.Consume("-")) {
+  if (str_util::ConsumePrefix(&str, "-")) {
     sign = -1;
     // Different limit for positive and negative integers.
     vlimit = kint64min;
@@ -265,7 +266,7 @@ bool safe_strto32(StringPiece str, int32* value) {
 
   int64 vmax = kint32max;
   int sign = 1;
-  if (str.Consume("-")) {
+  if (str_util::ConsumePrefix(&str, "-")) {
     sign = -1;
     // Different max for positive and negative integers.
     ++vmax;
diff --git a/tensorflow/core/lib/strings/ordered_code_test.cc b/tensorflow/core/lib/strings/ordered_code_test.cc
index fee8a6f93e9a56c1d3a152683a27ad4fec8950ef..ede9f4d390180501bd65c3cbfe301da86d7530a6 100644
--- a/tensorflow/core/lib/strings/ordered_code_test.cc
+++ b/tensorflow/core/lib/strings/ordered_code_test.cc
@@ -22,6 +22,7 @@ limitations under the License.
 
 #include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/random/simple_philox.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
@@ -128,7 +129,7 @@ void TestWriteAppends(T first, U second) {
   string encoded_first_only = encoded;
   OCWriteToString<U>(&encoded, second);
   EXPECT_NE(encoded, encoded_first_only);
-  EXPECT_TRUE(StringPiece(encoded).starts_with(encoded_first_only));
+  EXPECT_TRUE(str_util::StartsWith(encoded, encoded_first_only));
 }
 
 template <typename T>
diff --git a/tensorflow/core/lib/strings/scanner.h b/tensorflow/core/lib/strings/scanner.h
index d3b63357ee71394250a2e13f7895363b8d82de29..c82e771368c1c2b8a945e070cfcd45f72f91f0a3 100644
--- a/tensorflow/core/lib/strings/scanner.h
+++ b/tensorflow/core/lib/strings/scanner.h
@@ -18,6 +18,7 @@ limitations under the License.
 
 #include <string>
 #include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/macros.h"
 
 namespace tensorflow {
@@ -75,14 +76,14 @@ class Scanner {
   // Consume the next s.size() characters of the input, if they match <s>. If
   // they don't match <s>, this is a no-op.
   Scanner& ZeroOrOneLiteral(StringPiece s) {
-    cur_.Consume(s);
+    str_util::ConsumePrefix(&cur_, s);
     return *this;
   }
 
   // Consume the next s.size() characters of the input, if they match <s>. If
   // they don't match <s>, then GetResult will ultimately return false.
   Scanner& OneLiteral(StringPiece s) {
-    if (!cur_.Consume(s)) {
+    if (!str_util::ConsumePrefix(&cur_, s)) {
       error_ = true;
     }
     return *this;
diff --git a/tensorflow/core/lib/wav/wav_io_test.cc b/tensorflow/core/lib/wav/wav_io_test.cc
index d8a83fc464b33d274aa4f8174132980275fd8598..9e41da6a20dc5c1491786a5391e12ca7aa2e722c 100644
--- a/tensorflow/core/lib/wav/wav_io_test.cc
+++ b/tensorflow/core/lib/wav/wav_io_test.cc
@@ -19,6 +19,7 @@ limitations under the License.
 
 #include "tensorflow/core/lib/core/error_codes.pb.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/types.h"
 
@@ -203,7 +204,7 @@ TEST(WavIO, ChunkSizeOverflow) {
       wav_data_string, &decoded_audio, &decoded_sample_count,
       &decoded_channel_count, &decoded_sample_rate);
   EXPECT_FALSE(decode_status.ok());
-  EXPECT_TRUE(StringPiece(decode_status.error_message()).contains("too large"))
+  EXPECT_TRUE(str_util::StrContains(decode_status.error_message(), "too large"))
       << decode_status.error_message();
 }
 
diff --git a/tensorflow/core/ops/array_ops.cc b/tensorflow/core/ops/array_ops.cc
index f97f1645a6f60f88af6df1017a90522297c77f3a..2a8b9f9beea924e333de4f8459c5cc9f476bd891 100644
--- a/tensorflow/core/ops/array_ops.cc
+++ b/tensorflow/core/ops/array_ops.cc
@@ -27,6 +27,7 @@ namespace tensorflow {
 using shape_inference::DimensionHandle;
 using shape_inference::InferenceContext;
 using shape_inference::ShapeHandle;
+using shape_inference::UnchangedShape;
 
 namespace {
 
@@ -178,46 +179,88 @@ Status SetOutputShapeForReshape(InferenceContext* c) {
     c->set_output(0, out);
     return Status::OK();
   }
-  DimensionHandle num_in_elems = c->NumElements(in);
-  if (c->FullyDefined(out)) {
-    DimensionHandle num_out_elems = c->NumElements(out);
-    if (c->ValueKnown(num_in_elems) &&
-        c->Value(num_in_elems) != c->Value(num_out_elems)) {
-      return errors::InvalidArgument(
-          "Cannot reshape a tensor with ", c->DebugString(num_in_elems),
-          " elements to shape ", c->DebugString(out), " (",
-          c->DebugString(num_out_elems), " elements)");
-    }
-    c->set_output(0, out);
-    return Status::OK();
-  }
 
-  if (c->ValueKnown(num_in_elems)) {
+  if (c->RankKnown(out) && c->RankKnown(in)) {
     // We don't know the number of output elements, but we can try to infer
     // the missing dimension.
-    int32 unknown_idx = -1;
     bool too_many_unknown = false;
-    DimensionHandle known_elems = c->MakeDim(1);
-    for (int32 i = 0; i < c->Rank(out); ++i) {
-      DimensionHandle dim = c->Dim(out, i);
-      if (!c->ValueKnown(dim)) {
-        if (unknown_idx >= 0) {
-          too_many_unknown = true;
-          break;
+    int32 out_unknown_idx = -1;
+
+    DimensionHandle known_out_elems = c->NumElements(out);
+    if (!c->ValueKnown(known_out_elems)) {
+      known_out_elems = c->MakeDim(1);
+      for (int32 i = 0; i < c->Rank(out); ++i) {
+        DimensionHandle dim = c->Dim(out, i);
+        if (!c->ValueKnown(dim)) {
+          if (out_unknown_idx >= 0) {
+            too_many_unknown = true;
+            break;
+          }
+          out_unknown_idx = i;
+        } else {
+          TF_RETURN_IF_ERROR(
+              c->Multiply(known_out_elems, dim, &known_out_elems));
+        }
+      }
+    }
+    int32 in_unknown_idx = -1;
+    DimensionHandle known_in_elems = c->NumElements(in);
+    if (!c->ValueKnown(known_in_elems)) {
+      known_in_elems = c->MakeDim(1);
+      for (int32 i = 0; i < c->Rank(in); ++i) {
+        DimensionHandle dim = c->Dim(in, i);
+        if (!c->ValueKnown(dim)) {
+          if (in_unknown_idx >= 0) {
+            too_many_unknown = true;
+            break;
+          }
+          in_unknown_idx = i;
+        } else {
+          TF_RETURN_IF_ERROR(c->Multiply(known_in_elems, dim, &known_in_elems));
         }
-        unknown_idx = i;
-      } else {
-        TF_RETURN_IF_ERROR(c->Multiply(known_elems, dim, &known_elems));
       }
     }
-    if (!too_many_unknown && c->Value(known_elems) != 0) {
-      DimensionHandle inferred_dim;
-      TF_RETURN_IF_ERROR(c->Divide(num_in_elems, c->Value(known_elems),
-                                   true /* evenly_divisible */, &inferred_dim));
-      TF_RETURN_IF_ERROR(c->ReplaceDim(out, unknown_idx, inferred_dim, &out));
+
+    if (!too_many_unknown) {
+      if (in_unknown_idx < 0 && out_unknown_idx < 0) {
+        // Just check that the dimensions match.
+        if (c->Value(known_in_elems) != c->Value(known_out_elems)) {
+          return errors::InvalidArgument(
+              "Cannot reshape a tensor with ", c->DebugString(known_in_elems),
+              " elements to shape ", c->DebugString(out), " (",
+              c->DebugString(known_out_elems), " elements)");
+        }
+      } else if (in_unknown_idx < 0 && out_unknown_idx >= 0 &&
+                 c->Value(known_out_elems) > 0) {
+        // Input fully known, infer the one missing output dim
+        DimensionHandle inferred_dim;
+        TF_RETURN_IF_ERROR(c->Divide(known_in_elems, c->Value(known_out_elems),
+                                     true /* evenly_divisible */,
+                                     &inferred_dim));
+        TF_RETURN_IF_ERROR(
+            c->ReplaceDim(out, out_unknown_idx, inferred_dim, &out));
+
+      } else if (in_unknown_idx >= 0 && out_unknown_idx < 0 &&
+                 c->Value(known_in_elems) != 0) {
+        // Output fully known, infer the one missing input dim
+        DimensionHandle inferred_dim;
+        TF_RETURN_IF_ERROR(c->Divide(known_out_elems, c->Value(known_in_elems),
+                                     true /* evenly_divisible */,
+                                     &inferred_dim));
+        DimensionHandle unknown_in_dim = c->Dim(in, in_unknown_idx);
+        TF_RETURN_IF_ERROR(
+            c->Merge(unknown_in_dim, inferred_dim, &unknown_in_dim));
+      } else if (in_unknown_idx >= 0 && out_unknown_idx >= 0) {
+        // Exactly one unknown dimension in both input and output. These 2 are
+        // equal iff the known elements are equal.
+        if (c->Value(known_in_elems) == c->Value(known_out_elems)) {
+          DimensionHandle unknown_in_dim = c->Dim(in, in_unknown_idx);
+          TF_RETURN_IF_ERROR(
+              c->ReplaceDim(out, out_unknown_idx, unknown_in_dim, &out));
+        }
+      }
     }
   }
-
   c->set_output(0, out);
   return Status::OK();
 }
@@ -299,6 +342,50 @@ REGISTER_OP("Pack")
       return Status::OK();
     });
 
+REGISTER_OP("DeepCopy")
+    .Input("x: T")
+    .Output("y: T")
+    .Attr("T: type")
+    .SetIsStateful()
+    .SetShapeFn(UnchangedShape);
+
+REGISTER_OP("InplaceUpdate")
+    .Input("x: T")
+    .Input("i: int32")
+    .Input("v: T")
+    .Output("y: T")
+    .Attr("T: type")
+    .SetShapeFn(UnchangedShape);
+
+REGISTER_OP("InplaceAdd")
+    .Input("x: T")
+    .Input("i: int32")
+    .Input("v: T")
+    .Output("y: T")
+    .Attr("T: type")
+    .SetShapeFn(UnchangedShape);
+
+REGISTER_OP("InplaceSub")
+    .Input("x: T")
+    .Input("i: int32")
+    .Input("v: T")
+    .Output("y: T")
+    .Attr("T: type")
+    .SetShapeFn(UnchangedShape);
+
+REGISTER_OP("Empty")
+    .Input("shape: int32")
+    .Output("output: dtype")
+    .Attr("dtype: type")
+    .Attr("init: bool = false")
+    .SetIsStateful()
+    .SetShapeFn([](InferenceContext* c) {
+      ShapeHandle out;
+      TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(0, &out));
+      c->set_output(0, out);
+      return Status::OK();
+    });
+
 // --------------------------------------------------------------------------
 REGISTER_OP("Unpack")
     .Input("value: T")
@@ -452,9 +539,9 @@ REGISTER_OP("SplitV")
       const Tensor* size_splits = c->input_tensor(1);
       if (rank == InferenceContext::kUnknownRank) {
         // If the rank of input tensor is unknown, then return unknown shapes.
-        output_shape = c->UnknownShape();
+        // Note that the shape of each output can be different.
         for (int i = 0; i < num_outputs; ++i) {
-          c->set_output(i, output_shape);
+          c->set_output(i, c->UnknownShape());
         }
       } else if (rank == 0) {
         // Throw error if input is a scalar.
@@ -463,18 +550,19 @@ REGISTER_OP("SplitV")
         // If split dimension is known, but the sizes are unknown, then
         // only the split dimension is unknown
         output_shape = input;
-        TF_RETURN_IF_ERROR(c->ReplaceDim(output_shape,
-                                         c->Value(split_dimension),
-                                         c->UnknownDim(), &output_shape));
         for (int i = 0; i < num_outputs; ++i) {
+          TF_RETURN_IF_ERROR(c->ReplaceDim(output_shape,
+                                           c->Value(split_dimension),
+                                           c->UnknownDim(), &output_shape));
           c->set_output(i, output_shape);
         }
       } else if (size_splits == nullptr && !c->ValueKnown(split_dimension)) {
         // If split dimension or tensor containing the split sizes is unknown,
-        // then return unknown shapes of same rank as input.
-        output_shape = c->UnknownShapeOfRank(rank);
+        // then return unknown shapes of same rank as input. Note that each
+        // output shape can be different since splitv doesn't always split
+        // tensors evenly.
         for (int i = 0; i < num_outputs; ++i) {
-          c->set_output(i, output_shape);
+          c->set_output(i, c->UnknownShapeOfRank(rank));
         }
       } else {
         // Determine the output shape if split dimension and split sizes are
@@ -579,7 +667,7 @@ REGISTER_OP("OnesLike")
     .Input("x: T")
     .Output("y: T")
     .Attr(
-        "T: {bfloat16, float, double, int8, uint8, int16, uint16, int32, "
+        "T: {bfloat16, half, float, double, int8, uint8, int16, uint16, int32, "
         "int64, complex64, complex128, bool}")
     .SetShapeFn(shape_inference::UnchangedShape);
 
@@ -587,7 +675,9 @@ REGISTER_OP("OnesLike")
 REGISTER_OP("Diag")
     .Input("diagonal: T")
     .Output("output: T")
-    .Attr("T: {bfloat16, float, double, int32, int64, complex64, complex128}")
+    .Attr(
+        "T: {bfloat16, half, float, double, int32, int64, complex64, "
+        "complex128}")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle in = c->input(0);
       TF_RETURN_IF_ERROR(c->WithRankAtLeast(in, 1, &in));
@@ -602,7 +692,9 @@ REGISTER_OP("Diag")
 REGISTER_OP("DiagPart")
     .Input("input: T")
     .Output("diagonal: T")
-    .Attr("T: {bfloat16, float, double, int32, int64, complex64, complex128}")
+    .Attr(
+        "T: {bfloat16, half, float, double, int32, int64, complex64, "
+        "complex128}")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle in = c->input(0);
       if (!c->RankKnown(in)) {
@@ -746,7 +838,7 @@ REGISTER_OP("ReverseV2")
     .Output("output: T")
     .Attr("Tidx: {int32, int64} = DT_INT32")
     .Attr(
-        "T: {uint8, int8, uint16, int16, int32, int64, bool, half, bfloat16, "
+        "T: {uint8, int8, uint16, int16, int32, int64, bool, bfloat16, half, "
         "float, double, complex64, complex128, string}")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle input = c->input(0);
@@ -776,7 +868,7 @@ REGISTER_OP("ReverseV2")
           }
           if (axes_dense[canonical_axis]) {
             return errors::InvalidArgument("axis ", canonical_axis,
-                                            " specified more than once.");
+                                           " specified more than once.");
           }
           axes_dense[canonical_axis] = true;
         }
@@ -1122,7 +1214,7 @@ REGISTER_OP("PreventGradient")
 REGISTER_OP("CheckNumerics")
     .Input("tensor: T")
     .Output("output: T")
-    .Attr("T: {half, bfloat16, float, double}")
+    .Attr("T: {bfloat16, half, float, double}")
     .Attr("message: string")
     .SetShapeFn(shape_inference::UnchangedShape);
 
@@ -2407,13 +2499,12 @@ REGISTER_OP("Bitcast")
     .Output("output: type")
     // All supported dtypes are listed here to include qint16 and quint16.
     .Attr(
-        "T: {bfloat16, float, double, int64, int32, uint8, uint16, int8, int16,"
-        " complex64, complex128, qint8, quint8, qint16, quint16, qint32,"
-        " half}")
+        "T: {bfloat16, half, float, double, int64, int32, uint8, uint16, int8, "
+        "int16, complex64, complex128, qint8, quint8, qint16, quint16, qint32}")
     .Attr(
-        "type: {bfloat16, float, double, int64, int32, uint8, uint16, int8, "
-        "int16, complex64, complex128, qint8, quint8, qint16, quint16, qint32,"
-        " half}")
+        "type: {bfloat16, half, float, double, int64, int32, uint8, uint16, "
+        "int8, int16, complex64, complex128, qint8, quint8, qint16, quint16, "
+        "qint32}")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle input = c->input(0);
       if (!c->RankKnown(input)) {
@@ -2509,7 +2600,7 @@ REGISTER_OP("QuantizeAndDequantize")
     .Attr("input_min: float = 0")
     .Attr("input_max: float = 0")
     .Output("output: T")
-    .Attr("T: {bfloat16, float, double}")
+    .Attr("T: {bfloat16, half, float, double}")
     .SetShapeFn(shape_inference::UnchangedShape)
     .Deprecated(22, "Replaced by QuantizeAndDequantizeV2");
 
@@ -2522,7 +2613,7 @@ REGISTER_OP("QuantizeAndDequantizeV2")
     .Attr("num_bits: int = 8")
     .Attr("range_given: bool = false")
     .Output("output: T")
-    .Attr("T: {bfloat16, float, double}")
+    .Attr("T: {bfloat16, half, float, double}")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle unused;
       TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
@@ -2539,7 +2630,7 @@ REGISTER_OP("QuantizeAndDequantizeV3")
     .Attr("signed_input: bool = true")
     .Attr("range_given: bool = true")
     .Output("output: T")
-    .Attr("T: {bfloat16, float, double}")
+    .Attr("T: {bfloat16, half, float, double}")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle unused;
       TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
diff --git a/tensorflow/core/ops/array_ops_test.cc b/tensorflow/core/ops/array_ops_test.cc
index cf5bb5ad849571c92f7dccf4d0fdc5780965567c..b1463338fbe726e10a3fb0a2cdc69521ab021ce6 100644
--- a/tensorflow/core/ops/array_ops_test.cc
+++ b/tensorflow/core/ops/array_ops_test.cc
@@ -838,7 +838,7 @@ TEST(ArrayOpsTest, Reshape_ShapeFn) {
   // Unknown dimensions.
   // Flatten:
   new_shape = test::AsTensor<int32>({-1});
-  INFER_OK(op, "[?];[1]", "[?]");
+  INFER_OK(op, "[?];[1]", "[d0_0]");
   INFER_OK(op, "[2,2];[1]", "[4]");
   // The first dimension is inferred:
   new_shape = test::AsTensor<int32>({2, -1});
@@ -851,6 +851,10 @@ TEST(ArrayOpsTest, Reshape_ShapeFn) {
   new_shape = test::AsTensor<int32>({-1, -1, 2});
   INFER_OK(op, "[8];[3]", "[?,?,2]");
 
+  // Symbolic shape propagation
+  new_shape = test::AsTensor<int32>({-1, 2, 3});
+  INFER_OK(op, "[?,2,3];[3]", "[d0_0,2,3]");
+
   // Reshaping to a scalar.
   new_shape = test::AsTensor<int32>({});
   INFER_OK(op, "[1];[0]", "[]");
diff --git a/tensorflow/core/ops/boosted_trees_ops.cc b/tensorflow/core/ops/boosted_trees_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..297e94655fe3c62893291de2a256b177222cd7a2
--- /dev/null
+++ b/tensorflow/core/ops/boosted_trees_ops.cc
@@ -0,0 +1,319 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <vector>
+
+#include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/framework/shape_inference.h"
+
+namespace tensorflow {
+
+REGISTER_RESOURCE_HANDLE_OP(BoostedTreesEnsembleResource);
+
+REGISTER_OP("IsBoostedTreesEnsembleInitialized")
+    .Input("tree_ensemble_handle: resource")
+    .Output("is_initialized: bool")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle unused_input;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &unused_input));
+      c->set_output(0, c->Scalar());
+      return Status::OK();
+    });
+
+REGISTER_OP("BoostedTreesCalculateBestGainsPerFeature")
+    .Input("node_id_range: int32")
+    .Input("stats_summary_list: num_features * float32")
+    .Attr("l1: float")
+    .Attr("l2: float")
+    .Attr("tree_complexity: float")
+    .Attr("max_splits: int >= 1")
+    .Attr("num_features: int >= 1")  // not passed but populated automatically.
+    .Output("node_ids_list: num_features * int32")
+    .Output("gains_list: num_features * float32")
+    .Output("thresholds_list: num_features * int32")
+    .Output("left_node_contribs_list: num_features * float32")
+    .Output("right_node_contribs_list: num_features * float32")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      // Confirms the rank of the inputs and sets the shape of the outputs.
+      int max_splits;
+      int num_features;
+      float l1, l2, tree_complexity;
+      TF_RETURN_IF_ERROR(c->GetAttr("l1", &l1));
+      if (l1 < 0) {
+        return errors::InvalidArgument("l1 must be non-negative.");
+      }
+      TF_RETURN_IF_ERROR(c->GetAttr("l2", &l2));
+      if (l2 < 0) {
+        return errors::InvalidArgument("l2 must be non-negative.");
+      }
+      TF_RETURN_IF_ERROR(c->GetAttr("tree_complexity", &tree_complexity));
+      if (tree_complexity < 0) {
+        return errors::InvalidArgument("Tree complexity must be non-negative.");
+      }
+      TF_RETURN_IF_ERROR(c->GetAttr("max_splits", &max_splits));
+      TF_RETURN_IF_ERROR(c->GetAttr("num_features", &num_features));
+      shape_inference::ShapeHandle node_id_range_shape;
+      shape_inference::ShapeHandle unused_shape;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &node_id_range_shape));
+      TF_RETURN_IF_ERROR(
+          c->Merge(node_id_range_shape, c->MakeShape({2}), &unused_shape));
+      // Checks that all stats summary entries are of the same shape.
+      shape_inference::ShapeHandle summary_shape_base;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 3, &summary_shape_base));
+      TF_RETURN_IF_ERROR(c->Merge(summary_shape_base,
+                                  c->MakeShape({max_splits, -1, 2}),
+                                  &unused_shape));
+      for (int i = 1; i < num_features; ++i) {
+        shape_inference::ShapeHandle summary_shape;
+        TF_RETURN_IF_ERROR(c->WithRank(c->input(1 + i), 3, &summary_shape));
+        TF_RETURN_IF_ERROR(
+            c->Merge(summary_shape_base, summary_shape, &unused_shape));
+      }
+      // Sets the output lists.
+      std::vector<shape_inference::ShapeHandle> output_shapes_vec(
+          num_features, c->MakeShape({-1}));
+      TF_RETURN_IF_ERROR(c->set_output("node_ids_list", output_shapes_vec));
+      TF_RETURN_IF_ERROR(c->set_output("gains_list", output_shapes_vec));
+      TF_RETURN_IF_ERROR(c->set_output("thresholds_list", output_shapes_vec));
+      std::vector<shape_inference::ShapeHandle> output_shapes_contribs(
+          num_features, c->MakeShape({-1, 1}));
+      TF_RETURN_IF_ERROR(
+          c->set_output("left_node_contribs_list", output_shapes_contribs));
+      TF_RETURN_IF_ERROR(
+          c->set_output("right_node_contribs_list", output_shapes_contribs));
+      return Status::OK();
+    });
+
+REGISTER_OP("BoostedTreesCreateEnsemble")
+    .Input("tree_ensemble_handle: resource")
+    .Input("stamp_token: int64")
+    .Input("tree_ensemble_serialized: string")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle unused_input;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &unused_input));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused_input));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused_input));
+      return Status::OK();
+    });
+
+REGISTER_OP("BoostedTreesDeserializeEnsemble")
+    .Input("tree_ensemble_handle: resource")
+    .Input("stamp_token: int64")
+    .Input("tree_ensemble_serialized: string")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle unused_input;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &unused_input));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused_input));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused_input));
+      return Status::OK();
+    });
+
+REGISTER_OP("BoostedTreesGetEnsembleStates")
+    .Input("tree_ensemble_handle: resource")
+    .Output("stamp_token: int64")
+    .Output("num_trees: int32")
+    .Output("num_finalized_trees: int32")
+    .Output("num_attempted_layers: int32")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle unused_input;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &unused_input));
+      c->set_output(0, c->Scalar());
+      c->set_output(1, c->Scalar());
+      c->set_output(2, c->Scalar());
+      c->set_output(3, c->Scalar());
+      return Status::OK();
+    });
+
+REGISTER_OP("BoostedTreesMakeStatsSummary")
+    .Input("node_ids: int32")
+    .Input("gradients: float")
+    .Input("hessians: float")
+    .Input("bucketized_features_list: num_features * int32")
+    .Attr("max_splits: int >= 1")
+    .Attr("num_buckets: int >= 1")
+    .Attr("num_features: int >= 1")
+    .Output("stats_summary: float")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      // Sets the shape of the output as a Rank 4 Tensor.
+      int max_splits;
+      int num_buckets;
+      int num_features;
+      TF_RETURN_IF_ERROR(c->GetAttr("max_splits", &max_splits));
+      TF_RETURN_IF_ERROR(c->GetAttr("num_buckets", &num_buckets));
+      TF_RETURN_IF_ERROR(c->GetAttr("num_features", &num_features));
+      shape_inference::ShapeHandle node_ids_shape;
+      shape_inference::ShapeHandle gradients_shape;
+      shape_inference::ShapeHandle hessians_shape;
+      shape_inference::ShapeHandle bucketized_feature_shape;
+      shape_inference::ShapeHandle unused_shape;
+      shape_inference::DimensionHandle unused_dim;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &node_ids_shape));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 2, &gradients_shape));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 2, &hessians_shape));
+      TF_RETURN_IF_ERROR(c->Merge(c->Dim(node_ids_shape, 0),
+                                  c->Dim(gradients_shape, 0), &unused_dim));
+      TF_RETURN_IF_ERROR(
+          c->Merge(gradients_shape, hessians_shape, &unused_shape));
+      for (int f = 0; f < num_features; ++f) {
+        TF_RETURN_IF_ERROR(
+            c->WithRank(c->input(3 + f), 1, &bucketized_feature_shape));
+        TF_RETURN_IF_ERROR(c->Merge(c->Dim(node_ids_shape, 0),
+                                    c->Dim(bucketized_feature_shape, 0),
+                                    &unused_dim));
+      }
+      c->set_output(0,
+                    c->MakeShape({num_features, max_splits, num_buckets, 2}));
+      return Status::OK();
+    });
+
+REGISTER_OP("BoostedTreesPredict")
+    .Input("tree_ensemble_handle: resource")
+    .Input("bucketized_features: num_bucketized_features * int32")
+    .Attr("num_bucketized_features: int >= 1")
+    .Attr("logits_dimension: int")
+    .Attr("max_depth: int >= 1")
+    .Output("logits: float")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle feature_shape;
+      int num_bucketized_features;
+      TF_RETURN_IF_ERROR(
+          c->GetAttr("num_bucketized_features", &num_bucketized_features));
+      shape_inference::ShapeHandle unused_input;
+      for (int i = 0; i < num_bucketized_features; ++i) {
+        TF_RETURN_IF_ERROR(c->WithRank(c->input(i + 1), 1, &feature_shape));
+        // Check that the shapes of all bucketized features are the same.
+        TF_RETURN_IF_ERROR(c->Merge(c->input(1), feature_shape, &unused_input));
+      }
+
+      int logits_dimension;
+      TF_RETURN_IF_ERROR(c->GetAttr("logits_dimension", &logits_dimension));
+      auto logits_shape =
+          c->MakeShape({c->Dim(feature_shape, 0), logits_dimension});
+      // Logits.
+      c->set_output(0, logits_shape);
+      return Status::OK();
+    });
+
+REGISTER_OP("BoostedTreesSerializeEnsemble")
+    .Input("tree_ensemble_handle: resource")
+    .Output("stamp_token: int64")
+    .Output("tree_ensemble_serialized: string")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle unused_input;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &unused_input));
+      c->set_output(0, c->Scalar());
+      c->set_output(1, c->Scalar());
+      return Status::OK();
+    });
+
+REGISTER_OP("BoostedTreesTrainingPredict")
+    .Input("tree_ensemble_handle: resource")
+    .Input("cached_tree_ids: int32")
+    .Input("cached_node_ids: int32")
+    .Input("bucketized_features: num_bucketized_features * int32")
+    .Attr("num_bucketized_features: int >= 1")
+    .Attr("logits_dimension: int")
+    .Attr("max_depth: int >= 1")
+    .Output("partial_logits: float")
+    .Output("tree_ids: int32")
+    .Output("node_ids: int32")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle feature_shape;
+      int num_bucketized_features;
+      TF_RETURN_IF_ERROR(
+          c->GetAttr("num_bucketized_features", &num_bucketized_features));
+
+      int max_depth;
+      TF_RETURN_IF_ERROR(c->GetAttr("max_depth", &max_depth));
+
+      shape_inference::ShapeHandle unused_input;
+      for (int i = 0; i < num_bucketized_features; ++i) {
+        TF_RETURN_IF_ERROR(c->WithRank(c->input(i + 3), 1, &feature_shape));
+        TF_RETURN_IF_ERROR(
+            c->Merge(c->input(i + 3), feature_shape, &unused_input));
+      }
+      // all inputs/outputs except logits should have same shape.
+      TF_RETURN_IF_ERROR(c->Merge(c->input(1), feature_shape, &unused_input));
+      TF_RETURN_IF_ERROR(c->Merge(c->input(2), feature_shape, &unused_input));
+
+      int logits_dimension;
+      TF_RETURN_IF_ERROR(c->GetAttr("logits_dimension", &logits_dimension));
+      auto logits_shape =
+          c->MakeShape({c->Dim(feature_shape, 0), logits_dimension});
+      // Partial logits.
+      c->set_output(0, logits_shape);
+      // Tree ids.
+      c->set_output(1, c->MakeShape({c->Dim(feature_shape, 0)}));
+      // Node ids.
+      c->set_output(2, c->MakeShape({c->Dim(feature_shape, 0)}));
+      return Status::OK();
+    });
+
+REGISTER_OP("BoostedTreesUpdateEnsemble")
+    .Input("tree_ensemble_handle: resource")
+    .Input("feature_ids: int32")
+    .Input("node_ids: num_features * int32")
+    .Input("gains: num_features * float")
+    .Input("thresholds: num_features * int32")
+    .Input("left_node_contribs: num_features * float")
+    .Input("right_node_contribs: num_features * float")
+    .Attr("max_depth: int >= 1")
+    .Attr("learning_rate: float")
+    .Attr("pruning_mode: int >=0")
+    .Attr("num_features: int >= 0")  // Inferred.
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle shape_handle;
+      int num_features;
+      TF_RETURN_IF_ERROR(c->GetAttr("num_features", &num_features));
+
+      // Feature_ids, should be one for each feature.
+      shape_inference::ShapeHandle feature_ids_shape;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &feature_ids_shape));
+      TF_RETURN_IF_ERROR(
+          c->Merge(c->input(1), c->Vector(num_features), &shape_handle));
+
+      for (int i = 0; i < num_features; ++i) {
+        // Node ids.
+        TF_RETURN_IF_ERROR(c->WithRank(c->input(i + 2), 1, &shape_handle));
+        auto shape_rank_1 = c->MakeShape({c->Dim(shape_handle, 0)});
+        auto shape_rank_2 = c->MakeShape({c->Dim(shape_handle, 0), 1});
+
+        // Gains.
+        TF_RETURN_IF_ERROR(
+            c->WithRank(c->input(i + num_features + 2), 1, &shape_handle));
+        // TODO(nponomareva): replace this with input("name",vector of shapes).
+        TF_RETURN_IF_ERROR(c->Merge(c->input(i + num_features + 2),
+                                    shape_rank_1, &shape_handle));
+        // Thresholds.
+        TF_RETURN_IF_ERROR(
+            c->WithRank(c->input(i + num_features * 2 + 2), 1, &shape_handle));
+        TF_RETURN_IF_ERROR(c->Merge(c->input(i + num_features * 2 + 2),
+                                    shape_rank_1, &shape_handle));
+        // Left and right node contribs.
+        TF_RETURN_IF_ERROR(
+            c->WithRank(c->input(i + num_features * 3 + 2), 2, &shape_handle));
+        TF_RETURN_IF_ERROR(c->Merge(c->input(i + num_features * 3 + 2),
+                                    shape_rank_2, &shape_handle));
+        TF_RETURN_IF_ERROR(
+            c->WithRank(c->input(i + num_features * 4 + 2), 2, &shape_handle));
+        TF_RETURN_IF_ERROR(c->Merge(c->input(i + num_features * 4 + 2),
+                                    shape_rank_2, &shape_handle));
+      }
+      return Status::OK();
+    });
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/ops/collective_ops.cc b/tensorflow/core/ops/collective_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d6157a69df5cf535a0957df8b7ed6d4f597acd1d
--- /dev/null
+++ b/tensorflow/core/ops/collective_ops.cc
@@ -0,0 +1,55 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/framework/op.h"
+
+namespace tensorflow {
+
+REGISTER_OP("CollectiveReduce")
+    .Input("input: T")
+    .Output("data: T")
+    .Attr("T: {float, float16, float64, int32, int64}")
+    .Attr("group_size: int")
+    .Attr("group_key: int")
+    .Attr("instance_key: int")
+    .Attr("merge_op: {'Min', 'Max', 'Mul', 'Add'}")
+    .Attr("final_op: {'Id', 'Div'}")
+    .Attr("subdiv_offsets: list(int)")
+    .SetIsStateful()
+    .SetShapeFn(shape_inference::UnchangedShape);
+
+REGISTER_OP("CollectiveBcastSend")
+    .Input("input: T")
+    .Output("data: T")
+    .Attr("T: {float, float16, float64, int32, int64}")
+    .Attr("group_size: int")
+    .Attr("group_key: int")
+    .Attr("instance_key: int")
+    .Attr("shape: shape")
+    .SetIsStateful()
+    .SetShapeFn(shape_inference::ExplicitShape);
+
+REGISTER_OP("CollectiveBcastRecv")
+    .Output("data: T")
+    .Attr("T: {float, float16, float64, int32, int64}")
+    .Attr("group_size: int")
+    .Attr("group_key: int")
+    .Attr("instance_key: int")
+    .Attr("shape: shape")
+    .SetIsStateful()
+    .SetShapeFn(shape_inference::ExplicitShape);
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/ops/compat/BUILD b/tensorflow/core/ops/compat/BUILD
index 6cdb1586bc826f5de4c926c0b5f7bf0f9285cd93..c613ab144f8824586121200b3f89c87b25cc7522 100644
--- a/tensorflow/core/ops/compat/BUILD
+++ b/tensorflow/core/ops/compat/BUILD
@@ -57,18 +57,3 @@ tf_cc_binary(
         "//tensorflow/core:lib",
     ],
 )
-
-# -----------------------------------------------------------------------------
-# Google-internal targets.  These must be at the end for syncrepo.
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/core/ops/compat/ops_history.v1.pbtxt b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
index 05d6e0228168c63b788a1c321b70390a2e142bb2..12df60a2ae3ef4753a6628c179da22dfda5a6fb3 100644
--- a/tensorflow/core/ops/compat/ops_history.v1.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
@@ -64,6 +64,31 @@ op {
     }
   }
 }
+op {
+  name: "Abs"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
 op {
   name: "AccumulateNV2"
   input_arg {
@@ -607,6 +632,33 @@ op {
     }
   }
 }
+op {
+  name: "Acos"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "Acosh"
   input_arg {
@@ -656,6 +708,31 @@ op {
     }
   }
 }
+op {
+  name: "Acosh"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "Add"
   input_arg {
@@ -725,6 +802,41 @@ op {
     }
   }
 }
+op {
+  name: "Add"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_STRING
+      }
+    }
+  }
+}
 op {
   name: "AddManySparseToTensorsMap"
   input_arg {
@@ -1094,6 +1206,42 @@ op {
   is_aggregate: true
   is_commutative: true
 }
+op {
+  name: "AddV2"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+  is_aggregate: true
+  is_commutative: true
+}
 op {
   name: "AdjustContrast"
   input_arg {
@@ -6166,6 +6314,33 @@ op {
     }
   }
 }
+op {
+  name: "Asin"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "Asinh"
   input_arg {
@@ -6215,6 +6390,31 @@ op {
     }
   }
 }
+op {
+  name: "Asinh"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "Assert"
   input_arg {
@@ -6761,6 +6961,33 @@ op {
     }
   }
 }
+op {
+  name: "Atan"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "Atan2"
   input_arg {
@@ -6812,6 +7039,33 @@ op {
     }
   }
 }
+op {
+  name: "Atan2"
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
 op {
   name: "Atanh"
   input_arg {
@@ -6861,6 +7115,31 @@ op {
     }
   }
 }
+op {
+  name: "Atanh"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "AudioSpectrogram"
   input_arg {
@@ -7331,6 +7610,111 @@ op {
     }
   }
 }
+op {
+  name: "AvgPool3D"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NDHWC"
+    }
+    allowed_values {
+      list {
+        s: "NDHWC"
+        s: "NCDHW"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "AvgPool3DGrad"
+  input_arg {
+    name: "orig_input_shape"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
 op {
   name: "AvgPool3DGrad"
   input_arg {
@@ -7367,6 +7751,19 @@ op {
       }
     }
   }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NDHWC"
+    }
+    allowed_values {
+      list {
+        s: "NDHWC"
+        s: "NCDHW"
+      }
+    }
+  }
   attr {
     name: "T"
     type: "type"
@@ -7432,6 +7829,7 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -7492,6 +7890,7 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
         type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
@@ -8328,6 +8727,50 @@ op {
     }
   }
 }
+op {
+  name: "BatchMatMul"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+  attr {
+    name: "adj_x"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "adj_y"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
 op {
   name: "BatchMatrixBandPart"
   input_arg {
@@ -10155,99 +10598,160 @@ op {
   }
 }
 op {
-  name: "BitwiseAnd"
-  input_arg {
-    name: "x"
-    type_attr: "T"
-  }
+  name: "Bitcast"
   input_arg {
-    name: "y"
+    name: "input"
     type_attr: "T"
   }
   output_arg {
-    name: "z"
-    type_attr: "T"
+    name: "output"
+    type_attr: "type"
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_INT8
-        type: DT_INT16
-        type: DT_INT32
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
         type: DT_INT64
+        type: DT_INT32
         type: DT_UINT8
         type: DT_UINT16
-      }
-    }
-  }
-  is_commutative: true
-}
-op {
-  name: "BitwiseAnd"
-  input_arg {
-    name: "x"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "y"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "z"
-    type_attr: "T"
-  }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
         type: DT_INT8
         type: DT_INT16
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_UINT32
-        type: DT_UINT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
       }
     }
   }
-  is_commutative: true
-}
-op {
-  name: "BitwiseOr"
-  input_arg {
-    name: "x"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "y"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "z"
-    type_attr: "T"
-  }
   attr {
-    name: "T"
+    name: "type"
     type: "type"
     allowed_values {
       list {
-        type: DT_INT8
-        type: DT_INT16
-        type: DT_INT32
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
         type: DT_INT64
+        type: DT_INT32
         type: DT_UINT8
         type: DT_UINT16
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT16
+        type: DT_QUINT16
+        type: DT_QINT32
       }
     }
   }
-  is_commutative: true
 }
 op {
-  name: "BitwiseOr"
+  name: "BitwiseAnd"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_UINT16
+      }
+    }
+  }
+  is_commutative: true
+}
+op {
+  name: "BitwiseAnd"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  is_commutative: true
+}
+op {
+  name: "BitwiseOr"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_UINT16
+      }
+    }
+  }
+  is_commutative: true
+}
+op {
+  name: "BitwiseOr"
   input_arg {
     name: "x"
     type_attr: "T"
@@ -10340,6 +10844,342 @@ op {
   }
   is_commutative: true
 }
+op {
+  name: "BoostedTreesCalculateBestGainsPerFeature"
+  input_arg {
+    name: "node_id_range"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "stats_summary_list"
+    type: DT_FLOAT
+    number_attr: "num_features"
+  }
+  output_arg {
+    name: "node_ids_list"
+    type: DT_INT32
+    number_attr: "num_features"
+  }
+  output_arg {
+    name: "gains_list"
+    type: DT_FLOAT
+    number_attr: "num_features"
+  }
+  output_arg {
+    name: "thresholds_list"
+    type: DT_INT32
+    number_attr: "num_features"
+  }
+  output_arg {
+    name: "left_node_contribs_list"
+    type: DT_FLOAT
+    number_attr: "num_features"
+  }
+  output_arg {
+    name: "right_node_contribs_list"
+    type: DT_FLOAT
+    number_attr: "num_features"
+  }
+  attr {
+    name: "l1"
+    type: "float"
+  }
+  attr {
+    name: "l2"
+    type: "float"
+  }
+  attr {
+    name: "tree_complexity"
+    type: "float"
+  }
+  attr {
+    name: "max_splits"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "num_features"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "BoostedTreesCreateEnsemble"
+  input_arg {
+    name: "tree_ensemble_handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "stamp_token"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "tree_ensemble_serialized"
+    type: DT_STRING
+  }
+  is_stateful: true
+}
+op {
+  name: "BoostedTreesDeserializeEnsemble"
+  input_arg {
+    name: "tree_ensemble_handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "stamp_token"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "tree_ensemble_serialized"
+    type: DT_STRING
+  }
+  is_stateful: true
+}
+op {
+  name: "BoostedTreesEnsembleResourceHandleOp"
+  output_arg {
+    name: "resource"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "BoostedTreesGetEnsembleStates"
+  input_arg {
+    name: "tree_ensemble_handle"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "stamp_token"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "num_trees"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "num_finalized_trees"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "num_attempted_layers"
+    type: DT_INT32
+  }
+  is_stateful: true
+}
+op {
+  name: "BoostedTreesMakeStatsSummary"
+  input_arg {
+    name: "node_ids"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "gradients"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "hessians"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "bucketized_features_list"
+    type: DT_INT32
+    number_attr: "num_features"
+  }
+  output_arg {
+    name: "stats_summary"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "max_splits"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "num_buckets"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "num_features"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "BoostedTreesPredict"
+  input_arg {
+    name: "tree_ensemble_handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "bucketized_features"
+    type: DT_INT32
+    number_attr: "num_bucketized_features"
+  }
+  output_arg {
+    name: "logits"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "num_bucketized_features"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "logits_dimension"
+    type: "int"
+  }
+  attr {
+    name: "max_depth"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "BoostedTreesSerializeEnsemble"
+  input_arg {
+    name: "tree_ensemble_handle"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "stamp_token"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "tree_ensemble_serialized"
+    type: DT_STRING
+  }
+  is_stateful: true
+}
+op {
+  name: "BoostedTreesTrainingPredict"
+  input_arg {
+    name: "tree_ensemble_handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "cached_tree_ids"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "cached_node_ids"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "bucketized_features"
+    type: DT_INT32
+    number_attr: "num_bucketized_features"
+  }
+  output_arg {
+    name: "partial_logits"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "tree_ids"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "node_ids"
+    type: DT_INT32
+  }
+  attr {
+    name: "num_bucketized_features"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "logits_dimension"
+    type: "int"
+  }
+  attr {
+    name: "max_depth"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "BoostedTreesUpdateEnsemble"
+  input_arg {
+    name: "tree_ensemble_handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "feature_ids"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "node_ids"
+    type: DT_INT32
+    number_attr: "num_features"
+  }
+  input_arg {
+    name: "gains"
+    type: DT_FLOAT
+    number_attr: "num_features"
+  }
+  input_arg {
+    name: "thresholds"
+    type: DT_INT32
+    number_attr: "num_features"
+  }
+  input_arg {
+    name: "left_node_contribs"
+    type: DT_FLOAT
+    number_attr: "num_features"
+  }
+  input_arg {
+    name: "right_node_contribs"
+    type: DT_FLOAT
+    number_attr: "num_features"
+  }
+  attr {
+    name: "max_depth"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "learning_rate"
+    type: "float"
+  }
+  attr {
+    name: "pruning_mode"
+    type: "int"
+    has_minimum: true
+  }
+  attr {
+    name: "num_features"
+    type: "int"
+    has_minimum: true
+  }
+  is_stateful: true
+}
 op {
   name: "BroadcastArgs"
   input_arg {
@@ -10745,6 +11585,29 @@ op {
     }
   }
 }
+op {
+  name: "Ceil"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
 op {
   name: "CheckNumerics"
   input_arg {
@@ -10798,6 +11661,33 @@ op {
     type: "string"
   }
 }
+op {
+  name: "CheckNumerics"
+  input_arg {
+    name: "tensor"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "message"
+    type: "string"
+  }
+}
 op {
   name: "Cholesky"
   input_arg {
@@ -10875,6 +11765,147 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "CollectiveBcastRecv"
+  output_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_HALF
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "group_size"
+    type: "int"
+  }
+  attr {
+    name: "group_key"
+    type: "int"
+  }
+  attr {
+    name: "instance_key"
+    type: "int"
+  }
+  attr {
+    name: "shape"
+    type: "shape"
+  }
+  is_stateful: true
+}
+op {
+  name: "CollectiveBcastSend"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_HALF
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "group_size"
+    type: "int"
+  }
+  attr {
+    name: "group_key"
+    type: "int"
+  }
+  attr {
+    name: "instance_key"
+    type: "int"
+  }
+  attr {
+    name: "shape"
+    type: "shape"
+  }
+  is_stateful: true
+}
+op {
+  name: "CollectiveReduce"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_HALF
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "group_size"
+    type: "int"
+  }
+  attr {
+    name: "group_key"
+    type: "int"
+  }
+  attr {
+    name: "instance_key"
+    type: "int"
+  }
+  attr {
+    name: "merge_op"
+    type: "string"
+    allowed_values {
+      list {
+        s: "Min"
+        s: "Max"
+        s: "Mul"
+        s: "Add"
+      }
+    }
+  }
+  attr {
+    name: "final_op"
+    type: "string"
+    allowed_values {
+      list {
+        s: "Id"
+        s: "Div"
+      }
+    }
+  }
+  attr {
+    name: "subdiv_offsets"
+    type: "list(int)"
+  }
+  is_stateful: true
+}
 op {
   name: "CompareAndBitpack"
   input_arg {
@@ -12982,6 +14013,31 @@ op {
     }
   }
 }
+op {
+  name: "Cos"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "Cosh"
   input_arg {
@@ -13031,6 +14087,31 @@ op {
     }
   }
 }
+op {
+  name: "Cosh"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "CountUpTo"
   input_arg {
@@ -15446,6 +16527,65 @@ op {
     }
   }
 }
+op {
+  name: "DecodeCSV"
+  input_arg {
+    name: "records"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "record_defaults"
+    type_list_attr: "OUT_TYPE"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "OUT_TYPE"
+  }
+  attr {
+    name: "OUT_TYPE"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "field_delim"
+    type: "string"
+    default_value {
+      s: ","
+    }
+  }
+  attr {
+    name: "use_quote_delim"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "na_value"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "select_cols"
+    type: "list(int)"
+    default_value {
+      list {
+      }
+    }
+  }
+}
 op {
   name: "DecodeCompressed"
   input_arg {
@@ -15570,6 +16710,55 @@ op {
     }
   }
 }
+op {
+  name: "DecodeProtoV2"
+  input_arg {
+    name: "bytes"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "sizes"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "values"
+    type_list_attr: "output_types"
+  }
+  attr {
+    name: "message_type"
+    type: "string"
+  }
+  attr {
+    name: "field_names"
+    type: "list(string)"
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "descriptor_source"
+    type: "string"
+    default_value {
+      s: "local://"
+    }
+  }
+  attr {
+    name: "message_format"
+    type: "string"
+    default_value {
+      s: "binary"
+    }
+  }
+  attr {
+    name: "sanitize"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
 op {
   name: "DecodeRaw"
   input_arg {
@@ -15668,6 +16857,22 @@ op {
     }
   }
 }
+op {
+  name: "DeepCopy"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  is_stateful: true
+}
 op {
   name: "DeleteSessionTensor"
   input_arg {
@@ -16233,6 +17438,76 @@ op {
     }
   }
 }
+op {
+  name: "DepthwiseConv2dNativeBackpropFilter"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "filter_sizes"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+}
 op {
   name: "DepthwiseConv2dNativeBackpropInput"
   input_arg {
@@ -16401,6 +17676,76 @@ op {
     }
   }
 }
+op {
+  name: "DepthwiseConv2dNativeBackpropInput"
+  input_arg {
+    name: "input_sizes"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "filter"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "out_backprop"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NHWC"
+    }
+    allowed_values {
+      list {
+        s: "NHWC"
+        s: "NCHW"
+      }
+    }
+  }
+  attr {
+    name: "dilations"
+    type: "list(int)"
+    default_value {
+      list {
+        i: 1
+        i: 1
+        i: 1
+        i: 1
+      }
+    }
+  }
+}
 op {
   name: "Dequantize"
   input_arg {
@@ -16718,6 +18063,58 @@ op {
     }
   }
 }
+op {
+  name: "Diag"
+  input_arg {
+    name: "diagonal"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "DiagPart"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "diagonal"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "DiagPart"
   input_arg {
@@ -16733,6 +18130,7 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -16759,6 +18157,7 @@ op {
     allowed_values {
       list {
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -16814,6 +18213,29 @@ op {
     }
   }
 }
+op {
+  name: "Digamma"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
 op {
   name: "Dilation2D"
   input_arg {
@@ -17587,6 +19009,41 @@ op {
     }
   }
 }
+op {
+  name: "Div"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "DrawBoundingBoxes"
   input_arg {
@@ -17882,6 +19339,29 @@ op {
     }
   }
 }
+op {
+  name: "Empty"
+  input_arg {
+    name: "shape"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  attr {
+    name: "init"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "EmptyTensorList"
   input_arg {
@@ -18043,6 +19523,42 @@ op {
     }
   }
 }
+op {
+  name: "EncodeProto"
+  input_arg {
+    name: "sizes"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "values"
+    type_list_attr: "Tinput_types"
+  }
+  output_arg {
+    name: "bytes"
+    type: DT_STRING
+  }
+  attr {
+    name: "field_names"
+    type: "list(string)"
+  }
+  attr {
+    name: "message_type"
+    type: "string"
+  }
+  attr {
+    name: "descriptor_source"
+    type: "string"
+    default_value {
+      s: "local://"
+    }
+  }
+  attr {
+    name: "Tinput_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+}
 op {
   name: "EncodeWav"
   input_arg {
@@ -18188,6 +19704,46 @@ op {
   }
   is_commutative: true
 }
+op {
+  name: "Equal"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type: DT_BOOL
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_QUINT8
+        type: DT_QINT8
+        type: DT_QINT32
+        type: DT_STRING
+        type: DT_BOOL
+        type: DT_COMPLEX128
+      }
+    }
+  }
+  is_commutative: true
+}
 op {
   name: "Erf"
   input_arg {
@@ -18233,6 +19789,29 @@ op {
     }
   }
 }
+op {
+  name: "Erf"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
 op {
   name: "Erfc"
   input_arg {
@@ -18278,6 +19857,29 @@ op {
     }
   }
 }
+op {
+  name: "Erfc"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
 op {
   name: "Exit"
   input_arg {
@@ -18342,6 +19944,31 @@ op {
     }
   }
 }
+op {
+  name: "Exp"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "ExpandDims"
   input_arg {
@@ -18423,6 +20050,31 @@ op {
     }
   }
 }
+op {
+  name: "Expm1"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "ExtractGlimpse"
   input_arg {
@@ -20203,6 +21855,29 @@ op {
     }
   }
 }
+op {
+  name: "Floor"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
 op {
   name: "FloorDiv"
   input_arg {
@@ -20272,6 +21947,41 @@ op {
     }
   }
 }
+op {
+  name: "FloorDiv"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "FloorMod"
   input_arg {
@@ -20327,6 +22037,35 @@ op {
     }
   }
 }
+op {
+  name: "FloorMod"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
 op {
   name: "FlushSummaryWriter"
   input_arg {
@@ -20335,6 +22074,38 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "For"
+  input_arg {
+    name: "start"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "limit"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "delta"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "input"
+    type_list_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "body"
+    type: "func"
+  }
+}
 op {
   name: "FractionalAvgPool"
   input_arg {
@@ -22418,6 +24189,45 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "If"
+  input_arg {
+    name: "cond"
+    type_attr: "Tcond"
+  }
+  input_arg {
+    name: "input"
+    type_list_attr: "Tin"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "Tout"
+  }
+  attr {
+    name: "Tcond"
+    type: "type"
+  }
+  attr {
+    name: "Tin"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "Tout"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "then_branch"
+    type: "func"
+  }
+  attr {
+    name: "else_branch"
+    type: "func"
+  }
+}
 op {
   name: "Igamma"
   input_arg {
@@ -22841,6 +24651,75 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "InplaceAdd"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "i"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "v"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "InplaceSub"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "i"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "v"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
+op {
+  name: "InplaceUpdate"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "i"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "v"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+}
 op {
   name: "InterleaveDataset"
   input_arg {
@@ -23073,6 +24952,33 @@ op {
     }
   }
 }
+op {
+  name: "Inv"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "InvGrad"
   input_arg {
@@ -23257,6 +25163,35 @@ op {
     }
   }
 }
+op {
+  name: "InvGrad"
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "dy"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "Invert"
   input_arg {
@@ -23333,6 +25268,18 @@ op {
     }
   }
 }
+op {
+  name: "IsBoostedTreesEnsembleInitialized"
+  input_arg {
+    name: "tree_ensemble_handle"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "is_initialized"
+    type: DT_BOOL
+  }
+  is_stateful: true
+}
 op {
   name: "IsFinite"
   input_arg {
@@ -23378,6 +25325,29 @@ op {
     }
   }
 }
+op {
+  name: "IsFinite"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type: DT_BOOL
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
 op {
   name: "IsInf"
   input_arg {
@@ -23423,6 +25393,29 @@ op {
     }
   }
 }
+op {
+  name: "IsInf"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type: DT_BOOL
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
 op {
   name: "IsNan"
   input_arg {
@@ -23468,6 +25461,29 @@ op {
     }
   }
 }
+op {
+  name: "IsNan"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type: DT_BOOL
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
 op {
   name: "IsVariableInitialized"
   input_arg {
@@ -24303,84 +26319,106 @@ op {
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
-        type: DT_INT64
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+}
+op {
+  name: "LessEqual"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type: DT_BOOL
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+}
+op {
+  name: "LessEqual"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type: DT_BOOL
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
         type: DT_UINT8
         type: DT_INT16
         type: DT_INT8
-        type: DT_UINT16
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
-      }
-    }
-  }
-}
-op {
-  name: "LessEqual"
-  input_arg {
-    name: "x"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "y"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "z"
-    type: DT_BOOL
-  }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
         type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
+        type: DT_BFLOAT16
         type: DT_UINT16
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
-        type: DT_BFLOAT16
       }
     }
   }
 }
 op {
-  name: "LessEqual"
+  name: "Lgamma"
   input_arg {
     name: "x"
     type_attr: "T"
   }
-  input_arg {
+  output_arg {
     name: "y"
     type_attr: "T"
   }
-  output_arg {
-    name: "z"
-    type: DT_BOOL
-  }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_INT64
-        type: DT_BFLOAT16
-        type: DT_UINT16
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
       }
     }
   }
@@ -24401,6 +26439,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -24422,8 +26461,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -24646,6 +26685,31 @@ op {
     }
   }
 }
+op {
+  name: "Log"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "Log1p"
   input_arg {
@@ -24695,6 +26759,31 @@ op {
     }
   }
 }
+op {
+  name: "Log1p"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "LogMatrixDeterminant"
   input_arg {
@@ -25710,6 +27799,50 @@ op {
     }
   }
 }
+op {
+  name: "MatMul"
+  input_arg {
+    name: "a"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "b"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "product"
+    type_attr: "T"
+  }
+  attr {
+    name: "transpose_a"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "transpose_b"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "MatchingFiles"
   input_arg {
@@ -26759,9 +28892,130 @@ op {
   }
 }
 op {
-  name: "MaxPool3D"
+  name: "MaxPool3D"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NDHWC"
+    }
+    allowed_values {
+      list {
+        s: "NDHWC"
+        s: "NCDHW"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+      }
+    }
+  }
+}
+op {
+  name: "MaxPool3D"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "ksize"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "strides"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 5
+  }
+  attr {
+    name: "padding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "SAME"
+        s: "VALID"
+      }
+    }
+  }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NDHWC"
+    }
+    allowed_values {
+      list {
+        s: "NDHWC"
+        s: "NCDHW"
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+      }
+    }
+  }
+}
+op {
+  name: "MaxPool3DGrad"
   input_arg {
-    name: "input"
+    name: "orig_input"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "orig_output"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "grad"
     type_attr: "T"
   }
   output_arg {
@@ -26790,25 +29044,11 @@ op {
       }
     }
   }
-  attr {
-    name: "data_format"
-    type: "string"
-    default_value {
-      s: "NDHWC"
-    }
-    allowed_values {
-      list {
-        s: "NDHWC"
-        s: "NCDHW"
-      }
-    }
-  }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_BFLOAT16
         type: DT_FLOAT
       }
     }
@@ -26854,6 +29094,19 @@ op {
       }
     }
   }
+  attr {
+    name: "data_format"
+    type: "string"
+    default_value {
+      s: "NDHWC"
+    }
+    allowed_values {
+      list {
+        s: "NDHWC"
+        s: "NCDHW"
+      }
+    }
+  }
   attr {
     name: "T"
     type: "type"
@@ -26868,11 +29121,11 @@ op {
   name: "MaxPool3DGrad"
   input_arg {
     name: "orig_input"
-    type: DT_FLOAT
+    type_attr: "TInput"
   }
   input_arg {
     name: "orig_output"
-    type: DT_FLOAT
+    type_attr: "TInput"
   }
   input_arg {
     name: "grad"
@@ -26920,6 +29173,21 @@ op {
   attr {
     name: "T"
     type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+      }
+    }
+  }
+  attr {
+    name: "TInput"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
     allowed_values {
       list {
         type: DT_FLOAT
@@ -26988,6 +29256,7 @@ op {
     }
     allowed_values {
       list {
+        type: DT_BFLOAT16
         type: DT_FLOAT
       }
     }
@@ -27000,6 +29269,7 @@ op {
     }
     allowed_values {
       list {
+        type: DT_BFLOAT16
         type: DT_FLOAT
       }
     }
@@ -27066,6 +29336,7 @@ op {
     }
     allowed_values {
       list {
+        type: DT_HALF
         type: DT_BFLOAT16
         type: DT_FLOAT
       }
@@ -27079,6 +29350,7 @@ op {
     }
     allowed_values {
       list {
+        type: DT_HALF
         type: DT_BFLOAT16
         type: DT_FLOAT
       }
@@ -29588,6 +31860,36 @@ op {
   }
   is_commutative: true
 }
+op {
+  name: "Maximum"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  is_commutative: true
+}
 op {
   name: "Mean"
   input_arg {
@@ -29976,137 +32278,197 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "Min"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "reduction_indices"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "keep_dims"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "Min"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "reduction_indices"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "keep_dims"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "Min"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "reduction_indices"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "keep_dims"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
-      }
-    }
-  }
-  attr {
-    name: "Tidx"
-    type: "type"
-    default_value {
-      type: DT_INT32
-    }
-    allowed_values {
-      list {
-        type: DT_INT32
         type: DT_INT64
-      }
-    }
-  }
-}
-op {
-  name: "Min"
-  input_arg {
-    name: "input"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "reduction_indices"
-    type_attr: "Tidx"
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
-  }
-  attr {
-    name: "keep_dims"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
-      }
-    }
-  }
-  attr {
-    name: "Tidx"
-    type: "type"
-    default_value {
-      type: DT_INT32
-    }
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
-  }
-}
-op {
-  name: "Min"
-  input_arg {
-    name: "input"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "reduction_indices"
-    type_attr: "Tidx"
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
-  }
-  attr {
-    name: "keep_dims"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
+        type: DT_BFLOAT16
         type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
         type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
-        type: DT_BFLOAT16
       }
     }
   }
@@ -30125,64 +32487,33 @@ op {
   }
 }
 op {
-  name: "Min"
+  name: "Minimum"
   input_arg {
-    name: "input"
+    name: "x"
     type_attr: "T"
   }
   input_arg {
-    name: "reduction_indices"
-    type_attr: "Tidx"
+    name: "y"
+    type_attr: "T"
   }
   output_arg {
-    name: "output"
+    name: "z"
     type_attr: "T"
   }
-  attr {
-    name: "keep_dims"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_INT64
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_BFLOAT16
-        type: DT_UINT16
-        type: DT_COMPLEX128
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
-      }
-    }
-  }
-  attr {
-    name: "Tidx"
-    type: "type"
-    default_value {
-      type: DT_INT32
-    }
-    allowed_values {
-      list {
         type: DT_INT32
         type: DT_INT64
       }
     }
   }
+  is_commutative: true
 }
 op {
   name: "Minimum"
@@ -30204,6 +32535,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -30232,8 +32564,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -30382,6 +32714,36 @@ op {
     }
   }
 }
+op {
+  name: "Mod"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_HALF
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
 op {
   name: "Mul"
   input_arg {
@@ -30453,6 +32815,42 @@ op {
   }
   is_commutative: true
 }
+op {
+  name: "Mul"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+  is_commutative: true
+}
 op {
   name: "Multinomial"
   input_arg {
@@ -31055,6 +33453,33 @@ op {
     }
   }
 }
+op {
+  name: "Neg"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "NegTrain"
   input_arg {
@@ -31238,6 +33663,46 @@ op {
   }
   is_commutative: true
 }
+op {
+  name: "NotEqual"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type: DT_BOOL
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_QUINT8
+        type: DT_QINT8
+        type: DT_QINT32
+        type: DT_STRING
+        type: DT_BOOL
+        type: DT_COMPLEX128
+      }
+    }
+  }
+  is_commutative: true
+}
 op {
   name: "NthElement"
   input_arg {
@@ -31505,6 +33970,38 @@ op {
     }
   }
 }
+op {
+  name: "OnesLike"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT8
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_UINT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_BOOL
+      }
+    }
+  }
+}
 op {
   name: "OrderedMapClear"
   attr {
@@ -33015,6 +35512,37 @@ op {
     }
   }
 }
+op {
+  name: "Pow"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_HALF
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "PrefetchDataset"
   input_arg {
@@ -33869,6 +36397,117 @@ op {
     version: 22
   }
 }
+op {
+  name: "QuantizeAndDequantize"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "signed_input"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "num_bits"
+    type: "int"
+    default_value {
+      i: 8
+    }
+  }
+  attr {
+    name: "range_given"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "input_min"
+    type: "float"
+    default_value {
+      f: 0
+    }
+  }
+  attr {
+    name: "input_max"
+    type: "float"
+    default_value {
+      f: 0
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  deprecation {
+    version: 22
+  }
+}
+op {
+  name: "QuantizeAndDequantizeV2"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_min"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_max"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "signed_input"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "num_bits"
+    type: "int"
+    default_value {
+      i: 8
+    }
+  }
+  attr {
+    name: "range_given"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
 op {
   name: "QuantizeAndDequantizeV2"
   input_arg {
@@ -33913,6 +36552,7 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -33964,6 +36604,54 @@ op {
     allowed_values {
       list {
         type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "QuantizeAndDequantizeV3"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_min"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_max"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "num_bits"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "signed_input"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "range_given"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -34011,6 +36699,7 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -34059,6 +36748,7 @@ op {
     allowed_values {
       list {
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -38111,6 +40801,41 @@ op {
     }
   }
 }
+op {
+  name: "RealDiv"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "Reciprocal"
   input_arg {
@@ -38164,6 +40889,33 @@ op {
     }
   }
 }
+op {
+  name: "Reciprocal"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "ReciprocalGrad"
   input_arg {
@@ -38249,6 +41001,35 @@ op {
     }
   }
 }
+op {
+  name: "ReciprocalGrad"
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "dy"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "RecordInput"
   output_arg {
@@ -47613,44 +50394,236 @@ op {
   }
 }
 op {
-  name: "ReverseSequence"
+  name: "ReverseSequence"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "seq_lengths"
+    type_attr: "Tlen"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "seq_dim"
+    type: "int"
+  }
+  attr {
+    name: "batch_dim"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "Tlen"
+    type: "type"
+    default_value {
+      type: DT_INT64
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "ReverseV2"
+  input_arg {
+    name: "tensor"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "axis"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_BOOL
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
+op {
+  name: "ReverseV2"
+  input_arg {
+    name: "tensor"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "axis"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_BOOL
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_STRING
+      }
+    }
+  }
+}
+op {
+  name: "ReverseV2"
+  input_arg {
+    name: "tensor"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "axis"
+    type_attr: "Tidx"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "Tidx"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_BOOL
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_STRING
+      }
+    }
+  }
+}
+op {
+  name: "ReverseV2"
   input_arg {
-    name: "input"
+    name: "tensor"
     type_attr: "T"
   }
   input_arg {
-    name: "seq_lengths"
-    type_attr: "Tlen"
+    name: "axis"
+    type_attr: "Tidx"
   }
   output_arg {
     name: "output"
     type_attr: "T"
   }
   attr {
-    name: "seq_dim"
-    type: "int"
-  }
-  attr {
-    name: "batch_dim"
-    type: "int"
+    name: "Tidx"
+    type: "type"
     default_value {
-      i: 0
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
     }
   }
   attr {
     name: "T"
     type: "type"
-  }
-  attr {
-    name: "Tlen"
-    type: "type"
-    default_value {
-      type: DT_INT64
-    }
     allowed_values {
       list {
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_INT16
         type: DT_INT32
         type: DT_INT64
+        type: DT_BOOL
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_STRING
       }
     }
   }
@@ -47689,134 +50662,145 @@ op {
       list {
         type: DT_UINT8
         type: DT_INT8
+        type: DT_UINT16
+        type: DT_INT16
         type: DT_INT32
         type: DT_INT64
         type: DT_BOOL
+        type: DT_BFLOAT16
         type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
         type: DT_COMPLEX128
+        type: DT_STRING
       }
     }
   }
 }
 op {
-  name: "ReverseV2"
+  name: "RightShift"
   input_arg {
-    name: "tensor"
+    name: "x"
     type_attr: "T"
   }
   input_arg {
-    name: "axis"
-    type_attr: "Tidx"
+    name: "y"
+    type_attr: "T"
   }
   output_arg {
-    name: "output"
+    name: "z"
     type_attr: "T"
   }
   attr {
-    name: "Tidx"
+    name: "T"
     type: "type"
-    default_value {
-      type: DT_INT32
-    }
     allowed_values {
       list {
+        type: DT_INT8
+        type: DT_INT16
         type: DT_INT32
         type: DT_INT64
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
+  is_commutative: true
+}
+op {
+  name: "Rint"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_UINT8
-        type: DT_INT8
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_BOOL
-        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_STRING
       }
     }
   }
 }
 op {
-  name: "ReverseV2"
+  name: "Rint"
   input_arg {
-    name: "tensor"
+    name: "x"
     type_attr: "T"
   }
-  input_arg {
-    name: "axis"
-    type_attr: "Tidx"
-  }
   output_arg {
-    name: "output"
+    name: "y"
     type_attr: "T"
   }
   attr {
-    name: "Tidx"
+    name: "T"
     type: "type"
-    default_value {
-      type: DT_INT32
-    }
     allowed_values {
       list {
-        type: DT_INT32
-        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
       }
     }
   }
+}
+op {
+  name: "Rint"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
-        type: DT_UINT8
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT32
-        type: DT_INT64
-        type: DT_BOOL
+        type: DT_BFLOAT16
         type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_STRING
       }
     }
   }
 }
 op {
-  name: "ReverseV2"
+  name: "Roll"
   input_arg {
-    name: "tensor"
+    name: "input"
     type_attr: "T"
   }
+  input_arg {
+    name: "shift"
+    type_attr: "Tshift"
+  }
   input_arg {
     name: "axis"
-    type_attr: "Tidx"
+    type_attr: "Taxis"
   }
   output_arg {
     name: "output"
     type_attr: "T"
   }
   attr {
-    name: "Tidx"
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "Tshift"
     type: "type"
-    default_value {
-      type: DT_INT32
-    }
     allowed_values {
       list {
         type: DT_INT32
@@ -47825,40 +50809,24 @@ op {
     }
   }
   attr {
-    name: "T"
+    name: "Taxis"
     type: "type"
     allowed_values {
       list {
-        type: DT_UINT8
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_INT16
         type: DT_INT32
         type: DT_INT64
-        type: DT_BOOL
-        type: DT_HALF
-        type: DT_BFLOAT16
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_STRING
       }
     }
   }
 }
 op {
-  name: "RightShift"
+  name: "Round"
   input_arg {
     name: "x"
     type_attr: "T"
   }
-  input_arg {
-    name: "y"
-    type_attr: "T"
-  }
   output_arg {
-    name: "z"
+    name: "y"
     type_attr: "T"
   }
   attr {
@@ -47866,21 +50834,19 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_INT8
-        type: DT_INT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
         type: DT_INT32
         type: DT_INT64
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_UINT32
-        type: DT_UINT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
-  is_commutative: true
 }
 op {
-  name: "Rint"
+  name: "Round"
   input_arg {
     name: "x"
     type_attr: "T"
@@ -47894,14 +50860,20 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
 }
 op {
-  name: "Rint"
+  name: "Round"
   input_arg {
     name: "x"
     type_attr: "T"
@@ -47916,57 +50888,60 @@ op {
     allowed_values {
       list {
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
 }
 op {
-  name: "Roll"
+  name: "Rpc"
   input_arg {
-    name: "input"
-    type_attr: "T"
+    name: "address"
+    type: DT_STRING
   }
   input_arg {
-    name: "shift"
-    type_attr: "Tshift"
+    name: "method"
+    type: DT_STRING
   }
   input_arg {
-    name: "axis"
-    type_attr: "Taxis"
+    name: "request"
+    type: DT_STRING
   }
   output_arg {
-    name: "output"
-    type_attr: "T"
+    name: "response"
+    type: DT_STRING
   }
   attr {
-    name: "T"
-    type: "type"
+    name: "protocol"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
   attr {
-    name: "Tshift"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
+    name: "fail_fast"
+    type: "bool"
+    default_value {
+      b: true
     }
   }
   attr {
-    name: "Taxis"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
+    name: "timeout_in_ms"
+    type: "int"
+    default_value {
+      i: 0
     }
   }
+  is_stateful: true
 }
 op {
-  name: "Round"
+  name: "Rsqrt"
   input_arg {
     name: "x"
     type_attr: "T"
@@ -47983,8 +50958,6 @@ op {
         type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
         type: DT_COMPLEX64
         type: DT_COMPLEX128
       }
@@ -47992,7 +50965,7 @@ op {
   }
 }
 op {
-  name: "Round"
+  name: "Rsqrt"
   input_arg {
     name: "x"
     type_attr: "T"
@@ -48010,8 +50983,6 @@ op {
         type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_INT64
         type: DT_COMPLEX64
         type: DT_COMPLEX128
       }
@@ -48033,6 +51004,7 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_BFLOAT16
         type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
@@ -48043,22 +51015,25 @@ op {
   }
 }
 op {
-  name: "Rsqrt"
+  name: "RsqrtGrad"
   input_arg {
     name: "x"
     type_attr: "T"
   }
-  output_arg {
+  input_arg {
     name: "y"
     type_attr: "T"
   }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
         type: DT_HALF
-        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -48070,11 +51045,11 @@ op {
 op {
   name: "RsqrtGrad"
   input_arg {
-    name: "x"
+    name: "y"
     type_attr: "T"
   }
   input_arg {
-    name: "y"
+    name: "dy"
     type_attr: "T"
   }
   output_arg {
@@ -48115,6 +51090,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -48142,8 +51118,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -52348,6 +55324,31 @@ op {
     }
   }
 }
+op {
+  name: "Sigmoid"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "SigmoidGrad"
   input_arg {
@@ -52433,6 +55434,35 @@ op {
     }
   }
 }
+op {
+  name: "SigmoidGrad"
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "dy"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "Sign"
   input_arg {
@@ -52486,6 +55516,33 @@ op {
     }
   }
 }
+op {
+  name: "Sign"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "Sin"
   input_arg {
@@ -52535,6 +55592,31 @@ op {
     }
   }
 }
+op {
+  name: "Sin"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "Sinh"
   input_arg {
@@ -52584,6 +55666,31 @@ op {
     }
   }
 }
+op {
+  name: "Sinh"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "Size"
   input_arg {
@@ -61742,6 +64849,31 @@ op {
     }
   }
 }
+op {
+  name: "Sqrt"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "SqrtGrad"
   input_arg {
@@ -61827,6 +64959,35 @@ op {
     }
   }
 }
+op {
+  name: "SqrtGrad"
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "dy"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "Square"
   input_arg {
@@ -61880,6 +65041,33 @@ op {
     }
   }
 }
+op {
+  name: "Square"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "SquaredDifference"
   input_arg {
@@ -61943,6 +65131,38 @@ op {
   }
   is_commutative: true
 }
+op {
+  name: "SquaredDifference"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+  is_commutative: true
+}
 op {
   name: "Squeeze"
   input_arg {
@@ -63036,27 +66256,57 @@ op {
   }
 }
 op {
-  name: "StringToNumber"
+  name: "StringToNumber"
+  input_arg {
+    name: "string_tensor"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "output"
+    type_attr: "out_type"
+  }
+  attr {
+    name: "out_type"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "Sub"
   input_arg {
-    name: "string_tensor"
-    type: DT_STRING
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "y"
+    type_attr: "T"
   }
   output_arg {
-    name: "output"
-    type_attr: "out_type"
+    name: "z"
+    type_attr: "T"
   }
   attr {
-    name: "out_type"
+    name: "T"
     type: "type"
-    default_value {
-      type: DT_FLOAT
-    }
     allowed_values {
       list {
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
         type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
@@ -63083,6 +66333,10 @@ op {
         type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_INT16
         type: DT_INT32
         type: DT_INT64
         type: DT_COMPLEX64
@@ -63111,6 +66365,7 @@ op {
     allowed_values {
       list {
         type: DT_HALF
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_UINT8
@@ -63144,8 +66399,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_UINT8
@@ -63800,6 +67055,33 @@ op {
     }
   }
 }
+op {
+  name: "Tan"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "Tanh"
   input_arg {
@@ -63849,6 +67131,31 @@ op {
     }
   }
 }
+op {
+  name: "Tanh"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "TanhGrad"
   input_arg {
@@ -63934,6 +67241,35 @@ op {
     }
   }
 }
+op {
+  name: "TanhGrad"
+  input_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "dy"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "z"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
 op {
   name: "TemporaryVariable"
   output_arg {
@@ -65768,57 +69104,157 @@ op {
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
-        type: DT_INT64
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+      }
+    }
+  }
+  deprecation {
+    version: 7
+  }
+}
+op {
+  name: "TopK"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "values"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "indices"
+    type: DT_INT32
+  }
+  attr {
+    name: "k"
+    type: "int"
+    has_minimum: true
+  }
+  attr {
+    name: "sorted"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  deprecation {
+    version: 7
+  }
+}
+op {
+  name: "TopK"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "values"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "indices"
+    type: DT_INT32
+  }
+  attr {
+    name: "k"
+    type: "int"
+    has_minimum: true
+  }
+  attr {
+    name: "sorted"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  deprecation {
+    version: 7
+  }
+}
+op {
+  name: "TopK"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "values"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "indices"
+    type: DT_INT32
+  }
+  attr {
+    name: "k"
+    type: "int"
+    has_minimum: true
+  }
+  attr {
+    name: "sorted"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
         type: DT_UINT8
         type: DT_INT16
         type: DT_INT8
-        type: DT_UINT16
-        type: DT_HALF
-      }
-    }
-  }
-  deprecation {
-    version: 7
-  }
-}
-op {
-  name: "TopK"
-  input_arg {
-    name: "input"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "values"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "indices"
-    type: DT_INT32
-  }
-  attr {
-    name: "k"
-    type: "int"
-    has_minimum: true
-  }
-  attr {
-    name: "sorted"
-    type: "bool"
-    default_value {
-      b: true
-    }
-  }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
         type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
+        type: DT_BFLOAT16
         type: DT_UINT16
         type: DT_HALF
         type: DT_UINT32
@@ -65831,11 +69267,15 @@ op {
   }
 }
 op {
-  name: "TopK"
+  name: "TopKV2"
   input_arg {
     name: "input"
     type_attr: "T"
   }
+  input_arg {
+    name: "k"
+    type: DT_INT32
+  }
   output_arg {
     name: "values"
     type_attr: "T"
@@ -65844,11 +69284,6 @@ op {
     name: "indices"
     type: DT_INT32
   }
-  attr {
-    name: "k"
-    type: "int"
-    has_minimum: true
-  }
   attr {
     name: "sorted"
     type: "bool"
@@ -65870,22 +69305,20 @@ op {
         type: DT_INT8
         type: DT_UINT16
         type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
-        type: DT_BFLOAT16
       }
     }
   }
-  deprecation {
-    version: 7
-  }
 }
 op {
-  name: "TopK"
+  name: "TopKV2"
   input_arg {
     name: "input"
     type_attr: "T"
   }
+  input_arg {
+    name: "k"
+    type: DT_INT32
+  }
   output_arg {
     name: "values"
     type_attr: "T"
@@ -65894,11 +69327,6 @@ op {
     name: "indices"
     type: DT_INT32
   }
-  attr {
-    name: "k"
-    type: "int"
-    has_minimum: true
-  }
   attr {
     name: "sorted"
     type: "bool"
@@ -65914,11 +69342,10 @@ op {
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
+        type: DT_INT64
         type: DT_UINT8
         type: DT_INT16
         type: DT_INT8
-        type: DT_INT64
-        type: DT_BFLOAT16
         type: DT_UINT16
         type: DT_HALF
         type: DT_UINT32
@@ -65926,9 +69353,6 @@ op {
       }
     }
   }
-  deprecation {
-    version: 7
-  }
 }
 op {
   name: "TopKV2"
@@ -65969,6 +69393,9 @@ op {
         type: DT_INT8
         type: DT_UINT16
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -66006,10 +69433,11 @@ op {
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
-        type: DT_INT64
         type: DT_UINT8
         type: DT_INT16
         type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
         type: DT_UINT16
         type: DT_HALF
         type: DT_UINT32
@@ -66019,125 +69447,102 @@ op {
   }
 }
 op {
-  name: "TopKV2"
+  name: "Transpose"
   input_arg {
-    name: "input"
+    name: "x"
     type_attr: "T"
   }
   input_arg {
-    name: "k"
-    type: DT_INT32
+    name: "perm"
+    type_attr: "Tperm"
   }
   output_arg {
-    name: "values"
+    name: "y"
     type_attr: "T"
   }
-  output_arg {
-    name: "indices"
-    type: DT_INT32
-  }
   attr {
-    name: "sorted"
-    type: "bool"
-    default_value {
-      b: true
-    }
+    name: "T"
+    type: "type"
   }
   attr {
-    name: "T"
+    name: "Tperm"
     type: "type"
+    default_value {
+      type: DT_INT32
+    }
     allowed_values {
       list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
         type: DT_INT32
         type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
-        type: DT_BFLOAT16
       }
     }
   }
 }
 op {
-  name: "TopKV2"
+  name: "TruncateDiv"
   input_arg {
-    name: "input"
+    name: "x"
     type_attr: "T"
   }
   input_arg {
-    name: "k"
-    type: DT_INT32
-  }
-  output_arg {
-    name: "values"
+    name: "y"
     type_attr: "T"
   }
   output_arg {
-    name: "indices"
-    type: DT_INT32
-  }
-  attr {
-    name: "sorted"
-    type: "bool"
-    default_value {
-      b: true
-    }
+    name: "z"
+    type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT32
         type: DT_UINT8
-        type: DT_INT16
         type: DT_INT8
-        type: DT_INT64
-        type: DT_BFLOAT16
         type: DT_UINT16
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
+        type: DT_INT16
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
 }
 op {
-  name: "Transpose"
+  name: "TruncateDiv"
   input_arg {
     name: "x"
     type_attr: "T"
   }
   input_arg {
-    name: "perm"
-    type_attr: "Tperm"
+    name: "y"
+    type_attr: "T"
   }
   output_arg {
-    name: "y"
+    name: "z"
     type_attr: "T"
   }
   attr {
     name: "T"
     type: "type"
-  }
-  attr {
-    name: "Tperm"
-    type: "type"
-    default_value {
-      type: DT_INT32
-    }
     allowed_values {
       list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_UINT8
+        type: DT_INT8
+        type: DT_UINT16
+        type: DT_INT16
         type: DT_INT32
         type: DT_INT64
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
       }
     }
   }
@@ -66161,6 +69566,7 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_BFLOAT16
         type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
@@ -66177,7 +69583,7 @@ op {
   }
 }
 op {
-  name: "TruncateDiv"
+  name: "TruncateMod"
   input_arg {
     name: "x"
     type_attr: "T"
@@ -66195,18 +69601,10 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
-        type: DT_BFLOAT16
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_UINT8
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_INT16
         type: DT_INT32
         type: DT_INT64
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_FLOAT
+        type: DT_DOUBLE
       }
     }
   }
@@ -66232,6 +69630,7 @@ op {
       list {
         type: DT_INT32
         type: DT_INT64
+        type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -66260,6 +69659,7 @@ op {
         type: DT_INT32
         type: DT_INT64
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -66361,6 +69761,55 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "TryRpc"
+  input_arg {
+    name: "address"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "method"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "request"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "response"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "status_code"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "status_message"
+    type: DT_STRING
+  }
+  attr {
+    name: "protocol"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "fail_fast"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "timeout_in_ms"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "Unbatch"
   input_arg {
@@ -67727,6 +71176,31 @@ op {
     }
   }
 }
+op {
+  name: "While"
+  input_arg {
+    name: "input"
+    type_list_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "cond"
+    type: "func"
+  }
+  attr {
+    name: "body"
+    type: "func"
+  }
+  is_stateful: true
+}
 op {
   name: "WholeFileReader"
   output_arg {
diff --git a/tensorflow/core/ops/dataset_ops.cc b/tensorflow/core/ops/dataset_ops.cc
index e2453b9712f3a1141343f694f0e2c798aa39ed4a..b25abbcc6780364c27c1a078b0a8980014c83a43 100644
--- a/tensorflow/core/ops/dataset_ops.cc
+++ b/tensorflow/core/ops/dataset_ops.cc
@@ -105,8 +105,11 @@ REGISTER_OP("RepeatDataset")
     .Output("handle: variant")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetShapeFn(shape_inference::ScalarShape);  // TODO(mrry): Validate the
-                                                // shape of `count`.
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle count_shape;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &count_shape));
+      return shape_inference::ScalarShape(c);
+    });
 
 REGISTER_OP("TakeDataset")
     .Input("input_dataset: variant")
@@ -114,7 +117,11 @@ REGISTER_OP("TakeDataset")
     .Output("handle: variant")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetShapeFn(shape_inference::ScalarShape);
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle count_shape;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &count_shape));
+      return shape_inference::ScalarShape(c);
+    });
 
 REGISTER_OP("SkipDataset")
     .Input("input_dataset: variant")
@@ -122,7 +129,11 @@ REGISTER_OP("SkipDataset")
     .Output("handle: variant")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetShapeFn(shape_inference::ScalarShape);
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle count_shape;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &count_shape));
+      return shape_inference::ScalarShape(c);
+    });
 
 REGISTER_OP("BytesProducedStatsDataset")
     .Input("input_dataset: variant")
diff --git a/tensorflow/core/ops/decode_proto_ops.cc b/tensorflow/core/ops/decode_proto_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3f6fb2f58234c564960ad378867a6af27d1b5d2e
--- /dev/null
+++ b/tensorflow/core/ops/decode_proto_ops.cc
@@ -0,0 +1,67 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/shape_inference.h"
+
+namespace tensorflow {
+
+using tensorflow::shape_inference::InferenceContext;
+using tensorflow::shape_inference::ShapeHandle;
+
+REGISTER_OP("DecodeProtoV2")
+    .Input("bytes: string")
+    .Attr("message_type: string")
+    .Attr("field_names: list(string)")
+    .Attr("output_types: list(type) >= 0")
+    .Attr("descriptor_source: string = 'local://'")
+    .Attr("message_format: string = 'binary'")
+    .Attr("sanitize: bool = false")
+    .Output("sizes: int32")
+    .Output("values: output_types")
+    .SetShapeFn([](InferenceContext* c) {
+      ShapeHandle input = c->input(0);
+
+      std::vector<tensorflow::DataType> output_types;
+      TF_RETURN_IF_ERROR(c->GetAttr("output_types", &output_types));
+
+      ShapeHandle sizes;
+      TF_RETURN_IF_ERROR(
+          c->Concatenate(input, c->Vector(output_types.size()), &sizes));
+      c->set_output(0, sizes);
+
+      // TODO(nix): to do the best possible job of shape inference, we
+      // should examine the proto descriptors here in order to set shape
+      // indices to 1 instead of unknown for optional or required fields.
+      // Any general-purpose code will have to handle the unknown case,
+      // but there might be XLA code that could be sped up with the additional
+      // knowledge.
+      for (int i = 0; i < output_types.size(); ++i) {
+        ShapeHandle values;
+        TF_RETURN_IF_ERROR(
+            c->Concatenate(input, c->Vector(c->UnknownDim()), &values));
+        c->set_output(i + 1, values);
+      }
+
+      return Status::OK();
+    });
+
+// TODO(nix): Consider adding an additional input argument that truncates
+// repeated fields to a maximum count. For now this could be done by passing
+// the output through tf.slice.
+
+// TODO(nix): define missing value behavior.
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/ops/encode_proto_ops.cc b/tensorflow/core/ops/encode_proto_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f5ec3056e35c8be82302f6cb32174661c9979225
--- /dev/null
+++ b/tensorflow/core/ops/encode_proto_ops.cc
@@ -0,0 +1,49 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/shape_inference.h"
+
+namespace tensorflow {
+
+using tensorflow::shape_inference::InferenceContext;
+using tensorflow::shape_inference::ShapeHandle;
+
+REGISTER_OP("EncodeProto")
+    .Input("sizes: int32")
+    .Input("values: Tinput_types")
+    .Attr("field_names: list(string)")
+    .Attr("message_type: string")
+    .Attr("descriptor_source: string = 'local://'")
+    .Attr("Tinput_types: list(type)")
+    .Output("bytes: string")
+    .SetShapeFn([](InferenceContext* c) {
+      int first_field_index = 1;
+      int num_fields = c->num_inputs() - 1;
+
+      ShapeHandle output;
+      for (int i = num_fields - 1; i >= 0; --i) {
+        ShapeHandle input = c->input(first_field_index + i);
+        TF_RETURN_IF_ERROR(c->WithRankAtLeast(input, 2, &input));
+        ShapeHandle inner;
+        TF_RETURN_IF_ERROR(c->Subshape(input, 0, -1, &inner));
+        TF_RETURN_IF_ERROR(c->Merge(inner, output, &output));
+      }
+
+      c->set_output(0, output);
+      return Status::OK();
+    });
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/ops/functional_ops.cc b/tensorflow/core/ops/functional_ops.cc
index 4b21fac80aea76555959e8a202a73ccc833d0306..792686cae1f599c17cbea05aa5de61f1e592996c 100644
--- a/tensorflow/core/ops/functional_ops.cc
+++ b/tensorflow/core/ops/functional_ops.cc
@@ -50,6 +50,7 @@ REGISTER_OP("RemoteCall")
     .SetIsStateful()
     .SetShapeFn(shape_inference::UnknownShape);
 
+// TODO(drpng): remove this.
 REGISTER_OP("_If")
     .Input("cond: Tcond")
     .Input("input: Tin")
@@ -76,8 +77,18 @@ else_branch: A function that takes 'inputs' and returns a list of
     tensors.  whose types are the same as what then_branch returns.
 )doc");
 
-// TODO(b/37549631) setting the While Op to always be stateful is too
-// conservative.
+REGISTER_OP("If")
+    .Input("cond: Tcond")
+    .Input("input: Tin")
+    .Output("output: Tout")
+    .Attr("Tcond: type")
+    .Attr("Tin: list(type)")
+    .Attr("Tout: list(type)")
+    .Attr("then_branch: func")
+    .Attr("else_branch: func")
+    .SetShapeFn(shape_inference::UnknownShape);
+
+// TODO(drpng): remove this.
 REGISTER_OP("_While")
     .Input("input: T")
     .Output("output: T")
@@ -108,4 +119,30 @@ body: A function that takes a list of tensors and returns another
       by T.
 )doc");
 
+// TODO(b/37549631) setting the While Op to always be stateful is too
+// conservative.
+REGISTER_OP("While")
+    .Input("input: T")
+    .Output("output: T")
+    .Attr("T: list(type) >= 0")
+    .Attr("cond: func")
+    .Attr("body: func")
+    .SetIsStateful()
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      for (int i = 0; i < c->num_outputs(); ++i) {
+        c->set_output(i, c->input(i));
+      }
+      return Status::OK();
+    });
+
+REGISTER_OP("For")
+    .Input("start: int32")
+    .Input("limit: int32")
+    .Input("delta: int32")
+    .Input("input: T")
+    .Output("output: T")
+    .Attr("T: list(type) >= 0")
+    .Attr("body: func")
+    .SetShapeFn(shape_inference::UnknownShape);
+
 }  // end namespace tensorflow
diff --git a/tensorflow/core/ops/list_ops.cc b/tensorflow/core/ops/list_ops.cc
index cad617638ff12cd1020276341fbe9f9b7aac97bc..c151055ee68af14d9c1ea912557fc941047dc516 100644
--- a/tensorflow/core/ops/list_ops.cc
+++ b/tensorflow/core/ops/list_ops.cc
@@ -30,7 +30,8 @@ REGISTER_OP("EmptyTensorList")
       DataType t;
       TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &t));
       shape_inference::ShapeHandle s;
-      TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(0, &s));
+      TF_RETURN_IF_ERROR(
+          c->MakeShapeFromShapeTensorTreatScalarAsUnknownShape(0, &s));
       c->set_output_handle_shapes_and_types(
           0, std::vector<shape_inference::ShapeAndType>{{s, t}});
       return Status::OK();
@@ -193,6 +194,7 @@ REGISTER_OP("TensorListReserve")
     .Attr("element_dtype: type")
     .Attr("shape_type: {int32, int64}")
     .SetShapeFn([](shape_inference::InferenceContext* c) {
+      c->set_output(0, c->Scalar());
       shape_inference::ShapeHandle s;
       TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(0, &s));
       DataType t;
diff --git a/tensorflow/core/ops/math_grad_test.cc b/tensorflow/core/ops/math_grad_test.cc
index 8dcd3e815f3c19b41b1ef02a23e1f5ce36697a23..da38a6bc2497aca1623faed40c41386a4daff113 100644
--- a/tensorflow/core/ops/math_grad_test.cc
+++ b/tensorflow/core/ops/math_grad_test.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include "tensorflow/core/framework/function_testlib.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/public/session.h"
 
@@ -362,7 +363,7 @@ class MathGradTest : public ::testing::Test {
 };
 
 void HasError(const Status& s, const string& substr) {
-  EXPECT_TRUE(StringPiece(s.ToString()).contains(substr))
+  EXPECT_TRUE(str_util::StrContains(s.ToString(), substr))
       << s << ", expected substring " << substr;
 }
 
diff --git a/tensorflow/core/ops/math_ops.cc b/tensorflow/core/ops/math_ops.cc
index 8f33d51d5a20fc207102e4bf79e7605d9817eb9f..8f8443a46cfa68e9879825d36b305b4f7774bd66 100644
--- a/tensorflow/core/ops/math_ops.cc
+++ b/tensorflow/core/ops/math_ops.cc
@@ -65,7 +65,7 @@ REGISTER_OP("BatchMatMul")
     .Input("x: T")
     .Input("y: T")
     .Output("output: T")
-    .Attr("T: {half, bfloat16, float, double, int32, complex64, complex128}")
+    .Attr("T: {bfloat16, half, float, double, int32, complex64, complex128}")
     .Attr("adj_x: bool = false")
     .Attr("adj_y: bool = false")
     .SetShapeFn([](InferenceContext* c) {
@@ -133,7 +133,7 @@ _HostCast requires its input and produces its output in host memory.
 REGISTER_OP("Abs")
     .Input("x: T")
     .Output("y: T")
-    .Attr("T: {half, bfloat16, float, double, int32, int64}")
+    .Attr("T: {bfloat16, half, float, double, int32, int64}")
     .SetShapeFn(shape_inference::UnchangedShape);
 
 REGISTER_OP("ComplexAbs")
@@ -148,27 +148,27 @@ REGISTER_OP("ComplexAbs")
   Input("x: T")                                                          \
       .Output("y: T")                                                    \
       .Attr(                                                             \
-          "T: {half, bfloat16, float, double, int32, int64, complex64, " \
+          "T: {bfloat16, half, float, double, int32, int64, complex64, " \
           "complex128}")                                                 \
       .SetShapeFn(shape_inference::UnchangedShape)
 
 #define UNARY_REAL()                              \
   Input("x: T")                                   \
       .Output("y: T")                             \
-      .Attr("T: {half, bfloat16, float, double}") \
+      .Attr("T: {bfloat16, half, float, double}") \
       .SetShapeFn(shape_inference::UnchangedShape)
 
 #define UNARY_COMPLEX()                                                  \
   Input("x: T")                                                          \
       .Output("y: T")                                                    \
-      .Attr("T: {half, bfloat16, float, double, complex64, complex128}") \
+      .Attr("T: {bfloat16, half, float, double, complex64, complex128}") \
       .SetShapeFn(shape_inference::UnchangedShape)
 
 #define UNARY_GRADIENT_COMPLEX()                                         \
   Input("y: T")                                                          \
       .Input("dy: T")                                                    \
       .Output("z: T")                                                    \
-      .Attr("T: {half, bfloat16, float, double, complex64, complex128}") \
+      .Attr("T: {bfloat16, half, float, double, complex64, complex128}") \
       .SetShapeFn(shape_inference::UnchangedShape)
 
 REGISTER_OP("Neg").UNARY();
@@ -246,57 +246,57 @@ REGISTER_OP("Atan").UNARY();
 REGISTER_OP("IsNan")
     .Input("x: T")
     .Output("y: bool")
-    .Attr("T: {half, bfloat16, float, double}")
+    .Attr("T: {bfloat16, half, float, double}")
     .SetShapeFn(shape_inference::UnchangedShape);
 
 REGISTER_OP("IsInf")
     .Input("x: T")
     .Output("y: bool")
-    .Attr("T: {half, bfloat16, float, double}")
+    .Attr("T: {bfloat16, half, float, double}")
     .SetShapeFn(shape_inference::UnchangedShape);
 
 REGISTER_OP("IsFinite")
     .Input("x: T")
     .Output("y: bool")
-    .Attr("T: {half, bfloat16, float, double}")
+    .Attr("T: {bfloat16, half, float, double}")
     .SetShapeFn(shape_inference::UnchangedShape);
 
 REGISTER_OP("Sign")
     .Input("x: T")
     .Output("y: T")
     .Attr(
-        "T: {half, bfloat16, float, double, int32, int64, complex64, "
+        "T: {bfloat16, half, float, double, int32, int64, complex64, "
         "complex128}")
     .SetShapeFn(shape_inference::UnchangedShape);
 
 REGISTER_OP("Floor")
     .Input("x: T")
     .Output("y: T")
-    .Attr("T: {half, bfloat16, float, double}")
+    .Attr("T: {bfloat16, half, float, double}")
     .SetShapeFn(shape_inference::UnchangedShape);
 
 REGISTER_OP("Ceil")
     .Input("x: T")
     .Output("y: T")
-    .Attr("T: {half, bfloat16, float, double}")
+    .Attr("T: {bfloat16, half, float, double}")
     .SetShapeFn(shape_inference::UnchangedShape);
 
 REGISTER_OP("Rint")
     .Input("x: T")
     .Output("y: T")
-    .Attr("T: {bfloat16, float, double}")
+    .Attr("T: {bfloat16, half, float, double}")
     .SetShapeFn(shape_inference::UnchangedShape);
 
 // Declares cwise binary operations signature: 't, 't -> 't.
 
 #define BINARY_MORE()                                                          \
   Input("x: T").Input("y: T").Output("z: T").Attr(                             \
-      "T: {half, bfloat16, float, double, uint8, int8, uint16, int16, int32, " \
+      "T: {bfloat16, half, float, double, uint8, int8, uint16, int16, int32, " \
       "int64, complex64, complex128}")
 
 #define BINARY_FEWER()                                               \
   Input("x: T").Input("y: T").Output("z: T").Attr(                   \
-      "T: {half, bfloat16, float, double, int32, int64, complex64, " \
+      "T: {bfloat16, half, float, double, int32, int64, complex64, " \
       "complex128}")
 
 REGISTER_OP("Add")
@@ -304,7 +304,7 @@ REGISTER_OP("Add")
     .Input("y: T")
     .Output("z: T")
     .Attr(
-        "T: {half, bfloat16, float, double, uint8, int8, int16, int32, int64, "
+        "T: {bfloat16, half, float, double, uint8, int8, int16, int32, int64, "
         "complex64, complex128, string}")
     .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn);
 
@@ -315,7 +315,7 @@ REGISTER_OP("AddV2")
     .Input("y: T")
     .Output("z: T")
     .Attr(
-        "T: {half, bfloat16, float, double, uint8, int8, int16, int32, int64, "
+        "T: {bfloat16, half, float, double, uint8, int8, int16, int32, int64, "
         "complex64, complex128}")
     .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn)
     .SetIsAggregate()
@@ -412,7 +412,7 @@ REGISTER_OP("Maximum")
     .Input("x: T")
     .Input("y: T")
     .Output("z: T")
-    .Attr("T: {half, bfloat16, float, double, int32, int64}")
+    .Attr("T: {bfloat16, half, float, double, int32, int64}")
     .SetIsCommutative()
     .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn);
 
@@ -437,7 +437,7 @@ REGISTER_OP("Minimum")
     .Input("x: T")
     .Input("y: T")
     .Output("z: T")
-    .Attr("T: {half, bfloat16, float, double, int32, int64}")
+    .Attr("T: {bfloat16, half, float, double, int32, int64}")
     .SetIsCommutative()
     .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn);
 
@@ -445,21 +445,21 @@ REGISTER_OP("Mod")
     .Input("x: T")
     .Input("y: T")
     .Output("z: T")
-    .Attr("T: {int32, int64, bfloat16, float, double}")
+    .Attr("T: {int32, int64, float16, half, bfloat16, float, double}")
     .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn);
 
 REGISTER_OP("FloorMod")
     .Input("x: T")
     .Input("y: T")
     .Output("z: T")
-    .Attr("T: {int32, int64, bfloat16, float, double}")
+    .Attr("T: {int32, int64, bfloat16, half, float, double}")
     .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn);
 
 REGISTER_OP("TruncateMod")
     .Input("x: T")
     .Input("y: T")
     .Output("z: T")
-    .Attr("T: {int32, int64, bfloat16, float, double}")
+    .Attr("T: {int32, int64, bfloat16, half, float, double}")
     .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn);
 
 REGISTER_OP("Pow")
@@ -467,7 +467,7 @@ REGISTER_OP("Pow")
     .Input("y: T")
     .Output("z: T")
     .Attr(
-        "T: {half, bfloat16, float, double, int32, int64, complex64, "
+        "T: {bfloat16, float, half, double, int32, int64, complex64, "
         "complex128}")
     .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn);
 
@@ -503,7 +503,7 @@ REGISTER_OP("Atan2")
     .Input("y: T")
     .Input("x: T")
     .Output("z: T")
-    .Attr("T: {bfloat16, float, double}")
+    .Attr("T: {bfloat16, half, float, double}")
     .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn);
 
 REGISTER_OP("Betainc")
@@ -574,7 +574,7 @@ REGISTER_OP("GreaterEqual").COMPARISON();
       .Output("z: bool")                                                   \
       .SetIsCommutative()                                                  \
       .Attr(                                                               \
-          "T: {half, bfloat16, float, double, uint8, int8, int16, int32, " \
+          "T: {bfloat16, half, float, double, uint8, int8, int16, int32, " \
           "int64, complex64, quint8, qint8, qint32, string, bool, "        \
           "complex128}")                                                   \
       .SetShapeFn(shape_inference::BroadcastBinaryOpShapeFn)
@@ -713,7 +713,7 @@ REGISTER_OP("MatMul")
     .Output("product: T")
     .Attr("transpose_a: bool = false")
     .Attr("transpose_b: bool = false")
-    .Attr("T: {half, bfloat16, float, double, int32, complex64, complex128}")
+    .Attr("T: {bfloat16, half, float, double, int32, complex64, complex128}")
     .SetShapeFn(shape_inference::MatMulShape);
 
 REGISTER_OP("SparseMatMul")
@@ -1558,6 +1558,14 @@ REGISTER_OP("Bucketize")
     .Attr("boundaries: list(float)")
     .SetShapeFn(shape_inference::UnchangedShape);
 
+REGISTER_OP("ClipByValue")
+    .Input("t: T")
+    .Input("clip_value_min: T")
+    .Input("clip_value_max: T")
+    .Output("output: T")
+    .Attr("T: numbertype")
+    .SetShapeFn(shape_inference::UnchangedShape);
+
 #ifdef INTEL_MKL
 REGISTER_OP("_MklAddN")
     .Input("inputs: N * T")
diff --git a/tensorflow/core/ops/math_ops_test.cc b/tensorflow/core/ops/math_ops_test.cc
index ca3772e6f89805b70f05f1c9fd5e36ee99f2d510..8f974d5367a486dca39cddfd3fbdca4d4a3bf6eb 100644
--- a/tensorflow/core/ops/math_ops_test.cc
+++ b/tensorflow/core/ops/math_ops_test.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
@@ -239,20 +240,21 @@ TEST(MathOpsTest, Select_ShapeFn) {
 
   // Expect an error when the shapes can't be merged.
   handle_data[2]->at(0).first = shape_proto({2, 2});
-  EXPECT_TRUE(StringPiece(run_inference_for_handles().error_message())
-                  .contains("must be equal, but are 1 and 2"));
+  EXPECT_TRUE(str_util::StrContains(run_inference_for_handles().error_message(),
+                                    "must be equal, but are 1 and 2"));
   handle_data[2]->at(0).first = i1;  // restore to valid
 
   // Expect an error when the types can't be merged.
   handle_data[2]->at(1).second = DT_INT64;
-  EXPECT_TRUE(StringPiece(run_inference_for_handles().error_message())
-                  .contains("pointing to different dtypes"));
+  EXPECT_TRUE(str_util::StrContains(run_inference_for_handles().error_message(),
+                                    "pointing to different dtypes"));
   handle_data[2]->at(1).second = DT_INT32;  // restore to valid
 
   // Expect an error when different numbers of tensors are merged.
   handle_data[2]->push_back({i1, DT_FLOAT});
-  EXPECT_TRUE(StringPiece(run_inference_for_handles().error_message())
-                  .contains("pointing to different numbers of tensors"));
+  EXPECT_TRUE(
+      str_util::StrContains(run_inference_for_handles().error_message(),
+                            "pointing to different numbers of tensors"));
   handle_data[2]->pop_back();  // restore to valid.
 }
 
diff --git a/tensorflow/core/ops/nn_ops.cc b/tensorflow/core/ops/nn_ops.cc
index b9d5104857be7d76eae1d4a12fd65afd285fb52a..18165fb6edb023087e283c695d25fa2475e78a47 100644
--- a/tensorflow/core/ops/nn_ops.cc
+++ b/tensorflow/core/ops/nn_ops.cc
@@ -472,7 +472,7 @@ REGISTER_OP("DepthwiseConv2dNativeBackpropInput")
     .Input("filter: T")
     .Input("out_backprop: T")
     .Output("output: T")
-    .Attr("T: {bfloat16, float, double}")
+    .Attr("T: {half, bfloat16, float, double}")
     .Attr("strides: list(int)")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnetDataFormatAttrString())
@@ -490,7 +490,7 @@ REGISTER_OP("DepthwiseConv2dNativeBackpropFilter")
     .Input("filter_sizes: int32")
     .Input("out_backprop: T")
     .Output("output: T")
-    .Attr("T: {bfloat16, float, double}")
+    .Attr("T: {half, bfloat16, float, double}")
     .Attr("strides: list(int)")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnetDataFormatAttrString())
@@ -589,7 +589,7 @@ REGISTER_OP("AvgPool3D")
     .Attr("strides: list(int) >= 5")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnet3dDataFormatAttrString())
-    .Attr("T: {bfloat16, float, double}")
+    .Attr("T: {half, bfloat16, float, double}")
     .SetShapeFn(shape_inference::Pool3DShape);
 
 REGISTER_OP("AvgPool3DGrad")
@@ -600,7 +600,7 @@ REGISTER_OP("AvgPool3DGrad")
     .Attr("strides: list(int) >= 5")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnet3dDataFormatAttrString())
-    .Attr("T: {bfloat16, float, double}")
+    .Attr("T: {half, bfloat16, float, double}")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle s;
       TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(0, &s));
@@ -618,7 +618,7 @@ REGISTER_OP("MaxPool3D")
     .Attr("strides: list(int) >= 5")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnet3dDataFormatAttrString())
-    .Attr("T: {bfloat16, float}")
+    .Attr("T: {half, bfloat16, float}")
     .SetShapeFn(shape_inference::Pool3DShape);
 
 REGISTER_OP("MaxPool3DGrad")
@@ -630,8 +630,8 @@ REGISTER_OP("MaxPool3DGrad")
     .Attr("strides: list(int) >= 5")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnet3dDataFormatAttrString())
-    .Attr("T: {bfloat16, float} = DT_FLOAT")
-    .Attr("TInput: {bfloat16, float} = DT_FLOAT")
+    .Attr("T: {half, bfloat16, float} = DT_FLOAT")
+    .Attr("TInput: {half, bfloat16, float} = DT_FLOAT")
     .SetShapeFn([](InferenceContext* c) {
       return UnchangedShapeWithRank(c, 5);
     });
@@ -1071,7 +1071,12 @@ REGISTER_OP("SoftmaxCrossEntropyWithLogits")
       }
       TF_RETURN_IF_ERROR(BroadcastBinaryOpOutputShapeFn(c, 1));
 
-      if (!c->RankKnown(c->output(1)) || c->Rank(c->output(1)) != 2) {
+      if (!c->RankKnown(c->output(1))) {
+        return errors::InvalidArgument(
+            "Shape must be broadcasted with rank 2, but is rank is unknown.");
+      }
+
+      if (c->Rank(c->output(1)) != 2) {
         return errors::InvalidArgument(
             "Shape must be broadcasted with rank 2, but is rank ",
             c->Rank(c->output(1)));
@@ -1528,6 +1533,7 @@ REGISTER_OP("__MklDummyConv2DWithBias")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnetDataFormatAttrString())
     .Attr("dilations: list(int) = [1, 1, 1, 1]")
+    .SetShapeFn(shape_inference::Conv2DShape)
     .Doc(R"doc(
 Dummy node that enables fusing Conv2D and BiasAdd operator for MKL. This node
 does not perform anything. It is just created as an intermediate output of
@@ -1554,6 +1560,7 @@ REGISTER_OP("_MklConv2DWithBias")
     .Attr(GetPaddingAttrString())
     .Attr(GetConvnetDataFormatAttrString())
     .Attr("dilations: list(int) = [1, 1, 1, 1]")
+    .SetShapeFn(shape_inference::Conv2DShape)
     .Doc(R"doc(
 MKL version of Conv2D and BiasAdd operator. Uses MKL DNN APIs to perform
 2D convolution and add Bias to the output of convolution.
@@ -1676,6 +1683,7 @@ NOTE Do not invoke this operator directly in Python. Graph rewrite pass is
 expected to invoke these operators.
 )doc");
 
+#ifdef INTEL_MKL_ML
 REGISTER_OP("_MklConv2DWithBiasBackpropBias")
     .Input("out_backprop: T")
     .Input("mkl_out_backprop: uint8")
@@ -1692,6 +1700,7 @@ gradients of convolution with respect to the bias.
 NOTE Do not invoke this operator directly in Python. Graph rewrite pass is
 expected to invoke these operators.
 )doc");
+#endif
 
 REGISTER_OP("_MklConv2DBackpropInput")
     .Input("input_sizes: int32")
@@ -2149,6 +2158,7 @@ REGISTER_OP("_MklToTf")
     .Output("output: T")
     .Attr("T: {half, float, double}")
     .Attr(GetConvnetDataFormatAttrString())
+    .SetShapeFn(shape_inference::UnknownShape)
     .Doc(R"doc(
 MKL operator to convert a tensor from MKL layout to TensorFlow layout.
 
@@ -2170,6 +2180,7 @@ REGISTER_OP("_MklInputConversion")
         "T: {half, float, double, uint8, int8, uint16, int16, int32, int64, "
         "complex64, complex128}")
     .Attr(GetConvnetDataFormatAttrString())
+    .SetShapeFn(shape_inference::UnknownShape)
     .Doc(R"doc(
 MKL operator to process the inputs to an elementwise MKL op. Both inputs
 need to be either in TF or in MKL format. This op is added before every
diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt
index 274a7fbf752051d8c2d2d80ecfb9ab8f98cf4892..6af77be14806076b1e7963b38bcf88f84a1468bb 100644
--- a/tensorflow/core/ops/ops.pbtxt
+++ b/tensorflow/core/ops/ops.pbtxt
@@ -30,8 +30,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -210,8 +210,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -237,8 +237,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -266,8 +266,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_UINT8
@@ -423,8 +423,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_UINT8
@@ -1932,8 +1932,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -1959,8 +1959,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -2191,8 +2191,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -2223,6 +2223,7 @@ op {
     allowed_values {
       list {
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -2244,8 +2245,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -2448,6 +2449,7 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
         type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
@@ -2509,6 +2511,7 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
         type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
@@ -3004,8 +3007,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -3854,6 +3857,7 @@ op {
     allowed_values {
       list {
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT64
@@ -3869,7 +3873,6 @@ op {
         type: DT_QINT16
         type: DT_QUINT16
         type: DT_QINT32
-        type: DT_HALF
       }
     }
   }
@@ -3879,6 +3882,7 @@ op {
     allowed_values {
       list {
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT64
@@ -3894,7 +3898,6 @@ op {
         type: DT_QINT16
         type: DT_QUINT16
         type: DT_QINT32
-        type: DT_HALF
       }
     }
   }
@@ -3995,6 +3998,342 @@ op {
   }
   is_commutative: true
 }
+op {
+  name: "BoostedTreesCalculateBestGainsPerFeature"
+  input_arg {
+    name: "node_id_range"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "stats_summary_list"
+    type: DT_FLOAT
+    number_attr: "num_features"
+  }
+  output_arg {
+    name: "node_ids_list"
+    type: DT_INT32
+    number_attr: "num_features"
+  }
+  output_arg {
+    name: "gains_list"
+    type: DT_FLOAT
+    number_attr: "num_features"
+  }
+  output_arg {
+    name: "thresholds_list"
+    type: DT_INT32
+    number_attr: "num_features"
+  }
+  output_arg {
+    name: "left_node_contribs_list"
+    type: DT_FLOAT
+    number_attr: "num_features"
+  }
+  output_arg {
+    name: "right_node_contribs_list"
+    type: DT_FLOAT
+    number_attr: "num_features"
+  }
+  attr {
+    name: "l1"
+    type: "float"
+  }
+  attr {
+    name: "l2"
+    type: "float"
+  }
+  attr {
+    name: "tree_complexity"
+    type: "float"
+  }
+  attr {
+    name: "max_splits"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "num_features"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "BoostedTreesCreateEnsemble"
+  input_arg {
+    name: "tree_ensemble_handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "stamp_token"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "tree_ensemble_serialized"
+    type: DT_STRING
+  }
+  is_stateful: true
+}
+op {
+  name: "BoostedTreesDeserializeEnsemble"
+  input_arg {
+    name: "tree_ensemble_handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "stamp_token"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "tree_ensemble_serialized"
+    type: DT_STRING
+  }
+  is_stateful: true
+}
+op {
+  name: "BoostedTreesEnsembleResourceHandleOp"
+  output_arg {
+    name: "resource"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "BoostedTreesGetEnsembleStates"
+  input_arg {
+    name: "tree_ensemble_handle"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "stamp_token"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "num_trees"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "num_finalized_trees"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "num_attempted_layers"
+    type: DT_INT32
+  }
+  is_stateful: true
+}
+op {
+  name: "BoostedTreesMakeStatsSummary"
+  input_arg {
+    name: "node_ids"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "gradients"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "hessians"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "bucketized_features_list"
+    type: DT_INT32
+    number_attr: "num_features"
+  }
+  output_arg {
+    name: "stats_summary"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "max_splits"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "num_buckets"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "num_features"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "BoostedTreesPredict"
+  input_arg {
+    name: "tree_ensemble_handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "bucketized_features"
+    type: DT_INT32
+    number_attr: "num_bucketized_features"
+  }
+  output_arg {
+    name: "logits"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "num_bucketized_features"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "logits_dimension"
+    type: "int"
+  }
+  attr {
+    name: "max_depth"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "BoostedTreesSerializeEnsemble"
+  input_arg {
+    name: "tree_ensemble_handle"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "stamp_token"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "tree_ensemble_serialized"
+    type: DT_STRING
+  }
+  is_stateful: true
+}
+op {
+  name: "BoostedTreesTrainingPredict"
+  input_arg {
+    name: "tree_ensemble_handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "cached_tree_ids"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "cached_node_ids"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "bucketized_features"
+    type: DT_INT32
+    number_attr: "num_bucketized_features"
+  }
+  output_arg {
+    name: "partial_logits"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "tree_ids"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "node_ids"
+    type: DT_INT32
+  }
+  attr {
+    name: "num_bucketized_features"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "logits_dimension"
+    type: "int"
+  }
+  attr {
+    name: "max_depth"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "BoostedTreesUpdateEnsemble"
+  input_arg {
+    name: "tree_ensemble_handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "feature_ids"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "node_ids"
+    type: DT_INT32
+    number_attr: "num_features"
+  }
+  input_arg {
+    name: "gains"
+    type: DT_FLOAT
+    number_attr: "num_features"
+  }
+  input_arg {
+    name: "thresholds"
+    type: DT_INT32
+    number_attr: "num_features"
+  }
+  input_arg {
+    name: "left_node_contribs"
+    type: DT_FLOAT
+    number_attr: "num_features"
+  }
+  input_arg {
+    name: "right_node_contribs"
+    type: DT_FLOAT
+    number_attr: "num_features"
+  }
+  attr {
+    name: "max_depth"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "learning_rate"
+    type: "float"
+  }
+  attr {
+    name: "pruning_mode"
+    type: "int"
+    has_minimum: true
+  }
+  attr {
+    name: "num_features"
+    type: "int"
+    has_minimum: true
+  }
+  is_stateful: true
+}
 op {
   name: "BroadcastArgs"
   input_arg {
@@ -4301,8 +4640,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -4324,8 +4663,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -4392,6 +4731,147 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "CollectiveBcastRecv"
+  output_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_HALF
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "group_size"
+    type: "int"
+  }
+  attr {
+    name: "group_key"
+    type: "int"
+  }
+  attr {
+    name: "instance_key"
+    type: "int"
+  }
+  attr {
+    name: "shape"
+    type: "shape"
+  }
+  is_stateful: true
+}
+op {
+  name: "CollectiveBcastSend"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_HALF
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "group_size"
+    type: "int"
+  }
+  attr {
+    name: "group_key"
+    type: "int"
+  }
+  attr {
+    name: "instance_key"
+    type: "int"
+  }
+  attr {
+    name: "shape"
+    type: "shape"
+  }
+  is_stateful: true
+}
+op {
+  name: "CollectiveReduce"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "data"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_HALF
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "group_size"
+    type: "int"
+  }
+  attr {
+    name: "group_key"
+    type: "int"
+  }
+  attr {
+    name: "instance_key"
+    type: "int"
+  }
+  attr {
+    name: "merge_op"
+    type: "string"
+    allowed_values {
+      list {
+        s: "Min"
+        s: "Max"
+        s: "Mul"
+        s: "Add"
+      }
+    }
+  }
+  attr {
+    name: "final_op"
+    type: "string"
+    allowed_values {
+      list {
+        s: "Id"
+        s: "Div"
+      }
+    }
+  }
+  attr {
+    name: "subdiv_offsets"
+    type: "list(int)"
+  }
+  is_stateful: true
+}
 op {
   name: "CompareAndBitpack"
   input_arg {
@@ -5423,8 +5903,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -5448,8 +5928,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -6883,6 +7363,14 @@ op {
       s: ""
     }
   }
+  attr {
+    name: "select_cols"
+    type: "list(int)"
+    default_value {
+      list {
+      }
+    }
+  }
 }
 op {
   name: "DecodeCompressed"
@@ -7008,6 +7496,55 @@ op {
     }
   }
 }
+op {
+  name: "DecodeProtoV2"
+  input_arg {
+    name: "bytes"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "sizes"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "values"
+    type_list_attr: "output_types"
+  }
+  attr {
+    name: "message_type"
+    type: "string"
+  }
+  attr {
+    name: "field_names"
+    type: "list(string)"
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "descriptor_source"
+    type: "string"
+    default_value {
+      s: "local://"
+    }
+  }
+  attr {
+    name: "message_format"
+    type: "string"
+    default_value {
+      s: "binary"
+    }
+  }
+  attr {
+    name: "sanitize"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
 op {
   name: "DecodeRaw"
   input_arg {
@@ -7072,6 +7609,22 @@ op {
     }
   }
 }
+op {
+  name: "DeepCopy"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  is_stateful: true
+}
 op {
   name: "DeleteSessionTensor"
   input_arg {
@@ -7341,6 +7894,7 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
         type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
@@ -7410,6 +7964,7 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
         type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
@@ -7624,6 +8179,7 @@ op {
     allowed_values {
       list {
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -7650,6 +8206,7 @@ op {
     allowed_values {
       list {
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -7675,8 +8232,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -7881,8 +8438,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_UINT8
@@ -8096,6 +8653,29 @@ op {
     }
   }
 }
+op {
+  name: "Empty"
+  input_arg {
+    name: "shape"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  attr {
+    name: "init"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "EmptyTensorList"
   input_arg {
@@ -8257,6 +8837,42 @@ op {
     }
   }
 }
+op {
+  name: "EncodeProto"
+  input_arg {
+    name: "sizes"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "values"
+    type_list_attr: "Tinput_types"
+  }
+  output_arg {
+    name: "bytes"
+    type: DT_STRING
+  }
+  attr {
+    name: "field_names"
+    type: "list(string)"
+  }
+  attr {
+    name: "message_type"
+    type: "string"
+  }
+  attr {
+    name: "descriptor_source"
+    type: "string"
+    default_value {
+      s: "local://"
+    }
+  }
+  attr {
+    name: "Tinput_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+}
 op {
   name: "EncodeWav"
   input_arg {
@@ -8342,8 +8958,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_UINT8
@@ -8378,8 +8994,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -8401,8 +9017,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -8439,8 +9055,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -8496,8 +9112,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -9364,8 +9980,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -9391,8 +10007,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_UINT8
@@ -9429,6 +10045,7 @@ op {
         type: DT_INT32
         type: DT_INT64
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -9443,6 +10060,38 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "For"
+  input_arg {
+    name: "start"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "limit"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "delta"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "input"
+    type_list_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "body"
+    type: "func"
+  }
+}
 op {
   name: "FractionalAvgPool"
   input_arg {
@@ -10847,6 +11496,45 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "If"
+  input_arg {
+    name: "cond"
+    type_attr: "Tcond"
+  }
+  input_arg {
+    name: "input"
+    type_list_attr: "Tin"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "Tout"
+  }
+  attr {
+    name: "Tcond"
+    type: "type"
+  }
+  attr {
+    name: "Tin"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "Tout"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "then_branch"
+    type: "func"
+  }
+  attr {
+    name: "else_branch"
+    type: "func"
+  }
+}
 op {
   name: "Igamma"
   input_arg {
@@ -11151,68 +11839,137 @@ op {
   }
 }
 op {
-  name: "InitializeTableFromTextFileV2"
+  name: "InitializeTableFromTextFileV2"
+  input_arg {
+    name: "table_handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "filename"
+    type: DT_STRING
+  }
+  attr {
+    name: "key_index"
+    type: "int"
+    has_minimum: true
+    minimum: -2
+  }
+  attr {
+    name: "value_index"
+    type: "int"
+    has_minimum: true
+    minimum: -2
+  }
+  attr {
+    name: "vocab_size"
+    type: "int"
+    default_value {
+      i: -1
+    }
+    has_minimum: true
+    minimum: -1
+  }
+  attr {
+    name: "delimiter"
+    type: "string"
+    default_value {
+      s: "\t"
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "InitializeTableV2"
   input_arg {
     name: "table_handle"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "filename"
-    type: DT_STRING
+    name: "keys"
+    type_attr: "Tkey"
   }
-  attr {
-    name: "key_index"
-    type: "int"
-    has_minimum: true
-    minimum: -2
+  input_arg {
+    name: "values"
+    type_attr: "Tval"
   }
   attr {
-    name: "value_index"
-    type: "int"
-    has_minimum: true
-    minimum: -2
+    name: "Tkey"
+    type: "type"
   }
   attr {
-    name: "vocab_size"
-    type: "int"
-    default_value {
-      i: -1
-    }
-    has_minimum: true
-    minimum: -1
+    name: "Tval"
+    type: "type"
+  }
+  is_stateful: true
+}
+op {
+  name: "InplaceAdd"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "i"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "v"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
   }
   attr {
-    name: "delimiter"
-    type: "string"
-    default_value {
-      s: "\t"
-    }
+    name: "T"
+    type: "type"
   }
-  is_stateful: true
 }
 op {
-  name: "InitializeTableV2"
+  name: "InplaceSub"
   input_arg {
-    name: "table_handle"
-    type: DT_RESOURCE
+    name: "x"
+    type_attr: "T"
   }
   input_arg {
-    name: "keys"
-    type_attr: "Tkey"
+    name: "i"
+    type: DT_INT32
   }
   input_arg {
-    name: "values"
-    type_attr: "Tval"
+    name: "v"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
   }
   attr {
-    name: "Tkey"
+    name: "T"
     type: "type"
   }
+}
+op {
+  name: "InplaceUpdate"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "i"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "v"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
   attr {
-    name: "Tval"
+    name: "T"
     type: "type"
   }
-  is_stateful: true
 }
 op {
   name: "InterleaveDataset"
@@ -11273,8 +12030,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -11304,8 +12061,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -11365,6 +12122,18 @@ op {
     }
   }
 }
+op {
+  name: "IsBoostedTreesEnsembleInitialized"
+  input_arg {
+    name: "tree_ensemble_handle"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "is_initialized"
+    type: DT_BOOL
+  }
+  is_stateful: true
+}
 op {
   name: "IsFinite"
   input_arg {
@@ -11380,8 +12149,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -11403,8 +12172,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -11426,8 +12195,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -11941,8 +12710,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -12089,8 +12858,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -12114,8 +12883,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -12971,8 +13740,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -13467,6 +14236,7 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_HALF
         type: DT_BFLOAT16
         type: DT_FLOAT
       }
@@ -13534,6 +14304,7 @@ op {
     }
     allowed_values {
       list {
+        type: DT_HALF
         type: DT_BFLOAT16
         type: DT_FLOAT
       }
@@ -13547,6 +14318,7 @@ op {
     }
     allowed_values {
       list {
+        type: DT_HALF
         type: DT_BFLOAT16
         type: DT_FLOAT
       }
@@ -14206,8 +14978,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -14462,8 +15234,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -14578,6 +15350,8 @@ op {
       list {
         type: DT_INT32
         type: DT_INT64
+        type: DT_HALF
+        type: DT_HALF
         type: DT_BFLOAT16
         type: DT_FLOAT
         type: DT_DOUBLE
@@ -14604,8 +15378,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_UINT8
@@ -15026,8 +15800,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -15162,8 +15936,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_UINT8
@@ -15327,6 +16101,7 @@ op {
     allowed_values {
       list {
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT8
@@ -16605,9 +17380,9 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
         type: DT_FLOAT
+        type: DT_HALF
         type: DT_DOUBLE
         type: DT_INT32
         type: DT_INT64
@@ -17037,6 +17812,7 @@ op {
     allowed_values {
       list {
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -17092,6 +17868,7 @@ op {
     allowed_values {
       list {
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -17140,6 +17917,7 @@ op {
     allowed_values {
       list {
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -19687,8 +20465,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_UINT8
@@ -19718,8 +20496,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -19749,8 +20527,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -23038,8 +23816,8 @@ op {
         type: DT_INT32
         type: DT_INT64
         type: DT_BOOL
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -23097,6 +23875,7 @@ op {
     allowed_values {
       list {
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -23161,8 +23940,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -23173,6 +23952,47 @@ op {
     }
   }
 }
+op {
+  name: "Rpc"
+  input_arg {
+    name: "address"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "method"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "request"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "response"
+    type: DT_STRING
+  }
+  attr {
+    name: "protocol"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "fail_fast"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "timeout_in_ms"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "Rsqrt"
   input_arg {
@@ -23188,8 +24008,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -23217,8 +24037,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -25068,8 +25888,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -25097,8 +25917,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -25122,8 +25942,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -25149,8 +25969,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -25174,8 +25994,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -28569,8 +29389,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -28598,8 +29418,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -28623,8 +29443,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -28654,8 +29474,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -29603,8 +30423,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_UINT8
@@ -29988,8 +30808,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
@@ -30015,8 +30835,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -30044,8 +30864,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_COMPLEX64
@@ -31666,8 +32486,8 @@ op {
     type: "type"
     allowed_values {
       list {
-        type: DT_HALF
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_UINT8
@@ -31704,6 +32524,7 @@ op {
         type: DT_INT32
         type: DT_INT64
         type: DT_BFLOAT16
+        type: DT_HALF
         type: DT_FLOAT
         type: DT_DOUBLE
       }
@@ -31758,6 +32579,55 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "TryRpc"
+  input_arg {
+    name: "address"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "method"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "request"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "response"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "status_code"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "status_message"
+    type: DT_STRING
+  }
+  attr {
+    name: "protocol"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "fail_fast"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "timeout_in_ms"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "Unbatch"
   input_arg {
@@ -32588,6 +33458,31 @@ op {
     }
   }
 }
+op {
+  name: "While"
+  input_arg {
+    name: "input"
+    type_list_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "cond"
+    type: "func"
+  }
+  attr {
+    name: "body"
+    type: "func"
+  }
+  is_stateful: true
+}
 op {
   name: "WholeFileReader"
   output_arg {
diff --git a/tensorflow/core/ops/parsing_ops.cc b/tensorflow/core/ops/parsing_ops.cc
index ddd2aa92748f244c2d132f00780a0d6424f1e595..ddb714b4e951aa485d087daa31368bad9f1261e4 100644
--- a/tensorflow/core/ops/parsing_ops.cc
+++ b/tensorflow/core/ops/parsing_ops.cc
@@ -245,6 +245,7 @@ REGISTER_OP("DecodeCSV")
     .Attr("field_delim: string = ','")
     .Attr("use_quote_delim: bool = true")
     .Attr("na_value: string = ''")
+    .Attr("select_cols: list(int) = []")
     .SetShapeFn([](InferenceContext* c) {
       // Validate the record_defaults inputs.
       for (int i = 1; i < c->num_inputs(); ++i) {
diff --git a/tensorflow/core/ops/random_ops.cc b/tensorflow/core/ops/random_ops.cc
index f6c668f5c98efff07a49be15b1187f1858800110..416ce9c0d82ca0bfba730d3d7f4513260876e9ad 100644
--- a/tensorflow/core/ops/random_ops.cc
+++ b/tensorflow/core/ops/random_ops.cc
@@ -43,7 +43,12 @@ REGISTER_OP("RandomUniformInt")
     .Attr("seed2: int = 0")
     .Attr("Tout: {int32, int64}")
     .Attr("T: {int32, int64}")
-    .SetShapeFn(shape_inference::RandomShape);
+    .SetShapeFn([](InferenceContext* c) {
+      ShapeHandle unused;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused));
+      return shape_inference::RandomShape(c);
+    });
 
 REGISTER_OP("RandomStandardNormal")
     .Input("shape: T")
diff --git a/tensorflow/core/ops/rpc_ops.cc b/tensorflow/core/ops/rpc_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..72fda5e6eba3fd4cf3e53a26e0b4d9f5d6b19100
--- /dev/null
+++ b/tensorflow/core/ops/rpc_ops.cc
@@ -0,0 +1,81 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/shape_inference.h"
+
+namespace tensorflow {
+
+using tensorflow::shape_inference::DimensionHandle;
+using tensorflow::shape_inference::InferenceContext;
+using tensorflow::shape_inference::ShapeHandle;
+
+Status RpcShapeOp(InferenceContext* c, bool try_rpc) {
+  ShapeHandle address;
+  ShapeHandle method;
+  ShapeHandle request;
+  ShapeHandle output;
+  TF_RETURN_IF_ERROR(c->WithRankAtMost(c->input(0), 1, &address));
+  if (c->Rank(address) == 1) {
+    TF_RETURN_IF_ERROR(c->Merge(output, address, &output));
+  }
+  TF_RETURN_IF_ERROR(c->WithRankAtMost(c->input(1), 1, &method));
+  if (c->Rank(method) == 1) {
+    TF_RETURN_IF_ERROR(c->Merge(output, method, &output));
+  }
+  TF_RETURN_IF_ERROR(c->WithRankAtMost(c->input(2), 1, &request));
+  if (c->Rank(request) == 1) {
+    TF_RETURN_IF_ERROR(c->Merge(output, request, &output));
+  }
+  if (!c->RankKnown(output)) {
+    output = request;
+  }
+  c->set_output(0, output);  // response
+  if (try_rpc) {
+    c->set_output(1, output);  // status_code
+    c->set_output(2, output);  // status_message
+  }
+  return Status::OK();
+}
+
+REGISTER_OP("Rpc")
+    .Input("address: string")
+    .Input("method: string")
+    .Input("request: string")
+    .Attr("protocol: string = ''")
+    .Attr("fail_fast: bool = true")
+    .Attr("timeout_in_ms: int = 0")
+    .Output("response: string")
+    .SetIsStateful()
+    .SetShapeFn([](InferenceContext* c) {
+      return RpcShapeOp(c, /*try_rpc=*/false);
+    });
+
+REGISTER_OP("TryRpc")
+    .Input("address: string")
+    .Input("method: string")
+    .Input("request: string")
+    .Attr("protocol: string = ''")
+    .Attr("fail_fast: bool = true")
+    .Attr("timeout_in_ms: int = 0")
+    .Output("response: string")
+    .Output("status_code: int32")
+    .Output("status_message: string")
+    .SetIsStateful()
+    .SetShapeFn([](InferenceContext* c) {
+      return RpcShapeOp(c, /*try_rpc=*/true);
+    });
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/platform/abi.cc b/tensorflow/core/platform/abi.cc
index 4df62734e96c86fdbdae4dcf34f7a1f2a6583d5c..e597a490d619e55ad491d108c4a536727431b92b 100644
--- a/tensorflow/core/platform/abi.cc
+++ b/tensorflow/core/platform/abi.cc
@@ -15,7 +15,7 @@ limitations under the License.
 
 #include "tensorflow/core/platform/abi.h"
 
-#if defined(PLATFORM_WINDOWS)
+#if defined(_MSC_VER)
 #include <windows.h>
 #include <cstring>
 #else
@@ -26,19 +26,19 @@ limitations under the License.
 #include <memory>
 #include <string>
 
-#if defined(PLATFORM_WINDOWS)
+#if defined(_MSC_VER)
 
 extern "C" char* __unDName(char* output_string, const char* name,
                            int max_string_length, void* (*p_alloc)(std::size_t),
                            void (*p_free)(void*), unsigned short disable_flags);
 
-#endif  // defined(PLATFORM_WINDOWS)
+#endif  // defined(_MSC_VER)
 
 namespace tensorflow {
 namespace port {
 
 std::string MaybeAbiDemangle(const char* name) {
-#if defined(PLATFORM_WINDOWS)
+#if defined(_MSC_VER)
   std::unique_ptr<char> demangled{__unDName(nullptr, name, 0, std::malloc,
                                             std::free,
                                             static_cast<unsigned short>(0))};
diff --git a/tensorflow/core/platform/cloud/BUILD b/tensorflow/core/platform/cloud/BUILD
index 21636641e7a35c4f3757cd9bb9f9e90ec51620c2..be84316c482aa5eb9031388c3d3694a1ae3514ea 100644
--- a/tensorflow/core/platform/cloud/BUILD
+++ b/tensorflow/core/platform/cloud/BUILD
@@ -14,20 +14,6 @@ load(
     "if_windows",
 )
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        include = [
-            "**/*",
-        ],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 cc_library(
     name = "expiring_lru_cache",
     hdrs = ["expiring_lru_cache.h"],
@@ -99,6 +85,7 @@ cc_library(
         ":retrying_utils",
         ":time_util",
         "//tensorflow/core:framework_headers_lib",
+        "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "@jsoncpp_git//:jsoncpp",
     ],
@@ -277,6 +264,7 @@ tf_cc_test(
     deps = [
         ":gcs_file_system",
         ":http_request_fake",
+        "//tensorflow/core:lib",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
     ],
diff --git a/tensorflow/core/platform/cloud/gcs_file_system.cc b/tensorflow/core/platform/cloud/gcs_file_system.cc
index 1691826483a3227ea00b3b37e82002f3ad8d5225..3c0dc13d75fb113ece6960e6cf1e39e9c3f9adf1 100644
--- a/tensorflow/core/platform/cloud/gcs_file_system.cc
+++ b/tensorflow/core/platform/cloud/gcs_file_system.cc
@@ -172,7 +172,7 @@ Status ParseGcsPath(StringPiece fname, bool empty_object_ok, string* bucket,
     return errors::InvalidArgument("GCS path doesn't contain a bucket name: ",
                                    fname);
   }
-  objectp.Consume("/");
+  str_util::ConsumePrefix(&objectp, "/");
   *object = objectp.ToString();
   if (!empty_object_ok && object->empty()) {
     return errors::InvalidArgument("GCS path doesn't contain an object name: ",
@@ -535,7 +535,8 @@ class GcsWritableFile : public WritableFile {
       *uploaded = 0;
     } else {
       StringPiece range_piece(received_range);
-      range_piece.Consume("bytes=");  // May or may not be present.
+      str_util::ConsumePrefix(&range_piece,
+                              "bytes=");  // May or may not be present.
       std::vector<int64> range_parts;
       if (!str_util::SplitAndParseAsInts(range_piece, '-', &range_parts) ||
           range_parts.size() != 2) {
@@ -1172,7 +1173,7 @@ Status GcsFileSystem::GetChildrenBounded(const string& dirname,
         // 'object_prefix', which is part of 'dirname', should be removed from
         // the beginning of 'name'.
         StringPiece relative_path(name);
-        if (!relative_path.Consume(object_prefix)) {
+        if (!str_util::ConsumePrefix(&relative_path, object_prefix)) {
           return errors::Internal(strings::StrCat(
               "Unexpected response: the returned file name ", name,
               " doesn't match the prefix ", object_prefix));
@@ -1201,7 +1202,7 @@ Status GcsFileSystem::GetChildrenBounded(const string& dirname,
         }
         const string& prefix_str = prefix.asString();
         StringPiece relative_path(prefix_str);
-        if (!relative_path.Consume(object_prefix)) {
+        if (!str_util::ConsumePrefix(&relative_path, object_prefix)) {
           return errors::Internal(
               "Unexpected response: the returned folder name ", prefix_str,
               " doesn't match the prefix ", object_prefix);
diff --git a/tensorflow/core/platform/cloud/gcs_file_system_test.cc b/tensorflow/core/platform/cloud/gcs_file_system_test.cc
index 8516421614481cbb5e96cacd4b1f16aded883a91..2fbde9b6a79883b674df15fb4e69b5cf3cc643a2 100644
--- a/tensorflow/core/platform/cloud/gcs_file_system_test.cc
+++ b/tensorflow/core/platform/cloud/gcs_file_system_test.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/core/platform/cloud/gcs_file_system.h"
 #include <fstream>
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/cloud/http_request_fake.h"
 #include "tensorflow/core/platform/test.h"
 
@@ -584,8 +585,9 @@ TEST(GcsFileSystemTest, NewWritableFile_ResumeUploadAllAttemptsFail) {
   TF_EXPECT_OK(file->Append("content2"));
   const auto& status = file->Close();
   EXPECT_EQ(errors::Code::ABORTED, status.code());
-  EXPECT_TRUE(StringPiece(status.error_message())
-                  .contains("All 10 retry attempts failed. The last failure: "
+  EXPECT_TRUE(
+      str_util::StrContains(status.error_message(),
+                            "All 10 retry attempts failed. The last failure: "
                             "Unavailable: important HTTP error 503"))
       << status;
 }
@@ -641,13 +643,12 @@ TEST(GcsFileSystemTest, NewWritableFile_UploadReturns410) {
   const auto& status = file->Close();
   EXPECT_EQ(errors::Code::UNAVAILABLE, status.code());
   EXPECT_TRUE(
-      StringPiece(status.error_message())
-          .contains(
-              "Upload to gs://bucket/path/writeable.txt failed, caused by: "
-              "Not found: important HTTP error 410"))
+      str_util::StrContains(status.error_message(),
+                            "Upload to gs://bucket/path/writeable.txt failed, "
+                            "caused by: Not found: important HTTP error 410"))
       << status;
-  EXPECT_TRUE(StringPiece(status.error_message())
-                  .contains("when uploading gs://bucket/path/writeable.txt"))
+  EXPECT_TRUE(str_util::StrContains(
+      status.error_message(), "when uploading gs://bucket/path/writeable.txt"))
       << status;
 }
 
diff --git a/tensorflow/core/platform/cloud/retrying_file_system_test.cc b/tensorflow/core/platform/cloud/retrying_file_system_test.cc
index d3f763bb3c845436e8458135a0a754d8cb002957..ee6886fef70328dafd199ba94ef2af2c58e0de29 100644
--- a/tensorflow/core/platform/cloud/retrying_file_system_test.cc
+++ b/tensorflow/core/platform/cloud/retrying_file_system_test.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/core/platform/cloud/retrying_file_system.h"
 #include <fstream>
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
@@ -245,7 +246,7 @@ TEST(RetryingFileSystemTest, NewRandomAccessFile_AllRetriesFailed) {
   char scratch[10];
   const auto& status = random_access_file->Read(0, 10, &result, scratch);
   EXPECT_TRUE(
-      StringPiece(status.error_message()).contains("Retriable error #10"))
+      str_util::StrContains(status.error_message(), "Retriable error #10"))
       << status;
 }
 
@@ -399,7 +400,7 @@ TEST(RetryingFileSystemTest, NewWritableFile_AllRetriesFailed) {
   // Use it and check the results.
   const auto& status = writable_file->Sync();
   EXPECT_TRUE(
-      StringPiece(status.error_message()).contains("Retriable error #10"))
+      str_util::StrContains(status.error_message(), "Retriable error #10"))
       << status;
 }
 
@@ -428,7 +429,7 @@ TEST(RetryingFileSystemTest, NewReadOnlyMemoryRegionFromFile_AllRetriesFailed) {
   const auto& status =
       fs.NewReadOnlyMemoryRegionFromFile("filename.txt", &result);
   EXPECT_TRUE(
-      StringPiece(status.error_message()).contains("Retriable error #10"))
+      str_util::StrContains(status.error_message(), "Retriable error #10"))
       << status;
 }
 
@@ -454,7 +455,7 @@ TEST(RetryingFileSystemTest, GetChildren_AllRetriesFailed) {
   std::vector<string> result;
   const auto& status = fs.GetChildren("gs://path", &result);
   EXPECT_TRUE(
-      StringPiece(status.error_message()).contains("Retriable error #10"))
+      str_util::StrContains(status.error_message(), "Retriable error #10"))
       << status;
 }
 
@@ -481,7 +482,7 @@ TEST(RetryingFileSystemTest, GetMatchingPaths_AllRetriesFailed) {
   std::vector<string> result;
   const auto& status = fs.GetMatchingPaths("gs://path/dir", &result);
   EXPECT_TRUE(
-      StringPiece(status.error_message()).contains("Retriable error #10"))
+      str_util::StrContains(status.error_message(), "Retriable error #10"))
       << status;
 }
 
@@ -506,7 +507,7 @@ TEST(RetryingFileSystemTest, DeleteFile_AllRetriesFailed) {
   std::vector<string> result;
   const auto& status = fs.DeleteFile("gs://path/file.txt");
   EXPECT_TRUE(
-      StringPiece(status.error_message()).contains("Retriable error #10"))
+      str_util::StrContains(status.error_message(), "Retriable error #10"))
       << status;
 }
 
@@ -531,7 +532,7 @@ TEST(RetryingFileSystemTest, CreateDir_AllRetriesFailed) {
   std::vector<string> result;
   const auto& status = fs.CreateDir("gs://path/newdir");
   EXPECT_TRUE(
-      StringPiece(status.error_message()).contains("Retriable error #10"))
+      str_util::StrContains(status.error_message(), "Retriable error #10"))
       << status;
 }
 
@@ -556,7 +557,7 @@ TEST(RetryingFileSystemTest, DeleteDir_AllRetriesFailed) {
   std::vector<string> result;
   const auto& status = fs.DeleteDir("gs://path/dir");
   EXPECT_TRUE(
-      StringPiece(status.error_message()).contains("Retriable error #10"))
+      str_util::StrContains(status.error_message(), "Retriable error #10"))
       << status;
 }
 
@@ -582,7 +583,7 @@ TEST(RetryingFileSystemTest, GetFileSize_AllRetriesFailed) {
   uint64 size;
   const auto& status = fs.GetFileSize("gs://path/file.txt", &size);
   EXPECT_TRUE(
-      StringPiece(status.error_message()).contains("Retriable error #10"))
+      str_util::StrContains(status.error_message(), "Retriable error #10"))
       << status;
 }
 
@@ -605,7 +606,7 @@ TEST(RetryingFileSystemTest, RenameFile_AllRetriesFailed) {
 
   const auto& status = fs.RenameFile("old_name", "new_name");
   EXPECT_TRUE(
-      StringPiece(status.error_message()).contains("Retriable error #10"))
+      str_util::StrContains(status.error_message(), "Retriable error #10"))
       << status;
 }
 
@@ -630,7 +631,7 @@ TEST(RetryingFileSystemTest, Stat_AllRetriesFailed) {
   FileStatistics stat;
   const auto& status = fs.Stat("file_name", &stat);
   EXPECT_TRUE(
-      StringPiece(status.error_message()).contains("Retriable error #10"))
+      str_util::StrContains(status.error_message(), "Retriable error #10"))
       << status;
 }
 
@@ -642,7 +643,7 @@ TEST(RetryingFileSystemTest, FileExists_AllRetriesFailed) {
 
   const auto& status = fs.FileExists("file_name");
   EXPECT_TRUE(
-      StringPiece(status.error_message()).contains("Retriable error #10"))
+      str_util::StrContains(status.error_message(), "Retriable error #10"))
       << status;
 }
 
@@ -677,7 +678,7 @@ TEST(RetryingFileSystemTest, IsDirectory_AllRetriesFailed) {
 
   const auto& status = fs.IsDirectory("gs://path/dir");
   EXPECT_TRUE(
-      StringPiece(status.error_message()).contains("Retriable error #10"))
+      str_util::StrContains(status.error_message(), "Retriable error #10"))
       << status;
 }
 
@@ -706,7 +707,7 @@ TEST(RetryingFileSystemTest, DeleteRecursively_AllRetriesFailed) {
   const auto& status =
       fs.DeleteRecursively("gs://path/dir", &undeleted_files, &undeleted_dirs);
   EXPECT_TRUE(
-      StringPiece(status.error_message()).contains("Retriable error #10"))
+      str_util::StrContains(status.error_message(), "Retriable error #10"))
       << status;
 }
 
diff --git a/tensorflow/core/platform/cloud/retrying_utils_test.cc b/tensorflow/core/platform/cloud/retrying_utils_test.cc
index 6eb340e09438eafbe59844a378aa06801ed3b4bf..1b6527618a8e0fa1261b96bd79bdd8e5e2e6f8d1 100644
--- a/tensorflow/core/platform/cloud/retrying_utils_test.cc
+++ b/tensorflow/core/platform/cloud/retrying_utils_test.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/core/platform/cloud/retrying_utils.h"
 #include <fstream>
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/test.h"
 
@@ -31,10 +32,9 @@ TEST(RetryingUtilsTest, CallWithRetries_RetryDelays) {
 
   const auto& status = RetryingUtils::CallWithRetries(f, 500000L, sleep);
   EXPECT_EQ(errors::Code::ABORTED, status.code());
-  EXPECT_TRUE(StringPiece(status.error_message())
-                  .contains("All 10 retry attempts "
-                            "failed. The last failure: "
-                            "Unavailable: Failed."))
+  EXPECT_TRUE(str_util::StrContains(
+      status.error_message(),
+      "All 10 retry attempts failed. The last failure: Unavailable: Failed."))
       << status;
 
   EXPECT_EQ(10, requested_delays.size());
diff --git a/tensorflow/core/platform/cpu_info.h b/tensorflow/core/platform/cpu_info.h
index 331f3e525169a93fa01739eefdf2dc6c588980a0..bb77650e26e7e7bd3391d4d0b5131c5a386a6dc6 100644
--- a/tensorflow/core/platform/cpu_info.h
+++ b/tensorflow/core/platform/cpu_info.h
@@ -18,7 +18,7 @@ limitations under the License.
 
 #include <string>
 
-#if defined(PLATFORM_WINDOWS)
+#if defined(_MSC_VER)
 #include "tensorflow/core/platform/windows/cpu_info.h"
 #endif
 
diff --git a/tensorflow/core/platform/default/build_config.bzl b/tensorflow/core/platform/default/build_config.bzl
index e01e076bcf279206ea821d2777a3d44755668f02..4cfa25bf66eba51e4c8646f8596c2d7b9f9a4b53 100644
--- a/tensorflow/core/platform/default/build_config.bzl
+++ b/tensorflow/core/platform/default/build_config.bzl
@@ -122,6 +122,7 @@ def cc_proto_library(
     protoc="@protobuf_archive//:protoc",
     internal_bootstrap_hack=False,
     use_grpc_plugin=False,
+    use_grpc_namespace=False,
     default_header=False,
     **kargs):
   """Bazel rule to create a C++ protobuf library from proto source files.
@@ -169,8 +170,11 @@ def cc_proto_library(
     return
 
   grpc_cpp_plugin = None
+  plugin_options = []
   if use_grpc_plugin:
     grpc_cpp_plugin = "//external:grpc_cpp_plugin"
+    if use_grpc_namespace:
+      plugin_options = ["services_namespace=grpc"]
 
   gen_srcs = _proto_cc_srcs(srcs, use_grpc_plugin)
   gen_hdrs = _proto_cc_hdrs(srcs, use_grpc_plugin)
@@ -184,6 +188,7 @@ def cc_proto_library(
       protoc=protoc,
       plugin=grpc_cpp_plugin,
       plugin_language="grpc",
+      plugin_options=plugin_options,
       gen_cc=1,
       outs=outs,
       visibility=["//visibility:public"],
diff --git a/tensorflow/core/platform/default/build_config/BUILD b/tensorflow/core/platform/default/build_config/BUILD
index 2cd607edbe554cd18d21626e258176e8570282ed..44a89c3a96ad293be76d709ac21ac6bafe6afcbd 100644
--- a/tensorflow/core/platform/default/build_config/BUILD
+++ b/tensorflow/core/platform/default/build_config/BUILD
@@ -113,6 +113,12 @@ cc_library(
     copts = tf_copts(),
 )
 
+cc_library(
+    name = "base",
+    srcs = [],
+    copts = tf_copts(),
+)
+
 cc_library(
     name = "platformlib",
     copts = tf_copts(),
@@ -128,6 +134,11 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "stacktrace",
+    srcs = [],
+)
+
 cc_library(
     name = "gif",
     copts = tf_copts(),
@@ -160,6 +171,13 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "test_lite_main",
+    testonly = 1,
+    linkstatic = 1,
+    deps = [],
+)
+
 cc_library(
     name = "test_main",
     testonly = 1,
@@ -218,15 +236,3 @@ alias(
     actual = ":mobile_srcs",
     visibility = ["//visibility:public"],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/core/platform/default/tracing_impl.h b/tensorflow/core/platform/default/tracing_impl.h
index e813e4a17aca918582e8346d4bf2655724a143b3..78345488969ee3284f31504e2a8bfaf01bf19d14 100644
--- a/tensorflow/core/platform/default/tracing_impl.h
+++ b/tensorflow/core/platform/default/tracing_impl.h
@@ -22,7 +22,6 @@ limitations under the License.
 // IWYU pragma: friend third_party/tensorflow/core/platform/tracing.h
 
 #include "tensorflow/core/lib/core/status.h"
-#include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/lib/random/random.h"
 #include "tensorflow/core/platform/tracing.h"
 
diff --git a/tensorflow/core/platform/denormal.cc b/tensorflow/core/platform/denormal.cc
index 3631d9ddf99430372c11403dba56c14331a3db24..82cbc43b4f83f9a02d09700fdb9b1546e1d45d49 100644
--- a/tensorflow/core/platform/denormal.cc
+++ b/tensorflow/core/platform/denormal.cc
@@ -13,8 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include <tuple>
+
 #include "tensorflow/core/platform/denormal.h"
-#include "third_party/eigen3/Eigen/Core"
 #include "tensorflow/core/platform/cpu_info.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/platform.h"
diff --git a/tensorflow/core/platform/env_test.cc b/tensorflow/core/platform/env_test.cc
index 9a6ff48069d939ae0888c245ff3623ca1ef9000b..a70a417e6a2f3ade644f5d7377adf5ebc52d77e5 100644
--- a/tensorflow/core/platform/env_test.cc
+++ b/tensorflow/core/platform/env_test.cc
@@ -373,9 +373,8 @@ TEST_F(DefaultEnvTest, CreateUniqueFileName) {
 
   EXPECT_TRUE(env->CreateUniqueFileName(&filename, suffix));
 
-  StringPiece str(filename);
-  EXPECT_TRUE(str.starts_with(prefix));
-  EXPECT_TRUE(str.ends_with(suffix));
+  EXPECT_TRUE(str_util::StartsWith(filename, prefix));
+  EXPECT_TRUE(str_util::EndsWith(filename, suffix));
 }
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/platform/file_system.cc b/tensorflow/core/platform/file_system.cc
index 5bc8606e28057d65522f23c58266494ec7d44c0b..b55e94d552ed3a66bd05930702acd9633cd02f81 100644
--- a/tensorflow/core/platform/file_system.cc
+++ b/tensorflow/core/platform/file_system.cc
@@ -18,7 +18,6 @@ limitations under the License.
 #include <deque>
 
 #include "tensorflow/core/lib/core/errors.h"
-#include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/lib/io/path.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
@@ -28,28 +27,6 @@ limitations under the License.
 
 namespace tensorflow {
 
-namespace {
-
-constexpr int kNumThreads = 8;
-
-// Run a function in parallel using a ThreadPool, but skip the ThreadPool
-// on the iOS platform due to its problems with more than a few threads.
-void ForEach(int first, int last, const std::function<void(int)>& f) {
-#if TARGET_OS_IPHONE
-  for (int i = first; i < last; i++) {
-    f(i);
-  }
-#else
-  int num_threads = std::min(kNumThreads, last - first);
-  thread::ThreadPool threads(Env::Default(), "ForEach", num_threads);
-  for (int i = first; i < last; i++) {
-    threads.Schedule([f, i] { f(i); });
-  }
-#endif
-}
-
-}  // anonymous namespace
-
 FileSystem::~FileSystem() {}
 
 string FileSystem::TranslateName(const string& name) const {
@@ -94,76 +71,6 @@ bool FileSystem::FilesExist(const std::vector<string>& files,
   return result;
 }
 
-Status FileSystem::GetMatchingPaths(const string& pattern,
-                                    std::vector<string>* results) {
-  results->clear();
-  // Find the fixed prefix by looking for the first wildcard.
-  string fixed_prefix = pattern.substr(0, pattern.find_first_of("*?[\\"));
-  string eval_pattern = pattern;
-  std::vector<string> all_files;
-  string dir = io::Dirname(fixed_prefix).ToString();
-  // If dir is empty then we need to fix up fixed_prefix and eval_pattern to
-  // include . as the top level directory.
-  if (dir.empty()) {
-    dir = ".";
-    fixed_prefix = io::JoinPath(dir, fixed_prefix);
-    eval_pattern = io::JoinPath(dir, pattern);
-  }
-
-  // Setup a BFS to explore everything under dir.
-  std::deque<string> dir_q;
-  dir_q.push_back(dir);
-  Status ret;  // Status to return.
-  // children_dir_status holds is_dir status for children. It can have three
-  // possible values: OK for true; FAILED_PRECONDITION for false; CANCELLED
-  // if we don't calculate IsDirectory (we might do that because there isn't
-  // any point in exploring that child path).
-  std::vector<Status> children_dir_status;
-  while (!dir_q.empty()) {
-    string current_dir = dir_q.front();
-    dir_q.pop_front();
-    std::vector<string> children;
-    Status s = GetChildren(current_dir, &children);
-    ret.Update(s);
-    if (children.empty()) continue;
-    // This IsDirectory call can be expensive for some FS. Parallelizing it.
-    children_dir_status.resize(children.size());
-    ForEach(0, children.size(),
-            [this, &current_dir, &children, &fixed_prefix,
-             &children_dir_status](int i) {
-              const string child_path = io::JoinPath(current_dir, children[i]);
-              // In case the child_path doesn't start with the fixed_prefix then
-              // we don't need to explore this path.
-              if (!StringPiece(child_path).starts_with(fixed_prefix)) {
-                children_dir_status[i] = Status(tensorflow::error::CANCELLED,
-                                                "Operation not needed");
-              } else {
-                children_dir_status[i] = IsDirectory(child_path);
-              }
-            });
-    for (int i = 0; i < children.size(); ++i) {
-      const string child_path = io::JoinPath(current_dir, children[i]);
-      // If the IsDirectory call was cancelled we bail.
-      if (children_dir_status[i].code() == tensorflow::error::CANCELLED) {
-        continue;
-      }
-      // If the child is a directory add it to the queue.
-      if (children_dir_status[i].ok()) {
-        dir_q.push_back(child_path);
-      }
-      all_files.push_back(child_path);
-    }
-  }
-
-  // Match all obtained files to the input pattern.
-  for (const auto& f : all_files) {
-    if (Env::Default()->MatchPath(f, eval_pattern)) {
-      results->push_back(f);
-    }
-  }
-  return ret;
-}
-
 Status FileSystem::DeleteRecursively(const string& dirname,
                                      int64* undeleted_files,
                                      int64* undeleted_dirs) {
@@ -241,7 +148,7 @@ Status FileSystem::RecursivelyCreateDir(const string& dirname) {
       return status;
     }
     // Basename returns "" for / ending dirs.
-    if (!remaining_dir.ends_with("/")) {
+    if (!str_util::EndsWith(remaining_dir, "/")) {
       sub_dirs.push_back(io::Basename(remaining_dir));
     }
     remaining_dir = io::Dirname(remaining_dir);
diff --git a/tensorflow/core/platform/file_system.h b/tensorflow/core/platform/file_system.h
index 8f99766e151d2d456c9fcd4baa1282f36d586b7f..077b1d79cfb259c6d497f37e7f06d0da189f3ff5 100644
--- a/tensorflow/core/platform/file_system.h
+++ b/tensorflow/core/platform/file_system.h
@@ -138,10 +138,8 @@ class FileSystem {
   ///  * OK - no errors
   ///  * UNIMPLEMENTED - Some underlying functions (like GetChildren) are not
   ///                    implemented
-  /// The default implementation uses a combination of GetChildren, MatchPath
-  /// and IsDirectory.
   virtual Status GetMatchingPaths(const string& pattern,
-                                  std::vector<string>* results);
+                                  std::vector<string>* results) = 0;
 
   /// \brief Obtains statistics for the given path.
   virtual Status Stat(const string& fname, FileStatistics* stat) = 0;
diff --git a/tensorflow/core/platform/file_system_helper.cc b/tensorflow/core/platform/file_system_helper.cc
new file mode 100644
index 0000000000000000000000000000000000000000..22c5057281959fa1584828d927387e8094bfa50a
--- /dev/null
+++ b/tensorflow/core/platform/file_system_helper.cc
@@ -0,0 +1,126 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/platform/file_system_helper.h"
+
+#include <deque>
+#include <string>
+#include <vector>
+
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/file_system.h"
+#include "tensorflow/core/platform/platform.h"
+
+namespace tensorflow {
+namespace internal {
+
+namespace {
+
+constexpr int kNumThreads = 8;
+
+// Run a function in parallel using a ThreadPool, but skip the ThreadPool
+// on the iOS platform due to its problems with more than a few threads.
+void ForEach(int first, int last, const std::function<void(int)>& f) {
+#if TARGET_OS_IPHONE
+  for (int i = first; i < last; i++) {
+    f(i);
+  }
+#else
+  int num_threads = std::min(kNumThreads, last - first);
+  thread::ThreadPool threads(Env::Default(), "ForEach", num_threads);
+  for (int i = first; i < last; i++) {
+    threads.Schedule([f, i] { f(i); });
+  }
+#endif
+}
+
+}  // namespace
+
+Status GetMatchingPaths(FileSystem* fs, Env* env, const string& pattern,
+                        std::vector<string>* results) {
+  results->clear();
+  // Find the fixed prefix by looking for the first wildcard.
+  string fixed_prefix = pattern.substr(0, pattern.find_first_of("*?[\\"));
+  string eval_pattern = pattern;
+  std::vector<string> all_files;
+  string dir = io::Dirname(fixed_prefix).ToString();
+  // If dir is empty then we need to fix up fixed_prefix and eval_pattern to
+  // include . as the top level directory.
+  if (dir.empty()) {
+    dir = ".";
+    fixed_prefix = io::JoinPath(dir, fixed_prefix);
+    eval_pattern = io::JoinPath(dir, pattern);
+  }
+
+  // Setup a BFS to explore everything under dir.
+  std::deque<string> dir_q;
+  dir_q.push_back(dir);
+  Status ret;  // Status to return.
+  // children_dir_status holds is_dir status for children. It can have three
+  // possible values: OK for true; FAILED_PRECONDITION for false; CANCELLED
+  // if we don't calculate IsDirectory (we might do that because there isn't
+  // any point in exploring that child path).
+  std::vector<Status> children_dir_status;
+  while (!dir_q.empty()) {
+    string current_dir = dir_q.front();
+    dir_q.pop_front();
+    std::vector<string> children;
+    Status s = fs->GetChildren(current_dir, &children);
+    ret.Update(s);
+    if (children.empty()) continue;
+    // This IsDirectory call can be expensive for some FS. Parallelizing it.
+    children_dir_status.resize(children.size());
+    ForEach(0, children.size(),
+            [fs, &current_dir, &children, &fixed_prefix,
+             &children_dir_status](int i) {
+              const string child_path = io::JoinPath(current_dir, children[i]);
+              // In case the child_path doesn't start with the fixed_prefix then
+              // we don't need to explore this path.
+              if (!str_util::StartsWith(child_path, fixed_prefix)) {
+                children_dir_status[i] = Status(tensorflow::error::CANCELLED,
+                                                "Operation not needed");
+              } else {
+                children_dir_status[i] = fs->IsDirectory(child_path);
+              }
+            });
+    for (int i = 0; i < children.size(); ++i) {
+      const string child_path = io::JoinPath(current_dir, children[i]);
+      // If the IsDirectory call was cancelled we bail.
+      if (children_dir_status[i].code() == tensorflow::error::CANCELLED) {
+        continue;
+      }
+      // If the child is a directory add it to the queue.
+      if (children_dir_status[i].ok()) {
+        dir_q.push_back(child_path);
+      }
+      all_files.push_back(child_path);
+    }
+  }
+
+  // Match all obtained files to the input pattern.
+  for (const auto& f : all_files) {
+    if (env->MatchPath(f, eval_pattern)) {
+      results->push_back(f);
+    }
+  }
+  return ret;
+}
+
+}  // namespace internal
+}  // namespace tensorflow
diff --git a/tensorflow/core/platform/file_system_helper.h b/tensorflow/core/platform/file_system_helper.h
new file mode 100644
index 0000000000000000000000000000000000000000..8d812b0e38150f9190f69fd279561944d42174c6
--- /dev/null
+++ b/tensorflow/core/platform/file_system_helper.h
@@ -0,0 +1,51 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_PLATFORM_FILE_SYSTEM_HELPER_H_
+#define TENSORFLOW_CORE_PLATFORM_FILE_SYSTEM_HELPER_H_
+
+#include <string>
+#include <vector>
+
+#include "tensorflow/core/lib/core/status.h"
+
+namespace tensorflow {
+
+class FileSystem;
+class Env;
+
+namespace internal {
+
+// Given a pattern, stores in 'results' the set of paths (in the given file
+// system) that match that pattern.
+//
+// This helper may be used by implementations of FileSystem::GetMatchingPaths()
+// in order to provide parallel scanning of subdirectories (except on iOS).
+//
+// Arguments:
+//   fs: may not be null and will be used to identify directories and list
+//       their contents.
+//   env: may not be null and will be used to check if a match has been found.
+//   pattern: see FileSystem::GetMatchingPaths() for details.
+//   results: will be cleared and may not be null.
+//
+// Returns an error status if any call to 'fs' failed.
+Status GetMatchingPaths(FileSystem* fs, Env* env, const string& pattern,
+                        std::vector<string>* results);
+
+}  // namespace internal
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_PLATFORM_FILE_SYSTEM_HELPER_H_
diff --git a/tensorflow/core/platform/file_system_test.cc b/tensorflow/core/platform/file_system_test.cc
index e07aad55cb52d9a08f13dad74da6d7fdd72eda4e..f261b8f5761506fc5d706c9646c36eef912fc18f 100644
--- a/tensorflow/core/platform/file_system_test.cc
+++ b/tensorflow/core/platform/file_system_test.cc
@@ -124,7 +124,7 @@ class InterPlanetaryFileSystem : public NullFileSystem {
     io::ParseURI(name, &scheme, &host, &path);
     ASSERT_EQ(scheme, "ipfs");
     ASSERT_EQ(host, "solarsystem");
-    path.Consume("/");
+    str_util::ConsumePrefix(&path, "/");
     *parsed_path = path.ToString();
   }
 
@@ -160,7 +160,8 @@ string Match(InterPlanetaryFileSystem* ipfs, const string& suffix_pattern) {
     std::sort(results.begin(), results.end());
     for (const string& result : results) {
       StringPiece trimmed_result(result);
-      EXPECT_TRUE(trimmed_result.Consume(strings::StrCat(kPrefix, "/")));
+      EXPECT_TRUE(str_util::ConsumePrefix(&trimmed_result,
+                                          strings::StrCat(kPrefix, "/")));
       trimmed_results.push_back(trimmed_result);
     }
     return str_util::Join(trimmed_results, ",");
diff --git a/tensorflow/core/platform/hadoop/BUILD b/tensorflow/core/platform/hadoop/BUILD
index 774a439855e49904b29f1e0c3d82196b1b9afb5d..7c38c399bd7a4645b3556e653110c19b8b9ab9ff 100644
--- a/tensorflow/core/platform/hadoop/BUILD
+++ b/tensorflow/core/platform/hadoop/BUILD
@@ -12,18 +12,6 @@ load(
     "tf_cc_test",
 )
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 cc_library(
     name = "hadoop_file_system",
     srcs = ["hadoop_file_system.cc"],
diff --git a/tensorflow/core/platform/hadoop/hadoop_file_system.cc b/tensorflow/core/platform/hadoop/hadoop_file_system.cc
index 74863293a32451e8881c93de468539b913169aaa..9a71fbe2b785be2a47d413962d3996b19e39fecc 100644
--- a/tensorflow/core/platform/hadoop/hadoop_file_system.cc
+++ b/tensorflow/core/platform/hadoop/hadoop_file_system.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/file_system.h"
+#include "tensorflow/core/platform/file_system_helper.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/posix/error.h"
@@ -396,6 +397,11 @@ Status HadoopFileSystem::GetChildren(const string& dir,
   return Status::OK();
 }
 
+Status HadoopFileSystem::GetMatchingPaths(const string& pattern,
+                                          std::vector<string>* results) {
+  return internal::GetMatchingPaths(this, Env::Default(), pattern, results);
+}
+
 Status HadoopFileSystem::DeleteFile(const string& fname) {
   hdfsFS fs = nullptr;
   TF_RETURN_IF_ERROR(Connect(fname, &fs));
diff --git a/tensorflow/core/platform/hadoop/hadoop_file_system.h b/tensorflow/core/platform/hadoop/hadoop_file_system.h
index 5f2b222622cf01033af117f92d49458eeae00e6f..6af7a698ffe91d79d1460a4e335ddd7bf8727a3c 100644
--- a/tensorflow/core/platform/hadoop/hadoop_file_system.h
+++ b/tensorflow/core/platform/hadoop/hadoop_file_system.h
@@ -49,6 +49,9 @@ class HadoopFileSystem : public FileSystem {
 
   Status GetChildren(const string& dir, std::vector<string>* result) override;
 
+  Status GetMatchingPaths(const string& pattern,
+                          std::vector<string>* results) override;
+
   Status DeleteFile(const string& fname) override;
 
   Status CreateDir(const string& name) override;
diff --git a/tensorflow/core/platform/hadoop/hadoop_file_system_test.cc b/tensorflow/core/platform/hadoop/hadoop_file_system_test.cc
index 6ba2f04d0f839cedee9d75d8ed960a50668e541c..b207d3474977361777383299a2a603a9f21481d4 100644
--- a/tensorflow/core/platform/hadoop/hadoop_file_system_test.cc
+++ b/tensorflow/core/platform/hadoop/hadoop_file_system_test.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/gtl/stl_util.h"
 #include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/file_system.h"
 #include "tensorflow/core/platform/test.h"
 
@@ -197,7 +198,7 @@ TEST_F(HadoopFileSystemTest, WriteWhileReading) {
   // Skip the test if we're not testing on HDFS. Hadoop's local filesystem
   // implementation makes no guarantees that writable files are readable while
   // being written.
-  if (!StringPiece(fname).starts_with("hdfs://")) {
+  if (!str_util::StartsWith(fname, "hdfs://")) {
     return;
   }
 
diff --git a/tensorflow/core/platform/macros.h b/tensorflow/core/platform/macros.h
index 6119edfd5a63d1aa4e81bb91d95736ed2835c478..37239681755c04152d3ae4a91ab7ec73a89f522b 100644
--- a/tensorflow/core/platform/macros.h
+++ b/tensorflow/core/platform/macros.h
@@ -31,13 +31,14 @@ limitations under the License.
   __attribute__((__format__(__printf__, string_index, first_to_check)))
 #define TF_SCANF_ATTRIBUTE(string_index, first_to_check) \
   __attribute__((__format__(__scanf__, string_index, first_to_check)))
-#elif defined(COMPILER_MSVC)
+#elif defined(_MSC_VER)
 // Non-GCC equivalents
 #define TF_ATTRIBUTE_NORETURN __declspec(noreturn)
-#define TF_ATTRIBUTE_ALWAYS_INLINE
+#define TF_ATTRIBUTE_ALWAYS_INLINE __forceinline
 #define TF_ATTRIBUTE_NOINLINE
 #define TF_ATTRIBUTE_UNUSED
 #define TF_ATTRIBUTE_COLD
+#define TF_ATTRIBUTE_WEAK
 #define TF_MUST_USE_RESULT
 #define TF_PACKED
 #define TF_PRINTF_ATTRIBUTE(string_index, first_to_check)
@@ -57,7 +58,7 @@ limitations under the License.
 #endif
 
 // Control visiblity outside .so
-#if defined(COMPILER_MSVC)
+#if defined(_WIN32)
 #ifdef TF_COMPILE_LIBRARY
 #define TF_EXPORT __declspec(dllexport)
 #else
@@ -65,13 +66,20 @@ limitations under the License.
 #endif  // TF_COMPILE_LIBRARY
 #else
 #define TF_EXPORT __attribute__((visibility("default")))
-#endif  // COMPILER_MSVC
+#endif  // _WIN32
+
+#ifdef __has_builtin
+#define TF_HAS_BUILTIN(x) __has_builtin(x)
+#else
+#define TF_HAS_BUILTIN(x) 0
+#endif
 
-// GCC can be told that a certain branch is not likely to be taken (for
-// instance, a CHECK failure), and use that information in static analysis.
-// Giving it this information can help it optimize for the common case in
-// the absence of better information (ie. -fprofile-arcs).
-#if defined(COMPILER_GCC3)
+// Compilers can be told that a certain branch is not likely to be taken
+// (for instance, a CHECK failure), and use that information in static
+// analysis. Giving it this information can help it optimize for the
+// common case in the absence of better information (ie.
+// -fprofile-arcs).
+#if TF_HAS_BUILTIN(__builtin_expect) || (defined(__GNUC__) && __GNUC__ >= 3)
 #define TF_PREDICT_FALSE(x) (__builtin_expect(x, 0))
 #define TF_PREDICT_TRUE(x) (__builtin_expect(!!(x), 1))
 #else
diff --git a/tensorflow/core/platform/mem.h b/tensorflow/core/platform/mem.h
index 7bb9fc264fbf6ee3f20e9b2687c9ba52b6171ec4..fca3a2332d15f986d637f7d3a5eb91069dfce1a0 100644
--- a/tensorflow/core/platform/mem.h
+++ b/tensorflow/core/platform/mem.h
@@ -59,7 +59,7 @@ void MallocExtension_ReleaseToSystem(std::size_t num_bytes);
 // routine, this routine returns 0.
 std::size_t MallocExtension_GetAllocatedSize(const void* p);
 
-// Returns the amount of RAM available in kB, or INT64_MAX if unknown.
+// Returns the amount of RAM available in bytes, or INT64_MAX if unknown.
 int64 AvailableRam();
 
 }  // namespace port
diff --git a/tensorflow/core/platform/null_file_system.h b/tensorflow/core/platform/null_file_system.h
index 008e6d54d02c8712414ff8374acbf18bb5155013..420abc1ada81456e8883e48dd693614f75b81116 100644
--- a/tensorflow/core/platform/null_file_system.h
+++ b/tensorflow/core/platform/null_file_system.h
@@ -22,6 +22,7 @@ limitations under the License.
 
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/file_system.h"
+#include "tensorflow/core/platform/file_system_helper.h"
 
 namespace tensorflow {
 
@@ -65,6 +66,11 @@ class NullFileSystem : public FileSystem {
     return errors::Unimplemented("GetChildren unimplemented");
   }
 
+  Status GetMatchingPaths(const string& pattern,
+                          std::vector<string>* results) override {
+    return internal::GetMatchingPaths(this, Env::Default(), pattern, results);
+  }
+
   Status DeleteFile(const string& fname) override {
     return errors::Unimplemented("DeleteFile unimplemented");
   }
diff --git a/tensorflow/core/platform/posix/port.cc b/tensorflow/core/platform/posix/port.cc
index 494acde803a778fb839a7444e4d5ac2fd094eb09..8e316472fe2ea6f7c3187f0a5f98052c20f5ce6b 100644
--- a/tensorflow/core/platform/posix/port.cc
+++ b/tensorflow/core/platform/posix/port.cc
@@ -177,7 +177,7 @@ int64 AvailableRam() {
   struct sysinfo info;
   int err = sysinfo(&info);
   if (err == 0) {
-    return info.freeram / 1024;
+    return info.freeram;
   }
 #endif
   return INT64_MAX;
diff --git a/tensorflow/core/platform/posix/posix_file_system.cc b/tensorflow/core/platform/posix/posix_file_system.cc
index 9a8021565cbcc2a172a23439d2a7139108c0df39..47bfa020cef991e6a2e9e9de283318b287788454 100644
--- a/tensorflow/core/platform/posix/posix_file_system.cc
+++ b/tensorflow/core/platform/posix/posix_file_system.cc
@@ -31,6 +31,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/file_system_helper.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/posix/error.h"
 #include "tensorflow/core/platform/posix/posix_file_system.h"
@@ -225,6 +226,11 @@ Status PosixFileSystem::GetChildren(const string& dir,
   return Status::OK();
 }
 
+Status PosixFileSystem::GetMatchingPaths(const string& pattern,
+                                         std::vector<string>* results) {
+  return internal::GetMatchingPaths(this, Env::Default(), pattern, results);
+}
+
 Status PosixFileSystem::DeleteFile(const string& fname) {
   Status result;
   if (unlink(TranslateName(fname).c_str()) != 0) {
diff --git a/tensorflow/core/platform/posix/posix_file_system.h b/tensorflow/core/platform/posix/posix_file_system.h
index 98ffa43b8acf8a10a4ace1bf11cc7d6f5e8a95a7..e8898d0a97f50e29d1216bf2d9d340711cb29754 100644
--- a/tensorflow/core/platform/posix/posix_file_system.h
+++ b/tensorflow/core/platform/posix/posix_file_system.h
@@ -47,6 +47,9 @@ class PosixFileSystem : public FileSystem {
 
   Status Stat(const string& fname, FileStatistics* stats) override;
 
+  Status GetMatchingPaths(const string& pattern,
+                          std::vector<string>* results) override;
+
   Status DeleteFile(const string& fname) override;
 
   Status CreateDir(const string& name) override;
diff --git a/tensorflow/core/platform/s3/BUILD b/tensorflow/core/platform/s3/BUILD
index 3a0ad2e9bd09211aa452f8b39b621343a113785d..21038cfeb15be052f7460151bacaa15544c8d77c 100644
--- a/tensorflow/core/platform/s3/BUILD
+++ b/tensorflow/core/platform/s3/BUILD
@@ -13,18 +13,6 @@ load(
     "tf_cc_test",
 )
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 tf_cc_binary(
     name = "s3_file_system.so",
     srcs = [
diff --git a/tensorflow/core/platform/s3/s3_file_system.cc b/tensorflow/core/platform/s3/s3_file_system.cc
index 301fcb9dbf653d29f6ac5321332c8764adaad681..6da679dc7523f52724cf992e7ba70351de3cf393 100644
--- a/tensorflow/core/platform/s3/s3_file_system.cc
+++ b/tensorflow/core/platform/s3/s3_file_system.cc
@@ -15,6 +15,7 @@ limitations under the License.
 #include "tensorflow/core/platform/s3/s3_file_system.h"
 #include "tensorflow/core/lib/io/path.h"
 #include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/platform/file_system_helper.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/s3/aws_logging.h"
 #include "tensorflow/core/platform/s3/s3_crypto.h"
@@ -155,7 +156,7 @@ Status ParseS3Path(const string& fname, bool empty_object_ok, string* bucket,
     return errors::InvalidArgument("S3 path doesn't contain a bucket name: ",
                                    fname);
   }
-  objectp.Consume("/");
+  str_util::ConsumePrefix(&objectp, "/");
   *object = objectp.ToString();
   if (!empty_object_ok && object->empty()) {
     return errors::InvalidArgument("S3 path doesn't contain an object name: ",
@@ -497,6 +498,11 @@ Status S3FileSystem::Stat(const string& fname, FileStatistics* stats) {
   return Status::OK();
 }
 
+Status S3FileSystem::GetMatchingPaths(const string& pattern,
+                                      std::vector<string>* results) {
+  return internal::GetMatchingPaths(this, Env::Default(), pattern, results);
+}
+
 Status S3FileSystem::DeleteFile(const string& fname) {
   string bucket, object;
   TF_RETURN_IF_ERROR(ParseS3Path(fname, false, &bucket, &object));
diff --git a/tensorflow/core/platform/s3/s3_file_system.h b/tensorflow/core/platform/s3/s3_file_system.h
index 31264be621d93c1efb68f7b0b49e28cb65b05de1..5d0565b378198a39f80940c0627f7638e92691fa 100644
--- a/tensorflow/core/platform/s3/s3_file_system.h
+++ b/tensorflow/core/platform/s3/s3_file_system.h
@@ -46,6 +46,9 @@ class S3FileSystem : public FileSystem {
 
   Status Stat(const string& fname, FileStatistics* stat) override;
 
+  Status GetMatchingPaths(const string& pattern,
+                          std::vector<string>* results) override;
+
   Status DeleteFile(const string& fname) override;
 
   Status CreateDir(const string& name) override;
diff --git a/tensorflow/core/platform/tracing.h b/tensorflow/core/platform/tracing.h
index 8f7bff1bb020ee501c982d5d0761d36537993e63..3c6e7b0db5995136a7cff468288cca0bd32c2349 100644
--- a/tensorflow/core/platform/tracing.h
+++ b/tensorflow/core/platform/tracing.h
@@ -103,7 +103,9 @@ class Tracing {
   friend class ScopedAnnotation;
   friend class TraceMe;
 
-  static std::atomic<Tracing::Engine*> tracing_engine_;
+  // TODO: TF_EXPORT is for building //tensorflow/contrib/data:_dataset_ops.so
+  //       on Windows. Figure out a way to remove TF_EXPORT here.
+  TF_EXPORT static std::atomic<Tracing::Engine*> tracing_engine_;
   static Tracing::Engine* engine() {
     return tracing_engine_.load(std::memory_order_acquire);
   }
diff --git a/tensorflow/core/platform/windows/port.cc b/tensorflow/core/platform/windows/port.cc
index f3b27ea394d04770b612752328d5d571e6521cc6..174f41a993f8010112f316dc9ba220f6ecc2804e 100644
--- a/tensorflow/core/platform/windows/port.cc
+++ b/tensorflow/core/platform/windows/port.cc
@@ -166,7 +166,7 @@ int64 AvailableRam() {
   MEMORYSTATUSEX statex;
   statex.dwLength = sizeof(statex);
   if (GlobalMemoryStatusEx(&statex)) {
-    return statex.ullAvailPhys / 1024;
+    return statex.ullAvailPhys;
   }
   return INT64_MAX;
 }
diff --git a/tensorflow/core/platform/windows/windows_file_system.cc b/tensorflow/core/platform/windows/windows_file_system.cc
index 682e46e0fcd0322ed34fa94d0ee5516cf9194a3b..dc2efbeaf5e3eabc6077df4c1c126762d36ba8a6 100644
--- a/tensorflow/core/platform/windows/windows_file_system.cc
+++ b/tensorflow/core/platform/windows/windows_file_system.cc
@@ -28,6 +28,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/error_codes.pb.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/file_system_helper.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/posix/error.h"
 #include "tensorflow/core/platform/windows/error.h"
@@ -494,7 +495,8 @@ Status WindowsFileSystem::GetMatchingPaths(const string& pattern,
   // but no code appears to rely on this behavior.
   string converted_pattern(pattern);
   std::replace(converted_pattern.begin(), converted_pattern.end(), '\\', '/');
-  TF_RETURN_IF_ERROR(FileSystem::GetMatchingPaths(converted_pattern, results));
+  TF_RETURN_IF_ERROR(internal::GetMatchingPaths(this, Env::Default(),
+                                                converted_pattern, results));
   for (string& result : *results) {
     std::replace(result.begin(), result.end(), '/', '\\');
   }
diff --git a/tensorflow/core/profiler/BUILD b/tensorflow/core/profiler/BUILD
index 5ce6f1046d3a812039106520d4883622c4df485b..3d3203cdaa80fa4b59063e256146dd47a9303279 100644
--- a/tensorflow/core/profiler/BUILD
+++ b/tensorflow/core/profiler/BUILD
@@ -4,21 +4,6 @@ package(
 
 licenses(["notice"])  # Apache 2.0
 
-# -----------------------------------------------------------------------------
-# Google-internal targets.  These must be at the end for syncrepo.
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 load("//tensorflow:tensorflow.bzl", "tf_cc_binary")
 load("//tensorflow/core:platform/default/build_config.bzl", "tf_proto_library")
 load("//tensorflow/core:platform/default/build_config.bzl", "tf_additional_all_protos")
diff --git a/tensorflow/core/profiler/g3doc/profile_model_architecture.md b/tensorflow/core/profiler/g3doc/profile_model_architecture.md
index 61bb66bd21b336074475142ee564414ee154cafc..4ccd43ce68379db40b5bba0c3f2969d899b39d08 100644
--- a/tensorflow/core/profiler/g3doc/profile_model_architecture.md
+++ b/tensorflow/core/profiler/g3doc/profile_model_architecture.md
@@ -45,22 +45,22 @@ sys.stdout.write('total_params: %d\n' % param_stats.total_parameters)
 
 For an operation to have float operation statistics:
 
-* It must have `RegisterStatistics('flops')` defined in TensorFlow. tfprof
-use the definition to calculate float operations. Contributes are welcome.
-
-* It must have known "shape" information for RegisterStatistics('flops')
-to calculate the statistics. It is suggested to pass in `-run_meta_path` if
-shape is only known during runtime. tfprof can fill in the missing shape with
-the runtime shape information from RunMetadata.
-Hence, it is suggested to use `-account_displayed_op_only`
-option so that you know the statistics are only for the operations printed out.
-
-* If no RunMetadata provided, tfprof count float_ops of each graph node once,
-even if it is defined in tf.while_loop. This is because tfprof doesn't know
-how many times are run statically. If RunMetadata provided, tfprof calculate
-float_ops as float_ops * run_count.
-
-
+*   It must have `RegisterStatistics('flops')` defined in TensorFlow. tfprof
+    uses the definition to calculate float operations. Contributions are
+    welcomed.
+
+*   It must have known "shape" information for RegisterStatistics('flops') to
+    calculate the statistics. It is suggested to pass in `-run_meta_path` if
+    shape is only known during runtime. tfprof can fill in the missing shape
+    with the runtime shape information from RunMetadata. Hence, it is suggested
+    to use `-account_displayed_op_only` option so that you know the statistics
+    are only for the operations printed out.
+
+*   If no RunMetadata is provided, tfprof counts float_ops of each graph node
+    once, even if it is defined in a tf.while_loop. This is because tfprof
+    doesn't know statically how many times each graph node is run. If
+    RunMetadata is provided, tfprof calculates float_ops as float_ops *
+    run_count.
 
 ```python
 # To profile float opertions in commandline, you need to pass --graph_path
diff --git a/tensorflow/core/profiler/internal/BUILD b/tensorflow/core/profiler/internal/BUILD
index 05a798bff80a0775e5170bf8f428d9e88d8060b3..8dcfde9a2adbd3a1774bce8506a84f80ca099c34 100644
--- a/tensorflow/core/profiler/internal/BUILD
+++ b/tensorflow/core/profiler/internal/BUILD
@@ -365,17 +365,3 @@ cc_library(
         "//tensorflow/core:regexp_internal",
     ],
 )
-# -----------------------------------------------------------------------------
-# Google-internal targets.  These must be at the end for syncrepo.
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/core/profiler/internal/advisor/BUILD b/tensorflow/core/profiler/internal/advisor/BUILD
index 40cfd1e12e609de0f70b12b2cf98ef1086b4d024..1fedb05ae319176886cb0ff0409ea6685df76a4c 100644
--- a/tensorflow/core/profiler/internal/advisor/BUILD
+++ b/tensorflow/core/profiler/internal/advisor/BUILD
@@ -73,18 +73,3 @@ tf_cc_test(
         "//tensorflow/core/profiler/internal:tfprof_tf_testlib",
     ],
 )
-
-# -----------------------------------------------------------------------------
-# Google-internal targets.  These must be at the end for syncrepo.
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/core/profiler/internal/advisor/tfprof_advisor_test.cc b/tensorflow/core/profiler/internal/advisor/tfprof_advisor_test.cc
index e968b9c97e28eeae22954102d5f0e07e09d75f7f..96b6cc30bd9b3d603eb585a05023f36fe7b816b7 100644
--- a/tensorflow/core/profiler/internal/advisor/tfprof_advisor_test.cc
+++ b/tensorflow/core/profiler/internal/advisor/tfprof_advisor_test.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/core/profiler/internal/advisor/tfprof_advisor.h"
 
 #include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/test.h"
 
@@ -82,8 +83,8 @@ TEST_F(TFProfAdvisorTest, OperationChecker) {
   (*options.mutable_checkers())[kCheckers[1]];
   AdviceProto advice = advisor_->Advise(options);
   EXPECT_EQ(advice.checkers().at(kCheckers[1]).reports_size(), 1);
-  EXPECT_TRUE(StringPiece(advice.checkers().at(kCheckers[1]).reports(0))
-                  .contains("NCHW"));
+  EXPECT_TRUE(str_util::StrContains(
+      advice.checkers().at(kCheckers[1]).reports(0), "NCHW"));
 }
 
 TEST_F(TFProfAdvisorTest, UtilizationChecker) {
@@ -91,16 +92,17 @@ TEST_F(TFProfAdvisorTest, UtilizationChecker) {
   (*options.mutable_checkers())[kCheckers[0]];
   AdviceProto advice = advisor_->Advise(options);
   EXPECT_EQ(advice.checkers().at(kCheckers[0]).reports_size(), 1);
-  EXPECT_TRUE(StringPiece(advice.checkers().at(kCheckers[0]).reports(0))
-                  .contains("low utilization"));
+  EXPECT_TRUE(str_util::StrContains(
+      advice.checkers().at(kCheckers[0]).reports(0), "low utilization"));
 }
 
 TEST_F(TFProfAdvisorTest, ExpensiveOperationChecker) {
   AdvisorOptionsProto options;
   (*options.mutable_checkers())[kCheckers[2]];
   AdviceProto advice = advisor_->Advise(options);
-  EXPECT_TRUE(StringPiece(advice.checkers().at(kCheckers[2]).reports(0))
-                  .contains("top 1 operation type: Conv2D"));
+  EXPECT_TRUE(
+      str_util::StrContains(advice.checkers().at(kCheckers[2]).reports(0),
+                            "top 1 operation type: Conv2D"));
 }
 
 }  // namespace tfprof
diff --git a/tensorflow/core/protobuf/config.proto b/tensorflow/core/protobuf/config.proto
index a3557e4721644dd2577e7b56077a4e7ef8030463..c1a0075b6468cded7e5378ddcca1a7bcff914e98 100644
--- a/tensorflow/core/protobuf/config.proto
+++ b/tensorflow/core/protobuf/config.proto
@@ -409,6 +409,17 @@ message RunMetadata {
   repeated GraphDef partition_graphs = 3;
 }
 
+// Defines a connection between two tensors in a `GraphDef`.
+message TensorConnection {
+  // A tensor name. The value of this tensor will be substituted for
+  // the tensor named in `to_tensor`.
+  string from_tensor = 1;
+
+  // A tensor name. The value of this tensor will be bound to the
+  // value of the tensor named in `from_tensor`.
+  string to_tensor = 2;
+}
+
 // Defines a subgraph in another `GraphDef` as a set of feed points and nodes
 // to be fetched or executed.
 //
@@ -429,5 +440,10 @@ message CallableOptions {
   // Options that will be applied to each run.
   RunOptions run_options = 4;
 
-  // Next: 5
+  // Tensors to be connected in the callable. Each TensorConnection denotes
+  // a pair of tensors in the graph, between which an edge will be created
+  // in the callable.
+  repeated TensorConnection tensor_connection = 5;
+
+  // Next: 6
 }
diff --git a/tensorflow/core/protobuf/master.proto b/tensorflow/core/protobuf/master.proto
index 0437cb1b83e12d83bf3b8713e2940a6d45173fb5..96c91536f7386556c4c75ef463c4f781edd0aebb 100644
--- a/tensorflow/core/protobuf/master.proto
+++ b/tensorflow/core/protobuf/master.proto
@@ -23,6 +23,7 @@ option java_package = "org.tensorflow.distruntime";
 
 import "tensorflow/core/framework/device_attributes.proto";
 import "tensorflow/core/framework/graph.proto";
+import "tensorflow/core/framework/tensor.proto";
 import "tensorflow/core/lib/core/error_codes.proto";
 import "tensorflow/core/protobuf/config.proto";
 import "tensorflow/core/protobuf/named_tensor.proto";
@@ -264,3 +265,70 @@ message ListDevicesResponse {
   repeated DeviceAttributes local_device = 1;
   repeated DeviceAttributes remote_device = 2;
 }
+
+////////////////////////////////////////////////////////////////////////////////
+//
+// MakeCallable method request/response protos.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+message MakeCallableRequest {
+  // REQUIRED: session_handle must be returned by a CreateSession call
+  // to the same master service.
+  string session_handle = 1;
+
+  // Options that define the behavior of the created callable.
+  CallableOptions options = 2;
+}
+
+message MakeCallableResponse {
+  // A handle to the created callable.
+  int64 handle = 1;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//
+// RunCallable method request/response protos.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+message RunCallableRequest {
+  // REQUIRED: session_handle must be returned by a CreateSession call
+  // to the same master service.
+  string session_handle = 1;
+  // REQUIRED: handle must be returned by a MakeCallable call to the same
+  // master service.
+  int64 handle = 2;
+
+  // Values of the tensors passed as arguments to the callable, in the order
+  // defined in the CallableOptions.feed field passed to MakeCallable.
+  repeated TensorProto feed = 3;
+}
+
+message RunCallableResponse {
+  // Values of the tensors returned by the callable, in the order defined in the
+  // CallableOptions.fetch field passed to MakeCallable.
+  repeated TensorProto fetch = 1;
+
+  // Returned metadata if requested in the options.
+  RunMetadata metadata = 2;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//
+// ReleaseCallable method request/response protos.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+message ReleaseCallableRequest {
+  // REQUIRED: session_handle must be returned by a CreateSession call
+  // to the same master service.
+  string session_handle = 1;
+
+  // REQUIRED: handle must be returned by a MakeCallable call to the same
+  // master service.
+  int64 handle = 2;
+}
+
+message ReleaseCallableResponse {
+}
diff --git a/tensorflow/core/protobuf/master_service.proto b/tensorflow/core/protobuf/master_service.proto
index 771c80562a7885e983e7e25f167e0ca56bba6cc8..1170611f37232704f7702185a3009bd1fa1e3f64 100644
--- a/tensorflow/core/protobuf/master_service.proto
+++ b/tensorflow/core/protobuf/master_service.proto
@@ -107,4 +107,13 @@ service MasterService {
   // will no longer affect fresh ones via the resources in containers listed in
   // the ResetRequest.  See ResetRequest for more details.
   rpc Reset(ResetRequest) returns (ResetResponse);
+
+  // Registers a callable for execution with RunCallable.
+  rpc MakeCallable(MakeCallableRequest) returns (MakeCallableResponse);
+
+  // Executes a callable registered with MakeCallable.
+  rpc RunCallable(RunCallableRequest) returns (RunCallableResponse);
+
+  // Frees resources associated with a callable registered with MakeCallable.
+  rpc ReleaseCallable(ReleaseCallableRequest) returns (ReleaseCallableResponse);
 }
diff --git a/tensorflow/core/protobuf/rewriter_config.proto b/tensorflow/core/protobuf/rewriter_config.proto
index bb772460b03569d7a7c5d18f1deba653d0ac4f60..9b6202e7b494a0881788375e66698269e33505d8 100644
--- a/tensorflow/core/protobuf/rewriter_config.proto
+++ b/tensorflow/core/protobuf/rewriter_config.proto
@@ -29,6 +29,14 @@ message RewriterConfig {
     AGGRESSIVE = 3;
   }
 
+  // Enum controling the number of times to run optimizers. The default is to
+  // run them once.
+  enum NumIterationsType {
+    DEFAULT_NUM_ITERS = 0;
+    ONE = 1;
+    TWO = 2;
+  }
+
   // Optimize tensor layouts (default is ON)
   // e.g. This will try to use NCHW layout on GPU which is faster.
   Toggle layout_optimizer = 1;
@@ -51,6 +59,10 @@ message RewriterConfig {
   // If true, don't remove unnecessary ops from the graph
   bool disable_model_pruning = 2;
 
+  // Controls how many times we run the optimizers in meta optimizer (default
+  // is once).
+  NumIterationsType meta_optimizer_iterations = 12;
+
   enum MemOptType {
     // The default setting (SCHEDULING and SWAPPING HEURISTICS only)
     DEFAULT_MEM_OPT = 0;
diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h
index 40eebd1db0e44ae11f4174f93706743dbeae489c..706968d34745b8d21653bcee762f8a37555b93c1 100644
--- a/tensorflow/core/public/version.h
+++ b/tensorflow/core/public/version.h
@@ -24,7 +24,7 @@ limitations under the License.
 
 // TF_VERSION_SUFFIX is non-empty for pre-releases (e.g. "-alpha", "-alpha.1",
 // "-beta", "-rc", "-rc.1")
-#define TF_VERSION_SUFFIX "-rc1"
+#define TF_VERSION_SUFFIX ""
 
 #define TF_STR_HELPER(x) #x
 #define TF_STR(x) TF_STR_HELPER(x)
diff --git a/tensorflow/core/util/command_line_flags.cc b/tensorflow/core/util/command_line_flags.cc
index 3efc703faf7b23958eb49d59fd0dd4565f090bbe..480ce94fcaeddd62c30089d09752cc4d965ebf01 100644
--- a/tensorflow/core/util/command_line_flags.cc
+++ b/tensorflow/core/util/command_line_flags.cc
@@ -17,6 +17,7 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/util/command_line_flags.h"
@@ -28,7 +29,9 @@ bool ParseStringFlag(tensorflow::StringPiece arg, tensorflow::StringPiece flag,
                      const std::function<bool(string)>& hook,
                      bool* value_parsing_ok) {
   *value_parsing_ok = true;
-  if (arg.Consume("--") && arg.Consume(flag) && arg.Consume("=")) {
+  if (str_util::ConsumePrefix(&arg, "--") &&
+      str_util::ConsumePrefix(&arg, flag) &&
+      str_util::ConsumePrefix(&arg, "=")) {
     *value_parsing_ok = hook(arg.ToString());
     return true;
   }
@@ -40,7 +43,9 @@ bool ParseInt32Flag(tensorflow::StringPiece arg, tensorflow::StringPiece flag,
                     const std::function<bool(int32)>& hook,
                     bool* value_parsing_ok) {
   *value_parsing_ok = true;
-  if (arg.Consume("--") && arg.Consume(flag) && arg.Consume("=")) {
+  if (str_util::ConsumePrefix(&arg, "--") &&
+      str_util::ConsumePrefix(&arg, flag) &&
+      str_util::ConsumePrefix(&arg, "=")) {
     char extra;
     int32 parsed_int32;
     if (sscanf(arg.data(), "%d%c", &parsed_int32, &extra) != 1) {
@@ -60,7 +65,9 @@ bool ParseInt64Flag(tensorflow::StringPiece arg, tensorflow::StringPiece flag,
                     const std::function<bool(int64)>& hook,
                     bool* value_parsing_ok) {
   *value_parsing_ok = true;
-  if (arg.Consume("--") && arg.Consume(flag) && arg.Consume("=")) {
+  if (str_util::ConsumePrefix(&arg, "--") &&
+      str_util::ConsumePrefix(&arg, flag) &&
+      str_util::ConsumePrefix(&arg, "=")) {
     char extra;
     int64 parsed_int64;
     if (sscanf(arg.data(), "%lld%c", &parsed_int64, &extra) != 1) {
@@ -80,7 +87,8 @@ bool ParseBoolFlag(tensorflow::StringPiece arg, tensorflow::StringPiece flag,
                    const std::function<bool(bool)>& hook,
                    bool* value_parsing_ok) {
   *value_parsing_ok = true;
-  if (arg.Consume("--") && arg.Consume(flag)) {
+  if (str_util::ConsumePrefix(&arg, "--") &&
+      str_util::ConsumePrefix(&arg, flag)) {
     if (arg.empty()) {
       *value_parsing_ok = hook(true);
       return true;
@@ -107,7 +115,9 @@ bool ParseFloatFlag(tensorflow::StringPiece arg, tensorflow::StringPiece flag,
                     const std::function<bool(float)>& hook,
                     bool* value_parsing_ok) {
   *value_parsing_ok = true;
-  if (arg.Consume("--") && arg.Consume(flag) && arg.Consume("=")) {
+  if (str_util::ConsumePrefix(&arg, "--") &&
+      str_util::ConsumePrefix(&arg, flag) &&
+      str_util::ConsumePrefix(&arg, "=")) {
     char extra;
     float parsed_float;
     if (sscanf(arg.data(), "%f%c", &parsed_float, &extra) != 1) {
diff --git a/tensorflow/core/util/ctc/BUILD b/tensorflow/core/util/ctc/BUILD
index 1521349e4ddf064ce55726a9c6ca400ae6342c15..317420204e20ab2994ca9b7b7f4cc39e688e728f 100644
--- a/tensorflow/core/util/ctc/BUILD
+++ b/tensorflow/core/util/ctc/BUILD
@@ -26,18 +26,6 @@ alias(
     actual = ":mobile_srcs",
 )
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 cc_library(
     name = "ctc",
     deps = [
diff --git a/tensorflow/core/util/device_name_utils_test.cc b/tensorflow/core/util/device_name_utils_test.cc
index c1bc0f33785ef1576bf0f0d3db71e8daa1a51801..ff9c108f10cdbfa6f1ca3bb966d42e32fb223c74 100644
--- a/tensorflow/core/util/device_name_utils_test.cc
+++ b/tensorflow/core/util/device_name_utils_test.cc
@@ -408,7 +408,7 @@ static void MergeDevNamesError(const string& name_a, const string& name_b,
   DeviceNameUtils::ParsedName target_a = Name(name_a);
   Status s = DeviceNameUtils::MergeDevNames(&target_a, Name(name_b));
   EXPECT_EQ(s.code(), error::INVALID_ARGUMENT);
-  EXPECT_TRUE(StringPiece(s.error_message()).contains(expected_error_substr))
+  EXPECT_TRUE(str_util::StrContains(s.error_message(), expected_error_substr))
       << s;
 }
 
diff --git a/tensorflow/core/util/equal_graph_def.cc b/tensorflow/core/util/equal_graph_def.cc
index f1ec497a6772c84d599a76169515ef417c11f430..b87dce0dff536733397bff946c12e992a8097666 100644
--- a/tensorflow/core/util/equal_graph_def.cc
+++ b/tensorflow/core/util/equal_graph_def.cc
@@ -23,6 +23,7 @@ limitations under the License.
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/lib/hash/hash.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/protobuf.h"
 
@@ -144,7 +145,7 @@ bool EqualNodeDef(const NodeDef& actual, const NodeDef& expected, string* diff,
 
   int first_control_input = actual.input_size();
   for (int i = 0; i < actual.input_size(); ++i) {
-    if (StringPiece(actual.input(i)).starts_with("^")) {
+    if (str_util::StartsWith(actual.input(i), "^")) {
       first_control_input = i;
       break;
     }
@@ -240,7 +241,7 @@ uint64 NodeDefHash(const NodeDef& ndef, const EqualGraphDefOptions& options) {
   // Normal inputs. Order important.
   int first_control_input = ndef.input_size();
   for (int i = 0; i < ndef.input_size(); ++i) {
-    if (StringPiece(ndef.input(i)).starts_with("^")) {
+    if (str_util::StartsWith(ndef.input(i), "^")) {
       first_control_input = i;
       break;
     }
diff --git a/tensorflow/core/util/memmapped_file_system.cc b/tensorflow/core/util/memmapped_file_system.cc
index a0f43d2d4a745722d2095b6817c9156415c78127..1fa6b8bec037c3ee0d2b9b95f2ccce59813c98b9 100644
--- a/tensorflow/core/util/memmapped_file_system.cc
+++ b/tensorflow/core/util/memmapped_file_system.cc
@@ -15,6 +15,7 @@ limitations under the License.
 #include "tensorflow/core/util/memmapped_file_system.h"
 
 #include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/util/memmapped_file_system.pb.h"
 
@@ -157,6 +158,12 @@ Status MemmappedFileSystem::GetChildren(const string& filename,
   return errors::Unimplemented("memmapped format doesn't support GetChildren");
 }
 
+Status MemmappedFileSystem::GetMatchingPaths(const string& pattern,
+                                             std::vector<string>* results) {
+  return errors::Unimplemented(
+      "memmapped format doesn't support GetMatchingPaths");
+}
+
 Status MemmappedFileSystem::DeleteFile(const string& filename) {
   return errors::Unimplemented("memmapped format doesn't support DeleteFile");
 }
@@ -236,7 +243,7 @@ Status MemmappedFileSystem::InitializeFromFile(Env* env,
 }
 
 bool MemmappedFileSystem::IsMemmappedPackageFilename(const string& filename) {
-  return StringPiece(filename).starts_with(kMemmappedPackagePrefix);
+  return str_util::StartsWith(filename, kMemmappedPackagePrefix);
 }
 
 namespace {
diff --git a/tensorflow/core/util/memmapped_file_system.h b/tensorflow/core/util/memmapped_file_system.h
index 541587aeab05242f0c71beb139fe74c768b810b0..76cc4911f5e067253455d4d4eb86189e7d6e9de8 100644
--- a/tensorflow/core/util/memmapped_file_system.h
+++ b/tensorflow/core/util/memmapped_file_system.h
@@ -85,6 +85,8 @@ class MemmappedFileSystem : public FileSystem {
   Status NewAppendableFile(const string& fname,
                            std::unique_ptr<WritableFile>* result) override;
   Status GetChildren(const string& dir, std::vector<string>* r) override;
+  Status GetMatchingPaths(const string& pattern,
+                          std::vector<string>* results) override;
   Status DeleteFile(const string& f) override;
   Status CreateDir(const string& d) override;
   Status DeleteDir(const string& d) override;
diff --git a/tensorflow/core/util/proto/BUILD b/tensorflow/core/util/proto/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..ade14ed1620e92a2246963eaa0b317275dd4ad3d
--- /dev/null
+++ b/tensorflow/core/util/proto/BUILD
@@ -0,0 +1,62 @@
+package(
+    default_visibility = ["//visibility:public"],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow:tensorflow.bzl", "tf_cc_test")
+
+cc_library(
+    name = "decode",
+    hdrs = ["decode.h"],
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+    ],
+)
+
+cc_library(
+    name = "descriptors",
+    srcs = ["descriptors.cc"],
+    hdrs = ["descriptors.h"],
+    deps = [
+        ":descriptor_pool_registry",
+        ":local_descriptor_pool_registration",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+    ],
+)
+
+cc_library(
+    name = "descriptor_pool_registry",
+    srcs = ["descriptor_pool_registry.cc"],
+    hdrs = ["descriptor_pool_registry.h"],
+    deps = [
+        "//tensorflow/core:lib",
+    ],
+)
+
+tf_cc_test(
+    name = "descriptor_pool_registry_test",
+    srcs = ["descriptor_pool_registry_test.cc"],
+    deps = [
+        ":descriptor_pool_registry",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+    ],
+)
+
+# Depending on this target adds support for using the special
+# value "local://" (or "") for descriptor source, in which case
+# descriptors linked into the code will be searched.
+cc_library(
+    name = "local_descriptor_pool_registration",
+    srcs = ["local_descriptor_pool_registration.cc"],
+    deps = [
+        ":descriptor_pool_registry",
+        "//tensorflow/core:lib",
+    ],
+    alwayslink = 1,
+)
diff --git a/tensorflow/core/util/proto/decode.h b/tensorflow/core/util/proto/decode.h
new file mode 100644
index 0000000000000000000000000000000000000000..74634a356a84db0fb72a15e223f373598c668eee
--- /dev/null
+++ b/tensorflow/core/util/proto/decode.h
@@ -0,0 +1,592 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Inline functions for parsing the protocol buffers wire format.
+//
+// These functions have been optimized at the expense of safety.
+// They are broken out into a separate file for readability but are
+// not intended for use by clients other than the decode_proto op.
+//
+// The calling code in the decode_proto op does some fairly
+// complicated things to ensure that this code is called
+// safely. Changes to this code should be thoroughly fuzz tested.
+
+#ifndef TENSORFLOW_CORE_UTIL_PROTO_DECODE_H_
+#define TENSORFLOW_CORE_UTIL_PROTO_DECODE_H_
+
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/platform/protobuf.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+namespace internal {
+
+using tensorflow::protobuf::internal::WireFormatLite;
+using tensorflow::protobuf::io::CodedInputStream;
+using tensorflow::protobuf::io::CodedOutputStream;
+using tensorflow::protobuf::io::StringOutputStream;
+
+// Converts an uint64 to an int64 without loss of information.
+// Unsigned values greater than INT64_MAX are represented as
+// negative numbers by wrapping (same as twos-complement bit equivalence).
+inline int64 WrapUnsignedAsSigned64(uint64 unsigned_value) {
+  // For a detailed explanation of why this works to wrap unsigned ints, see
+  // http://stackoverflow.com/questions/13150449/efficient-unsigned-to-signed-cast-avoiding-implementation-defined-behavior
+  // Both if tests should be optimized out.
+  if (unsigned_value <= INT64_MAX) {
+    return static_cast<int64>(unsigned_value);
+  }
+  // The C++ spec allows an architecture where this test is required.
+  if (unsigned_value >= INT64_MIN) {
+    return static_cast<int64>(unsigned_value - INT64_MIN) + INT64_MIN;
+  }
+  return 0;  // This should never occur.
+}
+
+// Converts an uint32 to an int32 without loss of information.
+// Unsigned values greater than INT_MAX are represented as
+// negative numbers by wrapping (same as twos-complement bit equivalence).
+inline int32 WrapUnsignedAsSigned32(uint32 unsigned_value) {
+  // For a detailed explanation of why this works to wrap unsigned ints, see
+  // http://stackoverflow.com/questions/13150449/efficient-unsigned-to-signed-cast-avoiding-implementation-defined-behavior
+  // Both if tests should be optimized out.
+  if (unsigned_value <= INT_MAX) {
+    return static_cast<int32>(unsigned_value);
+  }
+  // The C++ spec allows an architecture where this test is required.
+  if (unsigned_value >= INT_MIN) {
+    return static_cast<int32>(unsigned_value - INT_MIN) + INT_MIN;
+  }
+  return 0;  // This should never occur.
+}
+
+// Reads a single varint32 from a byte array.
+// It is the caller's responsibility to ensure that there is enough
+// space in the buffer.
+// The ok value will be set to false if the buffer does not contain
+// a valid varint.
+inline const uint8* ReadVarint64FromArray(const uint8* buffer, bool* ok,
+                                          uint64* value);
+
+// Reads a single varint32 from a byte array.
+// It is the caller's responsibility to ensure that there is enough
+// space in the buffer.
+// The ok value will be set to false if the buffer does not contain
+// a valid varint.
+// This is slightly less efficient than the private version in
+// coded_stream.cc but we duplicate less code by calling
+// the 64 bit version instead of copying the code.
+inline const uint8* ReadVarint32FromArray(const uint8* buffer, bool* ok,
+                                          uint32* value) {
+  uint64 tmp;
+  const uint8* buf = ReadVarint64FromArray(buffer, ok, &tmp);
+  *value = tmp & 0xffffffff;
+  return buf;
+}
+
+// Reads a single proto field value from a byte array into an array.
+// The array is part of a Tensor that was allocated by the caller
+// with type TensorType, while DeclaredType is the proto field type.
+template <class TensorType, enum WireFormatLite::FieldType DeclaredType>
+const uint8* ReadFromArray(const uint8* buf, TensorType* value);
+
+template <>
+inline const uint8* ReadFromArray<int32, WireFormatLite::TYPE_INT32>(
+    const uint8* buf, int32* value) {
+  uint32 temp;
+  bool unused_ok;  // The Counting pass would have failed if this were corrupt.
+  buf = ReadVarint32FromArray(buf, &unused_ok, &temp);
+  *value = static_cast<int32>(temp);
+  return buf;
+}
+
+template <>
+inline const uint8* ReadFromArray<int64, WireFormatLite::TYPE_INT64>(
+    const uint8* buf, int64* value) {
+  uint64 temp;
+  bool unused_ok;  // The Counting pass would have failed if this were corrupt.
+  buf = ReadVarint64FromArray(buf, &unused_ok, &temp);
+  *value = WrapUnsignedAsSigned64(temp);
+  return buf;
+}
+
+template <>
+inline const uint8* ReadFromArray<int64, WireFormatLite::TYPE_UINT32>(
+    const uint8* buf, int64* value) {
+  uint32 temp;
+  bool unused_ok;  // The Counting pass would have failed if this were corrupt.
+  buf = ReadVarint32FromArray(buf, &unused_ok, &temp);
+  *value = temp;
+  return buf;
+}
+
+template <>
+inline const uint8* ReadFromArray<int32, WireFormatLite::TYPE_UINT32>(
+    const uint8* buf, int32* value) {
+  uint32 temp;
+  bool unused_ok;  // The Counting pass would have failed if this were corrupt.
+  buf = ReadVarint32FromArray(buf, &unused_ok, &temp);
+  *value = WrapUnsignedAsSigned32(temp);
+  return buf;
+}
+
+template <>
+inline const uint8* ReadFromArray<int64, WireFormatLite::TYPE_UINT64>(
+    const uint8* buf, int64* value) {
+  uint64 temp;
+  bool unused_ok;  // The Counting pass would have failed if this were corrupt.
+  buf = ReadVarint64FromArray(buf, &unused_ok, &temp);
+  *value = static_cast<int64>(temp);
+  return buf;
+}
+
+template <>
+inline const uint8* ReadFromArray<int32, WireFormatLite::TYPE_SINT32>(
+    const uint8* buf, int32* value) {
+  uint32 temp;
+  bool unused_ok;  // The Counting pass would have failed if this were corrupt.
+  buf = ReadVarint32FromArray(buf, &unused_ok, &temp);
+  *value = WireFormatLite::ZigZagDecode32(temp);
+  return buf;
+}
+
+template <>
+inline const uint8* ReadFromArray<int64, WireFormatLite::TYPE_SINT64>(
+    const uint8* buf, int64* value) {
+  uint64 temp;
+  bool unused_ok;  // The Counting pass would have failed if this were corrupt.
+  buf = ReadVarint64FromArray(buf, &unused_ok, &temp);
+  *value = WireFormatLite::ZigZagDecode64(temp);
+  return buf;
+}
+
+template <>
+inline const uint8* ReadFromArray<int64, WireFormatLite::TYPE_FIXED32>(
+    const uint8* buf, int64* value) {
+  uint32 temp;
+  buf = WireFormatLite::ReadPrimitiveFromArray<uint32,
+                                               WireFormatLite::TYPE_FIXED32>(
+      buf, &temp);
+  *value = temp;
+  return buf;
+}
+
+template <>
+inline const uint8* ReadFromArray<int32, WireFormatLite::TYPE_FIXED32>(
+    const uint8* buf, int32* value) {
+  uint32 temp;
+  buf = WireFormatLite::ReadPrimitiveFromArray<uint32,
+                                               WireFormatLite::TYPE_FIXED32>(
+      buf, &temp);
+  *value = WrapUnsignedAsSigned32(temp);
+  return buf;
+}
+
+template <>
+inline const uint8* ReadFromArray<int64, WireFormatLite::TYPE_FIXED64>(
+    const uint8* buf, int64* value) {
+  protobuf_uint64 temp;
+  buf = WireFormatLite::ReadPrimitiveFromArray<protobuf_uint64,
+                                               WireFormatLite::TYPE_FIXED64>(
+      buf, &temp);
+  *value = WrapUnsignedAsSigned64(temp);
+  return buf;
+}
+
+template <>
+inline const uint8* ReadFromArray<int32, WireFormatLite::TYPE_SFIXED32>(
+    const uint8* buf, int32* value) {
+  return WireFormatLite::ReadPrimitiveFromArray<int32,
+                                                WireFormatLite::TYPE_SFIXED32>(
+      buf, value);
+}
+
+template <>
+inline const uint8* ReadFromArray<int64, WireFormatLite::TYPE_SFIXED64>(
+    const uint8* buf, int64* value) {
+  protobuf_int64 temp;
+  buf = WireFormatLite::ReadPrimitiveFromArray<protobuf_int64,
+                                               WireFormatLite::TYPE_SFIXED64>(
+      buf, &temp);
+  *value = temp;
+  return buf;
+}
+
+template <>
+inline const uint8* ReadFromArray<float, WireFormatLite::TYPE_FLOAT>(
+    const uint8* buf, float* value) {
+  return WireFormatLite::ReadPrimitiveFromArray<float,
+                                                WireFormatLite::TYPE_FLOAT>(
+      buf, value);
+}
+
+template <>
+inline const uint8* ReadFromArray<double, WireFormatLite::TYPE_DOUBLE>(
+    const uint8* buf, double* value) {
+  return WireFormatLite::ReadPrimitiveFromArray<double,
+                                                WireFormatLite::TYPE_DOUBLE>(
+      buf, value);
+}
+
+template <>
+inline const uint8* ReadFromArray<bool, WireFormatLite::TYPE_BOOL>(
+    const uint8* buf, bool* value) {
+  uint64 temp;
+  bool unused_ok;  // The Counting pass would have failed if this were corrupt.
+  buf = ReadVarint64FromArray(buf, &unused_ok, &temp);
+  *value = temp != 0;
+  return buf;
+}
+
+template <>
+inline const uint8* ReadFromArray<int, WireFormatLite::TYPE_ENUM>(
+    const uint8* buf, int* value) {
+  uint32 temp;
+  bool unused_ok;  // The Counting pass would have failed if this were corrupt.
+  buf = ReadVarint32FromArray(buf, &unused_ok, &temp);
+  *value = static_cast<int>(temp);
+  return buf;
+}
+
+// Reads packed values from an array.
+// Stride is set to 1 for repeated fields, and 0 for non-repeated fields
+// (where any value overwrites previous values).
+template <class TensorType, enum WireFormatLite::FieldType DeclaredType>
+inline int ReadPackedPrimitives(const void* bufp, const size_t len,
+                                const int index, const int stride,
+                                void* datap) {
+  const uint8* buf = reinterpret_cast<const uint8*>(bufp);
+  const uint8* bound = buf + len;
+  TensorType* data = reinterpret_cast<TensorType*>(datap) + index;
+  int count;
+
+  // This could overrun the bound by stride-1. This is defended
+  // against in the caller, where it ensures that the input buffer
+  // contains complete values.
+  for (count = 0; buf < bound; count += stride) {
+    buf = ReadFromArray<TensorType, DeclaredType>(buf, data + count);
+  }
+  return count;
+}
+
+// Reads a primitive value field from a serialized proto.
+// The value is parsed from the serialized format, then static_cast
+// to the desired type for TensorFlow and stored.
+template <class ValueType, class TensorType,
+          enum WireFormatLite::FieldType DeclaredType>
+inline Status ReadPrimitive(CodedInputStream* input, int index, void* data) {
+  ValueType v;
+  if (!WireFormatLite::ReadPrimitive<ValueType, DeclaredType>(input, &v)) {
+    return errors::DataLoss("Failed reading primitive");
+  }
+
+  reinterpret_cast<TensorType*>(data)[index] = v;
+  return Status::OK();
+}
+
+// Reads a string, submessage, or other variable-length field from a
+// serialized proto.
+// May read all or part of a repeated field.
+inline Status ReadBytes(CodedInputStream* input, int index, void* datap) {
+  string* data = reinterpret_cast<string*>(datap) + index;
+  if (!WireFormatLite::ReadBytes(input, data)) {
+    return errors::DataLoss("Failed reading bytes");
+  }
+  return Status::OK();
+}
+
+// Reads a tag-delimited field (TYPE_GROUP) from a serialized proto,
+// as a bytestring.
+inline Status ReadGroupBytes(CodedInputStream* input, int field_number,
+                             int index, void* datap) {
+  // WireFormatLite::SkipField has an option to emit the
+  // skipped bytes to an output stream. We could do better by implementing our
+  // own scanner but this is simpler for now.
+  // TODO(nix): there is a faster way to grab TYPE_GROUP bytes by relying
+  // on input->IsFlat() == true and using input->GetDirectBufferPointer()
+  // with input->CurrentPosition().
+  string* data = reinterpret_cast<string*>(datap) + index;
+  StringOutputStream string_stream(data);
+  CodedOutputStream out(&string_stream);
+  if (!WireFormatLite::SkipField(
+          input,
+          WireFormatLite::MakeTag(field_number,
+                                  WireFormatLite::WIRETYPE_START_GROUP),
+          &out)) {
+    return errors::DataLoss("Failed reading group");
+  }
+  return Status::OK();
+}
+
+// Reads a single field value from a CodedInputStream into a tensor.
+inline Status ReadValue(CodedInputStream* input,
+                        WireFormatLite::FieldType field_type, int field_number,
+                        DataType dtype, int index, void* datap) {
+  // Dispatch to the appropriately typed field reader based on the
+  // schema type.
+  switch (field_type) {
+    case WireFormatLite::TYPE_DOUBLE:
+      return ReadPrimitive<double, double, WireFormatLite::TYPE_DOUBLE>(
+          input, index, datap);
+    case WireFormatLite::TYPE_FLOAT:
+      if (dtype == DataType::DT_FLOAT) {
+        return ReadPrimitive<float, float, WireFormatLite::TYPE_FLOAT>(
+            input, index, datap);
+      }
+      if (dtype == DataType::DT_DOUBLE) {
+        return ReadPrimitive<float, double, WireFormatLite::TYPE_FLOAT>(
+            input, index, datap);
+      }
+      // Any case that reaches this point should have triggered an error
+      // already.
+      return errors::DataLoss("Failed reading TYPE_FLOAT");
+    case WireFormatLite::TYPE_INT64:
+      return ReadPrimitive<protobuf_int64, int64, WireFormatLite::TYPE_INT64>(
+          input, index, datap);
+    case WireFormatLite::TYPE_UINT64:
+      return ReadPrimitive<protobuf_uint64, int64, WireFormatLite::TYPE_UINT64>(
+          input, index, datap);
+    case WireFormatLite::TYPE_INT32:
+      return ReadPrimitive<int32, int32, WireFormatLite::TYPE_INT32>(
+          input, index, datap);
+    case WireFormatLite::TYPE_FIXED64:
+      return ReadPrimitive<protobuf_uint64, int64,
+                           WireFormatLite::TYPE_FIXED64>(input, index, datap);
+    case WireFormatLite::TYPE_FIXED32:
+      if (dtype == DataType::DT_INT64) {
+        return ReadPrimitive<uint32, int64, WireFormatLite::TYPE_FIXED32>(
+            input, index, datap);
+      }
+      if (dtype == DataType::DT_INT32) {
+        return ReadPrimitive<uint32, int32, WireFormatLite::TYPE_FIXED32>(
+            input, index, datap);
+      }
+      // Any case that reaches this point should have triggered an error
+      // already.
+      return errors::DataLoss("Failed reading TYPE_FIXED32");
+    case WireFormatLite::TYPE_BOOL:
+      return ReadPrimitive<bool, bool, WireFormatLite::TYPE_BOOL>(input, index,
+                                                                  datap);
+    case WireFormatLite::TYPE_STRING:
+      return ReadBytes(input, index, datap);
+    case WireFormatLite::TYPE_GROUP:
+      return ReadGroupBytes(input, field_number, index, datap);
+    case WireFormatLite::TYPE_MESSAGE:
+      return ReadBytes(input, index, datap);
+    case WireFormatLite::TYPE_BYTES:
+      return ReadBytes(input, index, datap);
+    case WireFormatLite::TYPE_UINT32:
+      if (dtype == DataType::DT_INT64) {
+        return ReadPrimitive<uint32, int64, WireFormatLite::TYPE_UINT32>(
+            input, index, datap);
+      }
+      if (dtype == DataType::DT_INT32) {
+        return ReadPrimitive<uint32, int32, WireFormatLite::TYPE_UINT32>(
+            input, index, datap);
+      }
+      // Any case that reaches this point should have triggered an error
+      // already.
+      return errors::DataLoss("Failed reading TYPE_UINT32");
+    case WireFormatLite::TYPE_ENUM:
+      return ReadPrimitive<int32, int32, WireFormatLite::TYPE_ENUM>(
+          input, index, datap);
+    case WireFormatLite::TYPE_SFIXED32:
+      return ReadPrimitive<int32, int32, WireFormatLite::TYPE_SFIXED32>(
+          input, index, datap);
+    case WireFormatLite::TYPE_SFIXED64:
+      return ReadPrimitive<protobuf_int64, int64,
+                           WireFormatLite::TYPE_SFIXED64>(input, index, datap);
+    case WireFormatLite::TYPE_SINT32:
+      return ReadPrimitive<int32, int32, WireFormatLite::TYPE_SINT32>(
+          input, index, datap);
+    case WireFormatLite::TYPE_SINT64:
+      return ReadPrimitive<protobuf_int64, int64, WireFormatLite::TYPE_SINT64>(
+          input, index, datap);
+      // default: intentionally omitted in order to enable static checking.
+  }
+  // Unreachable.
+  return errors::DataLoss("Failed reading unknown wire type");
+}
+
+// Reads and stores a length-delimited list of values.
+inline Status ReadPackedFromArray(const void* buf, size_t buf_size,
+                                  const WireFormatLite::FieldType field_type,
+                                  const int field_number, const DataType dtype,
+                                  const int stride, int* index, void* data) {
+  // Dispatch to the appropriately typed field reader based on the
+  // schema type.
+  switch (field_type) {
+    case WireFormatLite::TYPE_DOUBLE:
+      *index += ReadPackedPrimitives<double, WireFormatLite::TYPE_DOUBLE>(
+          buf, buf_size, *index, stride, data);
+      return Status::OK();
+    case WireFormatLite::TYPE_FLOAT:
+      *index += ReadPackedPrimitives<float, WireFormatLite::TYPE_FLOAT>(
+          buf, buf_size, *index, stride, data);
+      return Status::OK();
+    case WireFormatLite::TYPE_INT64:
+      *index += ReadPackedPrimitives<int64, WireFormatLite::TYPE_INT64>(
+          buf, buf_size, *index, stride, data);
+      return Status::OK();
+    case WireFormatLite::TYPE_UINT64:
+      *index += ReadPackedPrimitives<int64, WireFormatLite::TYPE_UINT64>(
+          buf, buf_size, *index, stride, data);
+      return Status::OK();
+    case WireFormatLite::TYPE_INT32:
+      *index += ReadPackedPrimitives<int32, WireFormatLite::TYPE_INT32>(
+          buf, buf_size, *index, stride, data);
+      return Status::OK();
+    case WireFormatLite::TYPE_FIXED64:
+      *index += ReadPackedPrimitives<int64, WireFormatLite::TYPE_FIXED64>(
+          buf, buf_size, *index, stride, data);
+      return Status::OK();
+    case WireFormatLite::TYPE_FIXED32:
+      if (dtype == DataType::DT_INT64) {
+        *index += ReadPackedPrimitives<int64, WireFormatLite::TYPE_FIXED32>(
+            buf, buf_size, *index, stride, data);
+        return Status::OK();
+      }
+      if (dtype == DataType::DT_INT32) {
+        *index += ReadPackedPrimitives<int32, WireFormatLite::TYPE_FIXED32>(
+            buf, buf_size, *index, stride, data);
+        return Status::OK();
+      }
+      // Any case that reaches this point should have triggered an error
+      // already.
+      return errors::DataLoss("Failed reading TYPE_FIXED32");
+    case WireFormatLite::TYPE_BOOL:
+      *index += ReadPackedPrimitives<bool, WireFormatLite::TYPE_BOOL>(
+          buf, buf_size, *index, stride, data);
+      return Status::OK();
+    case WireFormatLite::TYPE_STRING:
+    case WireFormatLite::TYPE_GROUP:
+    case WireFormatLite::TYPE_MESSAGE:
+    case WireFormatLite::TYPE_BYTES:
+      return errors::DataLoss("Non-primitive type encountered as packed");
+    case WireFormatLite::TYPE_UINT32:
+      if (dtype == DataType::DT_INT64) {
+        *index += ReadPackedPrimitives<int64, WireFormatLite::TYPE_UINT32>(
+            buf, buf_size, *index, stride, data);
+        return Status::OK();
+      }
+      if (dtype == DataType::DT_INT32) {
+        *index += ReadPackedPrimitives<int32, WireFormatLite::TYPE_UINT32>(
+            buf, buf_size, *index, stride, data);
+        return Status::OK();
+      }
+      // Any case that reaches this point should have triggered an error
+      // already.
+      return errors::DataLoss("Failed reading TYPE_UINT32");
+    case WireFormatLite::TYPE_ENUM:
+      *index += ReadPackedPrimitives<int32, WireFormatLite::TYPE_ENUM>(
+          buf, buf_size, *index, stride, data);
+      return Status::OK();
+    case WireFormatLite::TYPE_SFIXED32:
+      *index += ReadPackedPrimitives<int32, WireFormatLite::TYPE_SFIXED32>(
+          buf, buf_size, *index, stride, data);
+      return Status::OK();
+
+    case WireFormatLite::TYPE_SFIXED64:
+      *index += ReadPackedPrimitives<int64, WireFormatLite::TYPE_SFIXED64>(
+          buf, buf_size, *index, stride, data);
+      return Status::OK();
+
+    case WireFormatLite::TYPE_SINT32:
+      *index += ReadPackedPrimitives<int32, WireFormatLite::TYPE_SINT32>(
+          buf, buf_size, *index, stride, data);
+      return Status::OK();
+
+    case WireFormatLite::TYPE_SINT64:
+      *index += ReadPackedPrimitives<int64, WireFormatLite::TYPE_SINT64>(
+          buf, buf_size, *index, stride, data);
+      return Status::OK();
+      // default: intentionally omitted in order to enable static checking.
+  }
+  // Unreachable.
+  return errors::DataLoss("Failed reading unknown wire type");
+}
+
+// Reads a varint from the given buffer, write it to *value, and return the
+// new buffer pointer.
+// This was copied from coded_stream.cc where it is private.
+// Important: This routine may read as much as kMaxVarintBytes from
+// the buffer. It is the caller's responsibility to make sure that there is
+// enough space in the buffer.
+inline const uint8* ReadVarint64FromArray(const uint8* buffer, bool* ok,
+                                          uint64* value) {
+  const uint8* ptr = buffer;
+  uint32 b;
+
+  // Splitting into 32-bit pieces gives better performance on 32-bit
+  // processors.
+  uint32 part0 = 0, part1 = 0, part2 = 0;
+
+  b = *(ptr++);
+  part0 = b;
+  if (!(b & 0x80)) goto done;
+  part0 -= 0x80;
+  b = *(ptr++);
+  part0 += b << 7;
+  if (!(b & 0x80)) goto done;
+  part0 -= 0x80 << 7;
+  b = *(ptr++);
+  part0 += b << 14;
+  if (!(b & 0x80)) goto done;
+  part0 -= 0x80 << 14;
+  b = *(ptr++);
+  part0 += b << 21;
+  if (!(b & 0x80)) goto done;
+  part0 -= 0x80 << 21;
+  b = *(ptr++);
+  part1 = b;
+  if (!(b & 0x80)) goto done;
+  part1 -= 0x80;
+  b = *(ptr++);
+  part1 += b << 7;
+  if (!(b & 0x80)) goto done;
+  part1 -= 0x80 << 7;
+  b = *(ptr++);
+  part1 += b << 14;
+  if (!(b & 0x80)) goto done;
+  part1 -= 0x80 << 14;
+  b = *(ptr++);
+  part1 += b << 21;
+  if (!(b & 0x80)) goto done;
+  part1 -= 0x80 << 21;
+  b = *(ptr++);
+  part2 = b;
+  if (!(b & 0x80)) goto done;
+  part2 -= 0x80;
+  b = *(ptr++);
+  part2 += b << 7;
+  if (!(b & 0x80)) goto done;
+  // "part2 -= 0x80 << 7" is irrelevant because (0x80 << 7) << 56 is 0.
+
+  // We have overrun the maximum size of a varint (10 bytes).  Assume
+  // the data is corrupt.
+  *ok = false;
+  return ptr;
+
+done:
+  *ok = true;
+  *value = (static_cast<uint64>(part0)) | (static_cast<uint64>(part1) << 28) |
+           (static_cast<uint64>(part2) << 56);
+  return ptr;
+}
+
+}  // namespace internal
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_UTIL_PROTO_DECODE_H_
diff --git a/tensorflow/core/util/proto/descriptor_pool_registry.cc b/tensorflow/core/util/proto/descriptor_pool_registry.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5f0423f76b74c2b24555e6908a2b61b3ba28598f
--- /dev/null
+++ b/tensorflow/core/util/proto/descriptor_pool_registry.cc
@@ -0,0 +1,45 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <string>
+
+#include "tensorflow/core/platform/logging.h"
+
+#include "tensorflow/core/util/proto/descriptor_pool_registry.h"
+
+namespace tensorflow {
+
+DescriptorPoolRegistry* DescriptorPoolRegistry::Global() {
+  static DescriptorPoolRegistry* registry = new DescriptorPoolRegistry;
+  return registry;
+}
+
+DescriptorPoolRegistry::DescriptorPoolFn* DescriptorPoolRegistry::Get(
+    const string& source) {
+  auto found = fns_.find(source);
+  if (found == fns_.end()) return nullptr;
+  return &found->second;
+}
+
+void DescriptorPoolRegistry::Register(
+    const string& source,
+    const DescriptorPoolRegistry::DescriptorPoolFn& pool_fn) {
+  auto existing = Get(source);
+  CHECK_EQ(existing, nullptr)
+      << "descriptor pool for source: " << source << " already registered";
+  fns_.insert(std::pair<const string&, DescriptorPoolFn>(source, pool_fn));
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/util/proto/descriptor_pool_registry.h b/tensorflow/core/util/proto/descriptor_pool_registry.h
new file mode 100644
index 0000000000000000000000000000000000000000..66c20e9e41337292bccf0c11c6c0b94a05e5df54
--- /dev/null
+++ b/tensorflow/core/util/proto/descriptor_pool_registry.h
@@ -0,0 +1,76 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_UTIL_PROTO_DESCRIPTOR_POOL_REGISTRY_H_
+#define TENSORFLOW_CORE_UTIL_PROTO_DESCRIPTOR_POOL_REGISTRY_H_
+
+#include <functional>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <utility>
+
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/hash/hash.h"
+#include "tensorflow/core/platform/protobuf.h"
+
+namespace tensorflow {
+
+class DescriptorPoolRegistry {
+ public:
+  typedef std::function<Status(
+      tensorflow::protobuf::DescriptorPool const** desc_pool,
+      std::unique_ptr<tensorflow::protobuf::DescriptorPool>* owned_desc_pool)>
+      DescriptorPoolFn;
+
+  // Returns a pointer to a global DescriptorPoolRegistry object.
+  static DescriptorPoolRegistry* Global();
+
+  // Returns a pointer to a descriptor pool function for the given source.
+  DescriptorPoolFn* Get(const string& source);
+
+  // Registers a descriptor pool factory.
+  void Register(const string& source, const DescriptorPoolFn& pool_fn);
+
+ private:
+  std::map<string, DescriptorPoolFn> fns_;
+};
+
+namespace descriptor_pool_registration {
+
+class DescriptorPoolRegistration {
+ public:
+  DescriptorPoolRegistration(
+      const string& source,
+      const DescriptorPoolRegistry::DescriptorPoolFn& pool_fn) {
+    DescriptorPoolRegistry::Global()->Register(source, pool_fn);
+  }
+};
+
+}  // namespace descriptor_pool_registration
+
+#define REGISTER_DESCRIPTOR_POOL(source, pool_fn) \
+  REGISTER_DESCRIPTOR_POOL_UNIQ_HELPER(__COUNTER__, source, pool_fn)
+
+#define REGISTER_DESCRIPTOR_POOL_UNIQ_HELPER(ctr, source, pool_fn) \
+  REGISTER_DESCRIPTOR_POOL_UNIQ(ctr, source, pool_fn)
+
+#define REGISTER_DESCRIPTOR_POOL_UNIQ(ctr, source, pool_fn)       \
+  static descriptor_pool_registration::DescriptorPoolRegistration \
+      descriptor_pool_registration_fn_##ctr(source, pool_fn)
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_UTIL_PROTO_DESCRIPTOR_POOL_REGISTRY_H_
diff --git a/tensorflow/core/util/proto/descriptor_pool_registry_test.cc b/tensorflow/core/util/proto/descriptor_pool_registry_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a6899998ab57a2c90458db31596dc6bf00e8adc0
--- /dev/null
+++ b/tensorflow/core/util/proto/descriptor_pool_registry_test.cc
@@ -0,0 +1,43 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/util/proto/descriptor_pool_registry.h"
+#include "tensorflow/core/platform/protobuf.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace {
+
+struct Value {
+  static Status Function(
+      tensorflow::protobuf::DescriptorPool const** desc_pool,
+      std::unique_ptr<tensorflow::protobuf::DescriptorPool>* owned_desc_pool) {
+    return Status::OK();
+  }
+};
+
+REGISTER_DESCRIPTOR_POOL("TEST POOL 1", Value::Function);
+REGISTER_DESCRIPTOR_POOL("TEST POOL 2", Value::Function);
+}  // namespace
+
+TEST(DescriptorPoolRegistryTest, TestBasic) {
+  EXPECT_EQ(DescriptorPoolRegistry::Global()->Get("NON-EXISTENT"), nullptr);
+  auto pool1 = DescriptorPoolRegistry::Global()->Get("TEST POOL 1");
+  EXPECT_NE(pool1, nullptr);
+  auto pool2 = DescriptorPoolRegistry::Global()->Get("TEST POOL 2");
+  EXPECT_NE(pool2, nullptr);
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/util/proto/descriptors.cc b/tensorflow/core/util/proto/descriptors.cc
new file mode 100644
index 0000000000000000000000000000000000000000..271c85efd88de0f8acbedb3d2254af3397601c6b
--- /dev/null
+++ b/tensorflow/core/util/proto/descriptors.cc
@@ -0,0 +1,85 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/reader_op_kernel.h"
+#include "tensorflow/core/util/proto/descriptor_pool_registry.h"
+
+#include "tensorflow/core/util/proto/descriptors.h"
+
+namespace tensorflow {
+namespace {
+
+// Build a `DescriptorPool` from the named file or URI. The file or URI
+// must be available to the current TensorFlow environment.
+//
+// The file must contiain a serialized `FileDescriptorSet`. See
+// `GetDescriptorPool()` for more information.
+Status GetDescriptorPoolFromFile(
+    tensorflow::Env* env, const string& filename,
+    std::unique_ptr<tensorflow::protobuf::DescriptorPool>* owned_desc_pool) {
+  Status st = env->FileExists(filename);
+  if (!st.ok()) {
+    return st;
+  }
+
+  // Read and parse the FileDescriptorSet.
+  tensorflow::protobuf::FileDescriptorSet descs;
+  std::unique_ptr<tensorflow::ReadOnlyMemoryRegion> buf;
+  st = env->NewReadOnlyMemoryRegionFromFile(filename, &buf);
+  if (!st.ok()) {
+    return st;
+  }
+  if (!descs.ParseFromArray(buf->data(), buf->length())) {
+    return errors::InvalidArgument(
+        "descriptor_source contains invalid FileDescriptorSet: ", filename);
+  }
+
+  // Build a DescriptorPool from the FileDescriptorSet.
+  owned_desc_pool->reset(new tensorflow::protobuf::DescriptorPool());
+  for (const auto& filedesc : descs.file()) {
+    if ((*owned_desc_pool)->BuildFile(filedesc) == nullptr) {
+      return errors::InvalidArgument(
+          "Problem loading FileDescriptorProto (missing dependencies?): ",
+          filename);
+    }
+  }
+  return Status::OK();
+}
+
+}  // namespace
+
+Status GetDescriptorPool(
+    tensorflow::Env* env, string const& descriptor_source,
+    tensorflow::protobuf::DescriptorPool const** desc_pool,
+    std::unique_ptr<tensorflow::protobuf::DescriptorPool>* owned_desc_pool) {
+  // Attempt to lookup the pool in the registry.
+  auto pool_fn = DescriptorPoolRegistry::Global()->Get(descriptor_source);
+  if (pool_fn != nullptr) {
+    return (*pool_fn)(desc_pool, owned_desc_pool);
+  }
+
+  // If there is no pool function registered for the given source, let the
+  // runtime find the file or URL.
+  Status status =
+      GetDescriptorPoolFromFile(env, descriptor_source, owned_desc_pool);
+  if (status.ok()) {
+    *desc_pool = owned_desc_pool->get();
+  }
+  *desc_pool = owned_desc_pool->get();
+  return status;
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/util/proto/descriptors.h b/tensorflow/core/util/proto/descriptors.h
new file mode 100644
index 0000000000000000000000000000000000000000..92ee8997ab28f151a7b15b0d81628988e98159f4
--- /dev/null
+++ b/tensorflow/core/util/proto/descriptors.h
@@ -0,0 +1,42 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_UTIL_PROTO_DESCRIPTORS_H_
+#define TENSORFLOW_CORE_UTIL_PROTO_DESCRIPTORS_H_
+
+#include <memory>
+#include <string>
+
+#include "tensorflow/core/platform/protobuf.h"
+
+namespace tensorflow {
+class Env;
+class Status;
+
+// Get a `DescriptorPool` object from the named `descriptor_source`.
+// `descriptor_source` may be a path to a file accessible to TensorFlow, in
+// which case it is parsed as a `FileDescriptorSet` and used to build the
+// `DescriptorPool`.
+//
+// `owned_desc_pool` will be filled in with the same pointer as `desc_pool` if
+// the caller should take ownership.
+extern tensorflow::Status GetDescriptorPool(
+    tensorflow::Env* env, string const& descriptor_source,
+    tensorflow::protobuf::DescriptorPool const** desc_pool,
+    std::unique_ptr<tensorflow::protobuf::DescriptorPool>* owned_desc_pool);
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_UTIL_PROTO_DESCRIPTORS_H_
diff --git a/tensorflow/core/util/proto/local_descriptor_pool_registration.cc b/tensorflow/core/util/proto/local_descriptor_pool_registration.cc
new file mode 100644
index 0000000000000000000000000000000000000000..48fe0102d011724a91004ff3297e07259df87c27
--- /dev/null
+++ b/tensorflow/core/util/proto/local_descriptor_pool_registration.cc
@@ -0,0 +1,39 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/platform/protobuf.h"
+#include "tensorflow/core/util/proto/descriptor_pool_registry.h"
+
+namespace tensorflow {
+namespace {
+
+struct LocalDescriptorPool {
+  static Status Function(
+      tensorflow::protobuf::DescriptorPool const** desc_pool,
+      std::unique_ptr<tensorflow::protobuf::DescriptorPool>* owned_desc_pool) {
+    *desc_pool = ::tensorflow::protobuf::DescriptorPool::generated_pool();
+    if (*desc_pool == nullptr) {
+      return errors::InvalidArgument("Problem loading protobuf generated_pool");
+    }
+    return Status::OK();
+  }
+};
+
+REGISTER_DESCRIPTOR_POOL("", LocalDescriptorPool::Function);
+REGISTER_DESCRIPTOR_POOL("local://", LocalDescriptorPool::Function);
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/util/reporter.cc b/tensorflow/core/util/reporter.cc
index ee38f81f3e1b370b3f10d3998dee5b3a6d916e7d..a595c9509e66c7bb3ac7fadbd0e87cfd81d1d611 100644
--- a/tensorflow/core/util/reporter.cc
+++ b/tensorflow/core/util/reporter.cc
@@ -47,6 +47,18 @@ Status TestReporter::Benchmark(int64 iters, double cpu_time, double wall_time,
   return Status::OK();
 }
 
+Status TestReporter::SetProperty(const string& name, const string& value) {
+  if (closed_) return Status::OK();
+  (*benchmark_entry_.mutable_extras())[name].set_string_value(value);
+  return Status::OK();
+}
+
+Status TestReporter::SetProperty(const string& name, double value) {
+  if (closed_) return Status::OK();
+  (*benchmark_entry_.mutable_extras())[name].set_double_value(value);
+  return Status::OK();
+}
+
 Status TestReporter::Initialize() {
   if (fname_.empty()) {
     return Status::OK();
diff --git a/tensorflow/core/util/reporter.h b/tensorflow/core/util/reporter.h
index bcae12204ec369af74d748e5e73d06616309f289..e551e2e4f57decff586fc0bd4a8514ca7af8e0ec 100644
--- a/tensorflow/core/util/reporter.h
+++ b/tensorflow/core/util/reporter.h
@@ -34,11 +34,13 @@ namespace tensorflow {
 //
 // If this environment variable is not defined, no logging is performed.
 //
-// The intended use is via the following 4 lines:
+// The intended use is via the following lines:
 //
 //  TestReporter reporter(test_name);
 //  TF_CHECK_OK(reporter.Initialize()));
 //  TF_CHECK_OK(reporter.Benchmark(iters, cpu_time, wall_time, throughput));
+//  TF_CHECK_OK(reporter.SetProperty("some_string_property", "some_value");
+//  TF_CHECK_OK(reporter.SetProperty("some_double_property", double_value);
 //  TF_CHECK_OK(reporter.Close());
 //
 // For example, if the environment variable
@@ -75,6 +77,12 @@ class TestReporter {
   Status Benchmark(int64 iters, double cpu_time, double wall_time,
                    double throughput);
 
+  // Set property on Benchmark to the given value.
+  Status SetProperty(const string& name, double value);
+
+  // Set property on Benchmark to the given value.
+  Status SetProperty(const string& name, const string& value);
+
   // TODO(b/32704451): Don't just ignore the ::tensorflow::Status object!
   ~TestReporter() { Close().IgnoreError(); }  // Autoclose in destructor.
 
diff --git a/tensorflow/core/util/reporter_test.cc b/tensorflow/core/util/reporter_test.cc
index 575c27d4ef72ec33c4b9352de59fc806b12d6385..0972b86ea5fefa4b490ee61eeb1937b136783801 100644
--- a/tensorflow/core/util/reporter_test.cc
+++ b/tensorflow/core/util/reporter_test.cc
@@ -29,7 +29,7 @@ namespace {
 
 // Tests of all the error paths in log_reader.cc follow:
 static void ExpectHasSubstr(StringPiece s, StringPiece expected) {
-  EXPECT_TRUE(StringPiece(s).contains(expected))
+  EXPECT_TRUE(str_util::StrContains(s, expected))
       << s << " does not contain " << expected;
 }
 
@@ -115,5 +115,28 @@ TEST(TestReporter, Benchmark) {
   EXPECT_EQ(benchmark_entry.throughput(), 3.0);
 }
 
+TEST(TestReporter, SetProperties) {
+  string fname =
+      strings::StrCat(testing::TmpDir(), "/test_reporter_benchmarks_");
+  TestReporter test_reporter(fname, "b2/3/4");
+  TF_EXPECT_OK(test_reporter.Initialize());
+  TF_EXPECT_OK(test_reporter.SetProperty("string_prop", "abc"));
+  TF_EXPECT_OK(test_reporter.SetProperty("double_prop", 4.0));
+
+  TF_EXPECT_OK(test_reporter.Close());
+  string expected_fname = strings::StrCat(fname, "b2__3__4");
+  string read;
+  TF_EXPECT_OK(ReadFileToString(Env::Default(), expected_fname, &read));
+
+  BenchmarkEntries benchmark_entries;
+  ASSERT_TRUE(benchmark_entries.ParseFromString(read));
+  ASSERT_EQ(1, benchmark_entries.entry_size());
+  const BenchmarkEntry& benchmark_entry = benchmark_entries.entry(0);
+  const auto& extras = benchmark_entry.extras();
+  ASSERT_EQ(2, extras.size());
+  EXPECT_EQ("abc", extras.at("string_prop").string_value());
+  EXPECT_EQ(4.0, extras.at("double_prop").double_value());
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/core/util/rpc/BUILD b/tensorflow/core/util/rpc/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..f0f161ecc017966cef1a59e6870b016bdfb8d3ec
--- /dev/null
+++ b/tensorflow/core/util/rpc/BUILD
@@ -0,0 +1,48 @@
+package(
+    default_visibility = ["//visibility:public"],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow:tensorflow.bzl", "tf_cc_test")
+
+cc_library(
+    name = "call_container",
+    hdrs = ["call_container.h"],
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
+cc_library(
+    name = "rpc_factory",
+    srcs = ["rpc_factory.cc"],
+    hdrs = ["rpc_factory.h"],
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+    ],
+)
+
+cc_library(
+    name = "rpc_factory_registry",
+    srcs = ["rpc_factory_registry.cc"],
+    hdrs = ["rpc_factory_registry.h"],
+    deps = [
+        ":rpc_factory",
+        "//tensorflow/core:framework",
+    ],
+)
+
+tf_cc_test(
+    name = "rpc_factory_registry_test",
+    srcs = ["rpc_factory_registry_test.cc"],
+    deps = [
+        ":rpc_factory_registry",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+    ],
+)
diff --git a/tensorflow/core/util/rpc/call_container.h b/tensorflow/core/util/rpc/call_container.h
new file mode 100644
index 0000000000000000000000000000000000000000..7f3605679759b93d3eeef4e2919f905781317707
--- /dev/null
+++ b/tensorflow/core/util/rpc/call_container.h
@@ -0,0 +1,90 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_UTIL_RPC_CALL_CONTAINER_H_
+#define TENSORFLOW_CORE_UTIL_RPC_CALL_CONTAINER_H_
+
+#include <list>
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/refcount.h"
+#include "tensorflow/core/util/reffed_status_callback.h"
+
+namespace tensorflow {
+
+template <typename Call>
+class CallContainer {
+ public:
+  explicit CallContainer(OpKernelContext* ctx, int num_calls, bool fail_fast,
+                         bool try_rpc, AsyncOpKernel::DoneCallback done,
+                         CancellationToken token)
+      : ctx_(ctx),
+        done_(std::move(done)),
+        token_(token),
+        fail_fast_(fail_fast),
+        try_rpc_(try_rpc) {
+    CHECK_GT(num_calls, 0);
+
+    // This will run when all RPCs are finished.
+    reffed_status_callback_ = new ReffedStatusCallback([this](const Status& s) {
+      ctx_->cancellation_manager()->DeregisterCallback(token_);
+      ctx_->SetStatus(s);
+      done_();
+      delete this;
+    });
+
+    // Subtract reference count from the initial creation.
+    core::ScopedUnref unref(reffed_status_callback_);
+
+    for (int i = 0; i < num_calls; ++i) {
+      // Increase the reference on the callback for each new RPC.
+      reffed_status_callback_->Ref();
+    }
+  }
+
+  std::list<Call>* calls() { return &calls_; }
+
+  void StartCancel() {
+    // Once this loop is done, can no longer assume anything is valid
+    // because "delete this" may have been immediately called.
+    // Nothing should run after this loop.
+    for (auto& call : calls_) {
+      call.StartCancel();
+    }
+  }
+
+  void Done(const Status& s, int index) {
+    if (!try_rpc_) {
+      reffed_status_callback_->UpdateStatus(s);
+    }
+    reffed_status_callback_->Unref();
+  }
+
+ private:
+  OpKernelContext* ctx_;
+  std::list<Call> calls_;
+  const AsyncOpKernel::DoneCallback done_;
+  const CancellationToken token_;
+  const bool fail_fast_;
+  const bool try_rpc_;
+
+  // Performs its own reference counting.
+  ReffedStatusCallback* reffed_status_callback_;
+};
+
+}  // namespace tensorflow
+#endif  // TENSORFLOW_CORE_UTIL_RPC_CALL_CONTAINER_H_
diff --git a/tensorflow/core/util/rpc/rpc_factory.cc b/tensorflow/core/util/rpc/rpc_factory.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8530f02b6e2e021ed1c01db9a5bf25f5789a1142
--- /dev/null
+++ b/tensorflow/core/util/rpc/rpc_factory.cc
@@ -0,0 +1,53 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/lib/strings/numbers.h"
+
+#include "tensorflow/core/util/rpc/rpc_factory.h"
+
+namespace tensorflow {
+
+template <>
+bool GetEnvVar(const char* key, const string& default_value, string* value) {
+  const char* env_value = std::getenv(key);
+  if (!env_value || env_value[0] == '\0') {
+    *value = default_value;
+  } else {
+    *value = env_value;
+  }
+  return true;
+}
+
+template <>
+bool GetEnvVar(const char* key, const int64& default_value, int64* value) {
+  const char* env_value = std::getenv(key);
+  if (!env_value || env_value[0] == '\0') {
+    *value = default_value;
+    return true;
+  }
+  return strings::safe_strto64(env_value, value);
+}
+
+template <>
+bool GetEnvVar(const char* key, const uint64& default_value, uint64* value) {
+  const char* env_value = std::getenv(key);
+  if (!env_value || env_value[0] == '\0') {
+    *value = default_value;
+    return true;
+  }
+  return strings::safe_strtou64(env_value, value);
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/util/rpc/rpc_factory.h b/tensorflow/core/util/rpc/rpc_factory.h
new file mode 100644
index 0000000000000000000000000000000000000000..9bf078c0f4a33e9d32c57357cc0b176aedd4f52f
--- /dev/null
+++ b/tensorflow/core/util/rpc/rpc_factory.h
@@ -0,0 +1,70 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_UTIL_RPC_RPC_FACTORY_H_
+#define TENSORFLOW_CORE_UTIL_RPC_RPC_FACTORY_H_
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor_types.h"
+
+namespace tensorflow {
+
+// Return the environment variable `key`.  If the variable is not set,
+// use the default value.  If it is set but could not be parsed,
+// return `false`.  Otherwise set `value` and return `true`.
+template <typename T>
+bool GetEnvVar(const char* key, const T& default_value, T* value);
+
+class RPCFactory {
+ public:
+  RPCFactory() {}
+  virtual ~RPCFactory() {}
+
+  // Start a Call() to methods `method_t` at addresses `address_t` with
+  // request strings from `request_t`.  Any of these may be scalar
+  // Tensors, in which case the operands are broadcasted.
+  // Upon completion of all requests, `response_t` will be populated.
+  //
+  // If `try_rpc` is `true`, then `status_message_t` and
+  // `status_code_t` will be populated as well.
+  //
+  // If `try_rpc` is `false`, then `status_message_t` and
+  // `status_code_t` are ignored (and may be nullptr).  Instead, the
+  // status of any failed call will be propagated to the op.
+  //
+  // REQUIRES:
+  //   - `response_t` is not null, and is a string Tensor with the same shape as
+  //     `request_t`.
+  //
+  //   If `try_rpc` is `true`:
+  //      - `status_code_t` and `status_message_t` are not null.
+  //      - `status_code_t` is an int32 Tensor with the same shape as
+  //        `request_t`.
+  //      - `status_message_t` is a string Tensor with the same shape as
+  //        `request_t`.
+  virtual void Call(OpKernelContext* ctx, int64 num_elements,
+                    const Tensor& address_t, const Tensor& method_t,
+                    const Tensor& request_t, const bool try_rpc,
+                    Tensor* response_t, Tensor* status_code_t,
+                    Tensor* status_message_t,
+                    AsyncOpKernel::DoneCallback done) = 0;
+
+ private:
+  TF_DISALLOW_COPY_AND_ASSIGN(RPCFactory);
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_UTIL_RPC_RPC_FACTORY_H_
diff --git a/tensorflow/core/util/rpc/rpc_factory_registry.cc b/tensorflow/core/util/rpc/rpc_factory_registry.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a148b5c04d0dbe551dd11d001f6434b23e99714f
--- /dev/null
+++ b/tensorflow/core/util/rpc/rpc_factory_registry.cc
@@ -0,0 +1,44 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <string>
+
+#include "tensorflow/core/util/rpc/rpc_factory.h"
+
+#include "tensorflow/core/util/rpc/rpc_factory_registry.h"
+
+namespace tensorflow {
+
+RPCFactoryRegistry* RPCFactoryRegistry::Global() {
+  static RPCFactoryRegistry* registry = new RPCFactoryRegistry;
+  return registry;
+}
+
+RPCFactoryRegistry::RPCFactoryFn* RPCFactoryRegistry::Get(
+    const string& protocol) {
+  auto found = fns_.find(protocol);
+  if (found == fns_.end()) return nullptr;
+  return &found->second;
+}
+
+void RPCFactoryRegistry::Register(const string& protocol,
+                                  const RPCFactoryFn& factory_fn) {
+  auto existing = Get(protocol);
+  CHECK_EQ(existing, nullptr)
+      << "RPC factory for protocol: " << protocol << " already registered";
+  fns_.insert(std::pair<const string&, RPCFactoryFn>(protocol, factory_fn));
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/util/rpc/rpc_factory_registry.h b/tensorflow/core/util/rpc/rpc_factory_registry.h
new file mode 100644
index 0000000000000000000000000000000000000000..2635a4012e8f243c8d4334ad3477e184e8cd53a2
--- /dev/null
+++ b/tensorflow/core/util/rpc/rpc_factory_registry.h
@@ -0,0 +1,72 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_UTIL_RPC_RPC_FACTORY_REGISTRY_H_
+#define TENSORFLOW_CORE_UTIL_RPC_RPC_FACTORY_REGISTRY_H_
+
+#include <map>
+#include <string>
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/util/rpc/rpc_factory.h"
+
+namespace tensorflow {
+
+class RPCFactoryRegistry {
+ public:
+  typedef std::function<RPCFactory*(OpKernelConstruction* ctx, bool fail_fast,
+                                    int64 timeout_in_ms)>
+      RPCFactoryFn;
+
+  // Returns a pointer to a global RPCFactoryRegistry object.
+  static RPCFactoryRegistry* Global();
+
+  // Returns a pointer to an function that creates an RPC factory for the given
+  // protocol.
+  RPCFactoryFn* Get(const string& protocol);
+
+  // Registers a function that creates and RPC factory for the given protocol.
+  // The function should transfer the ownership of the factory to its caller.
+  void Register(const string& protocol, const RPCFactoryFn& factory_fn);
+
+ private:
+  std::map<string, RPCFactoryFn> fns_;
+};
+
+namespace rpc_factory_registration {
+
+class RPCFactoryRegistration {
+ public:
+  RPCFactoryRegistration(const string& protocol,
+                         const RPCFactoryRegistry::RPCFactoryFn& factory_fn) {
+    RPCFactoryRegistry::Global()->Register(protocol, factory_fn);
+  }
+};
+
+}  // namespace rpc_factory_registration
+
+#define REGISTER_RPC_FACTORY(protocol, factory_fn) \
+  REGISTER_RPC_FACTORY_UNIQ_HELPER(__COUNTER__, protocol, factory_fn)
+
+#define REGISTER_RPC_FACTORY_UNIQ_HELPER(ctr, protocol, factory_fn) \
+  REGISTER_RPC_FACTORY_UNIQ(ctr, protocol, factory_fn)
+
+#define REGISTER_RPC_FACTORY_UNIQ(ctr, protocol, factory_fn) \
+  static rpc_factory_registration::RPCFactoryRegistration    \
+      rpc_factory_registration_fn_##ctr(protocol, factory_fn)
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_UTIL_RPC_RPC_FACTORY_REGISTRY_H_
diff --git a/tensorflow/core/util/rpc/rpc_factory_registry_test.cc b/tensorflow/core/util/rpc/rpc_factory_registry_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..cfd0f95016ed344924c9366bf43ff0ccb47e548c
--- /dev/null
+++ b/tensorflow/core/util/rpc/rpc_factory_registry_test.cc
@@ -0,0 +1,41 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/util/rpc/rpc_factory_registry.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace {
+
+struct Value {
+  static RPCFactory* Function(OpKernelConstruction* ctx, bool fail_fast,
+                              int64 timeout_in_ms) {
+    return nullptr;
+  }
+};
+
+REGISTER_RPC_FACTORY("TEST FACTORY 1", Value::Function);
+REGISTER_RPC_FACTORY("TEST FACTORY 2", Value::Function);
+}  // namespace
+
+TEST(RPCFactoryRegistryTest, TestBasic) {
+  EXPECT_EQ(RPCFactoryRegistry::Global()->Get("NON-EXISTENT"), nullptr);
+  auto factory1 = RPCFactoryRegistry::Global()->Get("TEST FACTORY 1");
+  EXPECT_NE(factory1, nullptr);
+  auto factory2 = RPCFactoryRegistry::Global()->Get("TEST FACTORY 2");
+  EXPECT_NE(factory2, nullptr);
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/util/tensor_bundle/BUILD b/tensorflow/core/util/tensor_bundle/BUILD
index 166bd0f659dae3124faac6d71d69cbcd41c15b48..648358606c130b7ed64a56739be0b37884f585d5 100644
--- a/tensorflow/core/util/tensor_bundle/BUILD
+++ b/tensorflow/core/util/tensor_bundle/BUILD
@@ -75,18 +75,3 @@ tf_cc_test(
         "//tensorflow/core:test_main",
     ],
 )
-
-# -----------------------------------------------------------------------------
-# Google-internal targets.  These must be at the end for syncrepo.
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/core/util/tensor_bundle/tensor_bundle_test.cc b/tensorflow/core/util/tensor_bundle/tensor_bundle_test.cc
index 08f1aa7125bc47421e0db24a9db6f6e2b2f1e365..7f166f0ec0aeee78738648060ef7782827918cd8 100644
--- a/tensorflow/core/util/tensor_bundle/tensor_bundle_test.cc
+++ b/tensorflow/core/util/tensor_bundle/tensor_bundle_test.cc
@@ -26,6 +26,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/io/path.h"
 #include "tensorflow/core/lib/io/table_builder.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
@@ -293,7 +294,7 @@ void VersionTest(const VersionDef& version, StringPiece expected_error) {
   BundleReader reader(Env::Default(), path);
   EXPECT_TRUE(errors::IsInvalidArgument(reader.status()));
   EXPECT_TRUE(
-      StringPiece(reader.status().error_message()).starts_with(expected_error));
+      str_util::StartsWith(reader.status().error_message(), expected_error));
 }
 
 }  // namespace
@@ -588,7 +589,7 @@ TEST(TensorBundleTest, Error) {
     TF_EXPECT_OK(writer.Add("foo", Constant_2x3(1.f)));
     EXPECT_FALSE(writer.Add("foo", Constant_2x3(2.f)).ok());
     EXPECT_TRUE(
-        StringPiece(writer.status().ToString()).contains("duplicate key"));
+        str_util::StrContains(writer.status().ToString(), "duplicate key"));
     EXPECT_FALSE(writer.Finish().ok());
   }
   {  // Double finish
@@ -598,7 +599,7 @@ TEST(TensorBundleTest, Error) {
   }
   {  // Not found.
     BundleReader reader(Env::Default(), Prefix("nonexist"));
-    EXPECT_TRUE(StringPiece(reader.status().ToString()).contains("Not found"));
+    EXPECT_TRUE(str_util::StrContains(reader.status().ToString(), "Not found"));
   }
 }
 
@@ -629,7 +630,7 @@ TEST(TensorBundleTest, Checksum) {
     BundleReader reader(Env::Default(), Prefix(prefix));
     Status status = reader.Lookup(key, &val);
     EXPECT_TRUE(errors::IsDataLoss(status));
-    EXPECT_TRUE(StringPiece(status.ToString()).contains(expected_msg));
+    EXPECT_TRUE(str_util::StrContains(status.ToString(), expected_msg));
   };
 
   // Corrupts a float tensor.
@@ -680,8 +681,8 @@ TEST(TensorBundleTest, Endianness) {
 
   BundleReader reader(Env::Default(), Prefix("end"));
   EXPECT_TRUE(errors::IsUnimplemented(reader.status()));
-  EXPECT_TRUE(StringPiece(reader.status().ToString())
-                  .contains("different endianness from the reader"));
+  EXPECT_TRUE(str_util::StrContains(reader.status().ToString(),
+                                    "different endianness from the reader"));
 }
 
 TEST(TensorBundleTest, TruncatedTensorContents) {
diff --git a/tensorflow/core/util/tensor_slice_reader_test.cc b/tensorflow/core/util/tensor_slice_reader_test.cc
index 010cc36823b739a6209b9f56fd883cf6d6abc6d7..3c9590e488d1895fcc5c630f846c0fb63aea12f5 100644
--- a/tensorflow/core/util/tensor_slice_reader_test.cc
+++ b/tensorflow/core/util/tensor_slice_reader_test.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/logging.h"
@@ -422,7 +423,7 @@ static void VersionTest(const VersionDef& versions, const string& error) {
   // Read it back in and verify that we get the expected error
   TensorSliceReader reader(path, OpenTableTensorSliceReader);
   EXPECT_TRUE(reader.status().code() == error::INVALID_ARGUMENT &&
-              StringPiece(reader.status().error_message()).starts_with(error))
+              str_util::StartsWith(reader.status().error_message(), error))
       << "Expected error starting with '" << errors::InvalidArgument(error)
       << "', got '" << reader.status() << "'";
 }
diff --git a/tensorflow/core/util/tensor_slice_writer_test.cc b/tensorflow/core/util/tensor_slice_writer_test.cc
index ff5bfd65aef360cd89908a94bee7d8bb052f1905..31397f11b66ce7b2a64fa7f5e508a801a1d47969 100644
--- a/tensorflow/core/util/tensor_slice_writer_test.cc
+++ b/tensorflow/core/util/tensor_slice_writer_test.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/platform/test.h"
@@ -333,8 +334,8 @@ TEST(TensorSliceWriteTest, SizeErrors) {
     const std::vector<int8> data(300000000, -1);
     Status s = writer.Add("test1", shape, slice, data.data());
     EXPECT_EQ(s.code(), error::INVALID_ARGUMENT);
-    EXPECT_TRUE(StringPiece(s.error_message())
-                    .contains("Tensor slice is too large to serialize"));
+    EXPECT_TRUE(str_util::StrContains(
+        s.error_message(), "Tensor slice is too large to serialize"));
   }
 
   // Add a large string tensor slice, which will fail.
@@ -344,8 +345,8 @@ TEST(TensorSliceWriteTest, SizeErrors) {
     const std::vector<string> data(256 * 1024, std::string(8192, 'f'));
     Status s = writer.Add("test2", shape, slice, data.data());
     EXPECT_EQ(s.code(), error::INVALID_ARGUMENT);
-    EXPECT_TRUE(StringPiece(s.error_message())
-                    .contains("Tensor slice is too large to serialize"));
+    EXPECT_TRUE(str_util::StrContains(
+        s.error_message(), "Tensor slice is too large to serialize"));
   }
 }
 
diff --git a/tensorflow/docs_src/about/uses.md b/tensorflow/docs_src/about/uses.md
index d646880bd350c42e463680a5c7eb0903f2c0a497..d3db98203e8746b8d824d3ac853dcfbc35ab9d25 100644
--- a/tensorflow/docs_src/about/uses.md
+++ b/tensorflow/docs_src/about/uses.md
@@ -18,9 +18,9 @@ This section describes some of the current uses of the TensorFlow system.
 
 > If you are using TensorFlow for research, for education, or for production
 > usage in some product, we would love to add something about your usage here.
-> Please feel free to email us a brief description of how you're using
-> TensorFlow, or even better, send us a pull request to add an entry to this
-> file.
+> Please feel free to [email us](mailto:usecases@tensorflow.org) a brief
+> description of how you're using TensorFlow, or even better, send us a
+> pull request to add an entry to this file.
 
 * **Deep Speech**
 <ul>
diff --git a/tensorflow/docs_src/api_guides/python/contrib.graph_editor.md b/tensorflow/docs_src/api_guides/python/contrib.graph_editor.md
index de4f126507930331d348cc795bd03b9971778d07..20fe88a799b3e0f6767207eb36d132d4c9e4b220 100644
--- a/tensorflow/docs_src/api_guides/python/contrib.graph_editor.md
+++ b/tensorflow/docs_src/api_guides/python/contrib.graph_editor.md
@@ -61,21 +61,21 @@ A subgraph can be created in several ways:
 
 * using a list of ops:
 
-```python
-my_sgv = ge.sgv(ops)
-```
+  ```python
+  my_sgv = ge.sgv(ops)
+  ```
 
 * from a name scope:
 
-```python
-my_sgv = ge.sgv_scope("foo/bar", graph=tf.get_default_graph())
-```
+  ```python
+  my_sgv = ge.sgv_scope("foo/bar", graph=tf.get_default_graph())
+  ```
 
 * using regular expression:
 
-```python
-my_sgv = ge.sgv("foo/.*/.*read$", graph=tf.get_default_graph())
-```
+  ```python
+  my_sgv = ge.sgv("foo/.*/.*read$", graph=tf.get_default_graph())
+  ```
 
 Note that the Graph Editor is meant to manipulate several graphs at the same
 time, typically during transform or copy operation. For that reason,
diff --git a/tensorflow/docs_src/api_guides/python/io_ops.md b/tensorflow/docs_src/api_guides/python/io_ops.md
index 94cf0de32a2d2ea16d1581e7c42a08b59aa52888..86b4b39409863f09c3669dc6971901f6350377ca 100644
--- a/tensorflow/docs_src/api_guides/python/io_ops.md
+++ b/tensorflow/docs_src/api_guides/python/io_ops.md
@@ -8,7 +8,7 @@ Note: Functions taking `Tensor` arguments can also take anything accepted by
 ## Placeholders
 
 TensorFlow provides a placeholder operation that must be fed with data
-on execution.  For more info, see the section on @{$reading_data#feeding$Feeding data}.
+on execution.  For more info, see the section on @{$reading_data#Feeding$Feeding data}.
 
 *   @{tf.placeholder}
 *   @{tf.placeholder_with_default}
@@ -42,7 +42,7 @@ formats into tensors.
 
 ### Example protocol buffer
 
-TensorFlow's @{$reading_data#standard-tensorflow-format$recommended format for training examples}
+TensorFlow's @{$reading_data#standard_tensorflow_format$recommended format for training examples}
 is serialized `Example` protocol buffers, [described
 here](https://www.tensorflow.org/code/tensorflow/core/example/example.proto).
 They contain `Features`, [described
diff --git a/tensorflow/docs_src/api_guides/python/nn.md b/tensorflow/docs_src/api_guides/python/nn.md
index 8e6fd1cff93332b84f552c18f627ba05dc67103e..8d8daaae19fa3e7863f9fa88393c35a3d95edf87 100644
--- a/tensorflow/docs_src/api_guides/python/nn.md
+++ b/tensorflow/docs_src/api_guides/python/nn.md
@@ -89,7 +89,7 @@ bottom. Note that this is different from existing libraries such as cuDNN and
 Caffe, which explicitly specify the number of padded pixels and always pad the
 same number of pixels on both sides.
 
-For the `'VALID`' scheme, the output height and width are computed as:
+For the `'VALID'` scheme, the output height and width are computed as:
 
     out_height = ceil(float(in_height - filter_height + 1) / float(strides[1]))
     out_width  = ceil(float(in_width - filter_width + 1) / float(strides[2]))
@@ -98,10 +98,10 @@ and no padding is used.
 
 Given the output size and the padding, the output can be computed as
 
-    output[b, i, j, :] =
-        sum_{di, dj} input[b, strides[1] * i + di - pad_top,
-                           strides[2] * j + dj - pad_left, ...] *
-                     filter[di, dj, ...]
+$$    output[b, i, j, :] =
+        sum_{d_i, d_j} input[b, strides[1] * i + d_i - pad_{top},\
+                           strides[2] * j + d_j - pad_{left}, ...] *
+                     filter[d_i, d_j,\ ...]$$
 
 where any value outside the original input image region are considered zero (
 i.e. we pad zero values around the border of the image).
@@ -161,12 +161,12 @@ Morphological operators are non-linear filters used in image processing.
 ](https://en.wikipedia.org/wiki/Dilation_(morphology))
 is the max-sum counterpart of standard sum-product convolution:
 
-    output[b, y, x, c] =
+$$    output[b, y, x, c] =
         max_{dy, dx} input[b,
                            strides[1] * y + rates[1] * dy,
                            strides[2] * x + rates[2] * dx,
                            c] +
-                     filter[dy, dx, c]
+                     filter[dy, dx, c]$$
 
 The `filter` is usually called structuring function. Max-pooling is a special
 case of greyscale morphological dilation when the filter assumes all-zero
@@ -176,12 +176,12 @@ values (a.k.a. flat structuring function).
 ](https://en.wikipedia.org/wiki/Erosion_(morphology))
 is the min-sum counterpart of standard sum-product convolution:
 
-    output[b, y, x, c] =
+$$    output[b, y, x, c] =
         min_{dy, dx} input[b,
                            strides[1] * y - rates[1] * dy,
                            strides[2] * x - rates[2] * dx,
                            c] -
-                     filter[dy, dx, c]
+                     filter[dy, dx, c]$$
 
 Dilation and erosion are dual to each other. The dilation of the input signal
 `f` by the structuring signal `g` is equal to the negation of the erosion of
diff --git a/tensorflow/docs_src/community/contributing.md b/tensorflow/docs_src/community/contributing.md
new file mode 100644
index 0000000000000000000000000000000000000000..afbb8bbdd0fd25f1e4fa607ac6b4f74e4cc37c0c
--- /dev/null
+++ b/tensorflow/docs_src/community/contributing.md
@@ -0,0 +1,49 @@
+# Contributing to TensorFlow
+
+TensorFlow is an open-source project, and we welcome your participation
+and contribution. This page describes how to get involved.
+
+## Repositories
+
+The code for TensorFlow is hosted in the [TensorFlow GitHub
+organization](https://github.com/tensorflow). Multiple projects are located
+inside the organization, including:
+
+* [TensorFlow](https://github.com/tensorflow/tensorflow)
+* [Models](https://github.com/tensorflow/models)
+* [TensorBoard](https://github.com/tensorflow/tensorboard)
+* [TensorFlow.js](https://github.com/tensorflow/tfjs)
+* [TensorFlow Serving](https://github.com/tensorflow/serving)
+* [TensorFlow Documentation](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/docs_src)
+
+## Contributor checklist
+
+* Before contributing to TensorFlow source code, please review the [contribution
+guidelines](https://github.com/tensorflow/tensorflow/blob/master/CONTRIBUTING.md).
+
+* Join the
+[developers@tensorflow.org](https://groups.google.com/a/tensorflow.org/d/forum/developers)
+mailing list, to coordinate and discuss with others contributing to TensorFlow.
+
+* For coding style conventions, read the @{$style_guide$TensorFlow Style Guide}.
+
+* Finally, review @{$documentation$Writing TensorFlow Documentation}, which
+  explains documentation conventions.
+
+You may also wish to review our guide to @{$benchmarks$defining and running benchmarks}.
+
+## Special Interest Groups
+
+To enable focused collaboration on particular areas of TensorFlow, we host
+Special Interest Groups (SIGs). SIGs do their work in public: if you want to
+join and contribute, review the work of the group, and get in touch with the
+relevant SIG leader.  Membership policies vary on a per-SIG basis.
+
+* **SIG Build** focuses on issues surrounding building, packaging, and
+  distribution of TensorFlow. [Mailing list](https://groups.google.com/a/tensorflow.org/d/forum/build).
+
+* **SIG TensorBoard** furthers the development and direction of TensorBoard and its plugins.
+  [Mailing list](https://groups.google.com/a/tensorflow.org/d/forum/sig-tensorboard).
+
+* **SIG Rust** collaborates on the development of TensorFlow's Rust bindings.
+  [Mailing list](https://groups.google.com/a/tensorflow.org/d/forum/rust).
diff --git a/tensorflow/docs_src/community/documentation.md b/tensorflow/docs_src/community/documentation.md
index 6f2107ef4086f863e113dbffdebbb4fcbb6c7a99..d5bc7a5a7a96fc157a79c1097ad9520cdd36e0e0 100644
--- a/tensorflow/docs_src/community/documentation.md
+++ b/tensorflow/docs_src/community/documentation.md
@@ -148,19 +148,7 @@ viewing. Do not include url parameters in the source code URL.
 Before building the documentation, you must first set up your environment by
 doing the following:
 
-1. If pip isn't installed on your machine, install it now by issuing the
-following command:
-
-        $ sudo easy_install pip
-
-2. Use pip to install codegen, mock, and pandas by issuing the following
-   command (Note: If you are using
-   a [virtualenv](https://virtualenv.pypa.io/en/stable/) to manage your
-   dependencies, you may not want to use sudo for these installations):
-
-        $ sudo pip install codegen mock pandas
-
-3. If bazel is not installed on your machine, install it now. If you are on
+1. If bazel is not installed on your machine, install it now. If you are on
    Linux, install bazel by issuing the following command:
 
         $ sudo apt-get install bazel  # Linux
@@ -168,10 +156,10 @@ following command:
     If you are on Mac OS, find bazel installation instructions on
     [this page](https://bazel.build/versions/master/docs/install.html#mac-os-x).
 
-4. Change directory to the top-level `tensorflow` directory of the TensorFlow
+2. Change directory to the top-level `tensorflow` directory of the TensorFlow
    source code.
 
-5. Run the `configure` script and answer its prompts appropriately for your
+3. Run the `configure` script and answer its prompts appropriately for your
    system.
 
         $ ./configure
diff --git a/tensorflow/docs_src/community/groups.md b/tensorflow/docs_src/community/groups.md
new file mode 100644
index 0000000000000000000000000000000000000000..d92f5775fafa394d795dd451077a721a2ecbb259
--- /dev/null
+++ b/tensorflow/docs_src/community/groups.md
@@ -0,0 +1,17 @@
+# User Groups
+
+TensorFlow has communities around the world.
+
+## Asia
+
+* [TensorFlow Korea (TF-KR) User Group](https://www.facebook.com/groups/TensorFlowKR/) _(Korean language)_
+* [TensorFlow User Group Tokyo](https://tfug-tokyo.connpass.com/) _(Japanese Language)_
+* [Soleil Data Dojo](https://soleildatadojo.connpass.com/) _(Japanese language)_
+* [TensorFlow User Group Utsunomiya](https://tfug-utsunomiya.connpass.com/)
+
+
+## Europe
+
+* [TensorFlow Barcelona](https://www.meetup.com/Barcelona-Machine-Learning-Meetup/)
+* [TensorFlow Madrid](https://www.meetup.com/TensorFlow-Madrid/)
+
diff --git a/tensorflow/docs_src/community/index.md b/tensorflow/docs_src/community/index.md
index ebeff8493ba656fcd77fa8a1d666009a258e97e1..eec2e51a8706b73abcedb8329df3ad03e3b349c3 100644
--- a/tensorflow/docs_src/community/index.md
+++ b/tensorflow/docs_src/community/index.md
@@ -1,18 +1,85 @@
 # Community
 
-This section contains the following documents:
-
-  * @{$welcome$Welcome to the TensorFlow Community}, which explains how
-    you can get involved, where to report issues, and where to join
-    like-minded TensorFlow enthusiasts online.
-  * @{$roadmap$Roadmap}, which summarizes upcoming additions to TensorFlow.
-  * @{$documentation$Writing TensorFlow Documentation}, which explains
-    TensorFlow's documentation conventions.  If you are modifying
-    TensorFlow source code or documentation, please read this guide.
-  * @{$style_guide$TensorFlow Style Guide}, which identifies coding style
-    conventions that TensorFlow developers and users should follow.
-  * @{$community/benchmarks$Benchmarks}, Benchmarks, a guide for defining and
-    running a TensorFlow benchmark.
-  * @{$security$Using TensorFlow Securely}, which explains TensorFlow's security
-    model, a list of recent security reports, and information on how you can
-    report a security vulnerability to the TensorFlow team.
+Welcome to the TensorFlow community! This page explains where to get help, and
+different ways to be part of the community. We are committed to fostering an
+open and welcoming environment, and request that you review our [code of
+conduct](https://github.com/tensorflow/tensorflow/blob/master/CODE_OF_CONDUCT.md).
+
+## Get Help
+
+### Technical Questions
+
+To ask or answer technical questions about TensorFlow, use [Stack
+Overflow](https://stackoverflow.com/questions/tagged/tensorflow). For example,
+ask or search about a particular error message you encountered during
+installation.
+
+### Bugs and Feature Requests
+
+To report bugs or make feature requests, file an issue on GitHub. Please choose
+the appropriate repository for the project. Major repositories include:
+
+  * [TensorFlow](https://github.com/tensorflow/tensorflow/issues)
+  * [TensorBoard](https://github.com/tensorflow/tensorboard/issues)
+  * [TensorFlow models](https://github.com/tensorflow/models/issues)
+  
+### Security
+
+Before using TensorFlow, please take a look at our security model, list of
+recent security announcements, and ways you can report security issues to the
+TensorFlow team at the
+[Using TensorFlow Securely](https://github.com/tensorflow/tensorflow/blob/master/SECURITY.md) page on GitHub.
+
+## Stay Informed
+
+### Announcements Mailing List
+
+All major releases and important announcements are sent to
+[announce@tensorflow.org](https://groups.google.com/a/tensorflow.org/forum/#!forum/announce).
+We recommend that you join this list if you depend on TensorFlow in any way.
+
+### Development Roadmap
+
+The @{$roadmap$Roadmap} summarizes plans for upcoming additions to TensorFlow.
+
+### Social Media
+
+For news and updates from around the universe of TensorFlow projects, follow
+[@tensorflow](https://twitter.com/tensorflow) on Twitter.
+
+### Blog
+
+We post regularly to the [TensorFlow Blog](http://blog.tensorflow.org/),
+with content from the TensorFlow team and the best articles from the community.
+
+### YouTube
+
+Our [YouTube Channel](http://youtube.com/tensorflow/) focuses on machine learing
+and AI with TensorFlow. On it we have a number of new shows, including:
+
+- TensorFlow Meets: meet with community contributors to learn and share what they're doing
+- Ask TensorFlow: the team answers the best questions tagged #AskTensorFlow from social media 
+- Coding TensorFlow: short bites with tips for success with TensorFlow
+
+## Community Support
+
+### Mailing Lists
+
+For general discussion about TensorFlow development and direction, please join
+the [TensorFlow discuss mailing
+list](https://groups.google.com/a/tensorflow.org/d/forum/discuss).
+
+A number of other mailing lists exist, focused on different project areas, which
+can be found at @{$lists$TensorFlow Mailing Lists}.
+
+### User Groups
+
+To meet with like-minded people local to you, check out the many
+@{$groups$TensorFlow user groups} around the world.
+
+
+## Contributing To TensorFlow
+
+We welcome contributions and collaboration on TensorFlow. For more information,
+please read [Contributing to TensorFlow](contributing.md).
+
diff --git a/tensorflow/docs_src/community/leftnav_files b/tensorflow/docs_src/community/leftnav_files
index af344506c75850d788be718fd95ec32a919660f9..2bae60d9ddc5c18f67b1611054ac58b072e9674a 100644
--- a/tensorflow/docs_src/community/leftnav_files
+++ b/tensorflow/docs_src/community/leftnav_files
@@ -1,7 +1,9 @@
 index.md
-welcome.md
 roadmap.md
+contributing.md
+lists.md
+groups.md
 documentation.md
 style_guide.md
 benchmarks.md
-security.md
+swift.md
diff --git a/tensorflow/docs_src/community/lists.md b/tensorflow/docs_src/community/lists.md
new file mode 100644
index 0000000000000000000000000000000000000000..7450ab36c436538dd584541fb0dafb5a2c6067b3
--- /dev/null
+++ b/tensorflow/docs_src/community/lists.md
@@ -0,0 +1,51 @@
+# Mailing Lists
+
+As a community, we do much of our collaboration on public mailing lists.
+Please note that if you're looking for help using TensorFlow, [Stack
+Overflow](https://stackoverflow.com/questions/tagged/tensorflow) and
+[GitHub issues](https://github.com/tensorflow/tensorflow/issues)
+are the best initial places to look. For more information,
+see [how to get help](/community/#get_help).
+
+## General TensorFlow lists
+
+* [announce](https://groups.google.com/a/tensorflow.org/d/forum/announce) - Low-volume announcements of new releases.
+* [discuss](https://groups.google.com/a/tensorflow.org/d/forum/discuss) - General community discussion around TensorFlow.
+* [developers](https://groups.google.com/a/tensorflow.org/d/forum/developers) - Discussion for developers contributing to TensorFlow.
+
+## Project-specific lists
+
+These projects inside the TensorFlow GitHub organization have lists dedicated to their communities:
+
+* [hub](https://groups.google.com/a/tensorflow.org/d/forum/hub) -
+  Discussion and collaboration around [TensorFlow Hub](https://github.com/tensorflow/hub).
+* [magenta-discuss](https://groups.google.com/a/tensorflow.org/d/forum/magenta-discuss) -
+  General discussion about [Magenta](https://magenta.tensorflow.org/)
+  development and directions.
+* [swift](https://groups.google.com/a/tensorflow.org/d/forum/swift) -
+  Community and collaboration around Swift for TensorFlow.
+* [tensor2tensor](https://groups.google.com/d/forum/tensor2tensor) - Discussion
+  and peer support for Tensor2Tensor.
+* [tfjs-announce](https://groups.google.com/a/tensorflow.org/d/forum/tfjs-announce) -
+  Announcements of new TensorFlow.js releases.
+* [tfjs](https://groups.google.com/a/tensorflow.org/d/forum/tfjs) - Discussion
+  and peer support for TensorFlow.js.
+* [tflite](https://groups.google.com/a/tensorflow.org/d/forum/tflite) - Discussion and
+  peer support for TensorFlow Lite.
+* [tpu-users](https://groups.google.com/a/tensorflow.org/d/forum/tpu-users) - Community discussion
+  and support for TPU users.
+
+## Special Interest Groups
+
+TensorFlow's [Special Interest
+Groups](/community/contributing#special_interest_groups) (SIGs) support
+community collaboration on particular project focuses. Members of these groups
+work together to build and support TensorFlow related projects. While their
+archives are public, different SIGs have their own membership policies.
+
+* [build](https://groups.google.com/a/tensorflow.org/d/forum/build) -
+  Supporting SIG Build, for build, distribution and packaging of TensorFlow.
+* [sig-tensorboard](https://groups.google.com/a/tensorflow.org/d/forum/sig-tensorboard) -
+  Supporting SIG TensorBoard, for plugin development and other contribution.
+* [rust](https://groups.google.com/a/tensorflow.org/d/forum/rust) -
+  Supporting SIG Rust, for the Rust language bindings.
diff --git a/tensorflow/docs_src/community/swift.md b/tensorflow/docs_src/community/swift.md
new file mode 100644
index 0000000000000000000000000000000000000000..54d9960b23405bb1a384477b62fd161eb4b06faa
--- /dev/null
+++ b/tensorflow/docs_src/community/swift.md
@@ -0,0 +1,35 @@
+# Swift Community
+
+Welcome to the Swift for TensorFlow development community!
+
+Swift for TensorFlow is a result of first-principles thinking applied to machine
+learning frameworks, and works quite differently than existing TensorFlow
+language bindings.  Whereas prior solutions are designed within the constraints
+of what can be achieved by a (typically Python or Lua) library, Swift for
+TensorFlow is based on the belief that machine learning is important enough to
+deserve first-class language and compiler support.
+
+First-class language and compiler support allows us to innovate in areas that
+have traditionally been out of bounds for machine learning libraries.  Our
+results provide the performance of TensorFlow graphs with the ease of use of
+define-by-run models, and provides a great user experience - for example, by
+catching more mistakes before you run your code.
+
+## Open Source
+
+As announced at the TensorFlow Developer Summit, we are planning to launch our
+open source project on GitHub in April.  In addition to releasing the code, we
+will be using an open design model, where design discussions happen in public.
+
+Between now and then, we are writing some technical white papers that explain in
+detail the design approach (e.g., the core compiler partitioning technique that
+underlies the whole thing, our approach to automatic differentiation, etc.),
+implementation tradeoffs, and the status of this work.  We can’t wait to engage
+with the broader community, but prefer to start the conversation when these
+white papers are ready.
+
+[Sign up here to join the community Google
+group](https://groups.google.com/a/tensorflow.org/d/forum/swift). We will
+initially use it for announcements, and then open it for general discussion when
+we are ready in April.
+
diff --git a/tensorflow/docs_src/community/welcome.md b/tensorflow/docs_src/community/welcome.md
deleted file mode 100644
index 6d0458e678b5507fc722e2c3848e84ca2168e1e3..0000000000000000000000000000000000000000
--- a/tensorflow/docs_src/community/welcome.md
+++ /dev/null
@@ -1,71 +0,0 @@
-# Welcome to the TensorFlow Community
-
-TensorFlow is an open-source project.  This page explains how to contribute,
-where to ask questions, and how to help each other.
-
-
-## Development
-
-The source code for TensorFlow is on
-[GitHub](https://github.com/tensorflow/tensorflow).
-
-Before contributing to TensorFlow source code, please review the
-[Contribution guidelines](https://github.com/tensorflow/tensorflow/blob/master/CONTRIBUTING.md).
-
-### Projects developed by the TensorFlow community
-
-The TensorFlow community has created many great projects around TensorFlow, including:
-
-* [Machine Learning with TensorFlow (Book & Code)](http://tensorflowbook.com)
-* [@jtoy's awesome "Awesome TensorFlow" list of awesome things](https://github.com/jtoy/awesome-tensorflow)
-* [TensorFlow tutorials](https://github.com/pkmital/tensorflow_tutorials)
-* [Caffe to TensorFlow model converter](https://github.com/ethereon/caffe-tensorflow)
-* [Bitfusion's` GPU-enabled AWS EC2 TensorFlow AMI](https://github.com/bitfusionio/amis/tree/master/awsmrkt-bfboost-ubuntu14-cuda75-tensorflow) ([Launch AMI](https://aws.amazon.com/marketplace/pp/B01EYKBEQ0))
-* [Rust language bindings](https://github.com/google/tensorflow-rust)
-* [Operator Vectorization Library](https://github.com/opveclib/opveclib)
-* [Swift language bindings](https://github.com/PerfectlySoft/Perfect-TensorFlow)
-* [Sublime Tensorflow - A plugin for Sublime Text](https://github.com/baptisteArnaud/Sublime-Tensorflow)
-* [Edward - A library for probabilistic modeling, inference, and criticism](http://edwardlib.org) ([Github](https://github.com/blei-lab/edward), [Forum](https://discourse.edwardlib.org))
-* [GPflow - Gaussian processes in TensorFlow](https://github.com/GPflow/GPflow)
-* [CS 20SI: Tensorflow for Deep Learning Research](https://web.stanford.edu/class/cs20si/) - Please note, this course was designed with TensorFlow v0.12, so some of the notes may be out of date - but it's still a great resource.
-
-## TensorFlow Communities Around the World
-
-Asia:
-
-* [TensorFlow Korea (TF-KR) User Group](https://www.facebook.com/groups/TensorFlowKR/) _(Korean language)_
-* [TensorFlow User Group Tokyo](https://tfug-tokyo.connpass.com/) _(Japanese Language)_
-* [Soleil Data Dojo](https://soleildatadojo.connpass.com/) _(Japanese language)_
-* [TensorFlow User Group Utsunomiya](https://tfug-utsunomiya.connpass.com/)
-
-
-Europe:
-
-* [TensorFlow Barcelona](https://www.meetup.com/Barcelona-Machine-Learning-Meetup/)
-* [TensorFlow Madrid](https://www.meetup.com/TensorFlow-Madrid/)
-
-
-
-## Support
-
-TensorFlow provides multiple communication paths.  To pick the right path,
-please read the following list carefully:
-
-  * For new release announcements and security updates, subscribe to
-    [announce@tensorflow.org](https://groups.google.com/a/tensorflow.org/forum/#!forum/announce).
-  * To ask or answer technical questions about TensorFlow, use
-    [Stack Overflow](https://stackoverflow.com/questions/tagged/tensorflow).
-    For example, ask or search Stack Overflow about a particular error message
-    you encountered during installation.
-  * To join general discussions about TensorFlow development and directions,
-    please join the
-    [TensorFlow discuss mailing list](https://groups.google.com/a/tensorflow.org/d/forum/discuss).
-    For example, use this mailing list to learn about new features in
-    upcoming releases of TensorFlow.
-  * To report bugs or make feature requests, use the
-    [TensorFlow issues tracker](https://github.com/tensorflow/tensorflow/issues)
-    on GitHub.  For example, use the issue tracker to request a
-    new operation in TensorFlow.
-  * To report vulnerabilities, please follow our
-    [vulnerability disclosure guidelines](https://github.com/tensorflow/tensorflow/blob/master/SECURITY.md).
-
diff --git a/tensorflow/docs_src/deploy/deploy_to_js.md b/tensorflow/docs_src/deploy/deploy_to_js.md
new file mode 100644
index 0000000000000000000000000000000000000000..d7ce3ea90bda25a84c6dc8ca52e97b1613043c0b
--- /dev/null
+++ b/tensorflow/docs_src/deploy/deploy_to_js.md
@@ -0,0 +1,4 @@
+# Deploy to JavaScript
+
+You can find details about deploying JavaScript TensorFlow programs
+in the separate [js.tensorflow.org site](https://js.tensorflow.org).
diff --git a/tensorflow/docs_src/deploy/leftnav_files b/tensorflow/docs_src/deploy/leftnav_files
index c682e7add16c741279aedb40c1b12f4ca8f0286a..93f5bd1ed20d34eaf7c9ef64ea89e5632331d5c1 100644
--- a/tensorflow/docs_src/deploy/leftnav_files
+++ b/tensorflow/docs_src/deploy/leftnav_files
@@ -2,3 +2,4 @@ index.md
 distributed.md
 hadoop.md
 s3.md
+deploy_to_js.md
diff --git a/tensorflow/docs_src/extend/adding_an_op.md b/tensorflow/docs_src/extend/adding_an_op.md
index 15075e1df8e703415b4acb8e53f76dc9a4a41b50..84da2165b599cda64b5f2ae2ceecfb9ac1934a42 100644
--- a/tensorflow/docs_src/extend/adding_an_op.md
+++ b/tensorflow/docs_src/extend/adding_an_op.md
@@ -530,56 +530,58 @@ form [described below](#attr_types).
 
 For example, if you'd like the `ZeroOut` op to preserve a user-specified index,
 instead of only the 0th element, you can register the op like so:
-<pre class="prettyprint"><code class="lang-cpp">
-REGISTER\_OP("ZeroOut")
-    <b>.Attr("preserve\_index: int")</b>
-    .Input("to\_zero: int32")
+```c++
+REGISTER_OP("ZeroOut")
+    .Attr("preserve_index: int")
+    .Input("to_zero: int32")
     .Output("zeroed: int32");
-</code></pre>
+```
 
 (Note that the set of [attribute types](#attr_types) is different from the
 @{tf.DType$tensor types} used for inputs and outputs.)
 
 Your kernel can then access this attr in its constructor via the `context`
 parameter:
-<pre class="prettyprint"><code class="lang-cpp">
+```c++
 class ZeroOutOp : public OpKernel {
  public:
-  explicit ZeroOutOp(OpKernelConstruction\* context) : OpKernel(context) {<b>
+  explicit ZeroOutOp(OpKernelConstruction* context) : OpKernel(context) {
     // Get the index of the value to preserve
-    OP\_REQUIRES\_OK(context,
-                   context-&gt;GetAttr("preserve\_index", &preserve\_index\_));
-    // Check that preserve\_index is positive
-    OP\_REQUIRES(context, preserve\_index_ &gt;= 0,
-                errors::InvalidArgument("Need preserve\_index &gt;= 0, got ",
-                                        preserve\_index_));
-  </b>}
-  void Compute(OpKernelContext\* context) override {
+    OP_REQUIRES_OK(context,
+                   context->GetAttr("preserve_index", &preserve_index_));
+    // Check that preserve_index is positive
+    OP_REQUIRES(context, preserve_index_ >= 0,
+                errors::InvalidArgument("Need preserve_index >= 0, got ",
+                                        preserve_index_));
+  }
+  void Compute(OpKernelContext* context) override {
     // ...
   }
- <b>private:
-  int preserve\_index\_;</b>
+ private:
+  int preserve_index_;
 };
-</code></pre>
+```
 
 which can then be used in the `Compute` method:
-<pre class="prettyprint"><code class="lang-cpp">
-  void Compute(OpKernelContext\* context) override {
+```c++
+  void Compute(OpKernelContext* context) override {
     // ...
-<br/>
-    <b>// We're using saved attr to validate potentially dynamic input
-    // So we check that preserve\_index is in range
-    OP\_REQUIRES(context, preserve\_index_ &lt; input.dimension(0),
-                errors::InvalidArgument("preserve\_index out of range"));<br/>
-    </b>// Set all the elements of the output tensor to 0
+
+    // We're using saved attr to validate potentially dynamic input
+    // So we check that preserve_index is in range
+    OP_REQUIRES(context, preserve_index_ < input.dimension(0),
+                errors::InvalidArgument("preserve_index out of range"));
+
+    // Set all the elements of the output tensor to 0
     const int N = input.size();
     for (int i = 0; i < N; i++) {
       output\_flat(i) = 0;
-    }<br/>
-    <b>// Preserve the requested input value
-    output\_flat(preserve\_index\_) = input(preserve\_index\_);</b>
+    }
+
+    // Preserve the requested input value
+    output_flat(preserve_index_) = input(preserve_index_);
   }
-</code></pre>
+```
 
 #### Attr types
 
@@ -725,12 +727,12 @@ you would then register an `OpKernel` for each supported type.
 
 For instance, if you'd like the `ZeroOut` op to work on `float`s
 in addition to `int32`s, your op registration might look like:
-<pre class="prettyprint"><code class="lang-cpp">
-REGISTER\_OP("ZeroOut")
-    <b>.Attr("T: {float, int32}")</b>
-    .Input("to\_zero: <b>T</b>")
-    .Output("zeroed: <b>T</b>");
-</code></pre>
+```c++
+REGISTER_OP("ZeroOut")
+    .Attr("T: {float, int32}")
+    .Input("to_zero: T")
+    .Output("zeroed: T");
+```
 
 Your op registration now specifies that the input's type must be `float`, or
 `int32`, and that its output will be the same type, since both have type `T`.
@@ -790,66 +792,73 @@ Your op registration now specifies that the input's type must be `float`, or
 >   """
 > ```
 
-<pre class="prettyprint"><code class="lang-cpp">
-\#include "tensorflow/core/framework/op_kernel.h"<br/>
-class ZeroOut<b>Int32</b>Op : public OpKernel {
+```c++
+#include "tensorflow/core/framework/op_kernel.h"
+
+class ZeroOutInt32Op : public OpKernel {
   // as before
-};<br/>
-class ZeroOut<b>Float</b>Op : public OpKernel {
+};
+
+class ZeroOutFloatOp : public OpKernel {
  public:
-  explicit ZeroOut<b>Float</b>Op(OpKernelConstruction\* context)
-      : OpKernel(context) {}<br/>
-  void Compute(OpKernelContext\* context) override {
+  explicit ZeroOutFloatOp(OpKernelConstruction* context)
+      : OpKernel(context) {}
+
+  void Compute(OpKernelContext* context) override {
     // Grab the input tensor
-    const Tensor& input\_tensor = context-&gt;input(0);
-    auto input = input\_tensor.flat&lt;<b>float</b>&gt;();<br/>
+    const Tensor& input_tensor = context->input(0);
+    auto input = input_tensor.flat<float>();
+
     // Create an output tensor
     Tensor* output = NULL;
-    OP\_REQUIRES\_OK(context,
-                   context-&gt;allocate\_output(0, input_tensor.shape(), &output));
-    auto output\_flat = output-&gt;template flat&lt;<b>float</b>&gt;();<br/>
+    OP_REQUIRES_OK(context,
+                   context->allocate_output(0, input_tensor.shape(), &output));
+    auto output_flat = output->template flat<float>();
+
     // Set all the elements of the output tensor to 0
     const int N = input.size();
-    for (int i = 0; i &lt; N; i++) {
-      output\_flat(i) = 0;
-    }<br/>
+    for (int i = 0; i < N; i++) {
+      output_flat(i) = 0;
+    }
+
     // Preserve the first input value
-    if (N &gt; 0) output\_flat(0) = input(0);
+    if (N > 0) output_flat(0) = input(0);
   }
-};<br/><b>
-// Note that TypeConstraint&lt;int32&gt;("T") means that attr "T" (defined
+};
+
+// Note that TypeConstraint<int32>("T") means that attr "T" (defined
 // in the op registration above) must be "int32" to use this template
-// instantiation.</b>
-REGISTER\_KERNEL\_BUILDER(
+// instantiation.
+REGISTER_KERNEL_BUILDER(
     Name("ZeroOut")
-    .Device(DEVICE\_CPU)
-    <b>.TypeConstraint&lt;int32&gt;("T"),</b>
-    ZeroOutOp<b>Int32</b>);
-<b>REGISTER\_KERNEL\_BUILDER(
+    .Device(DEVICE_CPU)
+    .TypeConstraint<int32>("T"),
+    ZeroOutOpInt32);
+REGISTER_KERNEL_BUILDER(
     Name("ZeroOut")
-    .Device(DEVICE\_CPU)
-    .TypeConstraint&lt;float&gt;("T"),
+    .Device(DEVICE_CPU)
+    .TypeConstraint<float>("T"),
     ZeroOutFloatOp);
-</b></code></pre>
+```
 
 > To preserve [backwards compatibility](#backwards-compatibility), you should
 > specify a [default value](#default-values-constraints) when adding an attr to
 > an existing op:
 >
-> <pre class="prettyprint"><code class="lang-cpp">
-> REGISTER\_OP("ZeroOut")
->   <b>.Attr("T: {float, int32} = DT_INT32")</b>
->   .Input("to\_zero: T")
+> ```c++
+> REGISTER_OP("ZeroOut")
+>   .Attr("T: {float, int32} = DT_INT32")
+>   .Input("to_zero: T")
 >   .Output("zeroed: T")
-> </code></pre>
+> ```
 
 Let's say you wanted to add more types, say `double`:
-<pre class="prettyprint"><code class="lang-cpp">
-REGISTER\_OP("ZeroOut")
-    <b>.Attr("T: {float, <b>double,</b> int32}")</b>
-    .Input("to\_zero: <b>T</b>")
-    .Output("zeroed: <b>T</b>");
-</code></pre>
+```c++
+REGISTER_OP("ZeroOut")
+    .Attr("T: {float, double, int32}")
+    .Input("to_zero: T")
+    .Output("zeroed: T");
+```
 
 Instead of writing another `OpKernel` with redundant code as above, often you
 will be able to use a C++ template instead.  You will still have one kernel
diff --git a/tensorflow/docs_src/extend/index.md b/tensorflow/docs_src/extend/index.md
index bdff60b39ec6fe939273a529ec4e46407cface8a..1ab0340ad983de891ef5e18a729c1e4fb3c4e0d9 100644
--- a/tensorflow/docs_src/extend/index.md
+++ b/tensorflow/docs_src/extend/index.md
@@ -16,9 +16,10 @@ TensorFlow:
     for your own file and record formats.
 
 Python is currently the only language supported by TensorFlow's API stability
-promises.  However, TensorFlow also provides functionality in C++, Java, and Go,
+promises. However, TensorFlow also provides functionality in C++, Go, Java and
+[JavaScript](https://js.tensorflow.org),
 plus community support for [Haskell](https://github.com/tensorflow/haskell) and
-[Rust](https://github.com/tensorflow/rust).  If you'd like to create or
+[Rust](https://github.com/tensorflow/rust). If you'd like to create or
 develop TensorFlow features in a language other than these languages, read the
 following guide:
 
diff --git a/tensorflow/docs_src/extend/language_bindings.md b/tensorflow/docs_src/extend/language_bindings.md
index b9fd72978dd11046e5347b9bce2bddd345ca426b..9a968d365be15e087482c9dcf555b8c128a3e21d 100644
--- a/tensorflow/docs_src/extend/language_bindings.md
+++ b/tensorflow/docs_src/extend/language_bindings.md
@@ -112,11 +112,11 @@ There are a few ways to get a list of the `OpDef`s for the registered ops:
     to interpret the `OpDef` messages.
 -   The C++ function `OpRegistry::Global()->GetRegisteredOps()` returns the same
     list of all registered `OpDef`s (defined in
-    [`tensorflow/core/framework/op.h`]). This can be used to write the generator
+    [`tensorflow/core/framework/op.h`](https://www.tensorflow.org/code/tensorflow/core/framework/op.h)). This can be used to write the generator
     in C++ (particularly useful for languages that do not have protocol buffer
     support).
 -   The ASCII-serialized version of that list is periodically checked in to
-    [`tensorflow/core/ops/ops.pbtxt`] by an automated process.
+    [`tensorflow/core/ops/ops.pbtxt`](https://www.tensorflow.org/code/tensorflow/core/ops/ops.pbtxt) by an automated process.
 
 The `OpDef` specifies the following:
 
@@ -159,7 +159,7 @@ between the generated code and the `OpDef`s checked into the repository, but is
 useful for languages where code is expected to be generated ahead of time like
 `go get` for Go and `cargo ops` for Rust. At the other end of the spectrum, for
 some languages the code could be generated dynamically from
-[`tensorflow/core/ops/ops.pbtxt`].
+[`tensorflow/core/ops/ops.pbtxt`](https://www.tensorflow.org/code/tensorflow/core/ops/ops.pbtxt).
 
 #### Handling Constants
 
@@ -229,6 +229,3 @@ and "while") is not available in languages other than Python. This will be
 updated when the [C API] provides necessary support.
 
 [C API]: https://www.tensorflow.org/code/tensorflow/c/c_api.h
-[`tensorflow/core/ops/ops.pbtxt`]: https://www.tensorflow.org/code/tensorflow/core/ops/ops.pbtxt
-[`tensorflow/python/BUILD`]: https://www.tensorflow.org/code/tensorflow/python/BUILD
-[`tensorflow/core/framework/op.h`]: https://www.tensorflow.org/code/tensorflow/core/framework/op.h
diff --git a/tensorflow/docs_src/extend/new_data_formats.md b/tensorflow/docs_src/extend/new_data_formats.md
index 10e717c280f09c4f1bdfea9d0a2c8d3a00191734..2c33a6b6f7e5f1faf04d38e95b74d184134a1edf 100644
--- a/tensorflow/docs_src/extend/new_data_formats.md
+++ b/tensorflow/docs_src/extend/new_data_formats.md
@@ -1,4 +1,4 @@
-# Custom Data Readers
+# Reading custom file and record formats
 
 PREREQUISITES:
 
@@ -9,187 +9,273 @@ PREREQUISITES:
 
 We divide the task of supporting a file format into two pieces:
 
-*   File formats: We use a *Reader* Op to read a *record* (which can be any
-    string) from a file.
-*   Record formats: We use decoder or parsing Ops to turn a string record
+*   File formats: We use a reader `tf.data.Dataset` to read raw *records* (which
+    are typically represented by scalar string tensors, but can have more
+    structure) from a file.
+*   Record formats: We use decoder or parsing ops to turn a string record
     into tensors usable by TensorFlow.
 
 For example, to read a
 [CSV file](https://en.wikipedia.org/wiki/Comma-separated_values), we use
-@{tf.TextLineReader$a Reader for text files}
-followed by
-@{tf.decode_csv$an Op that parses CSV data from a line of text}.
+@{tf.data.TextLineDataset$a dataset for reading text files line-by-line}
+and then @{tf.data.Dataset.map$map} an
+@{tf.decode_csv$op} that parses CSV data from each line of text in the dataset.
 
 [TOC]
 
-## Writing a Reader for a file format
+## Writing a `Dataset` for a file format
 
-A `Reader` is something that reads records from a file.  There are some examples
-of Reader Ops already built into TensorFlow:
+A @{tf.data.Dataset} represents a sequence of *elements*, which can be the
+individual records in a file. There are several examples of "reader" datasets
+that are already built into TensorFlow:
 
-*   @{tf.TFRecordReader}
-    ([source in `kernels/tf_record_reader_op.cc`](https://www.tensorflow.org/code/tensorflow/core/kernels/tf_record_reader_op.cc))
-*   @{tf.FixedLengthRecordReader}
-    ([source in `kernels/fixed_length_record_reader_op.cc`](https://www.tensorflow.org/code/tensorflow/core/kernels/fixed_length_record_reader_op.cc))
-*   @{tf.TextLineReader}
-    ([source in `kernels/text_line_reader_op.cc`](https://www.tensorflow.org/code/tensorflow/core/kernels/text_line_reader_op.cc))
+*   @{tf.data.TFRecordDataset}
+    ([source in `kernels/data/reader_dataset_ops.cc`](https://www.tensorflow.org/code/tensorflow/core/kernels/data/reader_dataset_ops.cc))
+*   @{tf.data.FixedLengthRecordDataset}
+    ([source in `kernels/data/reader_dataset_ops.cc`](https://www.tensorflow.org/code/tensorflow/core/kernels/data/reader_dataset_ops.cc))
+*   @{tf.data.TextLineDataset}
+    ([source in `kernels/data/reader_dataset_ops.cc`](https://www.tensorflow.org/code/tensorflow/core/kernels/data/reader_dataset_ops.cc))
 
-You can see these all expose the same interface, the only differences
-are in their constructors.  The most important method is `read`.
-It takes a queue argument, which is where it gets filenames to
-read from whenever it needs one (e.g. when the `read` op first runs, or
-the previous `read` reads the last record from a file).  It produces
-two scalar tensors: a string key and a string value.
+Each of these implementations comprises three related classes:
 
-To create a new reader called `SomeReader`, you will need to:
+* A `tensorflow::DatasetOpKernel` subclass (e.g. `TextLineDatasetOp`), which
+  tells TensorFlow how to construct a dataset object from the inputs to and
+  attrs of an op, in its `MakeDataset()` method.
 
-1.  In C++, define a subclass of
-    [`tensorflow::ReaderBase`](https://www.tensorflow.org/code/tensorflow/core/framework/reader_base.h)
-    called `SomeReader`.
-2.  In C++, register a new reader op and kernel with the name `"SomeReader"`.
-3.  In Python, define a subclass of @{tf.ReaderBase} called `SomeReader`.
+* A `tensorflow::GraphDatasetBase` subclass (e.g. `TextLineDatasetOp::Dataset`),
+  which represents the *immutable* definition of the dataset itself, and tells
+  TensorFlow how to construct an iterator object over that dataset, in its
+  `MakeIterator()` method.
 
-You can put all the C++ code in a file in
-`tensorflow/core/user_ops/some_reader_op.cc`. The code to read a file will live
-in a descendant of the C++ `ReaderBase` class, which is defined in
-[`tensorflow/core/kernels/reader_base.h`](https://www.tensorflow.org/code/tensorflow/core/framework/reader_base.h).
-You will need to implement the following methods:
+* A `tensorflow::DatasetIterator<Dataset>` subclass (e.g.
+  `TextLineDatasetOp::Dataset::Iterator`), which represents the *mutable* state
+  of an iterator over a particular dataset, and tells TensorFlow how to get the
+  next element from the iterator, in its `GetNextInternal()` method.
 
-*   `OnWorkStartedLocked`: open the next file
-*   `ReadLocked`: read a record or report EOF/error
-*   `OnWorkFinishedLocked`: close the current file, and
-*   `ResetLocked`: get a clean slate after, e.g., an error
+The most important method is the `GetNextInternal()` method, since it defines
+how to actually read records from the file and represent them as one or more
+`Tensor` objects.
 
-These methods have names ending in "Locked" since `ReaderBase` makes sure
-to acquire a mutex before calling any of these methods, so you generally don't
-have to worry about thread safety (though that only protects the members of the
-class, not global state).
+To create a new reader dataset called (for example) `MyReaderDataset`, you will
+need to:
 
-For `OnWorkStartedLocked`, the name of the file to open is the value returned by
-the `current_work()` method.  `ReadLocked` has this signature:
+1. In C++, define subclasses of `tensorflow::DatasetOpKernel`,
+   `tensorflow::GraphDatasetBase`, and `tensorflow::DatasetIterator<Dataset>`
+   that implement the reading logic.
+2. In C++, register a new reader op and kernel with the name
+   `"MyReaderDataset"`.
+3. In Python, define a subclass of @{tf.data.Dataset} called `MyReaderDataset`.
 
-```c++
-Status ReadLocked(string* key, string* value, bool* produced, bool* at_end)
-```
-
-If `ReadLocked` successfully reads a record from the file, it should fill in:
-
-*   `*key`: with an identifier for the record, that a human could use to find
-    this record again.  You can include the filename from `current_work()`,
-    and append a record number or whatever.
-*   `*value`: with the contents of the record.
-*   `*produced`: set to `true`.
-
-If you hit the end of a file (EOF), set `*at_end` to `true`.  In either case,
-return `Status::OK()`.  If there is an error, simply return it using one of the
-helper functions from
-[`tensorflow/core/lib/core/errors.h`](https://www.tensorflow.org/code/tensorflow/core/lib/core/errors.h)
-without modifying any arguments.
-
-Next you will create the actual Reader op.  It will help if you are familiar
-with @{$adding_an_op$the adding an op how-to}.  The main steps
-are:
-
-*   Registering the op.
-*   Define and register an `OpKernel`.
-
-To register the op, you will use a `REGISTER_OP` call defined in
-[`tensorflow/core/framework/op.h`](https://www.tensorflow.org/code/tensorflow/core/framework/op.h).
-Reader ops never take any input and always have a single output with type
-`resource`.  They should have string `container` and `shared_name` attrs.
-You may optionally define additional attrs
-for configuration or include documentation in a `Doc`.  For examples, see
-[`tensorflow/core/ops/io_ops.cc`](https://www.tensorflow.org/code/tensorflow/core/ops/io_ops.cc),
-e.g.:
+You can put all the C++ code in a single file, such as
+`my_reader_dataset_op.cc`. It will help if you are
+familiar with @{$adding_an_op$the adding an op how-to}. The following skeleton
+can be used as a starting point for your implementation:
 
 ```c++
+#include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/framework/dataset.h"
 #include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/shape_inference.h"
 
-REGISTER_OP("TextLineReader")
-    .Output("reader_handle: resource")
-    .Attr("skip_header_lines: int = 0")
-    .Attr("container: string = ''")
-    .Attr("shared_name: string = ''")
-    .SetIsStateful()
-    .SetShapeFn(shape_inference::ScalarShape)
-    .Doc(R"doc(
-A Reader that outputs the lines of a file delimited by '\n'.
-)doc");
-```
-
-To define an `OpKernel`, Readers can use the shortcut of descending from
-`ReaderOpKernel`, defined in
-[`tensorflow/core/framework/reader_op_kernel.h`](https://www.tensorflow.org/code/tensorflow/core/framework/reader_op_kernel.h),
-and implement a constructor that calls `SetReaderFactory`.  After defining
-your class, you will need to register it using `REGISTER_KERNEL_BUILDER(...)`.
-An example with no attrs:
+namespace tensorflow {
+namespace {
 
-```c++
-#include "tensorflow/core/framework/reader_op_kernel.h"
-
-class TFRecordReaderOp : public ReaderOpKernel {
+class MyReaderDatasetOp : public DatasetOpKernel {
  public:
-  explicit TFRecordReaderOp(OpKernelConstruction* context)
-      : ReaderOpKernel(context) {
-    Env* env = context->env();
-    SetReaderFactory([this, env]() { return new TFRecordReader(name(), env); });
-  }
-};
 
-REGISTER_KERNEL_BUILDER(Name("TFRecordReader").Device(DEVICE_CPU),
-                        TFRecordReaderOp);
-```
+  MyReaderDatasetOp(OpKernelConstruction* ctx) : DatasetOpKernel(ctx) {
+    // Parse and validate any attrs that define the dataset using
+    // `ctx->GetAttr()`, and store them in member variables.
+  }
 
-An example with attrs:
+  void MakeDataset(OpKernelContext* ctx, DatasetBase** output) override {
+    // Parse and validate any input tensors 0that define the dataset using
+    // `ctx->input()` or the utility function
+    // `ParseScalarArgument<T>(ctx, &arg)`.
 
-```c++
-#include "tensorflow/core/framework/reader_op_kernel.h"
-
-class TextLineReaderOp : public ReaderOpKernel {
- public:
-  explicit TextLineReaderOp(OpKernelConstruction* context)
-      : ReaderOpKernel(context) {
-    int skip_header_lines = -1;
-    OP_REQUIRES_OK(context,
-                   context->GetAttr("skip_header_lines", &skip_header_lines));
-    OP_REQUIRES(context, skip_header_lines >= 0,
-                errors::InvalidArgument("skip_header_lines must be >= 0 not ",
-                                        skip_header_lines));
-    Env* env = context->env();
-    SetReaderFactory([this, skip_header_lines, env]() {
-      return new TextLineReader(name(), skip_header_lines, env);
-    });
+    // Create the dataset object, passing any (already-validated) arguments from
+    // attrs or input tensors.
+    *output = new Dataset(ctx);
   }
-};
 
-REGISTER_KERNEL_BUILDER(Name("TextLineReader").Device(DEVICE_CPU),
-                        TextLineReaderOp);
-```
-
-The last step is to add the Python wrapper.  You can either do this by
-@{$adding_an_op#build_the_op_library$compiling a dynamic library}
-or, if you are building TensorFlow from source, adding to `user_ops.py`.
-For the latter, you will import `tensorflow.python.ops.io_ops` in
-[`tensorflow/python/user_ops/user_ops.py`](https://www.tensorflow.org/code/tensorflow/python/user_ops/user_ops.py)
-and add a descendant of [`io_ops.ReaderBase`](https://www.tensorflow.org/code/tensorflow/python/ops/io_ops.py).
+ private:
+  class Dataset : public GraphDatasetBase {
+   public:
+    Dataset(OpKernelContext* ctx) : GraphDatasetBase(ctx) {}
+
+    std::unique_ptr<IteratorBase> MakeIterator(
+        const string& prefix) const override {
+      return std::unique_ptr<IteratorBase>(
+          new Iterator({this, strings::StrCat(prefix, "::MyReader")}));
+    }
+
+    // Record structure: Each record is represented by a scalar string tensor.
+    //
+    // Dataset elements can have a fixed number of components of different
+    // types and shapes; replace the following two methods to customize this
+    // aspect of the dataset.
+    const DataTypeVector& output_dtypes() const override {
+      static DataTypeVector* dtypes = new DataTypeVector({DT_STRING});
+      return *dtypes;
+    }
+    const std::vector<PartialTensorShape>& output_shapes() const override {
+      static std::vector<PartialTensorShape>* shapes =
+          new std::vector<PartialTensorShape>({{}});
+      return *shapes;
+    }
+
+    string DebugString() override { return "MyReaderDatasetOp::Dataset"; }
+
+   protected:
+    // Optional: Implementation of `GraphDef` serialization for this dataset.
+    //
+    // Implement this method if you want to be able to save and restore
+    // instances of this dataset (and any iterators over it).
+    Status AsGraphDefInternal(DatasetGraphDefBuilder* b,
+                              Node** output) const override {
+      // Construct nodes to represent any of the input tensors from this
+      // object's member variables using `b->AddScalar()` and `b->AddVector()`.
+      std::vector<Node*> input_tensors;
+      TF_RETURN_IF_ERROR(b->AddDataset(this, input_tensors, output));
+      return Status::OK();
+    }
+
+   private:
+    class Iterator : public DatasetIterator<Dataset> {
+     public:
+      explicit Iterator(const Params& params)
+          : DatasetIterator<Dataset>(params), i_(0) {}
+
+      // Implementation of the reading logic.
+      //
+      // The example implementation in this file yields the string "MyReader!"
+      // ten times. In general there are three cases:
+      //
+      // 1. If an element is successfully read, store it as one or more tensors
+      //    in `*out_tensors`, set `*end_of_sequence = false` and return
+      //    `Status::OK()`.
+      // 2. If the end of input is reached, set `*end_of_sequence = true` and
+      //    return `Status::OK()`.
+      // 3. If an error occurs, return an error status using one of the helper
+      //    functions from "tensorflow/core/lib/core/errors.h".
+      Status GetNextInternal(IteratorContext* ctx,
+                             std::vector<Tensor>* out_tensors,
+                             bool* end_of_sequence) override {
+        // NOTE: `GetNextInternal()` may be called concurrently, so it is
+        // recommended that you protect the iterator state with a mutex.
+        mutex_lock l(mu_);
+        if (i_ < 10) {
+          // Create a scalar string tensor and add it to the output.
+          Tensor record_tensor(ctx->allocator({}), DT_STRING, {});
+          record_tensor.scalar<string>()() = "MyReader!";
+          out_tensors->emplace_back(std::move(record_tensor));
+          ++i_;
+          *end_of_sequence = false;
+        } else {
+          *end_of_sequence = true;
+        }
+        return Status::OK();
+      }
+
+     protected:
+      // Optional: Implementation of iterator state serialization for this
+      // iterator.
+      //
+      // Implement these two methods if you want to be able to save and restore
+      // instances of this iterator.
+      Status SaveInternal(IteratorStateWriter* writer) override {
+        mutex_lock l(mu_);
+        TF_RETURN_IF_ERROR(writer->WriteScalar(full_name("i"), i_));
+        return Status::OK();
+      }
+      Status RestoreInternal(IteratorContext* ctx,
+                             IteratorStateReader* reader) override {
+        mutex_lock l(mu_);
+        TF_RETURN_IF_ERROR(reader->ReadScalar(full_name("i"), &i_));
+        return Status::OK();
+      }
+
+     private:
+      mutex mu_;
+      int64 i_ GUARDED_BY(mu_);
+    };
+  };
+};
 
-```python
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import common_shapes
-from tensorflow.python.ops import io_ops
+// Register the op definition for MyReaderDataset.
+//
+// Dataset ops always have a single output, of type `variant`, which represents
+// the constructed `Dataset` object.
+//
+// Add any attrs and input tensors that define the dataset here.
+REGISTER_OP("MyReaderDataset")
+    .Output("handle: variant")
+    .SetIsStateful()
+    .SetShapeFn(shape_inference::ScalarShape);
 
-class SomeReader(io_ops.ReaderBase):
+// Register the kernel implementation for MyReaderDataset.
+REGISTER_KERNEL_BUILDER(Name("MyReaderDataset").Device(DEVICE_CPU),
+                        MyReaderDatasetOp);
 
-    def __init__(self, name=None):
-        rr = gen_user_ops.some_reader(name=name)
-        super(SomeReader, self).__init__(rr)
+}  // namespace
+}  // namespace tensorflow
+```
 
+The last step is to build the C++ code and add a Python wrapper. The easiest way
+to do this is by @{$adding_an_op#build_the_op_library$compiling a dynamic
+library} (e.g. called `"my_reader_dataset_op.so"`), and adding a Python class
+that subclasses @{tf.data.Dataset} to wrap it. An example Python program is
+given here:
 
-ops.NotDifferentiable("SomeReader")
+```python
+import tensorflow as tf
+
+# Assumes the file is in the current working directory.
+my_reader_dataset_module = tf.load_op_library("./my_reader_dataset_op.so")
+
+class MyReaderDataset(tf.data.Dataset):
+
+  def __init__(self):
+    super(MyReaderDataset, self).__init__()
+    # Create any input attrs or tensors as members of this class.
+
+  def _as_variant_tensor(self):
+    # Actually construct the graph node for the dataset op.
+    #
+    # This method will be invoked when you create an iterator on this dataset
+    # or a dataset derived from it.
+    return my_reader_dataset_module.my_reader_dataset()
+
+  # The following properties define the structure of each element: a scalar
+  # `tf.string` tensor. Change these properties to match the `output_dtypes()`
+  # and `output_shapes()` methods of `MyReaderDataset::Dataset` if you modify
+  # the structure of each element.
+  @property
+  def output_types(self):
+    return tf.string
+
+  @property
+  def output_shapes(self):
+    return tf.TensorShape([])
+
+  @property
+  def output_classes(self):
+    return tf.Tensor
+
+if __name__ == "__main__":
+  # Create a MyReaderDataset and print its elements.
+  with tf.Session() as sess:
+    iterator = MyReaderDataset().make_one_shot_iterator()
+    next_element = iterator.get_next()
+    try:
+      while True:
+        print(sess.run(next_element))  # Prints "MyReader!" ten times.
+    except tf.errors.OutOfRangeError:
+      pass
 ```
 
-You can see some examples in
-[`tensorflow/python/ops/io_ops.py`](https://www.tensorflow.org/code/tensorflow/python/ops/io_ops.py).
+You can see some examples of `Dataset` wrapper classes in
+[`tensorflow/python/data/ops/dataset_ops.py`](https://www.tensorflow.org/code/tensorflow/python/data/ops/dataset_ops.py).
 
 ## Writing an Op for a record format
 
@@ -201,9 +287,7 @@ track down where the bad data came from.
 
 Examples of Ops useful for decoding records:
 
-*   @{tf.parse_single_example}
-    (and
-    @{tf.parse_example})
+*   @{tf.parse_single_example} (and @{tf.parse_example})
 *   @{tf.decode_csv}
 *   @{tf.decode_raw}
 
@@ -211,11 +295,6 @@ Note that it can be useful to use multiple Ops to decode a particular record
 format.  For example, you may have an image saved as a string in
 [a `tf.train.Example` protocol buffer](https://www.tensorflow.org/code/tensorflow/core/example/example.proto).
 Depending on the format of that image, you might take the corresponding output
-from a
-@{tf.parse_single_example}
-op and call @{tf.image.decode_jpeg},
-@{tf.image.decode_png}, or
-@{tf.decode_raw}.  It is common to
-take the output of `tf.decode_raw` and use
-@{tf.slice} and
-@{tf.reshape} to extract pieces.
+from a @{tf.parse_single_example} op and call @{tf.image.decode_jpeg},
+@{tf.image.decode_png}, or @{tf.decode_raw}.  It is common to take the output
+of `tf.decode_raw` and use @{tf.slice} and @{tf.reshape} to extract pieces.
diff --git a/tensorflow/docs_src/get_started/custom_estimators.md b/tensorflow/docs_src/get_started/custom_estimators.md
index 941c3e16905a9062b3081ad0af6bcbc1621a146b..275cda12bc397e1a8a980f6c97e6b2d97c5e64e8 100644
--- a/tensorflow/docs_src/get_started/custom_estimators.md
+++ b/tensorflow/docs_src/get_started/custom_estimators.md
@@ -546,7 +546,7 @@ In brief, here's what the three graphs tell you:
 
 * accuracy: The accuracy is recorded by the following two lines:
 
-    * `eval_metric_ops={'my_accuracy': accuracy})`, during evaluation.
+    * `eval_metric_ops={'my_accuracy': accuracy}`, during evaluation.
     * `tf.summary.scalar('accuracy', accuracy[1])`, during training.
 
 These tensorboard graphs are one of the main reasons it's important to pass a
diff --git a/tensorflow/docs_src/get_started/eager.md b/tensorflow/docs_src/get_started/eager.md
new file mode 100644
index 0000000000000000000000000000000000000000..ad89f0154c06d97673bdb0d598ca5387c61bc6ac
--- /dev/null
+++ b/tensorflow/docs_src/get_started/eager.md
@@ -0,0 +1,3 @@
+# Get Started with Eager Execution
+
+[Colab notebook](https://colab.research.google.com/github/tensorflow/models/blob/master/samples/core/get_started/eager.ipynb)
diff --git a/tensorflow/docs_src/get_started/get_started_for_beginners.md b/tensorflow/docs_src/get_started/get_started_for_beginners.md
index f59cebe6c41934bbb53d6d2a983a52fbb05a3bfc..fbe0ed74f82bb34bc55dd7bab5819c0d9fdc54e9 100644
--- a/tensorflow/docs_src/get_started/get_started_for_beginners.md
+++ b/tensorflow/docs_src/get_started/get_started_for_beginners.md
@@ -1,4 +1,4 @@
-# Getting Started for ML Beginners
+# Get Started with Graph Execution
 
 This document explains how to use machine learning to classify (categorize)
 Iris flowers by species.  This document dives deeply into the TensorFlow
diff --git a/tensorflow/docs_src/get_started/index.md b/tensorflow/docs_src/get_started/index.md
index fb83a770a5534d625ab20bdbdbddab548cedb4a4..b28cb9df75d94a71a7d3a501a84e1cdd8e4c3ecb 100644
--- a/tensorflow/docs_src/get_started/index.md
+++ b/tensorflow/docs_src/get_started/index.md
@@ -1,4 +1,4 @@
-# Getting Started
+# Get Started
 
 If you are new to machine learning, we recommend taking the following online
 course prior to diving into TensorFlow documentation:
@@ -10,8 +10,13 @@ course prior to diving into TensorFlow documentation:
 TensorFlow is a tool for machine learning. While it contains a wide range of
 functionality, TensorFlow is mainly designed for deep neural network models.
 
-TensorFlow provides many APIs. This section focuses on the high-level APIs.
-If you are new to TensorFlow, begin by reading one of the following documents:
+The easiest way to get started with tensorflow is using Eager Execution.
+
+  * @{$get_started/eager}, is for anyone new to  machine learning or TensorFlow.
+
+TensorFlow provides many APIs. The remainder of this section focuses on the
+Estimator API which provide scalable, high-performance models.
+To get started with Estimators begin by reading one of the following documents:
 
   * @{$get_started/get_started_for_beginners}, which is aimed at readers
     new to machine learning.
diff --git a/tensorflow/docs_src/get_started/leftnav_files b/tensorflow/docs_src/get_started/leftnav_files
index 437791d6a32db3e43415e381a034424ae8225f6f..4c12f0d84b3d13e4d9ececcb4559e806486b4120 100644
--- a/tensorflow/docs_src/get_started/leftnav_files
+++ b/tensorflow/docs_src/get_started/leftnav_files
@@ -1,10 +1,14 @@
 index.md
 
-### Getting Started
+### Beginners
+eager.md
 get_started_for_beginners.md
 premade_estimators.md
 
-### Details
+### Estimators
+get_started_for_beginners.md: For Beginners
+premade_estimators.md: Premade Estimators
+>>>
 checkpoints.md
 feature_columns.md
 datasets_quickstart.md
diff --git a/tensorflow/docs_src/get_started/premade_estimators.md b/tensorflow/docs_src/get_started/premade_estimators.md
index e50d2f542037c8537f79a2ae53a2cbb3448243c6..4be7e508f94074f20d07e271259bf77074dd19e3 100644
--- a/tensorflow/docs_src/get_started/premade_estimators.md
+++ b/tensorflow/docs_src/get_started/premade_estimators.md
@@ -1,5 +1,4 @@
-
-# Getting Started with TensorFlow
+# Premade Estimators
 
 This document introduces the TensorFlow programming environment and shows you
 how to solve the Iris classification problem in TensorFlow.
diff --git a/tensorflow/docs_src/install/install_c.md b/tensorflow/docs_src/install/install_c.md
index 9059b3f3b6f5e9fd6b3f7a46512577ad05848ba6..274413e29442d448d989e8574f3c8578ea9da5a0 100644
--- a/tensorflow/docs_src/install/install_c.md
+++ b/tensorflow/docs_src/install/install_c.md
@@ -38,7 +38,7 @@ enable TensorFlow for C:
          OS="linux" # Change to "darwin" for macOS
          TARGET_DIRECTORY="/usr/local"
          curl -L \
-           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-${OS}-x86_64-1.7.0-rc1.tar.gz" |
+           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-${OS}-x86_64-1.7.0.tar.gz" |
            sudo tar -C $TARGET_DIRECTORY -xz
 
      The `tar` command extracts the TensorFlow C library into the `lib`
@@ -113,6 +113,6 @@ If executing `a.out` fails, ask yourself the following questions:
   * Did you export those environment variables?
 
 If you are still seeing build or execution error messages, search (or post to)
-[StackOverflow](www.stackoverflow.com/questions/tagged/tensorflow) for
+[StackOverflow](https://stackoverflow.com/questions/tagged/tensorflow) for
 possible solutions.
 
diff --git a/tensorflow/docs_src/install/install_go.md b/tensorflow/docs_src/install/install_go.md
index 2e47a6d2127ee7e06a2cc0d2d725145edea49b43..1a0956634d6b03585a39164a2492df3fd1b4ffc7 100644
--- a/tensorflow/docs_src/install/install_go.md
+++ b/tensorflow/docs_src/install/install_go.md
@@ -38,7 +38,7 @@ steps to install this library and enable TensorFlow for Go:
          TF_TYPE="cpu" # Change to "gpu" for GPU support
          TARGET_DIRECTORY='/usr/local'
          curl -L \
-           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-$(go env GOOS)-x86_64-1.7.0-rc1.tar.gz" |
+           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-$(go env GOOS)-x86_64-1.7.0.tar.gz" |
          sudo tar -C $TARGET_DIRECTORY -xz
 
      The `tar` command extracts the TensorFlow C library into the `lib`
diff --git a/tensorflow/docs_src/install/install_java.md b/tensorflow/docs_src/install/install_java.md
index eff066d2009c5191402a0e10b2534aa6df12f544..0dcb059793170f70677bd828c23e48e9b5b45a21 100644
--- a/tensorflow/docs_src/install/install_java.md
+++ b/tensorflow/docs_src/install/install_java.md
@@ -36,7 +36,7 @@ following to the project's `pom.xml` to use the TensorFlow Java APIs:
 <dependency>
   <groupId>org.tensorflow</groupId>
   <artifactId>tensorflow</artifactId>
-  <version>1.7.0-rc1</version>
+  <version>1.7.0</version>
 </dependency>
 ```
 
@@ -65,7 +65,7 @@ As an example, these steps will create a Maven project that uses TensorFlow:
                <dependency>
                  <groupId>org.tensorflow</groupId>
                  <artifactId>tensorflow</artifactId>
-                 <version>1.7.0-rc1</version>
+                 <version>1.7.0</version>
                </dependency>
              </dependencies>
          </project>
@@ -93,6 +93,7 @@ As an example, these steps will create a Maven project that uses TensorFlow:
 
               // Execute the "MyConst" operation in a Session.
               try (Session s = new Session(g);
+                   // Generally, there may be multiple output tensors, all of them must be closed to prevent resource leaks.
                    Tensor output = s.runner().fetch("MyConst").run().get(0)) {
                 System.out.println(new String(output.bytesValue(), "UTF-8"));
               }
@@ -123,12 +124,12 @@ instead:
 <dependency>
   <groupId>org.tensorflow</groupId>
   <artifactId>libtensorflow</artifactId>
-  <version>1.7.0-rc1</version>
+  <version>1.7.0</version>
 </dependency>
 <dependency>
   <groupId>org.tensorflow</groupId>
   <artifactId>libtensorflow_jni_gpu</artifactId>
-  <version>1.7.0-rc1</version>
+  <version>1.7.0</version>
 </dependency>
 ```
 
@@ -147,7 +148,7 @@ refer to the simpler instructions above instead.
 Take the following steps to install TensorFlow for Java on Linux or macOS:
 
   1. Download
-     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.7.0-rc1.jar),
+     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.7.0.jar),
      which is the TensorFlow Java Archive (JAR).
 
   2. Decide whether you will run TensorFlow for Java on CPU(s) only or with
@@ -166,7 +167,7 @@ Take the following steps to install TensorFlow for Java on Linux or macOS:
          OS=$(uname -s | tr '[:upper:]' '[:lower:]')
          mkdir -p ./jni
          curl -L \
-           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-${TF_TYPE}-${OS}-x86_64-1.7.0-rc1.tar.gz" |
+           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-${TF_TYPE}-${OS}-x86_64-1.7.0.tar.gz" |
            tar -xz -C ./jni
 
 ### Install on Windows
@@ -174,10 +175,10 @@ Take the following steps to install TensorFlow for Java on Linux or macOS:
 Take the following steps to install TensorFlow for Java on Windows:
 
   1. Download
-     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.7.0-rc1.jar),
+     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.7.0.jar),
      which is the TensorFlow Java Archive (JAR).
   2. Download the following Java Native Interface (JNI) file appropriate for
-     [TensorFlow for Java on Windows](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-cpu-windows-x86_64-1.7.0-rc1.zip).
+     [TensorFlow for Java on Windows](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-cpu-windows-x86_64-1.7.0.zip).
   3. Extract this .zip file.
 
 
@@ -207,6 +208,7 @@ public class HelloTF {
 
       // Execute the "MyConst" operation in a Session.
       try (Session s = new Session(g);
+           // Generally, there may be multiple output tensors, all of them must be closed to prevent resource leaks.
            Tensor output = s.runner().fetch("MyConst").run().get(0)) {
         System.out.println(new String(output.bytesValue(), "UTF-8"));
       }
@@ -225,7 +227,7 @@ must be part of your `classpath`. For example, you can include the
 downloaded `.jar` in your `classpath` by using the `-cp` compilation flag
 as follows:
 
-<pre><b>javac -cp libtensorflow-1.7.0-rc1.jar HelloTF.java</b></pre>
+<pre><b>javac -cp libtensorflow-1.7.0.jar HelloTF.java</b></pre>
 
 
 ### Running
@@ -239,11 +241,11 @@ two files are available to the JVM:
 For example, the following command line executes the `HelloTF` program on Linux
 and macOS X:
 
-<pre><b>java -cp libtensorflow-1.7.0-rc1.jar:. -Djava.library.path=./jni HelloTF</b></pre>
+<pre><b>java -cp libtensorflow-1.7.0.jar:. -Djava.library.path=./jni HelloTF</b></pre>
 
 And the following command line executes the `HelloTF` program on Windows:
 
-<pre><b>java -cp libtensorflow-1.7.0-rc1.jar;. -Djava.library.path=jni HelloTF</b></pre>
+<pre><b>java -cp libtensorflow-1.7.0.jar;. -Djava.library.path=jni HelloTF</b></pre>
 
 If the program prints <tt>Hello from <i>version</i></tt>, you've successfully
 installed TensorFlow for Java and are ready to use the API.  If the program
diff --git a/tensorflow/docs_src/install/install_linux.md b/tensorflow/docs_src/install/install_linux.md
index 27b696696d968dd2f8d6fe7d4a0b9c5d00a9befd..04e4242b0ffd476818f6e5c4522c60111a65e151 100644
--- a/tensorflow/docs_src/install/install_linux.md
+++ b/tensorflow/docs_src/install/install_linux.md
@@ -46,6 +46,35 @@ must be installed on your system:
     a list of supported GPU cards.
   * [GPU drivers](http://nvidia.com/driver) supporting your version of the CUDA
     Toolkit.
+  * The libcupti-dev library, which is the NVIDIA CUDA Profile Tools Interface.
+    This library provides advanced profiling support. To install this library,
+    issue the following command for CUDA Toolkit >= 8.0:
+
+    <pre>
+    $ <b>sudo apt-get install cuda-command-line-tools</b>
+    </pre>
+
+    and add its path to your `LD_LIBRARY_PATH` environment variable:
+
+    <pre>
+    $ <b>export LD_LIBRARY_PATH=${LD_LIBRARY_PATH:+${LD_LIBRARY_PATH}:}/usr/local/cuda/extras/CUPTI/lib64</b>
+    </pre>
+
+    For CUDA Toolkit <= 7.5 do:
+
+    <pre>
+    $ <b>sudo apt-get install libcupti-dev</b>
+    </pre>
+  * **[OPTIONAL]**  For optimized inferencing performance, you can also install
+    NVIDIA TensorRT 3.0. For details, see
+    [NVIDIA's TensorRT documentation](http://docs.nvidia.com/deeplearning/sdk/tensorrt-install-guide/index.html#installing-tar).
+    Only steps 1-4 in the TensorRT Tar File installation instructions are
+    required for compatibility with TensorFlow; the Python package installation
+    in steps 5 and 6 can be omitted. Detailed installation instructions can be found at [package documentataion](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/tensorrt#installing-tensorrt-304)
+
+    **IMPORTANT:** For compatibility with the pre-built `tensorflow-gpu`
+    package, please use the Ubuntu **14.04** tar file package of TensorRT
+    even when installing onto an Ubuntu 16.04 system.   
 
 If you have an earlier version of the preceding packages, please upgrade to
 the specified versions. If upgrading is not possible, then you may still run
@@ -165,7 +194,7 @@ Take the following steps to install TensorFlow with Virtualenv:
      Virtualenv environment:
 
      <pre>(tensorflow)$ <b>pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.7.0rc1-cp34-cp34m-linux_x86_64.whl</b></pre>
+     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.7.0-cp34-cp34m-linux_x86_64.whl</b></pre>
 
 If you encounter installation problems, see
 [Common Installation Problems](#common_installation_problems).
@@ -270,7 +299,7 @@ take the following steps:
 
      <pre>
      $ <b>sudo pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.7.0rc1-cp34-cp34m-linux_x86_64.whl</b>
+     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.7.0-cp34-cp34m-linux_x86_64.whl</b>
      </pre>
 
      If this step fails, see
@@ -456,7 +485,7 @@ Take the following steps to install TensorFlow in an Anaconda environment:
 
      <pre>
      (tensorflow)$ <b>pip install --ignore-installed --upgrade \
-     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.7.0rc1-cp34-cp34m-linux_x86_64.whl</b></pre>
+     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.7.0-cp34-cp34m-linux_x86_64.whl</b></pre>
 
 <a name="ValidateYourInstallation"></a>
 ## Validate your installation
@@ -630,14 +659,14 @@ This section documents the relevant values for Linux installations.
 CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.7.0rc1-cp27-none-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.7.0-cp27-none-linux_x86_64.whl
 </pre>
 
 
 GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.7.0rc1-cp27-none-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.7.0-cp27-none-linux_x86_64.whl
 </pre>
 
 Note that GPU support requires the NVIDIA hardware and software described in
@@ -649,14 +678,14 @@ Note that GPU support requires the NVIDIA hardware and software described in
 CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.7.0rc1-cp34-cp34m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.7.0-cp34-cp34m-linux_x86_64.whl
 </pre>
 
 
 GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.7.0rc1-cp34-cp34m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.7.0-cp34-cp34m-linux_x86_64.whl
 </pre>
 
 Note that GPU support requires the NVIDIA hardware and software described in
@@ -668,14 +697,14 @@ Note that GPU support requires the NVIDIA hardware and software described in
 CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.7.0rc1-cp35-cp35m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.7.0-cp35-cp35m-linux_x86_64.whl
 </pre>
 
 
 GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.7.0rc1-cp35-cp35m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.7.0-cp35-cp35m-linux_x86_64.whl
 </pre>
 
 
@@ -687,14 +716,14 @@ Note that GPU support requires the NVIDIA hardware and software described in
 CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.7.0rc1-cp36-cp36m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.7.0-cp36-cp36m-linux_x86_64.whl
 </pre>
 
 
 GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.7.0rc1-cp36-cp36m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.7.0-cp36-cp36m-linux_x86_64.whl
 </pre>
 
 
diff --git a/tensorflow/docs_src/install/install_mac.md b/tensorflow/docs_src/install/install_mac.md
index 7060ef43da3e978a87250cacf916b4a792274a47..b3e9616a0592c43f457183e53c8e99e55f3f5d94 100644
--- a/tensorflow/docs_src/install/install_mac.md
+++ b/tensorflow/docs_src/install/install_mac.md
@@ -119,7 +119,7 @@ Take the following steps to install TensorFlow with Virtualenv:
      TensorFlow in the active Virtualenv is as follows:
 
      <pre> $ <b>pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.7.0rc1-py3-none-any.whl</b></pre>
+     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.7.0-py3-none-any.whl</b></pre>
 
 If you encounter installation problems, see
 [Common Installation Problems](#common-installation-problems).
@@ -242,7 +242,7 @@ take the following steps:
      issue the following command:
 
      <pre> $ <b>sudo pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.7.0rc1-py3-none-any.whl</b> </pre>
+     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.7.0-py3-none-any.whl</b> </pre>
 
      If the preceding command fails, see
      [installation problems](#common-installation-problems).
@@ -350,7 +350,7 @@ Take the following steps to install TensorFlow in an Anaconda environment:
      TensorFlow for Python 2.7:
 
      <pre> (<i>targetDirectory</i>)$ <b>pip install --ignore-installed --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.7.0rc1-py2-none-any.whl</b></pre>
+     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.7.0-py2-none-any.whl</b></pre>
 
 
 <a name="ValidateYourInstallation"></a>
@@ -524,7 +524,7 @@ The value you specify depends on your Python version.
 
 
 <pre>
-https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.7.0rc1-py2-none-any.whl
+https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.7.0-py2-none-any.whl
 </pre>
 
 
@@ -532,5 +532,5 @@ https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.7.0rc1-py2-none-a
 
 
 <pre>
-https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.7.0rc1-py3-none-any.whl
+https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.7.0-py3-none-any.whl
 </pre>
diff --git a/tensorflow/docs_src/install/install_sources.md b/tensorflow/docs_src/install/install_sources.md
index 148f80efe25f12cfaef9df8a8edfaa700782dacd..7d7c2aa75aeef15d9b400f2bf5dddb083f387a2b 100644
--- a/tensorflow/docs_src/install/install_sources.md
+++ b/tensorflow/docs_src/install/install_sources.md
@@ -350,10 +350,10 @@ Invoke `pip install` to install that pip package.
 The filename of the `.whl` file depends on your platform.
 For example, the following command will install the pip package
 
-for TensorFlow 1.7.0rc1 on Linux:
+for TensorFlow 1.7.0 on Linux:
 
 <pre>
-$ <b>sudo pip install /tmp/tensorflow_pkg/tensorflow-1.7.0rc1-py2-none-any.whl</b>
+$ <b>sudo pip install /tmp/tensorflow_pkg/tensorflow-1.7.0-py2-none-any.whl</b>
 </pre>
 
 ## Validate your installation
@@ -450,8 +450,8 @@ Stack Overflow and specify the `tensorflow` tag.
 **Linux**
 <table>
 <tr><th>Version:</th><th>CPU/GPU:</th><th>Python Version:</th><th>Compiler:</th><th>Build Tools:</th><th>cuDNN:</th><th>CUDA:</th></tr>
-<tr><td>tensorflow-1.7.0rc1</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.10.0</td><td>N/A</td><td>N/A</td></tr>
-<tr><td>tensorflow_gpu-1.7.0rc1</td><td>GPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.9.0</td><td>7</td><td>9</td></tr>
+<tr><td>tensorflow-1.7.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.10.0</td><td>N/A</td><td>N/A</td></tr>
+<tr><td>tensorflow_gpu-1.7.0</td><td>GPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.9.0</td><td>7</td><td>9</td></tr>
 <tr><td>tensorflow-1.6.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.9.0</td><td>N/A</td><td>N/A</td></tr>
 <tr><td>tensorflow_gpu-1.6.0</td><td>GPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.9.0</td><td>7</td><td>9</td></tr>
 <tr><td>tensorflow-1.5.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.8.0</td><td>N/A</td><td>N/A</td></tr>
@@ -471,7 +471,7 @@ Stack Overflow and specify the `tensorflow` tag.
 **Mac**
 <table>
 <tr><th>Version:</th><th>CPU/GPU:</th><th>Python Version:</th><th>Compiler:</th><th>Build Tools:</th><th>cuDNN:</th><th>CUDA:</th></tr>
-<tr><td>tensorflow-1.7.0rc1</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.10.1</td><td>N/A</td><td>N/A</td></tr>
+<tr><td>tensorflow-1.7.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.10.1</td><td>N/A</td><td>N/A</td></tr>
 <tr><td>tensorflow-1.6.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.8.1</td><td>N/A</td><td>N/A</td></tr>
 <tr><td>tensorflow-1.5.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.8.1</td><td>N/A</td><td>N/A</td></tr>
 <tr><td>tensorflow-1.4.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.5.4</td><td>N/A</td><td>N/A</td></tr>
@@ -486,8 +486,8 @@ Stack Overflow and specify the `tensorflow` tag.
 **Windows**
 <table>
 <tr><th>Version:</th><th>CPU/GPU:</th><th>Python Version:</th><th>Compiler:</th><th>Build Tools:</th><th>cuDNN:</th><th>CUDA:</th></tr>
-<tr><td>tensorflow-1.7.0rc1</td><td>CPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>N/A</td><td>N/A</td></tr>
-<tr><td>tensorflow_gpu-1.7.0rc1</td><td>GPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>7</td><td>9</td></tr>
+<tr><td>tensorflow-1.7.0</td><td>CPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>N/A</td><td>N/A</td></tr>
+<tr><td>tensorflow_gpu-1.7.0</td><td>GPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>7</td><td>9</td></tr>
 <tr><td>tensorflow-1.6.0</td><td>CPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>N/A</td><td>N/A</td></tr>
 <tr><td>tensorflow_gpu-1.6.0</td><td>GPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>7</td><td>9</td></tr>
 <tr><td>tensorflow-1.5.0</td><td>CPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>N/A</td><td>N/A</td></tr>
diff --git a/tensorflow/docs_src/javascript/index.md b/tensorflow/docs_src/javascript/index.md
new file mode 100644
index 0000000000000000000000000000000000000000..ad63eeb255d870064567a0de8a28815ce2ae0172
--- /dev/null
+++ b/tensorflow/docs_src/javascript/index.md
@@ -0,0 +1,5 @@
+# JavaScript 
+
+You may develop TensorFlow programs in JavaScript, training and deploying
+models right in your browser.  For details, see
+[js.tensorflow.org](https://js.tensorflow.org).
diff --git a/tensorflow/docs_src/javascript/leftnav_files b/tensorflow/docs_src/javascript/leftnav_files
new file mode 100644
index 0000000000000000000000000000000000000000..fc0ab8a5435943f6442969ec5787305b98c7908b
--- /dev/null
+++ b/tensorflow/docs_src/javascript/leftnav_files
@@ -0,0 +1 @@
+index.md
diff --git a/tensorflow/docs_src/mobile/android_build.md b/tensorflow/docs_src/mobile/android_build.md
index 08a5fbe41c87c88399682208c38bf7a892d8fc1a..c35530061dcaf2a4a894dcdf54fd794907d98162 100644
--- a/tensorflow/docs_src/mobile/android_build.md
+++ b/tensorflow/docs_src/mobile/android_build.md
@@ -51,7 +51,8 @@ If you haven't already, do the following two things:
         // set to 'bazel', 'cmake', 'makefile', 'none'
         def nativeBuildSystem = 'none'
 
-4. Click the Run button (the green arrow) or use **Run -> Run 'android'** from the top menu.
+4. Click the *Run* button (the green arrow) or select *Run > Run 'android'* from the
+    top menu. You may need to rebuild the project using *Build > Rebuild Project*.
 
     If it asks you to use Instant Run, click **Proceed Without Instant Run**.
 
diff --git a/tensorflow/docs_src/mobile/leftnav_files b/tensorflow/docs_src/mobile/leftnav_files
index 4cf134cc3c2c323405d769a5ced5d5a68f188203..585470d5f0847716863ba6129bf75c26631fecbd 100644
--- a/tensorflow/docs_src/mobile/leftnav_files
+++ b/tensorflow/docs_src/mobile/leftnav_files
@@ -1,6 +1,7 @@
 index.md
 ### TensorFlow Lite
 tflite/index.md
+tflite/devguide.md
 tflite/demo_android.md
 tflite/demo_ios.md
 >>>
diff --git a/tensorflow/docs_src/mobile/tflite/demo_android.md b/tensorflow/docs_src/mobile/tflite/demo_android.md
index c94b5597a673a7e68aed517b325b9719b3b73bbd..7f2f8882a24702d167599452e66afbe720026808 100644
--- a/tensorflow/docs_src/mobile/tflite/demo_android.md
+++ b/tensorflow/docs_src/mobile/tflite/demo_android.md
@@ -1,42 +1,144 @@
-# TensorFlow Lite Demo for Android
+# Android Demo App
 
-The TensorFlow Lite demo is a camera app that continuously classifies whatever
-it sees from your device's back camera, using a quantized MobileNet model.
+An example Android application using TensorFLow Lite is available
+[on GitHub](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/java/demo/app).
+The demo is a sample camera app that classifies images continuously
+using either a quantized Mobilenet model or a floating point Inception-v3 model.
+To run the demo, a device running Android 5.0 ( API 21) or higher is required.
 
-You'll need an Android device running Android 5.0 or higher to run the demo.
+In the demo app, inference is done using the TensorFlow Lite Java API. The demo
+app classifies frames in real-time, displaying the top most probable
+classifications. It also displays the time taken to detect the object.
 
-To get you started working with TensorFlow Lite on Android, we'll walk you
-through building and deploying our TensorFlow demo app in Android Studio.
+There are three ways to get the demo app to your device:
 
-Note: For a more detailed guide see the
-[TFLite Codelab](https://codelabs.developers.google.com/codelabs/tensorflow-for-poets-2-tflite/index.html#0)
+* Download the [prebuilt binary APK](http://download.tensorflow.org/deps/tflite/TfLiteCameraDemo.apk).
+* Use Android Studio to build the application.
+* Download the source code for TensorFlow Lite and the demo and build it using
+  bazel.
 
-It's also possible to build the demo app with Bazel, but we only recommend
-this for advanced users who are very familiar with the Bazel build
-environment. For more information on that, see our page [on Github](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite#building-tensorflow-lite-and-the-demo-app-from-source).
 
-## Build and deploy with Android Studio
+## Download the pre-built binary
 
-1. Clone the TensorFlow repository from GitHub if you haven't already:
+The easiest way to try the demo is to download the
+[pre-built binary APK](https://storage.googleapis.com/download.tensorflow.org/deps/tflite/TfLiteCameraDemo.apk)
 
-        git clone https://github.com/tensorflow/tensorflow
+Once the APK is installed, click the app icon to start the program. The first
+time the app is opened, it asks for runtime permissions to access the device
+camera. The demo app opens the back-camera of the device and recognizes objects
+in the camera's field of view. At the bottom of the image (or at the left
+of the image if the device is in landscape mode), it displays top three objects
+classified and the classification latency.
 
-2. Install the latest version of Android Studio from [here](https://developer.android.com/studio/index.html).
 
-3. From the **Welcome to Android Studio** screen, use the **Import Project
-   (Gradle, Eclipse ADT, etc)** option to import the
-   `tensorflow/contrib/lite/java/demo` directory as an existing Android Studio
-   Project.
+## Build in Android Studio with TensorFlow Lite AAR from JCenter
 
-    Android Studio may prompt you to install Gradle upgrades and other tool
-    versions; you should accept these upgrades.
+Use Android Studio to try out changes in the project code and compile the demo
+app:
 
-4. Download the TensorFlow Lite MobileNet model from [here](https://storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_v1_224_android_quant_2017_11_08.zip).
+* Install the latest version of
+  [Android Studio](https://developer.android.com/studio/index.html).
+* Make sure the Android SDK version is greater than 26 and NDK version is greater
+  than 14 (in the Android Studio settings).
+* Import the `tensorflow/contrib/lite/java/demo` directory as a new
+  Android Studio project.
+* Install all the Gradle extensions it requests.
 
-    Unzip this and copy the `mobilenet_quant_v1_224.tflite` file to the assets
-    directory: `tensorflow/contrib/lite/java/demo/app/src/main/assets/`
+To get a model, either:
 
-5. Build and run the app in Android Studio.
+* Download the quantized [Mobilenet TensorFlow Lite model](https://storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_v1_224_android_quant_2017_11_08.zip)
+  and unzip and copy `mobilenet_quant_v1_224.tflite` to the assets directory:
+  `tensorflow/contrib/lite/java/demo/app/src/main/assets/`.
+* Or, download the floating point [Inception-v3 model](https://storage.googleapis.com/download.tensorflow.org/models/tflite/inception_v3_slim_2016_android_2017_11_10.zip)
+  and unzip and copy `inceptionv3_non_slim_2015.tflite` to the assets
+  directory. Change the chosen classifier in
+  [Camera2BasicFragment.java](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/Camera2BasicFragment.java)<br>
+  from: `classifier = new ImageClassifierQuantizedMobileNet(getActivity());`<br>
+  to: `classifier = new ImageClassifierFloatInception(getActivity());`.
 
-You'll have to grant permissions for the app to use the device's camera. Point
-the camera at various objects and enjoy seeing how the model classifies things!
+Now you can build and run the demo app.
+
+
+## Build TensorFlow Lite and the demo app from source
+
+### Clone the TensorFlow repo
+
+```sh
+git clone https://github.com/tensorflow/tensorflow
+```
+
+### Install Bazel
+
+If `bazel` is not installed on your system, see
+[Installing Bazel](https://bazel.build/versions/master/docs/install.html).
+
+Note: Bazel does not currently support Android builds on Windows. Windows users
+should download the
+[prebuilt binary](https://storage.googleapis.com/download.tensorflow.org/deps/tflite/TfLiteCameraDemo.apk).
+
+### Install Android NDK and SDK
+
+The Android NDK is required to build the native (C/C++) TensorFlow Lite code. The
+current recommended version is *14b* and can be found on the
+[NDK Archives](https://developer.android.com/ndk/downloads/older_releases.html#ndk-14b-downloads)
+page.
+
+The Android SDK and build tools can be
+[downloaded separately](https://developer.android.com/tools/revisions/build-tools.html)
+or used as part of
+[Android Studio](https://developer.android.com/studio/index.html). To build the
+TensorFlow Lite Android demo, build tools require API >= 23 (but it will run on
+devices with API >= 21).
+
+In the root of the TensorFlow repository, update the `WORKSPACE` file with the
+`api_level` and location of the SDK and NDK. If you installed it with
+Android Studio, the SDK path can be found in the SDK manager. The default NDK
+path is:`{SDK path}/ndk-bundle.` For example:
+
+```
+android_sdk_repository (
+    name = "androidsdk",
+    api_level = 23,
+    build_tools_version = "23.0.2",
+    path = "/home/xxxx/android-sdk-linux/",
+)
+
+android_ndk_repository(
+    name = "androidndk",
+    path = "/home/xxxx/android-ndk-r10e/",
+    api_level = 19,
+)
+```
+
+Some additional details are available on the
+[TF Lite Android App page](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/java/demo/README.md).
+
+### Build the source code
+
+To build the demo app, run `bazel`:
+
+```
+bazel build --cxxopt=--std=c++11 //tensorflow/contrib/lite/java/demo/app/src/main:TfLiteCameraDemo
+```
+
+Caution: Because of an bazel bug, we only support building the Android demo app
+within a Python 2 environment.
+
+
+## About the demo
+
+The demo app is resizing each camera image frame (224 width * 224 height) to
+match the quantized MobileNets model (299 * 299 for Inception-v3). The resized
+image is converted—row by row—into a
+[ByteBuffer](https://developer.android.com/reference/java/nio/ByteBuffer.html).
+Its size is  1 * 224 * 224 * 3 bytes, where 1 is the number of images in a batch.
+224 * 224 (299 * 299) is the width and height of the image. 3 bytes represents
+the 3 colors of a pixel.
+
+This demo uses the TensorFlow Lite Java inference API
+for models which take a single input and provide a single output. This outputs a
+two-dimensional array, with the first dimension being the category index and the
+second dimension being the confidence of classification. Both models have 1001
+unique categories and the app sorts the probabilities of all the categories and
+displays the top three. The model file must be downloaded and bundled within the
+assets directory of the app.
diff --git a/tensorflow/docs_src/mobile/tflite/demo_ios.md b/tensorflow/docs_src/mobile/tflite/demo_ios.md
index 3ee9b1cbca6cfef98616bd33bbf91b756b4efa15..3be21da89f9e53d324c2ade0cb937f4b5b30fad4 100644
--- a/tensorflow/docs_src/mobile/tflite/demo_ios.md
+++ b/tensorflow/docs_src/mobile/tflite/demo_ios.md
@@ -1,4 +1,4 @@
-# TensorFlow Lite Demo for iOS
+# iOS Demo App
 
 The TensorFlow Lite demo is a camera app that continuously classifies whatever
 it sees from your device's back camera, using a quantized MobileNet model. These
diff --git a/tensorflow/docs_src/mobile/tflite/devguide.md b/tensorflow/docs_src/mobile/tflite/devguide.md
new file mode 100644
index 0000000000000000000000000000000000000000..4133bc172a1924f0ce8bb515d66fc03d716923c8
--- /dev/null
+++ b/tensorflow/docs_src/mobile/tflite/devguide.md
@@ -0,0 +1,231 @@
+# Developer Guide
+
+Using a TensorFlow Lite model in your mobile app requires multiple
+considerations: you must choose a pre-trained or custom model, convert the model
+to a TensorFLow Lite format, and finally, integrate the model in your app.
+
+## 1. Choose a model
+
+Depending on the use case, you can choose one of the popular open-sourced models,
+such as *InceptionV3* or *MobileNets*, and re-train these models with a custom
+data set or even build your own custom model.
+
+### Use a pre-trained model
+
+[MobileNets](https://research.googleblog.com/2017/06/mobilenets-open-source-models-for.html)
+is a family of mobile-first computer vision models for TensorFlow designed to
+effectively maximize accuracy, while taking into consideration the restricted
+resources for on-device or embedded applications. MobileNets are small,
+low-latency, low-power models parameterized to meet the resource constraints for
+a variety of uses. They can be used for classification, detection, embeddings, and
+segmentation—similar to other popular large scale models, such as
+[Inception](https://arxiv.org/pdf/1602.07261.pdf). Google provides 16 pre-trained
+[ImageNet](http://www.image-net.org/challenges/LSVRC/) classification checkpoints
+for MobileNets that can be used in mobile projects of all sizes.
+
+[Inception-v3](https://arxiv.org/abs/1512.00567) is an image recognition model
+that achieves fairly high accuracy recognizing general objects with 1000 classes,
+for example, "Zebra", "Dalmatian", and "Dishwasher". The model extracts general
+features from input images using a convolutional neural network and classifies
+them based on those features with fully-connected and softmax layers.
+
+[On Device Smart Reply](https://research.googleblog.com/2017/02/on-device-machine-intelligence.html)
+is an on-device model that provides one-touch replies for incoming text messages
+by suggesting contextually relevant messages. The model is built specifically for
+memory constrained devices, such as watches and phones, and has been successfully
+used in Smart Replies on Android Wear. Currently, this model is Android-specific.
+
+These pre-trained models are [available for download](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/lite/g3doc/models.md)
+
+### Re-train Inception-V3 or MobileNet for a custom data set
+
+These pre-trained models were trained on the *ImageNet* data set which contains
+1000 predefined classes. If these classes are not sufficient for your use case,
+the model will need to be re-trained. This technique is called
+*transfer learning* and starts with a model that has been already trained on a
+problem, then retrains the model on a similar problem. Deep learning from
+scratch can take days, but transfer learning is fairly quick. In order to do
+this, you need to generate a custom data set labeled with the relevant classes.
+
+The [TensorFlow for Poets](https://codelabs.developers.google.com/codelabs/tensorflow-for-poets/)
+codelab walks through the re-training process step-by-step. The code supports
+both floating point and quantized inference.
+
+### Train a custom model
+
+A developer may choose to train a custom model using Tensorflow (see the
+@{$tutorials} for examples of building and training models). If you have already
+written a model, the first step is to export this to a @{tf.GraphDef} file. This
+is required because some formats do not store the model structure outside the
+code, and we must communicate with other parts of the framework. See
+[Exporting the Inference Graph](https://github.com/tensorflow/models/blob/master/research/slim/README.md)
+to create .pb file for the custom model.
+
+TensorFlow Lite currently supports a subset of TensorFlow operators. Refer to the
+[TensorFlow Lite & TensorFlow Compatibility Guide](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/g3doc/tf_ops_compatibility.md)
+for supported operators and their usage. This set of operators will continue to
+grow in future Tensorflow Lite releases.
+
+
+## 2. Convert the model format
+
+The model generated (or downloaded) in the previous step is a *standard*
+Tensorflow model and you should now have a .pb or .pbtxt @{tf.GraphDef} file.
+Models generated with transfer learning (re-training) or custom models must be
+converted—but, we must first freeze the graph to convert the model to the
+Tensorflow Lite format. This process uses several model formats:
+
+* @{tf.GraphDef} (.pb) —A protobuf that represents the TensorFlow training or
+  computation graph. It contains operators, tensors, and variables definitions.
+* *CheckPoint* (.ckpt) —Serialized variables from a TensorFlow graph. Since this
+  does not contain a graph structure, it cannot be interpreted by itself.
+* `FrozenGraphDef` —A subclass of `GraphDef` that does not contain
+  variables. A `GraphDef` can be converted to a `FrozenGraphDef` by taking a
+  CheckPoint and a `GraphDef`, and converting each variable into a constant
+  using the value retrieved from the CheckPoint.
+* `SavedModel` —A `GraphDef` and CheckPoint with a signature that labels
+  input and output arguments to a model. A `GraphDef` and CheckPoint can be
+  extracted from a `SavedModel`.
+* *TensorFlow Lite model* (.tflite) —A serialized
+  [FlatBuffer](https://google.github.io/flatbuffers/) that contains TensorFlow
+  Lite operators and tensors for the TensorFlow Lite interpreter, similar to a
+  `FrozenGraphDef`.
+
+### Freeze Graph
+
+To use the `GraphDef` .pb file with TensorFlow Lite, you must have checkpoints
+that contain trained weight parameters. The .pb file only contains the structure
+of the graph. The process of merging the checkpoint values with the graph
+structure is called *freezing the graph*.
+
+You should have a checkpoints folder or download them for a pre-trained model
+(for example,
+[MobileNets](https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet_v1.md)).
+
+To freeze the graph, use the following command (changing the arguments):
+
+```
+freeze_graph --input_graph=/tmp/mobilenet_v1_224.pb \
+  --input_checkpoint=/tmp/checkpoints/mobilenet-10202.ckpt \
+  --input_binary=true \
+  --output_graph=/tmp/frozen_mobilenet_v1_224.pb \
+  --output_node_names=MobileNetV1/Predictions/Reshape_1
+```
+
+The `input_binary` flag must be enabled so the protobuf is read and written in
+a binary format. Set the `input_graph` and `input_checkpoint` files.
+
+The `output_node_names` may not be obvious outside of the code that built the
+model. The easiest way to find them is to visualize the graph, either with
+[TensorBoard](https://codelabs.developers.google.com/codelabs/tensorflow-for-poets-2/#3)
+or `graphviz`.
+
+The frozen `GraphDef` is now ready for conversion to the `FlatBuffer` format
+(.tflite) for use on Android or iOS devices. For Android, the Tensorflow
+Optimizing Converter tool supports both float and quantized models. To convert
+the frozen `GraphDef` to the .tflite format:
+
+```
+toco --input_file=$(pwd)/mobilenet_v1_1.0_224/frozen_graph.pb \
+  --input_format=TENSORFLOW_GRAPHDEF \
+  --output_format=TFLITE \
+  --output_file=/tmp/mobilenet_v1_1.0_224.tflite \
+  --inference_type=FLOAT \
+  --input_type=FLOAT \
+  --input_arrays=input \
+  --output_arrays=MobilenetV1/Predictions/Reshape_1 \
+  --input_shapes=1,224,224,3
+```
+
+The `input_file` argument should reference the frozen `GraphDef` file
+containing the model architecture. The [frozen_graph.pb](https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_1.0_224_frozen.tgz)
+file used here is available for download. `output_file` is where the TensorFlow
+Lite model will get generated. The `input_type` and `inference_type`
+arguments should be set to `FLOAT`, unless converting a
+@{$performance/quantization$quantized model}. Setting the `input_array`,
+`output_array`, and `input_shape` arguments are not as straightforward. The
+easiest way to find these values is to explore the graph using Tensorboard. Reuse
+the arguments for specifying the output nodes for inference in the
+`freeze_graph` step.
+
+It is also possible to use the Tensorflow Optimizing Converter with protobufs
+from either Python or from the command line (see the 
+[toco_from_protos.py](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/toco/python/toco_from_protos.py)
+example). This allows you to integrate the conversion step into the model design
+workflow, ensuring the model is easily convertible to a mobile inference graph.
+For example:
+
+```python
+import tensorflow as tf
+
+img = tf.placeholder(name="img", dtype=tf.float32, shape=(1, 64, 64, 3))
+val = img + tf.constant([1., 2., 3.]) + tf.constant([1., 4., 4.])
+out = tf.identity(val, name="out")
+
+with tf.Session() as sess:
+  tflite_model = tf.contrib.lite.toco_convert(sess.graph_def, [img], [out])
+  open("converteds_model.tflite", "wb").write(tflite_model)
+```
+
+For usage, see the Tensorflow Optimizing Converter
+[command-line examples](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/toco/g3doc/cmdline_examples.md).
+
+Refer to the
+[Ops compatibility guide](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/g3doc/tf_ops_compatibility.md)
+for troubleshooting help, and if that doesn't help, please
+[file an issue](https://github.com/tensorflow/tensorflow/issues).
+
+The [development repo](https://github.com/tensorflow/tensorflow) contains a tool
+to visualize TensorFlow Lite models after conversion. To build the
+[visualize.py](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/lite/tools/visualize.py)
+tool:
+
+```sh
+bazel run tensorflow/contrib/lite/tools:visualize -- model.tflite model_viz.html
+```
+
+This generates an interactive HTML page listing subgraphs, operations, and a
+graph visualization.
+
+
+## 3. Use the TensorFlow Lite model for inference in a mobile app
+
+After completing the prior steps, you should now have a `.tflite` model file.
+
+### Android
+
+Since Android apps are written in Java and the core TensorFlow library is in C++,
+a JNI library is provided as an interface. This is only meant for inference—it
+provides the ability to load a graph, set up inputs, and run the model to
+calculate outputs.
+
+The open source Android demo app uses the JNI interface and is available
+[on GitHub](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/java/demo/app).
+You can also download a
+[prebuilt APK](http://download.tensorflow.org/deps/tflite/TfLiteCameraDemo.apk).
+See the @{$tflite/demo_android} guide for details.
+
+The @{$mobile/android_build} guide has instructions for installing TensorFlow on
+Android and setting up `bazel` and Android Studio.
+
+### iOS
+
+To integrate a TensorFlow model in an iOS app, see the
+[TensorFlow Lite for iOS](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/g3doc/ios.md)
+guide and @{$tflite/demo_ios} guide.
+
+#### Core ML support
+
+Core ML is a machine learning framework used in Apple products. In addition to
+using Tensorflow Lite models directly in your applications, you can convert
+trained Tensorflow models to the
+[CoreML](https://developer.apple.com/machine-learning/) format for use on Apple
+devices. To use the converter, refer to the
+[Tensorflow-CoreML converter documentation](https://github.com/tf-coreml/tf-coreml).
+
+### Raspberry Pi
+
+Compile Tensorflow Lite for a Raspberry Pi by following the
+[RPi build instructions](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/lite/g3doc/rpi.md)
+This compiles a static library file (`.a`) used to build your app. There are
+plans for Python bindings and a demo app.
diff --git a/tensorflow/docs_src/mobile/tflite/index.md b/tensorflow/docs_src/mobile/tflite/index.md
index beb24794fc98724e2423e02a71028f79be45cf75..11f11ea4dc54b9f152f2560384cb47cec6b308c0 100644
--- a/tensorflow/docs_src/mobile/tflite/index.md
+++ b/tensorflow/docs_src/mobile/tflite/index.md
@@ -155,7 +155,9 @@ retraining for both floating point and quantized inference.
 
 The following diagram shows the architectural design of TensorFlow Lite:
 
-![tensorflow lite architecture](https://www.tensorflow.org/images/tflite-architecture.jpg)
+<img src="/images/tflite-architecture.jpg"
+     alt="TensorFlow Lite architecture diagram"
+     style="max-width:600px;">
 
 Starting with a trained TensorFlow model on disk, you'll convert that model to
 the TensorFlow Lite file format (`.tflite`) using the TensorFlow Lite
diff --git a/tensorflow/docs_src/performance/leftnav_files b/tensorflow/docs_src/performance/leftnav_files
index d11a7e5d07c3e6cfa092e7ac11189ce6c272c1ad..1f894c39fe4554261cd37ebc8cd48af6b36eef43 100644
--- a/tensorflow/docs_src/performance/leftnav_files
+++ b/tensorflow/docs_src/performance/leftnav_files
@@ -1,3 +1,4 @@
+index.md
 performance_guide.md
 datasets_performance.md
 performance_models.md
diff --git a/tensorflow/docs_src/performance/performance_guide.md b/tensorflow/docs_src/performance/performance_guide.md
index 580a899ac4e4f5c3d97ce023f25083168fe00d01..b1796cf9b2d0bf7459e70ab542b6e6fcb203667a 100644
--- a/tensorflow/docs_src/performance/performance_guide.md
+++ b/tensorflow/docs_src/performance/performance_guide.md
@@ -475,7 +475,7 @@ optimizations.
 ### TensorFlow with Intel® MKL DNN
 
 Intel® has added optimizations to TensorFlow for Intel® Xeon® and Intel® Xeon
-Phi™ though the use of Intel® Math Kernel Library for Deep Neural Networks
+Phi™ through the use of the Intel® Math Kernel Library for Deep Neural Networks
 (Intel® MKL-DNN) optimized primitives. The optimizations also provide speedups
 for the consumer line of processors, e.g. i5 and i7 Intel processors. The Intel
 published paper
@@ -581,9 +581,9 @@ Each variable that impacts performance is discussed below.
     for optimal settings.
 
 *   **intra_op_parallelism_threads**: Setting this equal to the number of
-    physical cores is recommended. Setting the value to 0, which is the default
-    and will result in the value being set to the number of logical cores, is an
-    option to try for some architectures.  This value and `OMP_NUM_THREADS`
+    physical cores is recommended. Setting the value to 0, which is the default,
+    results in the value being set to the number of logical cores - this is an
+    alternate option to try for some architectures.  This value and `OMP_NUM_THREADS`
     should be equal.
 
 *   **inter_op_parallelism_threads**: Setting this equal to the number of
diff --git a/tensorflow/docs_src/performance/xla/operation_semantics.md b/tensorflow/docs_src/performance/xla/operation_semantics.md
index 5e39e710a0dba74dfd68a04367ce402362520590..217ab596b72bc263ae5dda377a8faab8a39b0a3c 100644
--- a/tensorflow/docs_src/performance/xla/operation_semantics.md
+++ b/tensorflow/docs_src/performance/xla/operation_semantics.md
@@ -241,13 +241,10 @@ See also
 
 Clamps an operand to within the range between a minimum and maximum value.
 
-<b> `Clamp(computation, args...)` </b>
+<b> `Clamp(min, operand, max)` </b>
 
 | Arguments     | Type                    | Semantics                        |
 | ------------- | ----------------------- | -------------------------------- |
-| `computation` | `Computation`           | computation of type `T_0, T_1,   |
-:               :                         : ..., T_N -> S` with N parameters :
-:               :                         : of arbitrary type                :
 | `min`         | `ComputationDataHandle` | array of type T                  |
 | `operand`     | `ComputationDataHandle` | array of type T                  |
 | `max`         | `ComputationDataHandle` | array of type T                  |
@@ -791,9 +788,7 @@ DynamicSlice extracts a sub-array from the input array at dynamic
 dimension: [start, start + size). The shape of `start_indices` must be rank ==
 1, with dimension size equal to the rank of `operand`.
 Note: handling of out-of-bounds slice indices (generated by incorrect runtime
-calculation of 'start_indices') is currently implementation-defined. Currently,
-slice indices are computed modulo input dimension sizes to prevent out-of-bound
-array accesses, but this behavior may change in future implementations.
+calculation of 'start_indices') is currently implementation-defined.
 
 <b> `DynamicSlice(operand, start_indices, size_indices)` </b>
 
@@ -850,9 +845,7 @@ is updated.
 The shape of `start_indices` must be rank == 1, with dimension size equal to
 the rank of `operand`.
 Note: handling of out-of-bounds slice indices (generated by incorrect runtime
-calculation of 'start_indices') is currently implementation-defined. Currently,
-slice indices are computed modulo update dimension sizes to prevent out-of-bound
-array accesses, but this behavior may change in future implementations.
+calculation of 'start_indices') is currently implementation-defined.
 
 <b> `DynamicUpdateSlice(operand, update, start_indices)` </b>
 
diff --git a/tensorflow/docs_src/programmers_guide/debugger.md b/tensorflow/docs_src/programmers_guide/debugger.md
index d1cd7e7c06e525abd9fadf24d5e706780bb316fc..f5a0eb0a2000a5c35f7e3641e6552d40629305a6 100644
--- a/tensorflow/docs_src/programmers_guide/debugger.md
+++ b/tensorflow/docs_src/programmers_guide/debugger.md
@@ -4,29 +4,28 @@
 
 [TOC]
 
-TensorFlow debugger (**tfdbg**) is a specialized debugger for TensorFlow. It
-lets you view the internal structure and states of running TensorFlow graphs
-during training and inference, which is difficult to debug with general-purpose
-debuggers such as Python's `pdb` due to TensorFlow's computation-graph paradigm.
-
-> NOTE: TensorFlow debugger uses a
-> [curses](https://en.wikipedia.org/wiki/Curses_\(programming_library\))-based
-> text user interface. On Mac OS X, the `ncurses` library is required and can
-> be installed with `brew install homebrew/dupes/ncurses`. On Windows, curses
-> isn't as well supported, so a
-> [readline](https://en.wikipedia.org/wiki/GNU_Readline)-based interface can
-> be used with tfdbg by installing `pyreadline` with pip.
-> If you use Anaconda3, you can install it with a command
-> such as `"C:\Program Files\Anaconda3\Scripts\pip.exe" install pyreadline`.
-> Unofficial Windows curses packages can be downloaded
-> [here](https://www.lfd.uci.edu/~gohlke/pythonlibs/#curses), then subsequently
-> installed using `pip install <your_version>.whl`, however curses on Windows
-> may not work as reliably as curses on Linux or Mac.
-
-> NOTE: This guide focuses on the command-line interface (CLI) of tfdbg. For
-> guide on how to use the graphical user interface (GUI) of tfdbg, i.e., the
-> **TensorBoard Debugger Plugin**, please visit
-> [its README](https://github.com/tensorflow/tensorboard/blob/master/tensorboard/plugins/debugger/README.md).
+`tfdbg` is a specialized debugger for TensorFlow. It lets you view the internal
+structure and states of running TensorFlow graphs during training and inference,
+which is difficult to debug with general-purpose debuggers such as Python's `pdb`
+due to TensorFlow's computation-graph paradigm.
+
+This guide focuses on the command-line interface (CLI) of `tfdbg`. For guide on
+how to use the graphical user interface (GUI) of tfdbg, i.e., the
+**TensorBoard Debugger Plugin**, please visit
+[its README](https://github.com/tensorflow/tensorboard/blob/master/tensorboard/plugins/debugger/README.md).
+
+Note: The TensorFlow debugger uses a
+[curses](https://en.wikipedia.org/wiki/Curses_\(programming_library\))-based text
+user interface. On Mac OS X, the `ncurses` library is required and can be
+installed with `brew install homebrew/dupes/ncurses`. On Windows, curses isn't as
+well supported, so a [readline](https://en.wikipedia.org/wiki/GNU_Readline)-based
+interface can be used with tfdbg by installing `pyreadline` with `pip`. If you
+use Anaconda3, you can install it with a command such as
+`"C:\Program Files\Anaconda3\Scripts\pip.exe" install pyreadline`. Unofficial
+Windows curses packages can be downloaded
+[here](https://www.lfd.uci.edu/~gohlke/pythonlibs/#curses), then subsequently
+installed using `pip install <your_version>.whl`, however curses on Windows may
+not work as reliably as curses on Linux or Mac.
 
 This tutorial demonstrates how to use the **tfdbg** CLI to debug the appearance
 of [`nan`s](https://en.wikipedia.org/wiki/NaN)
@@ -748,16 +747,16 @@ There are three possible workarounds or solutions:
    to which tfdbg dumps the debug data. You can use it to let tfdbg dump the
    debug data on a disk with larger free space. For example:
 
-   ``` python
-   # For LocalCLIDebugWrapperSession
-   sess = tf_debug.LocalCLIDebugWrapperSession(dump_root="/with/lots/of/space")
-
-   # For LocalCLIDebugHook
-   hooks = [tf_debug.LocalCLIDebugHook(dump_root="/with/lots/of/space")]
-   ```
+```python
+# For LocalCLIDebugWrapperSession
+sess = tf_debug.LocalCLIDebugWrapperSession(dump_root="/with/lots/of/space")
 
+# For LocalCLIDebugHook
+hooks = [tf_debug.LocalCLIDebugHook(dump_root="/with/lots/of/space")]
+```
    Make sure that the directory pointed to by dump_root is empty or nonexistent.
-   tfdbg cleans up the dump directories before exiting.
+   `tfdbg` cleans up the dump directories before exiting.
+
 *  Reduce the batch size used during the runs.
 *  Use the filtering options of tfdbg's `run` command to watch only specific
    nodes in the graph. For example:
diff --git a/tensorflow/docs_src/programmers_guide/eager.md b/tensorflow/docs_src/programmers_guide/eager.md
new file mode 100644
index 0000000000000000000000000000000000000000..595e6be4af78d7d684ddeca0adea59e5a754134d
--- /dev/null
+++ b/tensorflow/docs_src/programmers_guide/eager.md
@@ -0,0 +1,848 @@
+# Eager Execution
+
+TensorFlow's eager execution is an imperative programming environment that
+evaluates operations immediately, without building graphs: operations return
+concrete values instead of constructing a computational graph to run later. This
+makes it easy to get started with TensorFlow and debug models, and it
+reduces boilerplate as well. To follow along with this guide, run the code
+samples below in an interactive `python` interpreter.
+
+Eager execution is a flexible machine learning platform for research and
+experimentation, providing:
+
+* *An intuitive interface*—Structure your code naturally and use Python data
+  structures. Quickly iterate on small models and small data.
+* *Easier debugging*—Call ops directly to inspect running models and test
+  changes. Use standard Python debugging tools for immediate error reporting.
+* *Natural control flow*—Use Python control flow instead of graph control
+  flow, simplifying the specification of dynamic models.
+
+Eager execution supports most TensorFlow operations and GPU acceleration. For a
+collection of examples running in eager execution, see:
+[tensorflow/contrib/eager/python/examples](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/eager/python/examples).
+
+Note: Some models may experience increased overhead with eager execution
+enabled. Performance improvements are ongoing, but please
+[file a bug](https://github.com/tensorflow/tensorflow/issues) if you find a
+problem and share your benchmarks.
+
+## Setup and basic usage
+
+Upgrade to the latest version of TensorFlow:
+
+```
+$ pip install --upgrade tensorflow
+```
+
+To start eager execution, add `tf.enable_eager_execution()` to the beginning of
+the program or console session. Do not add this operation to other modules that
+the program calls.
+
+```py
+from __future__ import absolute_import, division, print_function
+
+import tensorflow as tf
+
+tf.enable_eager_execution()
+```
+
+Now you can run TensorFlow operations and the results will return immediately:
+
+```py
+tf.executing_eagerly()        # => True
+
+x = [[2.]]
+m = tf.matmul(x, x)
+print("hello, {}".format(m))  # => "hello, [[4.]]"
+```
+
+Enabling eager execution changes how TensorFlow operations behave—now they
+immediately evaluate and return their values to Python. `tf.Tensor` objects
+reference concrete values instead of symbolic handles to nodes in a computational
+graph. Since there isn't a computational graph to build and run later in a
+session, it's easy to inspect results using `print()` or a debugger. Evaluating,
+printing, and checking tensor values does not break the flow for computing
+gradients.
+
+Eager execution works nicely with [NumPy](http://www.numpy.org/). NumPy
+operations accept `tf.Tensor` arguments. TensorFlow
+[math operations](https://www.tensorflow.org/api_guides/python/math_ops) convert
+Python objects and NumPy arrays to `tf.Tensor` objects. The
+`tf.Tensor.numpy` method returns the object's value as a NumPy `ndarray`.
+
+```py
+a = tf.constant([[1, 2],
+                 [3, 4]])
+print(a)
+# => tf.Tensor([[1 2]
+#               [3 4]], shape=(2, 2), dtype=int32)
+
+# Broadcasting support
+b = tf.add(a, 1)
+print(b)
+# => tf.Tensor([[2 3]
+#               [4 5]], shape=(2, 2), dtype=int32)
+
+# Operator overloading is supported
+print(a * b)
+# => tf.Tensor([[ 2  6]
+#               [12 20]], shape=(2, 2), dtype=int32)
+
+# Use NumPy values
+import numpy as np
+
+c = np.multiply(a, b)
+print(c)
+# => [[ 2  6]
+#     [12 20]]
+
+# Obtain numpy value from a tensor:
+print(a.numpy())
+# => [[1 2]
+#     [3 4]]
+```
+
+The `tf.contrib.eager` module contains symbols available to both eager and graph execution
+environments and is useful for writing code to [work with graphs](#work_with_graphs):
+
+```py
+tfe = tf.contrib.eager
+```
+
+## Dynamic control flow
+
+A major benefit of eager execution is that all the functionality of the host
+language is available while your model is executing. So, for example,
+it is easy to write [fizzbuzz](https://en.wikipedia.org/wiki/Fizz_buzz):
+
+```py
+def fizzbuzz(max_num):
+  counter = tf.constant(0)
+  for num in range(max_num):
+    num = tf.constant(num)
+    if num % 3 == 0 and num % 5 == 0:
+      print('FizzBuzz')
+    elif num % 3 == 0:
+      print('Fizz')
+    elif num % 5 == 0:
+      print('Buzz')
+    else:
+      print(num)
+    counter += 1
+  return counter
+```
+
+This has conditionals that depend on tensor values and it prints these values
+at runtime.
+
+## Build a model
+
+Many machine learning models are represented by composing layers. When
+using TensorFlow with eager execution you can either write your own layers or
+use a layer provided in the `tf.keras.layers` package.
+
+While you can use any Python object to represent a layer,
+TensorFlow has `tf.keras.layers.Layer` as a convenient base class. Inherit from
+it to implement your own layer:
+
+```py
+class MySimpleLayer(tf.keras.layers.Layer):
+  def __init__(self, output_units):
+    self.output_units = output_units
+
+  def build(self, input):
+    # The build method gets called the first time your layer is used.
+    # Creating variables on build() allows you to make their shape depend
+    # on the input shape and hence remove the need for the user to specify
+    # full shapes. It is possible to create variables during __init__() if
+    # you already know their full shapes.
+    self.kernel = self.add_variable(
+      "kernel", [input.shape[-1], self.output_units])
+
+  def call(self, input):
+    # Override call() instead of __call__ so we can perform some bookkeeping.
+    return tf.matmul(input, self.kernel)
+```
+
+Use `tf.keras.layers.Dense` layer instead  of `MySimpleLayer` above as it has
+a superset of its functionality (it can also add a bias).
+
+When composing layers into models you can use `tf.keras.Sequential` to represent
+models which are a linear stack of layers. It is easy to use for basic models:
+
+```py
+model = tf.keras.Sequential([
+  tf.keras.layers.Dense(10, input_shape=(784,)),  # must declare input shape
+  tf.keras.layers.Dense(10)
+])
+```
+
+Alternatively, organize models in classes by inheriting from `tf.keras.Model`.
+This is a container for layers that is a layer itself, allowing `tf.keras.Model`
+objects to contain other `tf.keras.Model` objects.
+
+```py
+class MNISTModel(tf.keras.Model):
+  def __init__(self):
+    super(MNISTModel, self).__init__()
+    self.dense1 = tf.keras.layers.Dense(units=10)
+    self.dense2 = tf.keras.layers.Dense(units=10)
+
+  def call(self, input):
+    """Run the model."""
+    result = self.dense1(input)
+    result = self.dense2(result)
+    result = self.dense2(result)  # reuse variables from dense2 layer
+    return result
+
+model = MNISTModel()
+```
+
+It's not required to set an input shape for the `tf.keras.Model` class since
+the parameters are set the first time input is passed to the layer.
+
+`tf.keras.layers` classes create and contain their own model variables that
+are tied to the lifetime of their layer objects. To share layer variables, share
+their objects.
+
+
+## Eager training
+
+### Computing gradients
+
+[Automatic differentiation](https://en.wikipedia.org/wiki/Automatic_differentiation)
+is useful for implementing machine learning algorithms such as
+[backpropagation](https://en.wikipedia.org/wiki/Backpropagation) for training
+neural networks. During eager execution, use `tf.GradientTape` to trace
+operations for computing gradients later.
+
+`tf.GradientTape` is an opt-in feature to provide maximal performance when
+not tracing. Since different operations can occur during each call, all
+forward-pass operations get recorded to a "tape". To compute the gradient, play
+the tape backwards and then discard. A particular `tf.GradientTape` can only
+compute one gradient; subsequent calls throw a runtime error.
+
+```py
+w = tfe.Variable([[1.0]])
+with tf.GradientTape() as tape:
+  loss = w * w
+
+grad = tape.gradient(loss, [w])
+print(grad)  # => [tf.Tensor([[ 2.]], shape=(1, 1), dtype=float32)]
+```
+
+Here's an example of `tf.GradientTape` that records forward-pass operations
+to train a simple model:
+
+```py
+# A toy dataset of points around 3 * x + 2
+NUM_EXAMPLES = 1000
+training_inputs = tf.random_normal([NUM_EXAMPLES])
+noise = tf.random_normal([NUM_EXAMPLES])
+training_outputs = training_inputs * 3 + 2 + noise
+
+def prediction(input, weight, bias):
+  return input * weight + bias
+
+# A loss function using mean-squared error
+def loss(weights, biases):
+  error = prediction(training_inputs, weights, biases) - training_outputs
+  return tf.reduce_mean(tf.square(error))
+
+# Return the derivative of loss with respect to weight and bias
+def grad(weights, biases):
+  with tf.GradientTape() as tape:
+    loss_value = loss(weights, biases)
+  return tape.gradient(loss_value, [weights, biases])
+
+train_steps = 200
+learning_rate = 0.01
+# Start with arbitrary values for W and B on the same batch of data
+W = tfe.Variable(5.)
+B = tfe.Variable(10.)
+
+print("Initial loss: {:.3f}".format(loss(W, B)))
+
+for i in range(train_steps):
+  dW, dB = grad(W, B)
+  W.assign_sub(dW * learning_rate)
+  B.assign_sub(dB * learning_rate)
+  if i % 20 == 0:
+    print("Loss at step {:03d}: {:.3f}".format(i, loss(W, B)))
+
+print("Final loss: {:.3f}".format(loss(W, B)))
+print("W = {}, B = {}".format(W.numpy(), B.numpy()))
+```
+
+Output (exact numbers may vary):
+
+```
+Initial loss: 71.204
+Loss at step 000: 68.333
+Loss at step 020: 30.222
+Loss at step 040: 13.691
+Loss at step 060: 6.508
+Loss at step 080: 3.382
+Loss at step 100: 2.018
+Loss at step 120: 1.422
+Loss at step 140: 1.161
+Loss at step 160: 1.046
+Loss at step 180: 0.996
+Final loss: 0.974
+W = 3.01582956314, B = 2.1191945076
+```
+
+Replay the `tf.GradientTape` to compute the gradients and apply them in a
+training loop. This is demonstrated in an excerpt from the
+[mnist_eager.py](https://github.com/tensorflow/models/blob/master/official/mnist/mnist_eager.py)
+example:
+
+```py
+dataset = tf.data.Dataset.from_tensor_slices((data.train.images,
+                                              data.train.labels))
+...
+for (batch, (images, labels)) in enumerate(dataset):
+  ...
+  with tf.GradientTape() as tape:
+    logits = model(images, training=True)
+    loss_value = loss(logits, labels)
+  ...
+  grads = tape.gradient(loss_value, model.variables)
+  optimizer.apply_gradients(zip(grads, model.variables),
+                            global_step=tf.train.get_or_create_global_step())
+```
+
+
+The following example creates a multi-layer model that classifies the standard
+[MNIST handwritten digits](https://www.tensorflow.org/tutorials/layers). It
+demonstrates the optimizer and layer APIs to build trainable graphs in an eager
+execution environment.
+
+### Train a model
+
+Even without training, call the model and inspect the output in eager execution:
+
+```py
+# Create a tensor representing a blank image
+batch = tf.zeros([1, 1, 784])
+print(batch.shape)  # => (1, 1, 784)
+
+result = model(batch)
+# => tf.Tensor([[[ 0.  0., ..., 0.]]], shape=(1, 1, 10), dtype=float32)
+```
+
+This example uses the
+[dataset.py module](https://github.com/tensorflow/models/blob/master/official/mnist/dataset.py)
+from the
+[TensorFlow MNIST example](https://github.com/tensorflow/models/tree/master/official/mnist);
+download this file to your local directory. Run the following to download the
+MNIST data files to your working directory and prepare a `tf.data.Dataset`
+for training:
+
+```py
+import dataset  # download dataset.py file
+dataset_train = dataset.train('./datasets').shuffle(60000).repeat(4).batch(32)
+```
+
+To train a model, define a loss function to optimize and then calculate
+gradients. Use an optimizer to update the variables:
+
+```py
+def loss(model, x, y):
+  prediction = model(x)
+  return tf.losses.sparse_softmax_cross_entropy(labels=y, logits=prediction)
+
+def grad(model, inputs, targets):
+  with tf.GradientTape() as tape:
+    loss_value = loss(model, inputs, targets)
+  return tape.gradient(loss_value, model.variables)
+
+optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.001)
+
+x, y = iter(dataset_train).next()
+print("Initial loss: {:.3f}".format(loss(model, x, y)))
+
+# Training loop
+for (i, (x, y)) in enumerate(dataset_train):
+  # Calculate derivatives of the input function with respect to its parameters.
+  grads = grad(model, x, y)
+  # Apply the gradient to the model
+  optimizer.apply_gradients(zip(grads, model.variables),
+                            global_step=tf.train.get_or_create_global_step())
+  if i % 200 == 0:
+    print("Loss at step {:04d}: {:.3f}".format(i, loss(model, x, y)))
+
+print("Final loss: {:.3f}".format(loss(model, x, y)))
+```
+
+Output (exact numbers may vary):
+
+```
+Initial loss: 2.674
+Loss at step 0000: 2.593
+Loss at step 0200: 2.143
+Loss at step 0400: 2.009
+Loss at step 0600: 2.103
+Loss at step 0800: 1.621
+Loss at step 1000: 1.695
+...
+Loss at step 6600: 0.602
+Loss at step 6800: 0.557
+Loss at step 7000: 0.499
+Loss at step 7200: 0.744
+Loss at step 7400: 0.681
+Final loss: 0.670
+```
+
+And for faster training, move the computation to a GPU:
+
+```py
+with tf.device("/gpu:0"):
+  for (i, (x, y)) in enumerate(dataset_train):
+    # minimize() is equivalent to the grad() and apply_gradients() calls.
+    optimizer.minimize(lambda: loss(model, x, y),
+                       global_step=tf.train.get_or_create_global_step())
+```
+
+### Variables and optimizers
+
+`tfe.Variable` objects store mutable `tf.Tensor` values accessed during
+training to make automatic differentiation easier. The parameters of a model can
+be encapsulated in classes as variables.
+
+Better encapsulate model parameters by using `tfe.Variable` with
+`tf.GradientTape`. For example, the automatic differentiation example above
+can be rewritten:
+
+```py
+class Model(tf.keras.Model):
+  def __init__(self):
+    super(Model, self).__init__()
+    self.W = tfe.Variable(5., name='weight')
+    self.B = tfe.Variable(10., name='bias')
+  def predict(self, inputs):
+    return inputs * self.W + self.B
+
+# A toy dataset of points around 3 * x + 2
+NUM_EXAMPLES = 2000
+training_inputs = tf.random_normal([NUM_EXAMPLES])
+noise = tf.random_normal([NUM_EXAMPLES])
+training_outputs = training_inputs * 3 + 2 + noise
+
+# The loss function to be optimized
+def loss(model, inputs, targets):
+  error = model.predict(inputs) - targets
+  return tf.reduce_mean(tf.square(error))
+
+def grad(model, inputs, targets):
+  with tf.GradientTape() as tape:
+    loss_value = loss(model, inputs, targets)
+  return tape.gradient(loss_value, [model.W, model.B])
+
+# Define:
+# 1. A model.
+# 2. Derivatives of a loss function with respect to model parameters.
+# 3. A strategy for updating the variables based on the derivatives.
+model = Model()
+optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.01)
+
+print("Initial loss: {:.3f}".format(loss(model, training_inputs, training_outputs)))
+
+# Training loop
+for i in range(300):
+  grads = grad(model, training_inputs, training_outputs)
+  optimizer.apply_gradients(zip(grads, [model.W, model.B]),
+                            global_step=tf.train.get_or_create_global_step())
+  if i % 20 == 0:
+    print("Loss at step {:03d}: {:.3f}".format(i, loss(model, training_inputs, training_outputs)))
+
+print("Final loss: {:.3f}".format(loss(model, training_inputs, training_outputs)))
+print("W = {}, B = {}".format(model.W.numpy(), model.B.numpy()))
+```
+
+Output (exact numbers may vary):
+
+```
+Initial loss: 69.066
+Loss at step 000: 66.368
+Loss at step 020: 30.107
+Loss at step 040: 13.959
+Loss at step 060: 6.769
+Loss at step 080: 3.567
+Loss at step 100: 2.141
+Loss at step 120: 1.506
+Loss at step 140: 1.223
+Loss at step 160: 1.097
+Loss at step 180: 1.041
+Loss at step 200: 1.016
+Loss at step 220: 1.005
+Loss at step 240: 1.000
+Loss at step 260: 0.998
+Loss at step 280: 0.997
+Final loss: 0.996
+W = 2.99431324005, B = 2.02129220963
+```
+
+## Use objects for state during eager execution
+
+With graph execution, program state (such as the variables) is stored in global
+collections and their lifetime is managed by the `tf.Session` object. In
+contrast, during eager execution the lifetime of state objects is determined by
+the lifetime of their corresponding Python object.
+
+### Variables are objects
+
+During eager execution, variables persist until the last reference to the object
+is removed, and is then deleted.
+
+```py
+with tf.device("gpu:0"):
+  v = tfe.Variable(tf.random_normal([1000, 1000]))
+  v = None  # v no longer takes up GPU memory
+```
+
+### Object-based saving
+
+`tfe.Checkpoint` can save and restore `tfe.Variable`s to and from
+checkpoints:
+
+```py
+x = tfe.Variable(10.)
+
+checkpoint = tfe.Checkpoint(x=x)  # save as "x"
+
+x.assign(2.)   # Assign a new value to the variables and save.
+save_path = checkpoint.save('./ckpt/')
+
+x.assign(11.)  # Change the variable after saving.
+
+# Restore values from the checkpoint
+checkpoint.restore(save_path)
+
+print(x)  # => 2.0
+```
+
+To save and load models, `tfe.Checkpoint` stores the internal state of objects,
+without requiring hidden variables. To record the state of a `model`,
+an `optimizer`, and a global step, pass them to a `tfe.Checkpoint`:
+
+```py
+model = MyModel()
+optimizer = tf.train.AdamOptimizer(learning_rate=0.001)
+checkpoint_dir = ‘/path/to/model_dir’
+checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
+root = tfe.Checkpoint(optimizer=optimizer,
+                      model=model,
+                      optimizer_step=tf.train.get_or_create_global_step())
+
+root.save(file_prefix=checkpoint_prefix)
+# or
+root.restore(tf.train.latest_checkpoint(checkpoint_dir))
+```
+
+### Object-oriented metrics
+
+`tfe.metrics` are stored as objects. Update a metric by passing the new data to
+the callable, and retrieve the result using the `tfe.metrics.result` method,
+for example:
+
+```py
+m = tfe.metrics.Mean("loss")
+m(0)
+m(5)
+m.result()  # => 2.5
+m([8, 9])
+m.result()  # => 5.5
+```
+
+#### Summaries and TensorBoard
+
+@{$summaries_and_tensorboard$TensorBoard} is a visualization tool for
+understanding, debugging and optimizing the model training process. It uses
+summary events that are written while executing the program.
+
+`tf.contrib.summary` is compatible with both eager and graph execution
+environments. Summary operations, such as `tf.contrib.summary.scalar`, are
+inserted during model construction. For example, to record summaries once every
+100 global steps:
+
+```py
+writer = tf.contrib.summary.create_file_writer(logdir)
+global_step=tf.train.get_or_create_global_step()  # return global step var
+
+writer.set_as_default()
+
+for _ in range(iterations):
+  global_step.assign_add(1)
+  # Must include a record_summaries method
+  with tf.contrib.summary.record_summaries_every_n_global_steps(100):
+    # your model code goes here
+    tf.contrib.summary.scalar('loss', loss)
+     ...
+```
+
+## Advanced automatic differentiation topics
+
+### Dynamic models
+
+`tf.GradientTape` can also be used in dynamic models. This example for a
+[backtracking line search](https://wikipedia.org/wiki/Backtracking_line_search)
+algorithm looks like normal NumPy code, except there are gradients and is
+differentiable, despite the complex control flow:
+
+```py
+def line_search_step(fn, init_x, rate=1.0):
+  with tf.GradientTape() as tape:
+    # Variables are automatically recorded, but manually watch a tensor
+    tape.watch(init_x)
+    value = fn(init_x)
+  grad, = tape.gradient(value, [init_x])
+  grad_norm = tf.reduce_sum(grad * grad)
+  init_value = value
+  while value > init_value - rate * grad_norm:
+    x = init_x - rate * grad
+    value = fn(x)
+    rate /= 2.0
+  return x, value
+```
+
+### Additional functions to compute gradients
+
+`tf.GradientTape` is a powerful interface for computing gradients, but there
+is another [Autograd](https://github.com/HIPS/autograd)-style API available for
+automatic differentiation. These functions are useful if writing math code with
+only tensors and gradient functions, and without `tfe.Variables`:
+
+* `tfe.gradients_function` —Returns a function that computes the derivatives
+  of its input function parameter with respect to its arguments. The input
+  function parameter must return a scalar value. When the returned function is
+  invoked, it returns a list of `tf.Tensor` objects: one element for each
+  argument of the input function. Since anything of interest must be passed as a
+  function parameter, this becomes unwieldy if there's a dependency on many
+  trainable parameters.
+* `tfe.value_and_gradients_function` —Similar to
+  `tfe.gradients_function`, but when the returned function is invoked, it
+  returns the value from the input function in addition to the list of
+  derivatives of the input function with respect to its arguments.
+
+In the following example, `tfe.gradients_function` takes the `square`
+function as an argument and returns a function that computes the partial
+derivatives of `square` with respect to its inputs. To calculate the derivative
+of `square` at `3`, `grad(3.0)` returns `6`.
+
+```py
+def square(x):
+  return tf.multiply(x, x)
+
+grad = tfe.gradients_function(square)
+
+square(3.)  # => 9.0
+grad(3.)    # => [6.0]
+
+# The second-order derivative of square:
+gradgrad = tfe.gradients_function(lambda x: grad(x)[0])
+gradgrad(3.)  # => [2.0]
+
+# The third-order derivative is None:
+gradgradgrad = tfe.gradients_function(lambda x: gradgrad(x)[0])
+gradgradgrad(3.)  # => [None]
+
+
+# With flow control:
+def abs(x):
+  return x if x > 0. else -x
+
+grad = tfe.gradients_function(abs)
+
+grad(3.)   # => [1.0]
+grad(-3.)  # => [-1.0]
+```
+
+### Custom gradients
+
+Custom gradients are an easy way to override gradients in eager and graph
+execution. Within the forward function, define the gradient with respect to the
+inputs, outputs, or intermediate results. For example, here's an easy way to clip
+the norm of the gradients in the backward pass:
+
+```py
+@tf.custom_gradient
+def clip_gradient_by_norm(x, norm):
+  y = tf.identity(x)
+  def grad_fn(dresult):
+    return [tf.clip_by_norm(dresult, norm), None]
+  return y, grad_fn
+```
+
+Custom gradients are commonly used to provide a numerically stable gradient for a
+sequence of operations:
+
+```py
+def log1pexp(x):
+  return tf.log(1 + tf.exp(x))
+grad_log1pexp = tfe.gradients_function(log1pexp)
+
+# The gradient computation works fine at x = 0.
+grad_log1pexp(0.)  # => [0.5]
+
+# However, x = 100 fails because of numerical instability.
+grad_log1pexp(100.)  # => [nan]
+```
+
+Here, the `log1pexp` function can be analytically simplified with a custom
+gradient. The implementation below reuses the value for `tf.exp(x)` that is
+computed during the forward pass—making it more efficient by eliminating
+redundant calculations:
+
+```py
+@tf.custom_gradient
+def log1pexp(x):
+  e = tf.exp(x)
+  def grad(dy):
+    return dy * (1 - 1 / (1 + e))
+  return tf.log(1 + e), grad
+
+grad_log1pexp = tfe.gradients_function(log1pexp)
+
+# As before, the gradient computation works fine at x = 0.
+grad_log1pexp(0.)  # => [0.5]
+
+# And the gradient computation also works at x = 100.
+grad_log1pexp(100.)  # => [1.0]
+```
+
+## Performance
+
+Computation is automatically offloaded to GPUs during eager execution. If you
+want control over where a computation runs you can enclose it in a
+`tf.device('/gpu:0')` block (or the CPU equivalent):
+
+```py
+import time
+
+def measure(x, steps):
+  # TensorFlow initializes a GPU the first time it's used, exclude from timing.
+  tf.matmul(x, x)
+  start = time.time()
+  for i in range(steps):
+    x = tf.matmul(x, x)
+    _ = x.numpy()  # Make sure to execute op and not just enqueue it
+  end = time.time()
+  return end - start
+
+shape = (1000, 1000)
+steps = 200
+print("Time to multiply a {} matrix by itself {} times:".format(shape, steps))
+
+# Run on CPU:
+with tf.device("/cpu:0"):
+  print("CPU: {} secs".format(measure(tf.random_normal(shape), steps)))
+
+# Run on GPU, if available:
+if tfe.num_gpus() > 0:
+  with tf.device("/gpu:0"):
+    print("GPU: {} secs".format(measure(tf.random_normal(shape), steps)))
+else:
+  print("GPU: not found")
+```
+
+Output (exact numbers depend on hardware):
+
+```
+Time to multiply a (1000, 1000) matrix by itself 200 times:
+CPU: 4.614904403686523 secs
+GPU: 0.5581181049346924 secs
+```
+
+A `tf.Tensor` object can be copied to a different device to execute its
+operations:
+
+```py
+x = tf.random_normal([10, 10])
+
+x_gpu0 = x.gpu()
+x_cpu = x.cpu()
+
+_ = tf.matmul(x_cpu, x_cpu)    # Runs on CPU
+_ = tf.matmul(x_gpu0, x_gpu0)  # Runs on GPU:0
+
+if tfe.num_gpus() > 1:
+  x_gpu1 = x.gpu(1)
+  _ = tf.matmul(x_gpu1, x_gpu1)  # Runs on GPU:1
+```
+
+### Benchmarks
+
+For compute-heavy models, such as
+[ResNet50](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/eager/python/examples/resnet50)
+training on a GPU, eager execution performance is comparable to graph execution.
+But this gap grows larger for models with less computation and there is work to
+be done for optimizing hot code paths for models with lots of small operations.
+
+
+## Work with graphs
+
+While eager execution makes development and debugging more interactive,
+TensorFlow graph execution has advantages for distributed training, performance
+optimizations, and production deployment. However, writing graph code can feel
+different than writing regular Python code and more difficult to debug.
+
+For building and training graph-constructed models, the Python program first
+builds a graph representing the computation, then invokes `Session.run` to send
+the graph for execution on the C++-based runtime.  This provides:
+
+* Automatic differentiation using static autodiff.
+* Simple deployment to a platform independent server.
+* Graph-based optimizations (common subexpression elimination, constant-folding, etc.).
+* Compilation and kernel fusion.
+* Automatic distribution and replication (placing nodes on the distributed system).
+
+Deploying code written for eager execution is more difficult: either generate a
+graph from the model, or run the Python runtime and code directly on the server.
+
+### Write compatible code
+
+The same code written for eager execution will also build a graph during graph
+execution. Do this by simply running the same code in a new Python session where
+eager execution is not enabled.
+
+Most TensorFlow operations work during eager execution, but there are some things
+to keep in mind:
+
+* Use `tf.data` for input processing instead of queues. It's faster and easier.
+* Use object-oriented layer APIs—like `tf.keras.layers` and
+  `tf.keras.Model`—since they have explicit storage for variables.
+* Most model code works the same during eager and graph execution, but there are
+  exceptions. (For example, dynamic models using Python control flow to change the
+  computation based on inputs.)
+* Once eager execution is enabled with `tf.enable_eager_execution`, it
+  cannot be turned off. Start a new Python session to return to graph execution.
+
+It's best to write code for both eager execution *and* graph execution. This
+gives you eager's interactive experimentation and debuggability with the
+distributed performance benefits of graph execution.
+
+Write, debug, and iterate in eager execution, then import the model graph for
+production deployment. Use `tfe.Checkpoint` to save and restore model
+variables, this allows movement between eager and graph execution environments.
+See the examples in:
+[tensorflow/contrib/eager/python/examples](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/eager/python/examples).
+
+### Use eager execution in a graph environment
+
+Selectively enable eager execution in a TensorFlow graph environment using
+`tfe.py_func`. This is used when `tf.enable_eager_execution()` has *not*
+been called.
+
+```py
+def my_py_func(x):
+  x = tf.matmul(x, x)  # You can use tf ops
+  print(x)  # but it's eager!
+  return x
+
+with tf.Session() as sess:
+  x = tf.placeholder(dtype=tf.float32)
+  # Call eager function in graph!
+  pf = tfe.py_func(my_py_func, [x], tf.float32)
+  sess.run(pf, feed_dict={x: [[2.0]]})  # [[4.0]]
+```
diff --git a/tensorflow/docs_src/programmers_guide/faq.md b/tensorflow/docs_src/programmers_guide/faq.md
index 392ac6f7f12532c3efce5bec1917691f55c7bee5..51c1a1e032baae7eff334da785fc5ffa2438e0ca 100644
--- a/tensorflow/docs_src/programmers_guide/faq.md
+++ b/tensorflow/docs_src/programmers_guide/faq.md
@@ -121,7 +121,7 @@ dimensions:
   devices, which makes it possible to speed up
   @{$deep_cnn$CIFAR-10 training using multiple GPUs}.
 * The Session API allows multiple concurrent steps (i.e. calls to
-  @{tf.Session.run} in parallel. This
+  @{tf.Session.run} in parallel). This
   enables the runtime to get higher throughput, if a single step does not use
   all of the resources in your computer.
 
diff --git a/tensorflow/docs_src/programmers_guide/graphs.md b/tensorflow/docs_src/programmers_guide/graphs.md
index e69b717432e6a8fab0085eb419dcbc0991cd9d28..aa72cae766c3641a2d447032b7dcea58b53ac173 100644
--- a/tensorflow/docs_src/programmers_guide/graphs.md
+++ b/tensorflow/docs_src/programmers_guide/graphs.md
@@ -96,7 +96,7 @@ to all API functions in the same context.  For example:
   (See @{$programmers_guide/variables} for more information about variables.)
 
 * Calling @{tf.train.Optimizer.minimize} will add operations and tensors to the
-  default graph that calculate gradients, and return a @{tf.Operation} that,
+  default graph that calculates gradients, and return a @{tf.Operation} that,
   when run, will apply those gradients to a set of variables.
 
 Most programs rely solely on the default graph. However,
diff --git a/tensorflow/docs_src/programmers_guide/index.md b/tensorflow/docs_src/programmers_guide/index.md
index e8c2fa6990c8ecfca1cfe76b3f813b4ae6917742..648d001bd3535fe3dcc460c9ebdb6e6a997dc332 100644
--- a/tensorflow/docs_src/programmers_guide/index.md
+++ b/tensorflow/docs_src/programmers_guide/index.md
@@ -5,6 +5,7 @@ works. The units are as follows:
 
 ## High Level APIs
 
+  * @{$programmers_guide/eager}, which is the easiest way to use TensorFlow.
   * @{$programmers_guide/estimators}, which introduces a high-level
     TensorFlow API that greatly simplifies ML programming.
   * @{$programmers_guide/datasets}, which explains how to
diff --git a/tensorflow/docs_src/programmers_guide/leftnav_files b/tensorflow/docs_src/programmers_guide/leftnav_files
index 3fe4cb2ddaee40d9d6c6470bee171dedb27ad890..7ac63bf2e019fc3b6aa7ab1b3e6422a97858d8c6 100644
--- a/tensorflow/docs_src/programmers_guide/leftnav_files
+++ b/tensorflow/docs_src/programmers_guide/leftnav_files
@@ -1,8 +1,9 @@
 index.md
 
 ### High Level APIs
-estimators.md
+eager.md
 datasets.md
+estimators.md
 
 ### Low Level APIs
 low_level_intro.md
diff --git a/tensorflow/docs_src/programmers_guide/using_tpu.md b/tensorflow/docs_src/programmers_guide/using_tpu.md
index a9c2cb3e33d4817b9a35400dcce9227ddd635ff4..5e3e49d43402cd76f8b7062483259df4598bd8ff 100644
--- a/tensorflow/docs_src/programmers_guide/using_tpu.md
+++ b/tensorflow/docs_src/programmers_guide/using_tpu.md
@@ -11,7 +11,7 @@ This doc is aimed at users who:
   using an existing model.
 * Have, perhaps, skimmed the code of an example TPU model
   [[1]](https://github.com/tensorflow/models/blob/master/official/mnist/mnist_tpu.py)
-  [[2]](https://github.com/tensorflow/tpu-demos/tree/master/cloud_tpu/models).
+  [[2]](https://github.com/tensorflow/tpu/tree/master/models).
 * Are interested in porting an existing `Estimator` model to
   run on Cloud TPUs
 
@@ -280,15 +280,15 @@ Where `params['batch-size']` will contain the batch size.
 ### Static shapes and batch size
 
 The input pipeline generated by your `input_fn` is run on CPU. So it is mostly
-free strict static shape requirements imposed by the XLA/TPU environment. The
-one requirement is that the batches of data fed from your input pipeline to
+free from the strict static shape requirements imposed by the XLA/TPU environment.
+The one requirement is that the batches of data fed from your input pipeline to
 the TPU have a static shape, as determined by the standard TensorFlow shape
 inference algorithm. Intermediate tensors are free to have a dynamic shapes.
 If shape inference has failed, but the shape is known it is possible to
 impose the correct shape using `tf.set_shape()`. 
 
 In the example below the shape
-inference algorithm fails, but it is corrected using `set_shape`:
+inference algorithm fails, but it is correctly using `set_shape`:
 
 ```
 >>> x = tf.zeros(tf.constant([1,2,3])+1)
@@ -371,10 +371,10 @@ in bytes. A minimum of a few MB (`buffer_size=8*1024*1024`) is recommended so
 that data is available when needed.
 
 The TPU-demos repo includes
-[a script](https://github.com/tensorflow/tpu-demos/blob/master/cloud_tpu/datasets/imagenet_to_gcs.py)
+[a script](https://github.com/tensorflow/tpu/blob/master/tools/datasets/imagenet_to_gcs.py)
 for downloading the imagenet dataset and converting it to an appropriate format.
 This together with the imagenet
-[models](https://github.com/tensorflow/tpu-demos/tree/master/cloud_tpu/models)
+[models](https://github.com/tensorflow/tpu/tree/master/models)
 included in the repo demonstrate all of these best-practices.
 
 
@@ -387,7 +387,7 @@ For details on how to actually set up and run a Cloud TPU see:
 This document is by no means exhaustive. The best source of more detail on how
 to make a Cloud TPU compatible model are the example models published in:
 
- * The [TPU Demos Repository.](https://github.com/tensorflow/tpu-demos/)
+ * The [TPU Demos Repository.](https://github.com/tensorflow/tpu)
 
 For more information about tuning TensorFlow code for performance see:
 
diff --git a/tensorflow/docs_src/tutorials/image_retraining.md b/tensorflow/docs_src/tutorials/image_retraining.md
index 93d7c86e42aa90d145d27b56edc0abfec7034686..27784eef9cdb5c6f8b9af44b3fc3f876cda39d13 100644
--- a/tensorflow/docs_src/tutorials/image_retraining.md
+++ b/tensorflow/docs_src/tutorials/image_retraining.md
@@ -1,404 +1,4 @@
 # How to Retrain Inception's Final Layer for New Categories
 
-Modern object recognition models have millions of parameters and can take weeks
-to fully train. Transfer learning is a technique that shortcuts a lot of this
-work by taking a fully-trained model for a set of categories like ImageNet, and
-retrains from the existing weights for new classes. In this example we'll be
-retraining the final layer from scratch, while leaving all the others untouched.
-For more information on the approach you can see
-[this paper on Decaf](https://arxiv.org/pdf/1310.1531v1.pdf).
-
-Though it's not as good as a full training run, this is surprisingly effective
-for many applications, and can be run in as little as thirty minutes on a
-laptop, without requiring a GPU. This tutorial will show you how to run the
-example script on your own images, and will explain some of the options you have
-to help control the training process.
-
-Note: A version of this tutorial is also available
-[as a codelab](https://codelabs.developers.google.com/codelabs/tensorflow-for-poets/#0).
-
-Before you start, you must @{$install$install tensorflow}.
-
-[TOC]
-
-## Training on Flowers
-
-![Daisies by Kelly Sikkema](https://www.tensorflow.org/images/daisies.jpg)
-
-[Image by Kelly Sikkema](https://www.flickr.com/photos/95072945@N05/9922116524/)
-
-Before you start any training, you'll need a set of images to teach the network
-about the new classes you want to recognize. There's a later section that
-explains how to prepare your own images, but to make it easy we've created an
-archive of creative-commons licensed flower photos to use initially. To get the
-set of flower photos, run these commands:
-
-```sh
-cd ~
-curl -O http://download.tensorflow.org/example_images/flower_photos.tgz
-tar xzf flower_photos.tgz
-```
-
-Once you have the images, you can clone the tensorflow repository using the
-following command (these examples are not included in the installation):
-
-```sh
-git clone https://github.com/tensorflow/tensorflow
-```
-
-Then checkout the version of the tensorflow repository matching your
-installation and this tutorial as follows:
-
-``` sh
-cd tensorflow
-git checkout {version}
-```
-
-In the simplest cases the retrainer can then be run like this:
-
-```sh
-python tensorflow/examples/image_retraining/retrain.py --image_dir ~/flower_photos
-```
-
-The script has many other options. You can get a full listing with:
-
-```sh
-python tensorflow/examples/image_retraining/retrain.py -h
-```
-
-This script loads the pre-trained Inception v3 model, removes the old top layer,
-and trains a new one on the flower photos you've downloaded. None of the flower
-species were in the original ImageNet classes the full network was trained on.
-The magic of transfer learning is that lower layers that have been trained to
-distinguish between some objects can be reused for many recognition tasks
-without any alteration.
-
-## Bottlenecks
-
-The script can take thirty minutes or more to complete, depending on the speed
-of your machine. The first phase analyzes all the images on disk and calculates
-the bottleneck values for each of them. 'Bottleneck' is an informal term we
-often use for the layer just before the final output layer that actually does
-the classification. This penultimate layer has been trained to output a set of
-values that's good enough for the classifier to use to distinguish between all
-the classes it's been asked to recognize. That means it has to be a meaningful
-and compact summary of the images, since it has to contain enough information
-for the classifier to make a good choice in a very small set of values. The
-reason our final layer retraining can work on new classes is that it turns out
-the kind of information needed to distinguish between all the 1,000 classes in
-ImageNet is often also useful to distinguish between new kinds of objects.
-
-Because every image is reused multiple times during training and calculating
-each bottleneck takes a significant amount of time, it speeds things up to
-cache these bottleneck values on disk so they don't have to be repeatedly
-recalculated. By default they're stored in the `/tmp/bottleneck` directory, and
-if you rerun the script they'll be reused so you don't have to wait for this
-part again.
-
-## Training
-
-Once the bottlenecks are complete, the actual training of the top layer of the
-network begins. You'll see a series of step outputs, each one showing training
-accuracy, validation accuracy, and the cross entropy. The training accuracy
-shows what percent of the images used in the current training batch were
-labeled with the correct class. The validation accuracy is the precision on a
-randomly-selected group of images from a different set. The key difference is
-that the training accuracy is based on images that the network has been able
-to learn from so the network can overfit to the noise in the training data. A
-true measure of the performance of the network is to measure its performance on
-a data set not contained in the training data -- this is measured by the
-validation accuracy. If the train accuracy is high but the validation accuracy
-remains low, that means the network is overfitting and memorizing particular
-features in the training images that aren't helpful more generally. Cross
-entropy is a loss function which gives a glimpse into how well the learning
-process is progressing. The training's objective is to make the loss as small as
-possible, so you can tell if the learning is working by keeping an eye on
-whether the loss keeps trending downwards, ignoring the short-term noise.
-
-By default this script will run 4,000 training steps. Each step chooses 100
-images at random from the training set, finds their bottlenecks from the cache,
-and feeds them into the final layer to get predictions. Those predictions are
-then compared against the actual labels to update the final layer's weights
-through the back-propagation process. As the process continues you should see
-the reported accuracy improve, and after all the steps are done, a final test
-accuracy evaluation is run on a set of images kept separate from the training
-and validation pictures. This test evaluation is the best estimate of how the
-trained model will perform on the classification task. You should see an
-accuracy value of between 90% and 95%, though the exact value will vary from run
-to run since there's randomness in the training process. This number is based on
-the percent of the images in the test set that are given the correct label
-after the model is fully trained.
-
-## Visualizing the Retraining with TensorBoard
-
-The script includes TensorBoard summaries that make it easier to understand, debug, and optimize the retraining. For example, you can visualize the graph and statistics, such as how the weights or accuracy varied during training.
-
-To launch TensorBoard, run this command during or after retraining:
-
-```sh
-tensorboard --logdir /tmp/retrain_logs
-```
-
-Once TensorBoard is running, navigate your web browser to `localhost:6006` to view the TensorBoard.
-
-The script will log TensorBoard summaries to `/tmp/retrain_logs` by default. You can change the directory with the `--summaries_dir` flag.
-
-The [TensorBoard's GitHub](https://github.com/tensorflow/tensorboard) has a lot more information on TensorBoard usage, including tips & tricks, and debugging information.
-
-## Using the Retrained Model
-
-The script will write out a version of the Inception v3 network with a final
-layer retrained to your categories to /tmp/output_graph.pb, and a text file
-containing the labels to /tmp/output_labels.txt. These are both in a format that
-the @{$image_recognition$C++ and Python image classification examples}
-can read in, so you can start using your new model immediately. Since you've
-replaced the top layer, you will need to specify the new name in the script, for
-example with the flag `--output_layer=final_result` if you're using label_image.
-
-Here's an example of how to run the label_image example with your
-retrained graphs:
-
-```sh
-python tensorflow/examples/label_image/label_image.py \
---graph=/tmp/output_graph.pb --labels=/tmp/output_labels.txt \
---input_layer=Mul \
---output_layer=final_result \
---input_mean=128 --input_std=128 \
---image=$HOME/flower_photos/daisy/21652746_cc379e0eea_m.jpg
-```
-
-You should see a list of flower labels, in most cases with daisy on top
-(though each retrained model may be slightly different). You can replace the
-`--image` parameter with your own images to try those out.
-
-If you'd like to use the retrained model in your own Python program, then the
-above
-[`label_image` script](https://www.tensorflow.org/code/tensorflow/examples/label_image/label_image.py)
-is a reasonable starting point. The `label_image`
-directory also contains C++ code which you can use as a template to integrate
-tensorflow with your own applications.
-
-If you find the default Inception v3 model is too large or slow for your
-application, take a look at the [Other Model Architectures section](/tutorials/image_retraining#other_model_architectures)
-below for options to speed up and slim down your network.
-
-## Training on Your Own Categories
-
-If you've managed to get the script working on the flower example images, you
-can start looking at teaching it to recognize categories you care about instead.
-In theory all you'll need to do is point it at a set of sub-folders, each named
-after one of your categories and containing only images from that category. If
-you do that and pass the root folder of the subdirectories as the argument to
-`--image_dir`, the script should train just like it did for the flowers.
-
-Here's what the folder structure of the flowers archive looks like, to give you
-and example of the kind of layout the script is looking for:
-
-![Folder Structure](https://www.tensorflow.org/images/folder_structure.png)
-
-In practice it may take some work to get the accuracy you want. I'll try to
-guide you through some of the common problems you might encounter below.
-
-## Creating a Set of Training Images
-
-The first place to start is by looking at the images you've gathered, since the
-most common issues we see with training come from the data that's being fed in.
-
-For training to work well, you should gather at least a hundred photos of each
-kind of object you want to recognize. The more you can gather, the better the
-accuracy of your trained model is likely to be. You also need to make sure that
-the photos are a good representation of what your application will actually
-encounter. For example, if you take all your photos indoors against a blank wall
-and your users are trying to recognize objects outdoors, you probably won't see
-good results when you deploy.
-
-Another pitfall to avoid is that the learning process will pick up on anything
-that the labeled images have in common with each other, and if you're not
-careful that might be something that's not useful. For example if you photograph
-one kind of object in a blue room, and another in a green one, then the model
-will end up basing its prediction on the background color, not the features of
-the object you actually care about. To avoid this, try to take pictures in as
-wide a variety of situations as you can, at different times, and with different
-devices. If you want to know more about this problem, you can read about the
-classic (and possibly apocryphal)
-[tank recognition problem](https://www.jefftk.com/p/detecting-tanks).
-
-You may also want to think about the categories you use. It might be worth
-splitting big categories that cover a lot of different physical forms into
-smaller ones that are more visually distinct. For example instead of 'vehicle'
-you might use 'car', 'motorbike', and 'truck'. It's also worth thinking about
-whether you have a 'closed world' or an 'open world' problem. In a closed world,
-the only things you'll ever be asked to categorize are the classes of object you
-know about. This might apply to a plant recognition app where you know the user
-is likely to be taking a picture of a flower, so all you have to do is decide
-which species. By contrast a roaming robot might see all sorts of different
-things through its camera as it wanders around the world. In that case you'd
-want the classifier to report if it wasn't sure what it was seeing. This can be
-hard to do well, but often if you collect a large number of typical 'background'
-photos with no relevant objects in them, you can add them to an extra 'unknown'
-class in your image folders.
-
-It's also worth checking to make sure that all of your images are labeled
-correctly. Often user-generated tags are unreliable for our purposes, for
-example using #daisy for pictures of a person named Daisy. If you go through
-your images and weed out any mistakes it can do wonders for your overall
-accuracy.
-
-## Training Steps
-
-If you're happy with your images, you can take a look at improving your results
-by altering the details of the learning process. The simplest one to try is
-`--how_many_training_steps`. This defaults to 4,000, but if you increase it to
-8,000 it will train for twice as long. The rate of improvement in the accuracy
-slows the longer you train for, and at some point will stop altogether, but you
-can experiment to see when you hit that limit for your model.
-
-## Distortions
-
-A common way of improving the results of image training is by deforming,
-cropping, or brightening the training inputs in random ways. This has the
-advantage of expanding the effective size of the training data thanks to all the
-possible variations of the same images, and tends to help the network learn to
-cope with all the distortions that will occur in real-life uses of the
-classifier. The biggest disadvantage of enabling these distortions in our script
-is that the bottleneck caching is no longer useful, since input images are never
-reused exactly. This means the training process takes a lot longer, so I
-recommend trying this as a way of fine-tuning your model once you've got one
-that you're reasonably happy with.
-
-You enable these distortions by passing `--random_crop`, `--random_scale` and
-`--random_brightness` to the script. These are all percentage values that
-control how much of each of the distortions is applied to each image. It's
-reasonable to start with values of 5 or 10 for each of them and then experiment
-to see which of them help with your application. `--flip_left_right` will
-randomly mirror half of the images horizontally, which makes sense as long as
-those inversions are likely to happen in your application. For example it
-wouldn't be a good idea if you were trying to recognize letters, since flipping
-them destroys their meaning.
-
-## Hyper-parameters
-
-There are several other parameters you can try adjusting to see if they help
-your results. The `--learning_rate` controls the magnitude of the updates to the
-final layer during training. Intuitively if this is smaller then the learning
-will take longer, but it can end up helping the overall precision. That's not
-always the case though, so you need to experiment carefully to see what works
-for your case. The `--train_batch_size` controls how many images are examined
-during one training step, and because the learning rate is applied per batch
-you'll need to reduce it if you have larger batches to get the same overall
-effect.
-
-## Training, Validation, and Testing Sets
-
-One of the things the script does under the hood when you point it at a folder
-of images is divide them up into three different sets. The largest is usually
-the training set, which are all the images fed into the network during training,
-with the results used to update the model's weights. You might wonder why we
-don't use all the images for training? A big potential problem when we're doing
-machine learning is that our model may just be memorizing irrelevant details of
-the training images to come up with the right answers. For example, you could
-imagine a network remembering a pattern in the background of each photo it was
-shown, and using that to match labels with objects. It could produce good
-results on all the images it's seen before during training, but then fail on new
-images because it's not learned general characteristics of the objects, just
-memorized unimportant details of the training images.
-
-This problem is known as overfitting, and to avoid it we keep some of our data
-out of the training process, so that the model can't memorize them. We then use
-those images as a check to make sure that overfitting isn't occurring, since if
-we see good accuracy on them it's a good sign the network isn't overfitting. The
-usual split is to put 80% of the images into the main training set, keep 10%
-aside to run as validation frequently during training, and then have a final 10%
-that are used less often as a testing set to predict the real-world performance
-of the classifier. These ratios can be controlled using the
-`--testing_percentage` and `--validation_percentage` flags. In general
-you should be able to leave these values at their defaults, since you won't
-usually find any advantage to training to adjusting them.
-
-Note that the script uses the image filenames (rather than a completely random
-function) to divide the images among the training, validation, and test sets.
-This is done to ensure that images don't get moved between training and testing
-sets on different runs, since that could be a problem if images that had been
-used for training a model were subsequently used in a validation set.
-
-You might notice that the validation accuracy fluctuates among iterations. Much
-of this fluctuation arises from the fact that a random subset of the validation
-set is chosen for each validation accuracy measurement. The fluctuations can be
-greatly reduced, at the cost of some increase in training time, by choosing
-`--validation_batch_size=-1`, which uses the entire validation set for each
-accuracy computation.
-
-Once training is complete, you may find it insightful to examine misclassified
-images in the test set. This can be done by adding the flag
-`--print_misclassified_test_images`. This may help you get a feeling for which
-types of images were most confusing for the model, and which categories were
-most difficult to distinguish. For instance, you might discover that some
-subtype of a particular category, or some unusual photo angle, is particularly
-difficult to identify, which may encourage you to add more training images of
-that subtype. Oftentimes, examining misclassified images can also point to
-errors in the input data set, such as mislabeled, low-quality, or ambiguous
-images. However, one should generally avoid point-fixing individual errors in
-the test set, since they are likely to merely reflect more general problems in
-the (much larger) training set.
-
-## Other Model Architectures
-
-By default the script uses a pretrained version of the Inception v3 model
-architecture. This is a good place to start because it provides high accuracy
-results, but if you intend to deploy your model on mobile devices or other
-resource-constrained environments you may want to trade off a little accuracy
-for much smaller file sizes or faster speeds. To help with that, the
-[retrain.py script](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/examples/image_retraining/retrain.py)
-supports different variations on the [Mobilenet architecture](https://research.googleblog.com/2017/06/mobilenets-open-source-models-for.html).
-
-These are a little less precise than Inception v3, but can result in far
-smaller file sizes (a few megabytes) and can be many times faster
-to run. To train with one of these models, pass in the `--architecture` flag,
-for example:
-
-```
-python tensorflow/examples/image_retraining/retrain.py \
-    --image_dir ~/flower_photos --architecture mobilenet_0.25_128
-```
-
-This will create a 1.9MB model file in `/tmp/output_graph.pb`, with only 25% of
-the number of neurons of the full Mobilenet, and trained to take 128x128 sized
-input images.
-
-You can choose '1.0', '0.75', '0.50', or '0.25' to control the number of
-neurons (activations of hidden layers); the number of weights (and hence to
-some extent the file size and speed) shrinks like the square of that fraction.
-You can choose '224', '192', '160', or '128' for the input image size,
-with smaller sizes giving faster speeds.
-
-The speed and size advantages come at a loss to accuracy of course, but for many
-purposes this isn't critical. They can also be somewhat offset with improved
-training data. For example, training with distortions allows me to get above 80%
-accuracy on the flower data set even with the 0.25/128 graph above.
-
-If you're going to be using the Mobilenet models in label_image or your own
-programs, you'll need to feed in an image of the specified size converted to a
-float range into the 'input' tensor. Typically 24-bit images are in the range
-[0,255], and you must convert them to the [-1,1] float range expected by the
-model with the formula  `(image - 128.)/128.`.
-
-The default arguments for the `label_image` script are set for Inception V3.
-To use it with a MobileNet, specify the above normalization parameters as
-`input_mean` and `input_std` on the command line. You also must specify the
-image size that your model expects, as follows:
-
-```sh
-python tensorflow/examples/label_image/label_image.py \
---graph=/tmp/output_graph.pb --labels=/tmp/output_labels.txt \
---input_layer=input \
---output_layer=final_result \
---input_height=224 --input_width=224 \
---input_mean=128 --input_std=128 \
---image=$HOME/flower_photos/daisy/21652746_cc379e0eea_m.jpg
-```
-
-For more information on deploying the retrained model to a mobile device, see
-the [codelab version](https://codelabs.developers.google.com/codelabs/tensorflow-for-poets/#0)
-of this tutorial, especially [part 2](https://codelabs.developers.google.com/codelabs/tensorflow-for-poets-2-tflite/#0), which describes
-[TensorFlow Lite](/mobile/tflite/) and the additional optimizations it offers
-(including quantization of model weights).
+**NOTE: This tutorial has moved to**
+https://github.com/tensorflow/hub/tree/master/docs/tutorials/image_retraining.md
diff --git a/tensorflow/examples/adding_an_op/BUILD b/tensorflow/examples/adding_an_op/BUILD
index b3ed6589ed062dc2331b7dc64184a2b39062271e..cf8054be6a3e89a307a10fdb711a62ac3a46d410 100644
--- a/tensorflow/examples/adding_an_op/BUILD
+++ b/tensorflow/examples/adding_an_op/BUILD
@@ -139,15 +139,3 @@ tf_cc_binary(
         "//tensorflow/core:framework",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/examples/android/BUILD b/tensorflow/examples/android/BUILD
index 12146477972a116903f731a03b9755aafd92acc1..aa594a63c6ad5ab7129e452e7a6345114b994231 100644
--- a/tensorflow/examples/android/BUILD
+++ b/tensorflow/examples/android/BUILD
@@ -76,7 +76,6 @@ android_binary(
     custom_package = "org.tensorflow.demo",
     inline_constants = 1,
     manifest = "AndroidManifest.xml",
-    manifest_merger = "legacy",
     resource_files = glob(["res/**"]),
     tags = [
         "manual",
@@ -100,22 +99,6 @@ filegroup(
 )
 # LINT.ThenChange(//tensorflow/examples/android/download-models.gradle)
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-            "bin/**",
-            "gen/**",
-            "gradleBuild/**",
-            "libs/**",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 filegroup(
     name = "java_files",
     srcs = glob(["src/**/*.java"]),
diff --git a/tensorflow/examples/benchmark/BUILD b/tensorflow/examples/benchmark/BUILD
index c4bb0a5bd952ea175a4fd2444a3d632dc13445de..98611a9aadf6f456dd4f9fe4f423e3e2ce9722ec 100644
--- a/tensorflow/examples/benchmark/BUILD
+++ b/tensorflow/examples/benchmark/BUILD
@@ -23,9 +23,3 @@ tf_py_logged_benchmark(
     name = "sample_logged_benchmark",
     target = "//tensorflow/examples/benchmark:sample_benchmark",
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(["**/*"]),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/examples/get_started/regression/BUILD b/tensorflow/examples/get_started/regression/BUILD
index 577b970c9063dfa9a2acdb7d18362aa8adba827f..bee94d7d90fb3f70107a5dd9e9223f3013402073 100644
--- a/tensorflow/examples/get_started/regression/BUILD
+++ b/tensorflow/examples/get_started/regression/BUILD
@@ -2,18 +2,6 @@ licenses(["notice"])  # Apache 2.0
 
 exports_files(["LICENSE"])
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 py_test(
     name = "test",
     size = "medium",
diff --git a/tensorflow/examples/how_tos/reading_data/BUILD b/tensorflow/examples/how_tos/reading_data/BUILD
index 4a43585d5395b1df94dd8a8767f92f131cfcaea4..64a054d3712035252666ca84e676add3d079e52a 100644
--- a/tensorflow/examples/how_tos/reading_data/BUILD
+++ b/tensorflow/examples/how_tos/reading_data/BUILD
@@ -54,15 +54,3 @@ py_binary(
         "//tensorflow/examples/tutorials/mnist:input_data",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/examples/image_retraining/BUILD b/tensorflow/examples/image_retraining/BUILD
index 9f9244a74c4d073cc67b7c8252b0bcff86e9400f..ecd79a3b004d0ca9f50d2a6f140dbc353efe30cb 100644
--- a/tensorflow/examples/image_retraining/BUILD
+++ b/tensorflow/examples/image_retraining/BUILD
@@ -49,15 +49,3 @@ py_test(
         "//third_party/py/numpy",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/examples/image_retraining/retrain.py b/tensorflow/examples/image_retraining/retrain.py
index 99a71206acbd533ec8bc5a9644435eacad564cd4..fcc191250fe8c9d80e788b6d345b041c7ea22f2f 100644
--- a/tensorflow/examples/image_retraining/retrain.py
+++ b/tensorflow/examples/image_retraining/retrain.py
@@ -870,15 +870,16 @@ def run_final_eval(sess, model_info, class_count, image_lists, jpeg_data_tensor,
     resized_image_tensor: The input node of the recognition graph.
     bottleneck_tensor: The bottleneck output layer of the CNN graph.
   """
-  (sess, bottleneck_input, ground_truth_input, evaluation_step,
-   prediction) = build_eval_session(model_info, class_count)
-
   test_bottlenecks, test_ground_truth, test_filenames = (
       get_random_cached_bottlenecks(sess, image_lists, FLAGS.test_batch_size,
                                     'testing', FLAGS.bottleneck_dir,
                                     FLAGS.image_dir, jpeg_data_tensor,
                                     decoded_image_tensor, resized_image_tensor,
                                     bottleneck_tensor, FLAGS.architecture))
+
+  (sess, bottleneck_input, ground_truth_input, evaluation_step,
+   prediction) = build_eval_session(model_info, class_count)
+
   test_accuracy, predictions = sess.run(
       [evaluation_step, prediction],
       feed_dict={
diff --git a/tensorflow/examples/label_image/BUILD b/tensorflow/examples/label_image/BUILD
index 2abbe9dacca79b8d6e516550e28a9b203b18f123..c50fd93d03953b12113c17d420c4c5306a02ebe9 100644
--- a/tensorflow/examples/label_image/BUILD
+++ b/tensorflow/examples/label_image/BUILD
@@ -9,6 +9,8 @@ licenses(["notice"])  # Apache 2.0
 
 exports_files(["LICENSE"])
 
+exports_files(["data/grace_hopper.jpg"])
+
 load("//tensorflow:tensorflow.bzl", "tf_cc_binary")
 
 tf_cc_binary(
@@ -60,17 +62,3 @@ py_binary(
         "//tensorflow:tensorflow_py",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-            "bin/**",
-            "gen/**",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/examples/label_image/main.cc b/tensorflow/examples/label_image/main.cc
index 63bc39de6c0a420e03adada56cbc8b0f895b6155..baa65d3243ffbebdf3ccf8a786a2434dfb7cfdad 100644
--- a/tensorflow/examples/label_image/main.cc
+++ b/tensorflow/examples/label_image/main.cc
@@ -49,6 +49,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/init_main.h"
@@ -137,15 +138,15 @@ Status ReadTensorFromImageFile(const string& file_name, const int input_height,
   // Now try to figure out what kind of file it is and decode it.
   const int wanted_channels = 3;
   tensorflow::Output image_reader;
-  if (tensorflow::StringPiece(file_name).ends_with(".png")) {
+  if (tensorflow::str_util::EndsWith(file_name, ".png")) {
     image_reader = DecodePng(root.WithOpName("png_reader"), file_reader,
                              DecodePng::Channels(wanted_channels));
-  } else if (tensorflow::StringPiece(file_name).ends_with(".gif")) {
+  } else if (tensorflow::str_util::EndsWith(file_name, ".gif")) {
     // gif decoder returns 4-D tensor, remove the first dim
     image_reader =
         Squeeze(root.WithOpName("squeeze_first_dim"),
                 DecodeGif(root.WithOpName("gif_reader"), file_reader));
-  } else if (tensorflow::StringPiece(file_name).ends_with(".bmp")) {
+  } else if (tensorflow::str_util::EndsWith(file_name, ".bmp")) {
     image_reader = DecodeBmp(root.WithOpName("bmp_reader"), file_reader);
   } else {
     // Assume if it's neither a PNG nor a GIF then it must be a JPEG.
diff --git a/tensorflow/examples/learn/BUILD b/tensorflow/examples/learn/BUILD
index aba7f600b53cf8286d46ee70823a0a425944076f..bdbcb0b1638a400f12f66bb3c4ee9d852fe145d2 100644
--- a/tensorflow/examples/learn/BUILD
+++ b/tensorflow/examples/learn/BUILD
@@ -152,15 +152,3 @@ sh_test(
         "notap",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/examples/multibox_detector/BUILD b/tensorflow/examples/multibox_detector/BUILD
index 91a5bfa51cda71ed2bca37869c7305d752e1e035..4f9908cd52d98acc20b9238d9a0fdff39284ea32 100644
--- a/tensorflow/examples/multibox_detector/BUILD
+++ b/tensorflow/examples/multibox_detector/BUILD
@@ -27,17 +27,3 @@ tf_cc_binary(
         "//tensorflow/core:tensorflow",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-            "bin/**",
-            "gen/**",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/examples/multibox_detector/main.cc b/tensorflow/examples/multibox_detector/main.cc
index e38704fd98cea6928231f2fc2bc989705ae46bb4..96ea525a4e74c68da17d0310f0ad475789314215 100644
--- a/tensorflow/examples/multibox_detector/main.cc
+++ b/tensorflow/examples/multibox_detector/main.cc
@@ -30,6 +30,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/platform/logging.h"
@@ -84,10 +85,10 @@ Status ReadTensorFromImageFile(const string& file_name, const int input_height,
   // Now try to figure out what kind of file it is and decode it.
   const int wanted_channels = 3;
   tensorflow::Output image_reader;
-  if (tensorflow::StringPiece(file_name).ends_with(".png")) {
+  if (tensorflow::str_util::EndsWith(file_name, ".png")) {
     image_reader = DecodePng(root.WithOpName("png_reader"), file_reader,
                              DecodePng::Channels(wanted_channels));
-  } else if (tensorflow::StringPiece(file_name).ends_with(".gif")) {
+  } else if (tensorflow::str_util::EndsWith(file_name, ".gif")) {
     image_reader = DecodeGif(root.WithOpName("gif_reader"), file_reader);
   } else {
     // Assume if it's neither a PNG nor a GIF then it must be a JPEG.
@@ -131,7 +132,7 @@ Status ReadTensorFromImageFile(const string& file_name, const int input_height,
 
 Status SaveImage(const Tensor& tensor, const string& file_path) {
   LOG(INFO) << "Saving image to " << file_path;
-  CHECK(tensorflow::StringPiece(file_path).ends_with(".png"))
+  CHECK(tensorflow::str_util::EndsWith(file_path, ".png"))
       << "Only saving of png files is supported.";
 
   auto root = tensorflow::Scope::NewRootScope();
diff --git a/tensorflow/examples/saved_model/BUILD b/tensorflow/examples/saved_model/BUILD
index 1cdf5ec6e1d80c8337d7929159860e093ad07364..ebefc6576d646467426a784d03f4be206aeaba38 100644
--- a/tensorflow/examples/saved_model/BUILD
+++ b/tensorflow/examples/saved_model/BUILD
@@ -8,19 +8,6 @@ licenses(["notice"])  # Apache 2.0
 
 exports_files(["LICENSE"])
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-            "g3doc/sitemap.md",
-        ],
-    ),
-    visibility = ["//visibility:public"],
-)
-
 py_binary(
     name = "saved_model_half_plus_two",
     srcs = [
diff --git a/tensorflow/examples/speech_commands/BUILD b/tensorflow/examples/speech_commands/BUILD
index 12479211c32a965642d23226406617df6ff5a29c..13bca34a86b0c2fba7e5e8e3527d13587feacaae 100644
--- a/tensorflow/examples/speech_commands/BUILD
+++ b/tensorflow/examples/speech_commands/BUILD
@@ -245,15 +245,3 @@ tf_cc_binary(
         "//tensorflow/core:protos_all_cc",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/examples/tutorials/estimators/BUILD b/tensorflow/examples/tutorials/estimators/BUILD
index ecbc1a431d9a2173e80434b6f9350c225fc9bfb4..bab609f208b6ca3dd6daa8ecfd0c0c762ef87a22 100644
--- a/tensorflow/examples/tutorials/estimators/BUILD
+++ b/tensorflow/examples/tutorials/estimators/BUILD
@@ -20,15 +20,3 @@ py_binary(
         "//third_party/py/numpy",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/examples/tutorials/layers/BUILD b/tensorflow/examples/tutorials/layers/BUILD
index f8a29c79c63cb77d15ff03d0cf5c98ae36ccc3f8..aad78b18409bab1fe6924849ec5b61c6f3a052f7 100644
--- a/tensorflow/examples/tutorials/layers/BUILD
+++ b/tensorflow/examples/tutorials/layers/BUILD
@@ -19,15 +19,3 @@ py_binary(
         "//third_party/py/numpy",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/examples/tutorials/mnist/BUILD b/tensorflow/examples/tutorials/mnist/BUILD
index 6d4e67063d8470788a74e0083b62a2db12dd7c64..aa1b2ec2db34f3cb0350bfde88a1598ed71456de 100644
--- a/tensorflow/examples/tutorials/mnist/BUILD
+++ b/tensorflow/examples/tutorials/mnist/BUILD
@@ -132,15 +132,3 @@ py_test(
         "//tensorflow:tensorflow_py",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/examples/tutorials/monitors/BUILD b/tensorflow/examples/tutorials/monitors/BUILD
index 4220e8144de1259dc5bd873ddb5810bf95dcafae..1c49e3fe5390ad48a3dea7cd5688996270b1dc9d 100644
--- a/tensorflow/examples/tutorials/monitors/BUILD
+++ b/tensorflow/examples/tutorials/monitors/BUILD
@@ -23,15 +23,3 @@ py_binary(
         "//third_party/py/numpy",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/examples/tutorials/word2vec/BUILD b/tensorflow/examples/tutorials/word2vec/BUILD
index bfcf4592690a1692db67090c9b6d4e1e4832c45f..2e19c038bdf04235ccd2f4fdbfeff250ca72a07e 100644
--- a/tensorflow/examples/tutorials/word2vec/BUILD
+++ b/tensorflow/examples/tutorials/word2vec/BUILD
@@ -21,14 +21,3 @@ py_binary(
         "//third_party/py/numpy",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-)
diff --git a/tensorflow/examples/wav_to_spectrogram/BUILD b/tensorflow/examples/wav_to_spectrogram/BUILD
index c99870c686c18c1b201ec44c8335b3d9ba24f5a1..cc8835728d59b6a57d46167686987aa34ab9d0a0 100644
--- a/tensorflow/examples/wav_to_spectrogram/BUILD
+++ b/tensorflow/examples/wav_to_spectrogram/BUILD
@@ -49,17 +49,3 @@ tf_cc_test(
         "//tensorflow/core:test_main",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-            "bin/**",
-            "gen/**",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go
index 838f4f230193b871dfd62b5c19943e2f9fa0fc89..09da8c189277edc268d1ae210815a243ece5800e 100644
--- a/tensorflow/go/op/wrappers.go
+++ b/tensorflow/go/op/wrappers.go
@@ -1320,201 +1320,134 @@ func PadV2(scope *Scope, input tf.Output, paddings tf.Output, constant_values tf
 	return op.Output(0)
 }
 
-// Returns the complex conjugate of a complex number.
-//
-// Given a tensor `input` of complex numbers, this operation returns a tensor of
-// complex numbers that are the complex conjugate of each element in `input`. The
-// complex numbers in `input` must be of the form \\(a + bj\\), where *a* is the
-// real part and *b* is the imaginary part.
-//
-// The complex conjugate returned by this operation is of the form \\(a - bj\\).
-//
-// For example:
+// Return the reduction indices for computing gradients of s0 op s1 with broadcast.
 //
-// ```
-// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
-// tf.conj(input) ==> [-2.25 - 4.75j, 3.25 - 5.75j]
-// ```
-func Conj(scope *Scope, input tf.Output) (output tf.Output) {
+// This is typically used by gradient computations for a broadcasting operation.
+func BroadcastGradientArgs(scope *Scope, s0 tf.Output, s1 tf.Output) (r0 tf.Output, r1 tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Conj",
+		Type: "BroadcastGradientArgs",
 		Input: []tf.Input{
-			input,
+			s0, s1,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// ResourceSparseApplyMomentumAttr is an optional argument to ResourceSparseApplyMomentum.
-type ResourceSparseApplyMomentumAttr func(optionalAttr)
-
-// ResourceSparseApplyMomentumUseLocking sets the optional use_locking attribute to value.
-//
-// value: If `True`, updating of the var and accum tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceSparseApplyMomentumUseLocking(value bool) ResourceSparseApplyMomentumAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// ResourceSparseApplyMomentumUseNesterov sets the optional use_nesterov attribute to value.
-//
-// value: If `True`, the tensor passed to compute grad will be
-// var - lr * momentum * accum, so in the end, the var you get is actually
-// var - lr * momentum * accum.
-// If not specified, defaults to false
-func ResourceSparseApplyMomentumUseNesterov(value bool) ResourceSparseApplyMomentumAttr {
-	return func(m optionalAttr) {
-		m["use_nesterov"] = value
-	}
+	return op.Output(0), op.Output(1)
 }
 
-// Update relevant entries in '*var' and '*accum' according to the momentum scheme.
-//
-// Set use_nesterov = True if you want to use Nesterov momentum.
-//
-// That is for rows we have grad for, we update var and accum as follows:
-//
-// accum = accum * momentum + grad
-// var -= lr * accum
+// Returns the gradient of `Tile`.
 //
-// Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	lr: Learning rate. Must be a scalar.
-//	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var and accum.
-//	momentum: Momentum. Must be a scalar.
+// DEPRECATED at GraphDef version 3: TileGrad has been replaced with reduce_sum
 //
-// Returns the created operation.
-func ResourceSparseApplyMomentum(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, indices tf.Output, momentum tf.Output, optional ...ResourceSparseApplyMomentumAttr) (o *tf.Operation) {
+// Since `Tile` takes an input and repeats the input `multiples` times
+// along each dimension, `TileGrad` takes in `multiples` and aggregates
+// each repeated tile of `input` into `output`.
+func TileGrad(scope *Scope, input tf.Output, multiples tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyMomentum",
+		Type: "TileGrad",
 		Input: []tf.Input{
-			var_, accum, lr, grad, indices, momentum,
+			input, multiples,
 		},
-		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Creates a sequence of numbers.
-//
-// This operation creates a sequence of numbers that begins at `start` and
-// extends by increments of `delta` up to but not including `limit`.
-//
-// For example:
+// Constructs a tensor by tiling a given tensor.
 //
-// ```
-// # 'start' is 3
-// # 'limit' is 18
-// # 'delta' is 3
-// tf.range(start, limit, delta) ==> [3, 6, 9, 12, 15]
-// ```
+// This operation creates a new tensor by replicating `input` `multiples` times.
+// The output tensor's i'th dimension has `input.dims(i) * multiples[i]` elements,
+// and the values of `input` are replicated `multiples[i]` times along the 'i'th
+// dimension. For example, tiling `[a b c d]` by `[2]` produces
+// `[a b c d a b c d]`.
 //
 // Arguments:
-//	start: 0-D (scalar). First entry in the sequence.
-//	limit: 0-D (scalar). Upper limit of sequence, exclusive.
-//	delta: 0-D (scalar). Optional. Default is 1. Number that increments `start`.
-//
-// Returns 1-D.
-func Range(scope *Scope, start tf.Output, limit tf.Output, delta tf.Output) (output tf.Output) {
+//	input: 1-D or higher.
+//	multiples: 1-D. Length must be the same as the number of dimensions in `input`
+func Tile(scope *Scope, input tf.Output, multiples tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Range",
+		Type: "Tile",
 		Input: []tf.Input{
-			start, limit, delta,
+			input, multiples,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes gradients for SparseSegmentSqrtN.
-//
-// Returns tensor "output" with same shape as grad, except for dimension 0 whose
-// value is output_dim0.
-//
-// Arguments:
-//	grad: gradient propagated to the SparseSegmentSqrtN op.
-//	indices: indices passed to the corresponding SparseSegmentSqrtN op.
-//	segment_ids: segment_ids passed to the corresponding SparseSegmentSqrtN op.
-//	output_dim0: dimension 0 of "data" passed to SparseSegmentSqrtN op.
-func SparseSegmentSqrtNGrad(scope *Scope, grad tf.Output, indices tf.Output, segment_ids tf.Output, output_dim0 tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
+// StridedSliceGradAttr is an optional argument to StridedSliceGrad.
+type StridedSliceGradAttr func(optionalAttr)
+
+// StridedSliceGradBeginMask sets the optional begin_mask attribute to value.
+// If not specified, defaults to 0
+func StridedSliceGradBeginMask(value int64) StridedSliceGradAttr {
+	return func(m optionalAttr) {
+		m["begin_mask"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "SparseSegmentSqrtNGrad",
-		Input: []tf.Input{
-			grad, indices, segment_ids, output_dim0,
-		},
+}
+
+// StridedSliceGradEndMask sets the optional end_mask attribute to value.
+// If not specified, defaults to 0
+func StridedSliceGradEndMask(value int64) StridedSliceGradAttr {
+	return func(m optionalAttr) {
+		m["end_mask"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Computes the mean along sparse segments of a tensor.
-//
-// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
-// segments.
-//
-// Like `SegmentMean`, but `segment_ids` can have rank less than `data`'s first
-// dimension, selecting a subset of dimension 0, specified by `indices`.
-//
-// Arguments:
-//
-//	indices: A 1-D tensor. Has same rank as `segment_ids`.
-//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
-//
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SparseSegmentMean(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
+// StridedSliceGradEllipsisMask sets the optional ellipsis_mask attribute to value.
+// If not specified, defaults to 0
+func StridedSliceGradEllipsisMask(value int64) StridedSliceGradAttr {
+	return func(m optionalAttr) {
+		m["ellipsis_mask"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "SparseSegmentMean",
-		Input: []tf.Input{
-			data, indices, segment_ids,
-		},
+}
+
+// StridedSliceGradNewAxisMask sets the optional new_axis_mask attribute to value.
+// If not specified, defaults to 0
+func StridedSliceGradNewAxisMask(value int64) StridedSliceGradAttr {
+	return func(m optionalAttr) {
+		m["new_axis_mask"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Pop the element at the top of the stack.
+// StridedSliceGradShrinkAxisMask sets the optional shrink_axis_mask attribute to value.
+// If not specified, defaults to 0
+func StridedSliceGradShrinkAxisMask(value int64) StridedSliceGradAttr {
+	return func(m optionalAttr) {
+		m["shrink_axis_mask"] = value
+	}
+}
+
+// Returns the gradient of `StridedSlice`.
 //
-// Arguments:
-//	handle: The handle to a stack.
-//	elem_type: The type of the elem that is popped.
+// Since `StridedSlice` cuts out pieces of its `input` which is size
+// `shape`, its gradient will have the same shape (which is passed here
+// as `shape`). The gradient will be zero in any element that the slice
+// does not select.
 //
-// Returns The tensor that is popped from the top of the stack.
-func StackPopV2(scope *Scope, handle tf.Output, elem_type tf.DataType) (elem tf.Output) {
+// Arguments are the same as StridedSliceGrad with the exception that
+// `dy` is the input gradient to be propagated and `shape` is the
+// shape of `StridedSlice`'s `input`.
+func StridedSliceGrad(scope *Scope, shape tf.Output, begin tf.Output, end tf.Output, strides tf.Output, dy tf.Output, optional ...StridedSliceGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"elem_type": elem_type}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "StackPopV2",
+		Type: "StridedSliceGrad",
 		Input: []tf.Input{
-			handle,
+			shape, begin, end, strides, dy,
 		},
 		Attrs: attrs,
 	}
@@ -1522,104 +1455,181 @@ func StackPopV2(scope *Scope, handle tf.Output, elem_type tf.DataType) (elem tf.
 	return op.Output(0)
 }
 
-// Computes the sum along sparse segments of a tensor.
-//
-// Like `SparseSegmentSum`, but allows missing ids in `segment_ids`. If an id is
-// misisng, the `output` tensor at that position will be zeroed.
-//
-// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
-// segments.
+// StridedSliceAttr is an optional argument to StridedSlice.
+type StridedSliceAttr func(optionalAttr)
+
+// StridedSliceBeginMask sets the optional begin_mask attribute to value.
 //
-// For example:
+// value: a bitmask where a bit i being 1 means to ignore the begin
+// value and instead use the largest interval possible. At runtime
+// begin[i] will be replaced with `[0, n-1) if `stride[i] > 0` or
+// `[-1, n-1]` if `stride[i] < 0`
+// If not specified, defaults to 0
+func StridedSliceBeginMask(value int64) StridedSliceAttr {
+	return func(m optionalAttr) {
+		m["begin_mask"] = value
+	}
+}
+
+// StridedSliceEndMask sets the optional end_mask attribute to value.
 //
-// ```python
-// c = tf.constant([[1,2,3,4], [-1,-2,-3,-4], [5,6,7,8]])
+// value: analogous to `begin_mask`
+// If not specified, defaults to 0
+func StridedSliceEndMask(value int64) StridedSliceAttr {
+	return func(m optionalAttr) {
+		m["end_mask"] = value
+	}
+}
+
+// StridedSliceEllipsisMask sets the optional ellipsis_mask attribute to value.
 //
-// tf.sparse_segment_sum_with_num_segments(
-//     c, tf.constant([0, 1]), tf.constant([0, 0]), num_segments=3)
-// # => [[0 0 0 0]
-// #     [0 0 0 0]
-// #     [0 0 0 0]]
-//
-// tf.sparse_segment_sum_with_num_segments(c,
-//                                         tf.constant([0, 1]),
-//                                         tf.constant([0, 2],
-//                                         num_segments=4))
-// # => [[ 1  2  3  4]
-// #     [ 0  0  0  0]
-// #     [-1 -2 -3 -4]
-// #     [ 0  0  0  0]]
-// ```
-//
-// Arguments:
-//
-//	indices: A 1-D tensor. Has same rank as `segment_ids`.
-//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
-//	num_segments: Should equal the number of distinct segment IDs.
-//
-// Returns Has same shape as data, except for dimension 0 which
-// has size `num_segments`.
-func SparseSegmentSumWithNumSegments(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SparseSegmentSumWithNumSegments",
-		Input: []tf.Input{
-			data, indices, segment_ids, num_segments,
-		},
+// value: a bitmask where bit `i` being 1 means the `i`th
+// position is actually an ellipsis. One bit at most can be 1.
+// If `ellipsis_mask == 0`, then an implicit ellipsis mask of `1 << (m+1)`
+// is provided. This means that `foo[3:5] == foo[3:5, ...]`. An ellipsis
+// implicitly creates as many range specifications as necessary to fully
+// specify the sliced range for every dimension. For example for a 4-dimensional
+// tensor `foo` the slice `foo[2, ..., 5:8]` implies `foo[2, :, :, 5:8]`.
+// If not specified, defaults to 0
+func StridedSliceEllipsisMask(value int64) StridedSliceAttr {
+	return func(m optionalAttr) {
+		m["ellipsis_mask"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// SparseToDenseAttr is an optional argument to SparseToDense.
-type SparseToDenseAttr func(optionalAttr)
+// StridedSliceNewAxisMask sets the optional new_axis_mask attribute to value.
+//
+// value: a bitmask where bit `i` being 1 means the `i`th
+// specification creates a new shape 1 dimension. For example
+// `foo[:4, tf.newaxis, :2]` would produce a shape `(4, 1, 2)` tensor.
+// If not specified, defaults to 0
+func StridedSliceNewAxisMask(value int64) StridedSliceAttr {
+	return func(m optionalAttr) {
+		m["new_axis_mask"] = value
+	}
+}
 
-// SparseToDenseValidateIndices sets the optional validate_indices attribute to value.
+// StridedSliceShrinkAxisMask sets the optional shrink_axis_mask attribute to value.
 //
-// value: If true, indices are checked to make sure they are sorted in
-// lexicographic order and that there are no repeats.
-// If not specified, defaults to true
-func SparseToDenseValidateIndices(value bool) SparseToDenseAttr {
+// value: a bitmask where bit `i` implies that the `i`th
+// specification should shrink the dimensionality. begin and end
+// must imply a slice of size 1 in the dimension. For example in
+// python one might do `foo[:, 3, :]` which would result in
+// `shrink_axis_mask` being 2.
+// If not specified, defaults to 0
+func StridedSliceShrinkAxisMask(value int64) StridedSliceAttr {
 	return func(m optionalAttr) {
-		m["validate_indices"] = value
+		m["shrink_axis_mask"] = value
 	}
 }
 
-// Converts a sparse representation into a dense tensor.
+// Return a strided slice from `input`.
 //
-// Builds an array `dense` with shape `output_shape` such that
+// Note, most python users will want to use the Python `Tensor.__getitem__`
+// or `Variable.__getitem__` rather than this op directly.
 //
-// ```
-// # If sparse_indices is scalar
-// dense[i] = (i == sparse_indices ? sparse_values : default_value)
+// The goal of this op is to produce a new tensor with a subset of
+// the elements from the `n` dimensional `input` tensor. The subset is chosen using
+// a sequence of `m` sparse range specifications encoded into the arguments
+// of this function. Note, in some cases
+// `m` could be equal to `n`, but this need not be the case. Each
+// range specification entry can be one of the following:
 //
-// # If sparse_indices is a vector, then for each i
-// dense[sparse_indices[i]] = sparse_values[i]
+// - An ellipsis (...). Ellipses are used to imply zero or more
+//   dimensions of full-dimension selection and are produced using
+//   `ellipsis_mask`. For example, `foo[...]` is the identity slice.
 //
-// # If sparse_indices is an n by d matrix, then for each i in [0, n)
-// dense[sparse_indices[i][0], ..., sparse_indices[i][d-1]] = sparse_values[i]
+// - A new axis. This is used to insert a new shape=1 dimension and is
+//   produced using `new_axis_mask`. For example, `foo[:, ...]` where
+//   `foo` is shape `(3, 4)` produces a `(1, 3, 4)` tensor.
+//
+//
+// - A range `begin:end:stride`. This is used to specify how much to choose from
+//   a given dimension. `stride` can be any integer but 0.  `begin` is an integer
+//   which represents the index of the first value to select while `end` represents
+//   the index of the last value to select. The number of values selected in each
+//   dimension is `end - begin` if `stride > 0` and `begin - end` if `stride < 0`.
+//   `begin` and `end` can be negative where `-1` is the last element, `-2` is
+//   the second to last. `begin_mask` controls whether to replace the explicitly
+//   given `begin` with an implicit effective value of `0` if `stride > 0` and
+//   `-1` if `stride < 0`. `end_mask` is analogous but produces the number
+//   required to create the largest open interval. For example, given a shape
+//   `(3,)` tensor `foo[:]`, the effective `begin` and `end` are `0` and `3`. Do
+//   not assume this is equivalent to `foo[0:-1]` which has an effective `begin`
+//   and `end` of `0` and `2`. Another example is `foo[-2::-1]` which reverses the
+//   first dimension of a tensor while dropping the last two (in the original
+//   order elements). For example `foo = [1,2,3,4]; foo[-2::-1]` is `[4,3]`.
+//
+// - A single index. This is used to keep only elements that have a given
+//   index. For example (`foo[2, :]` on a shape `(5,6)` tensor produces a
+//   shape `(6,)` tensor. This is encoded in `begin` and `end` and
+//   `shrink_axis_mask`.
+//
+// Each conceptual range specification is encoded in the op's argument. This
+// encoding is best understand by considering a non-trivial example. In
+// particular,
+// `foo[1, 2:4, None, ..., :-3:-1, :]` will be encoded as
+//
+// ```
+// begin = [1, 2, x, x, 0, x] # x denotes don't care (usually 0)
+// end = [2, 4, x, x, -3, x]
+// strides = [1, 1, x, x, -1, 1]
+// begin_mask = 1<<4 | 1 << 5 = 48
+// end_mask = 1<<5 = 32
+// ellipsis_mask = 1<<3 = 8
+// new_axis_mask = 1<<2 4
+// shrink_axis_mask = 1<<0
 // ```
 //
-// All other values in `dense` are set to `default_value`.  If `sparse_values` is a
-// scalar, all sparse indices are set to this single value.
+// In this case if `foo.shape` is (5, 5, 5, 5, 5, 5) the final shape of
+// the slice becomes (2, 1, 5, 5, 2, 5).
+// Let us walk step by step through each argument specification.
 //
-// Indices should be sorted in lexicographic order, and indices must not
-// contain any repeats. If `validate_indices` is true, these properties
-// are checked during execution.
+// 1.  The first argument in the example slice is turned into `begin = 1` and
+// `end = begin + 1 = 2`. To disambiguate from the original spec `2:4` we
+// also set the appropriate bit in `shrink_axis_mask`.
+//
+// 2. `2:4` is contributes 2, 4, 1 to begin, end, and stride. All masks have
+// zero bits contributed.
+//
+// 3. None is a synonym for `tf.newaxis`. This means insert a dimension of size 1
+// dimension in the final shape. Dummy values are contributed to begin,
+// end and stride, while the new_axis_mask bit is set.
+//
+// 4. `...` grab the full ranges from as many dimensions as needed to
+// fully specify a slice for every dimension of the input shape.
+//
+// 5. `:-3:-1` shows the use of negative indices. A negative index `i` associated
+// with a dimension that has shape `s` is converted to a positive index
+// `s + i`. So `-1` becomes `s-1` (i.e. the last element). This conversion
+// is done internally so begin, end and strides receive x, -3, and -1.
+// The appropriate begin_mask bit is set to indicate the start range is the
+// full range (ignoring the x).
+//
+// 6. `:` indicates that the entire contents of the corresponding dimension
+// is selected. This is equivalent to `::` or `0::1`. begin, end, and strides
+// receive 0, 0, and 1, respectively. The appropriate bits in `begin_mask` and
+// `end_mask` are also set.
+//
+// *Requirements*:
+//   `0 != strides[i] for i in [0, m)`
+//   `ellipsis_mask must be a power of two (only one ellipsis)`
 //
 // Arguments:
-//	sparse_indices: 0-D, 1-D, or 2-D.  `sparse_indices[i]` contains the complete
-// index where `sparse_values[i]` will be placed.
-//	output_shape: 1-D.  Shape of the dense output tensor.
-//	sparse_values: 1-D.  Values corresponding to each row of `sparse_indices`,
-// or a scalar value to be used for all sparse indices.
-//	default_value: Scalar value to set for indices not specified in
-// `sparse_indices`.
 //
-// Returns Dense output tensor of shape `output_shape`.
-func SparseToDense(scope *Scope, sparse_indices tf.Output, output_shape tf.Output, sparse_values tf.Output, default_value tf.Output, optional ...SparseToDenseAttr) (dense tf.Output) {
+//	begin: `begin[k]` specifies the offset into the `k`th range specification.
+// The exact dimension this corresponds to will be determined by context.
+// Out-of-bounds values will be silently clamped. If the `k`th bit of
+// `begin_mask` then `begin[k]` is ignored and the full range of the
+// appropriate dimension is used instead. Negative values causes indexing
+// to start from the highest element e.g. If `foo==[1,2,3]` then `foo[-1]==3`.
+//	end: `end[i]` is like `begin` with the exception that `end_mask` is
+// used to determine full ranges.
+//	strides: `strides[i]` specifies the increment in the `i`th specification
+// after extracting a given element. Negative indices will reverse
+// the original order. Out or range values are
+// clamped to `[0,dim[i]) if slice[i]>0` or `[-1,dim[i]-1] if slice[i] < 0`
+func StridedSlice(scope *Scope, input tf.Output, begin tf.Output, end tf.Output, strides tf.Output, optional ...StridedSliceAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -1628,9 +1638,9 @@ func SparseToDense(scope *Scope, sparse_indices tf.Output, output_shape tf.Outpu
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseToDense",
+		Type: "StridedSlice",
 		Input: []tf.Input{
-			sparse_indices, output_shape, sparse_values, default_value,
+			input, begin, end, strides,
 		},
 		Attrs: attrs,
 	}
@@ -1638,216 +1648,239 @@ func SparseToDense(scope *Scope, sparse_indices tf.Output, output_shape tf.Outpu
 	return op.Output(0)
 }
 
-// Counts the number of occurrences of each value in an integer array.
+// Return a slice from 'input'.
 //
-// Outputs a vector with length `size` and the same dtype as `weights`. If
-// `weights` are empty, then index `i` stores the number of times the value `i` is
-// counted in `arr`. If `weights` are non-empty, then index `i` stores the sum of
-// the value in `weights` at each index where the corresponding value in `arr` is
-// `i`.
+// The output tensor is a tensor with dimensions described by 'size'
+// whose values are extracted from 'input' starting at the offsets in
+// 'begin'.
 //
-// Values in `arr` outside of the range [0, size) are ignored.
+// *Requirements*:
+//   0 <= begin[i] <= begin[i] + size[i] <= Di  for i in [0, n)
 //
 // Arguments:
-//	arr: int32 `Tensor`.
-//	size: non-negative int32 scalar `Tensor`.
-//	weights: is an int32, int64, float32, or float64 `Tensor` with the same
-// shape as `arr`, or a length-0 `Tensor`, in which case it acts as all weights
-// equal to 1.
 //
-// Returns 1D `Tensor` with length equal to `size`. The counts or summed weights for
-// each value in the range [0, size).
-func Bincount(scope *Scope, arr tf.Output, size tf.Output, weights tf.Output) (bins tf.Output) {
+//	begin: begin[i] specifies the offset into the 'i'th dimension of
+// 'input' to slice from.
+//	size: size[i] specifies the number of elements of the 'i'th dimension
+// of 'input' to slice. If size[i] is -1, all remaining elements in dimension
+// i are included in the slice (i.e. this is equivalent to setting
+// size[i] = input.dim_size(i) - begin[i]).
+func Slice(scope *Scope, input tf.Output, begin tf.Output, size tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Bincount",
+		Type: "Slice",
 		Input: []tf.Input{
-			arr, size, weights,
+			input, begin, size,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the sum along sparse segments of a tensor.
-//
-// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
-// segments.
+// SizeAttr is an optional argument to Size.
+type SizeAttr func(optionalAttr)
+
+// SizeOutType sets the optional out_type attribute to value.
+// If not specified, defaults to DT_INT32
+func SizeOutType(value tf.DataType) SizeAttr {
+	return func(m optionalAttr) {
+		m["out_type"] = value
+	}
+}
+
+// Returns the size of a tensor.
 //
-// Like `SegmentSum`, but `segment_ids` can have rank less than `data`'s first
-// dimension, selecting a subset of dimension 0, specified by `indices`.
+// This operation returns an integer representing the number of elements in
+// `input`.
 //
 // For example:
 //
-// ```python
-// c = tf.constant([[1,2,3,4], [-1,-2,-3,-4], [5,6,7,8]])
-//
-// # Select two rows, one segment.
-// tf.sparse_segment_sum(c, tf.constant([0, 1]), tf.constant([0, 0]))
-// # => [[0 0 0 0]]
-//
-// # Select two rows, two segment.
-// tf.sparse_segment_sum(c, tf.constant([0, 1]), tf.constant([0, 1]))
-// # => [[ 1  2  3  4]
-// #     [-1 -2 -3 -4]]
-//
-// # Select all rows, two segments.
-// tf.sparse_segment_sum(c, tf.constant([0, 1, 2]), tf.constant([0, 0, 1]))
-// # => [[0 0 0 0]
-// #     [5 6 7 8]]
-//
-// # Which is equivalent to:
-// tf.segment_sum(c, tf.constant([0, 0, 1]))
 // ```
-//
-// Arguments:
-//
-//	indices: A 1-D tensor. Has same rank as `segment_ids`.
-//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
-//
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SparseSegmentSum(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output) (output tf.Output) {
+// # 't' is [[[1, 1,, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]]
+// size(t) ==> 12
+// ```
+func Size(scope *Scope, input tf.Output, optional ...SizeAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SparseSegmentSum",
+		Type: "Size",
 		Input: []tf.Input{
-			data, indices, segment_ids,
+			input,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes hyperbolic sine of x element-wise.
-func Sinh(scope *Scope, x tf.Output) (y tf.Output) {
+// Returns the rank of a tensor.
+//
+// This operation returns an integer representing the rank of `input`.
+//
+// For example:
+//
+// ```
+// # 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]
+// # shape of tensor 't' is [2, 2, 3]
+// rank(t) ==> 3
+// ```
+//
+// **Note**: The rank of a tensor is not the same as the rank of a matrix. The rank
+// of a tensor is the number of indices required to uniquely select each element
+// of the tensor. Rank is also known as "order", "degree", or "ndims."
+func Rank(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Sinh",
+		Type: "Rank",
 		Input: []tf.Input{
-			x,
+			input,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes rectified linear 6: `min(max(features, 0), 6)`.
-func Relu6(scope *Scope, features tf.Output) (activations tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Relu6",
-		Input: []tf.Input{
-			features,
-		},
+// ReverseSequenceAttr is an optional argument to ReverseSequence.
+type ReverseSequenceAttr func(optionalAttr)
+
+// ReverseSequenceBatchDim sets the optional batch_dim attribute to value.
+//
+// value: The dimension along which reversal is performed.
+// If not specified, defaults to 0
+func ReverseSequenceBatchDim(value int64) ReverseSequenceAttr {
+	return func(m optionalAttr) {
+		m["batch_dim"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Computes the sum along segments of a tensor.
+// Reverses variable length slices.
 //
-// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
-// segments.
+// This op first slices `input` along the dimension `batch_dim`, and for each
+// slice `i`, reverses the first `seq_lengths[i]` elements along
+// the dimension `seq_dim`.
 //
-// Computes a tensor such that
-// `(output[i] = sum_{j...} data[j...]` where the sum is over tuples `j...` such
-// that `segment_ids[j...] == i`.  Unlike `SegmentSum`, `segment_ids`
-// need not be sorted and need not cover all values in the full
-// range of valid values.
+// The elements of `seq_lengths` must obey `seq_lengths[i] <= input.dims[seq_dim]`,
+// and `seq_lengths` must be a vector of length `input.dims[batch_dim]`.
 //
-// If the sum is empty for a given segment ID `i`, `output[i] = 0`.
-// If the given segment ID `i` is negative, the value is dropped and will not be
-// added to the sum of the segment.
+// The output slice `i` along dimension `batch_dim` is then given by input
+// slice `i`, with the first `seq_lengths[i]` slices along dimension
+// `seq_dim` reversed.
 //
-// `num_segments` should equal the number of distinct segment IDs.
+// For example:
 //
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/UnsortedSegmentSum.png" alt>
-// </div>
+// ```
+// # Given this:
+// batch_dim = 0
+// seq_dim = 1
+// input.dims = (4, 8, ...)
+// seq_lengths = [7, 2, 3, 5]
 //
-// Arguments:
+// # then slices of input are reversed on seq_dim, but only up to seq_lengths:
+// output[0, 0:7, :, ...] = input[0, 7:0:-1, :, ...]
+// output[1, 0:2, :, ...] = input[1, 2:0:-1, :, ...]
+// output[2, 0:3, :, ...] = input[2, 3:0:-1, :, ...]
+// output[3, 0:5, :, ...] = input[3, 5:0:-1, :, ...]
 //
-//	segment_ids: A tensor whose shape is a prefix of `data.shape`.
+// # while entries past seq_lens are copied through:
+// output[0, 7:, :, ...] = input[0, 7:, :, ...]
+// output[1, 2:, :, ...] = input[1, 2:, :, ...]
+// output[2, 3:, :, ...] = input[2, 3:, :, ...]
+// output[3, 2:, :, ...] = input[3, 2:, :, ...]
+// ```
 //
+// In contrast, if:
 //
-// Returns Has same shape as data, except for the first `segment_ids.rank`
-// dimensions, which are replaced with a single dimension which has size
-// `num_segments`.
-func UnsortedSegmentSum(scope *Scope, data tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "UnsortedSegmentSum",
-		Input: []tf.Input{
-			data, segment_ids, num_segments,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Returns which elements of x are finite.
+// ```
+// # Given this:
+// batch_dim = 2
+// seq_dim = 0
+// input.dims = (8, ?, 4, ...)
+// seq_lengths = [7, 2, 3, 5]
 //
-// @compatibility(numpy)
-// Equivalent to np.isfinite
-// @end_compatibility
-func IsFinite(scope *Scope, x tf.Output) (y tf.Output) {
+// # then slices of input are reversed on seq_dim, but only up to seq_lengths:
+// output[0:7, :, 0, :, ...] = input[7:0:-1, :, 0, :, ...]
+// output[0:2, :, 1, :, ...] = input[2:0:-1, :, 1, :, ...]
+// output[0:3, :, 2, :, ...] = input[3:0:-1, :, 2, :, ...]
+// output[0:5, :, 3, :, ...] = input[5:0:-1, :, 3, :, ...]
+//
+// # while entries past seq_lens are copied through:
+// output[7:, :, 0, :, ...] = input[7:, :, 0, :, ...]
+// output[2:, :, 1, :, ...] = input[2:, :, 1, :, ...]
+// output[3:, :, 2, :, ...] = input[3:, :, 2, :, ...]
+// output[2:, :, 3, :, ...] = input[2:, :, 3, :, ...]
+// ```
+//
+// Arguments:
+//	input: The input to reverse.
+//	seq_lengths: 1-D with length `input.dims(batch_dim)` and
+// `max(seq_lengths) <= input.dims(seq_dim)`
+//	seq_dim: The dimension which is partially reversed.
+//
+// Returns The partially reversed input. It has the same shape as `input`.
+func ReverseSequence(scope *Scope, input tf.Output, seq_lengths tf.Output, seq_dim int64, optional ...ReverseSequenceAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"seq_dim": seq_dim}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "IsFinite",
+		Type: "ReverseSequence",
 		Input: []tf.Input{
-			x,
+			input, seq_lengths,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// MatMulAttr is an optional argument to MatMul.
-type MatMulAttr func(optionalAttr)
+// UniqueWithCountsAttr is an optional argument to UniqueWithCounts.
+type UniqueWithCountsAttr func(optionalAttr)
 
-// MatMulTransposeA sets the optional transpose_a attribute to value.
-//
-// value: If true, "a" is transposed before multiplication.
-// If not specified, defaults to false
-func MatMulTransposeA(value bool) MatMulAttr {
+// UniqueWithCountsOutIdx sets the optional out_idx attribute to value.
+// If not specified, defaults to DT_INT32
+func UniqueWithCountsOutIdx(value tf.DataType) UniqueWithCountsAttr {
 	return func(m optionalAttr) {
-		m["transpose_a"] = value
+		m["out_idx"] = value
 	}
 }
 
-// MatMulTransposeB sets the optional transpose_b attribute to value.
+// Finds unique elements in a 1-D tensor.
 //
-// value: If true, "b" is transposed before multiplication.
-// If not specified, defaults to false
-func MatMulTransposeB(value bool) MatMulAttr {
-	return func(m optionalAttr) {
-		m["transpose_b"] = value
-	}
-}
-
-// Multiply the matrix "a" by the matrix "b".
+// This operation returns a tensor `y` containing all of the unique elements of `x`
+// sorted in the same order that they occur in `x`. This operation also returns a
+// tensor `idx` the same size as `x` that contains the index of each value of `x`
+// in the unique output `y`. Finally, it returns a third tensor `count` that
+// contains the count of each element of `y` in `x`. In other words:
 //
-// The inputs must be two-dimensional matrices and the inner dimension of
-// "a" (after being transposed if transpose_a is true) must match the
-// outer dimension of "b" (after being transposed if transposed_b is
-// true).
+// `y[idx[i]] = x[i] for i in [0, 1,...,rank(x) - 1]`
 //
-// *Note*: The default kernel implementation for MatMul on GPUs uses
-// cublas.
-func MatMul(scope *Scope, a tf.Output, b tf.Output, optional ...MatMulAttr) (product tf.Output) {
+// For example:
+//
+// ```
+// # tensor 'x' is [1, 1, 2, 4, 4, 4, 7, 8, 8]
+// y, idx, count = unique_with_counts(x)
+// y ==> [1, 2, 4, 7, 8]
+// idx ==> [0, 0, 1, 2, 2, 2, 3, 4, 4]
+// count ==> [2, 1, 3, 1, 2]
+// ```
+//
+// Arguments:
+//	x: 1-D.
+//
+// Returns 1-D.1-D.1-D.
+func UniqueWithCounts(scope *Scope, x tf.Output, optional ...UniqueWithCountsAttr) (y tf.Output, idx tf.Output, count tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -1856,397 +1889,524 @@ func MatMul(scope *Scope, a tf.Output, b tf.Output, optional ...MatMulAttr) (pro
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MatMul",
+		Type: "UniqueWithCounts",
 		Input: []tf.Input{
-			a, b,
+			x,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Selects elements from `x` or `y`, depending on `condition`.
+// UniqueV2Attr is an optional argument to UniqueV2.
+type UniqueV2Attr func(optionalAttr)
+
+// UniqueV2OutIdx sets the optional out_idx attribute to value.
+// If not specified, defaults to DT_INT32
+func UniqueV2OutIdx(value tf.DataType) UniqueV2Attr {
+	return func(m optionalAttr) {
+		m["out_idx"] = value
+	}
+}
+
+// Finds unique elements in a 1-D tensor.
 //
-// The `x`, and `y` tensors must all have the same shape, and the
-// output will also have that shape.
+// This operation returns a tensor `y` containing all of the unique elements of `x`
+// sorted in the same order that they occur in `x`. This operation also returns a
+// tensor `idx` the same size as `x` that contains the index of each value of `x`
+// in the unique output `y`. In other words:
 //
-// The `condition` tensor must be a scalar if `x` and `y` are scalars.
-// If `x` and `y` are vectors or higher rank, then `condition` must be either a
-// scalar, a vector with size matching the first dimension of `x`, or must have
-// the same shape as `x`.
+// `y[idx[i]] = x[i] for i in [0, 1,...,rank(x) - 1]`
 //
-// The `condition` tensor acts as a mask that chooses, based on the value at each
-// element, whether the corresponding element / row in the output should be
-// taken from `x` (if true) or `y` (if false).
+// For example:
 //
-// If `condition` is a vector and `x` and `y` are higher rank matrices, then
-// it chooses which row (outer dimension) to copy from `x` and `y`.
-// If `condition` has the same shape as `x` and `y`, then it chooses which
-// element to copy from `x` and `y`.
+// ```
+// # tensor 'x' is [1, 1, 2, 4, 4, 4, 7, 8, 8]
+// y, idx = unique(x)
+// y ==> [1, 2, 4, 7, 8]
+// idx ==> [0, 0, 1, 2, 2, 2, 3, 4, 4]
+// ```
 //
-// For example:
+// Arguments:
+//	x: A `Tensor`.
+//	axis: A `Tensor` of type `int64` (default: 0). The axis of the Tensor to
+// find the unique elements.
 //
-// ```python
-// # 'condition' tensor is [[True,  False]
-// #                        [False, True]]
-// # 't' is [[1, 2],
-// #         [3, 4]]
-// # 'e' is [[5, 6],
-// #         [7, 8]]
-// select(condition, t, e)  # => [[1, 6], [7, 4]]
+// Returns A `Tensor`. Unique elements along the `axis` of `Tensor` x.A 1-D Tensor. Has the same type as x that contains the index of each
+// value of x in the output y.
+func UniqueV2(scope *Scope, x tf.Output, axis tf.Output, optional ...UniqueV2Attr) (y tf.Output, idx tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "UniqueV2",
+		Input: []tf.Input{
+			x, axis,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
+}
+
+// UniqueAttr is an optional argument to Unique.
+type UniqueAttr func(optionalAttr)
+
+// UniqueOutIdx sets the optional out_idx attribute to value.
+// If not specified, defaults to DT_INT32
+func UniqueOutIdx(value tf.DataType) UniqueAttr {
+	return func(m optionalAttr) {
+		m["out_idx"] = value
+	}
+}
+
+// Finds unique elements in a 1-D tensor.
 //
+// This operation returns a tensor `y` containing all of the unique elements of `x`
+// sorted in the same order that they occur in `x`. This operation also returns a
+// tensor `idx` the same size as `x` that contains the index of each value of `x`
+// in the unique output `y`. In other words:
 //
-// # 'condition' tensor is [True, False]
-// # 't' is [[1, 2],
-// #         [3, 4]]
-// # 'e' is [[5, 6],
-// #         [7, 8]]
-// select(condition, t, e) ==> [[1, 2],
-//                              [7, 8]]
+// `y[idx[i]] = x[i] for i in [0, 1,...,rank(x) - 1]`
+//
+// For example:
 //
 // ```
+// # tensor 'x' is [1, 1, 2, 4, 4, 4, 7, 8, 8]
+// y, idx = unique(x)
+// y ==> [1, 2, 4, 7, 8]
+// idx ==> [0, 0, 1, 2, 2, 2, 3, 4, 4]
+// ```
 //
 // Arguments:
+//	x: 1-D.
 //
-//	x: = A `Tensor` which may have the same shape as `condition`.
-// If `condition` is rank 1, `x` may have higher rank,
-// but its first dimension must match the size of `condition`.
-//	y: = A `Tensor` with the same type and shape as `x`.
-//
-// Returns = A `Tensor` with the same type and shape as `x` and `y`.
-func Select(scope *Scope, condition tf.Output, x tf.Output, y tf.Output) (output tf.Output) {
+// Returns 1-D.1-D.
+func Unique(scope *Scope, x tf.Output, optional ...UniqueAttr) (y tf.Output, idx tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Select",
+		Type: "Unique",
 		Input: []tf.Input{
-			condition, x, y,
+			x,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// Returns the truth value of x OR y element-wise.
+// Shuffle dimensions of x according to a permutation and conjugate the result.
 //
-// *NOTE*: `LogicalOr` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func LogicalOr(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// The output `y` has the same rank as `x`. The shapes of `x` and `y` satisfy:
+//   `y.shape[i] == x.shape[perm[i]] for i in [0, 1, ..., rank(x) - 1]`
+//   `y[i,j,k,...,s,t,u] == conj(x[perm[i], perm[j], perm[k],...,perm[s], perm[t], perm[u]])`
+func ConjugateTranspose(scope *Scope, x tf.Output, perm tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "LogicalOr",
+		Type: "ConjugateTranspose",
 		Input: []tf.Input{
-			x, y,
+			x, perm,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Compute the regularized incomplete beta integral \\(I_x(a, b)\\).
+// Reshapes a tensor.
 //
-// The regularized incomplete beta integral is defined as:
+// Given `tensor`, this operation returns a tensor that has the same values
+// as `tensor` with shape `shape`.
+//
+// If one component of `shape` is the special value -1, the size of that dimension
+// is computed so that the total size remains constant.  In particular, a `shape`
+// of `[-1]` flattens into 1-D.  At most one component of `shape` can be -1.
 //
+// If `shape` is 1-D or higher, then the operation returns a tensor with shape
+// `shape` filled with the values of `tensor`. In this case, the number of elements
+// implied by `shape` must be the same as the number of elements in `tensor`.
 //
-// \\(I_x(a, b) = \frac{B(x; a, b)}{B(a, b)}\\)
+// For example:
 //
-// where
+// ```
+// # tensor 't' is [1, 2, 3, 4, 5, 6, 7, 8, 9]
+// # tensor 't' has shape [9]
+// reshape(t, [3, 3]) ==> [[1, 2, 3],
+//                         [4, 5, 6],
+//                         [7, 8, 9]]
 //
+// # tensor 't' is [[[1, 1], [2, 2]],
+// #                [[3, 3], [4, 4]]]
+// # tensor 't' has shape [2, 2, 2]
+// reshape(t, [2, 4]) ==> [[1, 1, 2, 2],
+//                         [3, 3, 4, 4]]
 //
-// \\(B(x; a, b) = \int_0^x t^{a-1} (1 - t)^{b-1} dt\\)
+// # tensor 't' is [[[1, 1, 1],
+// #                 [2, 2, 2]],
+// #                [[3, 3, 3],
+// #                 [4, 4, 4]],
+// #                [[5, 5, 5],
+// #                 [6, 6, 6]]]
+// # tensor 't' has shape [3, 2, 3]
+// # pass '[-1]' to flatten 't'
+// reshape(t, [-1]) ==> [1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6]
 //
+// # -1 can also be used to infer the shape
 //
-// is the incomplete beta function and \\(B(a, b)\\) is the *complete*
-// beta function.
-func Betainc(scope *Scope, a tf.Output, b tf.Output, x tf.Output) (z tf.Output) {
+// # -1 is inferred to be 9:
+// reshape(t, [2, -1]) ==> [[1, 1, 1, 2, 2, 2, 3, 3, 3],
+//                          [4, 4, 4, 5, 5, 5, 6, 6, 6]]
+// # -1 is inferred to be 2:
+// reshape(t, [-1, 9]) ==> [[1, 1, 1, 2, 2, 2, 3, 3, 3],
+//                          [4, 4, 4, 5, 5, 5, 6, 6, 6]]
+// # -1 is inferred to be 3:
+// reshape(t, [ 2, -1, 3]) ==> [[[1, 1, 1],
+//                               [2, 2, 2],
+//                               [3, 3, 3]],
+//                              [[4, 4, 4],
+//                               [5, 5, 5],
+//                               [6, 6, 6]]]
+//
+// # tensor 't' is [7]
+// # shape `[]` reshapes to a scalar
+// reshape(t, []) ==> 7
+// ```
+//
+// Arguments:
+//
+//	shape: Defines the shape of the output tensor.
+func Reshape(scope *Scope, tensor tf.Output, shape tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Betainc",
+		Type: "Reshape",
 		Input: []tf.Input{
-			a, b, x,
+			tensor, shape,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the sum along sparse segments of a tensor divided by the sqrt of N.
-//
-// N is the size of the segment being reduced.
-//
-// Like `SparseSegmentSqrtN`, but allows missing ids in `segment_ids`. If an id is
-// misisng, the `output` tensor at that position will be zeroed.
+// Checks a tensor for NaN and Inf values.
 //
-// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
-// segments.
+// When run, reports an `InvalidArgument` error if `tensor` has any values
+// that are not a number (NaN) or infinity (Inf). Otherwise, passes `tensor` as-is.
 //
 // Arguments:
 //
-//	indices: A 1-D tensor. Has same rank as `segment_ids`.
-//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
-//	num_segments: Should equal the number of distinct segment IDs.
-//
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SparseSegmentSqrtNWithNumSegments(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
+//	message: Prefix of the error message.
+func CheckNumerics(scope *Scope, tensor tf.Output, message string) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"message": message}
 	opspec := tf.OpSpec{
-		Type: "SparseSegmentSqrtNWithNumSegments",
+		Type: "CheckNumerics",
 		Input: []tf.Input{
-			data, indices, segment_ids, num_segments,
+			tensor,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Compute the upper regularized incomplete Gamma function `Q(a, x)`.
-//
-// The upper regularized incomplete Gamma function is defined as:
-//
-// \\(Q(a, x) = Gamma(a, x) / Gamma(a) = 1 - P(a, x)\\)
+// Returns the complex conjugate of a complex number.
 //
-// where
+// Given a tensor `input` of complex numbers, this operation returns a tensor of
+// complex numbers that are the complex conjugate of each element in `input`. The
+// complex numbers in `input` must be of the form \\(a + bj\\), where *a* is the
+// real part and *b* is the imaginary part.
 //
-// \\(Gamma(a, x) = int_{x}^{\infty} t^{a-1} exp(-t) dt\\)
+// The complex conjugate returned by this operation is of the form \\(a - bj\\).
 //
-// is the upper incomplete Gama function.
+// For example:
 //
-// Note, above `P(a, x)` (`Igamma`) is the lower regularized complete
-// Gamma function.
-func Igammac(scope *Scope, a tf.Output, x tf.Output) (z tf.Output) {
+// ```
+// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
+// tf.conj(input) ==> [-2.25 - 4.75j, 3.25 - 5.75j]
+// ```
+func Conj(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Igammac",
+		Type: "Conj",
 		Input: []tf.Input{
-			a, x,
+			input,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// LogUniformCandidateSamplerAttr is an optional argument to LogUniformCandidateSampler.
-type LogUniformCandidateSamplerAttr func(optionalAttr)
+// ResourceSparseApplyMomentumAttr is an optional argument to ResourceSparseApplyMomentum.
+type ResourceSparseApplyMomentumAttr func(optionalAttr)
 
-// LogUniformCandidateSamplerSeed sets the optional seed attribute to value.
+// ResourceSparseApplyMomentumUseLocking sets the optional use_locking attribute to value.
 //
-// value: If either seed or seed2 are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func LogUniformCandidateSamplerSeed(value int64) LogUniformCandidateSamplerAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
+// value: If `True`, updating of the var and accum tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceSparseApplyMomentumUseLocking(value bool) ResourceSparseApplyMomentumAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
 }
 
-// LogUniformCandidateSamplerSeed2 sets the optional seed2 attribute to value.
+// ResourceSparseApplyMomentumUseNesterov sets the optional use_nesterov attribute to value.
 //
-// value: An second seed to avoid seed collision.
-// If not specified, defaults to 0
-func LogUniformCandidateSamplerSeed2(value int64) LogUniformCandidateSamplerAttr {
+// value: If `True`, the tensor passed to compute grad will be
+// var - lr * momentum * accum, so in the end, the var you get is actually
+// var - lr * momentum * accum.
+// If not specified, defaults to false
+func ResourceSparseApplyMomentumUseNesterov(value bool) ResourceSparseApplyMomentumAttr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["use_nesterov"] = value
 	}
 }
 
-// Generates labels for candidate sampling with a log-uniform distribution.
+// Update relevant entries in '*var' and '*accum' according to the momentum scheme.
 //
-// See explanations of candidate sampling and the data formats at
-// go/candidate-sampling.
+// Set use_nesterov = True if you want to use Nesterov momentum.
 //
-// For each batch, this op picks a single set of sampled candidate labels.
+// That is for rows we have grad for, we update var and accum as follows:
 //
-// The advantages of sampling candidates per-batch are simplicity and the
-// possibility of efficient dense matrix multiplication. The disadvantage is that
-// the sampled candidates must be chosen independently of the context and of the
-// true labels.
+// accum = accum * momentum + grad
+// var -= lr * accum
 //
 // Arguments:
-//	true_classes: A batch_size * num_true matrix, in which each row contains the
-// IDs of the num_true target_classes in the corresponding original label.
-//	num_true: Number of true labels per context.
-//	num_sampled: Number of candidates to randomly sample.
-//	unique: If unique is true, we sample with rejection, so that all sampled
-// candidates in a batch are unique. This requires some approximation to
-// estimate the post-rejection sampling probabilities.
-//	range_max: The sampler will sample integers from the interval [0, range_max).
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	lr: Learning rate. Must be a scalar.
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var and accum.
+//	momentum: Momentum. Must be a scalar.
 //
-// Returns A vector of length num_sampled, in which each element is
-// the ID of a sampled candidate.A batch_size * num_true matrix, representing
-// the number of times each candidate is expected to occur in a batch
-// of sampled candidates. If unique=true, then this is a probability.A vector of length num_sampled, for each sampled
-// candidate representing the number of times the candidate is expected
-// to occur in a batch of sampled candidates.  If unique=true, then this is a
-// probability.
-func LogUniformCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, range_max int64, optional ...LogUniformCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) {
+// Returns the created operation.
+func ResourceSparseApplyMomentum(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, indices tf.Output, momentum tf.Output, optional ...ResourceSparseApplyMomentumAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique, "range_max": range_max}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "LogUniformCandidateSampler",
+		Type: "ResourceSparseApplyMomentum",
 		Input: []tf.Input{
-			true_classes,
+			var_, accum, lr, grad, indices, momentum,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return scope.AddOperation(opspec)
 }
 
-// Returns (x - y)(x - y) element-wise.
+// Creates a sequence of numbers.
 //
-// *NOTE*: `SquaredDifference` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func SquaredDifference(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// This operation creates a sequence of numbers that begins at `start` and
+// extends by increments of `delta` up to but not including `limit`.
+//
+// For example:
+//
+// ```
+// # 'start' is 3
+// # 'limit' is 18
+// # 'delta' is 3
+// tf.range(start, limit, delta) ==> [3, 6, 9, 12, 15]
+// ```
+//
+// Arguments:
+//	start: 0-D (scalar). First entry in the sequence.
+//	limit: 0-D (scalar). Upper limit of sequence, exclusive.
+//	delta: 0-D (scalar). Optional. Default is 1. Number that increments `start`.
+//
+// Returns 1-D.
+func Range(scope *Scope, start tf.Output, limit tf.Output, delta tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SquaredDifference",
+		Type: "Range",
 		Input: []tf.Input{
-			x, y,
+			start, limit, delta,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Forwards the input to the output.
+// Computes gradients for SparseSegmentSqrtN.
 //
-// This operator represents the loop termination condition used by the
-// "pivot" switches of a loop.
+// Returns tensor "output" with same shape as grad, except for dimension 0 whose
+// value is output_dim0.
 //
 // Arguments:
-//	input: A boolean scalar, representing the branch predicate of the Switch op.
-//
-// Returns The same tensor as `input`.
-func LoopCond(scope *Scope, input tf.Output) (output tf.Output) {
+//	grad: gradient propagated to the SparseSegmentSqrtN op.
+//	indices: indices passed to the corresponding SparseSegmentSqrtN op.
+//	segment_ids: segment_ids passed to the corresponding SparseSegmentSqrtN op.
+//	output_dim0: dimension 0 of "data" passed to SparseSegmentSqrtN op.
+func SparseSegmentSqrtNGrad(scope *Scope, grad tf.Output, indices tf.Output, segment_ids tf.Output, output_dim0 tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "LoopCond",
+		Type: "SparseSegmentSqrtNGrad",
 		Input: []tf.Input{
-			input,
+			grad, indices, segment_ids, output_dim0,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ApproximateEqualAttr is an optional argument to ApproximateEqual.
-type ApproximateEqualAttr func(optionalAttr)
-
-// ApproximateEqualTolerance sets the optional tolerance attribute to value.
-// If not specified, defaults to 1e-05
-func ApproximateEqualTolerance(value float32) ApproximateEqualAttr {
-	return func(m optionalAttr) {
-		m["tolerance"] = value
-	}
-}
-
-// Returns the truth value of abs(x-y) < tolerance element-wise.
-func ApproximateEqual(scope *Scope, x tf.Output, y tf.Output, optional ...ApproximateEqualAttr) (z tf.Output) {
+// Computes the mean along sparse segments of a tensor.
+//
+// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+// segments.
+//
+// Like `SegmentMean`, but `segment_ids` can have rank less than `data`'s first
+// dimension, selecting a subset of dimension 0, specified by `indices`.
+//
+// Arguments:
+//
+//	indices: A 1-D tensor. Has same rank as `segment_ids`.
+//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SparseSegmentMean(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "ApproximateEqual",
+		Type: "SparseSegmentMean",
 		Input: []tf.Input{
-			x, y,
+			data, indices, segment_ids,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns x / y element-wise.
+// Pop the element at the top of the stack.
 //
-// *NOTE*: `Div` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Div(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// Arguments:
+//	handle: The handle to a stack.
+//	elem_type: The type of the elem that is popped.
+//
+// Returns The tensor that is popped from the top of the stack.
+func StackPopV2(scope *Scope, handle tf.Output, elem_type tf.DataType) (elem tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"elem_type": elem_type}
 	opspec := tf.OpSpec{
-		Type: "Div",
+		Type: "StackPopV2",
 		Input: []tf.Input{
-			x, y,
+			handle,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns x * y element-wise.
+// Computes the sum along sparse segments of a tensor.
 //
-// *NOTE*: `Multiply` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Mul(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// Like `SparseSegmentSum`, but allows missing ids in `segment_ids`. If an id is
+// misisng, the `output` tensor at that position will be zeroed.
+//
+// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+// segments.
+//
+// For example:
+//
+// ```python
+// c = tf.constant([[1,2,3,4], [-1,-2,-3,-4], [5,6,7,8]])
+//
+// tf.sparse_segment_sum_with_num_segments(
+//     c, tf.constant([0, 1]), tf.constant([0, 0]), num_segments=3)
+// # => [[0 0 0 0]
+// #     [0 0 0 0]
+// #     [0 0 0 0]]
+//
+// tf.sparse_segment_sum_with_num_segments(c,
+//                                         tf.constant([0, 1]),
+//                                         tf.constant([0, 2],
+//                                         num_segments=4))
+// # => [[ 1  2  3  4]
+// #     [ 0  0  0  0]
+// #     [-1 -2 -3 -4]
+// #     [ 0  0  0  0]]
+// ```
+//
+// Arguments:
+//
+//	indices: A 1-D tensor. Has same rank as `segment_ids`.
+//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
+//	num_segments: Should equal the number of distinct segment IDs.
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `num_segments`.
+func SparseSegmentSumWithNumSegments(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Mul",
+		Type: "SparseSegmentSumWithNumSegments",
 		Input: []tf.Input{
-			x, y,
+			data, indices, segment_ids, num_segments,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// BiasAddAttr is an optional argument to BiasAdd.
-type BiasAddAttr func(optionalAttr)
+// PreventGradientAttr is an optional argument to PreventGradient.
+type PreventGradientAttr func(optionalAttr)
 
-// BiasAddDataFormat sets the optional data_format attribute to value.
+// PreventGradientMessage sets the optional message attribute to value.
 //
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the bias tensor will be added to the last dimension
-// of the value tensor.
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// The tensor will be added to "in_channels", the third-to-the-last
-//     dimension.
-// If not specified, defaults to "NHWC"
-func BiasAddDataFormat(value string) BiasAddAttr {
+// value: Will be printed in the error when anyone tries to differentiate
+// this operation.
+// If not specified, defaults to ""
+func PreventGradientMessage(value string) PreventGradientAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["message"] = value
 	}
 }
 
-// Adds `bias` to `value`.
+// An identity op that triggers an error if a gradient is requested.
 //
-// This is a special case of `tf.add` where `bias` is restricted to be 1-D.
-// Broadcasting is supported, so `value` may have any number of dimensions.
+// When executed in a graph, this op outputs its input tensor as-is.
+//
+// When building ops to compute gradients, the TensorFlow gradient system
+// will return an error when trying to lookup the gradient of this op,
+// because no gradient must ever be registered for this function.  This
+// op exists to prevent subtle bugs from silently returning unimplemented
+// gradients in some corner cases.
 //
 // Arguments:
-//	value: Any number of dimensions.
-//	bias: 1-D with size the last dimension of `value`.
+//	input: any tensor.
 //
-// Returns Broadcasted sum of `value` and `bias`.
-func BiasAdd(scope *Scope, value tf.Output, bias tf.Output, optional ...BiasAddAttr) (output tf.Output) {
+// Returns the same input tensor.
+func PreventGradient(scope *Scope, input tf.Output, optional ...PreventGradientAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -2255,9 +2415,9 @@ func BiasAdd(scope *Scope, value tf.Output, bias tf.Output, optional ...BiasAddA
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "BiasAdd",
+		Type: "PreventGradient",
 		Input: []tf.Input{
-			value, bias,
+			input,
 		},
 		Attrs: attrs,
 	}
@@ -2265,41 +2425,68 @@ func BiasAdd(scope *Scope, value tf.Output, bias tf.Output, optional ...BiasAddA
 	return op.Output(0)
 }
 
-// SparseReduceSumSparseAttr is an optional argument to SparseReduceSumSparse.
-type SparseReduceSumSparseAttr func(optionalAttr)
+// Computes asin of x element-wise.
+func Asin(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Asin",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
 
-// SparseReduceSumSparseKeepDims sets the optional keep_dims attribute to value.
+// SparseToDenseAttr is an optional argument to SparseToDense.
+type SparseToDenseAttr func(optionalAttr)
+
+// SparseToDenseValidateIndices sets the optional validate_indices attribute to value.
 //
-// value: If true, retain reduced dimensions with length 1.
-// If not specified, defaults to false
-func SparseReduceSumSparseKeepDims(value bool) SparseReduceSumSparseAttr {
+// value: If true, indices are checked to make sure they are sorted in
+// lexicographic order and that there are no repeats.
+// If not specified, defaults to true
+func SparseToDenseValidateIndices(value bool) SparseToDenseAttr {
 	return func(m optionalAttr) {
-		m["keep_dims"] = value
+		m["validate_indices"] = value
 	}
 }
 
-// Computes the sum of elements across dimensions of a SparseTensor.
+// Converts a sparse representation into a dense tensor.
 //
-// This Op takes a SparseTensor and is the sparse counterpart to
-// `tf.reduce_sum()`.  In contrast to SparseReduceSum, this Op returns a
-// SparseTensor.
+// Builds an array `dense` with shape `output_shape` such that
 //
-// Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
-// with length 1.
+// ```
+// # If sparse_indices is scalar
+// dense[i] = (i == sparse_indices ? sparse_values : default_value)
 //
-// If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
-// with a single element is returned.  Additionally, the axes can be negative,
-// which are interpreted according to the indexing rules in Python.
+// # If sparse_indices is a vector, then for each i
+// dense[sparse_indices[i]] = sparse_values[i]
+//
+// # If sparse_indices is an n by d matrix, then for each i in [0, n)
+// dense[sparse_indices[i][0], ..., sparse_indices[i][d-1]] = sparse_values[i]
+// ```
+//
+// All other values in `dense` are set to `default_value`.  If `sparse_values` is a
+// scalar, all sparse indices are set to this single value.
+//
+// Indices should be sorted in lexicographic order, and indices must not
+// contain any repeats. If `validate_indices` is true, these properties
+// are checked during execution.
 //
 // Arguments:
-//	input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, possibly not in canonical ordering.
-//	input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
-//	input_shape: 1-D.  Shape of the input SparseTensor.
-//	reduction_axes: 1-D.  Length-`K` vector containing the reduction axes.
-func SparseReduceSumSparse(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output, reduction_axes tf.Output, optional ...SparseReduceSumSparseAttr) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
+//	sparse_indices: 0-D, 1-D, or 2-D.  `sparse_indices[i]` contains the complete
+// index where `sparse_values[i]` will be placed.
+//	output_shape: 1-D.  Shape of the dense output tensor.
+//	sparse_values: 1-D.  Values corresponding to each row of `sparse_indices`,
+// or a scalar value to be used for all sparse indices.
+//	default_value: Scalar value to set for indices not specified in
+// `sparse_indices`.
+//
+// Returns Dense output tensor of shape `output_shape`.
+func SparseToDense(scope *Scope, sparse_indices tf.Output, output_shape tf.Output, sparse_values tf.Output, default_value tf.Output, optional ...SparseToDenseAttr) (dense tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -2308,149 +2495,108 @@ func SparseReduceSumSparse(scope *Scope, input_indices tf.Output, input_values t
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseReduceSumSparse",
+		Type: "SparseToDense",
 		Input: []tf.Input{
-			input_indices, input_values, input_shape, reduction_axes,
+			sparse_indices, output_shape, sparse_values, default_value,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// Returns x + y element-wise.
-//
-// *NOTE*: `Add` supports broadcasting. `AddN` does not. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func AddV2(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "AddV2",
-		Input: []tf.Input{
-			x, y,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Returns x + y element-wise.
-//
-// *NOTE*: `Add` supports broadcasting. `AddN` does not. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Add(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Add",
-		Input: []tf.Input{
-			x, y,
-		},
-	}
-	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// NthElementAttr is an optional argument to NthElement.
-type NthElementAttr func(optionalAttr)
-
-// NthElementReverse sets the optional reverse attribute to value.
-//
-// value: When set to True, find the nth-largest value in the vector and vice
-// versa.
-// If not specified, defaults to false
-func NthElementReverse(value bool) NthElementAttr {
-	return func(m optionalAttr) {
-		m["reverse"] = value
-	}
-}
-
-// Finds values of the `n`-th order statistic for the last dimension.
-//
-// If the input is a vector (rank-1), finds the entries which is the nth-smallest
-// value in the vector and outputs their values as scalar tensor.
+// Counts the number of occurrences of each value in an integer array.
 //
-// For matrices (resp. higher rank input), computes the entries which is the
-// nth-smallest value in each row (resp. vector along the last dimension). Thus,
+// Outputs a vector with length `size` and the same dtype as `weights`. If
+// `weights` are empty, then index `i` stores the number of times the value `i` is
+// counted in `arr`. If `weights` are non-empty, then index `i` stores the sum of
+// the value in `weights` at each index where the corresponding value in `arr` is
+// `i`.
 //
-//     values.shape = input.shape[:-1]
+// Values in `arr` outside of the range [0, size) are ignored.
 //
 // Arguments:
-//	input: 1-D or higher with last dimension at least `n+1`.
-//	n: 0-D. Position of sorted vector to select along the last dimension (along
-// each row for matrices). Valid range of n is `[0, input.shape[:-1])`
+//	arr: int32 `Tensor`.
+//	size: non-negative int32 scalar `Tensor`.
+//	weights: is an int32, int64, float32, or float64 `Tensor` with the same
+// shape as `arr`, or a length-0 `Tensor`, in which case it acts as all weights
+// equal to 1.
 //
-// Returns The `n`-th order statistic along each last dimensional slice.
-func NthElement(scope *Scope, input tf.Output, n tf.Output, optional ...NthElementAttr) (values tf.Output) {
+// Returns 1D `Tensor` with length equal to `size`. The counts or summed weights for
+// each value in the range [0, size).
+func Bincount(scope *Scope, arr tf.Output, size tf.Output, weights tf.Output) (bins tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "NthElement",
+		Type: "Bincount",
 		Input: []tf.Input{
-			input, n,
+			arr, size, weights,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the Max along segments of a tensor.
+// Computes the sum along sparse segments of a tensor.
 //
 // Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
 // segments.
 //
-// This operator is similar to the [unsorted segment sum operator](../../../api_docs/python/math_ops.md#UnsortedSegmentSum).
-// Instead of computing the sum over segments, it computes the maximum
-// such that:
+// Like `SegmentSum`, but `segment_ids` can have rank less than `data`'s first
+// dimension, selecting a subset of dimension 0, specified by `indices`.
 //
-// \\(output_i = \max_j data_j\\) where max is over `j` such
-// that `segment_ids[j] == i`.
+// For example:
 //
-// If the maximum is empty for a given segment ID `i`, it outputs the smallest possible value for specific numeric type,
-//  `output[i] = numeric_limits<T>::min()`.
+// ```python
+// c = tf.constant([[1,2,3,4], [-1,-2,-3,-4], [5,6,7,8]])
 //
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/UnsortedSegmentMax.png" alt>
-// </div>
+// # Select two rows, one segment.
+// tf.sparse_segment_sum(c, tf.constant([0, 1]), tf.constant([0, 0]))
+// # => [[0 0 0 0]]
 //
-// Arguments:
+// # Select two rows, two segment.
+// tf.sparse_segment_sum(c, tf.constant([0, 1]), tf.constant([0, 1]))
+// # => [[ 1  2  3  4]
+// #     [-1 -2 -3 -4]]
 //
-//	segment_ids: A 1-D tensor whose rank is equal to the rank of `data`'s
-// first dimension.
+// # Select all rows, two segments.
+// tf.sparse_segment_sum(c, tf.constant([0, 1, 2]), tf.constant([0, 0, 1]))
+// # => [[0 0 0 0]
+// #     [5 6 7 8]]
+//
+// # Which is equivalent to:
+// tf.segment_sum(c, tf.constant([0, 0, 1]))
+// ```
 //
+// Arguments:
+//
+//	indices: A 1-D tensor. Has same rank as `segment_ids`.
+//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
 //
 // Returns Has same shape as data, except for dimension 0 which
-// has size `num_segments`.
-func UnsortedSegmentMax(scope *Scope, data tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
+// has size `k`, the number of segments.
+func SparseSegmentSum(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "UnsortedSegmentMax",
+		Type: "SparseSegmentSum",
 		Input: []tf.Input{
-			data, segment_ids, num_segments,
+			data, indices, segment_ids,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes exponential of x element-wise.  \\(y = e^x\\).
-func Exp(scope *Scope, x tf.Output) (y tf.Output) {
+// Computes hyperbolic sine of x element-wise.
+func Sinh(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Exp",
+		Type: "Sinh",
 		Input: []tf.Input{
 			x,
 		},
@@ -2459,161 +2605,185 @@ func Exp(scope *Scope, x tf.Output) (y tf.Output) {
 	return op.Output(0)
 }
 
-// Returns an element-wise indication of the sign of a number.
-//
-// `y = sign(x) = -1` if `x < 0`; 0 if `x == 0`; 1 if `x > 0`.
-//
-// For complex numbers, `y = sign(x) = x / |x|` if `x != 0`, otherwise `y = 0`.
-func Sign(scope *Scope, x tf.Output) (y tf.Output) {
+// Computes rectified linear 6: `min(max(features, 0), 6)`.
+func Relu6(scope *Scope, features tf.Output) (activations tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Sign",
+		Type: "Relu6",
 		Input: []tf.Input{
-			x,
+			features,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ArgMinAttr is an optional argument to ArgMin.
-type ArgMinAttr func(optionalAttr)
-
-// ArgMinOutputType sets the optional output_type attribute to value.
-// If not specified, defaults to DT_INT64
-func ArgMinOutputType(value tf.DataType) ArgMinAttr {
-	return func(m optionalAttr) {
-		m["output_type"] = value
-	}
-}
-
-// Returns the index with the smallest value across dimensions of a tensor.
+// Computes the sum along segments of a tensor.
 //
-// Note that in case of ties the identity of the return value is not guaranteed.
+// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+// segments.
 //
-// Arguments:
+// Computes a tensor such that
+// `(output[i] = sum_{j...} data[j...]` where the sum is over tuples `j...` such
+// that `segment_ids[j...] == i`.  Unlike `SegmentSum`, `segment_ids`
+// need not be sorted and need not cover all values in the full
+// range of valid values.
 //
-//	dimension: int32 or int64, must be in the range `[-rank(input), rank(input))`.
-// Describes which dimension of the input Tensor to reduce across. For vectors,
-// use dimension = 0.
-func ArgMin(scope *Scope, input tf.Output, dimension tf.Output, optional ...ArgMinAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "ArgMin",
-		Input: []tf.Input{
-			input, dimension,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Convert the quantized 'input' tensor into a lower-precision 'output', using the
+// If the sum is empty for a given segment ID `i`, `output[i] = 0`.
+// If the given segment ID `i` is negative, the value is dropped and will not be
+// added to the sum of the segment.
 //
-// output range specified with 'requested_output_min' and 'requested_output_max'.
+// `num_segments` should equal the number of distinct segment IDs.
 //
-// [input_min, input_max] are scalar floats that specify the range for the float
-// interpretation of the 'input' data. For example, if input_min is -1.0f and
-// input_max is 1.0f, and we are dealing with quint16 quantized data, then a 0
-// value in the 16-bit data should be interpreted as -1.0f, and a 65535 means 1.0f.
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/UnsortedSegmentSum.png" alt>
+// </div>
 //
 // Arguments:
 //
-//	input_min: The float value that the minimum quantized input value represents.
-//	input_max: The float value that the maximum quantized input value represents.
-//	requested_output_min: The float value that the minimum quantized output value represents.
-//	requested_output_max: The float value that the maximum quantized output value represents.
-//	out_type: The type of the output. Should be a lower bit depth than Tinput.
+//	segment_ids: A tensor whose shape is a prefix of `data.shape`.
 //
-// Returns The requested_output_min value is copied into this output.The requested_output_max value is copied into this output.
-func Requantize(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output, requested_output_min tf.Output, requested_output_max tf.Output, out_type tf.DataType) (output tf.Output, output_min tf.Output, output_max tf.Output) {
+//
+// Returns Has same shape as data, except for the first `segment_ids.rank`
+// dimensions, which are replaced with a single dimension which has size
+// `num_segments`.
+func UnsortedSegmentSum(scope *Scope, data tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"out_type": out_type}
 	opspec := tf.OpSpec{
-		Type: "Requantize",
+		Type: "UnsortedSegmentSum",
 		Input: []tf.Input{
-			input, input_min, input_max, requested_output_min, requested_output_max,
+			data, segment_ids, num_segments,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Computes the determinant of one or more square matrices.
+// ResourceStridedSliceAssignAttr is an optional argument to ResourceStridedSliceAssign.
+type ResourceStridedSliceAssignAttr func(optionalAttr)
+
+// ResourceStridedSliceAssignBeginMask sets the optional begin_mask attribute to value.
+// If not specified, defaults to 0
+func ResourceStridedSliceAssignBeginMask(value int64) ResourceStridedSliceAssignAttr {
+	return func(m optionalAttr) {
+		m["begin_mask"] = value
+	}
+}
+
+// ResourceStridedSliceAssignEndMask sets the optional end_mask attribute to value.
+// If not specified, defaults to 0
+func ResourceStridedSliceAssignEndMask(value int64) ResourceStridedSliceAssignAttr {
+	return func(m optionalAttr) {
+		m["end_mask"] = value
+	}
+}
+
+// ResourceStridedSliceAssignEllipsisMask sets the optional ellipsis_mask attribute to value.
+// If not specified, defaults to 0
+func ResourceStridedSliceAssignEllipsisMask(value int64) ResourceStridedSliceAssignAttr {
+	return func(m optionalAttr) {
+		m["ellipsis_mask"] = value
+	}
+}
+
+// ResourceStridedSliceAssignNewAxisMask sets the optional new_axis_mask attribute to value.
+// If not specified, defaults to 0
+func ResourceStridedSliceAssignNewAxisMask(value int64) ResourceStridedSliceAssignAttr {
+	return func(m optionalAttr) {
+		m["new_axis_mask"] = value
+	}
+}
+
+// ResourceStridedSliceAssignShrinkAxisMask sets the optional shrink_axis_mask attribute to value.
+// If not specified, defaults to 0
+func ResourceStridedSliceAssignShrinkAxisMask(value int64) ResourceStridedSliceAssignAttr {
+	return func(m optionalAttr) {
+		m["shrink_axis_mask"] = value
+	}
+}
+
+// Assign `value` to the sliced l-value reference of `ref`.
 //
-// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
-// form square matrices. The output is a tensor containing the determinants
-// for all input submatrices `[..., :, :]`.
+// The values of `value` are assigned to the positions in the variable
+// `ref` that are selected by the slice parameters. The slice parameters
+// `begin, `end`, `strides`, etc. work exactly as in `StridedSlice`.
 //
-// Arguments:
-//	input: Shape is `[..., M, M]`.
+// NOTE this op currently does not support broadcasting and so `value`'s
+// shape must be exactly the shape produced by the slice of `ref`.
 //
-// Returns Shape is `[...]`.
-func MatrixDeterminant(scope *Scope, input tf.Output) (output tf.Output) {
+// Returns the created operation.
+func ResourceStridedSliceAssign(scope *Scope, ref tf.Output, begin tf.Output, end tf.Output, strides tf.Output, value tf.Output, optional ...ResourceStridedSliceAssignAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "MatrixDeterminant",
+		Type: "ResourceStridedSliceAssign",
 		Input: []tf.Input{
-			input,
+			ref, begin, end, strides, value,
 		},
+		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Computes sin of x element-wise.
-func Sin(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Sin",
-		Input: []tf.Input{
-			x,
-		},
+// ArgMaxAttr is an optional argument to ArgMax.
+type ArgMaxAttr func(optionalAttr)
+
+// ArgMaxOutputType sets the optional output_type attribute to value.
+// If not specified, defaults to DT_INT64
+func ArgMaxOutputType(value tf.DataType) ArgMaxAttr {
+	return func(m optionalAttr) {
+		m["output_type"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Computes the complementary error function of `x` element-wise.
-func Erfc(scope *Scope, x tf.Output) (y tf.Output) {
+// Returns the index with the largest value across dimensions of a tensor.
+//
+// Note that in case of ties the identity of the return value is not guaranteed.
+//
+// Arguments:
+//
+//	dimension: int32 or int64, must be in the range `[-rank(input), rank(input))`.
+// Describes which dimension of the input Tensor to reduce across. For vectors,
+// use dimension = 0.
+func ArgMax(scope *Scope, input tf.Output, dimension tf.Output, optional ...ArgMaxAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Erfc",
+		Type: "ArgMax",
 		Input: []tf.Input{
-			x,
+			input, dimension,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes Psi, the derivative of Lgamma (the log of the absolute value of
+// Returns which elements of x are finite.
 //
-// `Gamma(x)`), element-wise.
-func Digamma(scope *Scope, x tf.Output) (y tf.Output) {
+// @compatibility(numpy)
+// Equivalent to np.isfinite
+// @end_compatibility
+func IsFinite(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Digamma",
+		Type: "IsFinite",
 		Input: []tf.Input{
 			x,
 		},
@@ -2622,74 +2792,50 @@ func Digamma(scope *Scope, x tf.Output) (y tf.Output) {
 	return op.Output(0)
 }
 
-// Conv2DBackpropFilterAttr is an optional argument to Conv2DBackpropFilter.
-type Conv2DBackpropFilterAttr func(optionalAttr)
-
-// Conv2DBackpropFilterUseCudnnOnGpu sets the optional use_cudnn_on_gpu attribute to value.
-// If not specified, defaults to true
-func Conv2DBackpropFilterUseCudnnOnGpu(value bool) Conv2DBackpropFilterAttr {
-	return func(m optionalAttr) {
-		m["use_cudnn_on_gpu"] = value
-	}
-}
+// MatMulAttr is an optional argument to MatMul.
+type MatMulAttr func(optionalAttr)
 
-// Conv2DBackpropFilterDataFormat sets the optional data_format attribute to value.
+// MatMulTransposeA sets the optional transpose_a attribute to value.
 //
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// If not specified, defaults to "NHWC"
-func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr {
+// value: If true, "a" is transposed before multiplication.
+// If not specified, defaults to false
+func MatMulTransposeA(value bool) MatMulAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["transpose_a"] = value
 	}
 }
 
-// Conv2DBackpropFilterDilations sets the optional dilations attribute to value.
+// MatMulTransposeB sets the optional transpose_b attribute to value.
 //
-// value: 1-D tensor of length 4.  The dilation factor for each dimension of
-// `input`. If set to k > 1, there will be k-1 skipped cells between each filter
-// element on that dimension. The dimension order is determined by the value of
-// `data_format`, see above for details. Dilations in the batch and depth
-// dimensions must be 1.
-// If not specified, defaults to <i:1 i:1 i:1 i:1 >
-func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr {
+// value: If true, "b" is transposed before multiplication.
+// If not specified, defaults to false
+func MatMulTransposeB(value bool) MatMulAttr {
 	return func(m optionalAttr) {
-		m["dilations"] = value
+		m["transpose_b"] = value
 	}
 }
 
-// Computes the gradients of convolution with respect to the filter.
+// Multiply the matrix "a" by the matrix "b".
 //
-// Arguments:
-//	input: 4-D with shape `[batch, in_height, in_width, in_channels]`.
-//	filter_sizes: An integer vector representing the tensor shape of `filter`,
-// where `filter` is a 4-D
-// `[filter_height, filter_width, in_channels, out_channels]` tensor.
-//	out_backprop: 4-D with shape `[batch, out_height, out_width, out_channels]`.
-// Gradients w.r.t. the output of the convolution.
-//	strides: The stride of the sliding window for each dimension of the input
-// of the convolution. Must be in the same order as the dimension specified with
-// format.
-//	padding: The type of padding algorithm to use.
+// The inputs must be two-dimensional matrices and the inner dimension of
+// "a" (after being transposed if transpose_a is true) must match the
+// outer dimension of "b" (after being transposed if transposed_b is
+// true).
 //
-// Returns 4-D with shape
-// `[filter_height, filter_width, in_channels, out_channels]`.  Gradient w.r.t.
-// the `filter` input of the convolution.
-func Conv2DBackpropFilter(scope *Scope, input tf.Output, filter_sizes tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv2DBackpropFilterAttr) (output tf.Output) {
+// *Note*: The default kernel implementation for MatMul on GPUs uses
+// cublas.
+func MatMul(scope *Scope, a tf.Output, b tf.Output, optional ...MatMulAttr) (product tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Conv2DBackpropFilter",
+		Type: "MatMul",
 		Input: []tf.Input{
-			input, filter_sizes, out_backprop,
+			a, b,
 		},
 		Attrs: attrs,
 	}
@@ -2697,70 +2843,121 @@ func Conv2DBackpropFilter(scope *Scope, input tf.Output, filter_sizes tf.Output,
 	return op.Output(0)
 }
 
-// Returns the number of work units this Reader has finished processing.
+// Selects elements from `x` or `y`, depending on `condition`.
 //
-// Arguments:
-//	reader_handle: Handle to a Reader.
-func ReaderNumWorkUnitsCompletedV2(scope *Scope, reader_handle tf.Output) (units_completed tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "ReaderNumWorkUnitsCompletedV2",
-		Input: []tf.Input{
-			reader_handle,
+// The `x`, and `y` tensors must all have the same shape, and the
+// output will also have that shape.
+//
+// The `condition` tensor must be a scalar if `x` and `y` are scalars.
+// If `x` and `y` are vectors or higher rank, then `condition` must be either a
+// scalar, a vector with size matching the first dimension of `x`, or must have
+// the same shape as `x`.
+//
+// The `condition` tensor acts as a mask that chooses, based on the value at each
+// element, whether the corresponding element / row in the output should be
+// taken from `x` (if true) or `y` (if false).
+//
+// If `condition` is a vector and `x` and `y` are higher rank matrices, then
+// it chooses which row (outer dimension) to copy from `x` and `y`.
+// If `condition` has the same shape as `x` and `y`, then it chooses which
+// element to copy from `x` and `y`.
+//
+// For example:
+//
+// ```python
+// # 'condition' tensor is [[True,  False]
+// #                        [False, True]]
+// # 't' is [[1, 2],
+// #         [3, 4]]
+// # 'e' is [[5, 6],
+// #         [7, 8]]
+// select(condition, t, e)  # => [[1, 6], [7, 4]]
+//
+//
+// # 'condition' tensor is [True, False]
+// # 't' is [[1, 2],
+// #         [3, 4]]
+// # 'e' is [[5, 6],
+// #         [7, 8]]
+// select(condition, t, e) ==> [[1, 2],
+//                              [7, 8]]
+//
+// ```
+//
+// Arguments:
+//
+//	x: = A `Tensor` which may have the same shape as `condition`.
+// If `condition` is rank 1, `x` may have higher rank,
+// but its first dimension must match the size of `condition`.
+//	y: = A `Tensor` with the same type and shape as `x`.
+//
+// Returns = A `Tensor` with the same type and shape as `x` and `y`.
+func Select(scope *Scope, condition tf.Output, x tf.Output, y tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Select",
+		Input: []tf.Input{
+			condition, x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the log of the absolute value of `Gamma(x)` element-wise.
-func Lgamma(scope *Scope, x tf.Output) (y tf.Output) {
+// Returns the truth value of x OR y element-wise.
+//
+// *NOTE*: `LogicalOr` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func LogicalOr(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Lgamma",
+		Type: "LogicalOr",
 		Input: []tf.Input{
-			x,
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the reverse mode backpropagated gradient of the Cholesky algorithm.
+// Compute the regularized incomplete beta integral \\(I_x(a, b)\\).
 //
-// For an explanation see "Differentiation of the Cholesky algorithm" by
-// Iain Murray http://arxiv.org/abs/1602.07527.
+// The regularized incomplete beta integral is defined as:
 //
-// Arguments:
-//	l: Output of batch Cholesky algorithm l = cholesky(A). Shape is `[..., M, M]`.
-// Algorithm depends only on lower triangular part of the innermost matrices of
-// this tensor.
-//	grad: df/dl where f is some scalar function. Shape is `[..., M, M]`.
-// Algorithm depends only on lower triangular part of the innermost matrices of
-// this tensor.
 //
-// Returns Symmetrized version of df/dA . Shape is `[..., M, M]`
-func CholeskyGrad(scope *Scope, l tf.Output, grad tf.Output) (output tf.Output) {
+// \\(I_x(a, b) = \frac{B(x; a, b)}{B(a, b)}\\)
+//
+// where
+//
+//
+// \\(B(x; a, b) = \int_0^x t^{a-1} (1 - t)^{b-1} dt\\)
+//
+//
+// is the incomplete beta function and \\(B(a, b)\\) is the *complete*
+// beta function.
+func Betainc(scope *Scope, a tf.Output, b tf.Output, x tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "CholeskyGrad",
+		Type: "Betainc",
 		Input: []tf.Input{
-			l, grad,
+			a, b, x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the mean along sparse segments of a tensor.
+// Computes the sum along sparse segments of a tensor divided by the sqrt of N.
 //
-// Like `SparseSegmentMean`, but allows missing ids in `segment_ids`. If an id is
+// N is the size of the segment being reduced.
+//
+// Like `SparseSegmentSqrtN`, but allows missing ids in `segment_ids`. If an id is
 // misisng, the `output` tensor at that position will be zeroed.
 //
 // Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
@@ -2772,14 +2969,14 @@ func CholeskyGrad(scope *Scope, l tf.Output, grad tf.Output) (output tf.Output)
 //	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
 //	num_segments: Should equal the number of distinct segment IDs.
 //
-// Returns Has same shape as data, except for dimension 0 which has size
-// `num_segments`.
-func SparseSegmentMeanWithNumSegments(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SparseSegmentSqrtNWithNumSegments(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseSegmentMeanWithNumSegments",
+		Type: "SparseSegmentSqrtNWithNumSegments",
 		Input: []tf.Input{
 			data, indices, segment_ids, num_segments,
 		},
@@ -2788,31 +2985,58 @@ func SparseSegmentMeanWithNumSegments(scope *Scope, data tf.Output, indices tf.O
 	return op.Output(0)
 }
 
-// Computes hyperbolic cosine of x element-wise.
-func Cosh(scope *Scope, x tf.Output) (y tf.Output) {
+// Compute the upper regularized incomplete Gamma function `Q(a, x)`.
+//
+// The upper regularized incomplete Gamma function is defined as:
+//
+// \\(Q(a, x) = Gamma(a, x) / Gamma(a) = 1 - P(a, x)\\)
+//
+// where
+//
+// \\(Gamma(a, x) = int_{x}^{\infty} t^{a-1} exp(-t) dt\\)
+//
+// is the upper incomplete Gama function.
+//
+// Note, above `P(a, x)` (`Igamma`) is the lower regularized complete
+// Gamma function.
+func Igammac(scope *Scope, a tf.Output, x tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Cosh",
+		Type: "Igammac",
 		Input: []tf.Input{
-			x,
+			a, x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Creates a dataset that emits each dim-0 slice of `components` once.
-func TensorSliceDataset(scope *Scope, components []tf.Output, output_shapes []tf.Shape) (handle tf.Output) {
+// ApproximateEqualAttr is an optional argument to ApproximateEqual.
+type ApproximateEqualAttr func(optionalAttr)
+
+// ApproximateEqualTolerance sets the optional tolerance attribute to value.
+// If not specified, defaults to 1e-05
+func ApproximateEqualTolerance(value float32) ApproximateEqualAttr {
+	return func(m optionalAttr) {
+		m["tolerance"] = value
+	}
+}
+
+// Returns the truth value of abs(x-y) < tolerance element-wise.
+func ApproximateEqual(scope *Scope, x tf.Output, y tf.Output, optional ...ApproximateEqualAttr) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_shapes": output_shapes}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "TensorSliceDataset",
+		Type: "ApproximateEqual",
 		Input: []tf.Input{
-			tf.OutputList(components),
+			x, y,
 		},
 		Attrs: attrs,
 	}
@@ -2820,73 +3044,72 @@ func TensorSliceDataset(scope *Scope, components []tf.Output, output_shapes []tf
 	return op.Output(0)
 }
 
-// Computes natural logarithm of (1 + x) element-wise.
+// Returns x / y element-wise.
 //
-// I.e., \\(y = \log_e (1 + x)\\).
-func Log1p(scope *Scope, x tf.Output) (y tf.Output) {
+// *NOTE*: `Div` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Div(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Log1p",
+		Type: "Div",
 		Input: []tf.Input{
-			x,
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes rectified linear 6 gradients for a Relu6 operation.
-//
-// Arguments:
-//	gradients: The backpropagated gradients to the corresponding Relu6 operation.
-//	features: The features passed as input to the corresponding Relu6 operation, or
-// its output; using either one produces the same result.
+// Returns x * y element-wise.
 //
-// Returns The gradients:
-// `gradients * (features > 0) * (features < 6)`.
-func Relu6Grad(scope *Scope, gradients tf.Output, features tf.Output) (backprops tf.Output) {
+// *NOTE*: `Multiply` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Mul(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Relu6Grad",
+		Type: "Mul",
 		Input: []tf.Input{
-			gradients, features,
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResizeBicubicAttr is an optional argument to ResizeBicubic.
-type ResizeBicubicAttr func(optionalAttr)
+// BiasAddAttr is an optional argument to BiasAdd.
+type BiasAddAttr func(optionalAttr)
 
-// ResizeBicubicAlignCorners sets the optional align_corners attribute to value.
+// BiasAddDataFormat sets the optional data_format attribute to value.
 //
-// value: If true, rescale input by (new_height - 1) / (height - 1), which
-// exactly aligns the 4 corners of images and resized images. If false, rescale
-// by new_height / height. Treat similarly the width dimension.
-// If not specified, defaults to false
-func ResizeBicubicAlignCorners(value bool) ResizeBicubicAttr {
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the bias tensor will be added to the last dimension
+// of the value tensor.
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// The tensor will be added to "in_channels", the third-to-the-last
+//     dimension.
+// If not specified, defaults to "NHWC"
+func BiasAddDataFormat(value string) BiasAddAttr {
 	return func(m optionalAttr) {
-		m["align_corners"] = value
+		m["data_format"] = value
 	}
 }
 
-// Resize `images` to `size` using bicubic interpolation.
+// Adds `bias` to `value`.
 //
-// Input images can be of different types but output images are always float.
+// This is a special case of `tf.add` where `bias` is restricted to be 1-D.
+// Broadcasting is supported, so `value` may have any number of dimensions.
 //
 // Arguments:
-//	images: 4-D with shape `[batch, height, width, channels]`.
-//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
-// new size for the images.
+//	value: Any number of dimensions.
+//	bias: 1-D with size the last dimension of `value`.
 //
-// Returns 4-D with shape
-// `[batch, new_height, new_width, channels]`.
-func ResizeBicubic(scope *Scope, images tf.Output, size tf.Output, optional ...ResizeBicubicAttr) (resized_images tf.Output) {
+// Returns Broadcasted sum of `value` and `bias`.
+func BiasAdd(scope *Scope, value tf.Output, bias tf.Output, optional ...BiasAddAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -2895,9 +3118,9 @@ func ResizeBicubic(scope *Scope, images tf.Output, size tf.Output, optional ...R
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResizeBicubic",
+		Type: "BiasAdd",
 		Input: []tf.Input{
-			images, size,
+			value, bias,
 		},
 		Attrs: attrs,
 	}
@@ -2905,180 +3128,126 @@ func ResizeBicubic(scope *Scope, images tf.Output, size tf.Output, optional ...R
 	return op.Output(0)
 }
 
-// Computes natural logarithm of x element-wise.
+// SparseReduceSumSparseAttr is an optional argument to SparseReduceSumSparse.
+type SparseReduceSumSparseAttr func(optionalAttr)
+
+// SparseReduceSumSparseKeepDims sets the optional keep_dims attribute to value.
 //
-// I.e., \\(y = \log_e x\\).
-func Log(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Log",
-		Input: []tf.Input{
-			x,
-		},
+// value: If true, retain reduced dimensions with length 1.
+// If not specified, defaults to false
+func SparseReduceSumSparseKeepDims(value bool) SparseReduceSumSparseAttr {
+	return func(m optionalAttr) {
+		m["keep_dims"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Rounds the values of a tensor to the nearest integer, element-wise.
+// Computes the sum of elements across dimensions of a SparseTensor.
 //
-// Rounds half to even.  Also known as bankers rounding. If you want to round
-// according to the current system rounding mode use std::cint.
-func Round(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Round",
-		Input: []tf.Input{
-			x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// RecordInputAttr is an optional argument to RecordInput.
-type RecordInputAttr func(optionalAttr)
-
-// RecordInputFileRandomSeed sets the optional file_random_seed attribute to value.
-//
-// value: Random seeds used to produce randomized records.
-// If not specified, defaults to 301
-func RecordInputFileRandomSeed(value int64) RecordInputAttr {
-	return func(m optionalAttr) {
-		m["file_random_seed"] = value
-	}
-}
-
-// RecordInputFileShuffleShiftRatio sets the optional file_shuffle_shift_ratio attribute to value.
+// This Op takes a SparseTensor and is the sparse counterpart to
+// `tf.reduce_sum()`.  In contrast to SparseReduceSum, this Op returns a
+// SparseTensor.
 //
-// value: Shifts the list of files after the list is randomly
-// shuffled.
-// If not specified, defaults to 0
-func RecordInputFileShuffleShiftRatio(value float32) RecordInputAttr {
-	return func(m optionalAttr) {
-		m["file_shuffle_shift_ratio"] = value
-	}
-}
-
-// RecordInputFileBufferSize sets the optional file_buffer_size attribute to value.
+// Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
+// with length 1.
 //
-// value: The randomization shuffling buffer.
-// If not specified, defaults to 10000
-func RecordInputFileBufferSize(value int64) RecordInputAttr {
-	return func(m optionalAttr) {
-		m["file_buffer_size"] = value
-	}
-}
-
-// RecordInputFileParallelism sets the optional file_parallelism attribute to value.
+// If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
+// with a single element is returned.  Additionally, the axes can be negative,
+// which are interpreted according to the indexing rules in Python.
 //
-// value: How many sstables are opened and concurrently iterated over.
-// If not specified, defaults to 16
-func RecordInputFileParallelism(value int64) RecordInputAttr {
-	return func(m optionalAttr) {
-		m["file_parallelism"] = value
+// Arguments:
+//	input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, possibly not in canonical ordering.
+//	input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
+//	input_shape: 1-D.  Shape of the input SparseTensor.
+//	reduction_axes: 1-D.  Length-`K` vector containing the reduction axes.
+func SparseReduceSumSparse(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output, reduction_axes tf.Output, optional ...SparseReduceSumSparseAttr) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// RecordInputBatchSize sets the optional batch_size attribute to value.
-//
-// value: The batch size.
-// If not specified, defaults to 32
-func RecordInputBatchSize(value int64) RecordInputAttr {
-	return func(m optionalAttr) {
-		m["batch_size"] = value
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
 	}
-}
-
-// RecordInputCompressionType sets the optional compression_type attribute to value.
-//
-// value: The type of compression for the file. Currently ZLIB and
-// GZIP are supported. Defaults to none.
-// If not specified, defaults to ""
-func RecordInputCompressionType(value string) RecordInputAttr {
-	return func(m optionalAttr) {
-		m["compression_type"] = value
+	opspec := tf.OpSpec{
+		Type: "SparseReduceSumSparse",
+		Input: []tf.Input{
+			input_indices, input_values, input_shape, reduction_axes,
+		},
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Emits randomized records.
-//
-// Arguments:
-//	file_pattern: Glob pattern for the data files.
+// Returns x + y element-wise.
 //
-// Returns A tensor of shape [batch_size].
-func RecordInput(scope *Scope, file_pattern string, optional ...RecordInputAttr) (records tf.Output) {
+// *NOTE*: `Add` supports broadcasting. `AddN` does not. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func AddV2(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"file_pattern": file_pattern}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "RecordInput",
-
-		Attrs: attrs,
+		Type: "AddV2",
+		Input: []tf.Input{
+			x, y,
+		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes reciprocal of square root of x element-wise.
+// Returns x + y element-wise.
 //
-// I.e., \\(y = 1 / \sqrt{x}\\).
-func Rsqrt(scope *Scope, x tf.Output) (y tf.Output) {
+// *NOTE*: `Add` supports broadcasting. `AddN` does not. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Add(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Rsqrt",
+		Type: "Add",
 		Input: []tf.Input{
-			x,
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// MatrixInverseAttr is an optional argument to MatrixInverse.
-type MatrixInverseAttr func(optionalAttr)
+// NthElementAttr is an optional argument to NthElement.
+type NthElementAttr func(optionalAttr)
 
-// MatrixInverseAdjoint sets the optional adjoint attribute to value.
+// NthElementReverse sets the optional reverse attribute to value.
+//
+// value: When set to True, find the nth-largest value in the vector and vice
+// versa.
 // If not specified, defaults to false
-func MatrixInverseAdjoint(value bool) MatrixInverseAttr {
+func NthElementReverse(value bool) NthElementAttr {
 	return func(m optionalAttr) {
-		m["adjoint"] = value
+		m["reverse"] = value
 	}
 }
 
-// Computes the inverse of one or more square invertible matrices or their
-//
-// adjoints (conjugate transposes).
+// Finds values of the `n`-th order statistic for the last dimension.
 //
-// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
-// form square matrices. The output is a tensor of the same shape as the input
-// containing the inverse for all input submatrices `[..., :, :]`.
+// If the input is a vector (rank-1), finds the entries which is the nth-smallest
+// value in the vector and outputs their values as scalar tensor.
 //
-// The op uses LU decomposition with partial pivoting to compute the inverses.
+// For matrices (resp. higher rank input), computes the entries which is the
+// nth-smallest value in each row (resp. vector along the last dimension). Thus,
 //
-// If a matrix is not invertible there is no guarantee what the op does. It
-// may detect the condition and raise an exception or it may simply return a
-// garbage result.
+//     values.shape = input.shape[:-1]
 //
 // Arguments:
-//	input: Shape is `[..., M, M]`.
-//
-// Returns Shape is `[..., M, M]`.
+//	input: 1-D or higher with last dimension at least `n+1`.
+//	n: 0-D. Position of sorted vector to select along the last dimension (along
+// each row for matrices). Valid range of n is `[0, input.shape[:-1])`
 //
-// @compatibility(numpy)
-// Equivalent to np.linalg.inv
-// @end_compatibility
-func MatrixInverse(scope *Scope, input tf.Output, optional ...MatrixInverseAttr) (output tf.Output) {
+// Returns The `n`-th order statistic along each last dimensional slice.
+func NthElement(scope *Scope, input tf.Output, n tf.Output, optional ...NthElementAttr) (values tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -3087,9 +3256,9 @@ func MatrixInverse(scope *Scope, input tf.Output, optional ...MatrixInverseAttr)
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MatrixInverse",
+		Type: "NthElement",
 		Input: []tf.Input{
-			input,
+			input, n,
 		},
 		Attrs: attrs,
 	}
@@ -3097,50 +3266,73 @@ func MatrixInverse(scope *Scope, input tf.Output, optional ...MatrixInverseAttr)
 	return op.Output(0)
 }
 
-// Computes square of x element-wise.
+// Computes the Max along segments of a tensor.
 //
-// I.e., \\(y = x * x = x^2\\).
-func Square(scope *Scope, x tf.Output) (y tf.Output) {
+// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+// segments.
+//
+// This operator is similar to the [unsorted segment sum operator](../../../api_docs/python/math_ops.md#UnsortedSegmentSum).
+// Instead of computing the sum over segments, it computes the maximum
+// such that:
+//
+// \\(output_i = \max_j data_j\\) where max is over `j` such
+// that `segment_ids[j] == i`.
+//
+// If the maximum is empty for a given segment ID `i`, it outputs the smallest possible value for specific numeric type,
+//  `output[i] = numeric_limits<T>::min()`.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/UnsortedSegmentMax.png" alt>
+// </div>
+//
+// Arguments:
+//
+//	segment_ids: A 1-D tensor whose rank is equal to the rank of `data`'s
+// first dimension.
+//
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `num_segments`.
+func UnsortedSegmentMax(scope *Scope, data tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Square",
+		Type: "UnsortedSegmentMax",
 		Input: []tf.Input{
-			x,
+			data, segment_ids, num_segments,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes exponential linear: `exp(features) - 1` if < 0, `features` otherwise.
-//
-// See [Fast and Accurate Deep Network Learning by Exponential Linear Units (ELUs)
-// ](http://arxiv.org/abs/1511.07289)
-func Elu(scope *Scope, features tf.Output) (activations tf.Output) {
+// Computes exponential of x element-wise.  \\(y = e^x\\).
+func Exp(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Elu",
+		Type: "Exp",
 		Input: []tf.Input{
-			features,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the reciprocal of x element-wise.
+// Returns an element-wise indication of the sign of a number.
 //
-// I.e., \\(y = 1 / x\\).
-func Reciprocal(scope *Scope, x tf.Output) (y tf.Output) {
+// `y = sign(x) = -1` if `x < 0`; 0 if `x == 0`; 1 if `x > 0`.
+//
+// For complex numbers, `y = sign(x) = x / |x|` if `x != 0`, otherwise `y = 0`.
+func Sign(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Reciprocal",
+		Type: "Sign",
 		Input: []tf.Input{
 			x,
 		},
@@ -3149,166 +3341,182 @@ func Reciprocal(scope *Scope, x tf.Output) (y tf.Output) {
 	return op.Output(0)
 }
 
-// OrderedMapClearAttr is an optional argument to OrderedMapClear.
-type OrderedMapClearAttr func(optionalAttr)
+// ArgMinAttr is an optional argument to ArgMin.
+type ArgMinAttr func(optionalAttr)
 
-// OrderedMapClearCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func OrderedMapClearCapacity(value int64) OrderedMapClearAttr {
+// ArgMinOutputType sets the optional output_type attribute to value.
+// If not specified, defaults to DT_INT64
+func ArgMinOutputType(value tf.DataType) ArgMinAttr {
 	return func(m optionalAttr) {
-		m["capacity"] = value
+		m["output_type"] = value
 	}
 }
 
-// OrderedMapClearMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
+// Returns the index with the smallest value across dimensions of a tensor.
 //
-// REQUIRES: value >= 0
-func OrderedMapClearMemoryLimit(value int64) OrderedMapClearAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
-	}
-}
-
-// OrderedMapClearContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func OrderedMapClearContainer(value string) OrderedMapClearAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// OrderedMapClearSharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func OrderedMapClearSharedName(value string) OrderedMapClearAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// Op removes all elements in the underlying container.
+// Note that in case of ties the identity of the return value is not guaranteed.
 //
-// Returns the created operation.
-func OrderedMapClear(scope *Scope, dtypes []tf.DataType, optional ...OrderedMapClearAttr) (o *tf.Operation) {
+// Arguments:
+//
+//	dimension: int32 or int64, must be in the range `[-rank(input), rank(input))`.
+// Describes which dimension of the input Tensor to reduce across. For vectors,
+// use dimension = 0.
+func ArgMin(scope *Scope, input tf.Output, dimension tf.Output, optional ...ArgMinAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "OrderedMapClear",
+		Type: "ArgMin",
+		Input: []tf.Input{
+			input, dimension,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
 
+// Convert the quantized 'input' tensor into a lower-precision 'output', using the
+//
+// output range specified with 'requested_output_min' and 'requested_output_max'.
+//
+// [input_min, input_max] are scalar floats that specify the range for the float
+// interpretation of the 'input' data. For example, if input_min is -1.0f and
+// input_max is 1.0f, and we are dealing with quint16 quantized data, then a 0
+// value in the 16-bit data should be interpreted as -1.0f, and a 65535 means 1.0f.
+//
+// Arguments:
+//
+//	input_min: The float value that the minimum quantized input value represents.
+//	input_max: The float value that the maximum quantized input value represents.
+//	requested_output_min: The float value that the minimum quantized output value represents.
+//	requested_output_max: The float value that the maximum quantized output value represents.
+//	out_type: The type of the output. Should be a lower bit depth than Tinput.
+//
+// Returns The requested_output_min value is copied into this output.The requested_output_max value is copied into this output.
+func Requantize(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output, requested_output_min tf.Output, requested_output_max tf.Output, out_type tf.DataType) (output tf.Output, output_min tf.Output, output_max tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"out_type": out_type}
+	opspec := tf.OpSpec{
+		Type: "Requantize",
+		Input: []tf.Input{
+			input, input_min, input_max, requested_output_min, requested_output_max,
+		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Computes the reciprocal of x element-wise.
+// Computes the determinant of one or more square matrices.
 //
-// I.e., \\(y = 1 / x\\).
-func Inv(scope *Scope, x tf.Output) (y tf.Output) {
+// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
+// form square matrices. The output is a tensor containing the determinants
+// for all input submatrices `[..., :, :]`.
+//
+// Arguments:
+//	input: Shape is `[..., M, M]`.
+//
+// Returns Shape is `[...]`.
+func MatrixDeterminant(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Inv",
+		Type: "MatrixDeterminant",
 		Input: []tf.Input{
-			x,
+			input,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ComplexAbsAttr is an optional argument to ComplexAbs.
-type ComplexAbsAttr func(optionalAttr)
-
-// ComplexAbsTout sets the optional Tout attribute to value.
-// If not specified, defaults to DT_FLOAT
-func ComplexAbsTout(value tf.DataType) ComplexAbsAttr {
-	return func(m optionalAttr) {
-		m["Tout"] = value
+// Computes sin of x element-wise.
+func Sin(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Sin",
+		Input: []tf.Input{
+			x,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Computes the complex absolute value of a tensor.
-//
-// Given a tensor `x` of complex numbers, this operation returns a tensor of type
-// `float` or `double` that is the absolute value of each element in `x`. All
-// elements in `x` must be complex numbers of the form \\(a + bj\\). The absolute
-// value is computed as \\( \sqrt{a^2 + b^2}\\).
-func ComplexAbs(scope *Scope, x tf.Output, optional ...ComplexAbsAttr) (y tf.Output) {
+// Computes the complementary error function of `x` element-wise.
+func Erfc(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "ComplexAbs",
+		Type: "Erfc",
 		Input: []tf.Input{
 			x,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns the truth value of x AND y element-wise.
+// Computes Psi, the derivative of Lgamma (the log of the absolute value of
 //
-// *NOTE*: `LogicalAnd` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func LogicalAnd(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// `Gamma(x)`), element-wise.
+func Digamma(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "LogicalAnd",
+		Type: "Digamma",
 		Input: []tf.Input{
-			x, y,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Cast x of type SrcT to y of DstT.
-func Cast(scope *Scope, x tf.Output, DstT tf.DataType) (y tf.Output) {
+// Shuffle dimensions of x according to a permutation.
+//
+// The output `y` has the same rank as `x`. The shapes of `x` and `y` satisfy:
+//   `y.shape[i] == x.shape[perm[i]] for i in [0, 1, ..., rank(x) - 1]`
+func Transpose(scope *Scope, x tf.Output, perm tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"DstT": DstT}
 	opspec := tf.OpSpec{
-		Type: "Cast",
+		Type: "Transpose",
 		Input: []tf.Input{
-			x,
+			x, perm,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// MaxAttr is an optional argument to Max.
-type MaxAttr func(optionalAttr)
+// MinAttr is an optional argument to Min.
+type MinAttr func(optionalAttr)
 
-// MaxKeepDims sets the optional keep_dims attribute to value.
+// MinKeepDims sets the optional keep_dims attribute to value.
 //
 // value: If true, retain reduced dimensions with length 1.
 // If not specified, defaults to false
-func MaxKeepDims(value bool) MaxAttr {
+func MinKeepDims(value bool) MinAttr {
 	return func(m optionalAttr) {
 		m["keep_dims"] = value
 	}
 }
 
-// Computes the maximum of elements across dimensions of a tensor.
+// Computes the minimum of elements across dimensions of a tensor.
 //
 // Reduces `input` along the dimensions given in `axis`. Unless
 // `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
@@ -3321,7 +3529,7 @@ func MaxKeepDims(value bool) MaxAttr {
 // `[-rank(input), rank(input))`.
 //
 // Returns The reduced tensor.
-func Max(scope *Scope, input tf.Output, axis tf.Output, optional ...MaxAttr) (output tf.Output) {
+func Min(scope *Scope, input tf.Output, axis tf.Output, optional ...MinAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -3330,7 +3538,7 @@ func Max(scope *Scope, input tf.Output, axis tf.Output, optional ...MaxAttr) (ou
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Max",
+		Type: "Min",
 		Input: []tf.Input{
 			input, axis,
 		},
@@ -3340,367 +3548,463 @@ func Max(scope *Scope, input tf.Output, axis tf.Output, optional ...MaxAttr) (ou
 	return op.Output(0)
 }
 
-// Quantized Batch normalization.
+// Conv2DBackpropFilterAttr is an optional argument to Conv2DBackpropFilter.
+type Conv2DBackpropFilterAttr func(optionalAttr)
+
+// Conv2DBackpropFilterUseCudnnOnGpu sets the optional use_cudnn_on_gpu attribute to value.
+// If not specified, defaults to true
+func Conv2DBackpropFilterUseCudnnOnGpu(value bool) Conv2DBackpropFilterAttr {
+	return func(m optionalAttr) {
+		m["use_cudnn_on_gpu"] = value
+	}
+}
+
+// Conv2DBackpropFilterDataFormat sets the optional data_format attribute to value.
 //
-// This op is deprecated and will be removed in the future. Prefer
-// `tf.nn.batch_normalization`.
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// If not specified, defaults to "NHWC"
+func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// Conv2DBackpropFilterDilations sets the optional dilations attribute to value.
+//
+// value: 1-D tensor of length 4.  The dilation factor for each dimension of
+// `input`. If set to k > 1, there will be k-1 skipped cells between each filter
+// element on that dimension. The dimension order is determined by the value of
+// `data_format`, see above for details. Dilations in the batch and depth
+// dimensions must be 1.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 >
+func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr {
+	return func(m optionalAttr) {
+		m["dilations"] = value
+	}
+}
+
+// Computes the gradients of convolution with respect to the filter.
 //
 // Arguments:
-//	t: A 4D input Tensor.
-//	t_min: The value represented by the lowest quantized input.
-//	t_max: The value represented by the highest quantized input.
-//	m: A 1D mean Tensor with size matching the last dimension of t.
-// This is the first output from tf.nn.moments,
-// or a saved moving average thereof.
-//	m_min: The value represented by the lowest quantized mean.
-//	m_max: The value represented by the highest quantized mean.
-//	v: A 1D variance Tensor with size matching the last dimension of t.
-// This is the second output from tf.nn.moments,
-// or a saved moving average thereof.
-//	v_min: The value represented by the lowest quantized variance.
-//	v_max: The value represented by the highest quantized variance.
-//	beta: A 1D beta Tensor with size matching the last dimension of t.
-// An offset to be added to the normalized tensor.
-//	beta_min: The value represented by the lowest quantized offset.
-//	beta_max: The value represented by the highest quantized offset.
-//	gamma: A 1D gamma Tensor with size matching the last dimension of t.
-// If "scale_after_normalization" is true, this tensor will be multiplied
-// with the normalized tensor.
-//	gamma_min: The value represented by the lowest quantized gamma.
-//	gamma_max: The value represented by the highest quantized gamma.
+//	input: 4-D with shape `[batch, in_height, in_width, in_channels]`.
+//	filter_sizes: An integer vector representing the tensor shape of `filter`,
+// where `filter` is a 4-D
+// `[filter_height, filter_width, in_channels, out_channels]` tensor.
+//	out_backprop: 4-D with shape `[batch, out_height, out_width, out_channels]`.
+// Gradients w.r.t. the output of the convolution.
+//	strides: The stride of the sliding window for each dimension of the input
+// of the convolution. Must be in the same order as the dimension specified with
+// format.
+//	padding: The type of padding algorithm to use.
 //
-//	variance_epsilon: A small float number to avoid dividing by 0.
-//	scale_after_normalization: A bool indicating whether the resulted tensor
-// needs to be multiplied with gamma.
-func QuantizedBatchNormWithGlobalNormalization(scope *Scope, t tf.Output, t_min tf.Output, t_max tf.Output, m tf.Output, m_min tf.Output, m_max tf.Output, v tf.Output, v_min tf.Output, v_max tf.Output, beta tf.Output, beta_min tf.Output, beta_max tf.Output, gamma tf.Output, gamma_min tf.Output, gamma_max tf.Output, out_type tf.DataType, variance_epsilon float32, scale_after_normalization bool) (result tf.Output, result_min tf.Output, result_max tf.Output) {
+// Returns 4-D with shape
+// `[filter_height, filter_width, in_channels, out_channels]`.  Gradient w.r.t.
+// the `filter` input of the convolution.
+func Conv2DBackpropFilter(scope *Scope, input tf.Output, filter_sizes tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv2DBackpropFilterAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"out_type": out_type, "variance_epsilon": variance_epsilon, "scale_after_normalization": scale_after_normalization}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "QuantizedBatchNormWithGlobalNormalization",
+		Type: "Conv2DBackpropFilter",
 		Input: []tf.Input{
-			t, t_min, t_max, m, m_min, m_max, v, v_min, v_max, beta, beta_min, beta_max, gamma, gamma_min, gamma_max,
+			input, filter_sizes, out_backprop,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// HistogramFixedWidthAttr is an optional argument to HistogramFixedWidth.
-type HistogramFixedWidthAttr func(optionalAttr)
-
-// HistogramFixedWidthDtype sets the optional dtype attribute to value.
-// If not specified, defaults to DT_INT32
-func HistogramFixedWidthDtype(value tf.DataType) HistogramFixedWidthAttr {
-	return func(m optionalAttr) {
-		m["dtype"] = value
-	}
-}
-
-// Return histogram of values.
-//
-// Given the tensor `values`, this operation returns a rank 1 histogram counting
-// the number of entries in `values` that fall into every bin.  The bins are
-// equal width and determined by the arguments `value_range` and `nbins`.
-//
-// ```python
-// # Bins will be:  (-inf, 1), [1, 2), [2, 3), [3, 4), [4, inf)
-// nbins = 5
-// value_range = [0.0, 5.0]
-// new_values = [-1.0, 0.0, 1.5, 2.0, 5.0, 15]
-//
-// with tf.get_default_session() as sess:
-//   hist = tf.histogram_fixed_width(new_values, value_range, nbins=5)
-//   variables.global_variables_initializer().run()
-//   sess.run(hist) => [2, 1, 1, 0, 2]
-// ```
+// Returns the number of work units this Reader has finished processing.
 //
 // Arguments:
-//	values: Numeric `Tensor`.
-//	value_range: Shape [2] `Tensor` of same `dtype` as `values`.
-// values <= value_range[0] will be mapped to hist[0],
-// values >= value_range[1] will be mapped to hist[-1].
-//	nbins: Scalar `int32 Tensor`.  Number of histogram bins.
-//
-// Returns A 1-D `Tensor` holding histogram of values.
-func HistogramFixedWidth(scope *Scope, values tf.Output, value_range tf.Output, nbins tf.Output, optional ...HistogramFixedWidthAttr) (out tf.Output) {
+//	reader_handle: Handle to a Reader.
+func ReaderNumWorkUnitsCompletedV2(scope *Scope, reader_handle tf.Output) (units_completed tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
+	opspec := tf.OpSpec{
+		Type: "ReaderNumWorkUnitsCompletedV2",
+		Input: []tf.Input{
+			reader_handle,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes the log of the absolute value of `Gamma(x)` element-wise.
+func Lgamma(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
 	opspec := tf.OpSpec{
-		Type: "HistogramFixedWidth",
+		Type: "Lgamma",
 		Input: []tf.Input{
-			values, value_range, nbins,
+			x,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Adds Tensor 'bias' to Tensor 'input' for Quantized types.
+// Computes the reverse mode backpropagated gradient of the Cholesky algorithm.
 //
-// Broadcasts the values of bias on dimensions 0..N-2 of 'input'.
+// For an explanation see "Differentiation of the Cholesky algorithm" by
+// Iain Murray http://arxiv.org/abs/1602.07527.
 //
 // Arguments:
+//	l: Output of batch Cholesky algorithm l = cholesky(A). Shape is `[..., M, M]`.
+// Algorithm depends only on lower triangular part of the innermost matrices of
+// this tensor.
+//	grad: df/dl where f is some scalar function. Shape is `[..., M, M]`.
+// Algorithm depends only on lower triangular part of the innermost matrices of
+// this tensor.
 //
-//	bias: A 1D bias Tensor with size matching the last dimension of 'input'.
-//	min_input: The float value that the lowest quantized input value represents.
-//	max_input: The float value that the highest quantized input value represents.
-//	min_bias: The float value that the lowest quantized bias value represents.
-//	max_bias: The float value that the highest quantized bias value represents.
-//
-//
-// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
-func QuantizedBiasAdd(scope *Scope, input tf.Output, bias tf.Output, min_input tf.Output, max_input tf.Output, min_bias tf.Output, max_bias tf.Output, out_type tf.DataType) (output tf.Output, min_out tf.Output, max_out tf.Output) {
+// Returns Symmetrized version of df/dA . Shape is `[..., M, M]`
+func CholeskyGrad(scope *Scope, l tf.Output, grad tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"out_type": out_type}
 	opspec := tf.OpSpec{
-		Type: "QuantizedBiasAdd",
+		Type: "CholeskyGrad",
 		Input: []tf.Input{
-			input, bias, min_input, max_input, min_bias, max_bias,
+			l, grad,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Produces the average pool of the input tensor for quantized types.
+// Computes the mean along sparse segments of a tensor.
+//
+// Like `SparseSegmentMean`, but allows missing ids in `segment_ids`. If an id is
+// misisng, the `output` tensor at that position will be zeroed.
+//
+// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+// segments.
 //
 // Arguments:
-//	input: 4-D with shape `[batch, height, width, channels]`.
-//	min_input: The float value that the lowest quantized input value represents.
-//	max_input: The float value that the highest quantized input value represents.
-//	ksize: The size of the window for each dimension of the input tensor.
-// The length must be 4 to match the number of dimensions of the input.
-//	strides: The stride of the sliding window for each dimension of the input
-// tensor.  The length must be 4 to match the number of dimensions of the input.
-//	padding: The type of padding algorithm to use.
 //
-// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
-func QuantizedAvgPool(scope *Scope, input tf.Output, min_input tf.Output, max_input tf.Output, ksize []int64, strides []int64, padding string) (output tf.Output, min_output tf.Output, max_output tf.Output) {
+//	indices: A 1-D tensor. Has same rank as `segment_ids`.
+//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
+//	num_segments: Should equal the number of distinct segment IDs.
+//
+// Returns Has same shape as data, except for dimension 0 which has size
+// `num_segments`.
+func SparseSegmentMeanWithNumSegments(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "QuantizedAvgPool",
+		Type: "SparseSegmentMeanWithNumSegments",
 		Input: []tf.Input{
-			input, min_input, max_input,
+			data, indices, segment_ids, num_segments,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// FractionalAvgPoolAttr is an optional argument to FractionalAvgPool.
-type FractionalAvgPoolAttr func(optionalAttr)
-
-// FractionalAvgPoolPseudoRandom sets the optional pseudo_random attribute to value.
-//
-// value: When set to True, generates the pooling sequence in a
-// pseudorandom fashion, otherwise, in a random fashion. Check paper [Benjamin
-// Graham, Fractional Max-Pooling](http://arxiv.org/abs/1412.6071) for
-// difference between pseudorandom and random.
-// If not specified, defaults to false
-func FractionalAvgPoolPseudoRandom(value bool) FractionalAvgPoolAttr {
-	return func(m optionalAttr) {
-		m["pseudo_random"] = value
+// Computes hyperbolic cosine of x element-wise.
+func Cosh(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Cosh",
+		Input: []tf.Input{
+			x,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// FractionalAvgPoolOverlapping sets the optional overlapping attribute to value.
-//
-// value: When set to True, it means when pooling, the values at the boundary
-// of adjacent pooling cells are used by both cells. For example:
-//
-// `index  0  1  2  3  4`
-//
-// `value  20 5  16 3  7`
-//
-// If the pooling sequence is [0, 2, 4], then 16, at index 2 will be used twice.
-// The result would be [41/3, 26/3] for fractional avg pooling.
-// If not specified, defaults to false
-func FractionalAvgPoolOverlapping(value bool) FractionalAvgPoolAttr {
-	return func(m optionalAttr) {
-		m["overlapping"] = value
+// Creates a dataset that emits each dim-0 slice of `components` once.
+func TensorSliceDataset(scope *Scope, components []tf.Output, output_shapes []tf.Shape) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "TensorSliceDataset",
+		Input: []tf.Input{
+			tf.OutputList(components),
+		},
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// FractionalAvgPoolDeterministic sets the optional deterministic attribute to value.
+// Computes natural logarithm of (1 + x) element-wise.
 //
-// value: When set to True, a fixed pooling region will be used when
-// iterating over a FractionalAvgPool node in the computation graph. Mainly used
-// in unit test to make FractionalAvgPool deterministic.
-// If not specified, defaults to false
-func FractionalAvgPoolDeterministic(value bool) FractionalAvgPoolAttr {
-	return func(m optionalAttr) {
-		m["deterministic"] = value
+// I.e., \\(y = \log_e (1 + x)\\).
+func Log1p(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Log1p",
+		Input: []tf.Input{
+			x,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// FractionalAvgPoolSeed sets the optional seed attribute to value.
+// Computes rectified linear 6 gradients for a Relu6 operation.
 //
-// value: If either seed or seed2 are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func FractionalAvgPoolSeed(value int64) FractionalAvgPoolAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
+// Arguments:
+//	gradients: The backpropagated gradients to the corresponding Relu6 operation.
+//	features: The features passed as input to the corresponding Relu6 operation, or
+// its output; using either one produces the same result.
+//
+// Returns The gradients:
+// `gradients * (features > 0) * (features < 6)`.
+func Relu6Grad(scope *Scope, gradients tf.Output, features tf.Output) (backprops tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Relu6Grad",
+		Input: []tf.Input{
+			gradients, features,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// FractionalAvgPoolSeed2 sets the optional seed2 attribute to value.
+// ResizeBicubicAttr is an optional argument to ResizeBicubic.
+type ResizeBicubicAttr func(optionalAttr)
+
+// ResizeBicubicAlignCorners sets the optional align_corners attribute to value.
 //
-// value: An second seed to avoid seed collision.
-// If not specified, defaults to 0
-func FractionalAvgPoolSeed2(value int64) FractionalAvgPoolAttr {
+// value: If true, rescale input by (new_height - 1) / (height - 1), which
+// exactly aligns the 4 corners of images and resized images. If false, rescale
+// by new_height / height. Treat similarly the width dimension.
+// If not specified, defaults to false
+func ResizeBicubicAlignCorners(value bool) ResizeBicubicAttr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["align_corners"] = value
 	}
 }
 
-// Performs fractional average pooling on the input.
+// Resize `images` to `size` using bicubic interpolation.
 //
-// Fractional average pooling is similar to Fractional max pooling in the pooling
-// region generation step. The only difference is that after pooling regions are
-// generated, a mean operation is performed instead of a max operation in each
-// pooling region.
+// Input images can be of different types but output images are always float.
 //
 // Arguments:
-//	value: 4-D with shape `[batch, height, width, channels]`.
-//	pooling_ratio: Pooling ratio for each dimension of `value`, currently only
-// supports row and col dimension and should be >= 1.0. For example, a valid
-// pooling ratio looks like [1.0, 1.44, 1.73, 1.0]. The first and last elements
-// must be 1.0 because we don't allow pooling on batch and channels
-// dimensions. 1.44 and 1.73 are pooling ratio on height and width dimensions
-// respectively.
+//	images: 4-D with shape `[batch, height, width, channels]`.
+//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
+// new size for the images.
 //
-// Returns output tensor after fractional avg pooling.row pooling sequence, needed to calculate gradient.column pooling sequence, needed to calculate gradient.
-func FractionalAvgPool(scope *Scope, value tf.Output, pooling_ratio []float32, optional ...FractionalAvgPoolAttr) (output tf.Output, row_pooling_sequence tf.Output, col_pooling_sequence tf.Output) {
+// Returns 4-D with shape
+// `[batch, new_height, new_width, channels]`.
+func ResizeBicubic(scope *Scope, images tf.Output, size tf.Output, optional ...ResizeBicubicAttr) (resized_images tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"pooling_ratio": pooling_ratio}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "FractionalAvgPool",
+		Type: "ResizeBicubic",
 		Input: []tf.Input{
-			value,
+			images, size,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// RandomCropAttr is an optional argument to RandomCrop.
-type RandomCropAttr func(optionalAttr)
+// Computes natural logarithm of x element-wise.
+//
+// I.e., \\(y = \log_e x\\).
+func Log(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Log",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
 
-// RandomCropSeed sets the optional seed attribute to value.
+// Rounds the values of a tensor to the nearest integer, element-wise.
 //
-// value: If either seed or seed2 are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func RandomCropSeed(value int64) RandomCropAttr {
+// Rounds half to even.  Also known as bankers rounding. If you want to round
+// according to the current system rounding mode use std::cint.
+func Round(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Round",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// RecordInputAttr is an optional argument to RecordInput.
+type RecordInputAttr func(optionalAttr)
+
+// RecordInputFileRandomSeed sets the optional file_random_seed attribute to value.
+//
+// value: Random seeds used to produce randomized records.
+// If not specified, defaults to 301
+func RecordInputFileRandomSeed(value int64) RecordInputAttr {
 	return func(m optionalAttr) {
-		m["seed"] = value
+		m["file_random_seed"] = value
 	}
 }
 
-// RandomCropSeed2 sets the optional seed2 attribute to value.
+// RecordInputFileShuffleShiftRatio sets the optional file_shuffle_shift_ratio attribute to value.
 //
-// value: An second seed to avoid seed collision.
+// value: Shifts the list of files after the list is randomly
+// shuffled.
 // If not specified, defaults to 0
-func RandomCropSeed2(value int64) RandomCropAttr {
+func RecordInputFileShuffleShiftRatio(value float32) RecordInputAttr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["file_shuffle_shift_ratio"] = value
 	}
 }
 
-// Randomly crop `image`.
+// RecordInputFileBufferSize sets the optional file_buffer_size attribute to value.
 //
-// DEPRECATED at GraphDef version 8: Random crop is now pure Python
+// value: The randomization shuffling buffer.
+// If not specified, defaults to 10000
+func RecordInputFileBufferSize(value int64) RecordInputAttr {
+	return func(m optionalAttr) {
+		m["file_buffer_size"] = value
+	}
+}
+
+// RecordInputFileParallelism sets the optional file_parallelism attribute to value.
 //
-// `size` is a 1-D int64 tensor with 2 elements representing the crop height and
-// width.  The values must be non negative.
+// value: How many sstables are opened and concurrently iterated over.
+// If not specified, defaults to 16
+func RecordInputFileParallelism(value int64) RecordInputAttr {
+	return func(m optionalAttr) {
+		m["file_parallelism"] = value
+	}
+}
+
+// RecordInputBatchSize sets the optional batch_size attribute to value.
 //
-// This Op picks a random location in `image` and crops a `height` by `width`
-// rectangle from that location.  The random location is picked so the cropped
-// area will fit inside the original image.
+// value: The batch size.
+// If not specified, defaults to 32
+func RecordInputBatchSize(value int64) RecordInputAttr {
+	return func(m optionalAttr) {
+		m["batch_size"] = value
+	}
+}
+
+// RecordInputCompressionType sets the optional compression_type attribute to value.
+//
+// value: The type of compression for the file. Currently ZLIB and
+// GZIP are supported. Defaults to none.
+// If not specified, defaults to ""
+func RecordInputCompressionType(value string) RecordInputAttr {
+	return func(m optionalAttr) {
+		m["compression_type"] = value
+	}
+}
+
+// Emits randomized records.
 //
 // Arguments:
-//	image: 3-D of shape `[height, width, channels]`.
-//	size: 1-D of length 2 containing: `crop_height`, `crop_width`..
+//	file_pattern: Glob pattern for the data files.
 //
-// Returns 3-D of shape `[crop_height, crop_width, channels].`
-func RandomCrop(scope *Scope, image tf.Output, size tf.Output, optional ...RandomCropAttr) (output tf.Output) {
+// Returns A tensor of shape [batch_size].
+func RecordInput(scope *Scope, file_pattern string, optional ...RecordInputAttr) (records tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"file_pattern": file_pattern}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "RandomCrop",
+		Type: "RecordInput",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes reciprocal of square root of x element-wise.
+//
+// I.e., \\(y = 1 / \sqrt{x}\\).
+func Rsqrt(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Rsqrt",
 		Input: []tf.Input{
-			image, size,
+			x,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// TopKV2Attr is an optional argument to TopKV2.
-type TopKV2Attr func(optionalAttr)
+// MatrixInverseAttr is an optional argument to MatrixInverse.
+type MatrixInverseAttr func(optionalAttr)
 
-// TopKV2Sorted sets the optional sorted attribute to value.
-//
-// value: If true the resulting `k` elements will be sorted by the values in
-// descending order.
-// If not specified, defaults to true
-func TopKV2Sorted(value bool) TopKV2Attr {
+// MatrixInverseAdjoint sets the optional adjoint attribute to value.
+// If not specified, defaults to false
+func MatrixInverseAdjoint(value bool) MatrixInverseAttr {
 	return func(m optionalAttr) {
-		m["sorted"] = value
+		m["adjoint"] = value
 	}
 }
 
-// Finds values and indices of the `k` largest elements for the last dimension.
+// Computes the inverse of one or more square invertible matrices or their
 //
-// If the input is a vector (rank-1), finds the `k` largest entries in the vector
-// and outputs their values and indices as vectors.  Thus `values[j]` is the
-// `j`-th largest entry in `input`, and its index is `indices[j]`.
+// adjoints (conjugate transposes).
 //
-// For matrices (resp. higher rank input), computes the top `k` entries in each
-// row (resp. vector along the last dimension).  Thus,
+// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
+// form square matrices. The output is a tensor of the same shape as the input
+// containing the inverse for all input submatrices `[..., :, :]`.
 //
-//     values.shape = indices.shape = input.shape[:-1] + [k]
+// The op uses LU decomposition with partial pivoting to compute the inverses.
 //
-// If two elements are equal, the lower-index element appears first.
+// If a matrix is not invertible there is no guarantee what the op does. It
+// may detect the condition and raise an exception or it may simply return a
+// garbage result.
 //
 // Arguments:
-//	input: 1-D or higher with last dimension at least `k`.
-//	k: 0-D.  Number of top elements to look for along the last dimension (along each
-// row for matrices).
+//	input: Shape is `[..., M, M]`.
 //
-// Returns The `k` largest elements along each last dimensional slice.The indices of `values` within the last dimension of `input`.
-func TopKV2(scope *Scope, input tf.Output, k tf.Output, optional ...TopKV2Attr) (values tf.Output, indices tf.Output) {
+// Returns Shape is `[..., M, M]`.
+//
+// @compatibility(numpy)
+// Equivalent to np.linalg.inv
+// @end_compatibility
+func MatrixInverse(scope *Scope, input tf.Output, optional ...MatrixInverseAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -3709,239 +4013,161 @@ func TopKV2(scope *Scope, input tf.Output, k tf.Output, optional ...TopKV2Attr)
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "TopKV2",
+		Type: "MatrixInverse",
 		Input: []tf.Input{
-			input, k,
+			input,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// Returns x // y element-wise.
+// Computes square of x element-wise.
 //
-// *NOTE*: `FloorDiv` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func FloorDiv(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// I.e., \\(y = x * x = x^2\\).
+func Square(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "FloorDiv",
+		Type: "Square",
 		Input: []tf.Input{
-			x, y,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns a batched diagonal tensor with a given batched diagonal values.
-//
-// Given a `diagonal`, this operation returns a tensor with the `diagonal` and
-// everything else padded with zeros. The diagonal is computed as follows:
-//
-// Assume `diagonal` has `k` dimensions `[I, J, K, ..., N]`, then the output is a
-// tensor of rank `k+1` with dimensions [I, J, K, ..., N, N]` where:
-//
-// `output[i, j, k, ..., m, n] = 1{m=n} * diagonal[i, j, k, ..., n]`.
-//
-// For example:
-//
-// ```
-// # 'diagonal' is [[1, 2, 3, 4], [5, 6, 7, 8]]
-//
-// and diagonal.shape = (2, 4)
-//
-// tf.matrix_diag(diagonal) ==> [[[1, 0, 0, 0]
-//                                      [0, 2, 0, 0]
-//                                      [0, 0, 3, 0]
-//                                      [0, 0, 0, 4]],
-//                                     [[5, 0, 0, 0]
-//                                      [0, 6, 0, 0]
-//                                      [0, 0, 7, 0]
-//                                      [0, 0, 0, 8]]]
-//
-// which has shape (2, 4, 4)
-// ```
-//
-// Arguments:
-//	diagonal: Rank `k`, where `k >= 1`.
+// Computes exponential linear: `exp(features) - 1` if < 0, `features` otherwise.
 //
-// Returns Rank `k+1`, with `output.shape = diagonal.shape + [diagonal.shape[-1]]`.
-func MatrixDiag(scope *Scope, diagonal tf.Output) (output tf.Output) {
+// See [Fast and Accurate Deep Network Learning by Exponential Linear Units (ELUs)
+// ](http://arxiv.org/abs/1511.07289)
+func Elu(scope *Scope, features tf.Output) (activations tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "MatrixDiag",
+		Type: "Elu",
 		Input: []tf.Input{
-			diagonal,
+			features,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns the truth value of (x <= y) element-wise.
+// Computes the reciprocal of x element-wise.
 //
-// *NOTE*: `LessEqual` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func LessEqual(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// I.e., \\(y = 1 / x\\).
+func Reciprocal(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "LessEqual",
+		Type: "Reciprocal",
 		Input: []tf.Input{
-			x, y,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes softmax activations.
-//
-// For each batch `i` and class `j` we have
-//
-//     softmax[i, j] = exp(logits[i, j]) / sum_j(exp(logits[i, j]))
-//
-// Arguments:
-//	logits: 2-D with shape `[batch_size, num_classes]`.
+// OrderedMapClearAttr is an optional argument to OrderedMapClear.
+type OrderedMapClearAttr func(optionalAttr)
+
+// OrderedMapClearCapacity sets the optional capacity attribute to value.
+// If not specified, defaults to 0
 //
-// Returns Same shape as `logits`.
-func Softmax(scope *Scope, logits tf.Output) (softmax tf.Output) {
-	if scope.Err() != nil {
-		return
+// REQUIRES: value >= 0
+func OrderedMapClearCapacity(value int64) OrderedMapClearAttr {
+	return func(m optionalAttr) {
+		m["capacity"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "Softmax",
-		Input: []tf.Input{
-			logits,
-		},
+}
+
+// OrderedMapClearMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func OrderedMapClearMemoryLimit(value int64) OrderedMapClearAttr {
+	return func(m optionalAttr) {
+		m["memory_limit"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// DecodeBmpAttr is an optional argument to DecodeBmp.
-type DecodeBmpAttr func(optionalAttr)
+// OrderedMapClearContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func OrderedMapClearContainer(value string) OrderedMapClearAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
 
-// DecodeBmpChannels sets the optional channels attribute to value.
-// If not specified, defaults to 0
-func DecodeBmpChannels(value int64) DecodeBmpAttr {
+// OrderedMapClearSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func OrderedMapClearSharedName(value string) OrderedMapClearAttr {
 	return func(m optionalAttr) {
-		m["channels"] = value
+		m["shared_name"] = value
 	}
 }
 
-// Decode the first frame of a BMP-encoded image to a uint8 tensor.
-//
-// The attr `channels` indicates the desired number of color channels for the
-// decoded image.
-//
-// Accepted values are:
-//
-// *   0: Use the number of channels in the BMP-encoded image.
-// *   3: output an RGB image.
-// *   4: output an RGBA image.
-//
-// Arguments:
-//	contents: 0-D.  The BMP-encoded image.
+// Op removes all elements in the underlying container.
 //
-// Returns 3-D with shape `[height, width, channels]`. RGB order
-func DecodeBmp(scope *Scope, contents tf.Output, optional ...DecodeBmpAttr) (image tf.Output) {
+// Returns the created operation.
+func OrderedMapClear(scope *Scope, dtypes []tf.DataType, optional ...OrderedMapClearAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"dtypes": dtypes}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "DecodeBmp",
-		Input: []tf.Input{
-			contents,
-		},
+		Type: "OrderedMapClear",
+
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Computes softsign gradients for a softsign operation.
-//
-// Arguments:
-//	gradients: The backpropagated gradients to the corresponding softsign operation.
-//	features: The features passed as input to the corresponding softsign operation.
+// Computes the reciprocal of x element-wise.
 //
-// Returns The gradients: `gradients / (1 + abs(features)) ** 2`.
-func SoftsignGrad(scope *Scope, gradients tf.Output, features tf.Output) (backprops tf.Output) {
+// I.e., \\(y = 1 / x\\).
+func Inv(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SoftsignGrad",
+		Type: "Inv",
 		Input: []tf.Input{
-			gradients, features,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// BatchMatMulAttr is an optional argument to BatchMatMul.
-type BatchMatMulAttr func(optionalAttr)
-
-// BatchMatMulAdjX sets the optional adj_x attribute to value.
-//
-// value: If `True`, adjoint the slices of `x`. Defaults to `False`.
-// If not specified, defaults to false
-func BatchMatMulAdjX(value bool) BatchMatMulAttr {
-	return func(m optionalAttr) {
-		m["adj_x"] = value
-	}
-}
+// ComplexAbsAttr is an optional argument to ComplexAbs.
+type ComplexAbsAttr func(optionalAttr)
 
-// BatchMatMulAdjY sets the optional adj_y attribute to value.
-//
-// value: If `True`, adjoint the slices of `y`. Defaults to `False`.
-// If not specified, defaults to false
-func BatchMatMulAdjY(value bool) BatchMatMulAttr {
+// ComplexAbsTout sets the optional Tout attribute to value.
+// If not specified, defaults to DT_FLOAT
+func ComplexAbsTout(value tf.DataType) ComplexAbsAttr {
 	return func(m optionalAttr) {
-		m["adj_y"] = value
+		m["Tout"] = value
 	}
 }
 
-// Multiplies slices of two tensors in batches.
-//
-// Multiplies all slices of `Tensor` `x` and `y` (each slice can be
-// viewed as an element of a batch), and arranges the individual results
-// in a single output tensor of the same batch size. Each of the
-// individual slices can optionally be adjointed (to adjoint a matrix
-// means to transpose and conjugate it) before multiplication by setting
-// the `adj_x` or `adj_y` flag to `True`, which are by default `False`.
-//
-// The input tensors `x` and `y` are 2-D or higher with shape `[..., r_x, c_x]`
-// and `[..., r_y, c_y]`.
-//
-// The output tensor is 2-D or higher with shape `[..., r_o, c_o]`, where:
-//
-//     r_o = c_x if adj_x else r_x
-//     c_o = r_y if adj_y else c_y
-//
-// It is computed as:
-//
-//     output[..., :, :] = matrix(x[..., :, :]) * matrix(y[..., :, :])
-//
-// Arguments:
-//	x: 2-D or higher with shape `[..., r_x, c_x]`.
-//	y: 2-D or higher with shape `[..., r_y, c_y]`.
+// Computes the complex absolute value of a tensor.
 //
-// Returns 3-D or higher with shape `[..., r_o, c_o]`
-func BatchMatMul(scope *Scope, x tf.Output, y tf.Output, optional ...BatchMatMulAttr) (output tf.Output) {
+// Given a tensor `x` of complex numbers, this operation returns a tensor of type
+// `float` or `double` that is the absolute value of each element in `x`. All
+// elements in `x` must be complex numbers of the form \\(a + bj\\). The absolute
+// value is computed as \\( \sqrt{a^2 + b^2}\\).
+func ComplexAbs(scope *Scope, x tf.Output, optional ...ComplexAbsAttr) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -3950,9 +4176,9 @@ func BatchMatMul(scope *Scope, x tf.Output, y tf.Output, optional ...BatchMatMul
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "BatchMatMul",
+		Type: "ComplexAbs",
 		Input: []tf.Input{
-			x, y,
+			x,
 		},
 		Attrs: attrs,
 	}
@@ -3960,69 +4186,79 @@ func BatchMatMul(scope *Scope, x tf.Output, y tf.Output, optional ...BatchMatMul
 	return op.Output(0)
 }
 
-// Returns which elements of x are NaN.
+// Returns the truth value of x AND y element-wise.
 //
-// @compatibility(numpy)
-// Equivalent to np.isnan
-// @end_compatibility
-func IsNan(scope *Scope, x tf.Output) (y tf.Output) {
+// *NOTE*: `LogicalAnd` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func LogicalAnd(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "IsNan",
+		Type: "LogicalAnd",
 		Input: []tf.Input{
-			x,
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes rectified linear gradients for a Relu operation.
-//
-// Arguments:
-//	gradients: The backpropagated gradients to the corresponding Relu operation.
-//	features: The features passed as input to the corresponding Relu operation, OR
-// the outputs of that operation (both work equivalently).
-//
-// Returns `gradients * (features > 0)`.
-func ReluGrad(scope *Scope, gradients tf.Output, features tf.Output) (backprops tf.Output) {
+// Cast x of type SrcT to y of DstT.
+func Cast(scope *Scope, x tf.Output, DstT tf.DataType) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"DstT": DstT}
 	opspec := tf.OpSpec{
-		Type: "ReluGrad",
+		Type: "Cast",
 		Input: []tf.Input{
-			gradients, features,
+			x,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the gradient of morphological 2-D dilation with respect to the input.
+// MaxAttr is an optional argument to Max.
+type MaxAttr func(optionalAttr)
+
+// MaxKeepDims sets the optional keep_dims attribute to value.
+//
+// value: If true, retain reduced dimensions with length 1.
+// If not specified, defaults to false
+func MaxKeepDims(value bool) MaxAttr {
+	return func(m optionalAttr) {
+		m["keep_dims"] = value
+	}
+}
+
+// Computes the maximum of elements across dimensions of a tensor.
+//
+// Reduces `input` along the dimensions given in `axis`. Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `axis`. If `keep_dims` is true, the reduced dimensions are
+// retained with length 1.
 //
 // Arguments:
-//	input: 4-D with shape `[batch, in_height, in_width, depth]`.
-//	filter: 3-D with shape `[filter_height, filter_width, depth]`.
-//	out_backprop: 4-D with shape `[batch, out_height, out_width, depth]`.
-//	strides: 1-D of length 4. The stride of the sliding window for each dimension of
-// the input tensor. Must be: `[1, stride_height, stride_width, 1]`.
-//	rates: 1-D of length 4. The input stride for atrous morphological dilation.
-// Must be: `[1, rate_height, rate_width, 1]`.
-//	padding: The type of padding algorithm to use.
+//	input: The tensor to reduce.
+//	axis: The dimensions to reduce. Must be in the range
+// `[-rank(input), rank(input))`.
 //
-// Returns 4-D with shape `[batch, in_height, in_width, depth]`.
-func Dilation2DBackpropInput(scope *Scope, input tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, rates []int64, padding string) (in_backprop tf.Output) {
+// Returns The reduced tensor.
+func Max(scope *Scope, input tf.Output, axis tf.Output, optional ...MaxAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "rates": rates, "padding": padding}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Dilation2DBackpropInput",
+		Type: "Max",
 		Input: []tf.Input{
-			input, filter, out_backprop,
+			input, axis,
 		},
 		Attrs: attrs,
 	}
@@ -4030,139 +4266,103 @@ func Dilation2DBackpropInput(scope *Scope, input tf.Output, filter tf.Output, ou
 	return op.Output(0)
 }
 
-// CTCBeamSearchDecoderAttr is an optional argument to CTCBeamSearchDecoder.
-type CTCBeamSearchDecoderAttr func(optionalAttr)
-
-// CTCBeamSearchDecoderMergeRepeated sets the optional merge_repeated attribute to value.
-//
-// value: If true, merge repeated classes in output.
-// If not specified, defaults to true
-func CTCBeamSearchDecoderMergeRepeated(value bool) CTCBeamSearchDecoderAttr {
-	return func(m optionalAttr) {
-		m["merge_repeated"] = value
-	}
-}
-
-// Performs beam search decoding on the logits given in input.
+// Quantized Batch normalization.
 //
-// A note about the attribute merge_repeated: For the beam search decoder,
-// this means that if consecutive entries in a beam are the same, only
-// the first of these is emitted.  That is, when the top path is "A B B B B",
-// "A B" is returned if merge_repeated = True but "A B B B B" is
-// returned if merge_repeated = False.
+// This op is deprecated and will be removed in the future. Prefer
+// `tf.nn.batch_normalization`.
 //
 // Arguments:
-//	inputs: 3-D, shape: `(max_time x batch_size x num_classes)`, the logits.
-//	sequence_length: A vector containing sequence lengths, size `(batch)`.
-//	beam_width: A scalar >= 0 (beam search beam width).
-//	top_paths: A scalar >= 0, <= beam_width (controls output size).
+//	t: A 4D input Tensor.
+//	t_min: The value represented by the lowest quantized input.
+//	t_max: The value represented by the highest quantized input.
+//	m: A 1D mean Tensor with size matching the last dimension of t.
+// This is the first output from tf.nn.moments,
+// or a saved moving average thereof.
+//	m_min: The value represented by the lowest quantized mean.
+//	m_max: The value represented by the highest quantized mean.
+//	v: A 1D variance Tensor with size matching the last dimension of t.
+// This is the second output from tf.nn.moments,
+// or a saved moving average thereof.
+//	v_min: The value represented by the lowest quantized variance.
+//	v_max: The value represented by the highest quantized variance.
+//	beta: A 1D beta Tensor with size matching the last dimension of t.
+// An offset to be added to the normalized tensor.
+//	beta_min: The value represented by the lowest quantized offset.
+//	beta_max: The value represented by the highest quantized offset.
+//	gamma: A 1D gamma Tensor with size matching the last dimension of t.
+// If "scale_after_normalization" is true, this tensor will be multiplied
+// with the normalized tensor.
+//	gamma_min: The value represented by the lowest quantized gamma.
+//	gamma_max: The value represented by the highest quantized gamma.
 //
-// Returns A list (length: top_paths) of indices matrices.  Matrix j,
-// size `(total_decoded_outputs[j] x 2)`, has indices of a
-// `SparseTensor<int64, 2>`.  The rows store: [batch, time].A list (length: top_paths) of values vectors.  Vector j,
-// size `(length total_decoded_outputs[j])`, has the values of a
-// `SparseTensor<int64, 2>`.  The vector stores the decoded classes for beam j.A list (length: top_paths) of shape vector.  Vector j,
-// size `(2)`, stores the shape of the decoded `SparseTensor[j]`.
-// Its values are: `[batch_size, max_decoded_length[j]]`.A matrix, shaped: `(batch_size x top_paths)`.  The
-// sequence log-probabilities.
-func CTCBeamSearchDecoder(scope *Scope, inputs tf.Output, sequence_length tf.Output, beam_width int64, top_paths int64, optional ...CTCBeamSearchDecoderAttr) (decoded_indices []tf.Output, decoded_values []tf.Output, decoded_shape []tf.Output, log_probability tf.Output) {
+//	variance_epsilon: A small float number to avoid dividing by 0.
+//	scale_after_normalization: A bool indicating whether the resulted tensor
+// needs to be multiplied with gamma.
+func QuantizedBatchNormWithGlobalNormalization(scope *Scope, t tf.Output, t_min tf.Output, t_max tf.Output, m tf.Output, m_min tf.Output, m_max tf.Output, v tf.Output, v_min tf.Output, v_max tf.Output, beta tf.Output, beta_min tf.Output, beta_max tf.Output, gamma tf.Output, gamma_min tf.Output, gamma_max tf.Output, out_type tf.DataType, variance_epsilon float32, scale_after_normalization bool) (result tf.Output, result_min tf.Output, result_max tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"beam_width": beam_width, "top_paths": top_paths}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"out_type": out_type, "variance_epsilon": variance_epsilon, "scale_after_normalization": scale_after_normalization}
 	opspec := tf.OpSpec{
-		Type: "CTCBeamSearchDecoder",
+		Type: "QuantizedBatchNormWithGlobalNormalization",
 		Input: []tf.Input{
-			inputs, sequence_length,
+			t, t_min, t_max, m, m_min, m_max, v, v_min, v_max, beta, beta_min, beta_max, gamma, gamma_min, gamma_max,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if decoded_indices, idx, err = makeOutputList(op, idx, "decoded_indices"); err != nil {
-		scope.UpdateErr("CTCBeamSearchDecoder", err)
-		return
-	}
-	if decoded_values, idx, err = makeOutputList(op, idx, "decoded_values"); err != nil {
-		scope.UpdateErr("CTCBeamSearchDecoder", err)
-		return
-	}
-	if decoded_shape, idx, err = makeOutputList(op, idx, "decoded_shape"); err != nil {
-		scope.UpdateErr("CTCBeamSearchDecoder", err)
-		return
-	}
-	log_probability = op.Output(idx)
-	return decoded_indices, decoded_values, decoded_shape, log_probability
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// AudioSpectrogramAttr is an optional argument to AudioSpectrogram.
-type AudioSpectrogramAttr func(optionalAttr)
+// HistogramFixedWidthAttr is an optional argument to HistogramFixedWidth.
+type HistogramFixedWidthAttr func(optionalAttr)
 
-// AudioSpectrogramMagnitudeSquared sets the optional magnitude_squared attribute to value.
-//
-// value: Whether to return the squared magnitude or just the
-// magnitude. Using squared magnitude can avoid extra calculations.
-// If not specified, defaults to false
-func AudioSpectrogramMagnitudeSquared(value bool) AudioSpectrogramAttr {
+// HistogramFixedWidthDtype sets the optional dtype attribute to value.
+// If not specified, defaults to DT_INT32
+func HistogramFixedWidthDtype(value tf.DataType) HistogramFixedWidthAttr {
 	return func(m optionalAttr) {
-		m["magnitude_squared"] = value
+		m["dtype"] = value
 	}
 }
 
-// Produces a visualization of audio data over time.
-//
-// Spectrograms are a standard way of representing audio information as a series of
-// slices of frequency information, one slice for each window of time. By joining
-// these together into a sequence, they form a distinctive fingerprint of the sound
-// over time.
-//
-// This op expects to receive audio data as an input, stored as floats in the range
-// -1 to 1, together with a window width in samples, and a stride specifying how
-// far to move the window between slices. From this it generates a three
-// dimensional output. The lowest dimension has an amplitude value for each
-// frequency during that time slice. The next dimension is time, with successive
-// frequency slices. The final dimension is for the channels in the input, so a
-// stereo audio input would have two here for example.
+// Return histogram of values.
 //
-// This means the layout when converted and saved as an image is rotated 90 degrees
-// clockwise from a typical spectrogram. Time is descending down the Y axis, and
-// the frequency decreases from left to right.
+// Given the tensor `values`, this operation returns a rank 1 histogram counting
+// the number of entries in `values` that fall into every bin.  The bins are
+// equal width and determined by the arguments `value_range` and `nbins`.
 //
-// Each value in the result represents the square root of the sum of the real and
-// imaginary parts of an FFT on the current window of samples. In this way, the
-// lowest dimension represents the power of each frequency in the current window,
-// and adjacent windows are concatenated in the next dimension.
+// ```python
+// # Bins will be:  (-inf, 1), [1, 2), [2, 3), [3, 4), [4, inf)
+// nbins = 5
+// value_range = [0.0, 5.0]
+// new_values = [-1.0, 0.0, 1.5, 2.0, 5.0, 15]
 //
-// To get a more intuitive and visual look at what this operation does, you can run
-// tensorflow/examples/wav_to_spectrogram to read in an audio file and save out the
-// resulting spectrogram as a PNG image.
+// with tf.get_default_session() as sess:
+//   hist = tf.histogram_fixed_width(new_values, value_range, nbins=5)
+//   variables.global_variables_initializer().run()
+//   sess.run(hist) => [2, 1, 1, 0, 2]
+// ```
 //
 // Arguments:
-//	input: Float representation of audio data.
-//	window_size: How wide the input window is in samples. For the highest efficiency
-// this should be a power of two, but other values are accepted.
-//	stride: How widely apart the center of adjacent sample windows should be.
+//	values: Numeric `Tensor`.
+//	value_range: Shape [2] `Tensor` of same `dtype` as `values`.
+// values <= value_range[0] will be mapped to hist[0],
+// values >= value_range[1] will be mapped to hist[-1].
+//	nbins: Scalar `int32 Tensor`.  Number of histogram bins.
 //
-// Returns 3D representation of the audio frequencies as an image.
-func AudioSpectrogram(scope *Scope, input tf.Output, window_size int64, stride int64, optional ...AudioSpectrogramAttr) (spectrogram tf.Output) {
+// Returns A 1-D `Tensor` holding histogram of values.
+func HistogramFixedWidth(scope *Scope, values tf.Output, value_range tf.Output, nbins tf.Output, optional ...HistogramFixedWidthAttr) (out tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"window_size": window_size, "stride": stride}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "AudioSpectrogram",
+		Type: "HistogramFixedWidth",
 		Input: []tf.Input{
-			input,
+			values, value_range, nbins,
 		},
 		Attrs: attrs,
 	}
@@ -4170,174 +4370,222 @@ func AudioSpectrogram(scope *Scope, input tf.Output, window_size int64, stride i
 	return op.Output(0)
 }
 
-// Compute the polygamma function \\(\psi^{(n)}(x)\\).
+// Adds Tensor 'bias' to Tensor 'input' for Quantized types.
 //
-// The polygamma function is defined as:
+// Broadcasts the values of bias on dimensions 0..N-2 of 'input'.
 //
+// Arguments:
 //
-// \\(\psi^{(n)}(x) = \frac{d^n}{dx^n} \psi(x)\\)
+//	bias: A 1D bias Tensor with size matching the last dimension of 'input'.
+//	min_input: The float value that the lowest quantized input value represents.
+//	max_input: The float value that the highest quantized input value represents.
+//	min_bias: The float value that the lowest quantized bias value represents.
+//	max_bias: The float value that the highest quantized bias value represents.
 //
-// where \\(\psi(x)\\) is the digamma function.
-func Polygamma(scope *Scope, a tf.Output, x tf.Output) (z tf.Output) {
+//
+// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
+func QuantizedBiasAdd(scope *Scope, input tf.Output, bias tf.Output, min_input tf.Output, max_input tf.Output, min_bias tf.Output, max_bias tf.Output, out_type tf.DataType) (output tf.Output, min_out tf.Output, max_out tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"out_type": out_type}
 	opspec := tf.OpSpec{
-		Type: "Polygamma",
+		Type: "QuantizedBiasAdd",
 		Input: []tf.Input{
-			a, x,
+			input, bias, min_input, max_input, min_bias, max_bias,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Computes second-order gradients of the maxpooling function.
+// Produces the average pool of the input tensor for quantized types.
 //
 // Arguments:
-//	input: The original input.
-//	grad: 4-D with shape `[batch, height, width, channels]`.  Gradients w.r.t. the
-// input of `max_pool`.
-//	argmax: The indices of the maximum values chosen for each output of `max_pool`.
+//	input: 4-D with shape `[batch, height, width, channels]`.
+//	min_input: The float value that the lowest quantized input value represents.
+//	max_input: The float value that the highest quantized input value represents.
 //	ksize: The size of the window for each dimension of the input tensor.
-//	strides: The stride of the sliding window for each dimension of the
-// input tensor.
+// The length must be 4 to match the number of dimensions of the input.
+//	strides: The stride of the sliding window for each dimension of the input
+// tensor.  The length must be 4 to match the number of dimensions of the input.
 //	padding: The type of padding algorithm to use.
 //
-// Returns Gradients of gradients w.r.t. the input of `max_pool`.
-func MaxPoolGradGradWithArgmax(scope *Scope, input tf.Output, grad tf.Output, argmax tf.Output, ksize []int64, strides []int64, padding string) (output tf.Output) {
+// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
+func QuantizedAvgPool(scope *Scope, input tf.Output, min_input tf.Output, max_input tf.Output, ksize []int64, strides []int64, padding string) (output tf.Output, min_output tf.Output, max_output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "MaxPoolGradGradWithArgmax",
+		Type: "QuantizedAvgPool",
 		Input: []tf.Input{
-			input, grad, argmax,
+			input, min_input, max_input,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// MaxPoolGradGradV2Attr is an optional argument to MaxPoolGradGradV2.
-type MaxPoolGradGradV2Attr func(optionalAttr)
+// FractionalAvgPoolAttr is an optional argument to FractionalAvgPool.
+type FractionalAvgPoolAttr func(optionalAttr)
 
-// MaxPoolGradGradV2DataFormat sets the optional data_format attribute to value.
+// FractionalAvgPoolPseudoRandom sets the optional pseudo_random attribute to value.
 //
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// If not specified, defaults to "NHWC"
-func MaxPoolGradGradV2DataFormat(value string) MaxPoolGradGradV2Attr {
+// value: When set to True, generates the pooling sequence in a
+// pseudorandom fashion, otherwise, in a random fashion. Check paper [Benjamin
+// Graham, Fractional Max-Pooling](http://arxiv.org/abs/1412.6071) for
+// difference between pseudorandom and random.
+// If not specified, defaults to false
+func FractionalAvgPoolPseudoRandom(value bool) FractionalAvgPoolAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["pseudo_random"] = value
 	}
 }
 
-// Computes second-order gradients of the maxpooling function.
+// FractionalAvgPoolOverlapping sets the optional overlapping attribute to value.
 //
-// Arguments:
-//	orig_input: The original input tensor.
-//	orig_output: The original output tensor.
-//	grad: 4-D.  Gradients of gradients w.r.t. the input of `max_pool`.
-//	ksize: The size of the window for each dimension of the input tensor.
-//	strides: The stride of the sliding window for each dimension of the
-// input tensor.
-//	padding: The type of padding algorithm to use.
+// value: When set to True, it means when pooling, the values at the boundary
+// of adjacent pooling cells are used by both cells. For example:
 //
-// Returns Gradients of gradients w.r.t. the input to `max_pool`.
-func MaxPoolGradGradV2(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize tf.Output, strides tf.Output, padding string, optional ...MaxPoolGradGradV2Attr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"padding": padding}
-	for _, a := range optional {
-		a(attrs)
+// `index  0  1  2  3  4`
+//
+// `value  20 5  16 3  7`
+//
+// If the pooling sequence is [0, 2, 4], then 16, at index 2 will be used twice.
+// The result would be [41/3, 26/3] for fractional avg pooling.
+// If not specified, defaults to false
+func FractionalAvgPoolOverlapping(value bool) FractionalAvgPoolAttr {
+	return func(m optionalAttr) {
+		m["overlapping"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "MaxPoolGradGradV2",
-		Input: []tf.Input{
-			orig_input, orig_output, grad, ksize, strides,
-		},
-		Attrs: attrs,
+}
+
+// FractionalAvgPoolDeterministic sets the optional deterministic attribute to value.
+//
+// value: When set to True, a fixed pooling region will be used when
+// iterating over a FractionalAvgPool node in the computation graph. Mainly used
+// in unit test to make FractionalAvgPool deterministic.
+// If not specified, defaults to false
+func FractionalAvgPoolDeterministic(value bool) FractionalAvgPoolAttr {
+	return func(m optionalAttr) {
+		m["deterministic"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Computes gradients of the maxpooling function.
+// FractionalAvgPoolSeed sets the optional seed attribute to value.
+//
+// value: If either seed or seed2 are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func FractionalAvgPoolSeed(value int64) FractionalAvgPoolAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// FractionalAvgPoolSeed2 sets the optional seed2 attribute to value.
+//
+// value: An second seed to avoid seed collision.
+// If not specified, defaults to 0
+func FractionalAvgPoolSeed2(value int64) FractionalAvgPoolAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Performs fractional average pooling on the input.
+//
+// Fractional average pooling is similar to Fractional max pooling in the pooling
+// region generation step. The only difference is that after pooling regions are
+// generated, a mean operation is performed instead of a max operation in each
+// pooling region.
 //
 // Arguments:
-//	input: The original input.
-//	grad: 4-D with shape `[batch, height, width, channels]`.  Gradients w.r.t. the
-// output of `max_pool`.
-//	argmax: The indices of the maximum values chosen for each output of `max_pool`.
-//	ksize: The size of the window for each dimension of the input tensor.
-//	strides: The stride of the sliding window for each dimension of the
-// input tensor.
-//	padding: The type of padding algorithm to use.
+//	value: 4-D with shape `[batch, height, width, channels]`.
+//	pooling_ratio: Pooling ratio for each dimension of `value`, currently only
+// supports row and col dimension and should be >= 1.0. For example, a valid
+// pooling ratio looks like [1.0, 1.44, 1.73, 1.0]. The first and last elements
+// must be 1.0 because we don't allow pooling on batch and channels
+// dimensions. 1.44 and 1.73 are pooling ratio on height and width dimensions
+// respectively.
 //
-// Returns Gradients w.r.t. the input of `max_pool`.
-func MaxPoolGradWithArgmax(scope *Scope, input tf.Output, grad tf.Output, argmax tf.Output, ksize []int64, strides []int64, padding string) (output tf.Output) {
+// Returns output tensor after fractional avg pooling.row pooling sequence, needed to calculate gradient.column pooling sequence, needed to calculate gradient.
+func FractionalAvgPool(scope *Scope, value tf.Output, pooling_ratio []float32, optional ...FractionalAvgPoolAttr) (output tf.Output, row_pooling_sequence tf.Output, col_pooling_sequence tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	attrs := map[string]interface{}{"pooling_ratio": pooling_ratio}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "MaxPoolGradWithArgmax",
+		Type: "FractionalAvgPool",
 		Input: []tf.Input{
-			input, grad, argmax,
+			value,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// AvgPool3DAttr is an optional argument to AvgPool3D.
-type AvgPool3DAttr func(optionalAttr)
+// RandomCropAttr is an optional argument to RandomCrop.
+type RandomCropAttr func(optionalAttr)
 
-// AvgPool3DDataFormat sets the optional data_format attribute to value.
+// RandomCropSeed sets the optional seed attribute to value.
 //
-// value: The data format of the input and output data. With the
-// default format "NDHWC", the data is stored in the order of:
-//     [batch, in_depth, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCDHW", the data storage order is:
-//     [batch, in_channels, in_depth, in_height, in_width].
-// If not specified, defaults to "NDHWC"
-func AvgPool3DDataFormat(value string) AvgPool3DAttr {
+// value: If either seed or seed2 are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func RandomCropSeed(value int64) RandomCropAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["seed"] = value
 	}
 }
 
-// Performs 3D average pooling on the input.
+// RandomCropSeed2 sets the optional seed2 attribute to value.
+//
+// value: An second seed to avoid seed collision.
+// If not specified, defaults to 0
+func RandomCropSeed2(value int64) RandomCropAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Randomly crop `image`.
+//
+// DEPRECATED at GraphDef version 8: Random crop is now pure Python
+//
+// `size` is a 1-D int64 tensor with 2 elements representing the crop height and
+// width.  The values must be non negative.
+//
+// This Op picks a random location in `image` and crops a `height` by `width`
+// rectangle from that location.  The random location is picked so the cropped
+// area will fit inside the original image.
 //
 // Arguments:
-//	input: Shape `[batch, depth, rows, cols, channels]` tensor to pool over.
-//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
-// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
-//	strides: 1-D tensor of length 5. The stride of the sliding window for each
-// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
-//	padding: The type of padding algorithm to use.
+//	image: 3-D of shape `[height, width, channels]`.
+//	size: 1-D of length 2 containing: `crop_height`, `crop_width`..
 //
-// Returns The average pooled output tensor.
-func AvgPool3D(scope *Scope, input tf.Output, ksize []int64, strides []int64, padding string, optional ...AvgPool3DAttr) (output tf.Output) {
+// Returns 3-D of shape `[crop_height, crop_width, channels].`
+func RandomCrop(scope *Scope, image tf.Output, size tf.Output, optional ...RandomCropAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "AvgPool3D",
+		Type: "RandomCrop",
 		Input: []tf.Input{
-			input,
+			image, size,
 		},
 		Attrs: attrs,
 	}
@@ -4345,254 +4593,263 @@ func AvgPool3D(scope *Scope, input tf.Output, ksize []int64, strides []int64, pa
 	return op.Output(0)
 }
 
-// Returns element-wise remainder of division. This emulates C semantics in that
+// TopKV2Attr is an optional argument to TopKV2.
+type TopKV2Attr func(optionalAttr)
+
+// TopKV2Sorted sets the optional sorted attribute to value.
 //
-// the result here is consistent with a truncating divide. E.g.
-// `tf.truncatediv(x, y) * y + truncate_mod(x, y) = x`.
+// value: If true the resulting `k` elements will be sorted by the values in
+// descending order.
+// If not specified, defaults to true
+func TopKV2Sorted(value bool) TopKV2Attr {
+	return func(m optionalAttr) {
+		m["sorted"] = value
+	}
+}
+
+// Finds values and indices of the `k` largest elements for the last dimension.
 //
-// *NOTE*: `Mod` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Mod(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// If the input is a vector (rank-1), finds the `k` largest entries in the vector
+// and outputs their values and indices as vectors.  Thus `values[j]` is the
+// `j`-th largest entry in `input`, and its index is `indices[j]`.
+//
+// For matrices (resp. higher rank input), computes the top `k` entries in each
+// row (resp. vector along the last dimension).  Thus,
+//
+//     values.shape = indices.shape = input.shape[:-1] + [k]
+//
+// If two elements are equal, the lower-index element appears first.
+//
+// Arguments:
+//	input: 1-D or higher with last dimension at least `k`.
+//	k: 0-D.  Number of top elements to look for along the last dimension (along each
+// row for matrices).
+//
+// Returns The `k` largest elements along each last dimensional slice.The indices of `values` within the last dimension of `input`.
+func TopKV2(scope *Scope, input tf.Output, k tf.Output, optional ...TopKV2Attr) (values tf.Output, indices tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Mod",
+		Type: "TopKV2",
 		Input: []tf.Input{
-			x, y,
+			input, k,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// DepthToSpaceAttr is an optional argument to DepthToSpace.
-type DepthToSpaceAttr func(optionalAttr)
-
-// DepthToSpaceDataFormat sets the optional data_format attribute to value.
-// If not specified, defaults to "NHWC"
-func DepthToSpaceDataFormat(value string) DepthToSpaceAttr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
+// Returns x // y element-wise.
+//
+// *NOTE*: `FloorDiv` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func FloorDiv(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "FloorDiv",
+		Input: []tf.Input{
+			x, y,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// DepthToSpace for tensors of type T.
-//
-// Rearranges data from depth into blocks of spatial data.
-// This is the reverse transformation of SpaceToDepth. More specifically,
-// this op outputs a copy of the input tensor where values from the `depth`
-// dimension are moved in spatial blocks to the `height` and `width` dimensions.
-// The attr `block_size` indicates the input block size and how the data is moved.
-//
-//   * Chunks of data of size `block_size * block_size` from depth are rearranged
-//     into non-overlapping blocks of size `block_size x block_size`
-//   * The width the output tensor is `input_depth * block_size`, whereas the
-//     height is `input_height * block_size`.
-//   * The Y, X coordinates within each block of the output image are determined
-//     by the high order component of the input channel index.
-//   * The depth of the input tensor must be divisible by
-//     `block_size * block_size`.
+// Returns a batched diagonal tensor with a given batched diagonal values.
 //
-// The `data_format` attr specifies the layout of the input and output tensors
-// with the following options:
-//   "NHWC": `[ batch, height, width, channels ]`
-//   "NCHW": `[ batch, channels, height, width ]`
-//   "NCHW_VECT_C":
-//       `qint8 [ batch, channels / 4, height, width, 4 ]`
+// Given a `diagonal`, this operation returns a tensor with the `diagonal` and
+// everything else padded with zeros. The diagonal is computed as follows:
 //
-// It is useful to consider the operation as transforming a 6-D Tensor.
-// e.g. for data_format = NHWC,
-//      Each element in the input tensor can be specified via 6 coordinates,
-//      ordered by decreasing memory layout significance as:
-//      n,iY,iX,bY,bX,oC  (where n=batch index, iX, iY means X or Y coordinates
-//                         within the input image, bX, bY means coordinates
-//                         within the output block, oC means output channels).
-//      The output would be the input transposed to the following layout:
-//      n,iY,bY,iX,bX,oC
+// Assume `diagonal` has `k` dimensions `[I, J, K, ..., N]`, then the output is a
+// tensor of rank `k+1` with dimensions [I, J, K, ..., N, N]` where:
 //
-// This operation is useful for resizing the activations between convolutions
-// (but keeping all data), e.g. instead of pooling. It is also useful for training
-// purely convolutional models.
+// `output[i, j, k, ..., m, n] = 1{m=n} * diagonal[i, j, k, ..., n]`.
 //
-// For example, given an input of shape `[1, 1, 1, 4]`, data_format = "NHWC" and
-// block_size = 2:
+// For example:
 //
 // ```
-// x = [[[[1, 2, 3, 4]]]]
+// # 'diagonal' is [[1, 2, 3, 4], [5, 6, 7, 8]]
 //
-// ```
+// and diagonal.shape = (2, 4)
 //
-// This operation will output a tensor of shape `[1, 2, 2, 1]`:
+// tf.matrix_diag(diagonal) ==> [[[1, 0, 0, 0]
+//                                      [0, 2, 0, 0]
+//                                      [0, 0, 3, 0]
+//                                      [0, 0, 0, 4]],
+//                                     [[5, 0, 0, 0]
+//                                      [0, 6, 0, 0]
+//                                      [0, 0, 7, 0]
+//                                      [0, 0, 0, 8]]]
 //
-// ```
-//    [[[[1], [2]],
-//      [[3], [4]]]]
+// which has shape (2, 4, 4)
 // ```
 //
-// Here, the input has a batch of 1 and each batch element has shape `[1, 1, 4]`,
-// the corresponding output will have 2x2 elements and will have a depth of
-// 1 channel (1 = `4 / (block_size * block_size)`).
-// The output element shape is `[2, 2, 1]`.
-//
-// For an input tensor with larger depth, here of shape `[1, 1, 1, 12]`, e.g.
+// Arguments:
+//	diagonal: Rank `k`, where `k >= 1`.
 //
-// ```
-// x = [[[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]]]]
-// ```
-//
-// This operation, for block size of 2, will return the following tensor of shape
-// `[1, 2, 2, 3]`
-//
-// ```
-//    [[[[1, 2, 3], [4, 5, 6]],
-//      [[7, 8, 9], [10, 11, 12]]]]
+// Returns Rank `k+1`, with `output.shape = diagonal.shape + [diagonal.shape[-1]]`.
+func MatrixDiag(scope *Scope, diagonal tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "MatrixDiag",
+		Input: []tf.Input{
+			diagonal,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes the inverse permutation of a tensor.
 //
-// ```
+// This operation computes the inverse of an index permutation. It takes a 1-D
+// integer tensor `x`, which represents the indices of a zero-based array, and
+// swaps each value with its index position. In other words, for an output tensor
+// `y` and an input tensor `x`, this operation computes the following:
 //
-// Similarly, for the following input of shape `[1 2 2 4]`, and a block size of 2:
+// `y[x[i]] = i for i in [0, 1, ..., len(x) - 1]`
 //
-// ```
-// x =  [[[[1, 2, 3, 4],
-//        [5, 6, 7, 8]],
-//       [[9, 10, 11, 12],
-//        [13, 14, 15, 16]]]]
-// ```
+// The values must include 0. There can be no duplicate values or negative values.
 //
-// the operator will return the following tensor of shape `[1 4 4 1]`:
+// For example:
 //
 // ```
-// x = [[[ [1],   [2],  [5],  [6]],
-//       [ [3],   [4],  [7],  [8]],
-//       [ [9],  [10], [13],  [14]],
-//       [ [11], [12], [15],  [16]]]]
-//
+// # tensor `x` is [3, 4, 0, 2, 1]
+// invert_permutation(x) ==> [2, 4, 3, 0, 1]
 // ```
 //
 // Arguments:
+//	x: 1-D.
 //
-//	block_size: The size of the spatial block, same as in Space2Depth.
-func DepthToSpace(scope *Scope, input tf.Output, block_size int64, optional ...DepthToSpaceAttr) (output tf.Output) {
+// Returns 1-D.
+func InvertPermutation(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"block_size": block_size}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "DepthToSpace",
+		Type: "InvertPermutation",
 		Input: []tf.Input{
-			input,
+			x,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Conv3DBackpropInputV2Attr is an optional argument to Conv3DBackpropInputV2.
-type Conv3DBackpropInputV2Attr func(optionalAttr)
-
-// Conv3DBackpropInputV2DataFormat sets the optional data_format attribute to value.
+// Computes log softmax activations.
 //
-// value: The data format of the input and output data. With the
-// default format "NDHWC", the data is stored in the order of:
-//     [batch, in_depth, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCDHW", the data storage order is:
-//     [batch, in_channels, in_depth, in_height, in_width].
-// If not specified, defaults to "NDHWC"
-func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// Conv3DBackpropInputV2Dilations sets the optional dilations attribute to value.
+// For each batch `i` and class `j` we have
 //
-// value: 1-D tensor of length 5.  The dilation factor for each dimension of
-// `input`. If set to k > 1, there will be k-1 skipped cells between each
-// filter element on that dimension. The dimension order is determined by the
-// value of `data_format`, see above for details. Dilations in the batch and
-// depth dimensions must be 1.
-// If not specified, defaults to <i:1 i:1 i:1 i:1 i:1 >
-func Conv3DBackpropInputV2Dilations(value []int64) Conv3DBackpropInputV2Attr {
-	return func(m optionalAttr) {
-		m["dilations"] = value
+//     logsoftmax[i, j] = logits[i, j] - log(sum(exp(logits[i])))
+//
+// Arguments:
+//	logits: 2-D with shape `[batch_size, num_classes]`.
+//
+// Returns Same shape as `logits`.
+func LogSoftmax(scope *Scope, logits tf.Output) (logsoftmax tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "LogSoftmax",
+		Input: []tf.Input{
+			logits,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Computes the gradients of 3-D convolution with respect to the input.
+// Returns the truth value of (x <= y) element-wise.
 //
-// Arguments:
-//	input_sizes: An integer vector representing the tensor shape of `input`,
-// where `input` is a 5-D
-// `[batch, depth, rows, cols, in_channels]` tensor.
-//	filter: Shape `[depth, rows, cols, in_channels, out_channels]`.
-// `in_channels` must match between `input` and `filter`.
-//	out_backprop: Backprop signal of shape `[batch, out_depth, out_rows, out_cols,
-// out_channels]`.
-//	strides: 1-D tensor of length 5. The stride of the sliding window for each
-// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
-//	padding: The type of padding algorithm to use.
-func Conv3DBackpropInputV2(scope *Scope, input_sizes tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv3DBackpropInputV2Attr) (output tf.Output) {
+// *NOTE*: `LessEqual` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func LessEqual(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "Conv3DBackpropInputV2",
+		Type: "LessEqual",
 		Input: []tf.Input{
-			input_sizes, filter, out_backprop,
+			x, y,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes square root of x element-wise.
+// Computes softmax activations.
 //
-// I.e., \\(y = \sqrt{x} = x^{1/2}\\).
-func Sqrt(scope *Scope, x tf.Output) (y tf.Output) {
+// For each batch `i` and class `j` we have
+//
+//     softmax[i, j] = exp(logits[i, j]) / sum_j(exp(logits[i, j]))
+//
+// Arguments:
+//	logits: 2-D with shape `[batch_size, num_classes]`.
+//
+// Returns Same shape as `logits`.
+func Softmax(scope *Scope, logits tf.Output) (softmax tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Sqrt",
+		Type: "Softmax",
 		Input: []tf.Input{
-			x,
+			logits,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the gradients of 3-D convolution with respect to the filter.
+// DecodeBmpAttr is an optional argument to DecodeBmp.
+type DecodeBmpAttr func(optionalAttr)
+
+// DecodeBmpChannels sets the optional channels attribute to value.
+// If not specified, defaults to 0
+func DecodeBmpChannels(value int64) DecodeBmpAttr {
+	return func(m optionalAttr) {
+		m["channels"] = value
+	}
+}
+
+// Decode the first frame of a BMP-encoded image to a uint8 tensor.
 //
-// DEPRECATED at GraphDef version 10: Use Conv3DBackpropFilterV2
+// The attr `channels` indicates the desired number of color channels for the
+// decoded image.
+//
+// Accepted values are:
+//
+// *   0: Use the number of channels in the BMP-encoded image.
+// *   3: output an RGB image.
+// *   4: output an RGBA image.
 //
 // Arguments:
-//	input: Shape `[batch, depth, rows, cols, in_channels]`.
-//	filter: Shape `[depth, rows, cols, in_channels, out_channels]`.
-// `in_channels` must match between `input` and `filter`.
-//	out_backprop: Backprop signal of shape `[batch, out_depth, out_rows, out_cols,
-// out_channels]`.
-//	strides: 1-D tensor of length 5. The stride of the sliding window for each
-// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
-//	padding: The type of padding algorithm to use.
-func Conv3DBackpropFilter(scope *Scope, input tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string) (output tf.Output) {
+//	contents: 0-D.  The BMP-encoded image.
+//
+// Returns 3-D with shape `[height, width, channels]`. RGB order
+func DecodeBmp(scope *Scope, contents tf.Output, optional ...DecodeBmpAttr) (image tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Conv3DBackpropFilter",
+		Type: "DecodeBmp",
 		Input: []tf.Input{
-			input, filter, out_backprop,
+			contents,
 		},
 		Attrs: attrs,
 	}
@@ -4600,113 +4857,88 @@ func Conv3DBackpropFilter(scope *Scope, input tf.Output, filter tf.Output, out_b
 	return op.Output(0)
 }
 
-// Computes the gradient for the rsqrt of `x` wrt its input.
+// Computes softsign gradients for a softsign operation.
 //
-// Specifically, `grad = dy * -0.5 * y^3`, where `y = rsqrt(x)`, and `dy`
-// is the corresponding input gradient.
-func RsqrtGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
+// Arguments:
+//	gradients: The backpropagated gradients to the corresponding softsign operation.
+//	features: The features passed as input to the corresponding softsign operation.
+//
+// Returns The gradients: `gradients / (1 + abs(features)) ** 2`.
+func SoftsignGrad(scope *Scope, gradients tf.Output, features tf.Output) (backprops tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "RsqrtGrad",
+		Type: "SoftsignGrad",
 		Input: []tf.Input{
-			y, dy,
+			gradients, features,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ReverseSequenceAttr is an optional argument to ReverseSequence.
-type ReverseSequenceAttr func(optionalAttr)
+// BatchMatMulAttr is an optional argument to BatchMatMul.
+type BatchMatMulAttr func(optionalAttr)
 
-// ReverseSequenceBatchDim sets the optional batch_dim attribute to value.
+// BatchMatMulAdjX sets the optional adj_x attribute to value.
 //
-// value: The dimension along which reversal is performed.
-// If not specified, defaults to 0
-func ReverseSequenceBatchDim(value int64) ReverseSequenceAttr {
+// value: If `True`, adjoint the slices of `x`. Defaults to `False`.
+// If not specified, defaults to false
+func BatchMatMulAdjX(value bool) BatchMatMulAttr {
 	return func(m optionalAttr) {
-		m["batch_dim"] = value
+		m["adj_x"] = value
 	}
 }
 
-// Reverses variable length slices.
+// BatchMatMulAdjY sets the optional adj_y attribute to value.
 //
-// This op first slices `input` along the dimension `batch_dim`, and for each
-// slice `i`, reverses the first `seq_lengths[i]` elements along
-// the dimension `seq_dim`.
+// value: If `True`, adjoint the slices of `y`. Defaults to `False`.
+// If not specified, defaults to false
+func BatchMatMulAdjY(value bool) BatchMatMulAttr {
+	return func(m optionalAttr) {
+		m["adj_y"] = value
+	}
+}
+
+// Multiplies slices of two tensors in batches.
 //
-// The elements of `seq_lengths` must obey `seq_lengths[i] <= input.dims[seq_dim]`,
-// and `seq_lengths` must be a vector of length `input.dims[batch_dim]`.
+// Multiplies all slices of `Tensor` `x` and `y` (each slice can be
+// viewed as an element of a batch), and arranges the individual results
+// in a single output tensor of the same batch size. Each of the
+// individual slices can optionally be adjointed (to adjoint a matrix
+// means to transpose and conjugate it) before multiplication by setting
+// the `adj_x` or `adj_y` flag to `True`, which are by default `False`.
 //
-// The output slice `i` along dimension `batch_dim` is then given by input
-// slice `i`, with the first `seq_lengths[i]` slices along dimension
-// `seq_dim` reversed.
+// The input tensors `x` and `y` are 2-D or higher with shape `[..., r_x, c_x]`
+// and `[..., r_y, c_y]`.
 //
-// For example:
+// The output tensor is 2-D or higher with shape `[..., r_o, c_o]`, where:
 //
-// ```
-// # Given this:
-// batch_dim = 0
-// seq_dim = 1
-// input.dims = (4, 8, ...)
-// seq_lengths = [7, 2, 3, 5]
-//
-// # then slices of input are reversed on seq_dim, but only up to seq_lengths:
-// output[0, 0:7, :, ...] = input[0, 7:0:-1, :, ...]
-// output[1, 0:2, :, ...] = input[1, 2:0:-1, :, ...]
-// output[2, 0:3, :, ...] = input[2, 3:0:-1, :, ...]
-// output[3, 0:5, :, ...] = input[3, 5:0:-1, :, ...]
-//
-// # while entries past seq_lens are copied through:
-// output[0, 7:, :, ...] = input[0, 7:, :, ...]
-// output[1, 2:, :, ...] = input[1, 2:, :, ...]
-// output[2, 3:, :, ...] = input[2, 3:, :, ...]
-// output[3, 2:, :, ...] = input[3, 2:, :, ...]
-// ```
-//
-// In contrast, if:
-//
-// ```
-// # Given this:
-// batch_dim = 2
-// seq_dim = 0
-// input.dims = (8, ?, 4, ...)
-// seq_lengths = [7, 2, 3, 5]
+//     r_o = c_x if adj_x else r_x
+//     c_o = r_y if adj_y else c_y
 //
-// # then slices of input are reversed on seq_dim, but only up to seq_lengths:
-// output[0:7, :, 0, :, ...] = input[7:0:-1, :, 0, :, ...]
-// output[0:2, :, 1, :, ...] = input[2:0:-1, :, 1, :, ...]
-// output[0:3, :, 2, :, ...] = input[3:0:-1, :, 2, :, ...]
-// output[0:5, :, 3, :, ...] = input[5:0:-1, :, 3, :, ...]
+// It is computed as:
 //
-// # while entries past seq_lens are copied through:
-// output[7:, :, 0, :, ...] = input[7:, :, 0, :, ...]
-// output[2:, :, 1, :, ...] = input[2:, :, 1, :, ...]
-// output[3:, :, 2, :, ...] = input[3:, :, 2, :, ...]
-// output[2:, :, 3, :, ...] = input[2:, :, 3, :, ...]
-// ```
+//     output[..., :, :] = matrix(x[..., :, :]) * matrix(y[..., :, :])
 //
 // Arguments:
-//	input: The input to reverse.
-//	seq_lengths: 1-D with length `input.dims(batch_dim)` and
-// `max(seq_lengths) <= input.dims(seq_dim)`
-//	seq_dim: The dimension which is partially reversed.
+//	x: 2-D or higher with shape `[..., r_x, c_x]`.
+//	y: 2-D or higher with shape `[..., r_y, c_y]`.
 //
-// Returns The partially reversed input. It has the same shape as `input`.
-func ReverseSequence(scope *Scope, input tf.Output, seq_lengths tf.Output, seq_dim int64, optional ...ReverseSequenceAttr) (output tf.Output) {
+// Returns 3-D or higher with shape `[..., r_o, c_o]`
+func BatchMatMul(scope *Scope, x tf.Output, y tf.Output, optional ...BatchMatMulAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"seq_dim": seq_dim}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ReverseSequence",
+		Type: "BatchMatMul",
 		Input: []tf.Input{
-			input, seq_lengths,
+			x, y,
 		},
 		Attrs: attrs,
 	}
@@ -4714,274 +4946,209 @@ func ReverseSequence(scope *Scope, input tf.Output, seq_lengths tf.Output, seq_d
 	return op.Output(0)
 }
 
-// DepthwiseConv2dNativeAttr is an optional argument to DepthwiseConv2dNative.
-type DepthwiseConv2dNativeAttr func(optionalAttr)
-
-// DepthwiseConv2dNativeDataFormat sets the optional data_format attribute to value.
-//
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, height, width, channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, channels, height, width].
-// If not specified, defaults to "NHWC"
-func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// DepthwiseConv2dNativeDilations sets the optional dilations attribute to value.
-//
-// value: 1-D tensor of length 4.  The dilation factor for each dimension of
-// `input`. If set to k > 1, there will be k-1 skipped cells between each filter
-// element on that dimension. The dimension order is determined by the value of
-// `data_format`, see above for details. Dilations in the batch and depth
-// dimensions must be 1.
-// If not specified, defaults to <i:1 i:1 i:1 i:1 >
-func DepthwiseConv2dNativeDilations(value []int64) DepthwiseConv2dNativeAttr {
-	return func(m optionalAttr) {
-		m["dilations"] = value
-	}
-}
-
-// Computes a 2-D depthwise convolution given 4-D `input` and `filter` tensors.
-//
-// Given an input tensor of shape `[batch, in_height, in_width, in_channels]`
-// and a filter / kernel tensor of shape
-// `[filter_height, filter_width, in_channels, channel_multiplier]`, containing
-// `in_channels` convolutional filters of depth 1, `depthwise_conv2d` applies
-// a different filter to each input channel (expanding from 1 channel to
-// `channel_multiplier` channels for each), then concatenates the results
-// together. Thus, the output has `in_channels * channel_multiplier` channels.
-//
-// ```
-// for k in 0..in_channels-1
-//   for q in 0..channel_multiplier-1
-//     output[b, i, j, k * channel_multiplier + q] =
-//       sum_{di, dj} input[b, strides[1] * i + di, strides[2] * j + dj, k] *
-//                         filter[di, dj, k, q]
-// ```
-//
-// Must have `strides[0] = strides[3] = 1`.  For the most common case of the same
-// horizontal and vertices strides, `strides = [1, stride, stride, 1]`.
-//
-// Arguments:
-//
+// Returns which elements of x are NaN.
 //
-//	strides: 1-D of length 4.  The stride of the sliding window for each dimension
-// of `input`.
-//	padding: The type of padding algorithm to use.
-func DepthwiseConv2dNative(scope *Scope, input tf.Output, filter tf.Output, strides []int64, padding string, optional ...DepthwiseConv2dNativeAttr) (output tf.Output) {
+// @compatibility(numpy)
+// Equivalent to np.isnan
+// @end_compatibility
+func IsNan(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "DepthwiseConv2dNative",
+		Type: "IsNan",
 		Input: []tf.Input{
-			input, filter,
+			x,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// MaxPoolGradV2Attr is an optional argument to MaxPoolGradV2.
-type MaxPoolGradV2Attr func(optionalAttr)
-
-// MaxPoolGradV2DataFormat sets the optional data_format attribute to value.
-//
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// If not specified, defaults to "NHWC"
-func MaxPoolGradV2DataFormat(value string) MaxPoolGradV2Attr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// Computes gradients of the maxpooling function.
+// Computes rectified linear gradients for a Relu operation.
 //
 // Arguments:
-//	orig_input: The original input tensor.
-//	orig_output: The original output tensor.
-//	grad: 4-D.  Gradients w.r.t. the output of `max_pool`.
-//	ksize: The size of the window for each dimension of the input tensor.
-//	strides: The stride of the sliding window for each dimension of the
-// input tensor.
-//	padding: The type of padding algorithm to use.
+//	gradients: The backpropagated gradients to the corresponding Relu operation.
+//	features: The features passed as input to the corresponding Relu operation, OR
+// the outputs of that operation (both work equivalently).
 //
-// Returns Gradients w.r.t. the input to `max_pool`.
-func MaxPoolGradV2(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize tf.Output, strides tf.Output, padding string, optional ...MaxPoolGradV2Attr) (output tf.Output) {
+// Returns `gradients * (features > 0)`.
+func ReluGrad(scope *Scope, gradients tf.Output, features tf.Output) (backprops tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "MaxPoolGradV2",
+		Type: "ReluGrad",
 		Input: []tf.Input{
-			orig_input, orig_output, grad, ksize, strides,
+			gradients, features,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Restore a reader to a previously saved state.
-//
-// Not all Readers support being restored, so this can produce an
-// Unimplemented error.
+// Computes the gradient of morphological 2-D dilation with respect to the input.
 //
 // Arguments:
-//	reader_handle: Handle to a Reader.
-//	state: Result of a ReaderSerializeState of a Reader with type
-// matching reader_handle.
+//	input: 4-D with shape `[batch, in_height, in_width, depth]`.
+//	filter: 3-D with shape `[filter_height, filter_width, depth]`.
+//	out_backprop: 4-D with shape `[batch, out_height, out_width, depth]`.
+//	strides: 1-D of length 4. The stride of the sliding window for each dimension of
+// the input tensor. Must be: `[1, stride_height, stride_width, 1]`.
+//	rates: 1-D of length 4. The input stride for atrous morphological dilation.
+// Must be: `[1, rate_height, rate_width, 1]`.
+//	padding: The type of padding algorithm to use.
 //
-// Returns the created operation.
-func ReaderRestoreStateV2(scope *Scope, reader_handle tf.Output, state tf.Output) (o *tf.Operation) {
+// Returns 4-D with shape `[batch, in_height, in_width, depth]`.
+func Dilation2DBackpropInput(scope *Scope, input tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, rates []int64, padding string) (in_backprop tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"strides": strides, "rates": rates, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "ReaderRestoreStateV2",
+		Type: "Dilation2DBackpropInput",
 		Input: []tf.Input{
-			reader_handle, state,
+			input, filter, out_backprop,
 		},
+		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// MaxPoolGradAttr is an optional argument to MaxPoolGrad.
-type MaxPoolGradAttr func(optionalAttr)
+// CTCBeamSearchDecoderAttr is an optional argument to CTCBeamSearchDecoder.
+type CTCBeamSearchDecoderAttr func(optionalAttr)
 
-// MaxPoolGradDataFormat sets the optional data_format attribute to value.
+// CTCBeamSearchDecoderMergeRepeated sets the optional merge_repeated attribute to value.
 //
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// If not specified, defaults to "NHWC"
-func MaxPoolGradDataFormat(value string) MaxPoolGradAttr {
+// value: If true, merge repeated classes in output.
+// If not specified, defaults to true
+func CTCBeamSearchDecoderMergeRepeated(value bool) CTCBeamSearchDecoderAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["merge_repeated"] = value
 	}
 }
 
-// Computes gradients of the maxpooling function.
+// Performs beam search decoding on the logits given in input.
+//
+// A note about the attribute merge_repeated: For the beam search decoder,
+// this means that if consecutive entries in a beam are the same, only
+// the first of these is emitted.  That is, when the top path is "A B B B B",
+// "A B" is returned if merge_repeated = True but "A B B B B" is
+// returned if merge_repeated = False.
 //
 // Arguments:
-//	orig_input: The original input tensor.
-//	orig_output: The original output tensor.
-//	grad: 4-D.  Gradients w.r.t. the output of `max_pool`.
-//	ksize: The size of the window for each dimension of the input tensor.
-//	strides: The stride of the sliding window for each dimension of the
-// input tensor.
-//	padding: The type of padding algorithm to use.
+//	inputs: 3-D, shape: `(max_time x batch_size x num_classes)`, the logits.
+//	sequence_length: A vector containing sequence lengths, size `(batch)`.
+//	beam_width: A scalar >= 0 (beam search beam width).
+//	top_paths: A scalar >= 0, <= beam_width (controls output size).
 //
-// Returns Gradients w.r.t. the input to `max_pool`.
-func MaxPoolGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPoolGradAttr) (output tf.Output) {
+// Returns A list (length: top_paths) of indices matrices.  Matrix j,
+// size `(total_decoded_outputs[j] x 2)`, has indices of a
+// `SparseTensor<int64, 2>`.  The rows store: [batch, time].A list (length: top_paths) of values vectors.  Vector j,
+// size `(length total_decoded_outputs[j])`, has the values of a
+// `SparseTensor<int64, 2>`.  The vector stores the decoded classes for beam j.A list (length: top_paths) of shape vector.  Vector j,
+// size `(2)`, stores the shape of the decoded `SparseTensor[j]`.
+// Its values are: `[batch_size, max_decoded_length[j]]`.A matrix, shaped: `(batch_size x top_paths)`.  The
+// sequence log-probabilities.
+func CTCBeamSearchDecoder(scope *Scope, inputs tf.Output, sequence_length tf.Output, beam_width int64, top_paths int64, optional ...CTCBeamSearchDecoderAttr) (decoded_indices []tf.Output, decoded_values []tf.Output, decoded_shape []tf.Output, log_probability tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	attrs := map[string]interface{}{"beam_width": beam_width, "top_paths": top_paths}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MaxPoolGrad",
+		Type: "CTCBeamSearchDecoder",
 		Input: []tf.Input{
-			orig_input, orig_output, grad,
+			inputs, sequence_length,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// CropAndResizeAttr is an optional argument to CropAndResize.
-type CropAndResizeAttr func(optionalAttr)
-
-// CropAndResizeMethod sets the optional method attribute to value.
-//
-// value: A string specifying the interpolation method. Only 'bilinear' is
-// supported for now.
-// If not specified, defaults to "bilinear"
-func CropAndResizeMethod(value string) CropAndResizeAttr {
-	return func(m optionalAttr) {
-		m["method"] = value
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if decoded_indices, idx, err = makeOutputList(op, idx, "decoded_indices"); err != nil {
+		scope.UpdateErr("CTCBeamSearchDecoder", err)
+		return
+	}
+	if decoded_values, idx, err = makeOutputList(op, idx, "decoded_values"); err != nil {
+		scope.UpdateErr("CTCBeamSearchDecoder", err)
+		return
+	}
+	if decoded_shape, idx, err = makeOutputList(op, idx, "decoded_shape"); err != nil {
+		scope.UpdateErr("CTCBeamSearchDecoder", err)
+		return
 	}
+	log_probability = op.Output(idx)
+	return decoded_indices, decoded_values, decoded_shape, log_probability
 }
 
-// CropAndResizeExtrapolationValue sets the optional extrapolation_value attribute to value.
+// AudioSpectrogramAttr is an optional argument to AudioSpectrogram.
+type AudioSpectrogramAttr func(optionalAttr)
+
+// AudioSpectrogramMagnitudeSquared sets the optional magnitude_squared attribute to value.
 //
-// value: Value used for extrapolation, when applicable.
-// If not specified, defaults to 0
-func CropAndResizeExtrapolationValue(value float32) CropAndResizeAttr {
+// value: Whether to return the squared magnitude or just the
+// magnitude. Using squared magnitude can avoid extra calculations.
+// If not specified, defaults to false
+func AudioSpectrogramMagnitudeSquared(value bool) AudioSpectrogramAttr {
 	return func(m optionalAttr) {
-		m["extrapolation_value"] = value
+		m["magnitude_squared"] = value
 	}
 }
 
-// Extracts crops from the input image tensor and bilinearly resizes them (possibly
+// Produces a visualization of audio data over time.
 //
-// with aspect ratio change) to a common output size specified by `crop_size`. This
-// is more general than the `crop_to_bounding_box` op which extracts a fixed size
-// slice from the input image and does not allow resizing or aspect ratio change.
+// Spectrograms are a standard way of representing audio information as a series of
+// slices of frequency information, one slice for each window of time. By joining
+// these together into a sequence, they form a distinctive fingerprint of the sound
+// over time.
 //
-// Returns a tensor with `crops` from the input `image` at positions defined at the
-// bounding box locations in `boxes`. The cropped boxes are all resized (with
-// bilinear interpolation) to a fixed `size = [crop_height, crop_width]`. The
-// result is a 4-D tensor `[num_boxes, crop_height, crop_width, depth]`. The
-// resizing is corner aligned. In particular, if `boxes = [[0, 0, 1, 1]]`, the
-// method will give identical results to using `tf.image.resize_bilinear()`
-// with `align_corners=True`.
+// This op expects to receive audio data as an input, stored as floats in the range
+// -1 to 1, together with a window width in samples, and a stride specifying how
+// far to move the window between slices. From this it generates a three
+// dimensional output. The lowest dimension has an amplitude value for each
+// frequency during that time slice. The next dimension is time, with successive
+// frequency slices. The final dimension is for the channels in the input, so a
+// stereo audio input would have two here for example.
+//
+// This means the layout when converted and saved as an image is rotated 90 degrees
+// clockwise from a typical spectrogram. Time is descending down the Y axis, and
+// the frequency decreases from left to right.
+//
+// Each value in the result represents the square root of the sum of the real and
+// imaginary parts of an FFT on the current window of samples. In this way, the
+// lowest dimension represents the power of each frequency in the current window,
+// and adjacent windows are concatenated in the next dimension.
+//
+// To get a more intuitive and visual look at what this operation does, you can run
+// tensorflow/examples/wav_to_spectrogram to read in an audio file and save out the
+// resulting spectrogram as a PNG image.
 //
 // Arguments:
-//	image: A 4-D tensor of shape `[batch, image_height, image_width, depth]`.
-// Both `image_height` and `image_width` need to be positive.
-//	boxes: A 2-D tensor of shape `[num_boxes, 4]`. The `i`-th row of the tensor
-// specifies the coordinates of a box in the `box_ind[i]` image and is specified
-// in normalized coordinates `[y1, x1, y2, x2]`. A normalized coordinate value of
-// `y` is mapped to the image coordinate at `y * (image_height - 1)`, so as the
-// `[0, 1]` interval of normalized image height is mapped to
-// `[0, image_height - 1]` in image height coordinates. We do allow `y1` > `y2`, in
-// which case the sampled crop is an up-down flipped version of the original
-// image. The width dimension is treated similarly. Normalized coordinates
-// outside the `[0, 1]` range are allowed, in which case we use
-// `extrapolation_value` to extrapolate the input image values.
-//	box_ind: A 1-D tensor of shape `[num_boxes]` with int32 values in `[0, batch)`.
-// The value of `box_ind[i]` specifies the image that the `i`-th box refers to.
-//	crop_size: A 1-D tensor of 2 elements, `size = [crop_height, crop_width]`. All
-// cropped image patches are resized to this size. The aspect ratio of the image
-// content is not preserved. Both `crop_height` and `crop_width` need to be
-// positive.
+//	input: Float representation of audio data.
+//	window_size: How wide the input window is in samples. For the highest efficiency
+// this should be a power of two, but other values are accepted.
+//	stride: How widely apart the center of adjacent sample windows should be.
 //
-// Returns A 4-D tensor of shape `[num_boxes, crop_height, crop_width, depth]`.
-func CropAndResize(scope *Scope, image tf.Output, boxes tf.Output, box_ind tf.Output, crop_size tf.Output, optional ...CropAndResizeAttr) (crops tf.Output) {
+// Returns 3D representation of the audio frequencies as an image.
+func AudioSpectrogram(scope *Scope, input tf.Output, window_size int64, stride int64, optional ...AudioSpectrogramAttr) (spectrogram tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"window_size": window_size, "stride": stride}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "CropAndResize",
+		Type: "AudioSpectrogram",
 		Input: []tf.Input{
-			image, boxes, box_ind, crop_size,
+			input,
 		},
 		Attrs: attrs,
 	}
@@ -4989,236 +5156,127 @@ func CropAndResize(scope *Scope, image tf.Output, boxes tf.Output, box_ind tf.Ou
 	return op.Output(0)
 }
 
-// Fills empty rows in the input 2-D `SparseTensor` with a default value.
-//
-// The input `SparseTensor` is represented via the tuple of inputs
-// (`indices`, `values`, `dense_shape`).  The output `SparseTensor` has the
-// same `dense_shape` but with indices `output_indices` and values
-// `output_values`.
-//
-// This op inserts a single entry for every row that doesn't have any values.
-// The index is created as `[row, 0, ..., 0]` and the inserted value
-// is `default_value`.
-//
-// For example, suppose `sp_input` has shape `[5, 6]` and non-empty values:
-//
-//     [0, 1]: a
-//     [0, 3]: b
-//     [2, 0]: c
-//     [3, 1]: d
-//
-// Rows 1 and 4 are empty, so the output will be of shape `[5, 6]` with values:
-//
-//     [0, 1]: a
-//     [0, 3]: b
-//     [1, 0]: default_value
-//     [2, 0]: c
-//     [3, 1]: d
-//     [4, 0]: default_value
-//
-// The output `SparseTensor` will be in row-major order and will have the
-// same shape as the input.
-//
-// This op also returns an indicator vector shaped `[dense_shape[0]]` such that
-//
-//     empty_row_indicator[i] = True iff row i was an empty row.
+// Compute the polygamma function \\(\psi^{(n)}(x)\\).
 //
-// And a reverse index map vector shaped `[indices.shape[0]]` that is used during
-// backpropagation,
+// The polygamma function is defined as:
 //
-//     reverse_index_map[j] = out_j s.t. indices[j, :] == output_indices[out_j, :]
 //
-// Arguments:
-//	indices: 2-D. the indices of the sparse tensor.
-//	values: 1-D. the values of the sparse tensor.
-//	dense_shape: 1-D. the shape of the sparse tensor.
-//	default_value: 0-D. default value to insert into location `[row, 0, ..., 0]`
-//   for rows missing from the input sparse tensor.
-// output indices: 2-D. the indices of the filled sparse tensor.
+// \\(\psi^{(n)}(x) = \frac{d^n}{dx^n} \psi(x)\\)
 //
-// Returns 1-D. the values of the filled sparse tensor.1-D. whether the dense row was missing in the
-// input sparse tensor.1-D. a map from the input indices to the output indices.
-func SparseFillEmptyRows(scope *Scope, indices tf.Output, values tf.Output, dense_shape tf.Output, default_value tf.Output) (output_indices tf.Output, output_values tf.Output, empty_row_indicator tf.Output, reverse_index_map tf.Output) {
+// where \\(\psi(x)\\) is the digamma function.
+func Polygamma(scope *Scope, a tf.Output, x tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseFillEmptyRows",
+		Type: "Polygamma",
 		Input: []tf.Input{
-			indices, values, dense_shape, default_value,
+			a, x,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3)
+	return op.Output(0)
 }
 
-// Reverses specific dimensions of a tensor.
-//
-// Given a `tensor`, and a `bool` tensor `dims` representing the dimensions
-// of `tensor`, this operation reverses each dimension i of `tensor` where
-// `dims[i]` is `True`.
-//
-// `tensor` can have up to 8 dimensions. The number of dimensions
-// of `tensor` must equal the number of elements in `dims`. In other words:
-//
-// `rank(tensor) = size(dims)`
-//
-// For example:
-//
-// ```
-// # tensor 't' is [[[[ 0,  1,  2,  3],
-// #                  [ 4,  5,  6,  7],
-// #                  [ 8,  9, 10, 11]],
-// #                 [[12, 13, 14, 15],
-// #                  [16, 17, 18, 19],
-// #                  [20, 21, 22, 23]]]]
-// # tensor 't' shape is [1, 2, 3, 4]
-//
-// # 'dims' is [False, False, False, True]
-// reverse(t, dims) ==> [[[[ 3,  2,  1,  0],
-//                         [ 7,  6,  5,  4],
-//                         [ 11, 10, 9, 8]],
-//                        [[15, 14, 13, 12],
-//                         [19, 18, 17, 16],
-//                         [23, 22, 21, 20]]]]
-//
-// # 'dims' is [False, True, False, False]
-// reverse(t, dims) ==> [[[[12, 13, 14, 15],
-//                         [16, 17, 18, 19],
-//                         [20, 21, 22, 23]
-//                        [[ 0,  1,  2,  3],
-//                         [ 4,  5,  6,  7],
-//                         [ 8,  9, 10, 11]]]]
-//
-// # 'dims' is [False, False, True, False]
-// reverse(t, dims) ==> [[[[8, 9, 10, 11],
-//                         [4, 5, 6, 7],
-//                         [0, 1, 2, 3]]
-//                        [[20, 21, 22, 23],
-//                         [16, 17, 18, 19],
-//                         [12, 13, 14, 15]]]]
-// ```
+// Computes second-order gradients of the maxpooling function.
 //
 // Arguments:
-//	tensor: Up to 8-D.
-//	dims: 1-D. The dimensions to reverse.
+//	input: The original input.
+//	grad: 4-D with shape `[batch, height, width, channels]`.  Gradients w.r.t. the
+// input of `max_pool`.
+//	argmax: The indices of the maximum values chosen for each output of `max_pool`.
+//	ksize: The size of the window for each dimension of the input tensor.
+//	strides: The stride of the sliding window for each dimension of the
+// input tensor.
+//	padding: The type of padding algorithm to use.
 //
-// Returns The same shape as `tensor`.
-func Reverse(scope *Scope, tensor tf.Output, dims tf.Output) (output tf.Output) {
+// Returns Gradients of gradients w.r.t. the input of `max_pool`.
+func MaxPoolGradGradWithArgmax(scope *Scope, input tf.Output, grad tf.Output, argmax tf.Output, ksize []int64, strides []int64, padding string) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "Reverse",
+		Type: "MaxPoolGradGradWithArgmax",
 		Input: []tf.Input{
-			tensor, dims,
+			input, grad, argmax,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes log softmax activations.
-//
-// For each batch `i` and class `j` we have
+// MaxPoolGradGradV2Attr is an optional argument to MaxPoolGradGradV2.
+type MaxPoolGradGradV2Attr func(optionalAttr)
+
+// MaxPoolGradGradV2DataFormat sets the optional data_format attribute to value.
 //
-//     logsoftmax[i, j] = logits[i, j] - log(sum(exp(logits[i])))
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// If not specified, defaults to "NHWC"
+func MaxPoolGradGradV2DataFormat(value string) MaxPoolGradGradV2Attr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// Computes second-order gradients of the maxpooling function.
 //
 // Arguments:
-//	logits: 2-D with shape `[batch_size, num_classes]`.
+//	orig_input: The original input tensor.
+//	orig_output: The original output tensor.
+//	grad: 4-D.  Gradients of gradients w.r.t. the input of `max_pool`.
+//	ksize: The size of the window for each dimension of the input tensor.
+//	strides: The stride of the sliding window for each dimension of the
+// input tensor.
+//	padding: The type of padding algorithm to use.
 //
-// Returns Same shape as `logits`.
-func LogSoftmax(scope *Scope, logits tf.Output) (logsoftmax tf.Output) {
+// Returns Gradients of gradients w.r.t. the input to `max_pool`.
+func MaxPoolGradGradV2(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize tf.Output, strides tf.Output, padding string, optional ...MaxPoolGradGradV2Attr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	opspec := tf.OpSpec{
-		Type: "LogSoftmax",
-		Input: []tf.Input{
-			logits,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes the inverse permutation of a tensor.
-//
-// This operation computes the inverse of an index permutation. It takes a 1-D
-// integer tensor `x`, which represents the indices of a zero-based array, and
-// swaps each value with its index position. In other words, for an output tensor
-// `y` and an input tensor `x`, this operation computes the following:
-//
-// `y[x[i]] = i for i in [0, 1, ..., len(x) - 1]`
-//
-// The values must include 0. There can be no duplicate values or negative values.
-//
-// For example:
-//
-// ```
-// # tensor `x` is [3, 4, 0, 2, 1]
-// invert_permutation(x) ==> [2, 4, 3, 0, 1]
-// ```
-//
-// Arguments:
-//	x: 1-D.
-//
-// Returns 1-D.
-func InvertPermutation(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
+	attrs := map[string]interface{}{"padding": padding}
+	for _, a := range optional {
+		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "InvertPermutation",
+		Type: "MaxPoolGradGradV2",
 		Input: []tf.Input{
-			x,
+			orig_input, orig_output, grad, ksize, strides,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// BiasAddGradAttr is an optional argument to BiasAddGrad.
-type BiasAddGradAttr func(optionalAttr)
-
-// BiasAddGradDataFormat sets the optional data_format attribute to value.
-//
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the bias tensor will be added to the last dimension
-// of the value tensor.
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// The tensor will be added to "in_channels", the third-to-the-last
-//     dimension.
-// If not specified, defaults to "NHWC"
-func BiasAddGradDataFormat(value string) BiasAddGradAttr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// The backward operation for "BiasAdd" on the "bias" tensor.
-//
-// It accumulates all the values from out_backprop into the feature dimension.
-// For NHWC data format, the feature dimension is the last. For NCHW data format,
-// the feature dimension is the third-to-last.
+// Computes gradients of the maxpooling function.
 //
 // Arguments:
-//	out_backprop: Any number of dimensions.
+//	input: The original input.
+//	grad: 4-D with shape `[batch, height, width, channels]`.  Gradients w.r.t. the
+// output of `max_pool`.
+//	argmax: The indices of the maximum values chosen for each output of `max_pool`.
+//	ksize: The size of the window for each dimension of the input tensor.
+//	strides: The stride of the sliding window for each dimension of the
+// input tensor.
+//	padding: The type of padding algorithm to use.
 //
-// Returns 1-D with size the feature dimension of `out_backprop`.
-func BiasAddGrad(scope *Scope, out_backprop tf.Output, optional ...BiasAddGradAttr) (output tf.Output) {
+// Returns Gradients w.r.t. the input of `max_pool`.
+func MaxPoolGradWithArgmax(scope *Scope, input tf.Output, grad tf.Output, argmax tf.Output, ksize []int64, strides []int64, padding string) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "BiasAddGrad",
+		Type: "MaxPoolGradWithArgmax",
 		Input: []tf.Input{
-			out_backprop,
+			input, grad, argmax,
 		},
 		Attrs: attrs,
 	}
@@ -5226,167 +5284,192 @@ func BiasAddGrad(scope *Scope, out_backprop tf.Output, optional ...BiasAddGradAt
 	return op.Output(0)
 }
 
-// FusedBatchNormV2Attr is an optional argument to FusedBatchNormV2.
-type FusedBatchNormV2Attr func(optionalAttr)
-
-// FusedBatchNormV2Epsilon sets the optional epsilon attribute to value.
-//
-// value: A small float number added to the variance of x.
-// If not specified, defaults to 0.0001
-func FusedBatchNormV2Epsilon(value float32) FusedBatchNormV2Attr {
-	return func(m optionalAttr) {
-		m["epsilon"] = value
-	}
-}
+// AvgPool3DAttr is an optional argument to AvgPool3D.
+type AvgPool3DAttr func(optionalAttr)
 
-// FusedBatchNormV2DataFormat sets the optional data_format attribute to value.
+// AvgPool3DDataFormat sets the optional data_format attribute to value.
 //
-// value: The data format for x and y. Either "NHWC" (default) or "NCHW".
-// If not specified, defaults to "NHWC"
-func FusedBatchNormV2DataFormat(value string) FusedBatchNormV2Attr {
+// value: The data format of the input and output data. With the
+// default format "NDHWC", the data is stored in the order of:
+//     [batch, in_depth, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCDHW", the data storage order is:
+//     [batch, in_channels, in_depth, in_height, in_width].
+// If not specified, defaults to "NDHWC"
+func AvgPool3DDataFormat(value string) AvgPool3DAttr {
 	return func(m optionalAttr) {
 		m["data_format"] = value
 	}
 }
 
-// FusedBatchNormV2IsTraining sets the optional is_training attribute to value.
-//
-// value: A bool value to indicate the operation is for training (default)
-// or inference.
-// If not specified, defaults to true
-func FusedBatchNormV2IsTraining(value bool) FusedBatchNormV2Attr {
-	return func(m optionalAttr) {
-		m["is_training"] = value
-	}
-}
-
-// Batch normalization.
-//
-// Note that the size of 4D Tensors are defined by either "NHWC" or "NCHW".
-// The size of 1D Tensors matches the dimension C of the 4D Tensors.
+// Performs 3D average pooling on the input.
 //
 // Arguments:
-//	x: A 4D Tensor for input data.
-//	scale: A 1D Tensor for scaling factor, to scale the normalized x.
-//	offset: A 1D Tensor for offset, to shift to the normalized x.
-//	mean: A 1D Tensor for population mean. Used for inference only;
-// must be empty for training.
-//	variance: A 1D Tensor for population variance. Used for inference only;
-// must be empty for training.
+//	input: Shape `[batch, depth, rows, cols, channels]` tensor to pool over.
+//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
+// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
+//	strides: 1-D tensor of length 5. The stride of the sliding window for each
+// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+//	padding: The type of padding algorithm to use.
 //
-// Returns A 4D Tensor for output data.A 1D Tensor for the computed batch mean, to be used by TensorFlow
-// to compute the running mean.A 1D Tensor for the computed batch variance, to be used by
-// TensorFlow to compute the running variance.A 1D Tensor for the computed batch mean, to be reused
-// in the gradient computation.A 1D Tensor for the computed batch variance (inverted variance
-// in the cuDNN case), to be reused in the gradient computation.
-func FusedBatchNormV2(scope *Scope, x tf.Output, scale tf.Output, offset tf.Output, mean tf.Output, variance tf.Output, optional ...FusedBatchNormV2Attr) (y tf.Output, batch_mean tf.Output, batch_variance tf.Output, reserve_space_1 tf.Output, reserve_space_2 tf.Output) {
+// Returns The average pooled output tensor.
+func AvgPool3D(scope *Scope, input tf.Output, ksize []int64, strides []int64, padding string, optional ...AvgPool3DAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "FusedBatchNormV2",
-		Input: []tf.Input{
-			x, scale, offset, mean, variance,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
-}
-
-// Returns the rank of a tensor.
-//
-// This operation returns an integer representing the rank of `input`.
-//
-// For example:
-//
-// ```
-// # 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]
-// # shape of tensor 't' is [2, 2, 3]
-// rank(t) ==> 3
-// ```
-//
-// **Note**: The rank of a tensor is not the same as the rank of a matrix. The rank
-// of a tensor is the number of indices required to uniquely select each element
-// of the tensor. Rank is also known as "order", "degree", or "ndims."
-func Rank(scope *Scope, input tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Rank",
+		Type: "AvgPool3D",
 		Input: []tf.Input{
 			input,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Transforms a Tensor into a serialized TensorProto proto.
+// Returns element-wise remainder of division. This emulates C semantics in that
 //
-// Arguments:
-//	tensor: A Tensor of type `T`.
+// the result here is consistent with a truncating divide. E.g.
+// `tf.truncatediv(x, y) * y + truncate_mod(x, y) = x`.
 //
-// Returns A serialized TensorProto proto of the input tensor.
-func SerializeTensor(scope *Scope, tensor tf.Output) (serialized tf.Output) {
+// *NOTE*: `Mod` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Mod(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SerializeTensor",
+		Type: "Mod",
 		Input: []tf.Input{
-			tensor,
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// MatrixSolveAttr is an optional argument to MatrixSolve.
-type MatrixSolveAttr func(optionalAttr)
+// DepthToSpaceAttr is an optional argument to DepthToSpace.
+type DepthToSpaceAttr func(optionalAttr)
 
-// MatrixSolveAdjoint sets the optional adjoint attribute to value.
-//
-// value: Boolean indicating whether to solve with `matrix` or its (block-wise)
-// adjoint.
-// If not specified, defaults to false
-func MatrixSolveAdjoint(value bool) MatrixSolveAttr {
+// DepthToSpaceDataFormat sets the optional data_format attribute to value.
+// If not specified, defaults to "NHWC"
+func DepthToSpaceDataFormat(value string) DepthToSpaceAttr {
 	return func(m optionalAttr) {
-		m["adjoint"] = value
+		m["data_format"] = value
 	}
 }
 
-// Solves systems of linear equations.
+// DepthToSpace for tensors of type T.
 //
-// `Matrix` is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
-// form square matrices. `Rhs` is a tensor of shape `[..., M, K]`. The `output` is
-// a tensor shape `[..., M, K]`.  If `adjoint` is `False` then each output matrix
-// satisfies `matrix[..., :, :] * output[..., :, :] = rhs[..., :, :]`.
-// If `adjoint` is `True` then each output matrix satisfies
-// `adjoint(matrix[..., :, :]) * output[..., :, :] = rhs[..., :, :]`.
+// Rearranges data from depth into blocks of spatial data.
+// This is the reverse transformation of SpaceToDepth. More specifically,
+// this op outputs a copy of the input tensor where values from the `depth`
+// dimension are moved in spatial blocks to the `height` and `width` dimensions.
+// The attr `block_size` indicates the input block size and how the data is moved.
 //
-// Arguments:
-//	matrix: Shape is `[..., M, M]`.
-//	rhs: Shape is `[..., M, K]`.
+//   * Chunks of data of size `block_size * block_size` from depth are rearranged
+//     into non-overlapping blocks of size `block_size x block_size`
+//   * The width the output tensor is `input_depth * block_size`, whereas the
+//     height is `input_height * block_size`.
+//   * The Y, X coordinates within each block of the output image are determined
+//     by the high order component of the input channel index.
+//   * The depth of the input tensor must be divisible by
+//     `block_size * block_size`.
 //
-// Returns Shape is `[..., M, K]`.
-func MatrixSolve(scope *Scope, matrix tf.Output, rhs tf.Output, optional ...MatrixSolveAttr) (output tf.Output) {
+// The `data_format` attr specifies the layout of the input and output tensors
+// with the following options:
+//   "NHWC": `[ batch, height, width, channels ]`
+//   "NCHW": `[ batch, channels, height, width ]`
+//   "NCHW_VECT_C":
+//       `qint8 [ batch, channels / 4, height, width, 4 ]`
+//
+// It is useful to consider the operation as transforming a 6-D Tensor.
+// e.g. for data_format = NHWC,
+//      Each element in the input tensor can be specified via 6 coordinates,
+//      ordered by decreasing memory layout significance as:
+//      n,iY,iX,bY,bX,oC  (where n=batch index, iX, iY means X or Y coordinates
+//                         within the input image, bX, bY means coordinates
+//                         within the output block, oC means output channels).
+//      The output would be the input transposed to the following layout:
+//      n,iY,bY,iX,bX,oC
+//
+// This operation is useful for resizing the activations between convolutions
+// (but keeping all data), e.g. instead of pooling. It is also useful for training
+// purely convolutional models.
+//
+// For example, given an input of shape `[1, 1, 1, 4]`, data_format = "NHWC" and
+// block_size = 2:
+//
+// ```
+// x = [[[[1, 2, 3, 4]]]]
+//
+// ```
+//
+// This operation will output a tensor of shape `[1, 2, 2, 1]`:
+//
+// ```
+//    [[[[1], [2]],
+//      [[3], [4]]]]
+// ```
+//
+// Here, the input has a batch of 1 and each batch element has shape `[1, 1, 4]`,
+// the corresponding output will have 2x2 elements and will have a depth of
+// 1 channel (1 = `4 / (block_size * block_size)`).
+// The output element shape is `[2, 2, 1]`.
+//
+// For an input tensor with larger depth, here of shape `[1, 1, 1, 12]`, e.g.
+//
+// ```
+// x = [[[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]]]]
+// ```
+//
+// This operation, for block size of 2, will return the following tensor of shape
+// `[1, 2, 2, 3]`
+//
+// ```
+//    [[[[1, 2, 3], [4, 5, 6]],
+//      [[7, 8, 9], [10, 11, 12]]]]
+//
+// ```
+//
+// Similarly, for the following input of shape `[1 2 2 4]`, and a block size of 2:
+//
+// ```
+// x =  [[[[1, 2, 3, 4],
+//        [5, 6, 7, 8]],
+//       [[9, 10, 11, 12],
+//        [13, 14, 15, 16]]]]
+// ```
+//
+// the operator will return the following tensor of shape `[1 4 4 1]`:
+//
+// ```
+// x = [[[ [1],   [2],  [5],  [6]],
+//       [ [3],   [4],  [7],  [8]],
+//       [ [9],  [10], [13],  [14]],
+//       [ [11], [12], [15],  [16]]]]
+//
+// ```
+//
+// Arguments:
+//
+//	block_size: The size of the spatial block, same as in Space2Depth.
+func DepthToSpace(scope *Scope, input tf.Output, block_size int64, optional ...DepthToSpaceAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"block_size": block_size}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MatrixSolve",
+		Type: "DepthToSpace",
 		Input: []tf.Input{
-			matrix, rhs,
+			input,
 		},
 		Attrs: attrs,
 	}
@@ -5394,63 +5477,137 @@ func MatrixSolve(scope *Scope, matrix tf.Output, rhs tf.Output, optional ...Matr
 	return op.Output(0)
 }
 
-// Computes acos of x element-wise.
-func Acos(scope *Scope, x tf.Output) (y tf.Output) {
+// Conv3DBackpropInputV2Attr is an optional argument to Conv3DBackpropInputV2.
+type Conv3DBackpropInputV2Attr func(optionalAttr)
+
+// Conv3DBackpropInputV2DataFormat sets the optional data_format attribute to value.
+//
+// value: The data format of the input and output data. With the
+// default format "NDHWC", the data is stored in the order of:
+//     [batch, in_depth, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCDHW", the data storage order is:
+//     [batch, in_channels, in_depth, in_height, in_width].
+// If not specified, defaults to "NDHWC"
+func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// Conv3DBackpropInputV2Dilations sets the optional dilations attribute to value.
+//
+// value: 1-D tensor of length 5.  The dilation factor for each dimension of
+// `input`. If set to k > 1, there will be k-1 skipped cells between each
+// filter element on that dimension. The dimension order is determined by the
+// value of `data_format`, see above for details. Dilations in the batch and
+// depth dimensions must be 1.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 i:1 >
+func Conv3DBackpropInputV2Dilations(value []int64) Conv3DBackpropInputV2Attr {
+	return func(m optionalAttr) {
+		m["dilations"] = value
+	}
+}
+
+// Computes the gradients of 3-D convolution with respect to the input.
+//
+// Arguments:
+//	input_sizes: An integer vector representing the tensor shape of `input`,
+// where `input` is a 5-D
+// `[batch, depth, rows, cols, in_channels]` tensor.
+//	filter: Shape `[depth, rows, cols, in_channels, out_channels]`.
+// `in_channels` must match between `input` and `filter`.
+//	out_backprop: Backprop signal of shape `[batch, out_depth, out_rows, out_cols,
+// out_channels]`.
+//	strides: 1-D tensor of length 5. The stride of the sliding window for each
+// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+//	padding: The type of padding algorithm to use.
+func Conv3DBackpropInputV2(scope *Scope, input_sizes tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv3DBackpropInputV2Attr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Acos",
+		Type: "Conv3DBackpropInputV2",
 		Input: []tf.Input{
-			x,
+			input_sizes, filter, out_backprop,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Real-valued fast Fourier transform.
-//
-// Computes the 1-dimensional discrete Fourier transform of a real-valued signal
-// over the inner-most dimension of `input`.
+// Computes square root of x element-wise.
 //
-// Since the DFT of a real signal is Hermitian-symmetric, `RFFT` only returns the
-// `fft_length / 2 + 1` unique components of the FFT: the zero-frequency term,
-// followed by the `fft_length / 2` positive-frequency terms.
+// I.e., \\(y = \sqrt{x} = x^{1/2}\\).
+func Sqrt(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Sqrt",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes the gradients of 3-D convolution with respect to the filter.
 //
-// Along the axis `RFFT` is computed on, if `fft_length` is smaller than the
-// corresponding dimension of `input`, the dimension is cropped. If it is larger,
-// the dimension is padded with zeros.
+// DEPRECATED at GraphDef version 10: Use Conv3DBackpropFilterV2
 //
 // Arguments:
-//	input: A float32 tensor.
-//	fft_length: An int32 tensor of shape [1]. The FFT length.
-//
-// Returns A complex64 tensor of the same rank as `input`. The inner-most
-//   dimension of `input` is replaced with the `fft_length / 2 + 1` unique
-//   frequency components of its 1D Fourier transform.
+//	input: Shape `[batch, depth, rows, cols, in_channels]`.
+//	filter: Shape `[depth, rows, cols, in_channels, out_channels]`.
+// `in_channels` must match between `input` and `filter`.
+//	out_backprop: Backprop signal of shape `[batch, out_depth, out_rows, out_cols,
+// out_channels]`.
+//	strides: 1-D tensor of length 5. The stride of the sliding window for each
+// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+//	padding: The type of padding algorithm to use.
+func Conv3DBackpropFilter(scope *Scope, input tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	opspec := tf.OpSpec{
+		Type: "Conv3DBackpropFilter",
+		Input: []tf.Input{
+			input, filter, out_backprop,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes the gradient for the rsqrt of `x` wrt its input.
 //
-// @compatibility(numpy)
-// Equivalent to np.fft.rfft
-// @end_compatibility
-func RFFT(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
+// Specifically, `grad = dy * -0.5 * y^3`, where `y = rsqrt(x)`, and `dy`
+// is the corresponding input gradient.
+func RsqrtGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "RFFT",
+		Type: "RsqrtGrad",
 		Input: []tf.Input{
-			input, fft_length,
+			y, dy,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// DepthwiseConv2dNativeBackpropFilterAttr is an optional argument to DepthwiseConv2dNativeBackpropFilter.
-type DepthwiseConv2dNativeBackpropFilterAttr func(optionalAttr)
+// DepthwiseConv2dNativeAttr is an optional argument to DepthwiseConv2dNative.
+type DepthwiseConv2dNativeAttr func(optionalAttr)
 
-// DepthwiseConv2dNativeBackpropFilterDataFormat sets the optional data_format attribute to value.
+// DepthwiseConv2dNativeDataFormat sets the optional data_format attribute to value.
 //
 // value: Specify the data format of the input and output data. With the
 // default format "NHWC", the data is stored in the order of:
@@ -5458,13 +5615,13 @@ type DepthwiseConv2dNativeBackpropFilterAttr func(optionalAttr)
 // Alternatively, the format could be "NCHW", the data storage order of:
 //     [batch, channels, height, width].
 // If not specified, defaults to "NHWC"
-func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2dNativeBackpropFilterAttr {
+func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr {
 	return func(m optionalAttr) {
 		m["data_format"] = value
 	}
 }
 
-// DepthwiseConv2dNativeBackpropFilterDilations sets the optional dilations attribute to value.
+// DepthwiseConv2dNativeDilations sets the optional dilations attribute to value.
 //
 // value: 1-D tensor of length 4.  The dilation factor for each dimension of
 // `input`. If set to k > 1, there will be k-1 skipped cells between each filter
@@ -5472,114 +5629,51 @@ func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2d
 // `data_format`, see above for details. Dilations in the batch and depth
 // dimensions must be 1.
 // If not specified, defaults to <i:1 i:1 i:1 i:1 >
-func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr {
+func DepthwiseConv2dNativeDilations(value []int64) DepthwiseConv2dNativeAttr {
 	return func(m optionalAttr) {
 		m["dilations"] = value
 	}
 }
 
-// Computes the gradients of depthwise convolution with respect to the filter.
+// Computes a 2-D depthwise convolution given 4-D `input` and `filter` tensors.
 //
-// Arguments:
-//	input: 4-D with shape based on `data_format`.  For example, if
-// `data_format` is 'NHWC' then `input` is a 4-D `[batch, in_height,
-// in_width, in_channels]` tensor.
-//	filter_sizes: An integer vector representing the tensor shape of `filter`,
-// where `filter` is a 4-D
-// `[filter_height, filter_width, in_channels, depthwise_multiplier]` tensor.
-//	out_backprop: 4-D with shape  based on `data_format`.
-// For example, if `data_format` is 'NHWC' then
-// out_backprop shape is `[batch, out_height, out_width, out_channels]`.
-// Gradients w.r.t. the output of the convolution.
-//	strides: The stride of the sliding window for each dimension of the input
-// of the convolution.
-//	padding: The type of padding algorithm to use.
+// Given an input tensor of shape `[batch, in_height, in_width, in_channels]`
+// and a filter / kernel tensor of shape
+// `[filter_height, filter_width, in_channels, channel_multiplier]`, containing
+// `in_channels` convolutional filters of depth 1, `depthwise_conv2d` applies
+// a different filter to each input channel (expanding from 1 channel to
+// `channel_multiplier` channels for each), then concatenates the results
+// together. Thus, the output has `in_channels * channel_multiplier` channels.
 //
-// Returns 4-D with shape
-// `[filter_height, filter_width, in_channels, out_channels]`.  Gradient w.r.t.
-// the `filter` input of the convolution.
-func DepthwiseConv2dNativeBackpropFilter(scope *Scope, input tf.Output, filter_sizes tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...DepthwiseConv2dNativeBackpropFilterAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "DepthwiseConv2dNativeBackpropFilter",
-		Input: []tf.Input{
-			input, filter_sizes, out_backprop,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// LRNGradAttr is an optional argument to LRNGrad.
-type LRNGradAttr func(optionalAttr)
-
-// LRNGradDepthRadius sets the optional depth_radius attribute to value.
-//
-// value: A depth radius.
-// If not specified, defaults to 5
-func LRNGradDepthRadius(value int64) LRNGradAttr {
-	return func(m optionalAttr) {
-		m["depth_radius"] = value
-	}
-}
-
-// LRNGradBias sets the optional bias attribute to value.
-//
-// value: An offset (usually > 0 to avoid dividing by 0).
-// If not specified, defaults to 1
-func LRNGradBias(value float32) LRNGradAttr {
-	return func(m optionalAttr) {
-		m["bias"] = value
-	}
-}
-
-// LRNGradAlpha sets the optional alpha attribute to value.
-//
-// value: A scale factor, usually positive.
-// If not specified, defaults to 1
-func LRNGradAlpha(value float32) LRNGradAttr {
-	return func(m optionalAttr) {
-		m["alpha"] = value
-	}
-}
-
-// LRNGradBeta sets the optional beta attribute to value.
+// ```
+// for k in 0..in_channels-1
+//   for q in 0..channel_multiplier-1
+//     output[b, i, j, k * channel_multiplier + q] =
+//       sum_{di, dj} input[b, strides[1] * i + di, strides[2] * j + dj, k] *
+//                         filter[di, dj, k, q]
+// ```
 //
-// value: An exponent.
-// If not specified, defaults to 0.5
-func LRNGradBeta(value float32) LRNGradAttr {
-	return func(m optionalAttr) {
-		m["beta"] = value
-	}
-}
-
-// Gradients for Local Response Normalization.
+// Must have `strides[0] = strides[3] = 1`.  For the most common case of the same
+// horizontal and vertices strides, `strides = [1, stride, stride, 1]`.
 //
 // Arguments:
-//	input_grads: 4-D with shape `[batch, height, width, channels]`.
-//	input_image: 4-D with shape `[batch, height, width, channels]`.
-//	output_image: 4-D with shape `[batch, height, width, channels]`.
 //
-// Returns The gradients for LRN.
-func LRNGrad(scope *Scope, input_grads tf.Output, input_image tf.Output, output_image tf.Output, optional ...LRNGradAttr) (output tf.Output) {
+//
+//	strides: 1-D of length 4.  The stride of the sliding window for each dimension
+// of `input`.
+//	padding: The type of padding algorithm to use.
+func DepthwiseConv2dNative(scope *Scope, input tf.Output, filter tf.Output, strides []int64, padding string, optional ...DepthwiseConv2dNativeAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "LRNGrad",
+		Type: "DepthwiseConv2dNative",
 		Input: []tf.Input{
-			input_grads, input_image, output_image,
+			input, filter,
 		},
 		Attrs: attrs,
 	}
@@ -5587,44 +5681,47 @@ func LRNGrad(scope *Scope, input_grads tf.Output, input_image tf.Output, output_
 	return op.Output(0)
 }
 
-// AnyAttr is an optional argument to Any.
-type AnyAttr func(optionalAttr)
+// MaxPoolGradV2Attr is an optional argument to MaxPoolGradV2.
+type MaxPoolGradV2Attr func(optionalAttr)
 
-// AnyKeepDims sets the optional keep_dims attribute to value.
+// MaxPoolGradV2DataFormat sets the optional data_format attribute to value.
 //
-// value: If true, retain reduced dimensions with length 1.
-// If not specified, defaults to false
-func AnyKeepDims(value bool) AnyAttr {
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// If not specified, defaults to "NHWC"
+func MaxPoolGradV2DataFormat(value string) MaxPoolGradV2Attr {
 	return func(m optionalAttr) {
-		m["keep_dims"] = value
+		m["data_format"] = value
 	}
 }
 
-// Computes the "logical or" of elements across dimensions of a tensor.
-//
-// Reduces `input` along the dimensions given in `axis`. Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `axis`. If `keep_dims` is true, the reduced dimensions are
-// retained with length 1.
+// Computes gradients of the maxpooling function.
 //
 // Arguments:
-//	input: The tensor to reduce.
-//	axis: The dimensions to reduce. Must be in the range
-// `[-rank(input), rank(input))`.
+//	orig_input: The original input tensor.
+//	orig_output: The original output tensor.
+//	grad: 4-D.  Gradients w.r.t. the output of `max_pool`.
+//	ksize: The size of the window for each dimension of the input tensor.
+//	strides: The stride of the sliding window for each dimension of the
+// input tensor.
+//	padding: The type of padding algorithm to use.
 //
-// Returns The reduced tensor.
-func Any(scope *Scope, input tf.Output, axis tf.Output, optional ...AnyAttr) (output tf.Output) {
+// Returns Gradients w.r.t. the input to `max_pool`.
+func MaxPoolGradV2(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize tf.Output, strides tf.Output, padding string, optional ...MaxPoolGradV2Attr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Any",
+		Type: "MaxPoolGradV2",
 		Input: []tf.Input{
-			input, axis,
+			orig_input, orig_output, grad, ksize, strides,
 		},
 		Attrs: attrs,
 	}
@@ -5632,105 +5729,71 @@ func Any(scope *Scope, input tf.Output, axis tf.Output, optional ...AnyAttr) (ou
 	return op.Output(0)
 }
 
-// ResourceApplyFtrlAttr is an optional argument to ResourceApplyFtrl.
-type ResourceApplyFtrlAttr func(optionalAttr)
-
-// ResourceApplyFtrlUseLocking sets the optional use_locking attribute to value.
-//
-// value: If `True`, updating of the var and accum tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceApplyFtrlUseLocking(value bool) ResourceApplyFtrlAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// Update '*var' according to the Ftrl-proximal scheme.
+// Restore a reader to a previously saved state.
 //
-// accum_new = accum + grad * grad
-// linear += grad - (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
-// quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
-// var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
-// accum = accum_new
+// Not all Readers support being restored, so this can produce an
+// Unimplemented error.
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	linear: Should be from a Variable().
-//	grad: The gradient.
-//	lr: Scaling factor. Must be a scalar.
-//	l1: L1 regulariation. Must be a scalar.
-//	l2: L2 regulariation. Must be a scalar.
-//	lr_power: Scaling factor. Must be a scalar.
+//	reader_handle: Handle to a Reader.
+//	state: Result of a ReaderSerializeState of a Reader with type
+// matching reader_handle.
 //
 // Returns the created operation.
-func ResourceApplyFtrl(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, lr_power tf.Output, optional ...ResourceApplyFtrlAttr) (o *tf.Operation) {
+func ReaderRestoreStateV2(scope *Scope, reader_handle tf.Output, state tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyFtrl",
+		Type: "ReaderRestoreStateV2",
 		Input: []tf.Input{
-			var_, accum, linear, grad, lr, l1, l2, lr_power,
+			reader_handle, state,
 		},
-		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// RandomUniformAttr is an optional argument to RandomUniform.
-type RandomUniformAttr func(optionalAttr)
-
-// RandomUniformSeed sets the optional seed attribute to value.
-//
-// value: If either `seed` or `seed2` are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func RandomUniformSeed(value int64) RandomUniformAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
+// MaxPoolGradAttr is an optional argument to MaxPoolGrad.
+type MaxPoolGradAttr func(optionalAttr)
 
-// RandomUniformSeed2 sets the optional seed2 attribute to value.
+// MaxPoolGradDataFormat sets the optional data_format attribute to value.
 //
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func RandomUniformSeed2(value int64) RandomUniformAttr {
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// If not specified, defaults to "NHWC"
+func MaxPoolGradDataFormat(value string) MaxPoolGradAttr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["data_format"] = value
 	}
 }
 
-// Outputs random values from a uniform distribution.
-//
-// The generated values follow a uniform distribution in the range `[0, 1)`. The
-// lower bound 0 is included in the range, while the upper bound 1 is excluded.
+// Computes gradients of the maxpooling function.
 //
 // Arguments:
-//	shape: The shape of the output tensor.
-//	dtype: The type of the output.
+//	orig_input: The original input tensor.
+//	orig_output: The original output tensor.
+//	grad: 4-D.  Gradients w.r.t. the output of `max_pool`.
+//	ksize: The size of the window for each dimension of the input tensor.
+//	strides: The stride of the sliding window for each dimension of the
+// input tensor.
+//	padding: The type of padding algorithm to use.
 //
-// Returns A tensor of the specified shape filled with uniform random values.
-func RandomUniform(scope *Scope, shape tf.Output, dtype tf.DataType, optional ...RandomUniformAttr) (output tf.Output) {
+// Returns Gradients w.r.t. the input to `max_pool`.
+func MaxPoolGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPoolGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "RandomUniform",
+		Type: "MaxPoolGrad",
 		Input: []tf.Input{
-			shape,
+			orig_input, orig_output, grad,
 		},
 		Attrs: attrs,
 	}
@@ -5738,30 +5801,66 @@ func RandomUniform(scope *Scope, shape tf.Output, dtype tf.DataType, optional ..
 	return op.Output(0)
 }
 
-// AssertAttr is an optional argument to Assert.
-type AssertAttr func(optionalAttr)
+// CropAndResizeAttr is an optional argument to CropAndResize.
+type CropAndResizeAttr func(optionalAttr)
 
-// AssertSummarize sets the optional summarize attribute to value.
+// CropAndResizeMethod sets the optional method attribute to value.
 //
-// value: Print this many entries of each tensor.
-// If not specified, defaults to 3
-func AssertSummarize(value int64) AssertAttr {
+// value: A string specifying the interpolation method. Only 'bilinear' is
+// supported for now.
+// If not specified, defaults to "bilinear"
+func CropAndResizeMethod(value string) CropAndResizeAttr {
 	return func(m optionalAttr) {
-		m["summarize"] = value
+		m["method"] = value
 	}
 }
 
-// Asserts that the given condition is true.
+// CropAndResizeExtrapolationValue sets the optional extrapolation_value attribute to value.
 //
-// If `condition` evaluates to false, print the list of tensors in `data`.
-// `summarize` determines how many entries of the tensors to print.
+// value: Value used for extrapolation, when applicable.
+// If not specified, defaults to 0
+func CropAndResizeExtrapolationValue(value float32) CropAndResizeAttr {
+	return func(m optionalAttr) {
+		m["extrapolation_value"] = value
+	}
+}
+
+// Extracts crops from the input image tensor and bilinearly resizes them (possibly
 //
-// Arguments:
-//	condition: The condition to evaluate.
-//	data: The tensors to print out when condition is false.
+// with aspect ratio change) to a common output size specified by `crop_size`. This
+// is more general than the `crop_to_bounding_box` op which extracts a fixed size
+// slice from the input image and does not allow resizing or aspect ratio change.
 //
-// Returns the created operation.
-func Assert(scope *Scope, condition tf.Output, data []tf.Output, optional ...AssertAttr) (o *tf.Operation) {
+// Returns a tensor with `crops` from the input `image` at positions defined at the
+// bounding box locations in `boxes`. The cropped boxes are all resized (with
+// bilinear interpolation) to a fixed `size = [crop_height, crop_width]`. The
+// result is a 4-D tensor `[num_boxes, crop_height, crop_width, depth]`. The
+// resizing is corner aligned. In particular, if `boxes = [[0, 0, 1, 1]]`, the
+// method will give identical results to using `tf.image.resize_bilinear()`
+// with `align_corners=True`.
+//
+// Arguments:
+//	image: A 4-D tensor of shape `[batch, image_height, image_width, depth]`.
+// Both `image_height` and `image_width` need to be positive.
+//	boxes: A 2-D tensor of shape `[num_boxes, 4]`. The `i`-th row of the tensor
+// specifies the coordinates of a box in the `box_ind[i]` image and is specified
+// in normalized coordinates `[y1, x1, y2, x2]`. A normalized coordinate value of
+// `y` is mapped to the image coordinate at `y * (image_height - 1)`, so as the
+// `[0, 1]` interval of normalized image height is mapped to
+// `[0, image_height - 1]` in image height coordinates. We do allow `y1` > `y2`, in
+// which case the sampled crop is an up-down flipped version of the original
+// image. The width dimension is treated similarly. Normalized coordinates
+// outside the `[0, 1]` range are allowed, in which case we use
+// `extrapolation_value` to extrapolate the input image values.
+//	box_ind: A 1-D tensor of shape `[num_boxes]` with int32 values in `[0, batch)`.
+// The value of `box_ind[i]` specifies the image that the `i`-th box refers to.
+//	crop_size: A 1-D tensor of 2 elements, `size = [crop_height, crop_width]`. All
+// cropped image patches are resized to this size. The aspect ratio of the image
+// content is not preserved. Both `crop_height` and `crop_width` need to be
+// positive.
+//
+// Returns A 4-D tensor of shape `[num_boxes, crop_height, crop_width, depth]`.
+func CropAndResize(scope *Scope, image tf.Output, boxes tf.Output, box_ind tf.Output, crop_size tf.Output, optional ...CropAndResizeAttr) (crops tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -5770,126 +5869,175 @@ func Assert(scope *Scope, condition tf.Output, data []tf.Output, optional ...Ass
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Assert",
+		Type: "CropAndResize",
 		Input: []tf.Input{
-			condition, tf.OutputList(data),
+			image, boxes, box_ind, crop_size,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Computes element-wise population count (a.k.a. popcount, bitsum, bitcount).
+// Fills empty rows in the input 2-D `SparseTensor` with a default value.
 //
-// For each entry in `x`, calculates the number of `1` (on) bits in the binary
-// representation of that entry.
+// The input `SparseTensor` is represented via the tuple of inputs
+// (`indices`, `values`, `dense_shape`).  The output `SparseTensor` has the
+// same `dense_shape` but with indices `output_indices` and values
+// `output_values`.
 //
-// **NOTE**: It is more efficient to first `tf.bitcast` your tensors into
-// `int32` or `int64` and perform the bitcount on the result, than to feed in
-// 8- or 16-bit inputs and then aggregate the resulting counts.
-func PopulationCount(scope *Scope, x tf.Output) (y tf.Output) {
+// This op inserts a single entry for every row that doesn't have any values.
+// The index is created as `[row, 0, ..., 0]` and the inserted value
+// is `default_value`.
+//
+// For example, suppose `sp_input` has shape `[5, 6]` and non-empty values:
+//
+//     [0, 1]: a
+//     [0, 3]: b
+//     [2, 0]: c
+//     [3, 1]: d
+//
+// Rows 1 and 4 are empty, so the output will be of shape `[5, 6]` with values:
+//
+//     [0, 1]: a
+//     [0, 3]: b
+//     [1, 0]: default_value
+//     [2, 0]: c
+//     [3, 1]: d
+//     [4, 0]: default_value
+//
+// The output `SparseTensor` will be in row-major order and will have the
+// same shape as the input.
+//
+// This op also returns an indicator vector shaped `[dense_shape[0]]` such that
+//
+//     empty_row_indicator[i] = True iff row i was an empty row.
+//
+// And a reverse index map vector shaped `[indices.shape[0]]` that is used during
+// backpropagation,
+//
+//     reverse_index_map[j] = out_j s.t. indices[j, :] == output_indices[out_j, :]
+//
+// Arguments:
+//	indices: 2-D. the indices of the sparse tensor.
+//	values: 1-D. the values of the sparse tensor.
+//	dense_shape: 1-D. the shape of the sparse tensor.
+//	default_value: 0-D. default value to insert into location `[row, 0, ..., 0]`
+//   for rows missing from the input sparse tensor.
+// output indices: 2-D. the indices of the filled sparse tensor.
+//
+// Returns 1-D. the values of the filled sparse tensor.1-D. whether the dense row was missing in the
+// input sparse tensor.1-D. a map from the input indices to the output indices.
+func SparseFillEmptyRows(scope *Scope, indices tf.Output, values tf.Output, dense_shape tf.Output, default_value tf.Output) (output_indices tf.Output, output_values tf.Output, empty_row_indicator tf.Output, reverse_index_map tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "PopulationCount",
+		Type: "SparseFillEmptyRows",
 		Input: []tf.Input{
-			x,
+			indices, values, dense_shape, default_value,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3)
 }
 
-// Split a `SparseTensor` into `num_split` tensors along one dimension.
+// Reverses specific dimensions of a tensor.
 //
-// If the `shape[split_dim]` is not an integer multiple of `num_split`. Slices
-// `[0 : shape[split_dim] % num_split]` gets one extra dimension.
-// For example, if `split_dim = 1` and `num_split = 2` and the input is
+// Given a `tensor`, and a `bool` tensor `dims` representing the dimensions
+// of `tensor`, this operation reverses each dimension i of `tensor` where
+// `dims[i]` is `True`.
 //
-//     input_tensor = shape = [2, 7]
-//     [    a   d e  ]
-//     [b c          ]
+// `tensor` can have up to 8 dimensions. The number of dimensions
+// of `tensor` must equal the number of elements in `dims`. In other words:
 //
-// Graphically the output tensors are:
+// `rank(tensor) = size(dims)`
 //
-//     output_tensor[0] = shape = [2, 4]
-//     [    a  ]
-//     [b c    ]
+// For example:
 //
-//     output_tensor[1] = shape = [2, 3]
-//     [ d e  ]
-//     [      ]
+// ```
+// # tensor 't' is [[[[ 0,  1,  2,  3],
+// #                  [ 4,  5,  6,  7],
+// #                  [ 8,  9, 10, 11]],
+// #                 [[12, 13, 14, 15],
+// #                  [16, 17, 18, 19],
+// #                  [20, 21, 22, 23]]]]
+// # tensor 't' shape is [1, 2, 3, 4]
+//
+// # 'dims' is [False, False, False, True]
+// reverse(t, dims) ==> [[[[ 3,  2,  1,  0],
+//                         [ 7,  6,  5,  4],
+//                         [ 11, 10, 9, 8]],
+//                        [[15, 14, 13, 12],
+//                         [19, 18, 17, 16],
+//                         [23, 22, 21, 20]]]]
+//
+// # 'dims' is [False, True, False, False]
+// reverse(t, dims) ==> [[[[12, 13, 14, 15],
+//                         [16, 17, 18, 19],
+//                         [20, 21, 22, 23]
+//                        [[ 0,  1,  2,  3],
+//                         [ 4,  5,  6,  7],
+//                         [ 8,  9, 10, 11]]]]
+//
+// # 'dims' is [False, False, True, False]
+// reverse(t, dims) ==> [[[[8, 9, 10, 11],
+//                         [4, 5, 6, 7],
+//                         [0, 1, 2, 3]]
+//                        [[20, 21, 22, 23],
+//                         [16, 17, 18, 19],
+//                         [12, 13, 14, 15]]]]
+// ```
 //
 // Arguments:
-//	split_dim: 0-D.  The dimension along which to split.  Must be in the range
-// `[0, rank(shape))`.
-//	indices: 2-D tensor represents the indices of the sparse tensor.
-//	values: 1-D tensor represents the values of the sparse tensor.
-//	shape: 1-D. tensor represents the shape of the sparse tensor.
-// output indices: A list of 1-D tensors represents the indices of the output
-// sparse tensors.
-//	num_split: The number of ways to split.
+//	tensor: Up to 8-D.
+//	dims: 1-D. The dimensions to reverse.
 //
-// Returns A list of 1-D tensors represents the values of the output sparse
-// tensors.A list of 1-D tensors represents the shape of the output sparse
-// tensors.
-func SparseSplit(scope *Scope, split_dim tf.Output, indices tf.Output, values tf.Output, shape tf.Output, num_split int64) (output_indices []tf.Output, output_values []tf.Output, output_shape []tf.Output) {
+// Returns The same shape as `tensor`.
+func Reverse(scope *Scope, tensor tf.Output, dims tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_split": num_split}
 	opspec := tf.OpSpec{
-		Type: "SparseSplit",
+		Type: "Reverse",
 		Input: []tf.Input{
-			split_dim, indices, values, shape,
+			tensor, dims,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if output_indices, idx, err = makeOutputList(op, idx, "output_indices"); err != nil {
-		scope.UpdateErr("SparseSplit", err)
-		return
-	}
-	if output_values, idx, err = makeOutputList(op, idx, "output_values"); err != nil {
-		scope.UpdateErr("SparseSplit", err)
-		return
-	}
-	if output_shape, idx, err = makeOutputList(op, idx, "output_shape"); err != nil {
-		scope.UpdateErr("SparseSplit", err)
-		return
-	}
-	return output_indices, output_values, output_shape
+	return op.Output(0)
 }
 
-// RandomPoissonAttr is an optional argument to RandomPoisson.
-type RandomPoissonAttr func(optionalAttr)
-
-// RandomPoissonSeed sets the optional seed attribute to value.
-// If not specified, defaults to 0
-func RandomPoissonSeed(value int64) RandomPoissonAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
+// BiasAddGradAttr is an optional argument to BiasAddGrad.
+type BiasAddGradAttr func(optionalAttr)
 
-// RandomPoissonSeed2 sets the optional seed2 attribute to value.
-// If not specified, defaults to 0
-func RandomPoissonSeed2(value int64) RandomPoissonAttr {
+// BiasAddGradDataFormat sets the optional data_format attribute to value.
+//
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the bias tensor will be added to the last dimension
+// of the value tensor.
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// The tensor will be added to "in_channels", the third-to-the-last
+//     dimension.
+// If not specified, defaults to "NHWC"
+func BiasAddGradDataFormat(value string) BiasAddGradAttr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["data_format"] = value
 	}
 }
 
-// Use RandomPoissonV2 instead.
+// The backward operation for "BiasAdd" on the "bias" tensor.
 //
-// DEPRECATED at GraphDef version 25: Replaced by RandomPoissonV2
-func RandomPoisson(scope *Scope, shape tf.Output, rate tf.Output, optional ...RandomPoissonAttr) (output tf.Output) {
+// It accumulates all the values from out_backprop into the feature dimension.
+// For NHWC data format, the feature dimension is the last. For NCHW data format,
+// the feature dimension is the third-to-last.
+//
+// Arguments:
+//	out_backprop: Any number of dimensions.
+//
+// Returns 1-D with size the feature dimension of `out_backprop`.
+func BiasAddGrad(scope *Scope, out_backprop tf.Output, optional ...BiasAddGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -5898,9 +6046,9 @@ func RandomPoisson(scope *Scope, shape tf.Output, rate tf.Output, optional ...Ra
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "RandomPoisson",
+		Type: "BiasAddGrad",
 		Input: []tf.Input{
-			shape, rate,
+			out_backprop,
 		},
 		Attrs: attrs,
 	}
@@ -5908,46 +6056,60 @@ func RandomPoisson(scope *Scope, shape tf.Output, rate tf.Output, optional ...Ra
 	return op.Output(0)
 }
 
-// ResourceSparseApplyFtrlV2Attr is an optional argument to ResourceSparseApplyFtrlV2.
-type ResourceSparseApplyFtrlV2Attr func(optionalAttr)
+// FusedBatchNormV2Attr is an optional argument to FusedBatchNormV2.
+type FusedBatchNormV2Attr func(optionalAttr)
 
-// ResourceSparseApplyFtrlV2UseLocking sets the optional use_locking attribute to value.
+// FusedBatchNormV2Epsilon sets the optional epsilon attribute to value.
 //
-// value: If `True`, updating of the var and accum tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceSparseApplyFtrlV2UseLocking(value bool) ResourceSparseApplyFtrlV2Attr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
+// value: A small float number added to the variance of x.
+// If not specified, defaults to 0.0001
+func FusedBatchNormV2Epsilon(value float32) FusedBatchNormV2Attr {
+	return func(m optionalAttr) {
+		m["epsilon"] = value
 	}
 }
 
-// Update relevant entries in '*var' according to the Ftrl-proximal scheme.
+// FusedBatchNormV2DataFormat sets the optional data_format attribute to value.
 //
-// That is for rows we have grad for, we update var, accum and linear as follows:
-// grad_with_shrinkage = grad + 2 * l2_shrinkage * var
-// accum_new = accum + grad_with_shrinkage * grad_with_shrinkage
-// linear += grad_with_shrinkage +
-//     (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
-// quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
-// var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
-// accum = accum_new
+// value: The data format for x and y. Either "NHWC" (default) or "NCHW".
+// If not specified, defaults to "NHWC"
+func FusedBatchNormV2DataFormat(value string) FusedBatchNormV2Attr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// FusedBatchNormV2IsTraining sets the optional is_training attribute to value.
 //
-// Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	linear: Should be from a Variable().
-//	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var and accum.
-//	lr: Scaling factor. Must be a scalar.
-//	l1: L1 regularization. Must be a scalar.
-//	l2: L2 shrinkage regulariation. Must be a scalar.
+// value: A bool value to indicate the operation is for training (default)
+// or inference.
+// If not specified, defaults to true
+func FusedBatchNormV2IsTraining(value bool) FusedBatchNormV2Attr {
+	return func(m optionalAttr) {
+		m["is_training"] = value
+	}
+}
+
+// Batch normalization.
 //
-//	lr_power: Scaling factor. Must be a scalar.
+// Note that the size of 4D Tensors are defined by either "NHWC" or "NCHW".
+// The size of 1D Tensors matches the dimension C of the 4D Tensors.
 //
-// Returns the created operation.
-func ResourceSparseApplyFtrlV2(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, indices tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, l2_shrinkage tf.Output, lr_power tf.Output, optional ...ResourceSparseApplyFtrlV2Attr) (o *tf.Operation) {
+// Arguments:
+//	x: A 4D Tensor for input data.
+//	scale: A 1D Tensor for scaling factor, to scale the normalized x.
+//	offset: A 1D Tensor for offset, to shift to the normalized x.
+//	mean: A 1D Tensor for population mean. Used for inference only;
+// must be empty for training.
+//	variance: A 1D Tensor for population variance. Used for inference only;
+// must be empty for training.
+//
+// Returns A 4D Tensor for output data.A 1D Tensor for the computed batch mean, to be used by TensorFlow
+// to compute the running mean.A 1D Tensor for the computed batch variance, to be used by
+// TensorFlow to compute the running variance.A 1D Tensor for the computed batch mean, to be reused
+// in the gradient computation.A 1D Tensor for the computed batch variance (inverted variance
+// in the cuDNN case), to be reused in the gradient computation.
+func FusedBatchNormV2(scope *Scope, x tf.Output, scale tf.Output, offset tf.Output, mean tf.Output, variance tf.Output, optional ...FusedBatchNormV2Attr) (y tf.Output, batch_mean tf.Output, batch_variance tf.Output, reserve_space_1 tf.Output, reserve_space_2 tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -5956,63 +6118,65 @@ func ResourceSparseApplyFtrlV2(scope *Scope, var_ tf.Output, accum tf.Output, li
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyFtrlV2",
+		Type: "FusedBatchNormV2",
 		Input: []tf.Input{
-			var_, accum, linear, grad, indices, lr, l1, l2, l2_shrinkage, lr_power,
+			x, scale, offset, mean, variance,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
 }
 
-// Associates the given iterator with the given statistics aggregator.
+// Transforms a Tensor into a serialized TensorProto proto.
 //
-// Returns the created operation.
-func IteratorSetStatsAggregator(scope *Scope, iterator_handle tf.Output, stats_aggregator_handle tf.Output) (o *tf.Operation) {
+// Arguments:
+//	tensor: A Tensor of type `T`.
+//
+// Returns A serialized TensorProto proto of the input tensor.
+func SerializeTensor(scope *Scope, tensor tf.Output) (serialized tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "IteratorSetStatsAggregator",
+		Type: "SerializeTensor",
 		Input: []tf.Input{
-			iterator_handle, stats_aggregator_handle,
+			tensor,
 		},
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// DataFormatVecPermuteAttr is an optional argument to DataFormatVecPermute.
-type DataFormatVecPermuteAttr func(optionalAttr)
-
-// DataFormatVecPermuteSrcFormat sets the optional src_format attribute to value.
-//
-// value: source data format.
-// If not specified, defaults to "NHWC"
-func DataFormatVecPermuteSrcFormat(value string) DataFormatVecPermuteAttr {
-	return func(m optionalAttr) {
-		m["src_format"] = value
-	}
-}
+// MatrixSolveAttr is an optional argument to MatrixSolve.
+type MatrixSolveAttr func(optionalAttr)
 
-// DataFormatVecPermuteDstFormat sets the optional dst_format attribute to value.
+// MatrixSolveAdjoint sets the optional adjoint attribute to value.
 //
-// value: destination data format.
-// If not specified, defaults to "NCHW"
-func DataFormatVecPermuteDstFormat(value string) DataFormatVecPermuteAttr {
+// value: Boolean indicating whether to solve with `matrix` or its (block-wise)
+// adjoint.
+// If not specified, defaults to false
+func MatrixSolveAdjoint(value bool) MatrixSolveAttr {
 	return func(m optionalAttr) {
-		m["dst_format"] = value
+		m["adjoint"] = value
 	}
 }
 
-// Returns the permuted vector/tensor in the destination data format given the
+// Solves systems of linear equations.
 //
-// one in the source data format.
+// `Matrix` is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
+// form square matrices. `Rhs` is a tensor of shape `[..., M, K]`. The `output` is
+// a tensor shape `[..., M, K]`.  If `adjoint` is `False` then each output matrix
+// satisfies `matrix[..., :, :] * output[..., :, :] = rhs[..., :, :]`.
+// If `adjoint` is `True` then each output matrix satisfies
+// `adjoint(matrix[..., :, :]) * output[..., :, :] = rhs[..., :, :]`.
 //
 // Arguments:
-//	x: Vector of size 4 or Tensor of shape (4, 2) in source data format.
+//	matrix: Shape is `[..., M, M]`.
+//	rhs: Shape is `[..., M, K]`.
 //
-// Returns Vector of size 4 or Tensor of shape (4, 2) in destination data format.
-func DataFormatVecPermute(scope *Scope, x tf.Output, optional ...DataFormatVecPermuteAttr) (y tf.Output) {
+// Returns Shape is `[..., M, K]`.
+func MatrixSolve(scope *Scope, matrix tf.Output, rhs tf.Output, optional ...MatrixSolveAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -6021,9 +6185,9 @@ func DataFormatVecPermute(scope *Scope, x tf.Output, optional ...DataFormatVecPe
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "DataFormatVecPermute",
+		Type: "MatrixSolve",
 		Input: []tf.Input{
-			x,
+			matrix, rhs,
 		},
 		Attrs: attrs,
 	}
@@ -6031,13 +6195,13 @@ func DataFormatVecPermute(scope *Scope, x tf.Output, optional ...DataFormatVecPe
 	return op.Output(0)
 }
 
-// Computes tan of x element-wise.
-func Tan(scope *Scope, x tf.Output) (y tf.Output) {
+// Computes acos of x element-wise.
+func Acos(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Tan",
+		Type: "Acos",
 		Input: []tf.Input{
 			x,
 		},
@@ -6046,43 +6210,53 @@ func Tan(scope *Scope, x tf.Output) (y tf.Output) {
 	return op.Output(0)
 }
 
-// ResourceSparseApplyFtrlAttr is an optional argument to ResourceSparseApplyFtrl.
-type ResourceSparseApplyFtrlAttr func(optionalAttr)
+// QuantizeAndDequantizeAttr is an optional argument to QuantizeAndDequantize.
+type QuantizeAndDequantizeAttr func(optionalAttr)
 
-// ResourceSparseApplyFtrlUseLocking sets the optional use_locking attribute to value.
-//
-// value: If `True`, updating of the var and accum tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
+// QuantizeAndDequantizeSignedInput sets the optional signed_input attribute to value.
+// If not specified, defaults to true
+func QuantizeAndDequantizeSignedInput(value bool) QuantizeAndDequantizeAttr {
+	return func(m optionalAttr) {
+		m["signed_input"] = value
+	}
+}
+
+// QuantizeAndDequantizeNumBits sets the optional num_bits attribute to value.
+// If not specified, defaults to 8
+func QuantizeAndDequantizeNumBits(value int64) QuantizeAndDequantizeAttr {
+	return func(m optionalAttr) {
+		m["num_bits"] = value
+	}
+}
+
+// QuantizeAndDequantizeRangeGiven sets the optional range_given attribute to value.
 // If not specified, defaults to false
-func ResourceSparseApplyFtrlUseLocking(value bool) ResourceSparseApplyFtrlAttr {
+func QuantizeAndDequantizeRangeGiven(value bool) QuantizeAndDequantizeAttr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["range_given"] = value
 	}
 }
 
-// Update relevant entries in '*var' according to the Ftrl-proximal scheme.
-//
-// That is for rows we have grad for, we update var, accum and linear as follows:
-// accum_new = accum + grad * grad
-// linear += grad + (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
-// quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
-// var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
-// accum = accum_new
-//
-// Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	linear: Should be from a Variable().
-//	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var and accum.
-//	lr: Scaling factor. Must be a scalar.
-//	l1: L1 regularization. Must be a scalar.
-//	l2: L2 regularization. Must be a scalar.
-//	lr_power: Scaling factor. Must be a scalar.
+// QuantizeAndDequantizeInputMin sets the optional input_min attribute to value.
+// If not specified, defaults to 0
+func QuantizeAndDequantizeInputMin(value float32) QuantizeAndDequantizeAttr {
+	return func(m optionalAttr) {
+		m["input_min"] = value
+	}
+}
+
+// QuantizeAndDequantizeInputMax sets the optional input_max attribute to value.
+// If not specified, defaults to 0
+func QuantizeAndDequantizeInputMax(value float32) QuantizeAndDequantizeAttr {
+	return func(m optionalAttr) {
+		m["input_max"] = value
+	}
+}
+
+// Use QuantizeAndDequantizeV2 instead.
 //
-// Returns the created operation.
-func ResourceSparseApplyFtrl(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, indices tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, lr_power tf.Output, optional ...ResourceSparseApplyFtrlAttr) (o *tf.Operation) {
+// DEPRECATED at GraphDef version 22: Replaced by QuantizeAndDequantizeV2
+func QuantizeAndDequantize(scope *Scope, input tf.Output, optional ...QuantizeAndDequantizeAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -6091,327 +6265,290 @@ func ResourceSparseApplyFtrl(scope *Scope, var_ tf.Output, accum tf.Output, line
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyFtrl",
+		Type: "QuantizeAndDequantize",
 		Input: []tf.Input{
-			var_, accum, linear, grad, indices, lr, l1, l2, lr_power,
+			input,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
-}
-
-// Returns which elements of x are Inf.
-//
-// @compatibility(numpy)
-// Equivalent to np.isinf
-// @end_compatibility
-func IsInf(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "IsInf",
-		Input: []tf.Input{
-			x,
-		},
-	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the sum along sparse segments of a tensor divided by the sqrt of N.
-//
-// N is the size of the segment being reduced.
+// Returns locations of nonzero / true values in a tensor.
 //
-// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
-// segments.
+// This operation returns the coordinates of true elements in `condition`. The
+// coordinates are returned in a 2-D tensor where the first dimension (rows)
+// represents the number of true elements, and the second dimension (columns)
+// represents the coordinates of the true elements. Keep in mind, the shape of
+// the output tensor can vary depending on how many true values there are in
+// `condition`. Indices are output in row-major order.
 //
-// Arguments:
+// For example:
 //
-//	indices: A 1-D tensor. Has same rank as `segment_ids`.
-//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
+// ```
+// # 'input' tensor is [[True, False]
+// #                    [True, False]]
+// # 'input' has two true values, so output has two coordinates.
+// # 'input' has rank of 2, so coordinates have two indices.
+// where(input) ==> [[0, 0],
+//                   [1, 0]]
 //
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SparseSegmentSqrtN(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SparseSegmentSqrtN",
-		Input: []tf.Input{
-			data, indices, segment_ids,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// StatelessTruncatedNormalAttr is an optional argument to StatelessTruncatedNormal.
-type StatelessTruncatedNormalAttr func(optionalAttr)
-
-// StatelessTruncatedNormalDtype sets the optional dtype attribute to value.
-//
-// value: The type of the output.
-// If not specified, defaults to DT_FLOAT
-func StatelessTruncatedNormalDtype(value tf.DataType) StatelessTruncatedNormalAttr {
-	return func(m optionalAttr) {
-		m["dtype"] = value
-	}
-}
-
-// Outputs deterministic pseudorandom values from a truncated normal distribution.
-//
-// The generated values follow a normal distribution with mean 0 and standard
-// deviation 1, except that values whose magnitude is more than 2 standard
-// deviations from the mean are dropped and re-picked.
-//
-// The outputs are a deterministic function of `shape` and `seed`.
+// # `condition` tensor is [[[True, False]
+// #                     [True, False]]
+// #                    [[False, True]
+// #                     [False, True]]
+// #                    [[False, False]
+// #                     [False, True]]]
+// # 'input' has 5 true values, so output has 5 coordinates.
+// # 'input' has rank of 3, so coordinates have three indices.
+// where(input) ==> [[0, 0, 0],
+//                   [0, 1, 0],
+//                   [1, 0, 1],
+//                   [1, 1, 1],
+//                   [2, 1, 1]]
 //
-// Arguments:
-//	shape: The shape of the output tensor.
-//	seed: 2 seeds (shape [2]).
+// # `condition` tensor is [[[1.5,  0.0]
+// #                     [-0.5, 0.0]]
+// #                    [[0.0,  0.25]
+// #                     [0.0,  0.75]]
+// #                    [[0.0,  0.0]
+// #                     [0.0,  0.01]]]
+// # 'input' has 5 nonzero values, so output has 5 coordinates.
+// # 'input' has rank of 3, so coordinates have three indices.
+// where(input) ==> [[0, 0, 0],
+//                   [0, 1, 0],
+//                   [1, 0, 1],
+//                   [1, 1, 1],
+//                   [2, 1, 1]]
 //
-// Returns Random values with specified shape.
-func StatelessTruncatedNormal(scope *Scope, shape tf.Output, seed tf.Output, optional ...StatelessTruncatedNormalAttr) (output tf.Output) {
+// # `condition` tensor is [[[1.5 + 0.0j, 0.0  + 0.0j]
+// #                     [0.0 + 0.5j, 0.0  + 0.0j]]
+// #                    [[0.0 + 0.0j, 0.25 + 1.5j]
+// #                     [0.0 + 0.0j, 0.75 + 0.0j]]
+// #                    [[0.0 + 0.0j, 0.0  + 0.0j]
+// #                     [0.0 + 0.0j, 0.01 + 0.0j]]]
+// # 'input' has 5 nonzero magnitude values, so output has 5 coordinates.
+// # 'input' has rank of 3, so coordinates have three indices.
+// where(input) ==> [[0, 0, 0],
+//                   [0, 1, 0],
+//                   [1, 0, 1],
+//                   [1, 1, 1],
+//                   [2, 1, 1]]
+// ```
+func Where(scope *Scope, condition tf.Output) (index tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "StatelessTruncatedNormal",
+		Type: "Where",
 		Input: []tf.Input{
-			shape, seed,
+			condition,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// RestoreSliceAttr is an optional argument to RestoreSlice.
-type RestoreSliceAttr func(optionalAttr)
+// QueueDequeueV2Attr is an optional argument to QueueDequeueV2.
+type QueueDequeueV2Attr func(optionalAttr)
 
-// RestoreSlicePreferredShard sets the optional preferred_shard attribute to value.
+// QueueDequeueV2TimeoutMs sets the optional timeout_ms attribute to value.
 //
-// value: Index of file to open first if multiple files match
-// `file_pattern`. See the documentation for `Restore`.
+// value: If the queue is empty, this operation will block for up to
+// timeout_ms milliseconds.
+// Note: This option is not supported yet.
 // If not specified, defaults to -1
-func RestoreSlicePreferredShard(value int64) RestoreSliceAttr {
+func QueueDequeueV2TimeoutMs(value int64) QueueDequeueV2Attr {
 	return func(m optionalAttr) {
-		m["preferred_shard"] = value
+		m["timeout_ms"] = value
 	}
 }
 
-// Restores a tensor from checkpoint files.
+// Dequeues a tuple of one or more tensors from the given queue.
 //
-// This is like `Restore` except that restored tensor can be listed as filling
-// only a slice of a larger tensor.  `shape_and_slice` specifies the shape of the
-// larger tensor and the slice that the restored tensor covers.
+// This operation has k outputs, where k is the number of components
+// in the tuples stored in the given queue, and output i is the ith
+// component of the dequeued tuple.
 //
-// The `shape_and_slice` input has the same format as the
-// elements of the `shapes_and_slices` input of the `SaveSlices` op.
+// N.B. If the queue is empty, this operation will block until an element
+// has been dequeued (or 'timeout_ms' elapses, if specified).
 //
 // Arguments:
-//	file_pattern: Must have a single element. The pattern of the files from
-// which we read the tensor.
-//	tensor_name: Must have a single element. The name of the tensor to be
-// restored.
-//	shape_and_slice: Scalar. The shapes and slice specifications to use when
-// restoring a tensors.
-//	dt: The type of the tensor to be restored.
+//	handle: The handle to a queue.
+//	component_types: The type of each component in a tuple.
 //
-// Returns The restored tensor.
-func RestoreSlice(scope *Scope, file_pattern tf.Output, tensor_name tf.Output, shape_and_slice tf.Output, dt tf.DataType, optional ...RestoreSliceAttr) (tensor tf.Output) {
+// Returns One or more tensors that were dequeued as a tuple.
+func QueueDequeueV2(scope *Scope, handle tf.Output, component_types []tf.DataType, optional ...QueueDequeueV2Attr) (components []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dt": dt}
+	attrs := map[string]interface{}{"component_types": component_types}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "RestoreSlice",
+		Type: "QueueDequeueV2",
 		Input: []tf.Input{
-			file_pattern, tensor_name, shape_and_slice,
+			handle,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// ImagAttr is an optional argument to Imag.
-type ImagAttr func(optionalAttr)
-
-// ImagTout sets the optional Tout attribute to value.
-// If not specified, defaults to DT_FLOAT
-func ImagTout(value tf.DataType) ImagAttr {
-	return func(m optionalAttr) {
-		m["Tout"] = value
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
+		scope.UpdateErr("QueueDequeueV2", err)
+		return
 	}
+	return components
 }
 
-// Returns the imaginary part of a complex number.
-//
-// Given a tensor `input` of complex numbers, this operation returns a tensor of
-// type `float` that is the imaginary part of each element in `input`. All
-// elements in `input` must be complex numbers of the form \\(a + bj\\), where *a*
-// is the real part and *b* is the imaginary part returned by this operation.
-//
-// For example:
-//
-// ```
-// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
-// tf.imag(input) ==> [4.75, 5.75]
-// ```
-func Imag(scope *Scope, input tf.Output, optional ...ImagAttr) (output tf.Output) {
+// Computes the Gauss error function of `x` element-wise.
+func Erf(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "Imag",
+		Type: "Erf",
 		Input: []tf.Input{
-			input,
+			x,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ComplexAttr is an optional argument to Complex.
-type ComplexAttr func(optionalAttr)
-
-// ComplexTout sets the optional Tout attribute to value.
-// If not specified, defaults to DT_COMPLEX64
-func ComplexTout(value tf.DataType) ComplexAttr {
-	return func(m optionalAttr) {
-		m["Tout"] = value
-	}
-}
-
-// Converts two real numbers to a complex number.
-//
-// Given a tensor `real` representing the real part of a complex number, and a
-// tensor `imag` representing the imaginary part of a complex number, this
-// operation returns complex numbers elementwise of the form \\(a + bj\\), where
-// *a* represents the `real` part and *b* represents the `imag` part.
-//
-// The input tensors `real` and `imag` must have the same shape.
-//
-// For example:
-//
-// ```
-// # tensor 'real' is [2.25, 3.25]
-// # tensor `imag` is [4.75, 5.75]
-// tf.complex(real, imag) ==> [[2.25 + 4.75j], [3.25 + 5.75j]]
-// ```
-func Complex(scope *Scope, real tf.Output, imag tf.Output, optional ...ComplexAttr) (out tf.Output) {
+// Returns element-wise largest integer not greater than x.
+func Floor(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "Complex",
+		Type: "Floor",
 		Input: []tf.Input{
-			real, imag,
+			x,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// UniqueWithCountsAttr is an optional argument to UniqueWithCounts.
-type UniqueWithCountsAttr func(optionalAttr)
+// OneHotAttr is an optional argument to OneHot.
+type OneHotAttr func(optionalAttr)
 
-// UniqueWithCountsOutIdx sets the optional out_idx attribute to value.
-// If not specified, defaults to DT_INT32
-func UniqueWithCountsOutIdx(value tf.DataType) UniqueWithCountsAttr {
+// OneHotAxis sets the optional axis attribute to value.
+//
+// value: The axis to fill (default: -1, a new inner-most axis).
+// If not specified, defaults to -1
+func OneHotAxis(value int64) OneHotAttr {
 	return func(m optionalAttr) {
-		m["out_idx"] = value
+		m["axis"] = value
 	}
 }
 
-// Finds unique elements in a 1-D tensor.
+// Returns a one-hot tensor.
 //
-// This operation returns a tensor `y` containing all of the unique elements of `x`
-// sorted in the same order that they occur in `x`. This operation also returns a
-// tensor `idx` the same size as `x` that contains the index of each value of `x`
-// in the unique output `y`. Finally, it returns a third tensor `count` that
-// contains the count of each element of `y` in `x`. In other words:
+// The locations represented by indices in `indices` take value `on_value`,
+// while all other locations take value `off_value`.
 //
-// `y[idx[i]] = x[i] for i in [0, 1,...,rank(x) - 1]`
+// If the input `indices` is rank `N`, the output will have rank `N+1`,
+// The new axis is created at dimension `axis` (default: the new axis is
+// appended at the end).
 //
-// For example:
+// If `indices` is a scalar the output shape will be a vector of length `depth`.
 //
+// If `indices` is a vector of length `features`, the output shape will be:
 // ```
-// # tensor 'x' is [1, 1, 2, 4, 4, 4, 7, 8, 8]
-// y, idx, count = unique_with_counts(x)
-// y ==> [1, 2, 4, 7, 8]
-// idx ==> [0, 0, 1, 2, 2, 2, 3, 4, 4]
-// count ==> [2, 1, 3, 1, 2]
+//   features x depth if axis == -1
+//   depth x features if axis == 0
 // ```
 //
-// Arguments:
-//	x: 1-D.
+// If `indices` is a matrix (batch) with shape `[batch, features]`,
+// the output shape will be:
+// ```
+//   batch x features x depth if axis == -1
+//   batch x depth x features if axis == 1
+//   depth x batch x features if axis == 0
+// ```
 //
-// Returns 1-D.1-D.1-D.
-func UniqueWithCounts(scope *Scope, x tf.Output, optional ...UniqueWithCountsAttr) (y tf.Output, idx tf.Output, count tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "UniqueWithCounts",
-		Input: []tf.Input{
-			x,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// StatelessRandomNormalAttr is an optional argument to StatelessRandomNormal.
-type StatelessRandomNormalAttr func(optionalAttr)
-
-// StatelessRandomNormalDtype sets the optional dtype attribute to value.
 //
-// value: The type of the output.
-// If not specified, defaults to DT_FLOAT
-func StatelessRandomNormalDtype(value tf.DataType) StatelessRandomNormalAttr {
-	return func(m optionalAttr) {
-		m["dtype"] = value
-	}
-}
-
-// Outputs deterministic pseudorandom values from a normal distribution.
+// Examples
+// =========
 //
-// The generated values will have mean 0 and standard deviation 1.
+// Suppose that
 //
-// The outputs are a deterministic function of `shape` and `seed`.
+// ```
+//   indices = [0, 2, -1, 1]
+//   depth = 3
+//   on_value = 5.0
+//   off_value = 0.0
+//   axis = -1
+// ```
+//
+// Then output is `[4 x 3]`:
+//
+//     ```output =
+//       [5.0 0.0 0.0]  // one_hot(0)
+//       [0.0 0.0 5.0]  // one_hot(2)
+//       [0.0 0.0 0.0]  // one_hot(-1)
+//       [0.0 5.0 0.0]  // one_hot(1)
+//     ```
+//
+// Suppose that
+//
+// ```
+//   indices = [0, 2, -1, 1]
+//   depth = 3
+//   on_value = 0.0
+//   off_value = 3.0
+//   axis = 0
+// ```
+//
+// Then output is `[3 x 4]`:
+//
+//     ```output =
+//       [0.0 3.0 3.0 3.0]
+//       [3.0 3.0 3.0 0.0]
+//       [3.0 3.0 3.0 3.0]
+//       [3.0 0.0 3.0 3.0]
+//     //  ^                one_hot(0)
+//     //      ^            one_hot(2)
+//     //          ^        one_hot(-1)
+//     //              ^    one_hot(1)
+//     ```
+// Suppose that
+//
+// ```
+//   indices = [[0, 2], [1, -1]]
+//   depth = 3
+//   on_value = 1.0
+//   off_value = 0.0
+//   axis = -1
+// ```
+//
+// Then output is `[2 x 2 x 3]`:
+//
+//     ```output =
+//       [
+//         [1.0, 0.0, 0.0]  // one_hot(0)
+//         [0.0, 0.0, 1.0]  // one_hot(2)
+//       ][
+//         [0.0, 1.0, 0.0]  // one_hot(1)
+//         [0.0, 0.0, 0.0]  // one_hot(-1)
+//       ]```
 //
 // Arguments:
-//	shape: The shape of the output tensor.
-//	seed: 2 seeds (shape [2]).
+//	indices: A tensor of indices.
+//	depth: A scalar defining the depth of the one hot dimension.
+//	on_value: A scalar defining the value to fill in output when `indices[j] = i`.
+//	off_value: A scalar defining the value to fill in output when `indices[j] != i`.
 //
-// Returns Random values with specified shape.
-func StatelessRandomNormal(scope *Scope, shape tf.Output, seed tf.Output, optional ...StatelessRandomNormalAttr) (output tf.Output) {
+// Returns The one-hot tensor.
+func OneHot(scope *Scope, indices tf.Output, depth tf.Output, on_value tf.Output, off_value tf.Output, optional ...OneHotAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -6420,9 +6557,9 @@ func StatelessRandomNormal(scope *Scope, shape tf.Output, seed tf.Output, option
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "StatelessRandomNormal",
+		Type: "OneHot",
 		Input: []tf.Input{
-			shape, seed,
+			indices, depth, on_value, off_value,
 		},
 		Attrs: attrs,
 	}
@@ -6430,81 +6567,107 @@ func StatelessRandomNormal(scope *Scope, shape tf.Output, seed tf.Output, option
 	return op.Output(0)
 }
 
-// Reshapes a quantized tensor as per the Reshape op.
+// Real-valued fast Fourier transform.
 //
-// ```
+// Computes the 1-dimensional discrete Fourier transform of a real-valued signal
+// over the inner-most dimension of `input`.
+//
+// Since the DFT of a real signal is Hermitian-symmetric, `RFFT` only returns the
+// `fft_length / 2 + 1` unique components of the FFT: the zero-frequency term,
+// followed by the `fft_length / 2` positive-frequency terms.
+//
+// Along the axis `RFFT` is computed on, if `fft_length` is smaller than the
+// corresponding dimension of `input`, the dimension is cropped. If it is larger,
+// the dimension is padded with zeros.
 //
 // Arguments:
+//	input: A float32 tensor.
+//	fft_length: An int32 tensor of shape [1]. The FFT length.
 //
-//	shape: Defines the shape of the output tensor.
-//	input_min: The minimum value of the input.
-//	input_max: The maximum value of the input.
+// Returns A complex64 tensor of the same rank as `input`. The inner-most
+//   dimension of `input` is replaced with the `fft_length / 2 + 1` unique
+//   frequency components of its 1D Fourier transform.
 //
-// Returns This value is copied from input_min.This value is copied from input_max.
-func QuantizedReshape(scope *Scope, tensor tf.Output, shape tf.Output, input_min tf.Output, input_max tf.Output) (output tf.Output, output_min tf.Output, output_max tf.Output) {
+// @compatibility(numpy)
+// Equivalent to np.fft.rfft
+// @end_compatibility
+func RFFT(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "QuantizedReshape",
+		Type: "RFFT",
 		Input: []tf.Input{
-			tensor, shape, input_min, input_max,
+			input, fft_length,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// GatherAttr is an optional argument to Gather.
-type GatherAttr func(optionalAttr)
+// DepthwiseConv2dNativeBackpropFilterAttr is an optional argument to DepthwiseConv2dNativeBackpropFilter.
+type DepthwiseConv2dNativeBackpropFilterAttr func(optionalAttr)
 
-// GatherValidateIndices sets the optional validate_indices attribute to value.
-// If not specified, defaults to true
-func GatherValidateIndices(value bool) GatherAttr {
+// DepthwiseConv2dNativeBackpropFilterDataFormat sets the optional data_format attribute to value.
+//
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, height, width, channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, channels, height, width].
+// If not specified, defaults to "NHWC"
+func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2dNativeBackpropFilterAttr {
 	return func(m optionalAttr) {
-		m["validate_indices"] = value
+		m["data_format"] = value
 	}
 }
 
-// Gather slices from `params` according to `indices`.
-//
-// `indices` must be an integer tensor of any dimension (usually 0-D or 1-D).
-// Produces an output tensor with shape `indices.shape + params.shape[1:]` where:
-//
-// ```python
-//     # Scalar indices
-//     output[:, ..., :] = params[indices, :, ... :]
-//
-//     # Vector indices
-//     output[i, :, ..., :] = params[indices[i], :, ... :]
-//
-//     # Higher rank indices
-//     output[i, ..., j, :, ... :] = params[indices[i, ..., j], :, ..., :]
-// ```
+// DepthwiseConv2dNativeBackpropFilterDilations sets the optional dilations attribute to value.
 //
-// If `indices` is a permutation and `len(indices) == params.shape[0]` then
-// this operation will permute `params` accordingly.
+// value: 1-D tensor of length 4.  The dilation factor for each dimension of
+// `input`. If set to k > 1, there will be k-1 skipped cells between each filter
+// element on that dimension. The dimension order is determined by the value of
+// `data_format`, see above for details. Dilations in the batch and depth
+// dimensions must be 1.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 >
+func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr {
+	return func(m optionalAttr) {
+		m["dilations"] = value
+	}
+}
+
+// Computes the gradients of depthwise convolution with respect to the filter.
 //
-// `validate_indices`: DEPRECATED. If this operation is assigned to CPU, values in
-// `indices` are always validated to be within range. If assigned to GPU,
-// out-of-bound indices result in safe but unspecified behavior, which may include
-// raising an error.
+// Arguments:
+//	input: 4-D with shape based on `data_format`.  For example, if
+// `data_format` is 'NHWC' then `input` is a 4-D `[batch, in_height,
+// in_width, in_channels]` tensor.
+//	filter_sizes: An integer vector representing the tensor shape of `filter`,
+// where `filter` is a 4-D
+// `[filter_height, filter_width, in_channels, depthwise_multiplier]` tensor.
+//	out_backprop: 4-D with shape  based on `data_format`.
+// For example, if `data_format` is 'NHWC' then
+// out_backprop shape is `[batch, out_height, out_width, out_channels]`.
+// Gradients w.r.t. the output of the convolution.
+//	strides: The stride of the sliding window for each dimension of the input
+// of the convolution.
+//	padding: The type of padding algorithm to use.
 //
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/Gather.png" alt>
-// </div>
-func Gather(scope *Scope, params tf.Output, indices tf.Output, optional ...GatherAttr) (output tf.Output) {
+// Returns 4-D with shape
+// `[filter_height, filter_width, in_channels, out_channels]`.  Gradient w.r.t.
+// the `filter` input of the convolution.
+func DepthwiseConv2dNativeBackpropFilter(scope *Scope, input tf.Output, filter_sizes tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...DepthwiseConv2dNativeBackpropFilterAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Gather",
+		Type: "DepthwiseConv2dNativeBackpropFilter",
 		Input: []tf.Input{
-			params, indices,
+			input, filter_sizes, out_backprop,
 		},
 		Attrs: attrs,
 	}
@@ -6512,110 +6675,103 @@ func Gather(scope *Scope, params tf.Output, indices tf.Output, optional ...Gathe
 	return op.Output(0)
 }
 
-// Returns the truth value of (x != y) element-wise.
+// LRNGradAttr is an optional argument to LRNGrad.
+type LRNGradAttr func(optionalAttr)
+
+// LRNGradDepthRadius sets the optional depth_radius attribute to value.
 //
-// *NOTE*: `NotEqual` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func NotEqual(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "NotEqual",
-		Input: []tf.Input{
-			x, y,
-		},
+// value: A depth radius.
+// If not specified, defaults to 5
+func LRNGradDepthRadius(value int64) LRNGradAttr {
+	return func(m optionalAttr) {
+		m["depth_radius"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Inverse 3D real-valued fast Fourier transform.
+// LRNGradBias sets the optional bias attribute to value.
 //
-// Computes the inverse 3-dimensional discrete Fourier transform of a real-valued
-// signal over the inner-most 3 dimensions of `input`.
+// value: An offset (usually > 0 to avoid dividing by 0).
+// If not specified, defaults to 1
+func LRNGradBias(value float32) LRNGradAttr {
+	return func(m optionalAttr) {
+		m["bias"] = value
+	}
+}
+
+// LRNGradAlpha sets the optional alpha attribute to value.
 //
-// The inner-most 3 dimensions of `input` are assumed to be the result of `RFFT3D`:
-// The inner-most dimension contains the `fft_length / 2 + 1` unique components of
-// the DFT of a real-valued signal. If `fft_length` is not provided, it is computed
-// from the size of the inner-most 3 dimensions of `input`. If the FFT length used
-// to compute `input` is odd, it should be provided since it cannot be inferred
-// properly.
+// value: A scale factor, usually positive.
+// If not specified, defaults to 1
+func LRNGradAlpha(value float32) LRNGradAttr {
+	return func(m optionalAttr) {
+		m["alpha"] = value
+	}
+}
+
+// LRNGradBeta sets the optional beta attribute to value.
 //
-// Along each axis `IRFFT3D` is computed on, if `fft_length` (or
-// `fft_length / 2 + 1` for the inner-most dimension) is smaller than the
-// corresponding dimension of `input`, the dimension is cropped. If it is larger,
-// the dimension is padded with zeros.
+// value: An exponent.
+// If not specified, defaults to 0.5
+func LRNGradBeta(value float32) LRNGradAttr {
+	return func(m optionalAttr) {
+		m["beta"] = value
+	}
+}
+
+// Gradients for Local Response Normalization.
 //
 // Arguments:
-//	input: A complex64 tensor.
-//	fft_length: An int32 tensor of shape [3]. The FFT length for each dimension.
-//
-// Returns A float32 tensor of the same rank as `input`. The inner-most 3
-//   dimensions of `input` are replaced with the `fft_length` samples of their
-//   inverse 3D real Fourier transform.
+//	input_grads: 4-D with shape `[batch, height, width, channels]`.
+//	input_image: 4-D with shape `[batch, height, width, channels]`.
+//	output_image: 4-D with shape `[batch, height, width, channels]`.
 //
-// @compatibility(numpy)
-// Equivalent to np.irfftn with 3 dimensions.
-// @end_compatibility
-func IRFFT3D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
+// Returns The gradients for LRN.
+func LRNGrad(scope *Scope, input_grads tf.Output, input_image tf.Output, output_image tf.Output, optional ...LRNGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "IRFFT3D",
+		Type: "LRNGrad",
 		Input: []tf.Input{
-			input, fft_length,
+			input_grads, input_image, output_image,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// StringSplitAttr is an optional argument to StringSplit.
-type StringSplitAttr func(optionalAttr)
+// AnyAttr is an optional argument to Any.
+type AnyAttr func(optionalAttr)
 
-// StringSplitSkipEmpty sets the optional skip_empty attribute to value.
+// AnyKeepDims sets the optional keep_dims attribute to value.
 //
-// value: A `bool`. If `True`, skip the empty strings from the result.
-// If not specified, defaults to true
-func StringSplitSkipEmpty(value bool) StringSplitAttr {
+// value: If true, retain reduced dimensions with length 1.
+// If not specified, defaults to false
+func AnyKeepDims(value bool) AnyAttr {
 	return func(m optionalAttr) {
-		m["skip_empty"] = value
+		m["keep_dims"] = value
 	}
 }
 
-// Split elements of `input` based on `delimiter` into a `SparseTensor`.
-//
-// Let N be the size of source (typically N will be the batch size). Split each
-// element of `input` based on `delimiter` and return a `SparseTensor`
-// containing the splitted tokens. Empty tokens are ignored.
-//
-// `delimiter` can be empty, or a string of split characters. If `delimiter` is an
-//  empty string, each element of `input` is split into individual single-byte
-//  character strings, including splitting of UTF-8 multibyte sequences. Otherwise
-//  every character of `delimiter` is a potential split point.
-//
-// For example:
-//   N = 2, input[0] is 'hello world' and input[1] is 'a b c', then the output
-//   will be
+// Computes the "logical or" of elements across dimensions of a tensor.
 //
-//   indices = [0, 0;
-//              0, 1;
-//              1, 0;
-//              1, 1;
-//              1, 2]
-//   shape = [2, 3]
-//   values = ['hello', 'world', 'a', 'b', 'c']
+// Reduces `input` along the dimensions given in `axis`. Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `axis`. If `keep_dims` is true, the reduced dimensions are
+// retained with length 1.
 //
 // Arguments:
-//	input: 1-D. Strings to split.
-//	delimiter: 0-D. Delimiter characters (bytes), or empty string.
+//	input: The tensor to reduce.
+//	axis: The dimensions to reduce. Must be in the range
+// `[-rank(input), rank(input))`.
 //
-// Returns A dense matrix of int64 representing the indices of the sparse tensor.A vector of strings corresponding to the splited values.a length-2 vector of int64 representing the shape of the sparse
-// tensor, where the first value is N and the second value is the maximum number
-// of tokens in a single input entry.
-func StringSplit(scope *Scope, input tf.Output, delimiter tf.Output, optional ...StringSplitAttr) (indices tf.Output, values tf.Output, shape tf.Output) {
+// Returns The reduced tensor.
+func Any(scope *Scope, input tf.Output, axis tf.Output, optional ...AnyAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -6624,43 +6780,51 @@ func StringSplit(scope *Scope, input tf.Output, delimiter tf.Output, optional ..
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "StringSplit",
+		Type: "Any",
 		Input: []tf.Input{
-			input, delimiter,
+			input, axis,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// ResizeBilinearAttr is an optional argument to ResizeBilinear.
-type ResizeBilinearAttr func(optionalAttr)
+// ResourceApplyFtrlAttr is an optional argument to ResourceApplyFtrl.
+type ResourceApplyFtrlAttr func(optionalAttr)
 
-// ResizeBilinearAlignCorners sets the optional align_corners attribute to value.
+// ResourceApplyFtrlUseLocking sets the optional use_locking attribute to value.
 //
-// value: If true, rescale input by (new_height - 1) / (height - 1), which
-// exactly aligns the 4 corners of images and resized images. If false, rescale
-// by new_height / height. Treat similarly the width dimension.
+// value: If `True`, updating of the var and accum tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
 // If not specified, defaults to false
-func ResizeBilinearAlignCorners(value bool) ResizeBilinearAttr {
+func ResourceApplyFtrlUseLocking(value bool) ResourceApplyFtrlAttr {
 	return func(m optionalAttr) {
-		m["align_corners"] = value
+		m["use_locking"] = value
 	}
 }
 
-// Resize `images` to `size` using bilinear interpolation.
+// Update '*var' according to the Ftrl-proximal scheme.
 //
-// Input images can be of different types but output images are always float.
+// accum_new = accum + grad * grad
+// linear += grad - (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
+// quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
+// var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
+// accum = accum_new
 //
 // Arguments:
-//	images: 4-D with shape `[batch, height, width, channels]`.
-//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
-// new size for the images.
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	linear: Should be from a Variable().
+//	grad: The gradient.
+//	lr: Scaling factor. Must be a scalar.
+//	l1: L1 regulariation. Must be a scalar.
+//	l2: L2 regulariation. Must be a scalar.
+//	lr_power: Scaling factor. Must be a scalar.
 //
-// Returns 4-D with shape
-// `[batch, new_height, new_width, channels]`.
-func ResizeBilinear(scope *Scope, images tf.Output, size tf.Output, optional ...ResizeBilinearAttr) (resized_images tf.Output) {
+// Returns the created operation.
+func ResourceApplyFtrl(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, lr_power tf.Output, optional ...ResourceApplyFtrlAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -6669,363 +6833,279 @@ func ResizeBilinear(scope *Scope, images tf.Output, size tf.Output, optional ...
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResizeBilinear",
+		Type: "ResourceApplyFtrl",
 		Input: []tf.Input{
-			images, size,
+			var_, accum, linear, grad, lr, l1, l2, lr_power,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Computes softsign: `features / (abs(features) + 1)`.
-func Softsign(scope *Scope, features tf.Output) (activations tf.Output) {
+// RandomUniformAttr is an optional argument to RandomUniform.
+type RandomUniformAttr func(optionalAttr)
+
+// RandomUniformSeed sets the optional seed attribute to value.
+//
+// value: If either `seed` or `seed2` are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func RandomUniformSeed(value int64) RandomUniformAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// RandomUniformSeed2 sets the optional seed2 attribute to value.
+//
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func RandomUniformSeed2(value int64) RandomUniformAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Outputs random values from a uniform distribution.
+//
+// The generated values follow a uniform distribution in the range `[0, 1)`. The
+// lower bound 0 is included in the range, while the upper bound 1 is excluded.
+//
+// Arguments:
+//	shape: The shape of the output tensor.
+//	dtype: The type of the output.
+//
+// Returns A tensor of the specified shape filled with uniform random values.
+func RandomUniform(scope *Scope, shape tf.Output, dtype tf.DataType, optional ...RandomUniformAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"dtype": dtype}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Softsign",
+		Type: "RandomUniform",
 		Input: []tf.Input{
-			features,
+			shape,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// GenerateVocabRemappingAttr is an optional argument to GenerateVocabRemapping.
-type GenerateVocabRemappingAttr func(optionalAttr)
+// AssertAttr is an optional argument to Assert.
+type AssertAttr func(optionalAttr)
 
-// GenerateVocabRemappingOldVocabSize sets the optional old_vocab_size attribute to value.
-//
-// value: Number of entries in the old vocab file to consider.  If -1,
-// use the entire old vocabulary.
-// If not specified, defaults to -1
+// AssertSummarize sets the optional summarize attribute to value.
 //
-// REQUIRES: value >= -1
-func GenerateVocabRemappingOldVocabSize(value int64) GenerateVocabRemappingAttr {
+// value: Print this many entries of each tensor.
+// If not specified, defaults to 3
+func AssertSummarize(value int64) AssertAttr {
 	return func(m optionalAttr) {
-		m["old_vocab_size"] = value
+		m["summarize"] = value
 	}
 }
 
-// Given a path to new and old vocabulary files, returns a remapping Tensor of
-//
-// length `num_new_vocab`, where `remapping[i]` contains the row number in the old
-// vocabulary that corresponds to row `i` in the new vocabulary (starting at line
-// `new_vocab_offset` and up to `num_new_vocab` entities), or `-1` if entry `i`
-// in the new vocabulary is not in the old vocabulary.  The old vocabulary is
-// constrained to the first `old_vocab_size` entries if `old_vocab_size` is not the
-// default value of -1.
+// Asserts that the given condition is true.
 //
-// `num_vocab_offset` enables
-// use in the partitioned variable case, and should generally be set through
-// examining partitioning info.  The format of the files should be a text file,
-// with each line containing a single entity within the vocabulary.
+// If `condition` evaluates to false, print the list of tensors in `data`.
+// `summarize` determines how many entries of the tensors to print.
 //
-// For example, with `new_vocab_file` a text file containing each of the following
-// elements on a single line: `[f0, f1, f2, f3]`, old_vocab_file = [f1, f0, f3],
-// `num_new_vocab = 3, new_vocab_offset = 1`, the returned remapping would be
-// `[0, -1, 2]`.
+// Arguments:
+//	condition: The condition to evaluate.
+//	data: The tensors to print out when condition is false.
 //
-// The op also returns a count of how many entries in the new vocabulary
-// were present in the old vocabulary, which is used to calculate the number of
-// values to initialize in a weight matrix remapping
-//
-// This functionality can be used to remap both row vocabularies (typically,
-// features) and column vocabularies (typically, classes) from TensorFlow
-// checkpoints.  Note that the partitioning logic relies on contiguous vocabularies
-// corresponding to div-partitioned variables.  Moreover, the underlying remapping
-// uses an IndexTable (as opposed to an inexact CuckooTable), so client code should
-// use the corresponding index_table_from_file() as the FeatureColumn framework
-// does (as opposed to tf.feature_to_id(), which uses a CuckooTable).
-//
-// Arguments:
-//	new_vocab_file: Path to the new vocab file.
-//	old_vocab_file: Path to the old vocab file.
-//	new_vocab_offset: How many entries into the new vocab file to start reading.
-//	num_new_vocab: Number of entries in the new vocab file to remap.
-//
-// Returns A Tensor of length num_new_vocab where the element at index i
-// is equal to the old ID that maps to the new ID i.  This element is -1 for any
-// new ID that is not found in the old vocabulary.Number of new vocab entries found in old vocab.
-func GenerateVocabRemapping(scope *Scope, new_vocab_file tf.Output, old_vocab_file tf.Output, new_vocab_offset int64, num_new_vocab int64, optional ...GenerateVocabRemappingAttr) (remapping tf.Output, num_present tf.Output) {
+// Returns the created operation.
+func Assert(scope *Scope, condition tf.Output, data []tf.Output, optional ...AssertAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"new_vocab_offset": new_vocab_offset, "num_new_vocab": num_new_vocab}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "GenerateVocabRemapping",
+		Type: "Assert",
 		Input: []tf.Input{
-			new_vocab_file, old_vocab_file,
+			condition, tf.OutputList(data),
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return scope.AddOperation(opspec)
 }
 
-// Assigns sparse updates to the variable referenced by `resource`.
-//
-// This operation computes
-//
-//     # Scalar indices
-//     ref[indices, ...] = updates[...]
-//
-//     # Vector indices (for each i)
-//     ref[indices[i], ...] = updates[i, ...]
-//
-//     # High rank indices (for each i, ..., j)
-//     ref[indices[i, ..., j], ...] = updates[i, ..., j, ...]
+// Computes element-wise population count (a.k.a. popcount, bitsum, bitcount).
 //
-// Arguments:
-//	resource: Should be from a `Variable` node.
-//	indices: A tensor of indices into the first dimension of `ref`.
-//	updates: A tensor of updated values to add to `ref`.
+// For each entry in `x`, calculates the number of `1` (on) bits in the binary
+// representation of that entry.
 //
-// Returns the created operation.
-func ResourceScatterUpdate(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
+// **NOTE**: It is more efficient to first `tf.bitcast` your tensors into
+// `int32` or `int64` and perform the bitcount on the result, than to feed in
+// 8- or 16-bit inputs and then aggregate the resulting counts.
+func PopulationCount(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceScatterUpdate",
+		Type: "PopulationCount",
 		Input: []tf.Input{
-			resource, indices, updates,
+			x,
 		},
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// AvgPoolGradAttr is an optional argument to AvgPoolGrad.
-type AvgPoolGradAttr func(optionalAttr)
-
-// AvgPoolGradDataFormat sets the optional data_format attribute to value.
+// Split a `SparseTensor` into `num_split` tensors along one dimension.
 //
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// If not specified, defaults to "NHWC"
-func AvgPoolGradDataFormat(value string) AvgPoolGradAttr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// Computes gradients of the average pooling function.
+// If the `shape[split_dim]` is not an integer multiple of `num_split`. Slices
+// `[0 : shape[split_dim] % num_split]` gets one extra dimension.
+// For example, if `split_dim = 1` and `num_split = 2` and the input is
+//
+//     input_tensor = shape = [2, 7]
+//     [    a   d e  ]
+//     [b c          ]
+//
+// Graphically the output tensors are:
+//
+//     output_tensor[0] = shape = [2, 4]
+//     [    a  ]
+//     [b c    ]
+//
+//     output_tensor[1] = shape = [2, 3]
+//     [ d e  ]
+//     [      ]
 //
 // Arguments:
-//	orig_input_shape: 1-D.  Shape of the original input to `avg_pool`.
-//	grad: 4-D with shape `[batch, height, width, channels]`.  Gradients w.r.t.
-// the output of `avg_pool`.
-//	ksize: The size of the sliding window for each dimension of the input.
-//	strides: The stride of the sliding window for each dimension of the input.
-//	padding: The type of padding algorithm to use.
+//	split_dim: 0-D.  The dimension along which to split.  Must be in the range
+// `[0, rank(shape))`.
+//	indices: 2-D tensor represents the indices of the sparse tensor.
+//	values: 1-D tensor represents the values of the sparse tensor.
+//	shape: 1-D. tensor represents the shape of the sparse tensor.
+// output indices: A list of 1-D tensors represents the indices of the output
+// sparse tensors.
+//	num_split: The number of ways to split.
 //
-// Returns 4-D.  Gradients w.r.t. the input of `avg_pool`.
-func AvgPoolGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...AvgPoolGradAttr) (output tf.Output) {
+// Returns A list of 1-D tensors represents the values of the output sparse
+// tensors.A list of 1-D tensors represents the shape of the output sparse
+// tensors.
+func SparseSplit(scope *Scope, split_dim tf.Output, indices tf.Output, values tf.Output, shape tf.Output, num_split int64) (output_indices []tf.Output, output_values []tf.Output, output_shape []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"num_split": num_split}
 	opspec := tf.OpSpec{
-		Type: "AvgPoolGrad",
+		Type: "SparseSplit",
 		Input: []tf.Input{
-			orig_input_shape, grad,
+			split_dim, indices, values, shape,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// StageClearAttr is an optional argument to StageClear.
-type StageClearAttr func(optionalAttr)
-
-// StageClearCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func StageClearCapacity(value int64) StageClearAttr {
-	return func(m optionalAttr) {
-		m["capacity"] = value
-	}
-}
-
-// StageClearMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func StageClearMemoryLimit(value int64) StageClearAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
-	}
-}
-
-// StageClearContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func StageClearContainer(value string) StageClearAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// StageClearSharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func StageClearSharedName(value string) StageClearAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// Op removes all elements in the underlying container.
-//
-// Returns the created operation.
-func StageClear(scope *Scope, dtypes []tf.DataType, optional ...StageClearAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
-	for _, a := range optional {
-		a(attrs)
+	var idx int
+	var err error
+	if output_indices, idx, err = makeOutputList(op, idx, "output_indices"); err != nil {
+		scope.UpdateErr("SparseSplit", err)
+		return
 	}
-	opspec := tf.OpSpec{
-		Type: "StageClear",
-
-		Attrs: attrs,
+	if output_values, idx, err = makeOutputList(op, idx, "output_values"); err != nil {
+		scope.UpdateErr("SparseSplit", err)
+		return
 	}
-	return scope.AddOperation(opspec)
+	if output_shape, idx, err = makeOutputList(op, idx, "output_shape"); err != nil {
+		scope.UpdateErr("SparseSplit", err)
+		return
+	}
+	return output_indices, output_values, output_shape
 }
 
-// ComputeAccidentalHitsAttr is an optional argument to ComputeAccidentalHits.
-type ComputeAccidentalHitsAttr func(optionalAttr)
+// RandomPoissonAttr is an optional argument to RandomPoisson.
+type RandomPoissonAttr func(optionalAttr)
 
-// ComputeAccidentalHitsSeed sets the optional seed attribute to value.
-//
-// value: If either seed or seed2 are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
+// RandomPoissonSeed sets the optional seed attribute to value.
 // If not specified, defaults to 0
-func ComputeAccidentalHitsSeed(value int64) ComputeAccidentalHitsAttr {
+func RandomPoissonSeed(value int64) RandomPoissonAttr {
 	return func(m optionalAttr) {
 		m["seed"] = value
 	}
 }
 
-// ComputeAccidentalHitsSeed2 sets the optional seed2 attribute to value.
-//
-// value: An second seed to avoid seed collision.
+// RandomPoissonSeed2 sets the optional seed2 attribute to value.
 // If not specified, defaults to 0
-func ComputeAccidentalHitsSeed2(value int64) ComputeAccidentalHitsAttr {
+func RandomPoissonSeed2(value int64) RandomPoissonAttr {
 	return func(m optionalAttr) {
 		m["seed2"] = value
 	}
 }
 
-// Computes the ids of the positions in sampled_candidates that match true_labels.
-//
-// When doing log-odds NCE, the result of this op should be passed through a
-// SparseToDense op, then added to the logits of the sampled candidates. This has
-// the effect of 'removing' the sampled labels that match the true labels by
-// making the classifier sure that they are sampled labels.
-//
-// Arguments:
-//	true_classes: The true_classes output of UnpackSparseLabels.
-//	sampled_candidates: The sampled_candidates output of CandidateSampler.
-//	num_true: Number of true labels per context.
+// Use RandomPoissonV2 instead.
 //
-// Returns A vector of indices corresponding to rows of true_candidates.A vector of IDs of positions in sampled_candidates that match a true_label
-// for the row with the corresponding index in indices.A vector of the same length as indices and ids, in which each element
-// is -FLOAT_MAX.
-func ComputeAccidentalHits(scope *Scope, true_classes tf.Output, sampled_candidates tf.Output, num_true int64, optional ...ComputeAccidentalHitsAttr) (indices tf.Output, ids tf.Output, weights tf.Output) {
+// DEPRECATED at GraphDef version 25: Replaced by RandomPoissonV2
+func RandomPoisson(scope *Scope, shape tf.Output, rate tf.Output, optional ...RandomPoissonAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_true": num_true}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ComputeAccidentalHits",
+		Type: "RandomPoisson",
 		Input: []tf.Input{
-			true_classes, sampled_candidates,
+			shape, rate,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// CumsumAttr is an optional argument to Cumsum.
-type CumsumAttr func(optionalAttr)
+// ResourceSparseApplyFtrlV2Attr is an optional argument to ResourceSparseApplyFtrlV2.
+type ResourceSparseApplyFtrlV2Attr func(optionalAttr)
 
-// CumsumExclusive sets the optional exclusive attribute to value.
+// ResourceSparseApplyFtrlV2UseLocking sets the optional use_locking attribute to value.
 //
-// value: If `True`, perform exclusive cumsum.
+// value: If `True`, updating of the var and accum tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
 // If not specified, defaults to false
-func CumsumExclusive(value bool) CumsumAttr {
+func ResourceSparseApplyFtrlV2UseLocking(value bool) ResourceSparseApplyFtrlV2Attr {
 	return func(m optionalAttr) {
-		m["exclusive"] = value
+		m["use_locking"] = value
 	}
 }
 
-// CumsumReverse sets the optional reverse attribute to value.
+// Update relevant entries in '*var' according to the Ftrl-proximal scheme.
 //
-// value: A `bool` (default: False).
-// If not specified, defaults to false
-func CumsumReverse(value bool) CumsumAttr {
-	return func(m optionalAttr) {
-		m["reverse"] = value
-	}
-}
-
-// Compute the cumulative sum of the tensor `x` along `axis`.
-//
-// By default, this op performs an inclusive cumsum, which means that the first
-// element of the input is identical to the first element of the output:
-//
-// ```python
-// tf.cumsum([a, b, c])  # => [a, a + b, a + b + c]
-// ```
-//
-// By setting the `exclusive` kwarg to `True`, an exclusive cumsum is
-// performed instead:
-//
-// ```python
-// tf.cumsum([a, b, c], exclusive=True)  # => [0, a, a + b]
-// ```
-//
-// By setting the `reverse` kwarg to `True`, the cumsum is performed in the
-// opposite direction:
-//
-// ```python
-// tf.cumsum([a, b, c], reverse=True)  # => [a + b + c, b + c, c]
-// ```
-//
-// This is more efficient than using separate `tf.reverse` ops.
+// That is for rows we have grad for, we update var, accum and linear as follows:
+// grad_with_shrinkage = grad + 2 * l2_shrinkage * var
+// accum_new = accum + grad_with_shrinkage * grad_with_shrinkage
+// linear += grad_with_shrinkage +
+//     (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
+// quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
+// var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
+// accum = accum_new
 //
-// The `reverse` and `exclusive` kwargs can also be combined:
+// Arguments:
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	linear: Should be from a Variable().
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var and accum.
+//	lr: Scaling factor. Must be a scalar.
+//	l1: L1 regularization. Must be a scalar.
+//	l2: L2 shrinkage regulariation. Must be a scalar.
 //
-// ```python
-// tf.cumsum([a, b, c], exclusive=True, reverse=True)  # => [b + c, c, 0]
-// ```
+//	lr_power: Scaling factor. Must be a scalar.
 //
-// Arguments:
-//	x: A `Tensor`. Must be one of the following types: `float32`, `float64`,
-// `int64`, `int32`, `uint8`, `uint16`, `int16`, `int8`, `complex64`,
-// `complex128`, `qint8`, `quint8`, `qint32`, `half`.
-//	axis: A `Tensor` of type `int32` (default: 0). Must be in the range
-// `[-rank(x), rank(x))`.
-func Cumsum(scope *Scope, x tf.Output, axis tf.Output, optional ...CumsumAttr) (out tf.Output) {
+// Returns the created operation.
+func ResourceSparseApplyFtrlV2(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, indices tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, l2_shrinkage tf.Output, lr_power tf.Output, optional ...ResourceSparseApplyFtrlV2Attr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -7034,167 +7114,102 @@ func Cumsum(scope *Scope, x tf.Output, axis tf.Output, optional ...CumsumAttr) (
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Cumsum",
+		Type: "ResourceSparseApplyFtrlV2",
 		Input: []tf.Input{
-			x, axis,
+			var_, accum, linear, grad, indices, lr, l1, l2, l2_shrinkage, lr_power,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// QuantizedRelu6Attr is an optional argument to QuantizedRelu6.
-type QuantizedRelu6Attr func(optionalAttr)
-
-// QuantizedRelu6OutType sets the optional out_type attribute to value.
-// If not specified, defaults to DT_QUINT8
-func QuantizedRelu6OutType(value tf.DataType) QuantizedRelu6Attr {
-	return func(m optionalAttr) {
-		m["out_type"] = value
-	}
+	return scope.AddOperation(opspec)
 }
 
-// Computes Quantized Rectified Linear 6: `min(max(features, 0), 6)`
-//
-// Arguments:
-//
-//	min_features: The float value that the lowest quantized value represents.
-//	max_features: The float value that the highest quantized value represents.
+// Associates the given iterator with the given statistics aggregator.
 //
-// Returns Has the same output shape as "features".The float value that the lowest quantized value represents.The float value that the highest quantized value represents.
-func QuantizedRelu6(scope *Scope, features tf.Output, min_features tf.Output, max_features tf.Output, optional ...QuantizedRelu6Attr) (activations tf.Output, min_activations tf.Output, max_activations tf.Output) {
+// Returns the created operation.
+func IteratorSetStatsAggregator(scope *Scope, iterator_handle tf.Output, stats_aggregator_handle tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "QuantizedRelu6",
+		Type: "IteratorSetStatsAggregator",
 		Input: []tf.Input{
-			features, min_features, max_features,
+			iterator_handle, stats_aggregator_handle,
 		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// FixedLengthRecordReaderV2Attr is an optional argument to FixedLengthRecordReaderV2.
-type FixedLengthRecordReaderV2Attr func(optionalAttr)
-
-// FixedLengthRecordReaderV2HeaderBytes sets the optional header_bytes attribute to value.
-//
-// value: Number of bytes in the header, defaults to 0.
-// If not specified, defaults to 0
-func FixedLengthRecordReaderV2HeaderBytes(value int64) FixedLengthRecordReaderV2Attr {
-	return func(m optionalAttr) {
-		m["header_bytes"] = value
-	}
-}
-
-// FixedLengthRecordReaderV2FooterBytes sets the optional footer_bytes attribute to value.
-//
-// value: Number of bytes in the footer, defaults to 0.
-// If not specified, defaults to 0
-func FixedLengthRecordReaderV2FooterBytes(value int64) FixedLengthRecordReaderV2Attr {
-	return func(m optionalAttr) {
-		m["footer_bytes"] = value
 	}
+	return scope.AddOperation(opspec)
 }
 
-// FixedLengthRecordReaderV2HopBytes sets the optional hop_bytes attribute to value.
-//
-// value: Number of bytes to hop before each read. Default of 0 means using
-// record_bytes.
-// If not specified, defaults to 0
-func FixedLengthRecordReaderV2HopBytes(value int64) FixedLengthRecordReaderV2Attr {
-	return func(m optionalAttr) {
-		m["hop_bytes"] = value
-	}
-}
+// DataFormatVecPermuteAttr is an optional argument to DataFormatVecPermute.
+type DataFormatVecPermuteAttr func(optionalAttr)
 
-// FixedLengthRecordReaderV2Container sets the optional container attribute to value.
+// DataFormatVecPermuteSrcFormat sets the optional src_format attribute to value.
 //
-// value: If non-empty, this reader is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func FixedLengthRecordReaderV2Container(value string) FixedLengthRecordReaderV2Attr {
+// value: source data format.
+// If not specified, defaults to "NHWC"
+func DataFormatVecPermuteSrcFormat(value string) DataFormatVecPermuteAttr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["src_format"] = value
 	}
 }
 
-// FixedLengthRecordReaderV2SharedName sets the optional shared_name attribute to value.
+// DataFormatVecPermuteDstFormat sets the optional dst_format attribute to value.
 //
-// value: If non-empty, this reader is named in the given bucket
-// with this shared_name. Otherwise, the node name is used instead.
-// If not specified, defaults to ""
-func FixedLengthRecordReaderV2SharedName(value string) FixedLengthRecordReaderV2Attr {
+// value: destination data format.
+// If not specified, defaults to "NCHW"
+func DataFormatVecPermuteDstFormat(value string) DataFormatVecPermuteAttr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["dst_format"] = value
 	}
 }
 
-// FixedLengthRecordReaderV2Encoding sets the optional encoding attribute to value.
+// Returns the permuted vector/tensor in the destination data format given the
 //
-// value: The type of encoding for the file. Currently ZLIB and GZIP
-// are supported. Defaults to none.
-// If not specified, defaults to ""
-func FixedLengthRecordReaderV2Encoding(value string) FixedLengthRecordReaderV2Attr {
-	return func(m optionalAttr) {
-		m["encoding"] = value
-	}
-}
-
-// A Reader that outputs fixed-length records from a file.
+// one in the source data format.
 //
 // Arguments:
-//	record_bytes: Number of bytes in the record.
+//	x: Vector of size 4 or Tensor of shape (4, 2) in source data format.
 //
-// Returns The handle to reference the Reader.
-func FixedLengthRecordReaderV2(scope *Scope, record_bytes int64, optional ...FixedLengthRecordReaderV2Attr) (reader_handle tf.Output) {
+// Returns Vector of size 4 or Tensor of shape (4, 2) in destination data format.
+func DataFormatVecPermute(scope *Scope, x tf.Output, optional ...DataFormatVecPermuteAttr) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"record_bytes": record_bytes}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "FixedLengthRecordReaderV2",
-
+		Type: "DataFormatVecPermute",
+		Input: []tf.Input{
+			x,
+		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Converts each string in the input Tensor to its hash mod by a number of buckets.
+// Reads the value of a variable.
 //
-// The hash function is deterministic on the content of the string within the
-// process.
+// The tensor returned by this operation is immutable.
 //
-// Note that the hash function may change from time to time.
-// This functionality will be deprecated and it's recommended to use
-// `tf.string_to_hash_bucket_fast()` or `tf.string_to_hash_bucket_strong()`.
+// The value returned by this operation is guaranteed to be influenced by all the
+// writes on which this operation depends directly or indirectly, and to not be
+// influenced by any of the writes which depend directly or indirectly on this
+// operation.
 //
 // Arguments:
-//
-//	num_buckets: The number of buckets.
-//
-// Returns A Tensor of the same shape as the input `string_tensor`.
-func StringToHashBucket(scope *Scope, string_tensor tf.Output, num_buckets int64) (output tf.Output) {
+//	resource: handle to the resource in which to store the variable.
+//	dtype: the dtype of the value.
+func ReadVariableOp(scope *Scope, resource tf.Output, dtype tf.DataType) (value tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_buckets": num_buckets}
+	attrs := map[string]interface{}{"dtype": dtype}
 	opspec := tf.OpSpec{
-		Type: "StringToHashBucket",
+		Type: "ReadVariableOp",
 		Input: []tf.Input{
-			string_tensor,
+			resource,
 		},
 		Attrs: attrs,
 	}
@@ -7202,158 +7217,212 @@ func StringToHashBucket(scope *Scope, string_tensor tf.Output, num_buckets int64
 	return op.Output(0)
 }
 
-// Computes gradients for the exponential linear (Elu) operation.
-//
-// Arguments:
-//	gradients: The backpropagated gradients to the corresponding Elu operation.
-//	outputs: The outputs of the corresponding Elu operation.
-//
-// Returns The gradients: `gradients * (outputs + 1)` if outputs < 0,
-// `gradients` otherwise.
-func EluGrad(scope *Scope, gradients tf.Output, outputs tf.Output) (backprops tf.Output) {
+// Computes tan of x element-wise.
+func Tan(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "EluGrad",
+		Type: "Tan",
 		Input: []tf.Input{
-			gradients, outputs,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Creates a dataset that contains `count` elements from the `input_dataset`.
-//
-// Arguments:
+// ResourceSparseApplyFtrlAttr is an optional argument to ResourceSparseApplyFtrl.
+type ResourceSparseApplyFtrlAttr func(optionalAttr)
+
+// ResourceSparseApplyFtrlUseLocking sets the optional use_locking attribute to value.
 //
-//	count: A scalar representing the number of elements from the `input_dataset`
-// that should be taken. A value of `-1` indicates that all of `input_dataset`
-// is taken.
+// value: If `True`, updating of the var and accum tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceSparseApplyFtrlUseLocking(value bool) ResourceSparseApplyFtrlAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// Update relevant entries in '*var' according to the Ftrl-proximal scheme.
 //
+// That is for rows we have grad for, we update var, accum and linear as follows:
+// accum_new = accum + grad * grad
+// linear += grad + (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
+// quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
+// var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
+// accum = accum_new
 //
-func TakeDataset(scope *Scope, input_dataset tf.Output, count tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// Arguments:
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	linear: Should be from a Variable().
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var and accum.
+//	lr: Scaling factor. Must be a scalar.
+//	l1: L1 regularization. Must be a scalar.
+//	l2: L2 regularization. Must be a scalar.
+//	lr_power: Scaling factor. Must be a scalar.
+//
+// Returns the created operation.
+func ResourceSparseApplyFtrl(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, indices tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, lr_power tf.Output, optional ...ResourceSparseApplyFtrlAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "TakeDataset",
+		Type: "ResourceSparseApplyFtrl",
 		Input: []tf.Input{
-			input_dataset, count,
+			var_, accum, linear, grad, indices, lr, l1, l2, lr_power,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// The gradient operator for the SparseAdd op.
-//
-// The SparseAdd op calculates A + B, where A, B, and the sum are all represented
-// as `SparseTensor` objects.  This op takes in the upstream gradient w.r.t.
-// non-empty values of the sum, and outputs the gradients w.r.t. the non-empty
-// values of A and B.
-//
-// Arguments:
-//	backprop_val_grad: 1-D with shape `[nnz(sum)]`.  The gradient with respect to
-// the non-empty values of the sum.
-//	a_indices: 2-D.  The `indices` of the `SparseTensor` A, size `[nnz(A), ndims]`.
-//	b_indices: 2-D.  The `indices` of the `SparseTensor` B, size `[nnz(B), ndims]`.
-//	sum_indices: 2-D.  The `indices` of the sum `SparseTensor`, size
-// `[nnz(sum), ndims]`.
+// Returns which elements of x are Inf.
 //
-// Returns 1-D with shape `[nnz(A)]`. The gradient with respect to the
-// non-empty values of A.1-D with shape `[nnz(B)]`. The gradient with respect to the
-// non-empty values of B.
-func SparseAddGrad(scope *Scope, backprop_val_grad tf.Output, a_indices tf.Output, b_indices tf.Output, sum_indices tf.Output) (a_val_grad tf.Output, b_val_grad tf.Output) {
+// @compatibility(numpy)
+// Equivalent to np.isinf
+// @end_compatibility
+func IsInf(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseAddGrad",
+		Type: "IsInf",
 		Input: []tf.Input{
-			backprop_val_grad, a_indices, b_indices, sum_indices,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// Computes atan of x element-wise.
-func Atan(scope *Scope, x tf.Output) (y tf.Output) {
+// Computes the sum along sparse segments of a tensor divided by the sqrt of N.
+//
+// N is the size of the segment being reduced.
+//
+// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+// segments.
+//
+// Arguments:
+//
+//	indices: A 1-D tensor. Has same rank as `segment_ids`.
+//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SparseSegmentSqrtN(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Atan",
+		Type: "SparseSegmentSqrtN",
 		Input: []tf.Input{
-			x,
+			data, indices, segment_ids,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Encode audio data using the WAV file format.
+// StatelessTruncatedNormalAttr is an optional argument to StatelessTruncatedNormal.
+type StatelessTruncatedNormalAttr func(optionalAttr)
+
+// StatelessTruncatedNormalDtype sets the optional dtype attribute to value.
 //
-// This operation will generate a string suitable to be saved out to create a .wav
-// audio file. It will be encoded in the 16-bit PCM format. It takes in float
-// values in the range -1.0f to 1.0f, and any outside that value will be clamped to
-// that range.
+// value: The type of the output.
+// If not specified, defaults to DT_FLOAT
+func StatelessTruncatedNormalDtype(value tf.DataType) StatelessTruncatedNormalAttr {
+	return func(m optionalAttr) {
+		m["dtype"] = value
+	}
+}
+
+// Outputs deterministic pseudorandom values from a truncated normal distribution.
 //
-// `audio` is a 2-D float Tensor of shape `[length, channels]`.
-// `sample_rate` is a scalar Tensor holding the rate to use (e.g. 44100).
+// The generated values follow a normal distribution with mean 0 and standard
+// deviation 1, except that values whose magnitude is more than 2 standard
+// deviations from the mean are dropped and re-picked.
+//
+// The outputs are a deterministic function of `shape` and `seed`.
 //
 // Arguments:
-//	audio: 2-D with shape `[length, channels]`.
-//	sample_rate: Scalar containing the sample frequency.
+//	shape: The shape of the output tensor.
+//	seed: 2 seeds (shape [2]).
 //
-// Returns 0-D. WAV-encoded file contents.
-func EncodeWav(scope *Scope, audio tf.Output, sample_rate tf.Output) (contents tf.Output) {
+// Returns Random values with specified shape.
+func StatelessTruncatedNormal(scope *Scope, shape tf.Output, seed tf.Output, optional ...StatelessTruncatedNormalAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "EncodeWav",
+		Type: "StatelessTruncatedNormal",
 		Input: []tf.Input{
-			audio, sample_rate,
+			shape, seed,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Converts each string in the input Tensor to its hash mod by a number of buckets.
+// RestoreSliceAttr is an optional argument to RestoreSlice.
+type RestoreSliceAttr func(optionalAttr)
+
+// RestoreSlicePreferredShard sets the optional preferred_shard attribute to value.
 //
-// The hash function is deterministic on the content of the string within the
-// process. The hash function is a keyed hash function, where attribute `key`
-// defines the key of the hash function. `key` is an array of 2 elements.
+// value: Index of file to open first if multiple files match
+// `file_pattern`. See the documentation for `Restore`.
+// If not specified, defaults to -1
+func RestoreSlicePreferredShard(value int64) RestoreSliceAttr {
+	return func(m optionalAttr) {
+		m["preferred_shard"] = value
+	}
+}
+
+// Restores a tensor from checkpoint files.
 //
-// A strong hash is important when inputs may be malicious, e.g. URLs with
-// additional components. Adversaries could try to make their inputs hash to the
-// same bucket for a denial-of-service attack or to skew the results. A strong
-// hash prevents this by making it difficult, if not infeasible, to compute inputs
-// that hash to the same bucket. This comes at a cost of roughly 4x higher compute
-// time than `tf.string_to_hash_bucket_fast`.
+// This is like `Restore` except that restored tensor can be listed as filling
+// only a slice of a larger tensor.  `shape_and_slice` specifies the shape of the
+// larger tensor and the slice that the restored tensor covers.
+//
+// The `shape_and_slice` input has the same format as the
+// elements of the `shapes_and_slices` input of the `SaveSlices` op.
 //
 // Arguments:
-//	input: The strings to assign a hash bucket.
-//	num_buckets: The number of buckets.
-//	key: The key for the keyed hash function passed as a list of two uint64
-// elements.
+//	file_pattern: Must have a single element. The pattern of the files from
+// which we read the tensor.
+//	tensor_name: Must have a single element. The name of the tensor to be
+// restored.
+//	shape_and_slice: Scalar. The shapes and slice specifications to use when
+// restoring a tensors.
+//	dt: The type of the tensor to be restored.
 //
-// Returns A Tensor of the same shape as the input `string_tensor`.
-func StringToHashBucketStrong(scope *Scope, input tf.Output, num_buckets int64, key []int64) (output tf.Output) {
+// Returns The restored tensor.
+func RestoreSlice(scope *Scope, file_pattern tf.Output, tensor_name tf.Output, shape_and_slice tf.Output, dt tf.DataType, optional ...RestoreSliceAttr) (tensor tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_buckets": num_buckets, "key": key}
+	attrs := map[string]interface{}{"dt": dt}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "StringToHashBucketStrong",
+		Type: "RestoreSlice",
 		Input: []tf.Input{
-			input,
+			file_pattern, tensor_name, shape_and_slice,
 		},
 		Attrs: attrs,
 	}
@@ -7361,142 +7430,120 @@ func StringToHashBucketStrong(scope *Scope, input tf.Output, num_buckets int64,
 	return op.Output(0)
 }
 
-// Applies softmax to a batched N-D `SparseTensor`.
-//
-// The inputs represent an N-D SparseTensor  with logical shape `[..., B, C]`
-// (where `N >= 2`), and with indices sorted in the canonical lexicographic order.
-//
-// This op is equivalent to applying the normal `tf.nn.softmax()` to each innermost
-// logical submatrix with shape `[B, C]`, but with the catch that *the implicitly
-// zero elements do not participate*.  Specifically, the algorithm is equivalent
-// to the following:
-//
-//   (1) Applies `tf.nn.softmax()` to a densified view of each innermost submatrix
-//       with shape `[B, C]`, along the size-C dimension;
-//   (2) Masks out the original implicitly-zero locations;
-//   (3) Renormalizes the remaining elements.
+// ImagAttr is an optional argument to Imag.
+type ImagAttr func(optionalAttr)
+
+// ImagTout sets the optional Tout attribute to value.
+// If not specified, defaults to DT_FLOAT
+func ImagTout(value tf.DataType) ImagAttr {
+	return func(m optionalAttr) {
+		m["Tout"] = value
+	}
+}
+
+// Returns the imaginary part of a complex number.
 //
-// Hence, the `SparseTensor` result has exactly the same non-zero indices and
-// shape.
+// Given a tensor `input` of complex numbers, this operation returns a tensor of
+// type `float` that is the imaginary part of each element in `input`. All
+// elements in `input` must be complex numbers of the form \\(a + bj\\), where *a*
+// is the real part and *b* is the imaginary part returned by this operation.
 //
-// Arguments:
-//	sp_indices: 2-D.  `NNZ x R` matrix with the indices of non-empty values in a
-// SparseTensor, in canonical ordering.
-//	sp_values: 1-D.  `NNZ` non-empty values corresponding to `sp_indices`.
-//	sp_shape: 1-D.  Shape of the input SparseTensor.
+// For example:
 //
-// Returns 1-D.  The `NNZ` values for the result `SparseTensor`.
-func SparseSoftmax(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output) (output tf.Output) {
+// ```
+// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
+// tf.imag(input) ==> [4.75, 5.75]
+// ```
+func Imag(scope *Scope, input tf.Output, optional ...ImagAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SparseSoftmax",
+		Type: "Imag",
 		Input: []tf.Input{
-			sp_indices, sp_values, sp_shape,
+			input,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Partitions `data` into `num_partitions` tensors using indices from `partitions`.
-//
-// For each index tuple `js` of size `partitions.ndim`, the slice `data[js, ...]`
-// becomes part of `outputs[partitions[js]]`.  The slices with `partitions[js] = i`
-// are placed in `outputs[i]` in lexicographic order of `js`, and the first
-// dimension of `outputs[i]` is the number of entries in `partitions` equal to `i`.
-// In detail,
-//
-// ```python
-//     outputs[i].shape = [sum(partitions == i)] + data.shape[partitions.ndim:]
+// ComplexAttr is an optional argument to Complex.
+type ComplexAttr func(optionalAttr)
+
+// ComplexTout sets the optional Tout attribute to value.
+// If not specified, defaults to DT_COMPLEX64
+func ComplexTout(value tf.DataType) ComplexAttr {
+	return func(m optionalAttr) {
+		m["Tout"] = value
+	}
+}
+
+// Converts two real numbers to a complex number.
 //
-//     outputs[i] = pack([data[js, ...] for js if partitions[js] == i])
-// ```
+// Given a tensor `real` representing the real part of a complex number, and a
+// tensor `imag` representing the imaginary part of a complex number, this
+// operation returns complex numbers elementwise of the form \\(a + bj\\), where
+// *a* represents the `real` part and *b* represents the `imag` part.
 //
-// `data.shape` must start with `partitions.shape`.
+// The input tensors `real` and `imag` must have the same shape.
 //
 // For example:
 //
-// ```python
-//     # Scalar partitions.
-//     partitions = 1
-//     num_partitions = 2
-//     data = [10, 20]
-//     outputs[0] = []  # Empty with shape [0, 2]
-//     outputs[1] = [[10, 20]]
-//
-//     # Vector partitions.
-//     partitions = [0, 0, 1, 1, 0]
-//     num_partitions = 2
-//     data = [10, 20, 30, 40, 50]
-//     outputs[0] = [10, 20, 50]
-//     outputs[1] = [30, 40]
 // ```
-//
-// See `dynamic_stitch` for an example on how to merge partitions back.
-//
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/DynamicPartition.png" alt>
-// </div>
-//
-// Arguments:
-//
-//	partitions: Any shape.  Indices in the range `[0, num_partitions)`.
-//	num_partitions: The number of partitions to output.
-func DynamicPartition(scope *Scope, data tf.Output, partitions tf.Output, num_partitions int64) (outputs []tf.Output) {
+// # tensor 'real' is [2.25, 3.25]
+// # tensor `imag` is [4.75, 5.75]
+// tf.complex(real, imag) ==> [[2.25 + 4.75j], [3.25 + 5.75j]]
+// ```
+func Complex(scope *Scope, real tf.Output, imag tf.Output, optional ...ComplexAttr) (out tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_partitions": num_partitions}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "DynamicPartition",
+		Type: "Complex",
 		Input: []tf.Input{
-			data, partitions,
+			real, imag,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if outputs, idx, err = makeOutputList(op, idx, "outputs"); err != nil {
-		scope.UpdateErr("DynamicPartition", err)
-		return
-	}
-	return outputs
+	return op.Output(0)
 }
 
-// ResourceApplyAdagradAttr is an optional argument to ResourceApplyAdagrad.
-type ResourceApplyAdagradAttr func(optionalAttr)
+// StatelessRandomNormalAttr is an optional argument to StatelessRandomNormal.
+type StatelessRandomNormalAttr func(optionalAttr)
 
-// ResourceApplyAdagradUseLocking sets the optional use_locking attribute to value.
+// StatelessRandomNormalDtype sets the optional dtype attribute to value.
 //
-// value: If `True`, updating of the var and accum tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceApplyAdagradUseLocking(value bool) ResourceApplyAdagradAttr {
+// value: The type of the output.
+// If not specified, defaults to DT_FLOAT
+func StatelessRandomNormalDtype(value tf.DataType) StatelessRandomNormalAttr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["dtype"] = value
 	}
 }
 
-// Update '*var' according to the adagrad scheme.
+// Outputs deterministic pseudorandom values from a normal distribution.
 //
-// accum += grad * grad
-// var -= lr * grad * (1 / sqrt(accum))
+// The generated values will have mean 0 and standard deviation 1.
+//
+// The outputs are a deterministic function of `shape` and `seed`.
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	grad: The gradient.
+//	shape: The shape of the output tensor.
+//	seed: 2 seeds (shape [2]).
 //
-// Returns the created operation.
-func ResourceApplyAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, optional ...ResourceApplyAdagradAttr) (o *tf.Operation) {
+// Returns Random values with specified shape.
+func StatelessRandomNormal(scope *Scope, shape tf.Output, seed tf.Output, optional ...StatelessRandomNormalAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -7505,125 +7552,80 @@ func ResourceApplyAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.O
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyAdagrad",
+		Type: "StatelessRandomNormal",
 		Input: []tf.Input{
-			var_, accum, lr, grad,
+			shape, seed,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
-}
-
-// ResourceApplyPowerSignAttr is an optional argument to ResourceApplyPowerSign.
-type ResourceApplyPowerSignAttr func(optionalAttr)
-
-// ResourceApplyPowerSignUseLocking sets the optional use_locking attribute to value.
-//
-// value: If `True`, updating of the var and m tensors is
-// protected by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceApplyPowerSignUseLocking(value bool) ResourceApplyPowerSignAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Update '*var' according to the AddSign update.
+// Reshapes a quantized tensor as per the Reshape op.
 //
-// m_t <- beta1 * m_{t-1} + (1 - beta1) * g
-// update <- exp(logbase * sign_decay * sign(g) * sign(m_t)) * g
-// variable <- variable - lr_t * update
+// ```
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	m: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	logbase: Must be a scalar.
-//	sign_decay: Must be a scalar.
-//	beta: Must be a scalar.
-//	grad: The gradient.
 //
-// Returns the created operation.
-func ResourceApplyPowerSign(scope *Scope, var_ tf.Output, m tf.Output, lr tf.Output, logbase tf.Output, sign_decay tf.Output, beta tf.Output, grad tf.Output, optional ...ResourceApplyPowerSignAttr) (o *tf.Operation) {
+//	shape: Defines the shape of the output tensor.
+//	input_min: The minimum value of the input.
+//	input_max: The maximum value of the input.
+//
+// Returns This value is copied from input_min.This value is copied from input_max.
+func QuantizedReshape(scope *Scope, tensor tf.Output, shape tf.Output, input_min tf.Output, input_max tf.Output) (output tf.Output, output_min tf.Output, output_max tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyPowerSign",
+		Type: "QuantizedReshape",
 		Input: []tf.Input{
-			var_, m, lr, logbase, sign_decay, beta, grad,
+			tensor, shape, input_min, input_max,
 		},
-		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// CumprodAttr is an optional argument to Cumprod.
-type CumprodAttr func(optionalAttr)
-
-// CumprodExclusive sets the optional exclusive attribute to value.
-//
-// value: If `True`, perform exclusive cumprod.
-// If not specified, defaults to false
-func CumprodExclusive(value bool) CumprodAttr {
-	return func(m optionalAttr) {
-		m["exclusive"] = value
-	}
-}
+// GatherAttr is an optional argument to Gather.
+type GatherAttr func(optionalAttr)
 
-// CumprodReverse sets the optional reverse attribute to value.
-//
-// value: A `bool` (default: False).
-// If not specified, defaults to false
-func CumprodReverse(value bool) CumprodAttr {
+// GatherValidateIndices sets the optional validate_indices attribute to value.
+// If not specified, defaults to true
+func GatherValidateIndices(value bool) GatherAttr {
 	return func(m optionalAttr) {
-		m["reverse"] = value
+		m["validate_indices"] = value
 	}
 }
 
-// Compute the cumulative product of the tensor `x` along `axis`.
-//
-// By default, this op performs an inclusive cumprod, which means that the first
-// element of the input is identical to the first element of the output:
-//
-// ```python
-// tf.cumprod([a, b, c])  # => [a, a * b, a * b * c]
-// ```
+// Gather slices from `params` according to `indices`.
 //
-// By setting the `exclusive` kwarg to `True`, an exclusive cumprod is
-// performed instead:
+// `indices` must be an integer tensor of any dimension (usually 0-D or 1-D).
+// Produces an output tensor with shape `indices.shape + params.shape[1:]` where:
 //
 // ```python
-// tf.cumprod([a, b, c], exclusive=True)  # => [1, a, a * b]
-// ```
+//     # Scalar indices
+//     output[:, ..., :] = params[indices, :, ... :]
 //
-// By setting the `reverse` kwarg to `True`, the cumprod is performed in the
-// opposite direction:
+//     # Vector indices
+//     output[i, :, ..., :] = params[indices[i], :, ... :]
 //
-// ```python
-// tf.cumprod([a, b, c], reverse=True)  # => [a * b * c, b * c, c]
+//     # Higher rank indices
+//     output[i, ..., j, :, ... :] = params[indices[i, ..., j], :, ..., :]
 // ```
 //
-// This is more efficient than using separate `tf.reverse` ops.
-//
-// The `reverse` and `exclusive` kwargs can also be combined:
+// If `indices` is a permutation and `len(indices) == params.shape[0]` then
+// this operation will permute `params` accordingly.
 //
-// ```python
-// tf.cumprod([a, b, c], exclusive=True, reverse=True)  # => [b * c, c, 1]
-// ```
+// `validate_indices`: DEPRECATED. If this operation is assigned to CPU, values in
+// `indices` are always validated to be within range. If assigned to GPU,
+// out-of-bound indices result in safe but unspecified behavior, which may include
+// raising an error.
 //
-// Arguments:
-//	x: A `Tensor`. Must be one of the following types: `float32`, `float64`,
-// `int64`, `int32`, `uint8`, `uint16`, `int16`, `int8`, `complex64`,
-// `complex128`, `qint8`, `quint8`, `qint32`, `half`.
-//	axis: A `Tensor` of type `int32` (default: 0). Must be in the range
-// `[-rank(x), rank(x))`.
-func Cumprod(scope *Scope, x tf.Output, axis tf.Output, optional ...CumprodAttr) (out tf.Output) {
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/Gather.png" alt>
+// </div>
+func Gather(scope *Scope, params tf.Output, indices tf.Output, optional ...GatherAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -7632,9 +7634,9 @@ func Cumprod(scope *Scope, x tf.Output, axis tf.Output, optional ...CumprodAttr)
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Cumprod",
+		Type: "Gather",
 		Input: []tf.Input{
-			x, axis,
+			params, indices,
 		},
 		Attrs: attrs,
 	}
@@ -7642,304 +7644,335 @@ func Cumprod(scope *Scope, x tf.Output, axis tf.Output, optional ...CumprodAttr)
 	return op.Output(0)
 }
 
-// Computes the mean along segments of a tensor.
-//
-// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
-// segments.
-//
-// Computes a tensor such that
-// \\(output_i = \frac{\sum_j data_j}{N}\\) where `mean` is
-// over `j` such that `segment_ids[j] == i` and `N` is the total number of
-// values summed.
-//
-// If the mean is empty for a given segment ID `i`, `output[i] = 0`.
-//
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentMean.png" alt>
-// </div>
-//
-// Arguments:
-//
-//	segment_ids: A 1-D tensor whose rank is equal to the rank of `data`'s
-// first dimension.  Values should be sorted and can be repeated.
+// Returns the truth value of (x != y) element-wise.
 //
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SegmentMean(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
+// *NOTE*: `NotEqual` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func NotEqual(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SegmentMean",
+		Type: "NotEqual",
 		Input: []tf.Input{
-			data, segment_ids,
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResourceSparseApplyCenteredRMSPropAttr is an optional argument to ResourceSparseApplyCenteredRMSProp.
-type ResourceSparseApplyCenteredRMSPropAttr func(optionalAttr)
-
-// ResourceSparseApplyCenteredRMSPropUseLocking sets the optional use_locking attribute to value.
+// Inverse 3D real-valued fast Fourier transform.
 //
-// value: If `True`, updating of the var, mg, ms, and mom tensors is
-// protected by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceSparseApplyCenteredRMSPropUseLocking(value bool) ResourceSparseApplyCenteredRMSPropAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// Update '*var' according to the centered RMSProp algorithm.
-//
-// The centered RMSProp algorithm uses an estimate of the centered second moment
-// (i.e., the variance) for normalization, as opposed to regular RMSProp, which
-// uses the (uncentered) second moment. This often helps with training, but is
-// slightly more expensive in terms of computation and memory.
-//
-// Note that in dense implementation of this algorithm, mg, ms, and mom will
-// update even if the grad is zero, but in this sparse implementation, mg, ms,
-// and mom will not update in iterations during which the grad is zero.
+// Computes the inverse 3-dimensional discrete Fourier transform of a real-valued
+// signal over the inner-most 3 dimensions of `input`.
 //
-// mean_square = decay * mean_square + (1-decay) * gradient ** 2
-// mean_grad = decay * mean_grad + (1-decay) * gradient
-// Delta = learning_rate * gradient / sqrt(mean_square + epsilon - mean_grad ** 2)
+// The inner-most 3 dimensions of `input` are assumed to be the result of `RFFT3D`:
+// The inner-most dimension contains the `fft_length / 2 + 1` unique components of
+// the DFT of a real-valued signal. If `fft_length` is not provided, it is computed
+// from the size of the inner-most 3 dimensions of `input`. If the FFT length used
+// to compute `input` is odd, it should be provided since it cannot be inferred
+// properly.
 //
-// ms <- rho * ms_{t-1} + (1-rho) * grad * grad
-// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)
-// var <- var - mom
+// Along each axis `IRFFT3D` is computed on, if `fft_length` (or
+// `fft_length / 2 + 1` for the inner-most dimension) is smaller than the
+// corresponding dimension of `input`, the dimension is cropped. If it is larger,
+// the dimension is padded with zeros.
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	mg: Should be from a Variable().
-//	ms: Should be from a Variable().
-//	mom: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	rho: Decay rate. Must be a scalar.
+//	input: A complex64 tensor.
+//	fft_length: An int32 tensor of shape [3]. The FFT length for each dimension.
 //
-//	epsilon: Ridge term. Must be a scalar.
-//	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var, ms and mom.
+// Returns A float32 tensor of the same rank as `input`. The inner-most 3
+//   dimensions of `input` are replaced with the `fft_length` samples of their
+//   inverse 3D real Fourier transform.
 //
-// Returns the created operation.
-func ResourceSparseApplyCenteredRMSProp(scope *Scope, var_ tf.Output, mg tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyCenteredRMSPropAttr) (o *tf.Operation) {
+// @compatibility(numpy)
+// Equivalent to np.irfftn with 3 dimensions.
+// @end_compatibility
+func IRFFT3D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyCenteredRMSProp",
+		Type: "IRFFT3D",
 		Input: []tf.Input{
-			var_, mg, ms, mom, lr, rho, momentum, epsilon, grad, indices,
+			input, fft_length,
 		},
-		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Creates a dataset that batches `batch_size` elements from `input_dataset`.
+// StringSplitAttr is an optional argument to StringSplit.
+type StringSplitAttr func(optionalAttr)
+
+// StringSplitSkipEmpty sets the optional skip_empty attribute to value.
 //
-// Arguments:
+// value: A `bool`. If `True`, skip the empty strings from the result.
+// If not specified, defaults to true
+func StringSplitSkipEmpty(value bool) StringSplitAttr {
+	return func(m optionalAttr) {
+		m["skip_empty"] = value
+	}
+}
+
+// Split elements of `input` based on `delimiter` into a `SparseTensor`.
 //
-//	batch_size: A scalar representing the number of elements to accumulate in a
-// batch.
+// Let N be the size of source (typically N will be the batch size). Split each
+// element of `input` based on `delimiter` and return a `SparseTensor`
+// containing the splitted tokens. Empty tokens are ignored.
+//
+// `delimiter` can be empty, or a string of split characters. If `delimiter` is an
+//  empty string, each element of `input` is split into individual single-byte
+//  character strings, including splitting of UTF-8 multibyte sequences. Otherwise
+//  every character of `delimiter` is a potential split point.
 //
+// For example:
+//   N = 2, input[0] is 'hello world' and input[1] is 'a b c', then the output
+//   will be
 //
-func BatchDataset(scope *Scope, input_dataset tf.Output, batch_size tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+//   indices = [0, 0;
+//              0, 1;
+//              1, 0;
+//              1, 1;
+//              1, 2]
+//   shape = [2, 3]
+//   values = ['hello', 'world', 'a', 'b', 'c']
+//
+// Arguments:
+//	input: 1-D. Strings to split.
+//	delimiter: 0-D. Delimiter characters (bytes), or empty string.
+//
+// Returns A dense matrix of int64 representing the indices of the sparse tensor.A vector of strings corresponding to the splited values.a length-2 vector of int64 representing the shape of the sparse
+// tensor, where the first value is N and the second value is the maximum number
+// of tokens in a single input entry.
+func StringSplit(scope *Scope, input tf.Output, delimiter tf.Output, optional ...StringSplitAttr) (indices tf.Output, values tf.Output, shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "BatchDataset",
+		Type: "StringSplit",
 		Input: []tf.Input{
-			input_dataset, batch_size,
+			input, delimiter,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Inverse fast Fourier transform.
+// ResizeBilinearAttr is an optional argument to ResizeBilinear.
+type ResizeBilinearAttr func(optionalAttr)
+
+// ResizeBilinearAlignCorners sets the optional align_corners attribute to value.
 //
-// Computes the inverse 1-dimensional discrete Fourier transform over the
-// inner-most dimension of `input`.
+// value: If true, rescale input by (new_height - 1) / (height - 1), which
+// exactly aligns the 4 corners of images and resized images. If false, rescale
+// by new_height / height. Treat similarly the width dimension.
+// If not specified, defaults to false
+func ResizeBilinearAlignCorners(value bool) ResizeBilinearAttr {
+	return func(m optionalAttr) {
+		m["align_corners"] = value
+	}
+}
+
+// Resize `images` to `size` using bilinear interpolation.
 //
-// Arguments:
-//	input: A complex64 tensor.
+// Input images can be of different types but output images are always float.
 //
-// Returns A complex64 tensor of the same shape as `input`. The inner-most
-//   dimension of `input` is replaced with its inverse 1D Fourier transform.
+// Arguments:
+//	images: 4-D with shape `[batch, height, width, channels]`.
+//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
+// new size for the images.
 //
-// @compatibility(numpy)
-// Equivalent to np.fft.ifft
-// @end_compatibility
-func IFFT(scope *Scope, input tf.Output) (output tf.Output) {
+// Returns 4-D with shape
+// `[batch, new_height, new_width, channels]`.
+func ResizeBilinear(scope *Scope, images tf.Output, size tf.Output, optional ...ResizeBilinearAttr) (resized_images tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "IFFT",
+		Type: "ResizeBilinear",
 		Input: []tf.Input{
-			input,
+			images, size,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Generates values in an interval.
-//
-// A sequence of `num` evenly-spaced values are generated beginning at `start`.
-// If `num > 1`, the values in the sequence increase by `stop - start / num - 1`,
-// so that the last one is exactly `stop`.
-//
-// For example:
-//
-// ```
-// tf.linspace(10.0, 12.0, 3, name="linspace") => [ 10.0  11.0  12.0]
-// ```
-//
-// Arguments:
-//	start: First entry in the range.
-//	stop: Last entry in the range.
-//	num: Number of values to generate.
-//
-// Returns 1-D. The generated values.
-func LinSpace(scope *Scope, start tf.Output, stop tf.Output, num tf.Output) (output tf.Output) {
+// Computes softsign: `features / (abs(features) + 1)`.
+func Softsign(scope *Scope, features tf.Output) (activations tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "LinSpace",
+		Type: "Softsign",
 		Input: []tf.Input{
-			start, stop, num,
+			features,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// DestroyResourceOpAttr is an optional argument to DestroyResourceOp.
-type DestroyResourceOpAttr func(optionalAttr)
+// GenerateVocabRemappingAttr is an optional argument to GenerateVocabRemapping.
+type GenerateVocabRemappingAttr func(optionalAttr)
 
-// DestroyResourceOpIgnoreLookupError sets the optional ignore_lookup_error attribute to value.
+// GenerateVocabRemappingOldVocabSize sets the optional old_vocab_size attribute to value.
 //
-// value: whether to ignore the error when the resource
-// doesn't exist.
-// If not specified, defaults to true
-func DestroyResourceOpIgnoreLookupError(value bool) DestroyResourceOpAttr {
+// value: Number of entries in the old vocab file to consider.  If -1,
+// use the entire old vocabulary.
+// If not specified, defaults to -1
+//
+// REQUIRES: value >= -1
+func GenerateVocabRemappingOldVocabSize(value int64) GenerateVocabRemappingAttr {
 	return func(m optionalAttr) {
-		m["ignore_lookup_error"] = value
+		m["old_vocab_size"] = value
 	}
 }
 
-// Deletes the resource specified by the handle.
+// Given a path to new and old vocabulary files, returns a remapping Tensor of
 //
-// All subsequent operations using the resource will result in a NotFound
-// error status.
+// length `num_new_vocab`, where `remapping[i]` contains the row number in the old
+// vocabulary that corresponds to row `i` in the new vocabulary (starting at line
+// `new_vocab_offset` and up to `num_new_vocab` entities), or `-1` if entry `i`
+// in the new vocabulary is not in the old vocabulary.  The old vocabulary is
+// constrained to the first `old_vocab_size` entries if `old_vocab_size` is not the
+// default value of -1.
+//
+// `num_vocab_offset` enables
+// use in the partitioned variable case, and should generally be set through
+// examining partitioning info.  The format of the files should be a text file,
+// with each line containing a single entity within the vocabulary.
+//
+// For example, with `new_vocab_file` a text file containing each of the following
+// elements on a single line: `[f0, f1, f2, f3]`, old_vocab_file = [f1, f0, f3],
+// `num_new_vocab = 3, new_vocab_offset = 1`, the returned remapping would be
+// `[0, -1, 2]`.
+//
+// The op also returns a count of how many entries in the new vocabulary
+// were present in the old vocabulary, which is used to calculate the number of
+// values to initialize in a weight matrix remapping
+//
+// This functionality can be used to remap both row vocabularies (typically,
+// features) and column vocabularies (typically, classes) from TensorFlow
+// checkpoints.  Note that the partitioning logic relies on contiguous vocabularies
+// corresponding to div-partitioned variables.  Moreover, the underlying remapping
+// uses an IndexTable (as opposed to an inexact CuckooTable), so client code should
+// use the corresponding index_table_from_file() as the FeatureColumn framework
+// does (as opposed to tf.feature_to_id(), which uses a CuckooTable).
 //
 // Arguments:
-//	resource: handle to the resource to delete.
+//	new_vocab_file: Path to the new vocab file.
+//	old_vocab_file: Path to the old vocab file.
+//	new_vocab_offset: How many entries into the new vocab file to start reading.
+//	num_new_vocab: Number of entries in the new vocab file to remap.
 //
-// Returns the created operation.
-func DestroyResourceOp(scope *Scope, resource tf.Output, optional ...DestroyResourceOpAttr) (o *tf.Operation) {
+// Returns A Tensor of length num_new_vocab where the element at index i
+// is equal to the old ID that maps to the new ID i.  This element is -1 for any
+// new ID that is not found in the old vocabulary.Number of new vocab entries found in old vocab.
+func GenerateVocabRemapping(scope *Scope, new_vocab_file tf.Output, old_vocab_file tf.Output, new_vocab_offset int64, num_new_vocab int64, optional ...GenerateVocabRemappingAttr) (remapping tf.Output, num_present tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"new_vocab_offset": new_vocab_offset, "num_new_vocab": num_new_vocab}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "DestroyResourceOp",
+		Type: "GenerateVocabRemapping",
 		Input: []tf.Input{
-			resource,
+			new_vocab_file, old_vocab_file,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
 }
 
-// LRNAttr is an optional argument to LRN.
-type LRNAttr func(optionalAttr)
-
-// LRNDepthRadius sets the optional depth_radius attribute to value.
+// Assigns sparse updates to the variable referenced by `resource`.
 //
-// value: 0-D.  Half-width of the 1-D normalization window.
-// If not specified, defaults to 5
-func LRNDepthRadius(value int64) LRNAttr {
-	return func(m optionalAttr) {
-		m["depth_radius"] = value
-	}
-}
-
-// LRNBias sets the optional bias attribute to value.
+// This operation computes
 //
-// value: An offset (usually positive to avoid dividing by 0).
-// If not specified, defaults to 1
-func LRNBias(value float32) LRNAttr {
-	return func(m optionalAttr) {
-		m["bias"] = value
-	}
-}
-
-// LRNAlpha sets the optional alpha attribute to value.
+//     # Scalar indices
+//     ref[indices, ...] = updates[...]
 //
-// value: A scale factor, usually positive.
-// If not specified, defaults to 1
-func LRNAlpha(value float32) LRNAttr {
-	return func(m optionalAttr) {
-		m["alpha"] = value
+//     # Vector indices (for each i)
+//     ref[indices[i], ...] = updates[i, ...]
+//
+//     # High rank indices (for each i, ..., j)
+//     ref[indices[i, ..., j], ...] = updates[i, ..., j, ...]
+//
+// Arguments:
+//	resource: Should be from a `Variable` node.
+//	indices: A tensor of indices into the first dimension of `ref`.
+//	updates: A tensor of updated values to add to `ref`.
+//
+// Returns the created operation.
+func ResourceScatterUpdate(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
 	}
+	opspec := tf.OpSpec{
+		Type: "ResourceScatterUpdate",
+		Input: []tf.Input{
+			resource, indices, updates,
+		},
+	}
+	return scope.AddOperation(opspec)
 }
 
-// LRNBeta sets the optional beta attribute to value.
+// AvgPoolGradAttr is an optional argument to AvgPoolGrad.
+type AvgPoolGradAttr func(optionalAttr)
+
+// AvgPoolGradDataFormat sets the optional data_format attribute to value.
 //
-// value: An exponent.
-// If not specified, defaults to 0.5
-func LRNBeta(value float32) LRNAttr {
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// If not specified, defaults to "NHWC"
+func AvgPoolGradDataFormat(value string) AvgPoolGradAttr {
 	return func(m optionalAttr) {
-		m["beta"] = value
+		m["data_format"] = value
 	}
 }
 
-// Local Response Normalization.
-//
-// The 4-D `input` tensor is treated as a 3-D array of 1-D vectors (along the last
-// dimension), and each vector is normalized independently.  Within a given vector,
-// each component is divided by the weighted, squared sum of inputs within
-// `depth_radius`.  In detail,
-//
-//     sqr_sum[a, b, c, d] =
-//         sum(input[a, b, c, d - depth_radius : d + depth_radius + 1] ** 2)
-//     output = input / (bias + alpha * sqr_sum) ** beta
-//
-// For details, see [Krizhevsky et al., ImageNet classification with deep
-// convolutional neural networks (NIPS 2012)](http://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks).
+// Computes gradients of the average pooling function.
 //
 // Arguments:
-//	input: 4-D.
-func LRN(scope *Scope, input tf.Output, optional ...LRNAttr) (output tf.Output) {
+//	orig_input_shape: 1-D.  Shape of the original input to `avg_pool`.
+//	grad: 4-D with shape `[batch, height, width, channels]`.  Gradients w.r.t.
+// the output of `avg_pool`.
+//	ksize: The size of the sliding window for each dimension of the input.
+//	strides: The stride of the sliding window for each dimension of the input.
+//	padding: The type of padding algorithm to use.
+//
+// Returns 4-D.  Gradients w.r.t. the input of `avg_pool`.
+func AvgPoolGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...AvgPoolGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "LRN",
+		Type: "AvgPoolGrad",
 		Input: []tf.Input{
-			input,
+			orig_input_shape, grad,
 		},
 		Attrs: attrs,
 	}
@@ -7947,142 +7980,143 @@ func LRN(scope *Scope, input tf.Output, optional ...LRNAttr) (output tf.Output)
 	return op.Output(0)
 }
 
-// Creates a dataset that zips together `input_datasets`.
-func ZipDataset(scope *Scope, input_datasets []tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
+// StageClearAttr is an optional argument to StageClear.
+type StageClearAttr func(optionalAttr)
+
+// StageClearCapacity sets the optional capacity attribute to value.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func StageClearCapacity(value int64) StageClearAttr {
+	return func(m optionalAttr) {
+		m["capacity"] = value
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
-	opspec := tf.OpSpec{
-		Type: "ZipDataset",
-		Input: []tf.Input{
-			tf.OutputList(input_datasets),
-		},
-		Attrs: attrs,
+}
+
+// StageClearMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func StageClearMemoryLimit(value int64) StageClearAttr {
+	return func(m optionalAttr) {
+		m["memory_limit"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// ResourceSparseApplyAdagradAttr is an optional argument to ResourceSparseApplyAdagrad.
-type ResourceSparseApplyAdagradAttr func(optionalAttr)
+// StageClearContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func StageClearContainer(value string) StageClearAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
 
-// ResourceSparseApplyAdagradUseLocking sets the optional use_locking attribute to value.
-//
-// value: If `True`, updating of the var and accum tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceSparseApplyAdagradUseLocking(value bool) ResourceSparseApplyAdagradAttr {
+// StageClearSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func StageClearSharedName(value string) StageClearAttr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["shared_name"] = value
 	}
 }
 
-// Update relevant entries in '*var' and '*accum' according to the adagrad scheme.
-//
-// That is for rows we have grad for, we update var and accum as follows:
-// accum += grad * grad
-// var -= lr * grad * (1 / sqrt(accum))
-//
-// Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	lr: Learning rate. Must be a scalar.
-//	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var and accum.
+// Op removes all elements in the underlying container.
 //
 // Returns the created operation.
-func ResourceSparseApplyAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyAdagradAttr) (o *tf.Operation) {
+func StageClear(scope *Scope, dtypes []tf.DataType, optional ...StageClearAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"dtypes": dtypes}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyAdagrad",
-		Input: []tf.Input{
-			var_, accum, lr, grad, indices,
-		},
+		Type: "StageClear",
+
 		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// 2D real-valued fast Fourier transform.
+// ComputeAccidentalHitsAttr is an optional argument to ComputeAccidentalHits.
+type ComputeAccidentalHitsAttr func(optionalAttr)
+
+// ComputeAccidentalHitsSeed sets the optional seed attribute to value.
 //
-// Computes the 2-dimensional discrete Fourier transform of a real-valued signal
-// over the inner-most 2 dimensions of `input`.
+// value: If either seed or seed2 are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func ComputeAccidentalHitsSeed(value int64) ComputeAccidentalHitsAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// ComputeAccidentalHitsSeed2 sets the optional seed2 attribute to value.
 //
-// Since the DFT of a real signal is Hermitian-symmetric, `RFFT2D` only returns the
-// `fft_length / 2 + 1` unique components of the FFT for the inner-most dimension
-// of `output`: the zero-frequency term, followed by the `fft_length / 2`
-// positive-frequency terms.
+// value: An second seed to avoid seed collision.
+// If not specified, defaults to 0
+func ComputeAccidentalHitsSeed2(value int64) ComputeAccidentalHitsAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Computes the ids of the positions in sampled_candidates that match true_labels.
 //
-// Along each axis `RFFT2D` is computed on, if `fft_length` is smaller than the
-// corresponding dimension of `input`, the dimension is cropped. If it is larger,
-// the dimension is padded with zeros.
+// When doing log-odds NCE, the result of this op should be passed through a
+// SparseToDense op, then added to the logits of the sampled candidates. This has
+// the effect of 'removing' the sampled labels that match the true labels by
+// making the classifier sure that they are sampled labels.
 //
 // Arguments:
-//	input: A float32 tensor.
-//	fft_length: An int32 tensor of shape [2]. The FFT length for each dimension.
-//
-// Returns A complex64 tensor of the same rank as `input`. The inner-most 2
-//   dimensions of `input` are replaced with their 2D Fourier transform. The
-//   inner-most dimension contains `fft_length / 2 + 1` unique frequency
-//   components.
+//	true_classes: The true_classes output of UnpackSparseLabels.
+//	sampled_candidates: The sampled_candidates output of CandidateSampler.
+//	num_true: Number of true labels per context.
 //
-// @compatibility(numpy)
-// Equivalent to np.fft.rfft2
-// @end_compatibility
-func RFFT2D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
+// Returns A vector of indices corresponding to rows of true_candidates.A vector of IDs of positions in sampled_candidates that match a true_label
+// for the row with the corresponding index in indices.A vector of the same length as indices and ids, in which each element
+// is -FLOAT_MAX.
+func ComputeAccidentalHits(scope *Scope, true_classes tf.Output, sampled_candidates tf.Output, num_true int64, optional ...ComputeAccidentalHitsAttr) (indices tf.Output, ids tf.Output, weights tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"num_true": num_true}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "RFFT2D",
+		Type: "ComputeAccidentalHits",
 		Input: []tf.Input{
-			input, fft_length,
+			true_classes, sampled_candidates,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// ResizeAreaAttr is an optional argument to ResizeArea.
-type ResizeAreaAttr func(optionalAttr)
+// QuantizedRelu6Attr is an optional argument to QuantizedRelu6.
+type QuantizedRelu6Attr func(optionalAttr)
 
-// ResizeAreaAlignCorners sets the optional align_corners attribute to value.
-//
-// value: If true, rescale input by (new_height - 1) / (height - 1), which
-// exactly aligns the 4 corners of images and resized images. If false, rescale
-// by new_height / height. Treat similarly the width dimension.
-// If not specified, defaults to false
-func ResizeAreaAlignCorners(value bool) ResizeAreaAttr {
+// QuantizedRelu6OutType sets the optional out_type attribute to value.
+// If not specified, defaults to DT_QUINT8
+func QuantizedRelu6OutType(value tf.DataType) QuantizedRelu6Attr {
 	return func(m optionalAttr) {
-		m["align_corners"] = value
+		m["out_type"] = value
 	}
 }
 
-// Resize `images` to `size` using area interpolation.
-//
-// Input images can be of different types but output images are always float.
-//
-// Each output pixel is computed by first transforming the pixel's footprint into
-// the input tensor and then averaging the pixels that intersect the footprint. An
-// input pixel's contribution to the average is weighted by the fraction of its
-// area that intersects the footprint.  This is the same as OpenCV's INTER_AREA.
+// Computes Quantized Rectified Linear 6: `min(max(features, 0), 6)`
 //
 // Arguments:
-//	images: 4-D with shape `[batch, height, width, channels]`.
-//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
-// new size for the images.
 //
-// Returns 4-D with shape
-// `[batch, new_height, new_width, channels]`.
-func ResizeArea(scope *Scope, images tf.Output, size tf.Output, optional ...ResizeAreaAttr) (resized_images tf.Output) {
+//	min_features: The float value that the lowest quantized value represents.
+//	max_features: The float value that the highest quantized value represents.
+//
+// Returns Has the same output shape as "features".The float value that the lowest quantized value represents.The float value that the highest quantized value represents.
+func QuantizedRelu6(scope *Scope, features tf.Output, min_features tf.Output, max_features tf.Output, optional ...QuantizedRelu6Attr) (activations tf.Output, min_activations tf.Output, max_activations tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -8091,193 +8125,176 @@ func ResizeArea(scope *Scope, images tf.Output, size tf.Output, optional ...Resi
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResizeArea",
+		Type: "QuantizedRelu6",
 		Input: []tf.Input{
-			images, size,
+			features, min_features, max_features,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Pads a tensor with zeros.
-//
-// This operation pads a `input` with zeros according to the `paddings` you
-// specify. `paddings` is an integer tensor with shape `[Dn, 2]`, where n is the
-// rank of `input`. For each dimension D of `input`, `paddings[D, 0]` indicates
-// how many zeros to add before the contents of `input` in that dimension, and
-// `paddings[D, 1]` indicates how many zeros to add after the contents of `input`
-// in that dimension.
-//
-// The padded size of each dimension D of the output is:
-//
-// `paddings(D, 0) + input.dim_size(D) + paddings(D, 1)`
-//
-// For example:
+// FixedLengthRecordReaderV2Attr is an optional argument to FixedLengthRecordReaderV2.
+type FixedLengthRecordReaderV2Attr func(optionalAttr)
+
+// FixedLengthRecordReaderV2HeaderBytes sets the optional header_bytes attribute to value.
 //
-// ```
-// # 't' is [[1, 1], [2, 2]]
-// # 'paddings' is [[1, 1], [2, 2]]
-// # rank of 't' is 2
-// pad(t, paddings) ==> [[0, 0, 0, 0, 0, 0]
-//                       [0, 0, 1, 1, 0, 0]
-//                       [0, 0, 2, 2, 0, 0]
-//                       [0, 0, 0, 0, 0, 0]]
-// ```
-func Pad(scope *Scope, input tf.Output, paddings tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Pad",
-		Input: []tf.Input{
-			input, paddings,
-		},
+// value: Number of bytes in the header, defaults to 0.
+// If not specified, defaults to 0
+func FixedLengthRecordReaderV2HeaderBytes(value int64) FixedLengthRecordReaderV2Attr {
+	return func(m optionalAttr) {
+		m["header_bytes"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Checks whether a resource handle-based variable has been initialized.
-//
-// Arguments:
-//	resource: the input resource handle.
+// FixedLengthRecordReaderV2FooterBytes sets the optional footer_bytes attribute to value.
 //
-// Returns a scalar boolean which is true if the variable has been
-// initialized.
-func VarIsInitializedOp(scope *Scope, resource tf.Output) (is_initialized tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "VarIsInitializedOp",
-		Input: []tf.Input{
-			resource,
-		},
+// value: Number of bytes in the footer, defaults to 0.
+// If not specified, defaults to 0
+func FixedLengthRecordReaderV2FooterBytes(value int64) FixedLengthRecordReaderV2Attr {
+	return func(m optionalAttr) {
+		m["footer_bytes"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// StatelessRandomUniformAttr is an optional argument to StatelessRandomUniform.
-type StatelessRandomUniformAttr func(optionalAttr)
+// FixedLengthRecordReaderV2HopBytes sets the optional hop_bytes attribute to value.
+//
+// value: Number of bytes to hop before each read. Default of 0 means using
+// record_bytes.
+// If not specified, defaults to 0
+func FixedLengthRecordReaderV2HopBytes(value int64) FixedLengthRecordReaderV2Attr {
+	return func(m optionalAttr) {
+		m["hop_bytes"] = value
+	}
+}
 
-// StatelessRandomUniformDtype sets the optional dtype attribute to value.
+// FixedLengthRecordReaderV2Container sets the optional container attribute to value.
 //
-// value: The type of the output.
-// If not specified, defaults to DT_FLOAT
-func StatelessRandomUniformDtype(value tf.DataType) StatelessRandomUniformAttr {
+// value: If non-empty, this reader is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func FixedLengthRecordReaderV2Container(value string) FixedLengthRecordReaderV2Attr {
 	return func(m optionalAttr) {
-		m["dtype"] = value
+		m["container"] = value
 	}
 }
 
-// Outputs deterministic pseudorandom random values from a uniform distribution.
+// FixedLengthRecordReaderV2SharedName sets the optional shared_name attribute to value.
 //
-// The generated values follow a uniform distribution in the range `[0, 1)`. The
-// lower bound 0 is included in the range, while the upper bound 1 is excluded.
+// value: If non-empty, this reader is named in the given bucket
+// with this shared_name. Otherwise, the node name is used instead.
+// If not specified, defaults to ""
+func FixedLengthRecordReaderV2SharedName(value string) FixedLengthRecordReaderV2Attr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// FixedLengthRecordReaderV2Encoding sets the optional encoding attribute to value.
 //
-// The outputs are a deterministic function of `shape` and `seed`.
+// value: The type of encoding for the file. Currently ZLIB and GZIP
+// are supported. Defaults to none.
+// If not specified, defaults to ""
+func FixedLengthRecordReaderV2Encoding(value string) FixedLengthRecordReaderV2Attr {
+	return func(m optionalAttr) {
+		m["encoding"] = value
+	}
+}
+
+// A Reader that outputs fixed-length records from a file.
 //
 // Arguments:
-//	shape: The shape of the output tensor.
-//	seed: 2 seeds (shape [2]).
+//	record_bytes: Number of bytes in the record.
 //
-// Returns Random values with specified shape.
-func StatelessRandomUniform(scope *Scope, shape tf.Output, seed tf.Output, optional ...StatelessRandomUniformAttr) (output tf.Output) {
+// Returns The handle to reference the Reader.
+func FixedLengthRecordReaderV2(scope *Scope, record_bytes int64, optional ...FixedLengthRecordReaderV2Attr) (reader_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"record_bytes": record_bytes}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "StatelessRandomUniform",
-		Input: []tf.Input{
-			shape, seed,
-		},
+		Type: "FixedLengthRecordReaderV2",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Makes its input available to the next iteration.
+// Converts each string in the input Tensor to its hash mod by a number of buckets.
+//
+// The hash function is deterministic on the content of the string within the
+// process.
+//
+// Note that the hash function may change from time to time.
+// This functionality will be deprecated and it's recommended to use
+// `tf.string_to_hash_bucket_fast()` or `tf.string_to_hash_bucket_strong()`.
 //
 // Arguments:
-//	data: The tensor to be made available to the next iteration.
 //
-// Returns The same tensor as `data`.
-func NextIteration(scope *Scope, data tf.Output) (output tf.Output) {
+//	num_buckets: The number of buckets.
+//
+// Returns A Tensor of the same shape as the input `string_tensor`.
+func StringToHashBucket(scope *Scope, string_tensor tf.Output, num_buckets int64) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"num_buckets": num_buckets}
 	opspec := tf.OpSpec{
-		Type: "NextIteration",
+		Type: "StringToHashBucket",
 		Input: []tf.Input{
-			data,
+			string_tensor,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Output a fact about factorials.
-func Fact(scope *Scope) (fact tf.Output) {
+// Computes gradients for the exponential linear (Elu) operation.
+//
+// Arguments:
+//	gradients: The backpropagated gradients to the corresponding Elu operation.
+//	outputs: The outputs of the corresponding Elu operation.
+//
+// Returns The gradients: `gradients * (outputs + 1)` if outputs < 0,
+// `gradients` otherwise.
+func EluGrad(scope *Scope, gradients tf.Output, outputs tf.Output) (backprops tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Fact",
+		Type: "EluGrad",
+		Input: []tf.Input{
+			gradients, outputs,
+		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// AngleAttr is an optional argument to Angle.
-type AngleAttr func(optionalAttr)
-
-// AngleTout sets the optional Tout attribute to value.
-// If not specified, defaults to DT_FLOAT
-func AngleTout(value tf.DataType) AngleAttr {
-	return func(m optionalAttr) {
-		m["Tout"] = value
-	}
-}
-
-// Returns the argument of a complex number.
-//
-// Given a tensor `input` of complex numbers, this operation returns a tensor of
-// type `float` that is the argument of each element in `input`. All elements in
-// `input` must be complex numbers of the form \\(a + bj\\), where *a*
-// is the real part and *b* is the imaginary part.
+// Creates a dataset that contains `count` elements from the `input_dataset`.
 //
-// The argument returned by this operation is of the form \\(atan2(b, a)\\).
+// Arguments:
 //
-// For example:
+//	count: A scalar representing the number of elements from the `input_dataset`
+// that should be taken. A value of `-1` indicates that all of `input_dataset`
+// is taken.
 //
-// ```
-// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
-// tf.angle(input) ==> [2.0132, 1.056]
-// ```
 //
-// @compatibility(numpy)
-// Equivalent to np.angle.
-// @end_compatibility
-func Angle(scope *Scope, input tf.Output, optional ...AngleAttr) (output tf.Output) {
+func TakeDataset(scope *Scope, input_dataset tf.Output, count tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "Angle",
+		Type: "TakeDataset",
 		Input: []tf.Input{
-			input,
+			input_dataset, count,
 		},
 		Attrs: attrs,
 	}
@@ -8285,294 +8302,254 @@ func Angle(scope *Scope, input tf.Output, optional ...AngleAttr) (output tf.Outp
 	return op.Output(0)
 }
 
-// VarHandleOpAttr is an optional argument to VarHandleOp.
-type VarHandleOpAttr func(optionalAttr)
-
-// VarHandleOpContainer sets the optional container attribute to value.
-//
-// value: the container this variable is placed in.
-// If not specified, defaults to ""
-func VarHandleOpContainer(value string) VarHandleOpAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// VarHandleOpSharedName sets the optional shared_name attribute to value.
+// The gradient operator for the SparseAdd op.
 //
-// value: the name by which this variable is referred to.
-// If not specified, defaults to ""
-func VarHandleOpSharedName(value string) VarHandleOpAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// Creates a handle to a Variable resource.
+// The SparseAdd op calculates A + B, where A, B, and the sum are all represented
+// as `SparseTensor` objects.  This op takes in the upstream gradient w.r.t.
+// non-empty values of the sum, and outputs the gradients w.r.t. the non-empty
+// values of A and B.
 //
 // Arguments:
-//	dtype: the type of this variable. Must agree with the dtypes
-// of all ops using this variable.
-//	shape: The (possibly partially specified) shape of this variable.
-func VarHandleOp(scope *Scope, dtype tf.DataType, shape tf.Shape, optional ...VarHandleOpAttr) (resource tf.Output) {
+//	backprop_val_grad: 1-D with shape `[nnz(sum)]`.  The gradient with respect to
+// the non-empty values of the sum.
+//	a_indices: 2-D.  The `indices` of the `SparseTensor` A, size `[nnz(A), ndims]`.
+//	b_indices: 2-D.  The `indices` of the `SparseTensor` B, size `[nnz(B), ndims]`.
+//	sum_indices: 2-D.  The `indices` of the sum `SparseTensor`, size
+// `[nnz(sum), ndims]`.
+//
+// Returns 1-D with shape `[nnz(A)]`. The gradient with respect to the
+// non-empty values of A.1-D with shape `[nnz(B)]`. The gradient with respect to the
+// non-empty values of B.
+func SparseAddGrad(scope *Scope, backprop_val_grad tf.Output, a_indices tf.Output, b_indices tf.Output, sum_indices tf.Output) (a_val_grad tf.Output, b_val_grad tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype, "shape": shape}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "VarHandleOp",
-
-		Attrs: attrs,
+		Type: "SparseAddGrad",
+		Input: []tf.Input{
+			backprop_val_grad, a_indices, b_indices, sum_indices,
+		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// Elementwise computes the bitwise XOR of `x` and `y`.
-//
-// The result will have those bits set, that are different in `x` and `y`. The
-// computation is performed on the underlying representations of `x` and `y`.
-func BitwiseXor(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// Computes atan of x element-wise.
+func Atan(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "BitwiseXor",
+		Type: "Atan",
 		Input: []tf.Input{
-			x, y,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Deserialize `SparseTensor` objects.
-//
-// The input `serialized_sparse` must have the shape `[?, ?, ..., ?, 3]` where
-// the last dimension stores serialized `SparseTensor` objects and the other N
-// dimensions (N >= 0) correspond to a batch. The ranks of the original
-// `SparseTensor` objects must all match. When the final `SparseTensor` is
-// created, its rank is the rank of the incoming `SparseTensor` objects plus N;
-// the sparse tensors have been concatenated along new dimensions, one for each
-// batch.
-//
-// The output `SparseTensor` object's shape values for the original dimensions
-// are the max across the input `SparseTensor` objects' shape values for the
-// corresponding dimensions. The new dimensions match the size of the batch.
-//
-// The input `SparseTensor` objects' indices are assumed ordered in
-// standard lexicographic order.  If this is not the case, after this
-// step run `SparseReorder` to restore index ordering.
-//
-// For example, if the serialized input is a `[2 x 3]` matrix representing two
-// original `SparseTensor` objects:
-//
-//     index = [ 0]
-//             [10]
-//             [20]
-//     values = [1, 2, 3]
-//     shape = [50]
-//
-// and
-//
-//     index = [ 2]
-//             [10]
-//     values = [4, 5]
-//     shape = [30]
+// Encode audio data using the WAV file format.
 //
-// then the final deserialized `SparseTensor` will be:
+// This operation will generate a string suitable to be saved out to create a .wav
+// audio file. It will be encoded in the 16-bit PCM format. It takes in float
+// values in the range -1.0f to 1.0f, and any outside that value will be clamped to
+// that range.
 //
-//     index = [0  0]
-//             [0 10]
-//             [0 20]
-//             [1  2]
-//             [1 10]
-//     values = [1, 2, 3, 4, 5]
-//     shape = [2 50]
+// `audio` is a 2-D float Tensor of shape `[length, channels]`.
+// `sample_rate` is a scalar Tensor holding the rate to use (e.g. 44100).
 //
 // Arguments:
-//	serialized_sparse: The serialized `SparseTensor` objects. The last dimension
-// must have 3 columns.
-//	dtype: The `dtype` of the serialized `SparseTensor` objects.
-func DeserializeSparse(scope *Scope, serialized_sparse tf.Output, dtype tf.DataType) (sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output) {
+//	audio: 2-D with shape `[length, channels]`.
+//	sample_rate: Scalar containing the sample frequency.
+//
+// Returns 0-D. WAV-encoded file contents.
+func EncodeWav(scope *Scope, audio tf.Output, sample_rate tf.Output) (contents tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
 	opspec := tf.OpSpec{
-		Type: "DeserializeSparse",
+		Type: "EncodeWav",
 		Input: []tf.Input{
-			serialized_sparse,
+			audio, sample_rate,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// ResourceApplyRMSPropAttr is an optional argument to ResourceApplyRMSProp.
-type ResourceApplyRMSPropAttr func(optionalAttr)
-
-// ResourceApplyRMSPropUseLocking sets the optional use_locking attribute to value.
-//
-// value: If `True`, updating of the var, ms, and mom tensors is protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceApplyRMSPropUseLocking(value bool) ResourceApplyRMSPropAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
+	return op.Output(0)
 }
 
-// Update '*var' according to the RMSProp algorithm.
-//
-// Note that in dense implementation of this algorithm, ms and mom will
-// update even if the grad is zero, but in this sparse implementation, ms
-// and mom will not update in iterations during which the grad is zero.
+// Converts each string in the input Tensor to its hash mod by a number of buckets.
 //
-// mean_square = decay * mean_square + (1-decay) * gradient ** 2
-// Delta = learning_rate * gradient / sqrt(mean_square + epsilon)
+// The hash function is deterministic on the content of the string within the
+// process. The hash function is a keyed hash function, where attribute `key`
+// defines the key of the hash function. `key` is an array of 2 elements.
 //
-// ms <- rho * ms_{t-1} + (1-rho) * grad * grad
-// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)
-// var <- var - mom
+// A strong hash is important when inputs may be malicious, e.g. URLs with
+// additional components. Adversaries could try to make their inputs hash to the
+// same bucket for a denial-of-service attack or to skew the results. A strong
+// hash prevents this by making it difficult, if not infeasible, to compute inputs
+// that hash to the same bucket. This comes at a cost of roughly 4x higher compute
+// time than `tf.string_to_hash_bucket_fast`.
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	ms: Should be from a Variable().
-//	mom: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	rho: Decay rate. Must be a scalar.
-//
-//	epsilon: Ridge term. Must be a scalar.
-//	grad: The gradient.
+//	input: The strings to assign a hash bucket.
+//	num_buckets: The number of buckets.
+//	key: The key for the keyed hash function passed as a list of two uint64
+// elements.
 //
-// Returns the created operation.
-func ResourceApplyRMSProp(scope *Scope, var_ tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyRMSPropAttr) (o *tf.Operation) {
+// Returns A Tensor of the same shape as the input `string_tensor`.
+func StringToHashBucketStrong(scope *Scope, input tf.Output, num_buckets int64, key []int64) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"num_buckets": num_buckets, "key": key}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyRMSProp",
+		Type: "StringToHashBucketStrong",
 		Input: []tf.Input{
-			var_, ms, mom, lr, rho, momentum, epsilon, grad,
+			input,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
-}
-
-// SizeAttr is an optional argument to Size.
-type SizeAttr func(optionalAttr)
-
-// SizeOutType sets the optional out_type attribute to value.
-// If not specified, defaults to DT_INT32
-func SizeOutType(value tf.DataType) SizeAttr {
-	return func(m optionalAttr) {
-		m["out_type"] = value
-	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Returns the size of a tensor.
+// Applies softmax to a batched N-D `SparseTensor`.
 //
-// This operation returns an integer representing the number of elements in
-// `input`.
+// The inputs represent an N-D SparseTensor  with logical shape `[..., B, C]`
+// (where `N >= 2`), and with indices sorted in the canonical lexicographic order.
 //
-// For example:
+// This op is equivalent to applying the normal `tf.nn.softmax()` to each innermost
+// logical submatrix with shape `[B, C]`, but with the catch that *the implicitly
+// zero elements do not participate*.  Specifically, the algorithm is equivalent
+// to the following:
 //
-// ```
-// # 't' is [[[1, 1,, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]]
-// size(t) ==> 12
-// ```
-func Size(scope *Scope, input tf.Output, optional ...SizeAttr) (output tf.Output) {
+//   (1) Applies `tf.nn.softmax()` to a densified view of each innermost submatrix
+//       with shape `[B, C]`, along the size-C dimension;
+//   (2) Masks out the original implicitly-zero locations;
+//   (3) Renormalizes the remaining elements.
+//
+// Hence, the `SparseTensor` result has exactly the same non-zero indices and
+// shape.
+//
+// Arguments:
+//	sp_indices: 2-D.  `NNZ x R` matrix with the indices of non-empty values in a
+// SparseTensor, in canonical ordering.
+//	sp_values: 1-D.  `NNZ` non-empty values corresponding to `sp_indices`.
+//	sp_shape: 1-D.  Shape of the input SparseTensor.
+//
+// Returns 1-D.  The `NNZ` values for the result `SparseTensor`.
+func SparseSoftmax(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "Size",
+		Type: "SparseSoftmax",
 		Input: []tf.Input{
-			input,
+			sp_indices, sp_values, sp_shape,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResourceScatterNdUpdateAttr is an optional argument to ResourceScatterNdUpdate.
-type ResourceScatterNdUpdateAttr func(optionalAttr)
-
-// ResourceScatterNdUpdateUseLocking sets the optional use_locking attribute to value.
+// Partitions `data` into `num_partitions` tensors using indices from `partitions`.
 //
-// value: An optional bool. Defaults to True. If True, the assignment will
-// be protected by a lock; otherwise the behavior is undefined,
-// but may exhibit less contention.
-// If not specified, defaults to true
-func ResourceScatterNdUpdateUseLocking(value bool) ResourceScatterNdUpdateAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// Applies sparse `updates` to individual values or slices within a given
+// For each index tuple `js` of size `partitions.ndim`, the slice `data[js, ...]`
+// becomes part of `outputs[partitions[js]]`.  The slices with `partitions[js] = i`
+// are placed in `outputs[i]` in lexicographic order of `js`, and the first
+// dimension of `outputs[i]` is the number of entries in `partitions` equal to `i`.
+// In detail,
 //
-// variable according to `indices`.
+// ```python
+//     outputs[i].shape = [sum(partitions == i)] + data.shape[partitions.ndim:]
 //
-// `ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.
+//     outputs[i] = pack([data[js, ...] for js if partitions[js] == i])
+// ```
 //
-// `indices` must be integer tensor, containing indices into `ref`.
-// It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.
+// `data.shape` must start with `partitions.shape`.
 //
-// The innermost dimension of `indices` (with length `K`) corresponds to
-// indices into elements (if `K = P`) or slices (if `K < P`) along the `K`th
-// dimension of `ref`.
+// For example:
 //
-// `updates` is `Tensor` of rank `Q-1+P-K` with shape:
+// ```python
+//     # Scalar partitions.
+//     partitions = 1
+//     num_partitions = 2
+//     data = [10, 20]
+//     outputs[0] = []  # Empty with shape [0, 2]
+//     outputs[1] = [[10, 20]]
 //
-// ```
-// [d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]].
+//     # Vector partitions.
+//     partitions = [0, 0, 1, 1, 0]
+//     num_partitions = 2
+//     data = [10, 20, 30, 40, 50]
+//     outputs[0] = [10, 20, 50]
+//     outputs[1] = [30, 40]
 // ```
 //
-// For example, say we want to update 4 scattered elements to a rank-1 tensor to
-// 8 elements. In Python, that update would look like this:
+// See `dynamic_stitch` for an example on how to merge partitions back.
 //
-// ```python
-//     ref = tfe.Variable([1, 2, 3, 4, 5, 6, 7, 8])
-//     indices = tf.constant([[4], [3], [1] ,[7]])
-//     updates = tf.constant([9, 10, 11, 12])
-//     update = tf.scatter_nd_update(ref, indices, updates)
-//     with tf.Session() as sess:
-//       print sess.run(update)
-// ```
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/DynamicPartition.png" alt>
+// </div>
 //
-// The resulting update to ref would look like this:
+// Arguments:
 //
-//     [1, 11, 3, 10, 9, 6, 7, 12]
+//	partitions: Any shape.  Indices in the range `[0, num_partitions)`.
+//	num_partitions: The number of partitions to output.
+func DynamicPartition(scope *Scope, data tf.Output, partitions tf.Output, num_partitions int64) (outputs []tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"num_partitions": num_partitions}
+	opspec := tf.OpSpec{
+		Type: "DynamicPartition",
+		Input: []tf.Input{
+			data, partitions,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if outputs, idx, err = makeOutputList(op, idx, "outputs"); err != nil {
+		scope.UpdateErr("DynamicPartition", err)
+		return
+	}
+	return outputs
+}
+
+// ResourceApplyAdagradAttr is an optional argument to ResourceApplyAdagrad.
+type ResourceApplyAdagradAttr func(optionalAttr)
+
+// ResourceApplyAdagradUseLocking sets the optional use_locking attribute to value.
 //
-// See @{tf.scatter_nd} for more details about how to make updates to
-// slices.
+// value: If `True`, updating of the var and accum tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceApplyAdagradUseLocking(value bool) ResourceApplyAdagradAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// Update '*var' according to the adagrad scheme.
+//
+// accum += grad * grad
+// var -= lr * grad * (1 / sqrt(accum))
 //
 // Arguments:
-//	ref: A resource handle. Must be from a VarHandleOp.
-//	indices: A Tensor. Must be one of the following types: int32, int64.
-// A tensor of indices into ref.
-//	updates: A Tensor. Must have the same type as ref. A tensor of updated
-// values to add to ref.
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	grad: The gradient.
 //
 // Returns the created operation.
-func ResourceScatterNdUpdate(scope *Scope, ref tf.Output, indices tf.Output, updates tf.Output, optional ...ResourceScatterNdUpdateAttr) (o *tf.Operation) {
+func ResourceApplyAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, optional ...ResourceApplyAdagradAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -8581,59 +8558,66 @@ func ResourceScatterNdUpdate(scope *Scope, ref tf.Output, indices tf.Output, upd
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceScatterNdUpdate",
+		Type: "ResourceApplyAdagrad",
 		Input: []tf.Input{
-			ref, indices, updates,
+			var_, accum, lr, grad,
 		},
 		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// SqueezeAttr is an optional argument to Squeeze.
-type SqueezeAttr func(optionalAttr)
-
-// SqueezeAxis sets the optional axis attribute to value.
+// Return the shape of s0 op s1 with broadcast.
 //
-// value: If specified, only squeezes the dimensions listed. The dimension
-// index starts at 0. It is an error to squeeze a dimension that is not 1. Must
-// be in the range `[-rank(input), rank(input))`.
-// If not specified, defaults to <>
+// Given `s0` and `s1`, tensors that represent shapes, compute `r0`, the
+// broadcasted shape. `s0`, `s1` and `r0` are all integer vectors.
+func BroadcastArgs(scope *Scope, s0 tf.Output, s1 tf.Output) (r0 tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "BroadcastArgs",
+		Input: []tf.Input{
+			s0, s1,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// DataFormatDimMapAttr is an optional argument to DataFormatDimMap.
+type DataFormatDimMapAttr func(optionalAttr)
+
+// DataFormatDimMapSrcFormat sets the optional src_format attribute to value.
 //
-// REQUIRES: len(value) >= 0
-func SqueezeAxis(value []int64) SqueezeAttr {
+// value: source data format.
+// If not specified, defaults to "NHWC"
+func DataFormatDimMapSrcFormat(value string) DataFormatDimMapAttr {
 	return func(m optionalAttr) {
-		m["squeeze_dims"] = value
+		m["src_format"] = value
 	}
 }
 
-// Removes dimensions of size 1 from the shape of a tensor.
-//
-// Given a tensor `input`, this operation returns a tensor of the same type with
-// all dimensions of size 1 removed. If you don't want to remove all size 1
-// dimensions, you can remove specific size 1 dimensions by specifying
-// `axis`.
-//
-// For example:
-//
-// ```
-// # 't' is a tensor of shape [1, 2, 1, 3, 1, 1]
-// shape(squeeze(t)) ==> [2, 3]
-// ```
+// DataFormatDimMapDstFormat sets the optional dst_format attribute to value.
 //
-// Or, to remove specific size 1 dimensions:
+// value: destination data format.
+// If not specified, defaults to "NCHW"
+func DataFormatDimMapDstFormat(value string) DataFormatDimMapAttr {
+	return func(m optionalAttr) {
+		m["dst_format"] = value
+	}
+}
+
+// Returns the dimension index in the destination data format given the one in
 //
-// ```
-// # 't' is a tensor of shape [1, 2, 1, 3, 1, 1]
-// shape(squeeze(t, [2, 4])) ==> [1, 2, 3, 1]
-// ```
+// the source data format.
 //
 // Arguments:
-//	input: The `input` to squeeze.
+//	x: A Tensor with each element as a dimension index in source data format.
+// Must be in the range [-4, 4).
 //
-// Returns Contains the same data as `input`, but has one or more dimensions of
-// size 1 removed.
-func Squeeze(scope *Scope, input tf.Output, optional ...SqueezeAttr) (output tf.Output) {
+// Returns A Tensor with each element as a dimension index in destination data format.
+func DataFormatDimMap(scope *Scope, x tf.Output, optional ...DataFormatDimMapAttr) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -8642,9 +8626,9 @@ func Squeeze(scope *Scope, input tf.Output, optional ...SqueezeAttr) (output tf.
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Squeeze",
+		Type: "DataFormatDimMap",
 		Input: []tf.Input{
-			input,
+			x,
 		},
 		Attrs: attrs,
 	}
@@ -8652,38 +8636,38 @@ func Squeeze(scope *Scope, input tf.Output, optional ...SqueezeAttr) (output tf.
 	return op.Output(0)
 }
 
-// ResourceApplyAdadeltaAttr is an optional argument to ResourceApplyAdadelta.
-type ResourceApplyAdadeltaAttr func(optionalAttr)
+// ResourceApplyPowerSignAttr is an optional argument to ResourceApplyPowerSign.
+type ResourceApplyPowerSignAttr func(optionalAttr)
 
-// ResourceApplyAdadeltaUseLocking sets the optional use_locking attribute to value.
+// ResourceApplyPowerSignUseLocking sets the optional use_locking attribute to value.
 //
-// value: If True, updating of the var, accum and update_accum tensors will be protected by
-// a lock; otherwise the behavior is undefined, but may exhibit less contention.
+// value: If `True`, updating of the var and m tensors is
+// protected by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
 // If not specified, defaults to false
-func ResourceApplyAdadeltaUseLocking(value bool) ResourceApplyAdadeltaAttr {
+func ResourceApplyPowerSignUseLocking(value bool) ResourceApplyPowerSignAttr {
 	return func(m optionalAttr) {
 		m["use_locking"] = value
 	}
 }
 
-// Update '*var' according to the adadelta scheme.
+// Update '*var' according to the AddSign update.
 //
-// accum = rho() * accum + (1 - rho()) * grad.square();
-// update = (update_accum + epsilon).sqrt() * (accum + epsilon()).rsqrt() * grad;
-// update_accum = rho() * update_accum + (1 - rho()) * update.square();
-// var -= update;
+// m_t <- beta1 * m_{t-1} + (1 - beta1) * g
+// update <- exp(logbase * sign_decay * sign(g) * sign(m_t)) * g
+// variable <- variable - lr_t * update
 //
 // Arguments:
 //	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	accum_update: Should be from a Variable().
+//	m: Should be from a Variable().
 //	lr: Scaling factor. Must be a scalar.
-//	rho: Decay factor. Must be a scalar.
-//	epsilon: Constant factor. Must be a scalar.
+//	logbase: Must be a scalar.
+//	sign_decay: Must be a scalar.
+//	beta: Must be a scalar.
 //	grad: The gradient.
 //
 // Returns the created operation.
-func ResourceApplyAdadelta(scope *Scope, var_ tf.Output, accum tf.Output, accum_update tf.Output, lr tf.Output, rho tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyAdadeltaAttr) (o *tf.Operation) {
+func ResourceApplyPowerSign(scope *Scope, var_ tf.Output, m tf.Output, lr tf.Output, logbase tf.Output, sign_decay tf.Output, beta tf.Output, grad tf.Output, optional ...ResourceApplyPowerSignAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -8692,147 +8676,81 @@ func ResourceApplyAdadelta(scope *Scope, var_ tf.Output, accum tf.Output, accum_
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyAdadelta",
+		Type: "ResourceApplyPowerSign",
 		Input: []tf.Input{
-			var_, accum, accum_update, lr, rho, epsilon, grad,
+			var_, m, lr, logbase, sign_decay, beta, grad,
 		},
 		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// NonMaxSuppressionAttr is an optional argument to NonMaxSuppression.
-type NonMaxSuppressionAttr func(optionalAttr)
-
-// NonMaxSuppressionIouThreshold sets the optional iou_threshold attribute to value.
+// Computes the mean along segments of a tensor.
 //
-// value: A float representing the threshold for deciding whether boxes
-// overlap too much with respect to IOU.
-// If not specified, defaults to 0.5
-func NonMaxSuppressionIouThreshold(value float32) NonMaxSuppressionAttr {
-	return func(m optionalAttr) {
-		m["iou_threshold"] = value
-	}
-}
-
-// Greedily selects a subset of bounding boxes in descending order of score,
+// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+// segments.
 //
-// pruning away boxes that have high intersection-over-union (IOU) overlap
-// with previously selected boxes.  Bounding boxes are supplied as
-// [y1, x1, y2, x2], where (y1, x1) and (y2, x2) are the coordinates of any
-// diagonal pair of box corners and the coordinates can be provided as normalized
-// (i.e., lying in the interval [0, 1]) or absolute.  Note that this algorithm
-// is agnostic to where the origin is in the coordinate system.  Note that this
-// algorithm is invariant to orthogonal transformations and translations
-// of the coordinate system; thus translating or reflections of the coordinate
-// system result in the same boxes being selected by the algorithm.
-// The output of this operation is a set of integers indexing into the input
-// collection of bounding boxes representing the selected boxes.  The bounding
-// box coordinates corresponding to the selected indices can then be obtained
-// using the `tf.gather operation`.  For example:
-//   selected_indices = tf.image.non_max_suppression(
-//       boxes, scores, max_output_size, iou_threshold)
-//   selected_boxes = tf.gather(boxes, selected_indices)
+// Computes a tensor such that
+// \\(output_i = \frac{\sum_j data_j}{N}\\) where `mean` is
+// over `j` such that `segment_ids[j] == i` and `N` is the total number of
+// values summed.
 //
-// Arguments:
-//	boxes: A 2-D float tensor of shape `[num_boxes, 4]`.
-//	scores: A 1-D float tensor of shape `[num_boxes]` representing a single
-// score corresponding to each box (each row of boxes).
-//	max_output_size: A scalar integer tensor representing the maximum number of
-// boxes to be selected by non max suppression.
-//
-// Returns A 1-D integer tensor of shape `[M]` representing the selected
-// indices from the boxes tensor, where `M <= max_output_size`.
-func NonMaxSuppression(scope *Scope, boxes tf.Output, scores tf.Output, max_output_size tf.Output, optional ...NonMaxSuppressionAttr) (selected_indices tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "NonMaxSuppression",
-		Input: []tf.Input{
-			boxes, scores, max_output_size,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Creates a dataset that emits `components` as a tuple of tensors once.
-func TensorDataset(scope *Scope, components []tf.Output, output_shapes []tf.Shape) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"output_shapes": output_shapes}
-	opspec := tf.OpSpec{
-		Type: "TensorDataset",
-		Input: []tf.Input{
-			tf.OutputList(components),
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Component-wise multiplies a SparseTensor by a dense Tensor.
-//
-// The output locations corresponding to the implicitly zero elements in the sparse
-// tensor will be zero (i.e., will not take up storage space), regardless of the
-// contents of the dense tensor (even if it's +/-INF and that INF*0 == NaN).
+// If the mean is empty for a given segment ID `i`, `output[i] = 0`.
 //
-// *Limitation*: this Op only broadcasts the dense side to the sparse side, but not
-// the other direction.
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentMean.png" alt>
+// </div>
 //
 // Arguments:
-//	sp_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, possibly not in canonical ordering.
-//	sp_values: 1-D.  `N` non-empty values corresponding to `sp_indices`.
-//	sp_shape: 1-D.  Shape of the input SparseTensor.
-//	dense: `R`-D.  The dense Tensor operand.
 //
-// Returns 1-D.  The `N` values that are operated on.
-func SparseDenseCwiseMul(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output, dense tf.Output) (output tf.Output) {
+//	segment_ids: A 1-D tensor whose rank is equal to the rank of `data`'s
+// first dimension.  Values should be sorted and can be repeated.
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SegmentMean(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseDenseCwiseMul",
+		Type: "SegmentMean",
 		Input: []tf.Input{
-			sp_indices, sp_values, sp_shape, dense,
+			data, segment_ids,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResourceSparseApplyRMSPropAttr is an optional argument to ResourceSparseApplyRMSProp.
-type ResourceSparseApplyRMSPropAttr func(optionalAttr)
+// ResourceSparseApplyCenteredRMSPropAttr is an optional argument to ResourceSparseApplyCenteredRMSProp.
+type ResourceSparseApplyCenteredRMSPropAttr func(optionalAttr)
 
-// ResourceSparseApplyRMSPropUseLocking sets the optional use_locking attribute to value.
+// ResourceSparseApplyCenteredRMSPropUseLocking sets the optional use_locking attribute to value.
 //
-// value: If `True`, updating of the var, ms, and mom tensors is protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
+// value: If `True`, updating of the var, mg, ms, and mom tensors is
+// protected by a lock; otherwise the behavior is undefined, but may exhibit less
 // contention.
 // If not specified, defaults to false
-func ResourceSparseApplyRMSPropUseLocking(value bool) ResourceSparseApplyRMSPropAttr {
+func ResourceSparseApplyCenteredRMSPropUseLocking(value bool) ResourceSparseApplyCenteredRMSPropAttr {
 	return func(m optionalAttr) {
 		m["use_locking"] = value
 	}
 }
 
-// Update '*var' according to the RMSProp algorithm.
+// Update '*var' according to the centered RMSProp algorithm.
 //
-// Note that in dense implementation of this algorithm, ms and mom will
-// update even if the grad is zero, but in this sparse implementation, ms
+// The centered RMSProp algorithm uses an estimate of the centered second moment
+// (i.e., the variance) for normalization, as opposed to regular RMSProp, which
+// uses the (uncentered) second moment. This often helps with training, but is
+// slightly more expensive in terms of computation and memory.
+//
+// Note that in dense implementation of this algorithm, mg, ms, and mom will
+// update even if the grad is zero, but in this sparse implementation, mg, ms,
 // and mom will not update in iterations during which the grad is zero.
 //
 // mean_square = decay * mean_square + (1-decay) * gradient ** 2
-// Delta = learning_rate * gradient / sqrt(mean_square + epsilon)
+// mean_grad = decay * mean_grad + (1-decay) * gradient
+// Delta = learning_rate * gradient / sqrt(mean_square + epsilon - mean_grad ** 2)
 //
 // ms <- rho * ms_{t-1} + (1-rho) * grad * grad
 // mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)
@@ -8840,6 +8758,7 @@ func ResourceSparseApplyRMSPropUseLocking(value bool) ResourceSparseApplyRMSProp
 //
 // Arguments:
 //	var_: Should be from a Variable().
+//	mg: Should be from a Variable().
 //	ms: Should be from a Variable().
 //	mom: Should be from a Variable().
 //	lr: Scaling factor. Must be a scalar.
@@ -8850,7 +8769,7 @@ func ResourceSparseApplyRMSPropUseLocking(value bool) ResourceSparseApplyRMSProp
 //	indices: A vector of indices into the first dimension of var, ms and mom.
 //
 // Returns the created operation.
-func ResourceSparseApplyRMSProp(scope *Scope, var_ tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyRMSPropAttr) (o *tf.Operation) {
+func ResourceSparseApplyCenteredRMSProp(scope *Scope, var_ tf.Output, mg tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyCenteredRMSPropAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -8859,168 +8778,200 @@ func ResourceSparseApplyRMSProp(scope *Scope, var_ tf.Output, ms tf.Output, mom
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyRMSProp",
+		Type: "ResourceSparseApplyCenteredRMSProp",
 		Input: []tf.Input{
-			var_, ms, mom, lr, rho, momentum, epsilon, grad, indices,
+			var_, mg, ms, mom, lr, rho, momentum, epsilon, grad, indices,
 		},
 		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// Returns the truth value of (x > y) element-wise.
+// Creates a dataset that batches `batch_size` elements from `input_dataset`.
 //
-// *NOTE*: `Greater` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Greater(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// Arguments:
+//
+//	batch_size: A scalar representing the number of elements to accumulate in a
+// batch.
+//
+//
+func BatchDataset(scope *Scope, input_dataset tf.Output, batch_size tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "Greater",
+		Type: "BatchDataset",
 		Input: []tf.Input{
-			x, y,
+			input_dataset, batch_size,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// SampleDistortedBoundingBoxAttr is an optional argument to SampleDistortedBoundingBox.
-type SampleDistortedBoundingBoxAttr func(optionalAttr)
-
-// SampleDistortedBoundingBoxSeed sets the optional seed attribute to value.
+// Inverse fast Fourier transform.
 //
-// value: If either `seed` or `seed2` are set to non-zero, the random number
-// generator is seeded by the given `seed`.  Otherwise, it is seeded by a random
-// seed.
-// If not specified, defaults to 0
-func SampleDistortedBoundingBoxSeed(value int64) SampleDistortedBoundingBoxAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
+// Computes the inverse 1-dimensional discrete Fourier transform over the
+// inner-most dimension of `input`.
+//
+// Arguments:
+//	input: A complex64 tensor.
+//
+// Returns A complex64 tensor of the same shape as `input`. The inner-most
+//   dimension of `input` is replaced with its inverse 1D Fourier transform.
+//
+// @compatibility(numpy)
+// Equivalent to np.fft.ifft
+// @end_compatibility
+func IFFT(scope *Scope, input tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "IFFT",
+		Input: []tf.Input{
+			input,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// SampleDistortedBoundingBoxSeed2 sets the optional seed2 attribute to value.
+// Generates values in an interval.
 //
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func SampleDistortedBoundingBoxSeed2(value int64) SampleDistortedBoundingBoxAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
+// A sequence of `num` evenly-spaced values are generated beginning at `start`.
+// If `num > 1`, the values in the sequence increase by `stop - start / num - 1`,
+// so that the last one is exactly `stop`.
+//
+// For example:
+//
+// ```
+// tf.linspace(10.0, 12.0, 3, name="linspace") => [ 10.0  11.0  12.0]
+// ```
+//
+// Arguments:
+//	start: First entry in the range.
+//	stop: Last entry in the range.
+//	num: Number of values to generate.
+//
+// Returns 1-D. The generated values.
+func LinSpace(scope *Scope, start tf.Output, stop tf.Output, num tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
+	opspec := tf.OpSpec{
+		Type: "LinSpace",
+		Input: []tf.Input{
+			start, stop, num,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// SampleDistortedBoundingBoxMinObjectCovered sets the optional min_object_covered attribute to value.
+// DestroyResourceOpAttr is an optional argument to DestroyResourceOp.
+type DestroyResourceOpAttr func(optionalAttr)
+
+// DestroyResourceOpIgnoreLookupError sets the optional ignore_lookup_error attribute to value.
 //
-// value: The cropped area of the image must contain at least this
-// fraction of any bounding box supplied. The value of this parameter should be
-// non-negative. In the case of 0, the cropped area does not need to overlap
-// any of the bounding boxes supplied.
-// If not specified, defaults to 0.1
-func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBoundingBoxAttr {
+// value: whether to ignore the error when the resource
+// doesn't exist.
+// If not specified, defaults to true
+func DestroyResourceOpIgnoreLookupError(value bool) DestroyResourceOpAttr {
 	return func(m optionalAttr) {
-		m["min_object_covered"] = value
+		m["ignore_lookup_error"] = value
 	}
 }
 
-// SampleDistortedBoundingBoxAspectRatioRange sets the optional aspect_ratio_range attribute to value.
+// Deletes the resource specified by the handle.
 //
-// value: The cropped area of the image must have an aspect ratio =
-// width / height within this range.
-// If not specified, defaults to <f:0.75 f:1.33 >
-func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr {
-	return func(m optionalAttr) {
-		m["aspect_ratio_range"] = value
+// All subsequent operations using the resource will result in a NotFound
+// error status.
+//
+// Arguments:
+//	resource: handle to the resource to delete.
+//
+// Returns the created operation.
+func DestroyResourceOp(scope *Scope, resource tf.Output, optional ...DestroyResourceOpAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "DestroyResourceOp",
+		Input: []tf.Input{
+			resource,
+		},
+		Attrs: attrs,
 	}
+	return scope.AddOperation(opspec)
 }
 
-// SampleDistortedBoundingBoxAreaRange sets the optional area_range attribute to value.
+// LRNAttr is an optional argument to LRN.
+type LRNAttr func(optionalAttr)
+
+// LRNDepthRadius sets the optional depth_radius attribute to value.
 //
-// value: The cropped area of the image must contain a fraction of the
-// supplied image within in this range.
-// If not specified, defaults to <f:0.05 f:1 >
-func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr {
+// value: 0-D.  Half-width of the 1-D normalization window.
+// If not specified, defaults to 5
+func LRNDepthRadius(value int64) LRNAttr {
 	return func(m optionalAttr) {
-		m["area_range"] = value
+		m["depth_radius"] = value
 	}
 }
 
-// SampleDistortedBoundingBoxMaxAttempts sets the optional max_attempts attribute to value.
+// LRNBias sets the optional bias attribute to value.
 //
-// value: Number of attempts at generating a cropped region of the image
-// of the specified constraints. After `max_attempts` failures, return the entire
-// image.
-// If not specified, defaults to 100
-func SampleDistortedBoundingBoxMaxAttempts(value int64) SampleDistortedBoundingBoxAttr {
+// value: An offset (usually positive to avoid dividing by 0).
+// If not specified, defaults to 1
+func LRNBias(value float32) LRNAttr {
 	return func(m optionalAttr) {
-		m["max_attempts"] = value
+		m["bias"] = value
 	}
 }
 
-// SampleDistortedBoundingBoxUseImageIfNoBoundingBoxes sets the optional use_image_if_no_bounding_boxes attribute to value.
+// LRNAlpha sets the optional alpha attribute to value.
 //
-// value: Controls behavior if no bounding boxes supplied.
-// If true, assume an implicit bounding box covering the whole input. If false,
-// raise an error.
-// If not specified, defaults to false
-func SampleDistortedBoundingBoxUseImageIfNoBoundingBoxes(value bool) SampleDistortedBoundingBoxAttr {
+// value: A scale factor, usually positive.
+// If not specified, defaults to 1
+func LRNAlpha(value float32) LRNAttr {
 	return func(m optionalAttr) {
-		m["use_image_if_no_bounding_boxes"] = value
+		m["alpha"] = value
 	}
 }
 
-// Generate a single randomly distorted bounding box for an image.
-//
-// Bounding box annotations are often supplied in addition to ground-truth labels
-// in image recognition or object localization tasks. A common technique for
-// training such a system is to randomly distort an image while preserving
-// its content, i.e. *data augmentation*. This Op outputs a randomly distorted
-// localization of an object, i.e. bounding box, given an `image_size`,
-// `bounding_boxes` and a series of constraints.
-//
-// The output of this Op is a single bounding box that may be used to crop the
-// original image. The output is returned as 3 tensors: `begin`, `size` and
-// `bboxes`. The first 2 tensors can be fed directly into `tf.slice` to crop the
-// image. The latter may be supplied to `tf.image.draw_bounding_boxes` to visualize
-// what the bounding box looks like.
-//
-// Bounding boxes are supplied and returned as `[y_min, x_min, y_max, x_max]`. The
-// bounding box coordinates are floats in `[0.0, 1.0]` relative to the width and
-// height of the underlying image.
-//
-// For example,
+// LRNBeta sets the optional beta attribute to value.
 //
-// ```python
-//     # Generate a single distorted bounding box.
-//     begin, size, bbox_for_draw = tf.image.sample_distorted_bounding_box(
-//         tf.shape(image),
-//         bounding_boxes=bounding_boxes)
+// value: An exponent.
+// If not specified, defaults to 0.5
+func LRNBeta(value float32) LRNAttr {
+	return func(m optionalAttr) {
+		m["beta"] = value
+	}
+}
+
+// Local Response Normalization.
 //
-//     # Draw the bounding box in an image summary.
-//     image_with_box = tf.image.draw_bounding_boxes(tf.expand_dims(image, 0),
-//                                                   bbox_for_draw)
-//     tf.summary.image('images_with_box', image_with_box)
+// The 4-D `input` tensor is treated as a 3-D array of 1-D vectors (along the last
+// dimension), and each vector is normalized independently.  Within a given vector,
+// each component is divided by the weighted, squared sum of inputs within
+// `depth_radius`.  In detail,
 //
-//     # Employ the bounding box to distort the image.
-//     distorted_image = tf.slice(image, begin, size)
-// ```
+//     sqr_sum[a, b, c, d] =
+//         sum(input[a, b, c, d - depth_radius : d + depth_radius + 1] ** 2)
+//     output = input / (bias + alpha * sqr_sum) ** beta
 //
-// Note that if no bounding box information is available, setting
-// `use_image_if_no_bounding_boxes = true` will assume there is a single implicit
-// bounding box covering the whole image. If `use_image_if_no_bounding_boxes` is
-// false and no bounding boxes are supplied, an error is raised.
+// For details, see [Krizhevsky et al., ImageNet classification with deep
+// convolutional neural networks (NIPS 2012)](http://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks).
 //
 // Arguments:
-//	image_size: 1-D, containing `[height, width, channels]`.
-//	bounding_boxes: 3-D with shape `[batch, N, 4]` describing the N bounding boxes
-// associated with the image.
-//
-// Returns 1-D, containing `[offset_height, offset_width, 0]`. Provide as input to
-// `tf.slice`.1-D, containing `[target_height, target_width, -1]`. Provide as input to
-// `tf.slice`.3-D with shape `[1, 1, 4]` containing the distorted bounding box.
-// Provide as input to `tf.image.draw_bounding_boxes`.
-func SampleDistortedBoundingBox(scope *Scope, image_size tf.Output, bounding_boxes tf.Output, optional ...SampleDistortedBoundingBoxAttr) (begin tf.Output, size tf.Output, bboxes tf.Output) {
+//	input: 4-D.
+func LRN(scope *Scope, input tf.Output, optional ...LRNAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -9029,37 +8980,7 @@ func SampleDistortedBoundingBox(scope *Scope, image_size tf.Output, bounding_box
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SampleDistortedBoundingBox",
-		Input: []tf.Input{
-			image_size, bounding_boxes,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// Converts each string in the input Tensor to its hash mod by a number of buckets.
-//
-// The hash function is deterministic on the content of the string within the
-// process and will never change. However, it is not suitable for cryptography.
-// This function may be used when CPU time is scarce and inputs are trusted or
-// unimportant. There is a risk of adversaries constructing inputs that all hash
-// to the same bucket. To prevent this problem, use a strong hash function with
-// `tf.string_to_hash_bucket_strong`.
-//
-// Arguments:
-//	input: The strings to assign a hash bucket.
-//	num_buckets: The number of buckets.
-//
-// Returns A Tensor of the same shape as the input `string_tensor`.
-func StringToHashBucketFast(scope *Scope, input tf.Output, num_buckets int64) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"num_buckets": num_buckets}
-	opspec := tf.OpSpec{
-		Type: "StringToHashBucketFast",
+		Type: "LRN",
 		Input: []tf.Input{
 			input,
 		},
@@ -9069,296 +8990,326 @@ func StringToHashBucketFast(scope *Scope, input tf.Output, num_buckets int64) (o
 	return op.Output(0)
 }
 
-// Returns the max of x and y (i.e. x > y ? x : y) element-wise.
-//
-// *NOTE*: `Maximum` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Maximum(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// Creates a dataset that zips together `input_datasets`.
+func ZipDataset(scope *Scope, input_datasets []tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "Maximum",
+		Type: "ZipDataset",
 		Input: []tf.Input{
-			x, y,
+			tf.OutputList(input_datasets),
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// TensorArrayGatherV3Attr is an optional argument to TensorArrayGatherV3.
-type TensorArrayGatherV3Attr func(optionalAttr)
+// ResourceSparseApplyAdagradAttr is an optional argument to ResourceSparseApplyAdagrad.
+type ResourceSparseApplyAdagradAttr func(optionalAttr)
 
-// TensorArrayGatherV3ElementShape sets the optional element_shape attribute to value.
+// ResourceSparseApplyAdagradUseLocking sets the optional use_locking attribute to value.
 //
-// value: The expected shape of an element, if known. Used to
-// validate the shapes of TensorArray elements. If this shape is not
-// fully specified, gathering zero-size TensorArrays is an error.
-// If not specified, defaults to <unknown_rank:true >
-func TensorArrayGatherV3ElementShape(value tf.Shape) TensorArrayGatherV3Attr {
+// value: If `True`, updating of the var and accum tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceSparseApplyAdagradUseLocking(value bool) ResourceSparseApplyAdagradAttr {
 	return func(m optionalAttr) {
-		m["element_shape"] = value
+		m["use_locking"] = value
 	}
 }
 
-// Gather specific elements from the TensorArray into output `value`.
+// Update relevant entries in '*var' and '*accum' according to the adagrad scheme.
 //
-// All elements selected by `indices` must have the same shape.
+// That is for rows we have grad for, we update var and accum as follows:
+// accum += grad * grad
+// var -= lr * grad * (1 / sqrt(accum))
 //
 // Arguments:
-//	handle: The handle to a TensorArray.
-//	indices: The locations in the TensorArray from which to read tensor elements.
-//	flow_in: A float scalar that enforces proper chaining of operations.
-//	dtype: The type of the elem that is returned.
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	lr: Learning rate. Must be a scalar.
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var and accum.
 //
-// Returns All of the elements in the TensorArray, concatenated along a new
-// axis (the new dimension 0).
-func TensorArrayGatherV3(scope *Scope, handle tf.Output, indices tf.Output, flow_in tf.Output, dtype tf.DataType, optional ...TensorArrayGatherV3Attr) (value tf.Output) {
+// Returns the created operation.
+func ResourceSparseApplyAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyAdagradAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayGatherV3",
+		Type: "ResourceSparseApplyAdagrad",
 		Input: []tf.Input{
-			handle, indices, flow_in,
+			var_, accum, lr, grad, indices,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Returns x / y element-wise for integer types.
+// 2D real-valued fast Fourier transform.
 //
-// Truncation designates that negative numbers will round fractional quantities
-// toward zero. I.e. -7 / 5 = -1. This matches C semantics but it is different
-// than Python semantics. See `FloorDiv` for a division function that matches
-// Python Semantics.
+// Computes the 2-dimensional discrete Fourier transform of a real-valued signal
+// over the inner-most 2 dimensions of `input`.
 //
-// *NOTE*: `TruncateDiv` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func TruncateDiv(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// Since the DFT of a real signal is Hermitian-symmetric, `RFFT2D` only returns the
+// `fft_length / 2 + 1` unique components of the FFT for the inner-most dimension
+// of `output`: the zero-frequency term, followed by the `fft_length / 2`
+// positive-frequency terms.
+//
+// Along each axis `RFFT2D` is computed on, if `fft_length` is smaller than the
+// corresponding dimension of `input`, the dimension is cropped. If it is larger,
+// the dimension is padded with zeros.
+//
+// Arguments:
+//	input: A float32 tensor.
+//	fft_length: An int32 tensor of shape [2]. The FFT length for each dimension.
+//
+// Returns A complex64 tensor of the same rank as `input`. The inner-most 2
+//   dimensions of `input` are replaced with their 2D Fourier transform. The
+//   inner-most dimension contains `fft_length / 2 + 1` unique frequency
+//   components.
+//
+// @compatibility(numpy)
+// Equivalent to np.fft.rfft2
+// @end_compatibility
+func RFFT2D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "TruncateDiv",
+		Type: "RFFT2D",
 		Input: []tf.Input{
-			x, y,
+			input, fft_length,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Restores tensors from a V2 checkpoint.
+// ResizeAreaAttr is an optional argument to ResizeArea.
+type ResizeAreaAttr func(optionalAttr)
+
+// ResizeAreaAlignCorners sets the optional align_corners attribute to value.
 //
-// For backward compatibility with the V1 format, this Op currently allows
-// restoring from a V1 checkpoint as well:
-//   - This Op first attempts to find the V2 index file pointed to by "prefix", and
-//     if found proceed to read it as a V2 checkpoint;
-//   - Otherwise the V1 read path is invoked.
-// Relying on this behavior is not recommended, as the ability to fall back to read
-// V1 might be deprecated and eventually removed.
+// value: If true, rescale input by (new_height - 1) / (height - 1), which
+// exactly aligns the 4 corners of images and resized images. If false, rescale
+// by new_height / height. Treat similarly the width dimension.
+// If not specified, defaults to false
+func ResizeAreaAlignCorners(value bool) ResizeAreaAttr {
+	return func(m optionalAttr) {
+		m["align_corners"] = value
+	}
+}
+
+// Resize `images` to `size` using area interpolation.
 //
-// By default, restores the named tensors in full.  If the caller wishes to restore
-// specific slices of stored tensors, "shape_and_slices" should be non-empty
-// strings and correspondingly well-formed.
+// Input images can be of different types but output images are always float.
 //
-// Callers must ensure all the named tensors are indeed stored in the checkpoint.
+// Each output pixel is computed by first transforming the pixel's footprint into
+// the input tensor and then averaging the pixels that intersect the footprint. An
+// input pixel's contribution to the average is weighted by the fraction of its
+// area that intersects the footprint.  This is the same as OpenCV's INTER_AREA.
 //
 // Arguments:
-//	prefix: Must have a single element.  The prefix of a V2 checkpoint.
-//	tensor_names: shape {N}.  The names of the tensors to be restored.
-//	shape_and_slices: shape {N}.  The slice specs of the tensors to be restored.
-// Empty strings indicate that they are non-partitioned tensors.
-//	dtypes: shape {N}.  The list of expected dtype for the tensors.  Must match
-// those stored in the checkpoint.
-//
-// Returns shape {N}.  The restored tensors, whose shapes are read from the
-// checkpoint directly.
-func RestoreV2(scope *Scope, prefix tf.Output, tensor_names tf.Output, shape_and_slices tf.Output, dtypes []tf.DataType) (tensors []tf.Output) {
+//	images: 4-D with shape `[batch, height, width, channels]`.
+//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
+// new size for the images.
+//
+// Returns 4-D with shape
+// `[batch, new_height, new_width, channels]`.
+func ResizeArea(scope *Scope, images tf.Output, size tf.Output, optional ...ResizeAreaAttr) (resized_images tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "RestoreV2",
+		Type: "ResizeArea",
 		Input: []tf.Input{
-			prefix, tensor_names, shape_and_slices,
+			images, size,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if tensors, idx, err = makeOutputList(op, idx, "tensors"); err != nil {
-		scope.UpdateErr("RestoreV2", err)
-		return
-	}
-	return tensors
+	return op.Output(0)
 }
 
-// Creates a dataset that skips `count` elements from the `input_dataset`.
+// Pads a tensor with zeros.
 //
-// Arguments:
+// This operation pads a `input` with zeros according to the `paddings` you
+// specify. `paddings` is an integer tensor with shape `[Dn, 2]`, where n is the
+// rank of `input`. For each dimension D of `input`, `paddings[D, 0]` indicates
+// how many zeros to add before the contents of `input` in that dimension, and
+// `paddings[D, 1]` indicates how many zeros to add after the contents of `input`
+// in that dimension.
 //
-//	count: A scalar representing the number of elements from the `input_dataset`
-// that should be skipped.  If count is -1, skips everything.
+// The padded size of each dimension D of the output is:
 //
+// `paddings(D, 0) + input.dim_size(D) + paddings(D, 1)`
 //
-func SkipDataset(scope *Scope, input_dataset tf.Output, count tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// For example:
+//
+// ```
+// # 't' is [[1, 1], [2, 2]]
+// # 'paddings' is [[1, 1], [2, 2]]
+// # rank of 't' is 2
+// pad(t, paddings) ==> [[0, 0, 0, 0, 0, 0]
+//                       [0, 0, 1, 1, 0, 0]
+//                       [0, 0, 2, 2, 0, 0]
+//                       [0, 0, 0, 0, 0, 0]]
+// ```
+func Pad(scope *Scope, input tf.Output, paddings tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "SkipDataset",
+		Type: "Pad",
 		Input: []tf.Input{
-			input_dataset, count,
+			input, paddings,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the maximum along segments of a tensor.
-//
-// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
-// segments.
-//
-// Computes a tensor such that
-// \\(output_i = \max_j(data_j)\\) where `max` is over `j` such
-// that `segment_ids[j] == i`.
-//
-// If the max is empty for a given segment ID `i`, `output[i] = 0`.
-//
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentMax.png" alt>
-// </div>
+// Checks whether a resource handle-based variable has been initialized.
 //
 // Arguments:
+//	resource: the input resource handle.
 //
-//	segment_ids: A 1-D tensor whose rank is equal to the rank of `data`'s
-// first dimension.  Values should be sorted and can be repeated.
-//
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SegmentMax(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
+// Returns a scalar boolean which is true if the variable has been
+// initialized.
+func VarIsInitializedOp(scope *Scope, resource tf.Output) (is_initialized tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SegmentMax",
+		Type: "VarIsInitializedOp",
 		Input: []tf.Input{
-			data, segment_ids,
+			resource,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes hyperbolic tangent of `x` element-wise.
-func Tanh(scope *Scope, x tf.Output) (y tf.Output) {
+// StatelessRandomUniformAttr is an optional argument to StatelessRandomUniform.
+type StatelessRandomUniformAttr func(optionalAttr)
+
+// StatelessRandomUniformDtype sets the optional dtype attribute to value.
+//
+// value: The type of the output.
+// If not specified, defaults to DT_FLOAT
+func StatelessRandomUniformDtype(value tf.DataType) StatelessRandomUniformAttr {
+	return func(m optionalAttr) {
+		m["dtype"] = value
+	}
+}
+
+// Outputs deterministic pseudorandom random values from a uniform distribution.
+//
+// The generated values follow a uniform distribution in the range `[0, 1)`. The
+// lower bound 0 is included in the range, while the upper bound 1 is excluded.
+//
+// The outputs are a deterministic function of `shape` and `seed`.
+//
+// Arguments:
+//	shape: The shape of the output tensor.
+//	seed: 2 seeds (shape [2]).
+//
+// Returns Random values with specified shape.
+func StatelessRandomUniform(scope *Scope, shape tf.Output, seed tf.Output, optional ...StatelessRandomUniformAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Tanh",
+		Type: "StatelessRandomUniform",
 		Input: []tf.Input{
-			x,
+			shape, seed,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Decode web-safe base64-encoded strings.
-//
-// Input may or may not have padding at the end. See EncodeBase64 for padding.
-// Web-safe means that input must use - and _ instead of + and /.
+// Makes its input available to the next iteration.
 //
 // Arguments:
-//	input: Base64 strings to decode.
+//	data: The tensor to be made available to the next iteration.
 //
-// Returns Decoded strings.
-func DecodeBase64(scope *Scope, input tf.Output) (output tf.Output) {
+// Returns The same tensor as `data`.
+func NextIteration(scope *Scope, data tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "DecodeBase64",
+		Type: "NextIteration",
 		Input: []tf.Input{
-			input,
+			data,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Store the input tensor in the state of the current session.
-//
-// Arguments:
-//	value: The tensor to be stored.
-//
-// Returns The handle for the tensor stored in the session state, represented
-// as a string.
-func GetSessionHandle(scope *Scope, value tf.Output) (handle tf.Output) {
+// Output a fact about factorials.
+func Fact(scope *Scope) (fact tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "GetSessionHandle",
-		Input: []tf.Input{
-			value,
-		},
+		Type: "Fact",
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResourceSparseApplyProximalAdagradAttr is an optional argument to ResourceSparseApplyProximalAdagrad.
-type ResourceSparseApplyProximalAdagradAttr func(optionalAttr)
+// AngleAttr is an optional argument to Angle.
+type AngleAttr func(optionalAttr)
 
-// ResourceSparseApplyProximalAdagradUseLocking sets the optional use_locking attribute to value.
-//
-// value: If True, updating of the var and accum tensors will be protected by
-// a lock; otherwise the behavior is undefined, but may exhibit less contention.
-// If not specified, defaults to false
-func ResourceSparseApplyProximalAdagradUseLocking(value bool) ResourceSparseApplyProximalAdagradAttr {
+// AngleTout sets the optional Tout attribute to value.
+// If not specified, defaults to DT_FLOAT
+func AngleTout(value tf.DataType) AngleAttr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["Tout"] = value
 	}
 }
 
-// Sparse update entries in '*var' and '*accum' according to FOBOS algorithm.
+// Returns the argument of a complex number.
 //
-// That is for rows we have grad for, we update var and accum as follows:
-// accum += grad * grad
-// prox_v = var
-// prox_v -= lr * grad * (1 / sqrt(accum))
-// var = sign(prox_v)/(1+lr*l2) * max{|prox_v|-lr*l1,0}
+// Given a tensor `input` of complex numbers, this operation returns a tensor of
+// type `float` that is the argument of each element in `input`. All elements in
+// `input` must be complex numbers of the form \\(a + bj\\), where *a*
+// is the real part and *b* is the imaginary part.
 //
-// Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	lr: Learning rate. Must be a scalar.
-//	l1: L1 regularization. Must be a scalar.
-//	l2: L2 regularization. Must be a scalar.
-//	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var and accum.
+// The argument returned by this operation is of the form \\(atan2(b, a)\\).
 //
-// Returns the created operation.
-func ResourceSparseApplyProximalAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyProximalAdagradAttr) (o *tf.Operation) {
+// For example:
+//
+// ```
+// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
+// tf.angle(input) ==> [2.0132, 1.056]
+// ```
+//
+// @compatibility(numpy)
+// Equivalent to np.angle.
+// @end_compatibility
+func Angle(scope *Scope, input tf.Output, optional ...AngleAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -9367,287 +9318,263 @@ func ResourceSparseApplyProximalAdagrad(scope *Scope, var_ tf.Output, accum tf.O
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyProximalAdagrad",
+		Type: "Angle",
 		Input: []tf.Input{
-			var_, accum, lr, l1, l2, grad, indices,
+			input,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
-}
-
-// Returns element-wise largest integer not greater than x.
-func Floor(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Floor",
-		Input: []tf.Input{
-			x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes the Gauss error function of `x` element-wise.
-func Erf(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Erf",
-		Input: []tf.Input{
-			x,
-		},
-	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// OneHotAttr is an optional argument to OneHot.
-type OneHotAttr func(optionalAttr)
+// VarHandleOpAttr is an optional argument to VarHandleOp.
+type VarHandleOpAttr func(optionalAttr)
 
-// OneHotAxis sets the optional axis attribute to value.
+// VarHandleOpContainer sets the optional container attribute to value.
 //
-// value: The axis to fill (default: -1, a new inner-most axis).
-// If not specified, defaults to -1
-func OneHotAxis(value int64) OneHotAttr {
+// value: the container this variable is placed in.
+// If not specified, defaults to ""
+func VarHandleOpContainer(value string) VarHandleOpAttr {
 	return func(m optionalAttr) {
-		m["axis"] = value
+		m["container"] = value
 	}
 }
 
-// Returns a one-hot tensor.
+// VarHandleOpSharedName sets the optional shared_name attribute to value.
 //
-// The locations represented by indices in `indices` take value `on_value`,
-// while all other locations take value `off_value`.
-//
-// If the input `indices` is rank `N`, the output will have rank `N+1`,
-// The new axis is created at dimension `axis` (default: the new axis is
-// appended at the end).
-//
-// If `indices` is a scalar the output shape will be a vector of length `depth`.
-//
-// If `indices` is a vector of length `features`, the output shape will be:
-// ```
-//   features x depth if axis == -1
-//   depth x features if axis == 0
-// ```
-//
-// If `indices` is a matrix (batch) with shape `[batch, features]`,
-// the output shape will be:
-// ```
-//   batch x features x depth if axis == -1
-//   batch x depth x features if axis == 1
-//   depth x batch x features if axis == 0
-// ```
-//
-//
-// Examples
-// =========
-//
-// Suppose that
-//
-// ```
-//   indices = [0, 2, -1, 1]
-//   depth = 3
-//   on_value = 5.0
-//   off_value = 0.0
-//   axis = -1
-// ```
-//
-// Then output is `[4 x 3]`:
-//
-//     ```output =
-//       [5.0 0.0 0.0]  // one_hot(0)
-//       [0.0 0.0 5.0]  // one_hot(2)
-//       [0.0 0.0 0.0]  // one_hot(-1)
-//       [0.0 5.0 0.0]  // one_hot(1)
-//     ```
-//
-// Suppose that
-//
-// ```
-//   indices = [0, 2, -1, 1]
-//   depth = 3
-//   on_value = 0.0
-//   off_value = 3.0
-//   axis = 0
-// ```
-//
-// Then output is `[3 x 4]`:
-//
-//     ```output =
-//       [0.0 3.0 3.0 3.0]
-//       [3.0 3.0 3.0 0.0]
-//       [3.0 3.0 3.0 3.0]
-//       [3.0 0.0 3.0 3.0]
-//     //  ^                one_hot(0)
-//     //      ^            one_hot(2)
-//     //          ^        one_hot(-1)
-//     //              ^    one_hot(1)
-//     ```
-// Suppose that
-//
-// ```
-//   indices = [[0, 2], [1, -1]]
-//   depth = 3
-//   on_value = 1.0
-//   off_value = 0.0
-//   axis = -1
-// ```
-//
-// Then output is `[2 x 2 x 3]`:
-//
-//     ```output =
-//       [
-//         [1.0, 0.0, 0.0]  // one_hot(0)
-//         [0.0, 0.0, 1.0]  // one_hot(2)
-//       ][
-//         [0.0, 1.0, 0.0]  // one_hot(1)
-//         [0.0, 0.0, 0.0]  // one_hot(-1)
-//       ]```
+// value: the name by which this variable is referred to.
+// If not specified, defaults to ""
+func VarHandleOpSharedName(value string) VarHandleOpAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Creates a handle to a Variable resource.
 //
 // Arguments:
-//	indices: A tensor of indices.
-//	depth: A scalar defining the depth of the one hot dimension.
-//	on_value: A scalar defining the value to fill in output when `indices[j] = i`.
-//	off_value: A scalar defining the value to fill in output when `indices[j] != i`.
-//
-// Returns The one-hot tensor.
-func OneHot(scope *Scope, indices tf.Output, depth tf.Output, on_value tf.Output, off_value tf.Output, optional ...OneHotAttr) (output tf.Output) {
+//	dtype: the type of this variable. Must agree with the dtypes
+// of all ops using this variable.
+//	shape: The (possibly partially specified) shape of this variable.
+func VarHandleOp(scope *Scope, dtype tf.DataType, shape tf.Shape, optional ...VarHandleOpAttr) (resource tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"dtype": dtype, "shape": shape}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "OneHot",
+		Type: "VarHandleOp",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Elementwise computes the bitwise XOR of `x` and `y`.
+//
+// The result will have those bits set, that are different in `x` and `y`. The
+// computation is performed on the underlying representations of `x` and `y`.
+func BitwiseXor(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "BitwiseXor",
 		Input: []tf.Input{
-			indices, depth, on_value, off_value,
+			x, y,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Reads the value of a variable.
+// Deserialize `SparseTensor` objects.
 //
-// The tensor returned by this operation is immutable.
+// The input `serialized_sparse` must have the shape `[?, ?, ..., ?, 3]` where
+// the last dimension stores serialized `SparseTensor` objects and the other N
+// dimensions (N >= 0) correspond to a batch. The ranks of the original
+// `SparseTensor` objects must all match. When the final `SparseTensor` is
+// created, its rank is the rank of the incoming `SparseTensor` objects plus N;
+// the sparse tensors have been concatenated along new dimensions, one for each
+// batch.
 //
-// The value returned by this operation is guaranteed to be influenced by all the
-// writes on which this operation depends directly or indirectly, and to not be
-// influenced by any of the writes which depend directly or indirectly on this
-// operation.
+// The output `SparseTensor` object's shape values for the original dimensions
+// are the max across the input `SparseTensor` objects' shape values for the
+// corresponding dimensions. The new dimensions match the size of the batch.
+//
+// The input `SparseTensor` objects' indices are assumed ordered in
+// standard lexicographic order.  If this is not the case, after this
+// step run `SparseReorder` to restore index ordering.
+//
+// For example, if the serialized input is a `[2 x 3]` matrix representing two
+// original `SparseTensor` objects:
+//
+//     index = [ 0]
+//             [10]
+//             [20]
+//     values = [1, 2, 3]
+//     shape = [50]
+//
+// and
+//
+//     index = [ 2]
+//             [10]
+//     values = [4, 5]
+//     shape = [30]
+//
+// then the final deserialized `SparseTensor` will be:
+//
+//     index = [0  0]
+//             [0 10]
+//             [0 20]
+//             [1  2]
+//             [1 10]
+//     values = [1, 2, 3, 4, 5]
+//     shape = [2 50]
 //
 // Arguments:
-//	resource: handle to the resource in which to store the variable.
-//	dtype: the dtype of the value.
-func ReadVariableOp(scope *Scope, resource tf.Output, dtype tf.DataType) (value tf.Output) {
+//	serialized_sparse: The serialized `SparseTensor` objects. The last dimension
+// must have 3 columns.
+//	dtype: The `dtype` of the serialized `SparseTensor` objects.
+func DeserializeSparse(scope *Scope, serialized_sparse tf.Output, dtype tf.DataType) (sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	attrs := map[string]interface{}{"dtype": dtype}
 	opspec := tf.OpSpec{
-		Type: "ReadVariableOp",
+		Type: "DeserializeSparse",
 		Input: []tf.Input{
-			resource,
+			serialized_sparse,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// MaxPool3DGradAttr is an optional argument to MaxPool3DGrad.
-type MaxPool3DGradAttr func(optionalAttr)
+// ResourceApplyRMSPropAttr is an optional argument to ResourceApplyRMSProp.
+type ResourceApplyRMSPropAttr func(optionalAttr)
 
-// MaxPool3DGradDataFormat sets the optional data_format attribute to value.
+// ResourceApplyRMSPropUseLocking sets the optional use_locking attribute to value.
 //
-// value: The data format of the input and output data. With the
-// default format "NDHWC", the data is stored in the order of:
-//     [batch, in_depth, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCDHW", the data storage order is:
-//     [batch, in_channels, in_depth, in_height, in_width].
-// If not specified, defaults to "NDHWC"
-func MaxPool3DGradDataFormat(value string) MaxPool3DGradAttr {
+// value: If `True`, updating of the var, ms, and mom tensors is protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceApplyRMSPropUseLocking(value bool) ResourceApplyRMSPropAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["use_locking"] = value
 	}
 }
 
-// Computes gradients of max pooling function.
+// Update '*var' according to the RMSProp algorithm.
+//
+// Note that in dense implementation of this algorithm, ms and mom will
+// update even if the grad is zero, but in this sparse implementation, ms
+// and mom will not update in iterations during which the grad is zero.
+//
+// mean_square = decay * mean_square + (1-decay) * gradient ** 2
+// Delta = learning_rate * gradient / sqrt(mean_square + epsilon)
+//
+// ms <- rho * ms_{t-1} + (1-rho) * grad * grad
+// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)
+// var <- var - mom
 //
 // Arguments:
-//	orig_input: The original input tensor.
-//	orig_output: The original output tensor.
-//	grad: Output backprop of shape `[batch, depth, rows, cols, channels]`.
-//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
-// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
-//	strides: 1-D tensor of length 5. The stride of the sliding window for each
-// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
-//	padding: The type of padding algorithm to use.
-func MaxPool3DGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPool3DGradAttr) (output tf.Output) {
+//	var_: Should be from a Variable().
+//	ms: Should be from a Variable().
+//	mom: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	rho: Decay rate. Must be a scalar.
+//
+//	epsilon: Ridge term. Must be a scalar.
+//	grad: The gradient.
+//
+// Returns the created operation.
+func ResourceApplyRMSProp(scope *Scope, var_ tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyRMSPropAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MaxPool3DGrad",
+		Type: "ResourceApplyRMSProp",
 		Input: []tf.Input{
-			orig_input, orig_output, grad,
+			var_, ms, mom, lr, rho, momentum, epsilon, grad,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// SparseReduceSumAttr is an optional argument to SparseReduceSum.
-type SparseReduceSumAttr func(optionalAttr)
+// ResourceScatterNdUpdateAttr is an optional argument to ResourceScatterNdUpdate.
+type ResourceScatterNdUpdateAttr func(optionalAttr)
 
-// SparseReduceSumKeepDims sets the optional keep_dims attribute to value.
+// ResourceScatterNdUpdateUseLocking sets the optional use_locking attribute to value.
 //
-// value: If true, retain reduced dimensions with length 1.
-// If not specified, defaults to false
-func SparseReduceSumKeepDims(value bool) SparseReduceSumAttr {
+// value: An optional bool. Defaults to True. If True, the assignment will
+// be protected by a lock; otherwise the behavior is undefined,
+// but may exhibit less contention.
+// If not specified, defaults to true
+func ResourceScatterNdUpdateUseLocking(value bool) ResourceScatterNdUpdateAttr {
 	return func(m optionalAttr) {
-		m["keep_dims"] = value
+		m["use_locking"] = value
 	}
 }
 
-// Computes the sum of elements across dimensions of a SparseTensor.
+// Applies sparse `updates` to individual values or slices within a given
 //
-// This Op takes a SparseTensor and is the sparse counterpart to
-// `tf.reduce_sum()`.  In particular, this Op also returns a dense `Tensor`
-// instead of a sparse one.
+// variable according to `indices`.
 //
-// Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
-// with length 1.
+// `ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.
 //
-// If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
-// with a single element is returned.  Additionally, the axes can be negative,
-// which are interpreted according to the indexing rules in Python.
+// `indices` must be integer tensor, containing indices into `ref`.
+// It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.
+//
+// The innermost dimension of `indices` (with length `K`) corresponds to
+// indices into elements (if `K = P`) or slices (if `K < P`) along the `K`th
+// dimension of `ref`.
+//
+// `updates` is `Tensor` of rank `Q-1+P-K` with shape:
+//
+// ```
+// [d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]].
+// ```
+//
+// For example, say we want to update 4 scattered elements to a rank-1 tensor to
+// 8 elements. In Python, that update would look like this:
+//
+// ```python
+//     ref = tfe.Variable([1, 2, 3, 4, 5, 6, 7, 8])
+//     indices = tf.constant([[4], [3], [1] ,[7]])
+//     updates = tf.constant([9, 10, 11, 12])
+//     update = tf.scatter_nd_update(ref, indices, updates)
+//     with tf.Session() as sess:
+//       print sess.run(update)
+// ```
+//
+// The resulting update to ref would look like this:
+//
+//     [1, 11, 3, 10, 9, 6, 7, 12]
+//
+// See @{tf.scatter_nd} for more details about how to make updates to
+// slices.
 //
 // Arguments:
-//	input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, possibly not in canonical ordering.
-//	input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
-//	input_shape: 1-D.  Shape of the input SparseTensor.
-//	reduction_axes: 1-D.  Length-`K` vector containing the reduction axes.
+//	ref: A resource handle. Must be from a VarHandleOp.
+//	indices: A Tensor. Must be one of the following types: int32, int64.
+// A tensor of indices into ref.
+//	updates: A Tensor. Must have the same type as ref. A tensor of updated
+// values to add to ref.
 //
-// Returns `R-K`-D.  The reduced Tensor.
-func SparseReduceSum(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output, reduction_axes tf.Output, optional ...SparseReduceSumAttr) (output tf.Output) {
+// Returns the created operation.
+func ResourceScatterNdUpdate(scope *Scope, ref tf.Output, indices tf.Output, updates tf.Output, optional ...ResourceScatterNdUpdateAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -9656,176 +9583,169 @@ func SparseReduceSum(scope *Scope, input_indices tf.Output, input_values tf.Outp
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseReduceSum",
+		Type: "ResourceScatterNdUpdate",
 		Input: []tf.Input{
-			input_indices, input_values, input_shape, reduction_axes,
+			ref, indices, updates,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Returns element-wise remainder of division. This emulates C semantics in that
+// SqueezeAttr is an optional argument to Squeeze.
+type SqueezeAttr func(optionalAttr)
+
+// SqueezeAxis sets the optional axis attribute to value.
 //
-// the result here is consistent with a truncating divide. E.g. `truncate(x / y) *
-// y + truncate_mod(x, y) = x`.
+// value: If specified, only squeezes the dimensions listed. The dimension
+// index starts at 0. It is an error to squeeze a dimension that is not 1. Must
+// be in the range `[-rank(input), rank(input))`.
+// If not specified, defaults to <>
 //
-// *NOTE*: `TruncateMod` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func TruncateMod(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "TruncateMod",
-		Input: []tf.Input{
-			x, y,
-		},
+// REQUIRES: len(value) >= 0
+func SqueezeAxis(value []int64) SqueezeAttr {
+	return func(m optionalAttr) {
+		m["squeeze_dims"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Inverse 2D real-valued fast Fourier transform.
+// Removes dimensions of size 1 from the shape of a tensor.
 //
-// Computes the inverse 2-dimensional discrete Fourier transform of a real-valued
-// signal over the inner-most 2 dimensions of `input`.
+// Given a tensor `input`, this operation returns a tensor of the same type with
+// all dimensions of size 1 removed. If you don't want to remove all size 1
+// dimensions, you can remove specific size 1 dimensions by specifying
+// `axis`.
 //
-// The inner-most 2 dimensions of `input` are assumed to be the result of `RFFT2D`:
-// The inner-most dimension contains the `fft_length / 2 + 1` unique components of
-// the DFT of a real-valued signal. If `fft_length` is not provided, it is computed
-// from the size of the inner-most 2 dimensions of `input`. If the FFT length used
-// to compute `input` is odd, it should be provided since it cannot be inferred
-// properly.
+// For example:
 //
-// Along each axis `IRFFT2D` is computed on, if `fft_length` (or
-// `fft_length / 2 + 1` for the inner-most dimension) is smaller than the
-// corresponding dimension of `input`, the dimension is cropped. If it is larger,
-// the dimension is padded with zeros.
+// ```
+// # 't' is a tensor of shape [1, 2, 1, 3, 1, 1]
+// shape(squeeze(t)) ==> [2, 3]
+// ```
 //
-// Arguments:
-//	input: A complex64 tensor.
-//	fft_length: An int32 tensor of shape [2]. The FFT length for each dimension.
+// Or, to remove specific size 1 dimensions:
 //
-// Returns A float32 tensor of the same rank as `input`. The inner-most 2
-//   dimensions of `input` are replaced with the `fft_length` samples of their
-//   inverse 2D Fourier transform.
+// ```
+// # 't' is a tensor of shape [1, 2, 1, 3, 1, 1]
+// shape(squeeze(t, [2, 4])) ==> [1, 2, 3, 1]
+// ```
 //
-// @compatibility(numpy)
-// Equivalent to np.fft.irfft2
-// @end_compatibility
-func IRFFT2D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
+// Arguments:
+//	input: The `input` to squeeze.
+//
+// Returns Contains the same data as `input`, but has one or more dimensions of
+// size 1 removed.
+func Squeeze(scope *Scope, input tf.Output, optional ...SqueezeAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "IRFFT2D",
+		Type: "Squeeze",
 		Input: []tf.Input{
-			input, fft_length,
+			input,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// DecodeJpegAttr is an optional argument to DecodeJpeg.
-type DecodeJpegAttr func(optionalAttr)
+// ResourceApplyAdadeltaAttr is an optional argument to ResourceApplyAdadelta.
+type ResourceApplyAdadeltaAttr func(optionalAttr)
 
-// DecodeJpegChannels sets the optional channels attribute to value.
+// ResourceApplyAdadeltaUseLocking sets the optional use_locking attribute to value.
 //
-// value: Number of color channels for the decoded image.
-// If not specified, defaults to 0
-func DecodeJpegChannels(value int64) DecodeJpegAttr {
+// value: If True, updating of the var, accum and update_accum tensors will be protected by
+// a lock; otherwise the behavior is undefined, but may exhibit less contention.
+// If not specified, defaults to false
+func ResourceApplyAdadeltaUseLocking(value bool) ResourceApplyAdadeltaAttr {
 	return func(m optionalAttr) {
-		m["channels"] = value
+		m["use_locking"] = value
 	}
 }
 
-// DecodeJpegRatio sets the optional ratio attribute to value.
+// Update '*var' according to the adadelta scheme.
 //
-// value: Downscaling ratio.
-// If not specified, defaults to 1
-func DecodeJpegRatio(value int64) DecodeJpegAttr {
-	return func(m optionalAttr) {
-		m["ratio"] = value
-	}
-}
-
-// DecodeJpegFancyUpscaling sets the optional fancy_upscaling attribute to value.
+// accum = rho() * accum + (1 - rho()) * grad.square();
+// update = (update_accum + epsilon).sqrt() * (accum + epsilon()).rsqrt() * grad;
+// update_accum = rho() * update_accum + (1 - rho()) * update.square();
+// var -= update;
 //
-// value: If true use a slower but nicer upscaling of the
-// chroma planes (yuv420/422 only).
-// If not specified, defaults to true
-func DecodeJpegFancyUpscaling(value bool) DecodeJpegAttr {
-	return func(m optionalAttr) {
-		m["fancy_upscaling"] = value
-	}
-}
-
-// DecodeJpegTryRecoverTruncated sets the optional try_recover_truncated attribute to value.
+// Arguments:
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	accum_update: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	rho: Decay factor. Must be a scalar.
+//	epsilon: Constant factor. Must be a scalar.
+//	grad: The gradient.
 //
-// value: If true try to recover an image from truncated input.
-// If not specified, defaults to false
-func DecodeJpegTryRecoverTruncated(value bool) DecodeJpegAttr {
-	return func(m optionalAttr) {
-		m["try_recover_truncated"] = value
+// Returns the created operation.
+func ResourceApplyAdadelta(scope *Scope, var_ tf.Output, accum tf.Output, accum_update tf.Output, lr tf.Output, rho tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyAdadeltaAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// DecodeJpegAcceptableFraction sets the optional acceptable_fraction attribute to value.
-//
-// value: The minimum required fraction of lines before a truncated
-// input is accepted.
-// If not specified, defaults to 1
-func DecodeJpegAcceptableFraction(value float32) DecodeJpegAttr {
-	return func(m optionalAttr) {
-		m["acceptable_fraction"] = value
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceApplyAdadelta",
+		Input: []tf.Input{
+			var_, accum, accum_update, lr, rho, epsilon, grad,
+		},
+		Attrs: attrs,
 	}
+	return scope.AddOperation(opspec)
 }
 
-// DecodeJpegDctMethod sets the optional dct_method attribute to value.
+// NonMaxSuppressionAttr is an optional argument to NonMaxSuppression.
+type NonMaxSuppressionAttr func(optionalAttr)
+
+// NonMaxSuppressionIouThreshold sets the optional iou_threshold attribute to value.
 //
-// value: string specifying a hint about the algorithm used for
-// decompression.  Defaults to "" which maps to a system-specific
-// default.  Currently valid values are ["INTEGER_FAST",
-// "INTEGER_ACCURATE"].  The hint may be ignored (e.g., the internal
-// jpeg library changes to a version that does not have that specific
-// option.)
-// If not specified, defaults to ""
-func DecodeJpegDctMethod(value string) DecodeJpegAttr {
+// value: A float representing the threshold for deciding whether boxes
+// overlap too much with respect to IOU.
+// If not specified, defaults to 0.5
+func NonMaxSuppressionIouThreshold(value float32) NonMaxSuppressionAttr {
 	return func(m optionalAttr) {
-		m["dct_method"] = value
+		m["iou_threshold"] = value
 	}
 }
 
-// Decode a JPEG-encoded image to a uint8 tensor.
-//
-// The attr `channels` indicates the desired number of color channels for the
-// decoded image.
-//
-// Accepted values are:
-//
-// *   0: Use the number of channels in the JPEG-encoded image.
-// *   1: output a grayscale image.
-// *   3: output an RGB image.
-//
-// If needed, the JPEG-encoded image is transformed to match the requested number
-// of color channels.
-//
-// The attr `ratio` allows downscaling the image by an integer factor during
-// decoding.  Allowed values are: 1, 2, 4, and 8.  This is much faster than
-// downscaling the image later.
-//
+// Greedily selects a subset of bounding boxes in descending order of score,
 //
-// This op also supports decoding PNGs and non-animated GIFs since the interface is
-// the same, though it is cleaner to use `tf.image.decode_image`.
+// pruning away boxes that have high intersection-over-union (IOU) overlap
+// with previously selected boxes.  Bounding boxes are supplied as
+// [y1, x1, y2, x2], where (y1, x1) and (y2, x2) are the coordinates of any
+// diagonal pair of box corners and the coordinates can be provided as normalized
+// (i.e., lying in the interval [0, 1]) or absolute.  Note that this algorithm
+// is agnostic to where the origin is in the coordinate system.  Note that this
+// algorithm is invariant to orthogonal transformations and translations
+// of the coordinate system; thus translating or reflections of the coordinate
+// system result in the same boxes being selected by the algorithm.
+// The output of this operation is a set of integers indexing into the input
+// collection of bounding boxes representing the selected boxes.  The bounding
+// box coordinates corresponding to the selected indices can then be obtained
+// using the `tf.gather operation`.  For example:
+//   selected_indices = tf.image.non_max_suppression(
+//       boxes, scores, max_output_size, iou_threshold)
+//   selected_boxes = tf.gather(boxes, selected_indices)
 //
 // Arguments:
-//	contents: 0-D.  The JPEG-encoded image.
+//	boxes: A 2-D float tensor of shape `[num_boxes, 4]`.
+//	scores: A 1-D float tensor of shape `[num_boxes]` representing a single
+// score corresponding to each box (each row of boxes).
+//	max_output_size: A scalar integer tensor representing the maximum number of
+// boxes to be selected by non max suppression.
 //
-// Returns 3-D with shape `[height, width, channels]`..
-func DecodeJpeg(scope *Scope, contents tf.Output, optional ...DecodeJpegAttr) (image tf.Output) {
+// Returns A 1-D integer tensor of shape `[M]` representing the selected
+// indices from the boxes tensor, where `M <= max_output_size`.
+func NonMaxSuppression(scope *Scope, boxes tf.Output, scores tf.Output, max_output_size tf.Output, optional ...NonMaxSuppressionAttr) (selected_indices tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -9834,9 +9754,9 @@ func DecodeJpeg(scope *Scope, contents tf.Output, optional ...DecodeJpegAttr) (i
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "DecodeJpeg",
+		Type: "NonMaxSuppression",
 		Input: []tf.Input{
-			contents,
+			boxes, scores, max_output_size,
 		},
 		Attrs: attrs,
 	}
@@ -9844,430 +9764,450 @@ func DecodeJpeg(scope *Scope, contents tf.Output, optional ...DecodeJpegAttr) (i
 	return op.Output(0)
 }
 
-// Transforms a vector of brain.Example protos (as strings) into typed tensors.
-//
-// Arguments:
-//	serialized: A vector containing a batch of binary serialized Example protos.
-//	names: A vector containing the names of the serialized protos.
-// May contain, for example, table key (descriptive) names for the
-// corresponding serialized protos.  These are purely useful for debugging
-// purposes, and the presence of values here has no effect on the output.
-// May also be an empty vector if no names are available.
-// If non-empty, this vector must be the same length as "serialized".
-//	sparse_keys: A list of Nsparse string Tensors (scalars).
-// The keys expected in the Examples' features associated with sparse values.
-//	dense_keys: A list of Ndense string Tensors (scalars).
-// The keys expected in the Examples' features associated with dense values.
-//	dense_defaults: A list of Ndense Tensors (some may be empty).
-// dense_defaults[j] provides default values
-// when the example's feature_map lacks dense_key[j].  If an empty Tensor is
-// provided for dense_defaults[j], then the Feature dense_keys[j] is required.
-// The input type is inferred from dense_defaults[j], even when it's empty.
-// If dense_defaults[j] is not empty, and dense_shapes[j] is fully defined,
-// then the shape of dense_defaults[j] must match that of dense_shapes[j].
-// If dense_shapes[j] has an undefined major dimension (variable strides dense
-// feature), dense_defaults[j] must contain a single element:
-// the padding element.
-//	sparse_types: A list of Nsparse types; the data types of data in each Feature
-// given in sparse_keys.
-// Currently the ParseExample supports DT_FLOAT (FloatList),
-// DT_INT64 (Int64List), and DT_STRING (BytesList).
-//	dense_shapes: A list of Ndense shapes; the shapes of data in each Feature
-// given in dense_keys.
-// The number of elements in the Feature corresponding to dense_key[j]
-// must always equal dense_shapes[j].NumEntries().
-// If dense_shapes[j] == (D0, D1, ..., DN) then the shape of output
-// Tensor dense_values[j] will be (|serialized|, D0, D1, ..., DN):
-// The dense outputs are just the inputs row-stacked by batch.
-// This works for dense_shapes[j] = (-1, D1, ..., DN).  In this case
-// the shape of the output Tensor dense_values[j] will be
-// (|serialized|, M, D1, .., DN), where M is the maximum number of blocks
-// of elements of length D1 * .... * DN, across all minibatch entries
-// in the input.  Any minibatch entry with less than M blocks of elements of
-// length D1 * ... * DN will be padded with the corresponding default_value
-// scalar element along the second dimension.
-func ParseExample(scope *Scope, serialized tf.Output, names tf.Output, sparse_keys []tf.Output, dense_keys []tf.Output, dense_defaults []tf.Output, sparse_types []tf.DataType, dense_shapes []tf.Shape) (sparse_indices []tf.Output, sparse_values []tf.Output, sparse_shapes []tf.Output, dense_values []tf.Output) {
+// Creates a dataset that emits `components` as a tuple of tensors once.
+func TensorDataset(scope *Scope, components []tf.Output, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"sparse_types": sparse_types, "dense_shapes": dense_shapes}
+	attrs := map[string]interface{}{"output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "ParseExample",
+		Type: "TensorDataset",
 		Input: []tf.Input{
-			serialized, names, tf.OutputList(sparse_keys), tf.OutputList(dense_keys), tf.OutputList(dense_defaults),
+			tf.OutputList(components),
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if sparse_indices, idx, err = makeOutputList(op, idx, "sparse_indices"); err != nil {
-		scope.UpdateErr("ParseExample", err)
-		return
-	}
-	if sparse_values, idx, err = makeOutputList(op, idx, "sparse_values"); err != nil {
-		scope.UpdateErr("ParseExample", err)
-		return
-	}
-	if sparse_shapes, idx, err = makeOutputList(op, idx, "sparse_shapes"); err != nil {
-		scope.UpdateErr("ParseExample", err)
-		return
-	}
-	if dense_values, idx, err = makeOutputList(op, idx, "dense_values"); err != nil {
-		scope.UpdateErr("ParseExample", err)
-		return
-	}
-	return sparse_indices, sparse_values, sparse_shapes, dense_values
-}
-
-// VariableShapeAttr is an optional argument to VariableShape.
-type VariableShapeAttr func(optionalAttr)
-
-// VariableShapeOutType sets the optional out_type attribute to value.
-// If not specified, defaults to DT_INT32
-func VariableShapeOutType(value tf.DataType) VariableShapeAttr {
-	return func(m optionalAttr) {
-		m["out_type"] = value
-	}
+	return op.Output(0)
 }
 
-// Returns the shape of the variable pointed to by `resource`.
+// Component-wise multiplies a SparseTensor by a dense Tensor.
 //
-// This operation returns a 1-D integer tensor representing the shape of `input`.
+// The output locations corresponding to the implicitly zero elements in the sparse
+// tensor will be zero (i.e., will not take up storage space), regardless of the
+// contents of the dense tensor (even if it's +/-INF and that INF*0 == NaN).
 //
-// For example:
+// *Limitation*: this Op only broadcasts the dense side to the sparse side, but not
+// the other direction.
 //
-// ```
-// # 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]
-// shape(t) ==> [2, 2, 3]
-// ```
-func VariableShape(scope *Scope, input tf.Output, optional ...VariableShapeAttr) (output tf.Output) {
+// Arguments:
+//	sp_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, possibly not in canonical ordering.
+//	sp_values: 1-D.  `N` non-empty values corresponding to `sp_indices`.
+//	sp_shape: 1-D.  Shape of the input SparseTensor.
+//	dense: `R`-D.  The dense Tensor operand.
+//
+// Returns 1-D.  The `N` values that are operated on.
+func SparseDenseCwiseMul(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output, dense tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "VariableShape",
+		Type: "SparseDenseCwiseMul",
 		Input: []tf.Input{
-			input,
+			sp_indices, sp_values, sp_shape, dense,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes softmax cross entropy cost and gradients to backpropagate.
+// ResourceSparseApplyRMSPropAttr is an optional argument to ResourceSparseApplyRMSProp.
+type ResourceSparseApplyRMSPropAttr func(optionalAttr)
+
+// ResourceSparseApplyRMSPropUseLocking sets the optional use_locking attribute to value.
 //
-// Unlike `SoftmaxCrossEntropyWithLogits`, this operation does not accept
-// a matrix of label probabilities, but rather a single label per row
-// of features.  This label is considered to have probability 1.0 for the
-// given row.
+// value: If `True`, updating of the var, ms, and mom tensors is protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceSparseApplyRMSPropUseLocking(value bool) ResourceSparseApplyRMSPropAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// Update '*var' according to the RMSProp algorithm.
 //
-// Inputs are the logits, not probabilities.
+// Note that in dense implementation of this algorithm, ms and mom will
+// update even if the grad is zero, but in this sparse implementation, ms
+// and mom will not update in iterations during which the grad is zero.
+//
+// mean_square = decay * mean_square + (1-decay) * gradient ** 2
+// Delta = learning_rate * gradient / sqrt(mean_square + epsilon)
+//
+// ms <- rho * ms_{t-1} + (1-rho) * grad * grad
+// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)
+// var <- var - mom
 //
 // Arguments:
-//	features: batch_size x num_classes matrix
-//	labels: batch_size vector with values in [0, num_classes).
-// This is the label for the given minibatch entry.
+//	var_: Should be from a Variable().
+//	ms: Should be from a Variable().
+//	mom: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	rho: Decay rate. Must be a scalar.
 //
-// Returns Per example loss (batch_size vector).backpropagated gradients (batch_size x num_classes matrix).
-func SparseSoftmaxCrossEntropyWithLogits(scope *Scope, features tf.Output, labels tf.Output) (loss tf.Output, backprop tf.Output) {
+//	epsilon: Ridge term. Must be a scalar.
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var, ms and mom.
+//
+// Returns the created operation.
+func ResourceSparseApplyRMSProp(scope *Scope, var_ tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyRMSPropAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SparseSoftmaxCrossEntropyWithLogits",
+		Type: "ResourceSparseApplyRMSProp",
 		Input: []tf.Input{
-			features, labels,
+			var_, ms, mom, lr, rho, momentum, epsilon, grad, indices,
 		},
+		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return scope.AddOperation(opspec)
 }
 
-// Fast Fourier transform.
-//
-// Computes the 1-dimensional discrete Fourier transform over the inner-most
-// dimension of `input`.
-//
-// Arguments:
-//	input: A complex64 tensor.
-//
-// Returns A complex64 tensor of the same shape as `input`. The inner-most
-//   dimension of `input` is replaced with its 1D Fourier transform.
+// Returns the truth value of (x > y) element-wise.
 //
-// @compatibility(numpy)
-// Equivalent to np.fft.fft
-// @end_compatibility
-func FFT(scope *Scope, input tf.Output) (output tf.Output) {
+// *NOTE*: `Greater` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Greater(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "FFT",
+		Type: "Greater",
 		Input: []tf.Input{
-			input,
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Transforms a serialized tensorflow.TensorProto proto into a Tensor.
+// SampleDistortedBoundingBoxAttr is an optional argument to SampleDistortedBoundingBox.
+type SampleDistortedBoundingBoxAttr func(optionalAttr)
+
+// SampleDistortedBoundingBoxSeed sets the optional seed attribute to value.
 //
-// Arguments:
-//	serialized: A scalar string containing a serialized TensorProto proto.
-//	out_type: The type of the serialized tensor.  The provided type must match the
-// type of the serialized tensor and no implicit conversion will take place.
+// value: If either `seed` or `seed2` are set to non-zero, the random number
+// generator is seeded by the given `seed`.  Otherwise, it is seeded by a random
+// seed.
+// If not specified, defaults to 0
+func SampleDistortedBoundingBoxSeed(value int64) SampleDistortedBoundingBoxAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// SampleDistortedBoundingBoxSeed2 sets the optional seed2 attribute to value.
 //
-// Returns A Tensor of type `out_type`.
-func ParseTensor(scope *Scope, serialized tf.Output, out_type tf.DataType) (output tf.Output) {
-	if scope.Err() != nil {
-		return
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func SampleDistortedBoundingBoxSeed2(value int64) SampleDistortedBoundingBoxAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
 	}
-	attrs := map[string]interface{}{"out_type": out_type}
-	opspec := tf.OpSpec{
-		Type: "ParseTensor",
-		Input: []tf.Input{
-			serialized,
-		},
-		Attrs: attrs,
+}
+
+// SampleDistortedBoundingBoxMinObjectCovered sets the optional min_object_covered attribute to value.
+//
+// value: The cropped area of the image must contain at least this
+// fraction of any bounding box supplied. The value of this parameter should be
+// non-negative. In the case of 0, the cropped area does not need to overlap
+// any of the bounding boxes supplied.
+// If not specified, defaults to 0.1
+func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBoundingBoxAttr {
+	return func(m optionalAttr) {
+		m["min_object_covered"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// MaxPoolWithArgmaxAttr is an optional argument to MaxPoolWithArgmax.
-type MaxPoolWithArgmaxAttr func(optionalAttr)
+// SampleDistortedBoundingBoxAspectRatioRange sets the optional aspect_ratio_range attribute to value.
+//
+// value: The cropped area of the image must have an aspect ratio =
+// width / height within this range.
+// If not specified, defaults to <f:0.75 f:1.33 >
+func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr {
+	return func(m optionalAttr) {
+		m["aspect_ratio_range"] = value
+	}
+}
 
-// MaxPoolWithArgmaxTargmax sets the optional Targmax attribute to value.
-// If not specified, defaults to DT_INT64
-func MaxPoolWithArgmaxTargmax(value tf.DataType) MaxPoolWithArgmaxAttr {
+// SampleDistortedBoundingBoxAreaRange sets the optional area_range attribute to value.
+//
+// value: The cropped area of the image must contain a fraction of the
+// supplied image within in this range.
+// If not specified, defaults to <f:0.05 f:1 >
+func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr {
 	return func(m optionalAttr) {
-		m["Targmax"] = value
+		m["area_range"] = value
 	}
 }
 
-// Performs max pooling on the input and outputs both max values and indices.
+// SampleDistortedBoundingBoxMaxAttempts sets the optional max_attempts attribute to value.
 //
-// The indices in `argmax` are flattened, so that a maximum value at position
-// `[b, y, x, c]` becomes flattened index
-// `((b * height + y) * width + x) * channels + c`.
+// value: Number of attempts at generating a cropped region of the image
+// of the specified constraints. After `max_attempts` failures, return the entire
+// image.
+// If not specified, defaults to 100
+func SampleDistortedBoundingBoxMaxAttempts(value int64) SampleDistortedBoundingBoxAttr {
+	return func(m optionalAttr) {
+		m["max_attempts"] = value
+	}
+}
+
+// SampleDistortedBoundingBoxUseImageIfNoBoundingBoxes sets the optional use_image_if_no_bounding_boxes attribute to value.
 //
-// The indices returned are always in `[0, height) x [0, width)` before flattening,
-// even if padding is involved and the mathematically correct answer is outside
-// (either negative or too large).  This is a bug, but fixing it is difficult to do
-// in a safe backwards compatible way, especially due to flattening.
+// value: Controls behavior if no bounding boxes supplied.
+// If true, assume an implicit bounding box covering the whole input. If false,
+// raise an error.
+// If not specified, defaults to false
+func SampleDistortedBoundingBoxUseImageIfNoBoundingBoxes(value bool) SampleDistortedBoundingBoxAttr {
+	return func(m optionalAttr) {
+		m["use_image_if_no_bounding_boxes"] = value
+	}
+}
+
+// Generate a single randomly distorted bounding box for an image.
+//
+// Bounding box annotations are often supplied in addition to ground-truth labels
+// in image recognition or object localization tasks. A common technique for
+// training such a system is to randomly distort an image while preserving
+// its content, i.e. *data augmentation*. This Op outputs a randomly distorted
+// localization of an object, i.e. bounding box, given an `image_size`,
+// `bounding_boxes` and a series of constraints.
+//
+// The output of this Op is a single bounding box that may be used to crop the
+// original image. The output is returned as 3 tensors: `begin`, `size` and
+// `bboxes`. The first 2 tensors can be fed directly into `tf.slice` to crop the
+// image. The latter may be supplied to `tf.image.draw_bounding_boxes` to visualize
+// what the bounding box looks like.
+//
+// Bounding boxes are supplied and returned as `[y_min, x_min, y_max, x_max]`. The
+// bounding box coordinates are floats in `[0.0, 1.0]` relative to the width and
+// height of the underlying image.
+//
+// For example,
+//
+// ```python
+//     # Generate a single distorted bounding box.
+//     begin, size, bbox_for_draw = tf.image.sample_distorted_bounding_box(
+//         tf.shape(image),
+//         bounding_boxes=bounding_boxes)
+//
+//     # Draw the bounding box in an image summary.
+//     image_with_box = tf.image.draw_bounding_boxes(tf.expand_dims(image, 0),
+//                                                   bbox_for_draw)
+//     tf.summary.image('images_with_box', image_with_box)
+//
+//     # Employ the bounding box to distort the image.
+//     distorted_image = tf.slice(image, begin, size)
+// ```
+//
+// Note that if no bounding box information is available, setting
+// `use_image_if_no_bounding_boxes = true` will assume there is a single implicit
+// bounding box covering the whole image. If `use_image_if_no_bounding_boxes` is
+// false and no bounding boxes are supplied, an error is raised.
 //
 // Arguments:
-//	input: 4-D with shape `[batch, height, width, channels]`.  Input to pool over.
-//	ksize: The size of the window for each dimension of the input tensor.
-//	strides: The stride of the sliding window for each dimension of the
-// input tensor.
-//	padding: The type of padding algorithm to use.
+//	image_size: 1-D, containing `[height, width, channels]`.
+//	bounding_boxes: 3-D with shape `[batch, N, 4]` describing the N bounding boxes
+// associated with the image.
 //
-// Returns The max pooled output tensor.4-D.  The flattened indices of the max values chosen for each output.
-func MaxPoolWithArgmax(scope *Scope, input tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPoolWithArgmaxAttr) (output tf.Output, argmax tf.Output) {
+// Returns 1-D, containing `[offset_height, offset_width, 0]`. Provide as input to
+// `tf.slice`.1-D, containing `[target_height, target_width, -1]`. Provide as input to
+// `tf.slice`.3-D with shape `[1, 1, 4]` containing the distorted bounding box.
+// Provide as input to `tf.image.draw_bounding_boxes`.
+func SampleDistortedBoundingBox(scope *Scope, image_size tf.Output, bounding_boxes tf.Output, optional ...SampleDistortedBoundingBoxAttr) (begin tf.Output, size tf.Output, bboxes tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MaxPoolWithArgmax",
+		Type: "SampleDistortedBoundingBox",
+		Input: []tf.Input{
+			image_size, bounding_boxes,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// Converts each string in the input Tensor to its hash mod by a number of buckets.
+//
+// The hash function is deterministic on the content of the string within the
+// process and will never change. However, it is not suitable for cryptography.
+// This function may be used when CPU time is scarce and inputs are trusted or
+// unimportant. There is a risk of adversaries constructing inputs that all hash
+// to the same bucket. To prevent this problem, use a strong hash function with
+// `tf.string_to_hash_bucket_strong`.
+//
+// Arguments:
+//	input: The strings to assign a hash bucket.
+//	num_buckets: The number of buckets.
+//
+// Returns A Tensor of the same shape as the input `string_tensor`.
+func StringToHashBucketFast(scope *Scope, input tf.Output, num_buckets int64) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"num_buckets": num_buckets}
+	opspec := tf.OpSpec{
+		Type: "StringToHashBucketFast",
 		Input: []tf.Input{
 			input,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// ResourceSparseApplyAdagradDAAttr is an optional argument to ResourceSparseApplyAdagradDA.
-type ResourceSparseApplyAdagradDAAttr func(optionalAttr)
+// TensorArrayGatherV3Attr is an optional argument to TensorArrayGatherV3.
+type TensorArrayGatherV3Attr func(optionalAttr)
 
-// ResourceSparseApplyAdagradDAUseLocking sets the optional use_locking attribute to value.
+// TensorArrayGatherV3ElementShape sets the optional element_shape attribute to value.
 //
-// value: If True, updating of the var and accum tensors will be protected by
-// a lock; otherwise the behavior is undefined, but may exhibit less contention.
-// If not specified, defaults to false
-func ResourceSparseApplyAdagradDAUseLocking(value bool) ResourceSparseApplyAdagradDAAttr {
+// value: The expected shape of an element, if known. Used to
+// validate the shapes of TensorArray elements. If this shape is not
+// fully specified, gathering zero-size TensorArrays is an error.
+// If not specified, defaults to <unknown_rank:true >
+func TensorArrayGatherV3ElementShape(value tf.Shape) TensorArrayGatherV3Attr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["element_shape"] = value
 	}
 }
 
-// Update entries in '*var' and '*accum' according to the proximal adagrad scheme.
+// Gather specific elements from the TensorArray into output `value`.
+//
+// All elements selected by `indices` must have the same shape.
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	gradient_accumulator: Should be from a Variable().
-//	gradient_squared_accumulator: Should be from a Variable().
-//	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var and accum.
-//	lr: Learning rate. Must be a scalar.
-//	l1: L1 regularization. Must be a scalar.
-//	l2: L2 regularization. Must be a scalar.
-//	global_step: Training step number. Must be a scalar.
+//	handle: The handle to a TensorArray.
+//	indices: The locations in the TensorArray from which to read tensor elements.
+//	flow_in: A float scalar that enforces proper chaining of operations.
+//	dtype: The type of the elem that is returned.
 //
-// Returns the created operation.
-func ResourceSparseApplyAdagradDA(scope *Scope, var_ tf.Output, gradient_accumulator tf.Output, gradient_squared_accumulator tf.Output, grad tf.Output, indices tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, global_step tf.Output, optional ...ResourceSparseApplyAdagradDAAttr) (o *tf.Operation) {
+// Returns All of the elements in the TensorArray, concatenated along a new
+// axis (the new dimension 0).
+func TensorArrayGatherV3(scope *Scope, handle tf.Output, indices tf.Output, flow_in tf.Output, dtype tf.DataType, optional ...TensorArrayGatherV3Attr) (value tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"dtype": dtype}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyAdagradDA",
+		Type: "TensorArrayGatherV3",
 		Input: []tf.Input{
-			var_, gradient_accumulator, gradient_squared_accumulator, grad, indices, lr, l1, l2, global_step,
+			handle, indices, flow_in,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// EncodeJpegAttr is an optional argument to EncodeJpeg.
-type EncodeJpegAttr func(optionalAttr)
-
-// EncodeJpegFormat sets the optional format attribute to value.
+// Returns x / y element-wise for integer types.
 //
-// value: Per pixel image format.
-// If not specified, defaults to ""
-func EncodeJpegFormat(value string) EncodeJpegAttr {
-	return func(m optionalAttr) {
-		m["format"] = value
-	}
-}
-
-// EncodeJpegQuality sets the optional quality attribute to value.
+// Truncation designates that negative numbers will round fractional quantities
+// toward zero. I.e. -7 / 5 = -1. This matches C semantics but it is different
+// than Python semantics. See `FloorDiv` for a division function that matches
+// Python Semantics.
 //
-// value: Quality of the compression from 0 to 100 (higher is better and slower).
-// If not specified, defaults to 95
-func EncodeJpegQuality(value int64) EncodeJpegAttr {
-	return func(m optionalAttr) {
-		m["quality"] = value
+// *NOTE*: `TruncateDiv` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func TruncateDiv(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// EncodeJpegProgressive sets the optional progressive attribute to value.
-//
-// value: If True, create a JPEG that loads progressively (coarse to fine).
-// If not specified, defaults to false
-func EncodeJpegProgressive(value bool) EncodeJpegAttr {
-	return func(m optionalAttr) {
-		m["progressive"] = value
+	opspec := tf.OpSpec{
+		Type: "TruncateDiv",
+		Input: []tf.Input{
+			x, y,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// EncodeJpegOptimizeSize sets the optional optimize_size attribute to value.
+// Restores tensors from a V2 checkpoint.
 //
-// value: If True, spend CPU/RAM to reduce size with no quality change.
-// If not specified, defaults to false
-func EncodeJpegOptimizeSize(value bool) EncodeJpegAttr {
-	return func(m optionalAttr) {
-		m["optimize_size"] = value
-	}
-}
-
-// EncodeJpegChromaDownsampling sets the optional chroma_downsampling attribute to value.
+// For backward compatibility with the V1 format, this Op currently allows
+// restoring from a V1 checkpoint as well:
+//   - This Op first attempts to find the V2 index file pointed to by "prefix", and
+//     if found proceed to read it as a V2 checkpoint;
+//   - Otherwise the V1 read path is invoked.
+// Relying on this behavior is not recommended, as the ability to fall back to read
+// V1 might be deprecated and eventually removed.
 //
-// value: See http://en.wikipedia.org/wiki/Chroma_subsampling.
-// If not specified, defaults to true
-func EncodeJpegChromaDownsampling(value bool) EncodeJpegAttr {
-	return func(m optionalAttr) {
-		m["chroma_downsampling"] = value
-	}
-}
-
-// EncodeJpegDensityUnit sets the optional density_unit attribute to value.
+// By default, restores the named tensors in full.  If the caller wishes to restore
+// specific slices of stored tensors, "shape_and_slices" should be non-empty
+// strings and correspondingly well-formed.
 //
-// value: Unit used to specify `x_density` and `y_density`:
-// pixels per inch (`'in'`) or centimeter (`'cm'`).
-// If not specified, defaults to "in"
-func EncodeJpegDensityUnit(value string) EncodeJpegAttr {
-	return func(m optionalAttr) {
-		m["density_unit"] = value
-	}
-}
-
-// EncodeJpegXDensity sets the optional x_density attribute to value.
+// Callers must ensure all the named tensors are indeed stored in the checkpoint.
 //
-// value: Horizontal pixels per density unit.
-// If not specified, defaults to 300
-func EncodeJpegXDensity(value int64) EncodeJpegAttr {
-	return func(m optionalAttr) {
-		m["x_density"] = value
-	}
-}
-
-// EncodeJpegYDensity sets the optional y_density attribute to value.
+// Arguments:
+//	prefix: Must have a single element.  The prefix of a V2 checkpoint.
+//	tensor_names: shape {N}.  The names of the tensors to be restored.
+//	shape_and_slices: shape {N}.  The slice specs of the tensors to be restored.
+// Empty strings indicate that they are non-partitioned tensors.
+//	dtypes: shape {N}.  The list of expected dtype for the tensors.  Must match
+// those stored in the checkpoint.
 //
-// value: Vertical pixels per density unit.
-// If not specified, defaults to 300
-func EncodeJpegYDensity(value int64) EncodeJpegAttr {
-	return func(m optionalAttr) {
-		m["y_density"] = value
+// Returns shape {N}.  The restored tensors, whose shapes are read from the
+// checkpoint directly.
+func RestoreV2(scope *Scope, prefix tf.Output, tensor_names tf.Output, shape_and_slices tf.Output, dtypes []tf.DataType) (tensors []tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// EncodeJpegXmpMetadata sets the optional xmp_metadata attribute to value.
-//
-// value: If not empty, embed this XMP metadata in the image header.
-// If not specified, defaults to ""
-func EncodeJpegXmpMetadata(value string) EncodeJpegAttr {
-	return func(m optionalAttr) {
-		m["xmp_metadata"] = value
+	attrs := map[string]interface{}{"dtypes": dtypes}
+	opspec := tf.OpSpec{
+		Type: "RestoreV2",
+		Input: []tf.Input{
+			prefix, tensor_names, shape_and_slices,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if tensors, idx, err = makeOutputList(op, idx, "tensors"); err != nil {
+		scope.UpdateErr("RestoreV2", err)
+		return
 	}
+	return tensors
 }
 
-// JPEG-encode an image.
-//
-// `image` is a 3-D uint8 Tensor of shape `[height, width, channels]`.
-//
-// The attr `format` can be used to override the color format of the encoded
-// output.  Values can be:
-//
-// *   `''`: Use a default format based on the number of channels in the image.
-// *   `grayscale`: Output a grayscale JPEG image.  The `channels` dimension
-//     of `image` must be 1.
-// *   `rgb`: Output an RGB JPEG image. The `channels` dimension
-//     of `image` must be 3.
+// Creates a dataset that skips `count` elements from the `input_dataset`.
 //
-// If `format` is not specified or is the empty string, a default format is picked
-// in function of the number of channels in `image`:
+// Arguments:
 //
-// *   1: Output a grayscale image.
-// *   3: Output an RGB image.
+//	count: A scalar representing the number of elements from the `input_dataset`
+// that should be skipped.  If count is -1, skips everything.
 //
-// Arguments:
-//	image: 3-D with shape `[height, width, channels]`.
 //
-// Returns 0-D. JPEG-encoded image.
-func EncodeJpeg(scope *Scope, image tf.Output, optional ...EncodeJpegAttr) (contents tf.Output) {
+func SkipDataset(scope *Scope, input_dataset tf.Output, count tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "EncodeJpeg",
+		Type: "SkipDataset",
 		Input: []tf.Input{
-			image,
+			input_dataset, count,
 		},
 		Attrs: attrs,
 	}
@@ -10275,73 +10215,49 @@ func EncodeJpeg(scope *Scope, image tf.Output, optional ...EncodeJpegAttr) (cont
 	return op.Output(0)
 }
 
-// MultinomialAttr is an optional argument to Multinomial.
-type MultinomialAttr func(optionalAttr)
-
-// MultinomialSeed sets the optional seed attribute to value.
+// Computes the maximum along segments of a tensor.
 //
-// value: If either seed or seed2 is set to be non-zero, the internal random number
-// generator is seeded by the given seed.  Otherwise, a random seed is used.
-// If not specified, defaults to 0
-func MultinomialSeed(value int64) MultinomialAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
-
-// MultinomialSeed2 sets the optional seed2 attribute to value.
+// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+// segments.
 //
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func MultinomialSeed2(value int64) MultinomialAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
-	}
-}
-
-// MultinomialOutputDtype sets the optional output_dtype attribute to value.
-// If not specified, defaults to DT_INT64
-func MultinomialOutputDtype(value tf.DataType) MultinomialAttr {
-	return func(m optionalAttr) {
-		m["output_dtype"] = value
-	}
-}
-
-// Draws samples from a multinomial distribution.
+// Computes a tensor such that
+// \\(output_i = \max_j(data_j)\\) where `max` is over `j` such
+// that `segment_ids[j] == i`.
+//
+// If the max is empty for a given segment ID `i`, `output[i] = 0`.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentMax.png" alt>
+// </div>
 //
 // Arguments:
-//	logits: 2-D Tensor with shape `[batch_size, num_classes]`.  Each slice `[i, :]`
-// represents the unnormalized log probabilities for all classes.
-//	num_samples: 0-D.  Number of independent samples to draw for each row slice.
 //
-// Returns 2-D Tensor with shape `[batch_size, num_samples]`.  Each slice `[i, :]`
-// contains the drawn class labels with range `[0, num_classes)`.
-func Multinomial(scope *Scope, logits tf.Output, num_samples tf.Output, optional ...MultinomialAttr) (output tf.Output) {
+//	segment_ids: A 1-D tensor whose rank is equal to the rank of `data`'s
+// first dimension.  Values should be sorted and can be repeated.
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SegmentMax(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "Multinomial",
+		Type: "SegmentMax",
 		Input: []tf.Input{
-			logits, num_samples,
+			data, segment_ids,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns the truth value of NOT x element-wise.
-func LogicalNot(scope *Scope, x tf.Output) (y tf.Output) {
+// Computes hyperbolic tangent of `x` element-wise.
+func Tanh(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "LogicalNot",
+		Type: "Tanh",
 		Input: []tf.Input{
 			x,
 		},
@@ -10350,143 +10266,104 @@ func LogicalNot(scope *Scope, x tf.Output) (y tf.Output) {
 	return op.Output(0)
 }
 
-// 3D real-valued fast Fourier transform.
-//
-// Computes the 3-dimensional discrete Fourier transform of a real-valued signal
-// over the inner-most 3 dimensions of `input`.
-//
-// Since the DFT of a real signal is Hermitian-symmetric, `RFFT3D` only returns the
-// `fft_length / 2 + 1` unique components of the FFT for the inner-most dimension
-// of `output`: the zero-frequency term, followed by the `fft_length / 2`
-// positive-frequency terms.
+// Decode web-safe base64-encoded strings.
 //
-// Along each axis `RFFT3D` is computed on, if `fft_length` is smaller than the
-// corresponding dimension of `input`, the dimension is cropped. If it is larger,
-// the dimension is padded with zeros.
+// Input may or may not have padding at the end. See EncodeBase64 for padding.
+// Web-safe means that input must use - and _ instead of + and /.
 //
 // Arguments:
-//	input: A float32 tensor.
-//	fft_length: An int32 tensor of shape [3]. The FFT length for each dimension.
-//
-// Returns A complex64 tensor of the same rank as `input`. The inner-most 3
-//   dimensions of `input` are replaced with the their 3D Fourier transform. The
-//   inner-most dimension contains `fft_length / 2 + 1` unique frequency
-//   components.
+//	input: Base64 strings to decode.
 //
-// @compatibility(numpy)
-// Equivalent to np.fft.rfftn with 3 dimensions.
-// @end_compatibility
-func RFFT3D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
+// Returns Decoded strings.
+func DecodeBase64(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "RFFT3D",
+		Type: "DecodeBase64",
 		Input: []tf.Input{
-			input, fft_length,
+			input,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// TensorArrayV3Attr is an optional argument to TensorArrayV3.
-type TensorArrayV3Attr func(optionalAttr)
-
-// TensorArrayV3ElementShape sets the optional element_shape attribute to value.
+// Store the input tensor in the state of the current session.
 //
-// value: The expected shape of an element, if known. Used to
-// validate the shapes of TensorArray elements. If this shape is not
-// fully specified, gathering zero-size TensorArrays is an error.
-// If not specified, defaults to <unknown_rank:true >
-func TensorArrayV3ElementShape(value tf.Shape) TensorArrayV3Attr {
-	return func(m optionalAttr) {
-		m["element_shape"] = value
-	}
-}
-
-// TensorArrayV3DynamicSize sets the optional dynamic_size attribute to value.
+// Arguments:
+//	value: The tensor to be stored.
 //
-// value: A boolean that determines whether writes to the TensorArray
-// are allowed to grow the size.  By default, this is not allowed.
-// If not specified, defaults to false
-func TensorArrayV3DynamicSize(value bool) TensorArrayV3Attr {
-	return func(m optionalAttr) {
-		m["dynamic_size"] = value
+// Returns The handle for the tensor stored in the session state, represented
+// as a string.
+func GetSessionHandle(scope *Scope, value tf.Output) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// TensorArrayV3ClearAfterRead sets the optional clear_after_read attribute to value.
-//
-// value: If true (default), Tensors in the TensorArray are cleared
-// after being read.  This disables multiple read semantics but allows early
-// release of memory.
-// If not specified, defaults to true
-func TensorArrayV3ClearAfterRead(value bool) TensorArrayV3Attr {
-	return func(m optionalAttr) {
-		m["clear_after_read"] = value
+	opspec := tf.OpSpec{
+		Type: "GetSessionHandle",
+		Input: []tf.Input{
+			value,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// TensorArrayV3IdenticalElementShapes sets the optional identical_element_shapes attribute to value.
-//
-// value: If true (default is false), then all
-// elements in the TensorArray will be expected to have have identical shapes.
-// This allows certain behaviors, like dynamically checking for
-// consistent shapes on write, and being able to fill in properly
-// shaped zero tensors on stack -- even if the element_shape attribute
-// is not fully defined.
-// If not specified, defaults to false
-func TensorArrayV3IdenticalElementShapes(value bool) TensorArrayV3Attr {
-	return func(m optionalAttr) {
-		m["identical_element_shapes"] = value
-	}
-}
+// ResourceSparseApplyProximalAdagradAttr is an optional argument to ResourceSparseApplyProximalAdagrad.
+type ResourceSparseApplyProximalAdagradAttr func(optionalAttr)
 
-// TensorArrayV3TensorArrayName sets the optional tensor_array_name attribute to value.
+// ResourceSparseApplyProximalAdagradUseLocking sets the optional use_locking attribute to value.
 //
-// value: Overrides the name used for the temporary tensor_array
-// resource. Default value is the name of the 'TensorArray' op (which
-// is guaranteed unique).
-// If not specified, defaults to ""
-func TensorArrayV3TensorArrayName(value string) TensorArrayV3Attr {
+// value: If True, updating of the var and accum tensors will be protected by
+// a lock; otherwise the behavior is undefined, but may exhibit less contention.
+// If not specified, defaults to false
+func ResourceSparseApplyProximalAdagradUseLocking(value bool) ResourceSparseApplyProximalAdagradAttr {
 	return func(m optionalAttr) {
-		m["tensor_array_name"] = value
+		m["use_locking"] = value
 	}
 }
 
-// An array of Tensors of given size.
+// Sparse update entries in '*var' and '*accum' according to FOBOS algorithm.
 //
-// Write data via Write and read via Read or Pack.
+// That is for rows we have grad for, we update var and accum as follows:
+// accum += grad * grad
+// prox_v = var
+// prox_v -= lr * grad * (1 / sqrt(accum))
+// var = sign(prox_v)/(1+lr*l2) * max{|prox_v|-lr*l1,0}
 //
 // Arguments:
-//	size: The size of the array.
-//	dtype: The type of the elements on the tensor_array.
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	lr: Learning rate. Must be a scalar.
+//	l1: L1 regularization. Must be a scalar.
+//	l2: L2 regularization. Must be a scalar.
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var and accum.
 //
-// Returns The handle to the TensorArray.A scalar used to control gradient flow.
-func TensorArrayV3(scope *Scope, size tf.Output, dtype tf.DataType, optional ...TensorArrayV3Attr) (handle tf.Output, flow tf.Output) {
+// Returns the created operation.
+func ResourceSparseApplyProximalAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyProximalAdagradAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayV3",
+		Type: "ResourceSparseApplyProximalAdagrad",
 		Input: []tf.Input{
-			size,
+			var_, accum, lr, l1, l2, grad, indices,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return scope.AddOperation(opspec)
 }
 
-// MaxPool3DAttr is an optional argument to MaxPool3D.
-type MaxPool3DAttr func(optionalAttr)
+// MaxPool3DGradAttr is an optional argument to MaxPool3DGrad.
+type MaxPool3DGradAttr func(optionalAttr)
 
-// MaxPool3DDataFormat sets the optional data_format attribute to value.
+// MaxPool3DGradDataFormat sets the optional data_format attribute to value.
 //
 // value: The data format of the input and output data. With the
 // default format "NDHWC", the data is stored in the order of:
@@ -10494,24 +10371,24 @@ type MaxPool3DAttr func(optionalAttr)
 // Alternatively, the format could be "NCDHW", the data storage order is:
 //     [batch, in_channels, in_depth, in_height, in_width].
 // If not specified, defaults to "NDHWC"
-func MaxPool3DDataFormat(value string) MaxPool3DAttr {
+func MaxPool3DGradDataFormat(value string) MaxPool3DGradAttr {
 	return func(m optionalAttr) {
 		m["data_format"] = value
 	}
 }
 
-// Performs 3D max pooling on the input.
+// Computes gradients of max pooling function.
 //
 // Arguments:
-//	input: Shape `[batch, depth, rows, cols, channels]` tensor to pool over.
+//	orig_input: The original input tensor.
+//	orig_output: The original output tensor.
+//	grad: Output backprop of shape `[batch, depth, rows, cols, channels]`.
 //	ksize: 1-D tensor of length 5. The size of the window for each dimension of
 // the input tensor. Must have `ksize[0] = ksize[4] = 1`.
 //	strides: 1-D tensor of length 5. The stride of the sliding window for each
 // dimension of `input`. Must have `strides[0] = strides[4] = 1`.
 //	padding: The type of padding algorithm to use.
-//
-// Returns The max pooled output tensor.
-func MaxPool3D(scope *Scope, input tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPool3DAttr) (output tf.Output) {
+func MaxPool3DGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPool3DGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -10520,38 +10397,9 @@ func MaxPool3D(scope *Scope, input tf.Output, ksize []int64, strides []int64, pa
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MaxPool3D",
-		Input: []tf.Input{
-			input,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes the gradients of 3-D convolution with respect to the input.
-//
-// DEPRECATED at GraphDef version 10: Use Conv3DBackpropInputV2
-//
-// Arguments:
-//	input: Shape `[batch, depth, rows, cols, in_channels]`.
-//	filter: Shape `[depth, rows, cols, in_channels, out_channels]`.
-// `in_channels` must match between `input` and `filter`.
-//	out_backprop: Backprop signal of shape `[batch, out_depth, out_rows, out_cols,
-// out_channels]`.
-//	strides: 1-D tensor of length 5. The stride of the sliding window for each
-// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
-//	padding: The type of padding algorithm to use.
-func Conv3DBackpropInput(scope *Scope, input tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
-	opspec := tf.OpSpec{
-		Type: "Conv3DBackpropInput",
+		Type: "MaxPool3DGrad",
 		Input: []tf.Input{
-			input, filter, out_backprop,
+			orig_input, orig_output, grad,
 		},
 		Attrs: attrs,
 	}
@@ -10559,36 +10407,43 @@ func Conv3DBackpropInput(scope *Scope, input tf.Output, filter tf.Output, out_ba
 	return op.Output(0)
 }
 
-// ResourceApplyProximalAdagradAttr is an optional argument to ResourceApplyProximalAdagrad.
-type ResourceApplyProximalAdagradAttr func(optionalAttr)
+// SparseReduceSumAttr is an optional argument to SparseReduceSum.
+type SparseReduceSumAttr func(optionalAttr)
 
-// ResourceApplyProximalAdagradUseLocking sets the optional use_locking attribute to value.
+// SparseReduceSumKeepDims sets the optional keep_dims attribute to value.
 //
-// value: If True, updating of the var and accum tensors will be protected by
-// a lock; otherwise the behavior is undefined, but may exhibit less contention.
+// value: If true, retain reduced dimensions with length 1.
 // If not specified, defaults to false
-func ResourceApplyProximalAdagradUseLocking(value bool) ResourceApplyProximalAdagradAttr {
+func SparseReduceSumKeepDims(value bool) SparseReduceSumAttr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["keep_dims"] = value
 	}
 }
 
-// Update '*var' and '*accum' according to FOBOS with Adagrad learning rate.
+// Computes the sum of elements across dimensions of a SparseTensor.
 //
-// accum += grad * grad
-// prox_v = var - lr * grad * (1 / sqrt(accum))
-// var = sign(prox_v)/(1+lr*l2) * max{|prox_v|-lr*l1,0}
+// This Op takes a SparseTensor and is the sparse counterpart to
+// `tf.reduce_sum()`.  In particular, this Op also returns a dense `Tensor`
+// instead of a sparse one.
+//
+// Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
+// with length 1.
+//
+// If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
+// with a single element is returned.  Additionally, the axes can be negative,
+// which are interpreted according to the indexing rules in Python.
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	l1: L1 regularization. Must be a scalar.
-//	l2: L2 regularization. Must be a scalar.
-//	grad: The gradient.
+//	input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, possibly not in canonical ordering.
+//	input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
+//	input_shape: 1-D.  Shape of the input SparseTensor.
+//	reduction_axes: 1-D.  Length-`K` vector containing the reduction axes.
 //
-// Returns the created operation.
-func ResourceApplyProximalAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, grad tf.Output, optional ...ResourceApplyProximalAdagradAttr) (o *tf.Operation) {
+// Returns `R-K`-D.  The reduced Tensor.
+func SparseReduceSum(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output, reduction_axes tf.Output, optional ...SparseReduceSumAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -10597,252 +10452,187 @@ func ResourceApplyProximalAdagrad(scope *Scope, var_ tf.Output, accum tf.Output,
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyProximalAdagrad",
+		Type: "SparseReduceSum",
 		Input: []tf.Input{
-			var_, accum, lr, l1, l2, grad,
+			input_indices, input_values, input_shape, reduction_axes,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// MutableHashTableOfTensorsV2Attr is an optional argument to MutableHashTableOfTensorsV2.
-type MutableHashTableOfTensorsV2Attr func(optionalAttr)
-
-// MutableHashTableOfTensorsV2Container sets the optional container attribute to value.
+// Returns element-wise remainder of division. This emulates C semantics in that
 //
-// value: If non-empty, this table is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func MutableHashTableOfTensorsV2Container(value string) MutableHashTableOfTensorsV2Attr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// MutableHashTableOfTensorsV2SharedName sets the optional shared_name attribute to value.
+// the result here is consistent with a truncating divide. E.g. `truncate(x / y) *
+// y + truncate_mod(x, y) = x`.
 //
-// value: If non-empty, this table is shared under the given name across
-// multiple sessions.
-// If not specified, defaults to ""
-func MutableHashTableOfTensorsV2SharedName(value string) MutableHashTableOfTensorsV2Attr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
+// *NOTE*: `TruncateMod` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func TruncateMod(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// MutableHashTableOfTensorsV2UseNodeNameSharing sets the optional use_node_name_sharing attribute to value.
-// If not specified, defaults to false
-func MutableHashTableOfTensorsV2UseNodeNameSharing(value bool) MutableHashTableOfTensorsV2Attr {
-	return func(m optionalAttr) {
-		m["use_node_name_sharing"] = value
+	opspec := tf.OpSpec{
+		Type: "TruncateMod",
+		Input: []tf.Input{
+			x, y,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// MutableHashTableOfTensorsV2ValueShape sets the optional value_shape attribute to value.
-// If not specified, defaults to <>
-func MutableHashTableOfTensorsV2ValueShape(value tf.Shape) MutableHashTableOfTensorsV2Attr {
-	return func(m optionalAttr) {
-		m["value_shape"] = value
-	}
-}
-
-// Creates an empty hash table.
-//
-// This op creates a mutable hash table, specifying the type of its keys and
-// values. Each value must be a vector. Data can be inserted into the table using
-// the insert operations. It does not support the initialization operation.
+// Inverse 2D real-valued fast Fourier transform.
 //
-// Arguments:
-//	key_dtype: Type of the table keys.
-//	value_dtype: Type of the table values.
+// Computes the inverse 2-dimensional discrete Fourier transform of a real-valued
+// signal over the inner-most 2 dimensions of `input`.
 //
-// Returns Handle to a table.
-func MutableHashTableOfTensorsV2(scope *Scope, key_dtype tf.DataType, value_dtype tf.DataType, optional ...MutableHashTableOfTensorsV2Attr) (table_handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"key_dtype": key_dtype, "value_dtype": value_dtype}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "MutableHashTableOfTensorsV2",
-
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Inverse 2D fast Fourier transform.
+// The inner-most 2 dimensions of `input` are assumed to be the result of `RFFT2D`:
+// The inner-most dimension contains the `fft_length / 2 + 1` unique components of
+// the DFT of a real-valued signal. If `fft_length` is not provided, it is computed
+// from the size of the inner-most 2 dimensions of `input`. If the FFT length used
+// to compute `input` is odd, it should be provided since it cannot be inferred
+// properly.
 //
-// Computes the inverse 2-dimensional discrete Fourier transform over the
-// inner-most 2 dimensions of `input`.
+// Along each axis `IRFFT2D` is computed on, if `fft_length` (or
+// `fft_length / 2 + 1` for the inner-most dimension) is smaller than the
+// corresponding dimension of `input`, the dimension is cropped. If it is larger,
+// the dimension is padded with zeros.
 //
 // Arguments:
 //	input: A complex64 tensor.
+//	fft_length: An int32 tensor of shape [2]. The FFT length for each dimension.
 //
-// Returns A complex64 tensor of the same shape as `input`. The inner-most 2
-//   dimensions of `input` are replaced with their inverse 2D Fourier transform.
+// Returns A float32 tensor of the same rank as `input`. The inner-most 2
+//   dimensions of `input` are replaced with the `fft_length` samples of their
+//   inverse 2D Fourier transform.
 //
 // @compatibility(numpy)
-// Equivalent to np.fft.ifft2
+// Equivalent to np.fft.irfft2
 // @end_compatibility
-func IFFT2D(scope *Scope, input tf.Output) (output tf.Output) {
+func IRFFT2D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "IFFT2D",
+		Type: "IRFFT2D",
 		Input: []tf.Input{
-			input,
+			input, fft_length,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Creates a tensor filled with a scalar value.
-//
-// This operation creates a tensor of shape `dims` and fills it with `value`.
-//
-// For example:
-//
-// ```
-// # Output tensor has shape [2, 3].
-// fill([2, 3], 9) ==> [[9, 9, 9]
-//                      [9, 9, 9]]
-// ```
-//
-// Arguments:
-//	dims: 1-D. Represents the shape of the output tensor.
-//	value: 0-D (scalar). Value to fill the returned tensor.
+// DecodeJpegAttr is an optional argument to DecodeJpeg.
+type DecodeJpegAttr func(optionalAttr)
+
+// DecodeJpegChannels sets the optional channels attribute to value.
 //
-// @compatibility(numpy)
-// Equivalent to np.full
-// @end_compatibility
-func Fill(scope *Scope, dims tf.Output, value tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Fill",
-		Input: []tf.Input{
-			dims, value,
-		},
+// value: Number of color channels for the decoded image.
+// If not specified, defaults to 0
+func DecodeJpegChannels(value int64) DecodeJpegAttr {
+	return func(m optionalAttr) {
+		m["channels"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// 2D fast Fourier transform.
-//
-// Computes the 2-dimensional discrete Fourier transform over the inner-most
-// 2 dimensions of `input`.
-//
-// Arguments:
-//	input: A complex64 tensor.
-//
-// Returns A complex64 tensor of the same shape as `input`. The inner-most 2
-//   dimensions of `input` are replaced with their 2D Fourier transform.
+// DecodeJpegRatio sets the optional ratio attribute to value.
 //
-// @compatibility(numpy)
-// Equivalent to np.fft.fft2
-// @end_compatibility
-func FFT2D(scope *Scope, input tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "FFT2D",
-		Input: []tf.Input{
-			input,
-		},
+// value: Downscaling ratio.
+// If not specified, defaults to 1
+func DecodeJpegRatio(value int64) DecodeJpegAttr {
+	return func(m optionalAttr) {
+		m["ratio"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// ResourceApplyProximalGradientDescentAttr is an optional argument to ResourceApplyProximalGradientDescent.
-type ResourceApplyProximalGradientDescentAttr func(optionalAttr)
+// DecodeJpegFancyUpscaling sets the optional fancy_upscaling attribute to value.
+//
+// value: If true use a slower but nicer upscaling of the
+// chroma planes (yuv420/422 only).
+// If not specified, defaults to true
+func DecodeJpegFancyUpscaling(value bool) DecodeJpegAttr {
+	return func(m optionalAttr) {
+		m["fancy_upscaling"] = value
+	}
+}
 
-// ResourceApplyProximalGradientDescentUseLocking sets the optional use_locking attribute to value.
+// DecodeJpegTryRecoverTruncated sets the optional try_recover_truncated attribute to value.
 //
-// value: If True, the subtraction will be protected by a lock;
-// otherwise the behavior is undefined, but may exhibit less contention.
+// value: If true try to recover an image from truncated input.
 // If not specified, defaults to false
-func ResourceApplyProximalGradientDescentUseLocking(value bool) ResourceApplyProximalGradientDescentAttr {
+func DecodeJpegTryRecoverTruncated(value bool) DecodeJpegAttr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["try_recover_truncated"] = value
 	}
 }
 
-// Update '*var' as FOBOS algorithm with fixed learning rate.
-//
-// prox_v = var - alpha * delta
-// var = sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0}
-//
-// Arguments:
-//	var_: Should be from a Variable().
-//	alpha: Scaling factor. Must be a scalar.
-//	l1: L1 regularization. Must be a scalar.
-//	l2: L2 regularization. Must be a scalar.
-//	delta: The change.
+// DecodeJpegAcceptableFraction sets the optional acceptable_fraction attribute to value.
 //
-// Returns the created operation.
-func ResourceApplyProximalGradientDescent(scope *Scope, var_ tf.Output, alpha tf.Output, l1 tf.Output, l2 tf.Output, delta tf.Output, optional ...ResourceApplyProximalGradientDescentAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "ResourceApplyProximalGradientDescent",
-		Input: []tf.Input{
-			var_, alpha, l1, l2, delta,
-		},
-		Attrs: attrs,
+// value: The minimum required fraction of lines before a truncated
+// input is accepted.
+// If not specified, defaults to 1
+func DecodeJpegAcceptableFraction(value float32) DecodeJpegAttr {
+	return func(m optionalAttr) {
+		m["acceptable_fraction"] = value
 	}
-	return scope.AddOperation(opspec)
 }
 
-// Computes the gradient for the sqrt of `x` wrt its input.
+// DecodeJpegDctMethod sets the optional dct_method attribute to value.
 //
-// Specifically, `grad = dy * 0.5 / y`, where `y = sqrt(x)`, and `dy`
-// is the corresponding input gradient.
-func SqrtGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SqrtGrad",
-		Input: []tf.Input{
-			y, dy,
-		},
+// value: string specifying a hint about the algorithm used for
+// decompression.  Defaults to "" which maps to a system-specific
+// default.  Currently valid values are ["INTEGER_FAST",
+// "INTEGER_ACCURATE"].  The hint may be ignored (e.g., the internal
+// jpeg library changes to a version that does not have that specific
+// option.)
+// If not specified, defaults to ""
+func DecodeJpegDctMethod(value string) DecodeJpegAttr {
+	return func(m optionalAttr) {
+		m["dct_method"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Get the value of the tensor specified by its handle.
+// Decode a JPEG-encoded image to a uint8 tensor.
+//
+// The attr `channels` indicates the desired number of color channels for the
+// decoded image.
+//
+// Accepted values are:
+//
+// *   0: Use the number of channels in the JPEG-encoded image.
+// *   1: output a grayscale image.
+// *   3: output an RGB image.
+//
+// If needed, the JPEG-encoded image is transformed to match the requested number
+// of color channels.
+//
+// The attr `ratio` allows downscaling the image by an integer factor during
+// decoding.  Allowed values are: 1, 2, 4, and 8.  This is much faster than
+// downscaling the image later.
+//
+//
+// This op also supports decoding PNGs and non-animated GIFs since the interface is
+// the same, though it is cleaner to use `tf.image.decode_image`.
 //
 // Arguments:
-//	handle: The handle for a tensor stored in the session state.
-//	dtype: The type of the output value.
+//	contents: 0-D.  The JPEG-encoded image.
 //
-// Returns The tensor for the given handle.
-func GetSessionTensor(scope *Scope, handle tf.Output, dtype tf.DataType) (value tf.Output) {
+// Returns 3-D with shape `[height, width, channels]`..
+func DecodeJpeg(scope *Scope, contents tf.Output, optional ...DecodeJpegAttr) (image tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "GetSessionTensor",
+		Type: "DecodeJpeg",
 		Input: []tf.Input{
-			handle,
+			contents,
 		},
 		Attrs: attrs,
 	}
@@ -10850,104 +10640,107 @@ func GetSessionTensor(scope *Scope, handle tf.Output, dtype tf.DataType) (value
 	return op.Output(0)
 }
 
-// Returns x - y element-wise.
-//
-// *NOTE*: `Subtract` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Sub(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Sub",
-		Input: []tf.Input{
-			x, y,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes softmax cross entropy cost and gradients to backpropagate.
-//
-// Inputs are the logits, not probabilities.
+// Transforms a vector of brain.Example protos (as strings) into typed tensors.
 //
 // Arguments:
-//	features: batch_size x num_classes matrix
-//	labels: batch_size x num_classes matrix
-// The caller must ensure that each batch of labels represents a valid
-// probability distribution.
-//
-// Returns Per example loss (batch_size vector).backpropagated gradients (batch_size x num_classes matrix).
-func SoftmaxCrossEntropyWithLogits(scope *Scope, features tf.Output, labels tf.Output) (loss tf.Output, backprop tf.Output) {
+//	serialized: A vector containing a batch of binary serialized Example protos.
+//	names: A vector containing the names of the serialized protos.
+// May contain, for example, table key (descriptive) names for the
+// corresponding serialized protos.  These are purely useful for debugging
+// purposes, and the presence of values here has no effect on the output.
+// May also be an empty vector if no names are available.
+// If non-empty, this vector must be the same length as "serialized".
+//	sparse_keys: A list of Nsparse string Tensors (scalars).
+// The keys expected in the Examples' features associated with sparse values.
+//	dense_keys: A list of Ndense string Tensors (scalars).
+// The keys expected in the Examples' features associated with dense values.
+//	dense_defaults: A list of Ndense Tensors (some may be empty).
+// dense_defaults[j] provides default values
+// when the example's feature_map lacks dense_key[j].  If an empty Tensor is
+// provided for dense_defaults[j], then the Feature dense_keys[j] is required.
+// The input type is inferred from dense_defaults[j], even when it's empty.
+// If dense_defaults[j] is not empty, and dense_shapes[j] is fully defined,
+// then the shape of dense_defaults[j] must match that of dense_shapes[j].
+// If dense_shapes[j] has an undefined major dimension (variable strides dense
+// feature), dense_defaults[j] must contain a single element:
+// the padding element.
+//	sparse_types: A list of Nsparse types; the data types of data in each Feature
+// given in sparse_keys.
+// Currently the ParseExample supports DT_FLOAT (FloatList),
+// DT_INT64 (Int64List), and DT_STRING (BytesList).
+//	dense_shapes: A list of Ndense shapes; the shapes of data in each Feature
+// given in dense_keys.
+// The number of elements in the Feature corresponding to dense_key[j]
+// must always equal dense_shapes[j].NumEntries().
+// If dense_shapes[j] == (D0, D1, ..., DN) then the shape of output
+// Tensor dense_values[j] will be (|serialized|, D0, D1, ..., DN):
+// The dense outputs are just the inputs row-stacked by batch.
+// This works for dense_shapes[j] = (-1, D1, ..., DN).  In this case
+// the shape of the output Tensor dense_values[j] will be
+// (|serialized|, M, D1, .., DN), where M is the maximum number of blocks
+// of elements of length D1 * .... * DN, across all minibatch entries
+// in the input.  Any minibatch entry with less than M blocks of elements of
+// length D1 * ... * DN will be padded with the corresponding default_value
+// scalar element along the second dimension.
+func ParseExample(scope *Scope, serialized tf.Output, names tf.Output, sparse_keys []tf.Output, dense_keys []tf.Output, dense_defaults []tf.Output, sparse_types []tf.DataType, dense_shapes []tf.Shape) (sparse_indices []tf.Output, sparse_values []tf.Output, sparse_shapes []tf.Output, dense_values []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"sparse_types": sparse_types, "dense_shapes": dense_shapes}
 	opspec := tf.OpSpec{
-		Type: "SoftmaxCrossEntropyWithLogits",
+		Type: "ParseExample",
 		Input: []tf.Input{
-			features, labels,
+			serialized, names, tf.OutputList(sparse_keys), tf.OutputList(dense_keys), tf.OutputList(dense_defaults),
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
-}
-
-// ReduceJoinAttr is an optional argument to ReduceJoin.
-type ReduceJoinAttr func(optionalAttr)
-
-// ReduceJoinKeepDims sets the optional keep_dims attribute to value.
-//
-// value: If `True`, retain reduced dimensions with length `1`.
-// If not specified, defaults to false
-func ReduceJoinKeepDims(value bool) ReduceJoinAttr {
-	return func(m optionalAttr) {
-		m["keep_dims"] = value
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if sparse_indices, idx, err = makeOutputList(op, idx, "sparse_indices"); err != nil {
+		scope.UpdateErr("ParseExample", err)
+		return
+	}
+	if sparse_values, idx, err = makeOutputList(op, idx, "sparse_values"); err != nil {
+		scope.UpdateErr("ParseExample", err)
+		return
+	}
+	if sparse_shapes, idx, err = makeOutputList(op, idx, "sparse_shapes"); err != nil {
+		scope.UpdateErr("ParseExample", err)
+		return
+	}
+	if dense_values, idx, err = makeOutputList(op, idx, "dense_values"); err != nil {
+		scope.UpdateErr("ParseExample", err)
+		return
 	}
+	return sparse_indices, sparse_values, sparse_shapes, dense_values
 }
 
-// ReduceJoinSeparator sets the optional separator attribute to value.
-//
-// value: The separator to use when joining.
-// If not specified, defaults to ""
-func ReduceJoinSeparator(value string) ReduceJoinAttr {
+// VariableShapeAttr is an optional argument to VariableShape.
+type VariableShapeAttr func(optionalAttr)
+
+// VariableShapeOutType sets the optional out_type attribute to value.
+// If not specified, defaults to DT_INT32
+func VariableShapeOutType(value tf.DataType) VariableShapeAttr {
 	return func(m optionalAttr) {
-		m["separator"] = value
+		m["out_type"] = value
 	}
 }
 
-// Joins a string Tensor across the given dimensions.
+// Returns the shape of the variable pointed to by `resource`.
 //
-// Computes the string join across dimensions in the given string Tensor of shape
-// `[d_0, d_1, ..., d_n-1]`.  Returns a new Tensor created by joining the input
-// strings with the given separator (default: empty string).  Negative indices are
-// counted backwards from the end, with `-1` being equivalent to `n - 1`.
+// This operation returns a 1-D integer tensor representing the shape of `input`.
 //
 // For example:
 //
-// ```python
-// # tensor `a` is [["a", "b"], ["c", "d"]]
-// tf.reduce_join(a, 0) ==> ["ac", "bd"]
-// tf.reduce_join(a, 1) ==> ["ab", "cd"]
-// tf.reduce_join(a, -2) = tf.reduce_join(a, 0) ==> ["ac", "bd"]
-// tf.reduce_join(a, -1) = tf.reduce_join(a, 1) ==> ["ab", "cd"]
-// tf.reduce_join(a, 0, keep_dims=True) ==> [["ac", "bd"]]
-// tf.reduce_join(a, 1, keep_dims=True) ==> [["ab"], ["cd"]]
-// tf.reduce_join(a, 0, separator=".") ==> ["a.c", "b.d"]
-// tf.reduce_join(a, [0, 1]) ==> ["acbd"]
-// tf.reduce_join(a, [1, 0]) ==> ["abcd"]
-// tf.reduce_join(a, []) ==> ["abcd"]
 // ```
-//
-// Arguments:
-//	inputs: The input to be joined.  All reduced indices must have non-zero size.
-//	reduction_indices: The dimensions to reduce over.  Dimensions are reduced in the
-// order specified.  Omitting `reduction_indices` is equivalent to passing
-// `[n-1, n-2, ..., 0]`.  Negative indices from `-n` to `-1` are supported.
-//
-// Returns Has shape equal to that of the input with reduced dimensions removed or
-// set to `1` depending on `keep_dims`.
-func ReduceJoin(scope *Scope, inputs tf.Output, reduction_indices tf.Output, optional ...ReduceJoinAttr) (output tf.Output) {
+// # 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]
+// shape(t) ==> [2, 2, 3]
+// ```
+func VariableShape(scope *Scope, input tf.Output, optional ...VariableShapeAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -10956,9 +10749,9 @@ func ReduceJoin(scope *Scope, inputs tf.Output, reduction_indices tf.Output, opt
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ReduceJoin",
+		Type: "VariableShape",
 		Input: []tf.Input{
-			inputs, reduction_indices,
+			input,
 		},
 		Attrs: attrs,
 	}
@@ -10966,143 +10759,127 @@ func ReduceJoin(scope *Scope, inputs tf.Output, reduction_indices tf.Output, opt
 	return op.Output(0)
 }
 
-// Computes cos of x element-wise.
-func Cos(scope *Scope, x tf.Output) (y tf.Output) {
+// Computes softmax cross entropy cost and gradients to backpropagate.
+//
+// Unlike `SoftmaxCrossEntropyWithLogits`, this operation does not accept
+// a matrix of label probabilities, but rather a single label per row
+// of features.  This label is considered to have probability 1.0 for the
+// given row.
+//
+// Inputs are the logits, not probabilities.
+//
+// Arguments:
+//	features: batch_size x num_classes matrix
+//	labels: batch_size vector with values in [0, num_classes).
+// This is the label for the given minibatch entry.
+//
+// Returns Per example loss (batch_size vector).backpropagated gradients (batch_size x num_classes matrix).
+func SparseSoftmaxCrossEntropyWithLogits(scope *Scope, features tf.Output, labels tf.Output) (loss tf.Output, backprop tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Cos",
+		Type: "SparseSoftmaxCrossEntropyWithLogits",
 		Input: []tf.Input{
-			x,
+			features, labels,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// FusedBatchNormGradAttr is an optional argument to FusedBatchNormGrad.
-type FusedBatchNormGradAttr func(optionalAttr)
-
-// FusedBatchNormGradEpsilon sets the optional epsilon attribute to value.
+// Fast Fourier transform.
 //
-// value: A small float number added to the variance of x.
-// If not specified, defaults to 0.0001
-func FusedBatchNormGradEpsilon(value float32) FusedBatchNormGradAttr {
-	return func(m optionalAttr) {
-		m["epsilon"] = value
-	}
-}
-
-// FusedBatchNormGradDataFormat sets the optional data_format attribute to value.
+// Computes the 1-dimensional discrete Fourier transform over the inner-most
+// dimension of `input`.
 //
-// value: The data format for y_backprop, x, x_backprop.
-// Either "NHWC" (default) or "NCHW".
-// If not specified, defaults to "NHWC"
-func FusedBatchNormGradDataFormat(value string) FusedBatchNormGradAttr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// FusedBatchNormGradIsTraining sets the optional is_training attribute to value.
+// Arguments:
+//	input: A complex64 tensor.
 //
-// value: A bool value to indicate the operation is for training (default)
-// or inference.
-// If not specified, defaults to true
-func FusedBatchNormGradIsTraining(value bool) FusedBatchNormGradAttr {
-	return func(m optionalAttr) {
-		m["is_training"] = value
+// Returns A complex64 tensor of the same shape as `input`. The inner-most
+//   dimension of `input` is replaced with its 1D Fourier transform.
+//
+// @compatibility(numpy)
+// Equivalent to np.fft.fft
+// @end_compatibility
+func FFT(scope *Scope, input tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "FFT",
+		Input: []tf.Input{
+			input,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Gradient for batch normalization.
-//
-// Note that the size of 4D Tensors are defined by either "NHWC" or "NCHW".
-// The size of 1D Tensors matches the dimension C of the 4D Tensors.
+// Transforms a serialized tensorflow.TensorProto proto into a Tensor.
 //
 // Arguments:
-//	y_backprop: A 4D Tensor for the gradient with respect to y.
-//	x: A 4D Tensor for input data.
-//	scale: A 1D Tensor for scaling factor, to scale the normalized x.
-//	reserve_space_1: When is_training is True, a 1D Tensor for the computed batch
-// mean to be reused in gradient computation. When is_training is
-// False, a 1D Tensor for the population mean to be reused in both
-// 1st and 2nd order gradient computation.
-//	reserve_space_2: When is_training is True, a 1D Tensor for the computed batch
-// variance (inverted variance in the cuDNN case) to be reused in
-// gradient computation. When is_training is False, a 1D Tensor
-// for the population variance to be reused in both 1st and 2nd
-// order gradient computation.
+//	serialized: A scalar string containing a serialized TensorProto proto.
+//	out_type: The type of the serialized tensor.  The provided type must match the
+// type of the serialized tensor and no implicit conversion will take place.
 //
-// Returns A 4D Tensor for the gradient with respect to x.A 1D Tensor for the gradient with respect to scale.A 1D Tensor for the gradient with respect to offset.Unused placeholder to match the mean input in FusedBatchNorm.Unused placeholder to match the variance input
-// in FusedBatchNorm.
-func FusedBatchNormGrad(scope *Scope, y_backprop tf.Output, x tf.Output, scale tf.Output, reserve_space_1 tf.Output, reserve_space_2 tf.Output, optional ...FusedBatchNormGradAttr) (x_backprop tf.Output, scale_backprop tf.Output, offset_backprop tf.Output, reserve_space_3 tf.Output, reserve_space_4 tf.Output) {
+// Returns A Tensor of type `out_type`.
+func ParseTensor(scope *Scope, serialized tf.Output, out_type tf.DataType) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"out_type": out_type}
 	opspec := tf.OpSpec{
-		Type: "FusedBatchNormGrad",
+		Type: "ParseTensor",
 		Input: []tf.Input{
-			y_backprop, x, scale, reserve_space_1, reserve_space_2,
+			serialized,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
+	return op.Output(0)
 }
 
-// TopKAttr is an optional argument to TopK.
-type TopKAttr func(optionalAttr)
+// MaxPoolWithArgmaxAttr is an optional argument to MaxPoolWithArgmax.
+type MaxPoolWithArgmaxAttr func(optionalAttr)
 
-// TopKSorted sets the optional sorted attribute to value.
-//
-// value: If true the resulting `k` elements will be sorted by the values in
-// descending order.
-// If not specified, defaults to true
-func TopKSorted(value bool) TopKAttr {
+// MaxPoolWithArgmaxTargmax sets the optional Targmax attribute to value.
+// If not specified, defaults to DT_INT64
+func MaxPoolWithArgmaxTargmax(value tf.DataType) MaxPoolWithArgmaxAttr {
 	return func(m optionalAttr) {
-		m["sorted"] = value
+		m["Targmax"] = value
 	}
 }
 
-// Finds values and indices of the `k` largest elements for the last dimension.
-//
-// DEPRECATED at GraphDef version 7: Use TopKV2 instead
-//
-// If the input is a vector (rank-1), finds the `k` largest entries in the vector
-// and outputs their values and indices as vectors.  Thus `values[j]` is the
-// `j`-th largest entry in `input`, and its index is `indices[j]`.
-//
-// For matrices (resp. higher rank input), computes the top `k` entries in each
-// row (resp. vector along the last dimension).  Thus,
-//
-//     values.shape = indices.shape = input.shape[:-1] + [k]
+// Performs max pooling on the input and outputs both max values and indices.
 //
-// If two elements are equal, the lower-index element appears first.
+// The indices in `argmax` are flattened, so that a maximum value at position
+// `[b, y, x, c]` becomes flattened index
+// `((b * height + y) * width + x) * channels + c`.
 //
-// If `k` varies dynamically, use `TopKV2` below.
+// The indices returned are always in `[0, height) x [0, width)` before flattening,
+// even if padding is involved and the mathematically correct answer is outside
+// (either negative or too large).  This is a bug, but fixing it is difficult to do
+// in a safe backwards compatible way, especially due to flattening.
 //
 // Arguments:
-//	input: 1-D or higher with last dimension at least `k`.
-//	k: Number of top elements to look for along the last dimension (along each
-// row for matrices).
+//	input: 4-D with shape `[batch, height, width, channels]`.  Input to pool over.
+//	ksize: The size of the window for each dimension of the input tensor.
+//	strides: The stride of the sliding window for each dimension of the
+// input tensor.
+//	padding: The type of padding algorithm to use.
 //
-// Returns The `k` largest elements along each last dimensional slice.The indices of `values` within the last dimension of `input`.
-func TopK(scope *Scope, input tf.Output, k int64, optional ...TopKAttr) (values tf.Output, indices tf.Output) {
+// Returns The max pooled output tensor.4-D.  The flattened indices of the max values chosen for each output.
+func MaxPoolWithArgmax(scope *Scope, input tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPoolWithArgmaxAttr) (output tf.Output, argmax tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"k": k}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "TopK",
+		Type: "MaxPoolWithArgmax",
 		Input: []tf.Input{
 			input,
 		},
@@ -11112,53 +10889,170 @@ func TopK(scope *Scope, input tf.Output, k int64, optional ...TopKAttr) (values
 	return op.Output(0), op.Output(1)
 }
 
-// Compute the Hurwitz zeta function \\(\zeta(x, q)\\).
+// ResourceSparseApplyAdagradDAAttr is an optional argument to ResourceSparseApplyAdagradDA.
+type ResourceSparseApplyAdagradDAAttr func(optionalAttr)
+
+// ResourceSparseApplyAdagradDAUseLocking sets the optional use_locking attribute to value.
 //
-// The Hurwitz zeta function is defined as:
+// value: If True, updating of the var and accum tensors will be protected by
+// a lock; otherwise the behavior is undefined, but may exhibit less contention.
+// If not specified, defaults to false
+func ResourceSparseApplyAdagradDAUseLocking(value bool) ResourceSparseApplyAdagradDAAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// Update entries in '*var' and '*accum' according to the proximal adagrad scheme.
 //
+// Arguments:
+//	var_: Should be from a Variable().
+//	gradient_accumulator: Should be from a Variable().
+//	gradient_squared_accumulator: Should be from a Variable().
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var and accum.
+//	lr: Learning rate. Must be a scalar.
+//	l1: L1 regularization. Must be a scalar.
+//	l2: L2 regularization. Must be a scalar.
+//	global_step: Training step number. Must be a scalar.
 //
-// \\(\zeta(x, q) = \sum_{n=0}^{\infty} (q + n)^{-x}\\)
-func Zeta(scope *Scope, x tf.Output, q tf.Output) (z tf.Output) {
+// Returns the created operation.
+func ResourceSparseApplyAdagradDA(scope *Scope, var_ tf.Output, gradient_accumulator tf.Output, gradient_squared_accumulator tf.Output, grad tf.Output, indices tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, global_step tf.Output, optional ...ResourceSparseApplyAdagradDAAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Zeta",
+		Type: "ResourceSparseApplyAdagradDA",
 		Input: []tf.Input{
-			x, q,
+			var_, gradient_accumulator, gradient_squared_accumulator, grad, indices, lr, l1, l2, global_step,
 		},
+		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// ProdAttr is an optional argument to Prod.
-type ProdAttr func(optionalAttr)
+// EncodeJpegAttr is an optional argument to EncodeJpeg.
+type EncodeJpegAttr func(optionalAttr)
 
-// ProdKeepDims sets the optional keep_dims attribute to value.
+// EncodeJpegFormat sets the optional format attribute to value.
 //
-// value: If true, retain reduced dimensions with length 1.
+// value: Per pixel image format.
+// If not specified, defaults to ""
+func EncodeJpegFormat(value string) EncodeJpegAttr {
+	return func(m optionalAttr) {
+		m["format"] = value
+	}
+}
+
+// EncodeJpegQuality sets the optional quality attribute to value.
+//
+// value: Quality of the compression from 0 to 100 (higher is better and slower).
+// If not specified, defaults to 95
+func EncodeJpegQuality(value int64) EncodeJpegAttr {
+	return func(m optionalAttr) {
+		m["quality"] = value
+	}
+}
+
+// EncodeJpegProgressive sets the optional progressive attribute to value.
+//
+// value: If True, create a JPEG that loads progressively (coarse to fine).
 // If not specified, defaults to false
-func ProdKeepDims(value bool) ProdAttr {
+func EncodeJpegProgressive(value bool) EncodeJpegAttr {
 	return func(m optionalAttr) {
-		m["keep_dims"] = value
+		m["progressive"] = value
 	}
 }
 
-// Computes the product of elements across dimensions of a tensor.
+// EncodeJpegOptimizeSize sets the optional optimize_size attribute to value.
 //
-// Reduces `input` along the dimensions given in `axis`. Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `axis`. If `keep_dims` is true, the reduced dimensions are
-// retained with length 1.
+// value: If True, spend CPU/RAM to reduce size with no quality change.
+// If not specified, defaults to false
+func EncodeJpegOptimizeSize(value bool) EncodeJpegAttr {
+	return func(m optionalAttr) {
+		m["optimize_size"] = value
+	}
+}
+
+// EncodeJpegChromaDownsampling sets the optional chroma_downsampling attribute to value.
+//
+// value: See http://en.wikipedia.org/wiki/Chroma_subsampling.
+// If not specified, defaults to true
+func EncodeJpegChromaDownsampling(value bool) EncodeJpegAttr {
+	return func(m optionalAttr) {
+		m["chroma_downsampling"] = value
+	}
+}
+
+// EncodeJpegDensityUnit sets the optional density_unit attribute to value.
+//
+// value: Unit used to specify `x_density` and `y_density`:
+// pixels per inch (`'in'`) or centimeter (`'cm'`).
+// If not specified, defaults to "in"
+func EncodeJpegDensityUnit(value string) EncodeJpegAttr {
+	return func(m optionalAttr) {
+		m["density_unit"] = value
+	}
+}
+
+// EncodeJpegXDensity sets the optional x_density attribute to value.
+//
+// value: Horizontal pixels per density unit.
+// If not specified, defaults to 300
+func EncodeJpegXDensity(value int64) EncodeJpegAttr {
+	return func(m optionalAttr) {
+		m["x_density"] = value
+	}
+}
+
+// EncodeJpegYDensity sets the optional y_density attribute to value.
+//
+// value: Vertical pixels per density unit.
+// If not specified, defaults to 300
+func EncodeJpegYDensity(value int64) EncodeJpegAttr {
+	return func(m optionalAttr) {
+		m["y_density"] = value
+	}
+}
+
+// EncodeJpegXmpMetadata sets the optional xmp_metadata attribute to value.
+//
+// value: If not empty, embed this XMP metadata in the image header.
+// If not specified, defaults to ""
+func EncodeJpegXmpMetadata(value string) EncodeJpegAttr {
+	return func(m optionalAttr) {
+		m["xmp_metadata"] = value
+	}
+}
+
+// JPEG-encode an image.
+//
+// `image` is a 3-D uint8 Tensor of shape `[height, width, channels]`.
+//
+// The attr `format` can be used to override the color format of the encoded
+// output.  Values can be:
+//
+// *   `''`: Use a default format based on the number of channels in the image.
+// *   `grayscale`: Output a grayscale JPEG image.  The `channels` dimension
+//     of `image` must be 1.
+// *   `rgb`: Output an RGB JPEG image. The `channels` dimension
+//     of `image` must be 3.
+//
+// If `format` is not specified or is the empty string, a default format is picked
+// in function of the number of channels in `image`:
+//
+// *   1: Output a grayscale image.
+// *   3: Output an RGB image.
 //
 // Arguments:
-//	input: The tensor to reduce.
-//	axis: The dimensions to reduce. Must be in the range
-// `[-rank(input), rank(input))`.
+//	image: 3-D with shape `[height, width, channels]`.
 //
-// Returns The reduced tensor.
-func Prod(scope *Scope, input tf.Output, axis tf.Output, optional ...ProdAttr) (output tf.Output) {
+// Returns 0-D. JPEG-encoded image.
+func EncodeJpeg(scope *Scope, image tf.Output, optional ...EncodeJpegAttr) (contents tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -11167,9 +11061,9 @@ func Prod(scope *Scope, input tf.Output, axis tf.Output, optional ...ProdAttr) (
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Prod",
+		Type: "EncodeJpeg",
 		Input: []tf.Input{
-			input, axis,
+			image,
 		},
 		Attrs: attrs,
 	}
@@ -11177,58 +11071,59 @@ func Prod(scope *Scope, input tf.Output, axis tf.Output, optional ...ProdAttr) (
 	return op.Output(0)
 }
 
-// FusedResizeAndPadConv2DAttr is an optional argument to FusedResizeAndPadConv2D.
-type FusedResizeAndPadConv2DAttr func(optionalAttr)
+// MultinomialAttr is an optional argument to Multinomial.
+type MultinomialAttr func(optionalAttr)
 
-// FusedResizeAndPadConv2DResizeAlignCorners sets the optional resize_align_corners attribute to value.
+// MultinomialSeed sets the optional seed attribute to value.
 //
-// value: If true, rescale input by (new_height - 1) / (height - 1),
-// which exactly aligns the 4 corners of images and resized images. If false, rescale
-// by new_height / height. Treat similarly the width dimension.
-// If not specified, defaults to false
-func FusedResizeAndPadConv2DResizeAlignCorners(value bool) FusedResizeAndPadConv2DAttr {
+// value: If either seed or seed2 is set to be non-zero, the internal random number
+// generator is seeded by the given seed.  Otherwise, a random seed is used.
+// If not specified, defaults to 0
+func MultinomialSeed(value int64) MultinomialAttr {
 	return func(m optionalAttr) {
-		m["resize_align_corners"] = value
+		m["seed"] = value
 	}
 }
 
-// Performs a resize and padding as a preprocess during a convolution.
-//
-// It's often possible to do spatial transformations more efficiently as part of
-// the packing stage of a convolution, so this op allows for an optimized
-// implementation where these stages are fused together. This prevents the need to
-// write out the intermediate results as whole tensors, reducing memory pressure,
-// and we can get some latency gains by merging the transformation calculations.
-// The data_format attribute for Conv2D isn't supported by this op, and defaults to
-// 'NHWC' order.
-// Internally this op uses a single per-graph scratch buffer, which means that it
-// will block if multiple versions are being run in parallel. This is because this
-// operator is primarily an optimization to minimize memory usage.
+// MultinomialSeed2 sets the optional seed2 attribute to value.
 //
-// Arguments:
-//	input: 4-D with shape `[batch, in_height, in_width, in_channels]`.
-//	size: A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
-// new size for the images.
-//	paddings: A two-column matrix specifying the padding sizes. The number of
-// rows must be the same as the rank of `input`.
-//	filter: 4-D with shape
-// `[filter_height, filter_width, in_channels, out_channels]`.
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func MultinomialSeed2(value int64) MultinomialAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// MultinomialOutputDtype sets the optional output_dtype attribute to value.
+// If not specified, defaults to DT_INT64
+func MultinomialOutputDtype(value tf.DataType) MultinomialAttr {
+	return func(m optionalAttr) {
+		m["output_dtype"] = value
+	}
+}
+
+// Draws samples from a multinomial distribution.
 //
-//	strides: 1-D of length 4.  The stride of the sliding window for each dimension
-// of `input`. Must be in the same order as the dimension specified with format.
-//	padding: The type of padding algorithm to use.
-func FusedResizeAndPadConv2D(scope *Scope, input tf.Output, size tf.Output, paddings tf.Output, filter tf.Output, mode string, strides []int64, padding string, optional ...FusedResizeAndPadConv2DAttr) (output tf.Output) {
+// Arguments:
+//	logits: 2-D Tensor with shape `[batch_size, num_classes]`.  Each slice `[i, :]`
+// represents the unnormalized log probabilities for all classes.
+//	num_samples: 0-D.  Number of independent samples to draw for each row slice.
+//
+// Returns 2-D Tensor with shape `[batch_size, num_samples]`.  Each slice `[i, :]`
+// contains the drawn class labels with range `[0, num_classes)`.
+func Multinomial(scope *Scope, logits tf.Output, num_samples tf.Output, optional ...MultinomialAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"mode": mode, "strides": strides, "padding": padding}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "FusedResizeAndPadConv2D",
+		Type: "Multinomial",
 		Input: []tf.Input{
-			input, size, paddings, filter,
+			logits, num_samples,
 		},
 		Attrs: attrs,
 	}
@@ -11236,221 +11131,194 @@ func FusedResizeAndPadConv2D(scope *Scope, input tf.Output, size tf.Output, padd
 	return op.Output(0)
 }
 
-// Inverse 3D fast Fourier transform.
-//
-// Computes the inverse 3-dimensional discrete Fourier transform over the
-// inner-most 3 dimensions of `input`.
-//
-// Arguments:
-//	input: A complex64 tensor.
-//
-// Returns A complex64 tensor of the same shape as `input`. The inner-most 3
-//   dimensions of `input` are replaced with their inverse 3D Fourier transform.
-//
-// @compatibility(numpy)
-// Equivalent to np.fft.ifftn with 3 dimensions.
-// @end_compatibility
-func IFFT3D(scope *Scope, input tf.Output) (output tf.Output) {
+// Returns the truth value of NOT x element-wise.
+func LogicalNot(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "IFFT3D",
+		Type: "LogicalNot",
 		Input: []tf.Input{
-			input,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Adds `bias` to `value`.
+// 3D real-valued fast Fourier transform.
 //
-// This is a deprecated version of BiasAdd and will be soon removed.
+// Computes the 3-dimensional discrete Fourier transform of a real-valued signal
+// over the inner-most 3 dimensions of `input`.
 //
-// This is a special case of `tf.add` where `bias` is restricted to be 1-D.
-// Broadcasting is supported, so `value` may have any number of dimensions.
+// Since the DFT of a real signal is Hermitian-symmetric, `RFFT3D` only returns the
+// `fft_length / 2 + 1` unique components of the FFT for the inner-most dimension
+// of `output`: the zero-frequency term, followed by the `fft_length / 2`
+// positive-frequency terms.
+//
+// Along each axis `RFFT3D` is computed on, if `fft_length` is smaller than the
+// corresponding dimension of `input`, the dimension is cropped. If it is larger,
+// the dimension is padded with zeros.
 //
 // Arguments:
-//	value: Any number of dimensions.
-//	bias: 1-D with size the last dimension of `value`.
+//	input: A float32 tensor.
+//	fft_length: An int32 tensor of shape [3]. The FFT length for each dimension.
 //
-// Returns Broadcasted sum of `value` and `bias`.
-func BiasAddV1(scope *Scope, value tf.Output, bias tf.Output) (output tf.Output) {
+// Returns A complex64 tensor of the same rank as `input`. The inner-most 3
+//   dimensions of `input` are replaced with the their 3D Fourier transform. The
+//   inner-most dimension contains `fft_length / 2 + 1` unique frequency
+//   components.
+//
+// @compatibility(numpy)
+// Equivalent to np.fft.rfftn with 3 dimensions.
+// @end_compatibility
+func RFFT3D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "BiasAddV1",
+		Type: "RFFT3D",
 		Input: []tf.Input{
-			value, bias,
+			input, fft_length,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Reverses specific dimensions of a tensor.
-//
-// NOTE `tf.reverse` has now changed behavior in preparation for 1.0.
-// `tf.reverse_v2` is currently an alias that will be deprecated before TF 1.0.
-//
-// Given a `tensor`, and a `int32` tensor `axis` representing the set of
-// dimensions of `tensor` to reverse. This operation reverses each dimension
-// `i` for which there exists `j` s.t. `axis[j] == i`.
-//
-// `tensor` can have up to 8 dimensions. The number of dimensions specified
-// in `axis` may be 0 or more entries. If an index is specified more than
-// once, a InvalidArgument error is raised.
-//
-// For example:
-//
-// ```
-// # tensor 't' is [[[[ 0,  1,  2,  3],
-// #                  [ 4,  5,  6,  7],
-// #                  [ 8,  9, 10, 11]],
-// #                 [[12, 13, 14, 15],
-// #                  [16, 17, 18, 19],
-// #                  [20, 21, 22, 23]]]]
-// # tensor 't' shape is [1, 2, 3, 4]
-//
-// # 'dims' is [3] or 'dims' is [-1]
-// reverse(t, dims) ==> [[[[ 3,  2,  1,  0],
-//                         [ 7,  6,  5,  4],
-//                         [ 11, 10, 9, 8]],
-//                        [[15, 14, 13, 12],
-//                         [19, 18, 17, 16],
-//                         [23, 22, 21, 20]]]]
-//
-// # 'dims' is '[1]' (or 'dims' is '[-3]')
-// reverse(t, dims) ==> [[[[12, 13, 14, 15],
-//                         [16, 17, 18, 19],
-//                         [20, 21, 22, 23]
-//                        [[ 0,  1,  2,  3],
-//                         [ 4,  5,  6,  7],
-//                         [ 8,  9, 10, 11]]]]
-//
-// # 'dims' is '[2]' (or 'dims' is '[-2]')
-// reverse(t, dims) ==> [[[[8, 9, 10, 11],
-//                         [4, 5, 6, 7],
-//                         [0, 1, 2, 3]]
-//                        [[20, 21, 22, 23],
-//                         [16, 17, 18, 19],
-//                         [12, 13, 14, 15]]]]
-// ```
+// TensorArrayV3Attr is an optional argument to TensorArrayV3.
+type TensorArrayV3Attr func(optionalAttr)
+
+// TensorArrayV3ElementShape sets the optional element_shape attribute to value.
 //
-// Arguments:
-//	tensor: Up to 8-D.
-//	axis: 1-D. The indices of the dimensions to reverse. Must be in the range
-// `[-rank(tensor), rank(tensor))`.
+// value: The expected shape of an element, if known. Used to
+// validate the shapes of TensorArray elements. If this shape is not
+// fully specified, gathering zero-size TensorArrays is an error.
+// If not specified, defaults to <unknown_rank:true >
+func TensorArrayV3ElementShape(value tf.Shape) TensorArrayV3Attr {
+	return func(m optionalAttr) {
+		m["element_shape"] = value
+	}
+}
+
+// TensorArrayV3DynamicSize sets the optional dynamic_size attribute to value.
 //
-// Returns The same shape as `tensor`.
-func ReverseV2(scope *Scope, tensor tf.Output, axis tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
+// value: A boolean that determines whether writes to the TensorArray
+// are allowed to grow the size.  By default, this is not allowed.
+// If not specified, defaults to false
+func TensorArrayV3DynamicSize(value bool) TensorArrayV3Attr {
+	return func(m optionalAttr) {
+		m["dynamic_size"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "ReverseV2",
-		Input: []tf.Input{
-			tensor, axis,
-		},
+}
+
+// TensorArrayV3ClearAfterRead sets the optional clear_after_read attribute to value.
+//
+// value: If true (default), Tensors in the TensorArray are cleared
+// after being read.  This disables multiple read semantics but allows early
+// release of memory.
+// If not specified, defaults to true
+func TensorArrayV3ClearAfterRead(value bool) TensorArrayV3Attr {
+	return func(m optionalAttr) {
+		m["clear_after_read"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// RealAttr is an optional argument to Real.
-type RealAttr func(optionalAttr)
+// TensorArrayV3IdenticalElementShapes sets the optional identical_element_shapes attribute to value.
+//
+// value: If true (default is false), then all
+// elements in the TensorArray will be expected to have have identical shapes.
+// This allows certain behaviors, like dynamically checking for
+// consistent shapes on write, and being able to fill in properly
+// shaped zero tensors on stack -- even if the element_shape attribute
+// is not fully defined.
+// If not specified, defaults to false
+func TensorArrayV3IdenticalElementShapes(value bool) TensorArrayV3Attr {
+	return func(m optionalAttr) {
+		m["identical_element_shapes"] = value
+	}
+}
 
-// RealTout sets the optional Tout attribute to value.
-// If not specified, defaults to DT_FLOAT
-func RealTout(value tf.DataType) RealAttr {
+// TensorArrayV3TensorArrayName sets the optional tensor_array_name attribute to value.
+//
+// value: Overrides the name used for the temporary tensor_array
+// resource. Default value is the name of the 'TensorArray' op (which
+// is guaranteed unique).
+// If not specified, defaults to ""
+func TensorArrayV3TensorArrayName(value string) TensorArrayV3Attr {
 	return func(m optionalAttr) {
-		m["Tout"] = value
+		m["tensor_array_name"] = value
 	}
 }
 
-// Returns the real part of a complex number.
+// An array of Tensors of given size.
 //
-// Given a tensor `input` of complex numbers, this operation returns a tensor of
-// type `float` that is the real part of each element in `input`. All elements in
-// `input` must be complex numbers of the form \\(a + bj\\), where *a* is the real
-//  part returned by this operation and *b* is the imaginary part.
+// Write data via Write and read via Read or Pack.
 //
-// For example:
+// Arguments:
+//	size: The size of the array.
+//	dtype: The type of the elements on the tensor_array.
 //
-// ```
-// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
-// tf.real(input) ==> [-2.25, 3.25]
-// ```
-func Real(scope *Scope, input tf.Output, optional ...RealAttr) (output tf.Output) {
+// Returns The handle to the TensorArray.A scalar used to control gradient flow.
+func TensorArrayV3(scope *Scope, size tf.Output, dtype tf.DataType, optional ...TensorArrayV3Attr) (handle tf.Output, flow tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"dtype": dtype}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Real",
+		Type: "TensorArrayV3",
 		Input: []tf.Input{
-			input,
+			size,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// AudioSummaryAttr is an optional argument to AudioSummary.
-type AudioSummaryAttr func(optionalAttr)
+// MaxPool3DAttr is an optional argument to MaxPool3D.
+type MaxPool3DAttr func(optionalAttr)
 
-// AudioSummaryMaxOutputs sets the optional max_outputs attribute to value.
-//
-// value: Max number of batch elements to generate audio for.
-// If not specified, defaults to 3
+// MaxPool3DDataFormat sets the optional data_format attribute to value.
 //
-// REQUIRES: value >= 1
-func AudioSummaryMaxOutputs(value int64) AudioSummaryAttr {
+// value: The data format of the input and output data. With the
+// default format "NDHWC", the data is stored in the order of:
+//     [batch, in_depth, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCDHW", the data storage order is:
+//     [batch, in_channels, in_depth, in_height, in_width].
+// If not specified, defaults to "NDHWC"
+func MaxPool3DDataFormat(value string) MaxPool3DAttr {
 	return func(m optionalAttr) {
-		m["max_outputs"] = value
+		m["data_format"] = value
 	}
 }
 
-// Outputs a `Summary` protocol buffer with audio.
-//
-// DEPRECATED at GraphDef version 15: Use AudioSummaryV2.
-//
-// The summary has up to `max_outputs` summary values containing audio. The
-// audio is built from `tensor` which must be 3-D with shape `[batch_size,
-// frames, channels]` or 2-D with shape `[batch_size, frames]`. The values are
-// assumed to be in the range of `[-1.0, 1.0]` with a sample rate of `sample_rate`.
-//
-// The `tag` argument is a scalar `Tensor` of type `string`.  It is used to
-// build the `tag` of the summary values:
-//
-// *  If `max_outputs` is 1, the summary value tag is '*tag*/audio'.
-// *  If `max_outputs` is greater than 1, the summary value tags are
-//    generated sequentially as '*tag*/audio/0', '*tag*/audio/1', etc.
+// Performs 3D max pooling on the input.
 //
 // Arguments:
-//	tag: Scalar. Used to build the `tag` attribute of the summary values.
-//	tensor: 2-D of shape `[batch_size, frames]`.
-//	sample_rate: The sample rate of the signal in hertz.
+//	input: Shape `[batch, depth, rows, cols, channels]` tensor to pool over.
+//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
+// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
+//	strides: 1-D tensor of length 5. The stride of the sliding window for each
+// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+//	padding: The type of padding algorithm to use.
 //
-// Returns Scalar. Serialized `Summary` protocol buffer.
-func AudioSummary(scope *Scope, tag tf.Output, tensor tf.Output, sample_rate float32, optional ...AudioSummaryAttr) (summary tf.Output) {
+// Returns The max pooled output tensor.
+func MaxPool3D(scope *Scope, input tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPool3DAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"sample_rate": sample_rate}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "AudioSummary",
+		Type: "MaxPool3D",
 		Input: []tf.Input{
-			tag, tensor,
+			input,
 		},
 		Attrs: attrs,
 	}
@@ -11458,70 +11326,28 @@ func AudioSummary(scope *Scope, tag tf.Output, tensor tf.Output, sample_rate flo
 	return op.Output(0)
 }
 
-// QrAttr is an optional argument to Qr.
-type QrAttr func(optionalAttr)
-
-// QrFullMatrices sets the optional full_matrices attribute to value.
-//
-// value: If true, compute full-sized `q` and `r`. If false
-// (the default), compute only the leading `P` columns of `q`.
-// If not specified, defaults to false
-func QrFullMatrices(value bool) QrAttr {
-	return func(m optionalAttr) {
-		m["full_matrices"] = value
-	}
-}
-
-// Computes the QR decompositions of one or more matrices.
-//
-// Computes the QR decomposition of each inner matrix in `tensor` such that
-// `tensor[..., :, :] = q[..., :, :] * r[..., :,:])`
+// Computes the gradients of 3-D convolution with respect to the input.
 //
-// ```python
-// # a is a tensor.
-// # q is a tensor of orthonormal matrices.
-// # r is a tensor of upper triangular matrices.
-// q, r = qr(a)
-// q_full, r_full = qr(a, full_matrices=True)
-// ```
+// DEPRECATED at GraphDef version 10: Use Conv3DBackpropInputV2
 //
 // Arguments:
-//	input: A tensor of shape `[..., M, N]` whose inner-most 2 dimensions
-// form matrices of size `[M, N]`. Let `P` be the minimum of `M` and `N`.
-//
-// Returns Orthonormal basis for range of `a`. If `full_matrices` is `False` then
-// shape is `[..., M, P]`; if `full_matrices` is `True` then shape is
-// `[..., M, M]`.Triangular factor. If `full_matrices` is `False` then shape is
-// `[..., P, N]`. If `full_matrices` is `True` then shape is `[..., M, N]`.
-func Qr(scope *Scope, input tf.Output, optional ...QrAttr) (q tf.Output, r tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "Qr",
-		Input: []tf.Input{
-			input,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
-}
-
-// Records the bytes size of each element of `input_dataset` in a StatsAggregator.
-func BytesProducedStatsDataset(scope *Scope, input_dataset tf.Output, tag tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+//	input: Shape `[batch, depth, rows, cols, in_channels]`.
+//	filter: Shape `[depth, rows, cols, in_channels, out_channels]`.
+// `in_channels` must match between `input` and `filter`.
+//	out_backprop: Backprop signal of shape `[batch, out_depth, out_rows, out_cols,
+// out_channels]`.
+//	strides: 1-D tensor of length 5. The stride of the sliding window for each
+// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+//	padding: The type of padding algorithm to use.
+func Conv3DBackpropInput(scope *Scope, input tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "BytesProducedStatsDataset",
+		Type: "Conv3DBackpropInput",
 		Input: []tf.Input{
-			input_dataset, tag,
+			input, filter, out_backprop,
 		},
 		Attrs: attrs,
 	}
@@ -11529,36 +11355,36 @@ func BytesProducedStatsDataset(scope *Scope, input_dataset tf.Output, tag tf.Out
 	return op.Output(0)
 }
 
-// ResourceSparseApplyProximalGradientDescentAttr is an optional argument to ResourceSparseApplyProximalGradientDescent.
-type ResourceSparseApplyProximalGradientDescentAttr func(optionalAttr)
+// ResourceApplyProximalAdagradAttr is an optional argument to ResourceApplyProximalAdagrad.
+type ResourceApplyProximalAdagradAttr func(optionalAttr)
 
-// ResourceSparseApplyProximalGradientDescentUseLocking sets the optional use_locking attribute to value.
+// ResourceApplyProximalAdagradUseLocking sets the optional use_locking attribute to value.
 //
-// value: If True, the subtraction will be protected by a lock;
-// otherwise the behavior is undefined, but may exhibit less contention.
+// value: If True, updating of the var and accum tensors will be protected by
+// a lock; otherwise the behavior is undefined, but may exhibit less contention.
 // If not specified, defaults to false
-func ResourceSparseApplyProximalGradientDescentUseLocking(value bool) ResourceSparseApplyProximalGradientDescentAttr {
+func ResourceApplyProximalAdagradUseLocking(value bool) ResourceApplyProximalAdagradAttr {
 	return func(m optionalAttr) {
 		m["use_locking"] = value
 	}
 }
 
-// Sparse update '*var' as FOBOS algorithm with fixed learning rate.
+// Update '*var' and '*accum' according to FOBOS with Adagrad learning rate.
 //
-// That is for rows we have grad for, we update var as follows:
-// prox_v = var - alpha * grad
-// var = sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0}
+// accum += grad * grad
+// prox_v = var - lr * grad * (1 / sqrt(accum))
+// var = sign(prox_v)/(1+lr*l2) * max{|prox_v|-lr*l1,0}
 //
 // Arguments:
 //	var_: Should be from a Variable().
-//	alpha: Scaling factor. Must be a scalar.
+//	accum: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
 //	l1: L1 regularization. Must be a scalar.
 //	l2: L2 regularization. Must be a scalar.
 //	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var and accum.
 //
 // Returns the created operation.
-func ResourceSparseApplyProximalGradientDescent(scope *Scope, var_ tf.Output, alpha tf.Output, l1 tf.Output, l2 tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyProximalGradientDescentAttr) (o *tf.Operation) {
+func ResourceApplyProximalAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, grad tf.Output, optional ...ResourceApplyProximalAdagradAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -11567,262 +11393,201 @@ func ResourceSparseApplyProximalGradientDescent(scope *Scope, var_ tf.Output, al
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyProximalGradientDescent",
+		Type: "ResourceApplyProximalAdagrad",
 		Input: []tf.Input{
-			var_, alpha, l1, l2, grad, indices,
+			var_, accum, lr, l1, l2, grad,
 		},
 		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// MeanAttr is an optional argument to Mean.
-type MeanAttr func(optionalAttr)
+// MutableHashTableOfTensorsV2Attr is an optional argument to MutableHashTableOfTensorsV2.
+type MutableHashTableOfTensorsV2Attr func(optionalAttr)
 
-// MeanKeepDims sets the optional keep_dims attribute to value.
+// MutableHashTableOfTensorsV2Container sets the optional container attribute to value.
 //
-// value: If true, retain reduced dimensions with length 1.
-// If not specified, defaults to false
-func MeanKeepDims(value bool) MeanAttr {
+// value: If non-empty, this table is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func MutableHashTableOfTensorsV2Container(value string) MutableHashTableOfTensorsV2Attr {
 	return func(m optionalAttr) {
-		m["keep_dims"] = value
+		m["container"] = value
 	}
 }
 
-// Computes the mean of elements across dimensions of a tensor.
-//
-// Reduces `input` along the dimensions given in `axis`. Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `axis`. If `keep_dims` is true, the reduced dimensions are
-// retained with length 1.
-//
-// Arguments:
-//	input: The tensor to reduce.
-//	axis: The dimensions to reduce. Must be in the range
-// `[-rank(input), rank(input))`.
+// MutableHashTableOfTensorsV2SharedName sets the optional shared_name attribute to value.
 //
-// Returns The reduced tensor.
-func Mean(scope *Scope, input tf.Output, axis tf.Output, optional ...MeanAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "Mean",
-		Input: []tf.Input{
-			input, axis,
-		},
-		Attrs: attrs,
+// value: If non-empty, this table is shared under the given name across
+// multiple sessions.
+// If not specified, defaults to ""
+func MutableHashTableOfTensorsV2SharedName(value string) MutableHashTableOfTensorsV2Attr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// InitializeTableFromTextFileV2Attr is an optional argument to InitializeTableFromTextFileV2.
-type InitializeTableFromTextFileV2Attr func(optionalAttr)
-
-// InitializeTableFromTextFileV2VocabSize sets the optional vocab_size attribute to value.
-//
-// value: Number of elements of the file, use -1 if unknown.
-// If not specified, defaults to -1
-//
-// REQUIRES: value >= -1
-func InitializeTableFromTextFileV2VocabSize(value int64) InitializeTableFromTextFileV2Attr {
+// MutableHashTableOfTensorsV2UseNodeNameSharing sets the optional use_node_name_sharing attribute to value.
+// If not specified, defaults to false
+func MutableHashTableOfTensorsV2UseNodeNameSharing(value bool) MutableHashTableOfTensorsV2Attr {
 	return func(m optionalAttr) {
-		m["vocab_size"] = value
+		m["use_node_name_sharing"] = value
 	}
 }
 
-// InitializeTableFromTextFileV2Delimiter sets the optional delimiter attribute to value.
-//
-// value: Delimiter to separate fields in a line.
-// If not specified, defaults to "\t"
-func InitializeTableFromTextFileV2Delimiter(value string) InitializeTableFromTextFileV2Attr {
+// MutableHashTableOfTensorsV2ValueShape sets the optional value_shape attribute to value.
+// If not specified, defaults to <>
+func MutableHashTableOfTensorsV2ValueShape(value tf.Shape) MutableHashTableOfTensorsV2Attr {
 	return func(m optionalAttr) {
-		m["delimiter"] = value
+		m["value_shape"] = value
 	}
 }
 
-// Initializes a table from a text file.
-//
-// It inserts one key-value pair into the table for each line of the file.
-// The key and value is extracted from the whole line content, elements from the
-// split line based on `delimiter` or the line number (starting from zero).
-// Where to extract the key and value from a line is specified by `key_index` and
-// `value_index`.
+// Creates an empty hash table.
 //
-// - A value of -1 means use the line number(starting from zero), expects `int64`.
-// - A value of -2 means use the whole line content, expects `string`.
-// - A value >= 0 means use the index (starting at zero) of the split line based
-//   on `delimiter`.
+// This op creates a mutable hash table, specifying the type of its keys and
+// values. Each value must be a vector. Data can be inserted into the table using
+// the insert operations. It does not support the initialization operation.
 //
 // Arguments:
-//	table_handle: Handle to a table which will be initialized.
-//	filename: Filename of a vocabulary text file.
-//	key_index: Column index in a line to get the table `key` values from.
-//	value_index: Column index that represents information of a line to get the table
-// `value` values from.
+//	key_dtype: Type of the table keys.
+//	value_dtype: Type of the table values.
 //
-// Returns the created operation.
-func InitializeTableFromTextFileV2(scope *Scope, table_handle tf.Output, filename tf.Output, key_index int64, value_index int64, optional ...InitializeTableFromTextFileV2Attr) (o *tf.Operation) {
+// Returns Handle to a table.
+func MutableHashTableOfTensorsV2(scope *Scope, key_dtype tf.DataType, value_dtype tf.DataType, optional ...MutableHashTableOfTensorsV2Attr) (table_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"key_index": key_index, "value_index": value_index}
+	attrs := map[string]interface{}{"key_dtype": key_dtype, "value_dtype": value_dtype}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "InitializeTableFromTextFileV2",
-		Input: []tf.Input{
-			table_handle, filename,
-		},
+		Type: "MutableHashTableOfTensorsV2",
+
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// QuantizedReluAttr is an optional argument to QuantizedRelu.
-type QuantizedReluAttr func(optionalAttr)
-
-// QuantizedReluOutType sets the optional out_type attribute to value.
-// If not specified, defaults to DT_QUINT8
-func QuantizedReluOutType(value tf.DataType) QuantizedReluAttr {
-	return func(m optionalAttr) {
-		m["out_type"] = value
-	}
-}
-
-// Computes Quantized Rectified Linear: `max(features, 0)`
+// Inverse 2D fast Fourier transform.
+//
+// Computes the inverse 2-dimensional discrete Fourier transform over the
+// inner-most 2 dimensions of `input`.
 //
 // Arguments:
+//	input: A complex64 tensor.
 //
-//	min_features: The float value that the lowest quantized value represents.
-//	max_features: The float value that the highest quantized value represents.
+// Returns A complex64 tensor of the same shape as `input`. The inner-most 2
+//   dimensions of `input` are replaced with their inverse 2D Fourier transform.
 //
-// Returns Has the same output shape as "features".The float value that the lowest quantized value represents.The float value that the highest quantized value represents.
-func QuantizedRelu(scope *Scope, features tf.Output, min_features tf.Output, max_features tf.Output, optional ...QuantizedReluAttr) (activations tf.Output, min_activations tf.Output, max_activations tf.Output) {
+// @compatibility(numpy)
+// Equivalent to np.fft.ifft2
+// @end_compatibility
+func IFFT2D(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "QuantizedRelu",
+		Type: "IFFT2D",
 		Input: []tf.Input{
-			features, min_features, max_features,
+			input,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Reshapes a SparseTensor to represent values in a new dense shape.
-//
-// This operation has the same semantics as reshape on the represented dense
-// tensor.  The `input_indices` are recomputed based on the requested `new_shape`.
+// Creates a tensor filled with a scalar value.
 //
-// If one component of `new_shape` is the special value -1, the size of that
-// dimension is computed so that the total dense size remains constant.  At
-// most one component of `new_shape` can be -1.  The number of dense elements
-// implied by `new_shape` must be the same as the number of dense elements
-// originally implied by `input_shape`.
+// This operation creates a tensor of shape `dims` and fills it with `value`.
 //
-// Reshaping does not affect the order of values in the SparseTensor.
+// For example:
 //
-// If the input tensor has rank `R_in` and `N` non-empty values, and `new_shape`
-// has length `R_out`, then `input_indices` has shape `[N, R_in]`,
-// `input_shape` has length `R_in`, `output_indices` has shape `[N, R_out]`, and
-// `output_shape` has length `R_out`.
+// ```
+// # Output tensor has shape [2, 3].
+// fill([2, 3], 9) ==> [[9, 9, 9]
+//                      [9, 9, 9]]
+// ```
 //
 // Arguments:
-//	input_indices: 2-D.  `N x R_in` matrix with the indices of non-empty values in a
-// SparseTensor.
-//	input_shape: 1-D.  `R_in` vector with the input SparseTensor's dense shape.
-//	new_shape: 1-D.  `R_out` vector with the requested new dense shape.
+//	dims: 1-D. Represents the shape of the output tensor.
+//	value: 0-D (scalar). Value to fill the returned tensor.
 //
-// Returns 2-D.  `N x R_out` matrix with the updated indices of non-empty
-// values in the output SparseTensor.1-D.  `R_out` vector with the full dense shape of the output
-// SparseTensor.  This is the same as `new_shape` but with any -1 dimensions
-// filled in.
-func SparseReshape(scope *Scope, input_indices tf.Output, input_shape tf.Output, new_shape tf.Output) (output_indices tf.Output, output_shape tf.Output) {
+// @compatibility(numpy)
+// Equivalent to np.full
+// @end_compatibility
+func Fill(scope *Scope, dims tf.Output, value tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseReshape",
+		Type: "Fill",
 		Input: []tf.Input{
-			input_indices, input_shape, new_shape,
+			dims, value,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// Deprecated. Use TensorArraySplitV3
+// 2D fast Fourier transform.
 //
-// DEPRECATED at GraphDef version 26: Use TensorArraySplitV3
-func TensorArraySplitV2(scope *Scope, handle tf.Output, value tf.Output, lengths tf.Output, flow_in tf.Output) (flow_out tf.Output) {
+// Computes the 2-dimensional discrete Fourier transform over the inner-most
+// 2 dimensions of `input`.
+//
+// Arguments:
+//	input: A complex64 tensor.
+//
+// Returns A complex64 tensor of the same shape as `input`. The inner-most 2
+//   dimensions of `input` are replaced with their 2D Fourier transform.
+//
+// @compatibility(numpy)
+// Equivalent to np.fft.fft2
+// @end_compatibility
+func FFT2D(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "TensorArraySplitV2",
+		Type: "FFT2D",
 		Input: []tf.Input{
-			handle, value, lengths, flow_in,
+			input,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// PackAttr is an optional argument to Pack.
-type PackAttr func(optionalAttr)
+// ResourceApplyProximalGradientDescentAttr is an optional argument to ResourceApplyProximalGradientDescent.
+type ResourceApplyProximalGradientDescentAttr func(optionalAttr)
 
-// PackAxis sets the optional axis attribute to value.
+// ResourceApplyProximalGradientDescentUseLocking sets the optional use_locking attribute to value.
 //
-// value: Dimension along which to pack.  Negative values wrap around, so the
-// valid range is `[-(R+1), R+1)`.
-// If not specified, defaults to 0
-func PackAxis(value int64) PackAttr {
+// value: If True, the subtraction will be protected by a lock;
+// otherwise the behavior is undefined, but may exhibit less contention.
+// If not specified, defaults to false
+func ResourceApplyProximalGradientDescentUseLocking(value bool) ResourceApplyProximalGradientDescentAttr {
 	return func(m optionalAttr) {
-		m["axis"] = value
+		m["use_locking"] = value
 	}
 }
 
-// Packs a list of `N` rank-`R` tensors into one rank-`(R+1)` tensor.
-//
-// Packs the `N` tensors in `values` into a tensor with rank one higher than each
-// tensor in `values`, by packing them along the `axis` dimension.
-// Given a list of tensors of shape `(A, B, C)`;
-//
-// if `axis == 0` then the `output` tensor will have the shape `(N, A, B, C)`.
-// if `axis == 1` then the `output` tensor will have the shape `(A, N, B, C)`.
-// Etc.
-//
-// For example:
-//
-// ```
-// # 'x' is [1, 4]
-// # 'y' is [2, 5]
-// # 'z' is [3, 6]
-// pack([x, y, z]) => [[1, 4], [2, 5], [3, 6]]  # Pack along first dim.
-// pack([x, y, z], axis=1) => [[1, 2, 3], [4, 5, 6]]
-// ```
+// Update '*var' as FOBOS algorithm with fixed learning rate.
 //
-// This is the opposite of `unpack`.
+// prox_v = var - alpha * delta
+// var = sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0}
 //
 // Arguments:
-//	values: Must be of same shape and type.
+//	var_: Should be from a Variable().
+//	alpha: Scaling factor. Must be a scalar.
+//	l1: L1 regularization. Must be a scalar.
+//	l2: L2 regularization. Must be a scalar.
+//	delta: The change.
 //
-// Returns The packed tensor.
-func Pack(scope *Scope, values []tf.Output, optional ...PackAttr) (output tf.Output) {
+// Returns the created operation.
+func ResourceApplyProximalGradientDescent(scope *Scope, var_ tf.Output, alpha tf.Output, l1 tf.Output, l2 tf.Output, delta tf.Output, optional ...ResourceApplyProximalGradientDescentAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -11831,246 +11596,245 @@ func Pack(scope *Scope, values []tf.Output, optional ...PackAttr) (output tf.Out
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Pack",
+		Type: "ResourceApplyProximalGradientDescent",
 		Input: []tf.Input{
-			tf.OutputList(values),
+			var_, alpha, l1, l2, delta,
 		},
 		Attrs: attrs,
 	}
+	return scope.AddOperation(opspec)
+}
+
+// Computes the gradient for the sqrt of `x` wrt its input.
+//
+// Specifically, `grad = dy * 0.5 / y`, where `y = sqrt(x)`, and `dy`
+// is the corresponding input gradient.
+func SqrtGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SqrtGrad",
+		Input: []tf.Input{
+			y, dy,
+		},
+	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Reorders a SparseTensor into the canonical, row-major ordering.
-//
-// Note that by convention, all sparse ops preserve the canonical ordering along
-// increasing dimension number. The only time ordering can be violated is during
-// manual manipulation of the indices and values vectors to add entries.
-//
-// Reordering does not affect the shape of the SparseTensor.
-//
-// If the tensor has rank `R` and `N` non-empty values, `input_indices` has
-// shape `[N, R]`, input_values has length `N`, and input_shape has length `R`.
+// Get the value of the tensor specified by its handle.
 //
 // Arguments:
-//	input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, possibly not in canonical ordering.
-//	input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
-//	input_shape: 1-D.  Shape of the input SparseTensor.
+//	handle: The handle for a tensor stored in the session state.
+//	dtype: The type of the output value.
 //
-// Returns 2-D.  `N x R` matrix with the same indices as input_indices, but
-// in canonical row-major ordering.1-D.  `N` non-empty values corresponding to `output_indices`.
-func SparseReorder(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output) (output_indices tf.Output, output_values tf.Output) {
+// Returns The tensor for the given handle.
+func GetSessionTensor(scope *Scope, handle tf.Output, dtype tf.DataType) (value tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"dtype": dtype}
 	opspec := tf.OpSpec{
-		Type: "SparseReorder",
+		Type: "GetSessionTensor",
 		Input: []tf.Input{
-			input_indices, input_values, input_shape,
+			handle,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// Computes rectified linear: `max(features, 0)`.
-func Relu(scope *Scope, features tf.Output) (activations tf.Output) {
+// Returns x - y element-wise.
+//
+// *NOTE*: `Subtract` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Sub(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Relu",
+		Type: "Sub",
 		Input: []tf.Input{
-			features,
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResourceApplyAddSignAttr is an optional argument to ResourceApplyAddSign.
-type ResourceApplyAddSignAttr func(optionalAttr)
+// LogUniformCandidateSamplerAttr is an optional argument to LogUniformCandidateSampler.
+type LogUniformCandidateSamplerAttr func(optionalAttr)
 
-// ResourceApplyAddSignUseLocking sets the optional use_locking attribute to value.
+// LogUniformCandidateSamplerSeed sets the optional seed attribute to value.
 //
-// value: If `True`, updating of the var and m tensors is
-// protected by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceApplyAddSignUseLocking(value bool) ResourceApplyAddSignAttr {
+// value: If either seed or seed2 are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func LogUniformCandidateSamplerSeed(value int64) LogUniformCandidateSamplerAttr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["seed"] = value
 	}
 }
 
-// Update '*var' according to the AddSign update.
+// LogUniformCandidateSamplerSeed2 sets the optional seed2 attribute to value.
 //
-// m_t <- beta1 * m_{t-1} + (1 - beta1) * g
-// update <- (alpha + sign_decay * sign(g) *sign(m)) * g
-// variable <- variable - lr_t * update
+// value: An second seed to avoid seed collision.
+// If not specified, defaults to 0
+func LogUniformCandidateSamplerSeed2(value int64) LogUniformCandidateSamplerAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Generates labels for candidate sampling with a log-uniform distribution.
 //
-// Arguments:
-//	var_: Should be from a Variable().
-//	m: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	alpha: Must be a scalar.
-//	sign_decay: Must be a scalar.
-//	beta: Must be a scalar.
-//	grad: The gradient.
+// See explanations of candidate sampling and the data formats at
+// go/candidate-sampling.
 //
-// Returns the created operation.
-func ResourceApplyAddSign(scope *Scope, var_ tf.Output, m tf.Output, lr tf.Output, alpha tf.Output, sign_decay tf.Output, beta tf.Output, grad tf.Output, optional ...ResourceApplyAddSignAttr) (o *tf.Operation) {
+// For each batch, this op picks a single set of sampled candidate labels.
+//
+// The advantages of sampling candidates per-batch are simplicity and the
+// possibility of efficient dense matrix multiplication. The disadvantage is that
+// the sampled candidates must be chosen independently of the context and of the
+// true labels.
+//
+// Arguments:
+//	true_classes: A batch_size * num_true matrix, in which each row contains the
+// IDs of the num_true target_classes in the corresponding original label.
+//	num_true: Number of true labels per context.
+//	num_sampled: Number of candidates to randomly sample.
+//	unique: If unique is true, we sample with rejection, so that all sampled
+// candidates in a batch are unique. This requires some approximation to
+// estimate the post-rejection sampling probabilities.
+//	range_max: The sampler will sample integers from the interval [0, range_max).
+//
+// Returns A vector of length num_sampled, in which each element is
+// the ID of a sampled candidate.A batch_size * num_true matrix, representing
+// the number of times each candidate is expected to occur in a batch
+// of sampled candidates. If unique=true, then this is a probability.A vector of length num_sampled, for each sampled
+// candidate representing the number of times the candidate is expected
+// to occur in a batch of sampled candidates.  If unique=true, then this is a
+// probability.
+func LogUniformCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, range_max int64, optional ...LogUniformCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique, "range_max": range_max}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyAddSign",
+		Type: "LogUniformCandidateSampler",
 		Input: []tf.Input{
-			var_, m, lr, alpha, sign_decay, beta, grad,
+			true_classes,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
-}
-
-// FractionalMaxPoolGradAttr is an optional argument to FractionalMaxPoolGrad.
-type FractionalMaxPoolGradAttr func(optionalAttr)
-
-// FractionalMaxPoolGradOverlapping sets the optional overlapping attribute to value.
-//
-// value: When set to True, it means when pooling, the values at the boundary
-// of adjacent pooling cells are used by both cells. For example:
-//
-// `index  0  1  2  3  4`
-//
-// `value  20 5  16 3  7`
-//
-// If the pooling sequence is [0, 2, 4], then 16, at index 2 will be used twice.
-// The result would be [20, 16] for fractional max pooling.
-// If not specified, defaults to false
-func FractionalMaxPoolGradOverlapping(value bool) FractionalMaxPoolGradAttr {
-	return func(m optionalAttr) {
-		m["overlapping"] = value
-	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Computes gradient of the FractionalMaxPool function.
-//
-// Arguments:
-//	orig_input: Original input for `fractional_max_pool`
-//	orig_output: Original output for `fractional_max_pool`
-//	out_backprop: 4-D with shape `[batch, height, width, channels]`.  Gradients
-// w.r.t. the output of `fractional_max_pool`.
-//	row_pooling_sequence: row pooling sequence, form pooling region with
-// col_pooling_sequence.
-//	col_pooling_sequence: column pooling sequence, form pooling region with
-// row_pooling sequence.
+// Returns the max of x and y (i.e. x > y ? x : y) element-wise.
 //
-// Returns 4-D.  Gradients w.r.t. the input of `fractional_max_pool`.
-func FractionalMaxPoolGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, out_backprop tf.Output, row_pooling_sequence tf.Output, col_pooling_sequence tf.Output, optional ...FractionalMaxPoolGradAttr) (output tf.Output) {
+// *NOTE*: `Maximum` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Maximum(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "FractionalMaxPoolGrad",
+		Type: "Maximum",
 		Input: []tf.Input{
-			orig_input, orig_output, out_backprop, row_pooling_sequence, col_pooling_sequence,
+			x, y,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResourceApplyAdagradDAAttr is an optional argument to ResourceApplyAdagradDA.
-type ResourceApplyAdagradDAAttr func(optionalAttr)
-
-// ResourceApplyAdagradDAUseLocking sets the optional use_locking attribute to value.
+// Computes softmax cross entropy cost and gradients to backpropagate.
 //
-// value: If True, updating of the var and accum tensors will be protected by
-// a lock; otherwise the behavior is undefined, but may exhibit less contention.
-// If not specified, defaults to false
-func ResourceApplyAdagradDAUseLocking(value bool) ResourceApplyAdagradDAAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// Update '*var' according to the proximal adagrad scheme.
+// Inputs are the logits, not probabilities.
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	gradient_accumulator: Should be from a Variable().
-//	gradient_squared_accumulator: Should be from a Variable().
-//	grad: The gradient.
-//	lr: Scaling factor. Must be a scalar.
-//	l1: L1 regularization. Must be a scalar.
-//	l2: L2 regularization. Must be a scalar.
-//	global_step: Training step number. Must be a scalar.
+//	features: batch_size x num_classes matrix
+//	labels: batch_size x num_classes matrix
+// The caller must ensure that each batch of labels represents a valid
+// probability distribution.
 //
-// Returns the created operation.
-func ResourceApplyAdagradDA(scope *Scope, var_ tf.Output, gradient_accumulator tf.Output, gradient_squared_accumulator tf.Output, grad tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, global_step tf.Output, optional ...ResourceApplyAdagradDAAttr) (o *tf.Operation) {
+// Returns Per example loss (batch_size vector).backpropagated gradients (batch_size x num_classes matrix).
+func SoftmaxCrossEntropyWithLogits(scope *Scope, features tf.Output, labels tf.Output) (loss tf.Output, backprop tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyAdagradDA",
+		Type: "SoftmaxCrossEntropyWithLogits",
 		Input: []tf.Input{
-			var_, gradient_accumulator, gradient_squared_accumulator, grad, lr, l1, l2, global_step,
+			features, labels,
 		},
-		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
 }
 
-// SparseReduceMaxSparseAttr is an optional argument to SparseReduceMaxSparse.
-type SparseReduceMaxSparseAttr func(optionalAttr)
+// ReduceJoinAttr is an optional argument to ReduceJoin.
+type ReduceJoinAttr func(optionalAttr)
 
-// SparseReduceMaxSparseKeepDims sets the optional keep_dims attribute to value.
+// ReduceJoinKeepDims sets the optional keep_dims attribute to value.
 //
-// value: If true, retain reduced dimensions with length 1.
+// value: If `True`, retain reduced dimensions with length `1`.
 // If not specified, defaults to false
-func SparseReduceMaxSparseKeepDims(value bool) SparseReduceMaxSparseAttr {
+func ReduceJoinKeepDims(value bool) ReduceJoinAttr {
 	return func(m optionalAttr) {
 		m["keep_dims"] = value
 	}
 }
 
-// Computes the max of elements across dimensions of a SparseTensor.
+// ReduceJoinSeparator sets the optional separator attribute to value.
 //
-// This Op takes a SparseTensor and is the sparse counterpart to
-// `tf.reduce_max()`.  In contrast to SparseReduceMax, this Op returns a
-// SparseTensor.
+// value: The separator to use when joining.
+// If not specified, defaults to ""
+func ReduceJoinSeparator(value string) ReduceJoinAttr {
+	return func(m optionalAttr) {
+		m["separator"] = value
+	}
+}
+
+// Joins a string Tensor across the given dimensions.
 //
-// Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
-// with length 1.
+// Computes the string join across dimensions in the given string Tensor of shape
+// `[d_0, d_1, ..., d_n-1]`.  Returns a new Tensor created by joining the input
+// strings with the given separator (default: empty string).  Negative indices are
+// counted backwards from the end, with `-1` being equivalent to `n - 1`.
 //
-// If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
-// with a single element is returned.  Additionally, the axes can be negative,
-// which are interpreted according to the indexing rules in Python.
+// For example:
+//
+// ```python
+// # tensor `a` is [["a", "b"], ["c", "d"]]
+// tf.reduce_join(a, 0) ==> ["ac", "bd"]
+// tf.reduce_join(a, 1) ==> ["ab", "cd"]
+// tf.reduce_join(a, -2) = tf.reduce_join(a, 0) ==> ["ac", "bd"]
+// tf.reduce_join(a, -1) = tf.reduce_join(a, 1) ==> ["ab", "cd"]
+// tf.reduce_join(a, 0, keep_dims=True) ==> [["ac", "bd"]]
+// tf.reduce_join(a, 1, keep_dims=True) ==> [["ab"], ["cd"]]
+// tf.reduce_join(a, 0, separator=".") ==> ["a.c", "b.d"]
+// tf.reduce_join(a, [0, 1]) ==> ["acbd"]
+// tf.reduce_join(a, [1, 0]) ==> ["abcd"]
+// tf.reduce_join(a, []) ==> ["abcd"]
+// ```
 //
 // Arguments:
-//	input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, possibly not in canonical ordering.
-//	input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
-//	input_shape: 1-D.  Shape of the input SparseTensor.
-//	reduction_axes: 1-D.  Length-`K` vector containing the reduction axes.
-func SparseReduceMaxSparse(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output, reduction_axes tf.Output, optional ...SparseReduceMaxSparseAttr) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
+//	inputs: The input to be joined.  All reduced indices must have non-zero size.
+//	reduction_indices: The dimensions to reduce over.  Dimensions are reduced in the
+// order specified.  Omitting `reduction_indices` is equivalent to passing
+// `[n-1, n-2, ..., 0]`.  Negative indices from `-n` to `-1` are supported.
+//
+// Returns Has shape equal to that of the input with reduced dimensions removed or
+// set to `1` depending on `keep_dims`.
+func ReduceJoin(scope *Scope, inputs tf.Output, reduction_indices tf.Output, optional ...ReduceJoinAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -12079,99 +11843,88 @@ func SparseReduceMaxSparse(scope *Scope, input_indices tf.Output, input_values t
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseReduceMaxSparse",
+		Type: "ReduceJoin",
 		Input: []tf.Input{
-			input_indices, input_values, input_shape, reduction_axes,
+			inputs, reduction_indices,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Creates a dataset that emits the outputs of `input_dataset` `count` times.
-//
-// Arguments:
-//
-//	count: A scalar representing the number of times that `input_dataset` should
-// be repeated. A value of `-1` indicates that it should be repeated infinitely.
-//
-//
-func RepeatDataset(scope *Scope, input_dataset tf.Output, count tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// Computes cos of x element-wise.
+func Cos(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "RepeatDataset",
+		Type: "Cos",
 		Input: []tf.Input{
-			input_dataset, count,
+			x,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// AddManySparseToTensorsMapAttr is an optional argument to AddManySparseToTensorsMap.
-type AddManySparseToTensorsMapAttr func(optionalAttr)
+// FusedBatchNormGradAttr is an optional argument to FusedBatchNormGrad.
+type FusedBatchNormGradAttr func(optionalAttr)
 
-// AddManySparseToTensorsMapContainer sets the optional container attribute to value.
+// FusedBatchNormGradEpsilon sets the optional epsilon attribute to value.
 //
-// value: The container name for the `SparseTensorsMap` created by this op.
-// If not specified, defaults to ""
-func AddManySparseToTensorsMapContainer(value string) AddManySparseToTensorsMapAttr {
+// value: A small float number added to the variance of x.
+// If not specified, defaults to 0.0001
+func FusedBatchNormGradEpsilon(value float32) FusedBatchNormGradAttr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["epsilon"] = value
 	}
 }
 
-// AddManySparseToTensorsMapSharedName sets the optional shared_name attribute to value.
+// FusedBatchNormGradDataFormat sets the optional data_format attribute to value.
 //
-// value: The shared name for the `SparseTensorsMap` created by this op.
-// If blank, the new Operation's unique name is used.
-// If not specified, defaults to ""
-func AddManySparseToTensorsMapSharedName(value string) AddManySparseToTensorsMapAttr {
+// value: The data format for y_backprop, x, x_backprop.
+// Either "NHWC" (default) or "NCHW".
+// If not specified, defaults to "NHWC"
+func FusedBatchNormGradDataFormat(value string) FusedBatchNormGradAttr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["data_format"] = value
 	}
 }
 
-// Add an `N`-minibatch `SparseTensor` to a `SparseTensorsMap`, return `N` handles.
+// FusedBatchNormGradIsTraining sets the optional is_training attribute to value.
 //
-// A `SparseTensor` of rank `R` is represented by three tensors: `sparse_indices`,
-// `sparse_values`, and `sparse_shape`, where
+// value: A bool value to indicate the operation is for training (default)
+// or inference.
+// If not specified, defaults to true
+func FusedBatchNormGradIsTraining(value bool) FusedBatchNormGradAttr {
+	return func(m optionalAttr) {
+		m["is_training"] = value
+	}
+}
+
+// Gradient for batch normalization.
 //
-// ```sparse_indices.shape[1] == sparse_shape.shape[0] == R```
-//
-// An `N`-minibatch of `SparseTensor` objects is represented as a `SparseTensor`
-// having a first `sparse_indices` column taking values between `[0, N)`, where
-// the minibatch size `N == sparse_shape[0]`.
-//
-// The input `SparseTensor` must have rank `R` greater than 1, and the first
-// dimension is treated as the minibatch dimension.  Elements of the `SparseTensor`
-// must be sorted in increasing order of this first dimension.  The stored
-// `SparseTensor` objects pointed to by each row of the output `sparse_handles`
-// will have rank `R-1`.
-//
-// The `SparseTensor` values can then be read out as part of a minibatch by passing
-// the given keys as vector elements to `TakeManySparseFromTensorsMap`.  To ensure
-// the correct `SparseTensorsMap` is accessed, ensure that the same
-// `container` and `shared_name` are passed to that Op.  If no `shared_name`
-// is provided here, instead use the *name* of the Operation created by calling
-// `AddManySparseToTensorsMap` as the `shared_name` passed to
-// `TakeManySparseFromTensorsMap`.  Ensure the Operations are colocated.
+// Note that the size of 4D Tensors are defined by either "NHWC" or "NCHW".
+// The size of 1D Tensors matches the dimension C of the 4D Tensors.
 //
 // Arguments:
-//	sparse_indices: 2-D.  The `indices` of the minibatch `SparseTensor`.
-// `sparse_indices[:, 0]` must be ordered values in `[0, N)`.
-//	sparse_values: 1-D.  The `values` of the minibatch `SparseTensor`.
-//	sparse_shape: 1-D.  The `shape` of the minibatch `SparseTensor`.
-// The minibatch size `N == sparse_shape[0]`.
+//	y_backprop: A 4D Tensor for the gradient with respect to y.
+//	x: A 4D Tensor for input data.
+//	scale: A 1D Tensor for scaling factor, to scale the normalized x.
+//	reserve_space_1: When is_training is True, a 1D Tensor for the computed batch
+// mean to be reused in gradient computation. When is_training is
+// False, a 1D Tensor for the population mean to be reused in both
+// 1st and 2nd order gradient computation.
+//	reserve_space_2: When is_training is True, a 1D Tensor for the computed batch
+// variance (inverted variance in the cuDNN case) to be reused in
+// gradient computation. When is_training is False, a 1D Tensor
+// for the population variance to be reused in both 1st and 2nd
+// order gradient computation.
 //
-// Returns 1-D.  The handles of the `SparseTensor` now stored in the
-// `SparseTensorsMap`.  Shape: `[N]`.
-func AddManySparseToTensorsMap(scope *Scope, sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output, optional ...AddManySparseToTensorsMapAttr) (sparse_handles tf.Output) {
+// Returns A 4D Tensor for the gradient with respect to x.A 1D Tensor for the gradient with respect to scale.A 1D Tensor for the gradient with respect to offset.Unused placeholder to match the mean input in FusedBatchNorm.Unused placeholder to match the variance input
+// in FusedBatchNorm.
+func FusedBatchNormGrad(scope *Scope, y_backprop tf.Output, x tf.Output, scale tf.Output, reserve_space_1 tf.Output, reserve_space_2 tf.Output, optional ...FusedBatchNormGradAttr) (x_backprop tf.Output, scale_backprop tf.Output, offset_backprop tf.Output, reserve_space_3 tf.Output, reserve_space_4 tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -12180,70 +11933,106 @@ func AddManySparseToTensorsMap(scope *Scope, sparse_indices tf.Output, sparse_va
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "AddManySparseToTensorsMap",
+		Type: "FusedBatchNormGrad",
 		Input: []tf.Input{
-			sparse_indices, sparse_values, sparse_shape,
+			y_backprop, x, scale, reserve_space_1, reserve_space_2,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
 }
 
-// Concatenates tensors along one dimension.
+// TopKAttr is an optional argument to TopK.
+type TopKAttr func(optionalAttr)
+
+// TopKSorted sets the optional sorted attribute to value.
+//
+// value: If true the resulting `k` elements will be sorted by the values in
+// descending order.
+// If not specified, defaults to true
+func TopKSorted(value bool) TopKAttr {
+	return func(m optionalAttr) {
+		m["sorted"] = value
+	}
+}
+
+// Finds values and indices of the `k` largest elements for the last dimension.
+//
+// DEPRECATED at GraphDef version 7: Use TopKV2 instead
+//
+// If the input is a vector (rank-1), finds the `k` largest entries in the vector
+// and outputs their values and indices as vectors.  Thus `values[j]` is the
+// `j`-th largest entry in `input`, and its index is `indices[j]`.
+//
+// For matrices (resp. higher rank input), computes the top `k` entries in each
+// row (resp. vector along the last dimension).  Thus,
+//
+//     values.shape = indices.shape = input.shape[:-1] + [k]
+//
+// If two elements are equal, the lower-index element appears first.
+//
+// If `k` varies dynamically, use `TopKV2` below.
 //
 // Arguments:
-//	values: List of `N` Tensors to concatenate. Their ranks and types must match,
-// and their sizes must match in all dimensions except `concat_dim`.
-//	axis: 0-D.  The dimension along which to concatenate.  Must be in the
-// range [-rank(values), rank(values)).
+//	input: 1-D or higher with last dimension at least `k`.
+//	k: Number of top elements to look for along the last dimension (along each
+// row for matrices).
 //
-// Returns A `Tensor` with the concatenation of values stacked along the
-// `concat_dim` dimension.  This tensor's shape matches that of `values` except
-// in `concat_dim` where it has the sum of the sizes.
-func ConcatV2(scope *Scope, values []tf.Output, axis tf.Output) (output tf.Output) {
+// Returns The `k` largest elements along each last dimensional slice.The indices of `values` within the last dimension of `input`.
+func TopK(scope *Scope, input tf.Output, k int64, optional ...TopKAttr) (values tf.Output, indices tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"k": k}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "ConcatV2",
+		Type: "TopK",
 		Input: []tf.Input{
-			tf.OutputList(values), axis,
+			input,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// Reads and outputs the entire contents of the input filename.
-func ReadFile(scope *Scope, filename tf.Output) (contents tf.Output) {
+// Compute the Hurwitz zeta function \\(\zeta(x, q)\\).
+//
+// The Hurwitz zeta function is defined as:
+//
+//
+// \\(\zeta(x, q) = \sum_{n=0}^{\infty} (q + n)^{-x}\\)
+func Zeta(scope *Scope, x tf.Output, q tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ReadFile",
+		Type: "Zeta",
 		Input: []tf.Input{
-			filename,
+			x, q,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// MinAttr is an optional argument to Min.
-type MinAttr func(optionalAttr)
+// ProdAttr is an optional argument to Prod.
+type ProdAttr func(optionalAttr)
 
-// MinKeepDims sets the optional keep_dims attribute to value.
+// ProdKeepDims sets the optional keep_dims attribute to value.
 //
 // value: If true, retain reduced dimensions with length 1.
 // If not specified, defaults to false
-func MinKeepDims(value bool) MinAttr {
+func ProdKeepDims(value bool) ProdAttr {
 	return func(m optionalAttr) {
 		m["keep_dims"] = value
 	}
 }
 
-// Computes the minimum of elements across dimensions of a tensor.
+// Computes the product of elements across dimensions of a tensor.
 //
 // Reduces `input` along the dimensions given in `axis`. Unless
 // `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
@@ -12256,7 +12045,7 @@ func MinKeepDims(value bool) MinAttr {
 // `[-rank(input), rank(input))`.
 //
 // Returns The reduced tensor.
-func Min(scope *Scope, input tf.Output, axis tf.Output, optional ...MinAttr) (output tf.Output) {
+func Prod(scope *Scope, input tf.Output, axis tf.Output, optional ...ProdAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -12265,7 +12054,7 @@ func Min(scope *Scope, input tf.Output, axis tf.Output, optional ...MinAttr) (ou
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Min",
+		Type: "Prod",
 		Input: []tf.Input{
 			input, axis,
 		},
@@ -12275,159 +12064,224 @@ func Min(scope *Scope, input tf.Output, axis tf.Output, optional ...MinAttr) (ou
 	return op.Output(0)
 }
 
-// Shuffle dimensions of x according to a permutation.
+// FusedResizeAndPadConv2DAttr is an optional argument to FusedResizeAndPadConv2D.
+type FusedResizeAndPadConv2DAttr func(optionalAttr)
+
+// FusedResizeAndPadConv2DResizeAlignCorners sets the optional resize_align_corners attribute to value.
 //
-// The output `y` has the same rank as `x`. The shapes of `x` and `y` satisfy:
-//   `y.shape[i] == x.shape[perm[i]] for i in [0, 1, ..., rank(x) - 1]`
-func Transpose(scope *Scope, x tf.Output, perm tf.Output) (y tf.Output) {
+// value: If true, rescale input by (new_height - 1) / (height - 1),
+// which exactly aligns the 4 corners of images and resized images. If false, rescale
+// by new_height / height. Treat similarly the width dimension.
+// If not specified, defaults to false
+func FusedResizeAndPadConv2DResizeAlignCorners(value bool) FusedResizeAndPadConv2DAttr {
+	return func(m optionalAttr) {
+		m["resize_align_corners"] = value
+	}
+}
+
+// Performs a resize and padding as a preprocess during a convolution.
+//
+// It's often possible to do spatial transformations more efficiently as part of
+// the packing stage of a convolution, so this op allows for an optimized
+// implementation where these stages are fused together. This prevents the need to
+// write out the intermediate results as whole tensors, reducing memory pressure,
+// and we can get some latency gains by merging the transformation calculations.
+// The data_format attribute for Conv2D isn't supported by this op, and defaults to
+// 'NHWC' order.
+// Internally this op uses a single per-graph scratch buffer, which means that it
+// will block if multiple versions are being run in parallel. This is because this
+// operator is primarily an optimization to minimize memory usage.
+//
+// Arguments:
+//	input: 4-D with shape `[batch, in_height, in_width, in_channels]`.
+//	size: A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
+// new size for the images.
+//	paddings: A two-column matrix specifying the padding sizes. The number of
+// rows must be the same as the rank of `input`.
+//	filter: 4-D with shape
+// `[filter_height, filter_width, in_channels, out_channels]`.
+//
+//	strides: 1-D of length 4.  The stride of the sliding window for each dimension
+// of `input`. Must be in the same order as the dimension specified with format.
+//	padding: The type of padding algorithm to use.
+func FusedResizeAndPadConv2D(scope *Scope, input tf.Output, size tf.Output, paddings tf.Output, filter tf.Output, mode string, strides []int64, padding string, optional ...FusedResizeAndPadConv2DAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"mode": mode, "strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Transpose",
+		Type: "FusedResizeAndPadConv2D",
 		Input: []tf.Input{
-			x, perm,
+			input, size, paddings, filter,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes sigmoid of `x` element-wise.
+// Inverse 3D fast Fourier transform.
 //
-// Specifically, `y = 1 / (1 + exp(-x))`.
-func Sigmoid(scope *Scope, x tf.Output) (y tf.Output) {
+// Computes the inverse 3-dimensional discrete Fourier transform over the
+// inner-most 3 dimensions of `input`.
+//
+// Arguments:
+//	input: A complex64 tensor.
+//
+// Returns A complex64 tensor of the same shape as `input`. The inner-most 3
+//   dimensions of `input` are replaced with their inverse 3D Fourier transform.
+//
+// @compatibility(numpy)
+// Equivalent to np.fft.ifftn with 3 dimensions.
+// @end_compatibility
+func IFFT3D(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Sigmoid",
+		Type: "IFFT3D",
 		Input: []tf.Input{
-			x,
+			input,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// FusedBatchNormAttr is an optional argument to FusedBatchNorm.
-type FusedBatchNormAttr func(optionalAttr)
-
-// FusedBatchNormEpsilon sets the optional epsilon attribute to value.
+// Adds `bias` to `value`.
 //
-// value: A small float number added to the variance of x.
-// If not specified, defaults to 0.0001
-func FusedBatchNormEpsilon(value float32) FusedBatchNormAttr {
-	return func(m optionalAttr) {
-		m["epsilon"] = value
-	}
-}
-
-// FusedBatchNormDataFormat sets the optional data_format attribute to value.
+// This is a deprecated version of BiasAdd and will be soon removed.
 //
-// value: The data format for x and y. Either "NHWC" (default) or "NCHW".
-// If not specified, defaults to "NHWC"
-func FusedBatchNormDataFormat(value string) FusedBatchNormAttr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// FusedBatchNormIsTraining sets the optional is_training attribute to value.
+// This is a special case of `tf.add` where `bias` is restricted to be 1-D.
+// Broadcasting is supported, so `value` may have any number of dimensions.
 //
-// value: A bool value to indicate the operation is for training (default)
-// or inference.
-// If not specified, defaults to true
-func FusedBatchNormIsTraining(value bool) FusedBatchNormAttr {
-	return func(m optionalAttr) {
-		m["is_training"] = value
+// Arguments:
+//	value: Any number of dimensions.
+//	bias: 1-D with size the last dimension of `value`.
+//
+// Returns Broadcasted sum of `value` and `bias`.
+func BiasAddV1(scope *Scope, value tf.Output, bias tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "BiasAddV1",
+		Input: []tf.Input{
+			value, bias,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Batch normalization.
-//
-// Note that the size of 4D Tensors are defined by either "NHWC" or "NCHW".
-// The size of 1D Tensors matches the dimension C of the 4D Tensors.
+// Reverses specific dimensions of a tensor.
+//
+// NOTE `tf.reverse` has now changed behavior in preparation for 1.0.
+// `tf.reverse_v2` is currently an alias that will be deprecated before TF 1.0.
+//
+// Given a `tensor`, and a `int32` tensor `axis` representing the set of
+// dimensions of `tensor` to reverse. This operation reverses each dimension
+// `i` for which there exists `j` s.t. `axis[j] == i`.
+//
+// `tensor` can have up to 8 dimensions. The number of dimensions specified
+// in `axis` may be 0 or more entries. If an index is specified more than
+// once, a InvalidArgument error is raised.
+//
+// For example:
+//
+// ```
+// # tensor 't' is [[[[ 0,  1,  2,  3],
+// #                  [ 4,  5,  6,  7],
+// #                  [ 8,  9, 10, 11]],
+// #                 [[12, 13, 14, 15],
+// #                  [16, 17, 18, 19],
+// #                  [20, 21, 22, 23]]]]
+// # tensor 't' shape is [1, 2, 3, 4]
+//
+// # 'dims' is [3] or 'dims' is [-1]
+// reverse(t, dims) ==> [[[[ 3,  2,  1,  0],
+//                         [ 7,  6,  5,  4],
+//                         [ 11, 10, 9, 8]],
+//                        [[15, 14, 13, 12],
+//                         [19, 18, 17, 16],
+//                         [23, 22, 21, 20]]]]
+//
+// # 'dims' is '[1]' (or 'dims' is '[-3]')
+// reverse(t, dims) ==> [[[[12, 13, 14, 15],
+//                         [16, 17, 18, 19],
+//                         [20, 21, 22, 23]
+//                        [[ 0,  1,  2,  3],
+//                         [ 4,  5,  6,  7],
+//                         [ 8,  9, 10, 11]]]]
+//
+// # 'dims' is '[2]' (or 'dims' is '[-2]')
+// reverse(t, dims) ==> [[[[8, 9, 10, 11],
+//                         [4, 5, 6, 7],
+//                         [0, 1, 2, 3]]
+//                        [[20, 21, 22, 23],
+//                         [16, 17, 18, 19],
+//                         [12, 13, 14, 15]]]]
+// ```
 //
 // Arguments:
-//	x: A 4D Tensor for input data.
-//	scale: A 1D Tensor for scaling factor, to scale the normalized x.
-//	offset: A 1D Tensor for offset, to shift to the normalized x.
-//	mean: A 1D Tensor for population mean. Used for inference only;
-// must be empty for training.
-//	variance: A 1D Tensor for population variance. Used for inference only;
-// must be empty for training.
+//	tensor: Up to 8-D.
+//	axis: 1-D. The indices of the dimensions to reverse. Must be in the range
+// `[-rank(tensor), rank(tensor))`.
 //
-// Returns A 4D Tensor for output data.A 1D Tensor for the computed batch mean, to be used by TensorFlow
-// to compute the running mean.A 1D Tensor for the computed batch variance, to be used by
-// TensorFlow to compute the running variance.A 1D Tensor for the computed batch mean, to be reused
-// in the gradient computation.A 1D Tensor for the computed batch variance (inverted variance
-// in the cuDNN case), to be reused in the gradient computation.
-func FusedBatchNorm(scope *Scope, x tf.Output, scale tf.Output, offset tf.Output, mean tf.Output, variance tf.Output, optional ...FusedBatchNormAttr) (y tf.Output, batch_mean tf.Output, batch_variance tf.Output, reserve_space_1 tf.Output, reserve_space_2 tf.Output) {
+// Returns The same shape as `tensor`.
+func ReverseV2(scope *Scope, tensor tf.Output, axis tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "FusedBatchNorm",
+		Type: "ReverseV2",
 		Input: []tf.Input{
-			x, scale, offset, mean, variance,
+			tensor, axis,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
+	return op.Output(0)
 }
 
-// RandomStandardNormalAttr is an optional argument to RandomStandardNormal.
-type RandomStandardNormalAttr func(optionalAttr)
-
-// RandomStandardNormalSeed sets the optional seed attribute to value.
-//
-// value: If either `seed` or `seed2` are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func RandomStandardNormalSeed(value int64) RandomStandardNormalAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
+// RealAttr is an optional argument to Real.
+type RealAttr func(optionalAttr)
 
-// RandomStandardNormalSeed2 sets the optional seed2 attribute to value.
-//
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func RandomStandardNormalSeed2(value int64) RandomStandardNormalAttr {
+// RealTout sets the optional Tout attribute to value.
+// If not specified, defaults to DT_FLOAT
+func RealTout(value tf.DataType) RealAttr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["Tout"] = value
 	}
 }
 
-// Outputs random values from a normal distribution.
+// Returns the real part of a complex number.
 //
-// The generated values will have mean 0 and standard deviation 1.
+// Given a tensor `input` of complex numbers, this operation returns a tensor of
+// type `float` that is the real part of each element in `input`. All elements in
+// `input` must be complex numbers of the form \\(a + bj\\), where *a* is the real
+//  part returned by this operation and *b* is the imaginary part.
 //
-// Arguments:
-//	shape: The shape of the output tensor.
-//	dtype: The type of the output.
+// For example:
 //
-// Returns A tensor of the specified shape filled with random normal values.
-func RandomStandardNormal(scope *Scope, shape tf.Output, dtype tf.DataType, optional ...RandomStandardNormalAttr) (output tf.Output) {
+// ```
+// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
+// tf.real(input) ==> [-2.25, 3.25]
+// ```
+func Real(scope *Scope, input tf.Output, optional ...RealAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "RandomStandardNormal",
+		Type: "Real",
 		Input: []tf.Input{
-			shape,
+			input,
 		},
 		Attrs: attrs,
 	}
@@ -12435,73 +12289,98 @@ func RandomStandardNormal(scope *Scope, shape tf.Output, dtype tf.DataType, opti
 	return op.Output(0)
 }
 
-// Component-wise divides a SparseTensor by a dense Tensor.
+// AudioSummaryAttr is an optional argument to AudioSummary.
+type AudioSummaryAttr func(optionalAttr)
+
+// AudioSummaryMaxOutputs sets the optional max_outputs attribute to value.
 //
-// *Limitation*: this Op only broadcasts the dense side to the sparse side, but not
-// the other direction.
+// value: Max number of batch elements to generate audio for.
+// If not specified, defaults to 3
+//
+// REQUIRES: value >= 1
+func AudioSummaryMaxOutputs(value int64) AudioSummaryAttr {
+	return func(m optionalAttr) {
+		m["max_outputs"] = value
+	}
+}
+
+// Outputs a `Summary` protocol buffer with audio.
+//
+// DEPRECATED at GraphDef version 15: Use AudioSummaryV2.
+//
+// The summary has up to `max_outputs` summary values containing audio. The
+// audio is built from `tensor` which must be 3-D with shape `[batch_size,
+// frames, channels]` or 2-D with shape `[batch_size, frames]`. The values are
+// assumed to be in the range of `[-1.0, 1.0]` with a sample rate of `sample_rate`.
+//
+// The `tag` argument is a scalar `Tensor` of type `string`.  It is used to
+// build the `tag` of the summary values:
+//
+// *  If `max_outputs` is 1, the summary value tag is '*tag*/audio'.
+// *  If `max_outputs` is greater than 1, the summary value tags are
+//    generated sequentially as '*tag*/audio/0', '*tag*/audio/1', etc.
 //
 // Arguments:
-//	sp_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, possibly not in canonical ordering.
-//	sp_values: 1-D.  `N` non-empty values corresponding to `sp_indices`.
-//	sp_shape: 1-D.  Shape of the input SparseTensor.
-//	dense: `R`-D.  The dense Tensor operand.
+//	tag: Scalar. Used to build the `tag` attribute of the summary values.
+//	tensor: 2-D of shape `[batch_size, frames]`.
+//	sample_rate: The sample rate of the signal in hertz.
 //
-// Returns 1-D.  The `N` values that are operated on.
-func SparseDenseCwiseDiv(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output, dense tf.Output) (output tf.Output) {
+// Returns Scalar. Serialized `Summary` protocol buffer.
+func AudioSummary(scope *Scope, tag tf.Output, tensor tf.Output, sample_rate float32, optional ...AudioSummaryAttr) (summary tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"sample_rate": sample_rate}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SparseDenseCwiseDiv",
+		Type: "AudioSummary",
 		Input: []tf.Input{
-			sp_indices, sp_values, sp_shape, dense,
+			tag, tensor,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// FractionalAvgPoolGradAttr is an optional argument to FractionalAvgPoolGrad.
-type FractionalAvgPoolGradAttr func(optionalAttr)
+// QrAttr is an optional argument to Qr.
+type QrAttr func(optionalAttr)
 
-// FractionalAvgPoolGradOverlapping sets the optional overlapping attribute to value.
-//
-// value: When set to True, it means when pooling, the values at the boundary
-// of adjacent pooling cells are used by both cells. For example:
-//
-// `index  0  1  2  3  4`
-//
-// `value  20 5  16 3  7`
+// QrFullMatrices sets the optional full_matrices attribute to value.
 //
-// If the pooling sequence is [0, 2, 4], then 16, at index 2 will be used twice.
-// The result would be [41/3, 26/3] for fractional avg pooling.
+// value: If true, compute full-sized `q` and `r`. If false
+// (the default), compute only the leading `P` columns of `q`.
 // If not specified, defaults to false
-func FractionalAvgPoolGradOverlapping(value bool) FractionalAvgPoolGradAttr {
+func QrFullMatrices(value bool) QrAttr {
 	return func(m optionalAttr) {
-		m["overlapping"] = value
+		m["full_matrices"] = value
 	}
 }
 
-// Computes gradient of the FractionalAvgPool function.
+// Computes the QR decompositions of one or more matrices.
 //
-// Unlike FractionalMaxPoolGrad, we don't need to find arg_max for
-// FractionalAvgPoolGrad, we just need to evenly back-propagate each element of
-// out_backprop to those indices that form the same pooling cell. Therefore, we
-// just need to know the shape of original input tensor, instead of the whole
-// tensor.
+// Computes the QR decomposition of each inner matrix in `tensor` such that
+// `tensor[..., :, :] = q[..., :, :] * r[..., :,:])`
+//
+// ```python
+// # a is a tensor.
+// # q is a tensor of orthonormal matrices.
+// # r is a tensor of upper triangular matrices.
+// q, r = qr(a)
+// q_full, r_full = qr(a, full_matrices=True)
+// ```
 //
 // Arguments:
-//	orig_input_tensor_shape: Original input tensor shape for `fractional_avg_pool`
-//	out_backprop: 4-D with shape `[batch, height, width, channels]`.  Gradients
-// w.r.t. the output of `fractional_avg_pool`.
-//	row_pooling_sequence: row pooling sequence, form pooling region with
-// col_pooling_sequence.
-//	col_pooling_sequence: column pooling sequence, form pooling region with
-// row_pooling sequence.
+//	input: A tensor of shape `[..., M, N]` whose inner-most 2 dimensions
+// form matrices of size `[M, N]`. Let `P` be the minimum of `M` and `N`.
 //
-// Returns 4-D.  Gradients w.r.t. the input of `fractional_avg_pool`.
-func FractionalAvgPoolGrad(scope *Scope, orig_input_tensor_shape tf.Output, out_backprop tf.Output, row_pooling_sequence tf.Output, col_pooling_sequence tf.Output, optional ...FractionalAvgPoolGradAttr) (output tf.Output) {
+// Returns Orthonormal basis for range of `a`. If `full_matrices` is `False` then
+// shape is `[..., M, P]`; if `full_matrices` is `True` then shape is
+// `[..., M, M]`.Triangular factor. If `full_matrices` is `False` then shape is
+// `[..., P, N]`. If `full_matrices` is `True` then shape is `[..., M, N]`.
+func Qr(scope *Scope, input tf.Output, optional ...QrAttr) (q tf.Output, r tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -12510,84 +12389,63 @@ func FractionalAvgPoolGrad(scope *Scope, orig_input_tensor_shape tf.Output, out_
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "FractionalAvgPoolGrad",
+		Type: "Qr",
 		Input: []tf.Input{
-			orig_input_tensor_shape, out_backprop, row_pooling_sequence, col_pooling_sequence,
+			input,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// Concatenates tensors along one dimension.
-//
-// Arguments:
-//	concat_dim: 0-D.  The dimension along which to concatenate.  Must be in the
-// range [0, rank(values)).
-//	values: The `N` Tensors to concatenate. Their ranks and types must match,
-// and their sizes must match in all dimensions except `concat_dim`.
-//
-// Returns A `Tensor` with the concatenation of values stacked along the
-// `concat_dim` dimension.  This tensor's shape matches that of `values` except
-// in `concat_dim` where it has the sum of the sizes.
-func Concat(scope *Scope, concat_dim tf.Output, values []tf.Output) (output tf.Output) {
+// Records the bytes size of each element of `input_dataset` in a StatsAggregator.
+func BytesProducedStatsDataset(scope *Scope, input_dataset tf.Output, tag tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "Concat",
+		Type: "BytesProducedStatsDataset",
 		Input: []tf.Input{
-			concat_dim, tf.OutputList(values),
+			input_dataset, tag,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResourceApplyMomentumAttr is an optional argument to ResourceApplyMomentum.
-type ResourceApplyMomentumAttr func(optionalAttr)
+// ResourceSparseApplyProximalGradientDescentAttr is an optional argument to ResourceSparseApplyProximalGradientDescent.
+type ResourceSparseApplyProximalGradientDescentAttr func(optionalAttr)
 
-// ResourceApplyMomentumUseLocking sets the optional use_locking attribute to value.
+// ResourceSparseApplyProximalGradientDescentUseLocking sets the optional use_locking attribute to value.
 //
-// value: If `True`, updating of the var and accum tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
+// value: If True, the subtraction will be protected by a lock;
+// otherwise the behavior is undefined, but may exhibit less contention.
 // If not specified, defaults to false
-func ResourceApplyMomentumUseLocking(value bool) ResourceApplyMomentumAttr {
+func ResourceSparseApplyProximalGradientDescentUseLocking(value bool) ResourceSparseApplyProximalGradientDescentAttr {
 	return func(m optionalAttr) {
 		m["use_locking"] = value
 	}
 }
 
-// ResourceApplyMomentumUseNesterov sets the optional use_nesterov attribute to value.
-//
-// value: If `True`, the tensor passed to compute grad will be
-// var - lr * momentum * accum, so in the end, the var you get is actually
-// var - lr * momentum * accum.
-// If not specified, defaults to false
-func ResourceApplyMomentumUseNesterov(value bool) ResourceApplyMomentumAttr {
-	return func(m optionalAttr) {
-		m["use_nesterov"] = value
-	}
-}
-
-// Update '*var' according to the momentum scheme. Set use_nesterov = True if you
-//
-// want to use Nesterov momentum.
+// Sparse update '*var' as FOBOS algorithm with fixed learning rate.
 //
-// accum = accum * momentum + grad
-// var -= lr * accum
+// That is for rows we have grad for, we update var as follows:
+// prox_v = var - alpha * grad
+// var = sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0}
 //
 // Arguments:
 //	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
+//	alpha: Scaling factor. Must be a scalar.
+//	l1: L1 regularization. Must be a scalar.
+//	l2: L2 regularization. Must be a scalar.
 //	grad: The gradient.
-//	momentum: Momentum. Must be a scalar.
+//	indices: A vector of indices into the first dimension of var and accum.
 //
 // Returns the created operation.
-func ResourceApplyMomentum(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, momentum tf.Output, optional ...ResourceApplyMomentumAttr) (o *tf.Operation) {
+func ResourceSparseApplyProximalGradientDescent(scope *Scope, var_ tf.Output, alpha tf.Output, l1 tf.Output, l2 tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyProximalGradientDescentAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -12596,56 +12454,53 @@ func ResourceApplyMomentum(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyMomentum",
+		Type: "ResourceSparseApplyProximalGradientDescent",
 		Input: []tf.Input{
-			var_, accum, lr, grad, momentum,
+			var_, alpha, l1, l2, grad, indices,
 		},
 		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// MaxPoolGradGradAttr is an optional argument to MaxPoolGradGrad.
-type MaxPoolGradGradAttr func(optionalAttr)
+// MeanAttr is an optional argument to Mean.
+type MeanAttr func(optionalAttr)
 
-// MaxPoolGradGradDataFormat sets the optional data_format attribute to value.
+// MeanKeepDims sets the optional keep_dims attribute to value.
 //
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// If not specified, defaults to "NHWC"
-func MaxPoolGradGradDataFormat(value string) MaxPoolGradGradAttr {
+// value: If true, retain reduced dimensions with length 1.
+// If not specified, defaults to false
+func MeanKeepDims(value bool) MeanAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["keep_dims"] = value
 	}
 }
 
-// Computes second-order gradients of the maxpooling function.
+// Computes the mean of elements across dimensions of a tensor.
+//
+// Reduces `input` along the dimensions given in `axis`. Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `axis`. If `keep_dims` is true, the reduced dimensions are
+// retained with length 1.
 //
 // Arguments:
-//	orig_input: The original input tensor.
-//	orig_output: The original output tensor.
-//	grad: 4-D.  Gradients of gradients w.r.t. the input of `max_pool`.
-//	ksize: The size of the window for each dimension of the input tensor.
-//	strides: The stride of the sliding window for each dimension of the
-// input tensor.
-//	padding: The type of padding algorithm to use.
+//	input: The tensor to reduce.
+//	axis: The dimensions to reduce. Must be in the range
+// `[-rank(input), rank(input))`.
 //
-// Returns Gradients of gradients w.r.t. the input to `max_pool`.
-func MaxPoolGradGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPoolGradGradAttr) (output tf.Output) {
+// Returns The reduced tensor.
+func Mean(scope *Scope, input tf.Output, axis tf.Output, optional ...MeanAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MaxPoolGradGrad",
+		Type: "Mean",
 		Input: []tf.Input{
-			orig_input, orig_output, grad,
+			input, axis,
 		},
 		Attrs: attrs,
 	}
@@ -12653,286 +12508,208 @@ func MaxPoolGradGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output,
 	return op.Output(0)
 }
 
-// Returns element-wise integer closest to x.
-//
-// If the result is midway between two representable values,
-// the even representable is chosen.
-// For example:
-//
-// ```
-// rint(-1.5) ==> -2.0
-// rint(0.5000001) ==> 1.0
-// rint([-1.7, -1.5, -0.2, 0.2, 1.5, 1.7, 2.0]) ==> [-2., -2., -0., 0., 2., 2., 2.]
-// ```
-func Rint(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Rint",
-		Input: []tf.Input{
-			x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// OrderedMapUnstageNoKeyAttr is an optional argument to OrderedMapUnstageNoKey.
-type OrderedMapUnstageNoKeyAttr func(optionalAttr)
+// InitializeTableFromTextFileV2Attr is an optional argument to InitializeTableFromTextFileV2.
+type InitializeTableFromTextFileV2Attr func(optionalAttr)
 
-// OrderedMapUnstageNoKeyCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
+// InitializeTableFromTextFileV2VocabSize sets the optional vocab_size attribute to value.
 //
-// REQUIRES: value >= 0
-func OrderedMapUnstageNoKeyCapacity(value int64) OrderedMapUnstageNoKeyAttr {
-	return func(m optionalAttr) {
-		m["capacity"] = value
-	}
-}
-
-// OrderedMapUnstageNoKeyMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
+// value: Number of elements of the file, use -1 if unknown.
+// If not specified, defaults to -1
 //
-// REQUIRES: value >= 0
-func OrderedMapUnstageNoKeyMemoryLimit(value int64) OrderedMapUnstageNoKeyAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
-	}
-}
-
-// OrderedMapUnstageNoKeyContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func OrderedMapUnstageNoKeyContainer(value string) OrderedMapUnstageNoKeyAttr {
+// REQUIRES: value >= -1
+func InitializeTableFromTextFileV2VocabSize(value int64) InitializeTableFromTextFileV2Attr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["vocab_size"] = value
 	}
 }
 
-// OrderedMapUnstageNoKeySharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func OrderedMapUnstageNoKeySharedName(value string) OrderedMapUnstageNoKeyAttr {
+// InitializeTableFromTextFileV2Delimiter sets the optional delimiter attribute to value.
+//
+// value: Delimiter to separate fields in a line.
+// If not specified, defaults to "\t"
+func InitializeTableFromTextFileV2Delimiter(value string) InitializeTableFromTextFileV2Attr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["delimiter"] = value
 	}
 }
 
-// Op removes and returns the (key, value) element with the smallest
+// Initializes a table from a text file.
 //
-// key from the underlying container.   If the underlying container
-// does not contain elements, the op will block until it does.
-func OrderedMapUnstageNoKey(scope *Scope, indices tf.Output, dtypes []tf.DataType, optional ...OrderedMapUnstageNoKeyAttr) (key tf.Output, values []tf.Output) {
+// It inserts one key-value pair into the table for each line of the file.
+// The key and value is extracted from the whole line content, elements from the
+// split line based on `delimiter` or the line number (starting from zero).
+// Where to extract the key and value from a line is specified by `key_index` and
+// `value_index`.
+//
+// - A value of -1 means use the line number(starting from zero), expects `int64`.
+// - A value of -2 means use the whole line content, expects `string`.
+// - A value >= 0 means use the index (starting at zero) of the split line based
+//   on `delimiter`.
+//
+// Arguments:
+//	table_handle: Handle to a table which will be initialized.
+//	filename: Filename of a vocabulary text file.
+//	key_index: Column index in a line to get the table `key` values from.
+//	value_index: Column index that represents information of a line to get the table
+// `value` values from.
+//
+// Returns the created operation.
+func InitializeTableFromTextFileV2(scope *Scope, table_handle tf.Output, filename tf.Output, key_index int64, value_index int64, optional ...InitializeTableFromTextFileV2Attr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
+	attrs := map[string]interface{}{"key_index": key_index, "value_index": value_index}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "OrderedMapUnstageNoKey",
+		Type: "InitializeTableFromTextFileV2",
 		Input: []tf.Input{
-			indices,
+			table_handle, filename,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	key = op.Output(idx)
-	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
-		scope.UpdateErr("OrderedMapUnstageNoKey", err)
-		return
-	}
-	return key, values
+	return scope.AddOperation(opspec)
 }
 
-// MaxPool3DGradGradAttr is an optional argument to MaxPool3DGradGrad.
-type MaxPool3DGradGradAttr func(optionalAttr)
+// QuantizedReluAttr is an optional argument to QuantizedRelu.
+type QuantizedReluAttr func(optionalAttr)
 
-// MaxPool3DGradGradDataFormat sets the optional data_format attribute to value.
-//
-// value: The data format of the input and output data. With the
-// default format "NDHWC", the data is stored in the order of:
-//     [batch, in_depth, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCDHW", the data storage order is:
-//     [batch, in_channels, in_depth, in_height, in_width].
-// If not specified, defaults to "NDHWC"
-func MaxPool3DGradGradDataFormat(value string) MaxPool3DGradGradAttr {
+// QuantizedReluOutType sets the optional out_type attribute to value.
+// If not specified, defaults to DT_QUINT8
+func QuantizedReluOutType(value tf.DataType) QuantizedReluAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["out_type"] = value
 	}
 }
 
-// Computes second-order gradients of the maxpooling function.
+// Computes Quantized Rectified Linear: `max(features, 0)`
 //
 // Arguments:
-//	orig_input: The original input tensor.
-//	orig_output: The original output tensor.
-//	grad: Output backprop of shape `[batch, depth, rows, cols, channels]`.
-//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
-// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
-//	strides: 1-D tensor of length 5. The stride of the sliding window for each
-// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
-//	padding: The type of padding algorithm to use.
 //
-// Returns Gradients of gradients w.r.t. the input to `max_pool`.
-func MaxPool3DGradGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPool3DGradGradAttr) (output tf.Output) {
+//	min_features: The float value that the lowest quantized value represents.
+//	max_features: The float value that the highest quantized value represents.
+//
+// Returns Has the same output shape as "features".The float value that the lowest quantized value represents.The float value that the highest quantized value represents.
+func QuantizedRelu(scope *Scope, features tf.Output, min_features tf.Output, max_features tf.Output, optional ...QuantizedReluAttr) (activations tf.Output, min_activations tf.Output, max_activations tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MaxPool3DGradGrad",
+		Type: "QuantizedRelu",
 		Input: []tf.Input{
-			orig_input, orig_output, grad,
+			features, min_features, max_features,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Conv3DBackpropFilterV2Attr is an optional argument to Conv3DBackpropFilterV2.
-type Conv3DBackpropFilterV2Attr func(optionalAttr)
-
-// Conv3DBackpropFilterV2DataFormat sets the optional data_format attribute to value.
+// Reshapes a SparseTensor to represent values in a new dense shape.
 //
-// value: The data format of the input and output data. With the
-// default format "NDHWC", the data is stored in the order of:
-//     [batch, in_depth, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCDHW", the data storage order is:
-//     [batch, in_channels, in_depth, in_height, in_width].
-// If not specified, defaults to "NDHWC"
-func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// Conv3DBackpropFilterV2Dilations sets the optional dilations attribute to value.
+// This operation has the same semantics as reshape on the represented dense
+// tensor.  The `input_indices` are recomputed based on the requested `new_shape`.
 //
-// value: 1-D tensor of length 5.  The dilation factor for each dimension of
-// `input`. If set to k > 1, there will be k-1 skipped cells between each
-// filter element on that dimension. The dimension order is determined by the
-// value of `data_format`, see above for details. Dilations in the batch and
-// depth dimensions must be 1.
-// If not specified, defaults to <i:1 i:1 i:1 i:1 i:1 >
-func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr {
-	return func(m optionalAttr) {
-		m["dilations"] = value
-	}
-}
-
-// Computes the gradients of 3-D convolution with respect to the filter.
+// If one component of `new_shape` is the special value -1, the size of that
+// dimension is computed so that the total dense size remains constant.  At
+// most one component of `new_shape` can be -1.  The number of dense elements
+// implied by `new_shape` must be the same as the number of dense elements
+// originally implied by `input_shape`.
+//
+// Reshaping does not affect the order of values in the SparseTensor.
+//
+// If the input tensor has rank `R_in` and `N` non-empty values, and `new_shape`
+// has length `R_out`, then `input_indices` has shape `[N, R_in]`,
+// `input_shape` has length `R_in`, `output_indices` has shape `[N, R_out]`, and
+// `output_shape` has length `R_out`.
 //
 // Arguments:
-//	input: Shape `[batch, depth, rows, cols, in_channels]`.
-//	filter_sizes: An integer vector representing the tensor shape of `filter`,
-// where `filter` is a 5-D
-// `[filter_depth, filter_height, filter_width, in_channels, out_channels]`
-// tensor.
-//	out_backprop: Backprop signal of shape `[batch, out_depth, out_rows, out_cols,
-// out_channels]`.
-//	strides: 1-D tensor of length 5. The stride of the sliding window for each
-// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
-//	padding: The type of padding algorithm to use.
-func Conv3DBackpropFilterV2(scope *Scope, input tf.Output, filter_sizes tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv3DBackpropFilterV2Attr) (output tf.Output) {
+//	input_indices: 2-D.  `N x R_in` matrix with the indices of non-empty values in a
+// SparseTensor.
+//	input_shape: 1-D.  `R_in` vector with the input SparseTensor's dense shape.
+//	new_shape: 1-D.  `R_out` vector with the requested new dense shape.
+//
+// Returns 2-D.  `N x R_out` matrix with the updated indices of non-empty
+// values in the output SparseTensor.1-D.  `R_out` vector with the full dense shape of the output
+// SparseTensor.  This is the same as `new_shape` but with any -1 dimensions
+// filled in.
+func SparseReshape(scope *Scope, input_indices tf.Output, input_shape tf.Output, new_shape tf.Output) (output_indices tf.Output, output_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "Conv3DBackpropFilterV2",
+		Type: "SparseReshape",
 		Input: []tf.Input{
-			input, filter_sizes, out_backprop,
+			input_indices, input_shape, new_shape,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// Execute a sub graph on a remote processor.
-//
-// The graph specifications(such as graph itself, input tensors and output names)
-// are stored as a serialized protocol buffer of RemoteFusedGraphExecuteInfo
-// as serialized_remote_fused_graph_execute_info.
-// The specifications will be passed to a dedicated registered
-// remote fused graph executor.  The executor will send the graph specifications
-// to a remote processor and execute that graph.  The execution results
-// will be passed to consumer nodes as outputs of this node.
-//
-// Arguments:
-//	inputs: Arbitrary number of tensors with arbitrary data types
-//
-//	serialized_remote_fused_graph_execute_info: Serialized protocol buffer
-// of RemoteFusedGraphExecuteInfo which contains graph specifications.
+// Deprecated. Use TensorArraySplitV3
 //
-// Returns Arbitrary number of tensors with arbitrary data types
-func RemoteFusedGraphExecute(scope *Scope, inputs []tf.Output, Toutputs []tf.DataType, serialized_remote_fused_graph_execute_info string) (outputs []tf.Output) {
+// DEPRECATED at GraphDef version 26: Use TensorArraySplitV3
+func TensorArraySplitV2(scope *Scope, handle tf.Output, value tf.Output, lengths tf.Output, flow_in tf.Output) (flow_out tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"Toutputs": Toutputs, "serialized_remote_fused_graph_execute_info": serialized_remote_fused_graph_execute_info}
 	opspec := tf.OpSpec{
-		Type: "RemoteFusedGraphExecute",
+		Type: "TensorArraySplitV2",
 		Input: []tf.Input{
-			tf.OutputList(inputs),
+			handle, value, lengths, flow_in,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if outputs, idx, err = makeOutputList(op, idx, "outputs"); err != nil {
-		scope.UpdateErr("RemoteFusedGraphExecute", err)
-		return
-	}
-	return outputs
+	return op.Output(0)
 }
 
-// SerializeManySparseAttr is an optional argument to SerializeManySparse.
-type SerializeManySparseAttr func(optionalAttr)
+// PackAttr is an optional argument to Pack.
+type PackAttr func(optionalAttr)
 
-// SerializeManySparseOutType sets the optional out_type attribute to value.
+// PackAxis sets the optional axis attribute to value.
 //
-// value: The `dtype` to use for serialization; the supported types are `string`
-// (default) and `variant`.
-// If not specified, defaults to DT_STRING
-func SerializeManySparseOutType(value tf.DataType) SerializeManySparseAttr {
+// value: Dimension along which to pack.  Negative values wrap around, so the
+// valid range is `[-(R+1), R+1)`.
+// If not specified, defaults to 0
+func PackAxis(value int64) PackAttr {
 	return func(m optionalAttr) {
-		m["out_type"] = value
+		m["axis"] = value
 	}
 }
 
-// Serialize an `N`-minibatch `SparseTensor` into an `[N, 3]` `Tensor` object.
+// Packs a list of `N` rank-`R` tensors into one rank-`(R+1)` tensor.
 //
-// The `SparseTensor` must have rank `R` greater than 1, and the first dimension
-// is treated as the minibatch dimension.  Elements of the `SparseTensor`
-// must be sorted in increasing order of this first dimension.  The serialized
-// `SparseTensor` objects going into each row of `serialized_sparse` will have
-// rank `R-1`.
+// Packs the `N` tensors in `values` into a tensor with rank one higher than each
+// tensor in `values`, by packing them along the `axis` dimension.
+// Given a list of tensors of shape `(A, B, C)`;
 //
-// The minibatch size `N` is extracted from `sparse_shape[0]`.
+// if `axis == 0` then the `output` tensor will have the shape `(N, A, B, C)`.
+// if `axis == 1` then the `output` tensor will have the shape `(A, N, B, C)`.
+// Etc.
+//
+// For example:
+//
+// ```
+// # 'x' is [1, 4]
+// # 'y' is [2, 5]
+// # 'z' is [3, 6]
+// pack([x, y, z]) => [[1, 4], [2, 5], [3, 6]]  # Pack along first dim.
+// pack([x, y, z], axis=1) => [[1, 2, 3], [4, 5, 6]]
+// ```
+//
+// This is the opposite of `unpack`.
 //
 // Arguments:
-//	sparse_indices: 2-D.  The `indices` of the minibatch `SparseTensor`.
-//	sparse_values: 1-D.  The `values` of the minibatch `SparseTensor`.
-//	sparse_shape: 1-D.  The `shape` of the minibatch `SparseTensor`.
-func SerializeManySparse(scope *Scope, sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output, optional ...SerializeManySparseAttr) (serialized_sparse tf.Output) {
+//	values: Must be of same shape and type.
+//
+// Returns The packed tensor.
+func Pack(scope *Scope, values []tf.Output, optional ...PackAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -12941,9 +12718,9 @@ func SerializeManySparse(scope *Scope, sparse_indices tf.Output, sparse_values t
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SerializeManySparse",
+		Type: "Pack",
 		Input: []tf.Input{
-			sparse_indices, sparse_values, sparse_shape,
+			tf.OutputList(values),
 		},
 		Attrs: attrs,
 	}
@@ -12951,127 +12728,138 @@ func SerializeManySparse(scope *Scope, sparse_indices tf.Output, sparse_values t
 	return op.Output(0)
 }
 
-// Computes inverse hyperbolic cosine of x element-wise.
-func Acosh(scope *Scope, x tf.Output) (y tf.Output) {
+// Reorders a SparseTensor into the canonical, row-major ordering.
+//
+// Note that by convention, all sparse ops preserve the canonical ordering along
+// increasing dimension number. The only time ordering can be violated is during
+// manual manipulation of the indices and values vectors to add entries.
+//
+// Reordering does not affect the shape of the SparseTensor.
+//
+// If the tensor has rank `R` and `N` non-empty values, `input_indices` has
+// shape `[N, R]`, input_values has length `N`, and input_shape has length `R`.
+//
+// Arguments:
+//	input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, possibly not in canonical ordering.
+//	input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
+//	input_shape: 1-D.  Shape of the input SparseTensor.
+//
+// Returns 2-D.  `N x R` matrix with the same indices as input_indices, but
+// in canonical row-major ordering.1-D.  `N` non-empty values corresponding to `output_indices`.
+func SparseReorder(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output) (output_indices tf.Output, output_values tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Acosh",
+		Type: "SparseReorder",
 		Input: []tf.Input{
-			x,
+			input_indices, input_values, input_shape,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// TensorArrayV2Attr is an optional argument to TensorArrayV2.
-type TensorArrayV2Attr func(optionalAttr)
-
-// TensorArrayV2ElementShape sets the optional element_shape attribute to value.
-// If not specified, defaults to <unknown_rank:true >
-func TensorArrayV2ElementShape(value tf.Shape) TensorArrayV2Attr {
-	return func(m optionalAttr) {
-		m["element_shape"] = value
-	}
-}
-
-// TensorArrayV2DynamicSize sets the optional dynamic_size attribute to value.
-// If not specified, defaults to false
-func TensorArrayV2DynamicSize(value bool) TensorArrayV2Attr {
-	return func(m optionalAttr) {
-		m["dynamic_size"] = value
-	}
-}
-
-// TensorArrayV2ClearAfterRead sets the optional clear_after_read attribute to value.
-// If not specified, defaults to true
-func TensorArrayV2ClearAfterRead(value bool) TensorArrayV2Attr {
-	return func(m optionalAttr) {
-		m["clear_after_read"] = value
-	}
-}
-
-// TensorArrayV2TensorArrayName sets the optional tensor_array_name attribute to value.
-// If not specified, defaults to ""
-func TensorArrayV2TensorArrayName(value string) TensorArrayV2Attr {
-	return func(m optionalAttr) {
-		m["tensor_array_name"] = value
-	}
+	return op.Output(0), op.Output(1)
 }
 
-// Deprecated. Use TensorArrayV3
-//
-// DEPRECATED at GraphDef version 26: Use TensorArrayV3
-func TensorArrayV2(scope *Scope, size tf.Output, dtype tf.DataType, optional ...TensorArrayV2Attr) (handle tf.Output) {
+// Computes rectified linear: `max(features, 0)`.
+func Relu(scope *Scope, features tf.Output) (activations tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayV2",
+		Type: "Relu",
 		Input: []tf.Input{
-			size,
+			features,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// DecodeCSVAttr is an optional argument to DecodeCSV.
-type DecodeCSVAttr func(optionalAttr)
+// ResourceApplyAddSignAttr is an optional argument to ResourceApplyAddSign.
+type ResourceApplyAddSignAttr func(optionalAttr)
 
-// DecodeCSVFieldDelim sets the optional field_delim attribute to value.
+// ResourceApplyAddSignUseLocking sets the optional use_locking attribute to value.
 //
-// value: char delimiter to separate fields in a record.
-// If not specified, defaults to ","
-func DecodeCSVFieldDelim(value string) DecodeCSVAttr {
+// value: If `True`, updating of the var and m tensors is
+// protected by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceApplyAddSignUseLocking(value bool) ResourceApplyAddSignAttr {
 	return func(m optionalAttr) {
-		m["field_delim"] = value
+		m["use_locking"] = value
 	}
 }
 
-// DecodeCSVUseQuoteDelim sets the optional use_quote_delim attribute to value.
+// Update '*var' according to the AddSign update.
 //
-// value: If false, treats double quotation marks as regular
-// characters inside of the string fields (ignoring RFC 4180, Section 2,
-// Bullet 5).
-// If not specified, defaults to true
-func DecodeCSVUseQuoteDelim(value bool) DecodeCSVAttr {
-	return func(m optionalAttr) {
-		m["use_quote_delim"] = value
+// m_t <- beta1 * m_{t-1} + (1 - beta1) * g
+// update <- (alpha + sign_decay * sign(g) *sign(m)) * g
+// variable <- variable - lr_t * update
+//
+// Arguments:
+//	var_: Should be from a Variable().
+//	m: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	alpha: Must be a scalar.
+//	sign_decay: Must be a scalar.
+//	beta: Must be a scalar.
+//	grad: The gradient.
+//
+// Returns the created operation.
+func ResourceApplyAddSign(scope *Scope, var_ tf.Output, m tf.Output, lr tf.Output, alpha tf.Output, sign_decay tf.Output, beta tf.Output, grad tf.Output, optional ...ResourceApplyAddSignAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceApplyAddSign",
+		Input: []tf.Input{
+			var_, m, lr, alpha, sign_decay, beta, grad,
+		},
+		Attrs: attrs,
 	}
+	return scope.AddOperation(opspec)
 }
 
-// DecodeCSVNaValue sets the optional na_value attribute to value.
+// FractionalMaxPoolGradAttr is an optional argument to FractionalMaxPoolGrad.
+type FractionalMaxPoolGradAttr func(optionalAttr)
+
+// FractionalMaxPoolGradOverlapping sets the optional overlapping attribute to value.
 //
-// value: Additional string to recognize as NA/NaN.
-// If not specified, defaults to ""
-func DecodeCSVNaValue(value string) DecodeCSVAttr {
+// value: When set to True, it means when pooling, the values at the boundary
+// of adjacent pooling cells are used by both cells. For example:
+//
+// `index  0  1  2  3  4`
+//
+// `value  20 5  16 3  7`
+//
+// If the pooling sequence is [0, 2, 4], then 16, at index 2 will be used twice.
+// The result would be [20, 16] for fractional max pooling.
+// If not specified, defaults to false
+func FractionalMaxPoolGradOverlapping(value bool) FractionalMaxPoolGradAttr {
 	return func(m optionalAttr) {
-		m["na_value"] = value
+		m["overlapping"] = value
 	}
 }
 
-// Convert CSV records to tensors. Each column maps to one tensor.
-//
-// RFC 4180 format is expected for the CSV records.
-// (https://tools.ietf.org/html/rfc4180)
-// Note that we allow leading and trailing spaces with int or float field.
+// Computes gradient of the FractionalMaxPool function.
 //
 // Arguments:
-//	records: Each string is a record/row in the csv and all records should have
-// the same format.
-//	record_defaults: One tensor per column of the input record, with either a
-// scalar default value for that column or empty if the column is required.
+//	orig_input: Original input for `fractional_max_pool`
+//	orig_output: Original output for `fractional_max_pool`
+//	out_backprop: 4-D with shape `[batch, height, width, channels]`.  Gradients
+// w.r.t. the output of `fractional_max_pool`.
+//	row_pooling_sequence: row pooling sequence, form pooling region with
+// col_pooling_sequence.
+//	col_pooling_sequence: column pooling sequence, form pooling region with
+// row_pooling sequence.
 //
-// Returns Each tensor will have the same shape as records.
-func DecodeCSV(scope *Scope, records tf.Output, record_defaults []tf.Output, optional ...DecodeCSVAttr) (output []tf.Output) {
+// Returns 4-D.  Gradients w.r.t. the input of `fractional_max_pool`.
+func FractionalMaxPoolGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, out_backprop tf.Output, row_pooling_sequence tf.Output, col_pooling_sequence tf.Output, optional ...FractionalMaxPoolGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -13080,149 +12868,107 @@ func DecodeCSV(scope *Scope, records tf.Output, record_defaults []tf.Output, opt
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "DecodeCSV",
+		Type: "FractionalMaxPoolGrad",
 		Input: []tf.Input{
-			records, tf.OutputList(record_defaults),
+			orig_input, orig_output, out_backprop, row_pooling_sequence, col_pooling_sequence,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
-		scope.UpdateErr("DecodeCSV", err)
-		return
-	}
-	return output
+	return op.Output(0)
 }
 
-// MapClearAttr is an optional argument to MapClear.
-type MapClearAttr func(optionalAttr)
+// ResourceApplyAdagradDAAttr is an optional argument to ResourceApplyAdagradDA.
+type ResourceApplyAdagradDAAttr func(optionalAttr)
 
-// MapClearCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
+// ResourceApplyAdagradDAUseLocking sets the optional use_locking attribute to value.
 //
-// REQUIRES: value >= 0
-func MapClearCapacity(value int64) MapClearAttr {
+// value: If True, updating of the var and accum tensors will be protected by
+// a lock; otherwise the behavior is undefined, but may exhibit less contention.
+// If not specified, defaults to false
+func ResourceApplyAdagradDAUseLocking(value bool) ResourceApplyAdagradDAAttr {
 	return func(m optionalAttr) {
-		m["capacity"] = value
+		m["use_locking"] = value
 	}
 }
 
-// MapClearMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
+// Update '*var' according to the proximal adagrad scheme.
 //
-// REQUIRES: value >= 0
-func MapClearMemoryLimit(value int64) MapClearAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
-	}
-}
-
-// MapClearContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func MapClearContainer(value string) MapClearAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// MapClearSharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func MapClearSharedName(value string) MapClearAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// Op removes all elements in the underlying container.
+// Arguments:
+//	var_: Should be from a Variable().
+//	gradient_accumulator: Should be from a Variable().
+//	gradient_squared_accumulator: Should be from a Variable().
+//	grad: The gradient.
+//	lr: Scaling factor. Must be a scalar.
+//	l1: L1 regularization. Must be a scalar.
+//	l2: L2 regularization. Must be a scalar.
+//	global_step: Training step number. Must be a scalar.
 //
 // Returns the created operation.
-func MapClear(scope *Scope, dtypes []tf.DataType, optional ...MapClearAttr) (o *tf.Operation) {
+func ResourceApplyAdagradDA(scope *Scope, var_ tf.Output, gradient_accumulator tf.Output, gradient_squared_accumulator tf.Output, grad tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, global_step tf.Output, optional ...ResourceApplyAdagradDAAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MapClear",
-
+		Type: "ResourceApplyAdagradDA",
+		Input: []tf.Input{
+			var_, gradient_accumulator, gradient_squared_accumulator, grad, lr, l1, l2, global_step,
+		},
 		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// ThreadUnsafeUnigramCandidateSamplerAttr is an optional argument to ThreadUnsafeUnigramCandidateSampler.
-type ThreadUnsafeUnigramCandidateSamplerAttr func(optionalAttr)
-
-// ThreadUnsafeUnigramCandidateSamplerSeed sets the optional seed attribute to value.
-//
-// value: If either seed or seed2 are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func ThreadUnsafeUnigramCandidateSamplerSeed(value int64) ThreadUnsafeUnigramCandidateSamplerAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
+// SparseReduceMaxSparseAttr is an optional argument to SparseReduceMaxSparse.
+type SparseReduceMaxSparseAttr func(optionalAttr)
 
-// ThreadUnsafeUnigramCandidateSamplerSeed2 sets the optional seed2 attribute to value.
+// SparseReduceMaxSparseKeepDims sets the optional keep_dims attribute to value.
 //
-// value: An second seed to avoid seed collision.
-// If not specified, defaults to 0
-func ThreadUnsafeUnigramCandidateSamplerSeed2(value int64) ThreadUnsafeUnigramCandidateSamplerAttr {
+// value: If true, retain reduced dimensions with length 1.
+// If not specified, defaults to false
+func SparseReduceMaxSparseKeepDims(value bool) SparseReduceMaxSparseAttr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["keep_dims"] = value
 	}
 }
 
-// Generates labels for candidate sampling with a learned unigram distribution.
+// Computes the max of elements across dimensions of a SparseTensor.
 //
-// See explanations of candidate sampling and the data formats at
-// go/candidate-sampling.
+// This Op takes a SparseTensor and is the sparse counterpart to
+// `tf.reduce_max()`.  In contrast to SparseReduceMax, this Op returns a
+// SparseTensor.
 //
-// For each batch, this op picks a single set of sampled candidate labels.
+// Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
+// with length 1.
 //
-// The advantages of sampling candidates per-batch are simplicity and the
-// possibility of efficient dense matrix multiplication. The disadvantage is that
-// the sampled candidates must be chosen independently of the context and of the
-// true labels.
+// If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
+// with a single element is returned.  Additionally, the axes can be negative,
+// which are interpreted according to the indexing rules in Python.
 //
 // Arguments:
-//	true_classes: A batch_size * num_true matrix, in which each row contains the
-// IDs of the num_true target_classes in the corresponding original label.
-//	num_true: Number of true labels per context.
-//	num_sampled: Number of candidates to randomly sample.
-//	unique: If unique is true, we sample with rejection, so that all sampled
-// candidates in a batch are unique. This requires some approximation to
-// estimate the post-rejection sampling probabilities.
-//	range_max: The sampler will sample integers from the interval [0, range_max).
-//
-// Returns A vector of length num_sampled, in which each element is
-// the ID of a sampled candidate.A batch_size * num_true matrix, representing
-// the number of times each candidate is expected to occur in a batch
-// of sampled candidates. If unique=true, then this is a probability.A vector of length num_sampled, for each sampled
-// candidate representing the number of times the candidate is expected
-// to occur in a batch of sampled candidates.  If unique=true, then this is a
-// probability.
-func ThreadUnsafeUnigramCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, range_max int64, optional ...ThreadUnsafeUnigramCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) {
+//	input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, possibly not in canonical ordering.
+//	input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
+//	input_shape: 1-D.  Shape of the input SparseTensor.
+//	reduction_axes: 1-D.  Length-`K` vector containing the reduction axes.
+func SparseReduceMaxSparse(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output, reduction_axes tf.Output, optional ...SparseReduceMaxSparseAttr) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique, "range_max": range_max}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ThreadUnsafeUnigramCandidateSampler",
+		Type: "SparseReduceMaxSparse",
 		Input: []tf.Input{
-			true_classes,
+			input_indices, input_values, input_shape, reduction_axes,
 		},
 		Attrs: attrs,
 	}
@@ -13230,45 +12976,23 @@ func ThreadUnsafeUnigramCandidateSampler(scope *Scope, true_classes tf.Output, n
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// MaxPoolV2Attr is an optional argument to MaxPoolV2.
-type MaxPoolV2Attr func(optionalAttr)
-
-// MaxPoolV2DataFormat sets the optional data_format attribute to value.
-//
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// If not specified, defaults to "NHWC"
-func MaxPoolV2DataFormat(value string) MaxPoolV2Attr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// Performs max pooling on the input.
+// Creates a dataset that emits the outputs of `input_dataset` `count` times.
 //
 // Arguments:
-//	input: 4-D input to pool over.
-//	ksize: The size of the window for each dimension of the input tensor.
-//	strides: The stride of the sliding window for each dimension of the
-// input tensor.
-//	padding: The type of padding algorithm to use.
 //
-// Returns The max pooled output tensor.
-func MaxPoolV2(scope *Scope, input tf.Output, ksize tf.Output, strides tf.Output, padding string, optional ...MaxPoolV2Attr) (output tf.Output) {
+//	count: A scalar representing the number of times that `input_dataset` should
+// be repeated. A value of `-1` indicates that it should be repeated infinitely.
+//
+//
+func RepeatDataset(scope *Scope, input_dataset tf.Output, count tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "MaxPoolV2",
+		Type: "RepeatDataset",
 		Input: []tf.Input{
-			input, ksize, strides,
+			input_dataset, count,
 		},
 		Attrs: attrs,
 	}
@@ -13276,98 +13000,76 @@ func MaxPoolV2(scope *Scope, input tf.Output, ksize tf.Output, strides tf.Output
 	return op.Output(0)
 }
 
-// MutableDenseHashTableV2Attr is an optional argument to MutableDenseHashTableV2.
-type MutableDenseHashTableV2Attr func(optionalAttr)
+// AddManySparseToTensorsMapAttr is an optional argument to AddManySparseToTensorsMap.
+type AddManySparseToTensorsMapAttr func(optionalAttr)
 
-// MutableDenseHashTableV2Container sets the optional container attribute to value.
+// AddManySparseToTensorsMapContainer sets the optional container attribute to value.
 //
-// value: If non-empty, this table is placed in the given container.
-// Otherwise, a default container is used.
+// value: The container name for the `SparseTensorsMap` created by this op.
 // If not specified, defaults to ""
-func MutableDenseHashTableV2Container(value string) MutableDenseHashTableV2Attr {
+func AddManySparseToTensorsMapContainer(value string) AddManySparseToTensorsMapAttr {
 	return func(m optionalAttr) {
 		m["container"] = value
 	}
 }
 
-// MutableDenseHashTableV2SharedName sets the optional shared_name attribute to value.
+// AddManySparseToTensorsMapSharedName sets the optional shared_name attribute to value.
 //
-// value: If non-empty, this table is shared under the given name across
-// multiple sessions.
+// value: The shared name for the `SparseTensorsMap` created by this op.
+// If blank, the new Operation's unique name is used.
 // If not specified, defaults to ""
-func MutableDenseHashTableV2SharedName(value string) MutableDenseHashTableV2Attr {
+func AddManySparseToTensorsMapSharedName(value string) AddManySparseToTensorsMapAttr {
 	return func(m optionalAttr) {
 		m["shared_name"] = value
 	}
 }
 
-// MutableDenseHashTableV2UseNodeNameSharing sets the optional use_node_name_sharing attribute to value.
-// If not specified, defaults to false
-func MutableDenseHashTableV2UseNodeNameSharing(value bool) MutableDenseHashTableV2Attr {
-	return func(m optionalAttr) {
-		m["use_node_name_sharing"] = value
-	}
-}
-
-// MutableDenseHashTableV2ValueShape sets the optional value_shape attribute to value.
+// Add an `N`-minibatch `SparseTensor` to a `SparseTensorsMap`, return `N` handles.
 //
-// value: The shape of each value.
-// If not specified, defaults to <>
-func MutableDenseHashTableV2ValueShape(value tf.Shape) MutableDenseHashTableV2Attr {
-	return func(m optionalAttr) {
-		m["value_shape"] = value
-	}
-}
-
-// MutableDenseHashTableV2InitialNumBuckets sets the optional initial_num_buckets attribute to value.
+// A `SparseTensor` of rank `R` is represented by three tensors: `sparse_indices`,
+// `sparse_values`, and `sparse_shape`, where
 //
-// value: The initial number of hash table buckets. Must be a power
-// to 2.
-// If not specified, defaults to 131072
-func MutableDenseHashTableV2InitialNumBuckets(value int64) MutableDenseHashTableV2Attr {
-	return func(m optionalAttr) {
-		m["initial_num_buckets"] = value
-	}
-}
-
-// MutableDenseHashTableV2MaxLoadFactor sets the optional max_load_factor attribute to value.
+// ```sparse_indices.shape[1] == sparse_shape.shape[0] == R```
 //
-// value: The maximum ratio between number of entries and number of
-// buckets before growing the table. Must be between 0 and 1.
-// If not specified, defaults to 0.8
-func MutableDenseHashTableV2MaxLoadFactor(value float32) MutableDenseHashTableV2Attr {
-	return func(m optionalAttr) {
-		m["max_load_factor"] = value
-	}
-}
-
-// Creates an empty hash table that uses tensors as the backing store.
+// An `N`-minibatch of `SparseTensor` objects is represented as a `SparseTensor`
+// having a first `sparse_indices` column taking values between `[0, N)`, where
+// the minibatch size `N == sparse_shape[0]`.
 //
-// It uses "open addressing" with quadratic reprobing to resolve
-// collisions.
+// The input `SparseTensor` must have rank `R` greater than 1, and the first
+// dimension is treated as the minibatch dimension.  Elements of the `SparseTensor`
+// must be sorted in increasing order of this first dimension.  The stored
+// `SparseTensor` objects pointed to by each row of the output `sparse_handles`
+// will have rank `R-1`.
 //
-// This op creates a mutable hash table, specifying the type of its keys and
-// values. Each value must be a scalar. Data can be inserted into the table using
-// the insert operations. It does not support the initialization operation.
+// The `SparseTensor` values can then be read out as part of a minibatch by passing
+// the given keys as vector elements to `TakeManySparseFromTensorsMap`.  To ensure
+// the correct `SparseTensorsMap` is accessed, ensure that the same
+// `container` and `shared_name` are passed to that Op.  If no `shared_name`
+// is provided here, instead use the *name* of the Operation created by calling
+// `AddManySparseToTensorsMap` as the `shared_name` passed to
+// `TakeManySparseFromTensorsMap`.  Ensure the Operations are colocated.
 //
 // Arguments:
-//	empty_key: The key used to represent empty key buckets internally. Must not
-// be used in insert or lookup operations.
-//	value_dtype: Type of the table values.
+//	sparse_indices: 2-D.  The `indices` of the minibatch `SparseTensor`.
+// `sparse_indices[:, 0]` must be ordered values in `[0, N)`.
+//	sparse_values: 1-D.  The `values` of the minibatch `SparseTensor`.
+//	sparse_shape: 1-D.  The `shape` of the minibatch `SparseTensor`.
+// The minibatch size `N == sparse_shape[0]`.
 //
-// Returns Handle to a table.
-func MutableDenseHashTableV2(scope *Scope, empty_key tf.Output, value_dtype tf.DataType, optional ...MutableDenseHashTableV2Attr) (table_handle tf.Output) {
+// Returns 1-D.  The handles of the `SparseTensor` now stored in the
+// `SparseTensorsMap`.  Shape: `[N]`.
+func AddManySparseToTensorsMap(scope *Scope, sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output, optional ...AddManySparseToTensorsMapAttr) (sparse_handles tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"value_dtype": value_dtype}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MutableDenseHashTableV2",
+		Type: "AddManySparseToTensorsMap",
 		Input: []tf.Input{
-			empty_key,
+			sparse_indices, sparse_values, sparse_shape,
 		},
 		Attrs: attrs,
 	}
@@ -13375,116 +13077,55 @@ func MutableDenseHashTableV2(scope *Scope, empty_key tf.Output, value_dtype tf.D
 	return op.Output(0)
 }
 
-// StageSizeAttr is an optional argument to StageSize.
-type StageSizeAttr func(optionalAttr)
-
-// StageSizeCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func StageSizeCapacity(value int64) StageSizeAttr {
-	return func(m optionalAttr) {
-		m["capacity"] = value
-	}
-}
-
-// StageSizeMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func StageSizeMemoryLimit(value int64) StageSizeAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
-	}
-}
-
-// StageSizeContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func StageSizeContainer(value string) StageSizeAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// StageSizeSharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func StageSizeSharedName(value string) StageSizeAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// Op returns the number of elements in the underlying container.
-func StageSize(scope *Scope, dtypes []tf.DataType, optional ...StageSizeAttr) (size tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "StageSize",
-
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Produces the max pool of the input tensor for quantized types.
+// Concatenates tensors along one dimension.
 //
 // Arguments:
-//	input: The 4D (batch x rows x cols x depth) Tensor to MaxReduce over.
-//	min_input: The float value that the lowest quantized input value represents.
-//	max_input: The float value that the highest quantized input value represents.
-//	ksize: The size of the window for each dimension of the input tensor.
-// The length must be 4 to match the number of dimensions of the input.
-//	strides: The stride of the sliding window for each dimension of the input
-// tensor. The length must be 4 to match the number of dimensions of the input.
-//	padding: The type of padding algorithm to use.
+//	values: List of `N` Tensors to concatenate. Their ranks and types must match,
+// and their sizes must match in all dimensions except `concat_dim`.
+//	axis: 0-D.  The dimension along which to concatenate.  Must be in the
+// range [-rank(values), rank(values)).
 //
-// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
-func QuantizedMaxPool(scope *Scope, input tf.Output, min_input tf.Output, max_input tf.Output, ksize []int64, strides []int64, padding string) (output tf.Output, min_output tf.Output, max_output tf.Output) {
+// Returns A `Tensor` with the concatenation of values stacked along the
+// `concat_dim` dimension.  This tensor's shape matches that of `values` except
+// in `concat_dim` where it has the sum of the sizes.
+func ConcatV2(scope *Scope, values []tf.Output, axis tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "QuantizedMaxPool",
+		Type: "ConcatV2",
 		Input: []tf.Input{
-			input, min_input, max_input,
+			tf.OutputList(values), axis,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Computes softplus: `log(exp(features) + 1)`.
-func Softplus(scope *Scope, features tf.Output) (activations tf.Output) {
+// Reads and outputs the entire contents of the input filename.
+func ReadFile(scope *Scope, filename tf.Output) (contents tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Softplus",
+		Type: "ReadFile",
 		Input: []tf.Input{
-			features,
+			filename,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes exponential of x - 1 element-wise.
+// Computes sigmoid of `x` element-wise.
 //
-// I.e., \\(y = (\exp x) - 1\\).
-func Expm1(scope *Scope, x tf.Output) (y tf.Output) {
+// Specifically, `y = 1 / (1 + exp(-x))`.
+func Sigmoid(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Expm1",
+		Type: "Sigmoid",
 		Input: []tf.Input{
 			x,
 		},
@@ -13493,185 +13134,209 @@ func Expm1(scope *Scope, x tf.Output) (y tf.Output) {
 	return op.Output(0)
 }
 
-// Returns the number of records this Reader has produced.
-//
-// This is the same as the number of ReaderRead executions that have
-// succeeded.
+// FusedBatchNormAttr is an optional argument to FusedBatchNorm.
+type FusedBatchNormAttr func(optionalAttr)
+
+// FusedBatchNormEpsilon sets the optional epsilon attribute to value.
 //
-// Arguments:
-//	reader_handle: Handle to a Reader.
-func ReaderNumRecordsProducedV2(scope *Scope, reader_handle tf.Output) (records_produced tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "ReaderNumRecordsProducedV2",
-		Input: []tf.Input{
-			reader_handle,
-		},
+// value: A small float number added to the variance of x.
+// If not specified, defaults to 0.0001
+func FusedBatchNormEpsilon(value float32) FusedBatchNormAttr {
+	return func(m optionalAttr) {
+		m["epsilon"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Computes the sum along segments of a tensor.
+// FusedBatchNormDataFormat sets the optional data_format attribute to value.
 //
-// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
-// segments.
+// value: The data format for x and y. Either "NHWC" (default) or "NCHW".
+// If not specified, defaults to "NHWC"
+func FusedBatchNormDataFormat(value string) FusedBatchNormAttr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// FusedBatchNormIsTraining sets the optional is_training attribute to value.
 //
-// Computes a tensor such that
-// \\(output_i = \sum_j data_j\\) where sum is over `j` such
-// that `segment_ids[j] == i`.
-//
-// If the sum is empty for a given segment ID `i`, `output[i] = 0`.
+// value: A bool value to indicate the operation is for training (default)
+// or inference.
+// If not specified, defaults to true
+func FusedBatchNormIsTraining(value bool) FusedBatchNormAttr {
+	return func(m optionalAttr) {
+		m["is_training"] = value
+	}
+}
+
+// Batch normalization.
 //
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentSum.png" alt>
-// </div>
+// Note that the size of 4D Tensors are defined by either "NHWC" or "NCHW".
+// The size of 1D Tensors matches the dimension C of the 4D Tensors.
 //
 // Arguments:
+//	x: A 4D Tensor for input data.
+//	scale: A 1D Tensor for scaling factor, to scale the normalized x.
+//	offset: A 1D Tensor for offset, to shift to the normalized x.
+//	mean: A 1D Tensor for population mean. Used for inference only;
+// must be empty for training.
+//	variance: A 1D Tensor for population variance. Used for inference only;
+// must be empty for training.
 //
-//	segment_ids: A 1-D tensor whose rank is equal to the rank of `data`'s
-// first dimension.  Values should be sorted and can be repeated.
-//
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SegmentSum(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
+// Returns A 4D Tensor for output data.A 1D Tensor for the computed batch mean, to be used by TensorFlow
+// to compute the running mean.A 1D Tensor for the computed batch variance, to be used by
+// TensorFlow to compute the running variance.A 1D Tensor for the computed batch mean, to be reused
+// in the gradient computation.A 1D Tensor for the computed batch variance (inverted variance
+// in the cuDNN case), to be reused in the gradient computation.
+func FusedBatchNorm(scope *Scope, x tf.Output, scale tf.Output, offset tf.Output, mean tf.Output, variance tf.Output, optional ...FusedBatchNormAttr) (y tf.Output, batch_mean tf.Output, batch_variance tf.Output, reserve_space_1 tf.Output, reserve_space_2 tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SegmentSum",
+		Type: "FusedBatchNorm",
 		Input: []tf.Input{
-			data, segment_ids,
+			x, scale, offset, mean, variance,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
 }
 
-// Creates a dataset that emits the lines of one or more text files.
+// RandomStandardNormalAttr is an optional argument to RandomStandardNormal.
+type RandomStandardNormalAttr func(optionalAttr)
+
+// RandomStandardNormalSeed sets the optional seed attribute to value.
 //
-// Arguments:
-//	filenames: A scalar or a vector containing the name(s) of the file(s) to be
-// read.
-//	compression_type: A scalar containing either (i) the empty string (no
-// compression), (ii) "ZLIB", or (iii) "GZIP".
-//	buffer_size: A scalar containing the number of bytes to buffer.
-func TextLineDataset(scope *Scope, filenames tf.Output, compression_type tf.Output, buffer_size tf.Output) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
+// value: If either `seed` or `seed2` are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func RandomStandardNormalSeed(value int64) RandomStandardNormalAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "TextLineDataset",
-		Input: []tf.Input{
-			filenames, compression_type, buffer_size,
-		},
+}
+
+// RandomStandardNormalSeed2 sets the optional seed2 attribute to value.
+//
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func RandomStandardNormalSeed2(value int64) RandomStandardNormalAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Computes gradients for SparseSegmentMean.
+// Outputs random values from a normal distribution.
 //
-// Returns tensor "output" with same shape as grad, except for dimension 0 whose
-// value is output_dim0.
+// The generated values will have mean 0 and standard deviation 1.
 //
 // Arguments:
-//	grad: gradient propagated to the SparseSegmentMean op.
-//	indices: indices passed to the corresponding SparseSegmentMean op.
-//	segment_ids: segment_ids passed to the corresponding SparseSegmentMean op.
-//	output_dim0: dimension 0 of "data" passed to SparseSegmentMean op.
-func SparseSegmentMeanGrad(scope *Scope, grad tf.Output, indices tf.Output, segment_ids tf.Output, output_dim0 tf.Output) (output tf.Output) {
+//	shape: The shape of the output tensor.
+//	dtype: The type of the output.
+//
+// Returns A tensor of the specified shape filled with random normal values.
+func RandomStandardNormal(scope *Scope, shape tf.Output, dtype tf.DataType, optional ...RandomStandardNormalAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"dtype": dtype}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SparseSegmentMeanGrad",
+		Type: "RandomStandardNormal",
 		Input: []tf.Input{
-			grad, indices, segment_ids, output_dim0,
+			shape,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns the truth value of (x >= y) element-wise.
+// Component-wise divides a SparseTensor by a dense Tensor.
 //
-// *NOTE*: `GreaterEqual` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func GreaterEqual(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// *Limitation*: this Op only broadcasts the dense side to the sparse side, but not
+// the other direction.
+//
+// Arguments:
+//	sp_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, possibly not in canonical ordering.
+//	sp_values: 1-D.  `N` non-empty values corresponding to `sp_indices`.
+//	sp_shape: 1-D.  Shape of the input SparseTensor.
+//	dense: `R`-D.  The dense Tensor operand.
+//
+// Returns 1-D.  The `N` values that are operated on.
+func SparseDenseCwiseDiv(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output, dense tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "GreaterEqual",
+		Type: "SparseDenseCwiseDiv",
 		Input: []tf.Input{
-			x, y,
+			sp_indices, sp_values, sp_shape, dense,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Conv3DAttr is an optional argument to Conv3D.
-type Conv3DAttr func(optionalAttr)
+// FractionalAvgPoolGradAttr is an optional argument to FractionalAvgPoolGrad.
+type FractionalAvgPoolGradAttr func(optionalAttr)
 
-// Conv3DDataFormat sets the optional data_format attribute to value.
+// FractionalAvgPoolGradOverlapping sets the optional overlapping attribute to value.
 //
-// value: The data format of the input and output data. With the
-// default format "NDHWC", the data is stored in the order of:
-//     [batch, in_depth, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCDHW", the data storage order is:
-//     [batch, in_channels, in_depth, in_height, in_width].
-// If not specified, defaults to "NDHWC"
-func Conv3DDataFormat(value string) Conv3DAttr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// Conv3DDilations sets the optional dilations attribute to value.
+// value: When set to True, it means when pooling, the values at the boundary
+// of adjacent pooling cells are used by both cells. For example:
 //
-// value: 1-D tensor of length 5.  The dilation factor for each dimension of
-// `input`. If set to k > 1, there will be k-1 skipped cells between each
-// filter element on that dimension. The dimension order is determined by the
-// value of `data_format`, see above for details. Dilations in the batch and
-// depth dimensions must be 1.
-// If not specified, defaults to <i:1 i:1 i:1 i:1 i:1 >
-func Conv3DDilations(value []int64) Conv3DAttr {
+// `index  0  1  2  3  4`
+//
+// `value  20 5  16 3  7`
+//
+// If the pooling sequence is [0, 2, 4], then 16, at index 2 will be used twice.
+// The result would be [41/3, 26/3] for fractional avg pooling.
+// If not specified, defaults to false
+func FractionalAvgPoolGradOverlapping(value bool) FractionalAvgPoolGradAttr {
 	return func(m optionalAttr) {
-		m["dilations"] = value
+		m["overlapping"] = value
 	}
 }
 
-// Computes a 3-D convolution given 5-D `input` and `filter` tensors.
-//
-// In signal processing, cross-correlation is a measure of similarity of
-// two waveforms as a function of a time-lag applied to one of them. This
-// is also known as a sliding dot product or sliding inner-product.
+// Computes gradient of the FractionalAvgPool function.
 //
-// Our Conv3D implements a form of cross-correlation.
+// Unlike FractionalMaxPoolGrad, we don't need to find arg_max for
+// FractionalAvgPoolGrad, we just need to evenly back-propagate each element of
+// out_backprop to those indices that form the same pooling cell. Therefore, we
+// just need to know the shape of original input tensor, instead of the whole
+// tensor.
 //
 // Arguments:
-//	input: Shape `[batch, in_depth, in_height, in_width, in_channels]`.
-//	filter: Shape `[filter_depth, filter_height, filter_width, in_channels,
-// out_channels]`. `in_channels` must match between `input` and `filter`.
-//	strides: 1-D tensor of length 5. The stride of the sliding window for each
-// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
-//	padding: The type of padding algorithm to use.
-func Conv3D(scope *Scope, input tf.Output, filter tf.Output, strides []int64, padding string, optional ...Conv3DAttr) (output tf.Output) {
+//	orig_input_tensor_shape: Original input tensor shape for `fractional_avg_pool`
+//	out_backprop: 4-D with shape `[batch, height, width, channels]`.  Gradients
+// w.r.t. the output of `fractional_avg_pool`.
+//	row_pooling_sequence: row pooling sequence, form pooling region with
+// col_pooling_sequence.
+//	col_pooling_sequence: column pooling sequence, form pooling region with
+// row_pooling sequence.
+//
+// Returns 4-D.  Gradients w.r.t. the input of `fractional_avg_pool`.
+func FractionalAvgPoolGrad(scope *Scope, orig_input_tensor_shape tf.Output, out_backprop tf.Output, row_pooling_sequence tf.Output, col_pooling_sequence tf.Output, optional ...FractionalAvgPoolGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Conv3D",
+		Type: "FractionalAvgPoolGrad",
 		Input: []tf.Input{
-			input, filter,
+			orig_input_tensor_shape, out_backprop, row_pooling_sequence, col_pooling_sequence,
 		},
 		Attrs: attrs,
 	}
@@ -13679,351 +13344,219 @@ func Conv3D(scope *Scope, input tf.Output, filter tf.Output, strides []int64, pa
 	return op.Output(0)
 }
 
-// Adds up a SparseTensor and a dense Tensor, using these special rules:
-//
-// (1) Broadcasts the dense side to have the same shape as the sparse side, if
-//     eligible;
-// (2) Then, only the dense values pointed to by the indices of the SparseTensor
-//     participate in the cwise addition.
-//
-// By these rules, the result is a logical SparseTensor with exactly the same
-// indices and shape, but possibly with different non-zero values.  The output of
-// this Op is the resultant non-zero values.
+// Concatenates tensors along one dimension.
 //
 // Arguments:
-//	sp_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, possibly not in canonical ordering.
-//	sp_values: 1-D.  `N` non-empty values corresponding to `sp_indices`.
-//	sp_shape: 1-D.  Shape of the input SparseTensor.
-//	dense: `R`-D.  The dense Tensor operand.
+//	concat_dim: 0-D.  The dimension along which to concatenate.  Must be in the
+// range [0, rank(values)).
+//	values: The `N` Tensors to concatenate. Their ranks and types must match,
+// and their sizes must match in all dimensions except `concat_dim`.
 //
-// Returns 1-D.  The `N` values that are operated on.
-func SparseDenseCwiseAdd(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output, dense tf.Output) (output tf.Output) {
+// Returns A `Tensor` with the concatenation of values stacked along the
+// `concat_dim` dimension.  This tensor's shape matches that of `values` except
+// in `concat_dim` where it has the sum of the sizes.
+func Concat(scope *Scope, concat_dim tf.Output, values []tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseDenseCwiseAdd",
+		Type: "Concat",
 		Input: []tf.Input{
-			sp_indices, sp_values, sp_shape, dense,
+			concat_dim, tf.OutputList(values),
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Read an element from the TensorArray into output `value`.
-//
-// Arguments:
-//	handle: The handle to a TensorArray.
-//
-//	flow_in: A float scalar that enforces proper chaining of operations.
-//	dtype: The type of the elem that is returned.
+// ResourceApplyMomentumAttr is an optional argument to ResourceApplyMomentum.
+type ResourceApplyMomentumAttr func(optionalAttr)
+
+// ResourceApplyMomentumUseLocking sets the optional use_locking attribute to value.
 //
-// Returns The tensor that is read from the TensorArray.
-func TensorArrayReadV3(scope *Scope, handle tf.Output, index tf.Output, flow_in tf.Output, dtype tf.DataType) (value tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"dtype": dtype}
-	opspec := tf.OpSpec{
-		Type: "TensorArrayReadV3",
-		Input: []tf.Input{
-			handle, index, flow_in,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// QuantizeV2Attr is an optional argument to QuantizeV2.
-type QuantizeV2Attr func(optionalAttr)
-
-// QuantizeV2Mode sets the optional mode attribute to value.
-// If not specified, defaults to "MIN_COMBINED"
-func QuantizeV2Mode(value string) QuantizeV2Attr {
-	return func(m optionalAttr) {
-		m["mode"] = value
+// value: If `True`, updating of the var and accum tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceApplyMomentumUseLocking(value bool) ResourceApplyMomentumAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
 	}
 }
 
-// QuantizeV2RoundMode sets the optional round_mode attribute to value.
-// If not specified, defaults to "HALF_AWAY_FROM_ZERO"
-func QuantizeV2RoundMode(value string) QuantizeV2Attr {
+// ResourceApplyMomentumUseNesterov sets the optional use_nesterov attribute to value.
+//
+// value: If `True`, the tensor passed to compute grad will be
+// var - lr * momentum * accum, so in the end, the var you get is actually
+// var - lr * momentum * accum.
+// If not specified, defaults to false
+func ResourceApplyMomentumUseNesterov(value bool) ResourceApplyMomentumAttr {
 	return func(m optionalAttr) {
-		m["round_mode"] = value
+		m["use_nesterov"] = value
 	}
 }
 
-// Quantize the 'input' tensor of type float to 'output' tensor of type 'T'.
-//
-// [min_range, max_range] are scalar floats that specify the range for
-// the 'input' data. The 'mode' attribute controls exactly which calculations are
-// used to convert the float values to their quantized equivalents.  The
-// 'round_mode' attribute controls which rounding tie-breaking algorithm is used
-// when rounding float values to their quantized equivalents.
-//
-// In 'MIN_COMBINED' mode, each value of the tensor will undergo the following:
-//
-// ```
-// out[i] = (in[i] - min_range) * range(T) / (max_range - min_range)
-// if T == qint8, out[i] -= (range(T) + 1) / 2.0
-// ```
-// here `range(T) = numeric_limits<T>::max() - numeric_limits<T>::min()`
-//
-// *MIN_COMBINED Mode Example*
-//
-// Assume the input is type float and has a possible range of [0.0, 6.0] and the
-// output type is quint8 ([0, 255]). The min_range and max_range values should be
-// specified as 0.0 and 6.0. Quantizing from float to quint8 will multiply each
-// value of the input by 255/6 and cast to quint8.
-//
-// If the output type was qint8 ([-128, 127]), the operation will additionally
-// subtract each value by 128 prior to casting, so that the range of values aligns
-// with the range of qint8.
-//
-// If the mode is 'MIN_FIRST', then this approach is used:
-//
-// ```
-// num_discrete_values = 1 << (# of bits in T)
-// range_adjust = num_discrete_values / (num_discrete_values - 1)
-// range = (range_max - range_min) * range_adjust
-// range_scale = num_discrete_values / range
-// quantized = round(input * range_scale) - round(range_min * range_scale) +
-//   numeric_limits<T>::min()
-// quantized = max(quantized, numeric_limits<T>::min())
-// quantized = min(quantized, numeric_limits<T>::max())
-// ```
-//
-// The biggest difference between this and MIN_COMBINED is that the minimum range
-// is rounded first, before it's subtracted from the rounded value. With
-// MIN_COMBINED, a small bias is introduced where repeated iterations of quantizing
-// and dequantizing will introduce a larger and larger error.
-//
-// *SCALED mode Example*
-//
-// `SCALED` mode matches the quantization approach used in
-// `QuantizeAndDequantize{V2|V3}`.
-//
-// If the mode is `SCALED`, we do not use the full range of the output type,
-// choosing to elide the lowest possible value for symmetry (e.g., output range is
-// -127 to 127, not -128 to 127 for signed 8 bit quantization), so that 0.0 maps to
-// 0.
-//
-// We first find the range of values in our tensor. The
-// range we use is always centered on 0, so we find m such that
-// ```c++
-//   m = max(abs(input_min), abs(input_max))
-// ```
-//
-// Our input tensor range is then `[-m, m]`.
-//
-// Next, we choose our fixed-point quantization buckets, `[min_fixed, max_fixed]`.
-// If T is signed, this is
-// ```
-//   num_bits = sizeof(T) * 8
-//   [min_fixed, max_fixed] =
-//       [-(1 << (num_bits - 1) - 1), (1 << (num_bits - 1)) - 1]
-// ```
-//
-// Otherwise, if T is unsigned, the fixed-point range is
-// ```
-//   [min_fixed, max_fixed] = [0, (1 << num_bits) - 1]
-// ```
-//
-// From this we compute our scaling factor, s:
-// ```c++
-//   s = (max_fixed - min_fixed) / (2 * m)
-// ```
+// Update '*var' according to the momentum scheme. Set use_nesterov = True if you
 //
-// Now we can quantize the elements of our tensor:
-// ```c++
-// result = round(input * s)
-// ```
+// want to use Nesterov momentum.
 //
-// One thing to watch out for is that the operator may choose to adjust the
-// requested minimum and maximum values slightly during the quantization process,
-// so you should always use the output ports as the range for further calculations.
-// For example, if the requested minimum and maximum values are close to equal,
-// they will be separated by a small epsilon value to prevent ill-formed quantized
-// buffers from being created. Otherwise, you can end up with buffers where all the
-// quantized values map to the same float value, which causes problems for
-// operations that have to perform further calculations on them.
+// accum = accum * momentum + grad
+// var -= lr * accum
 //
 // Arguments:
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	grad: The gradient.
+//	momentum: Momentum. Must be a scalar.
 //
-//	min_range: The minimum scalar value possibly produced for the input.
-//	max_range: The maximum scalar value possibly produced for the input.
-//
-//
-// Returns The quantized data produced from the float input.The actual minimum scalar value used for the output.The actual maximum scalar value used for the output.
-func QuantizeV2(scope *Scope, input tf.Output, min_range tf.Output, max_range tf.Output, T tf.DataType, optional ...QuantizeV2Attr) (output tf.Output, output_min tf.Output, output_max tf.Output) {
+// Returns the created operation.
+func ResourceApplyMomentum(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, grad tf.Output, momentum tf.Output, optional ...ResourceApplyMomentumAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"T": T}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "QuantizeV2",
+		Type: "ResourceApplyMomentum",
 		Input: []tf.Input{
-			input, min_range, max_range,
+			var_, accum, lr, grad, momentum,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return scope.AddOperation(opspec)
 }
 
-// Returns the truth value of (x < y) element-wise.
+// MaxPoolGradGradAttr is an optional argument to MaxPoolGradGrad.
+type MaxPoolGradGradAttr func(optionalAttr)
+
+// MaxPoolGradGradDataFormat sets the optional data_format attribute to value.
 //
-// *NOTE*: `Less` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Less(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// If not specified, defaults to "NHWC"
+func MaxPoolGradGradDataFormat(value string) MaxPoolGradGradAttr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// Computes second-order gradients of the maxpooling function.
+//
+// Arguments:
+//	orig_input: The original input tensor.
+//	orig_output: The original output tensor.
+//	grad: 4-D.  Gradients of gradients w.r.t. the input of `max_pool`.
+//	ksize: The size of the window for each dimension of the input tensor.
+//	strides: The stride of the sliding window for each dimension of the
+// input tensor.
+//	padding: The type of padding algorithm to use.
+//
+// Returns Gradients of gradients w.r.t. the input to `max_pool`.
+func MaxPoolGradGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPoolGradGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Less",
+		Type: "MaxPoolGradGrad",
 		Input: []tf.Input{
-			x, y,
+			orig_input, orig_output, grad,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// QuantizedReluXAttr is an optional argument to QuantizedReluX.
-type QuantizedReluXAttr func(optionalAttr)
-
-// QuantizedReluXOutType sets the optional out_type attribute to value.
-// If not specified, defaults to DT_QUINT8
-func QuantizedReluXOutType(value tf.DataType) QuantizedReluXAttr {
-	return func(m optionalAttr) {
-		m["out_type"] = value
-	}
-}
-
-// Computes Quantized Rectified Linear X: `min(max(features, 0), max_value)`
-//
-// Arguments:
-//
+// Returns element-wise integer closest to x.
 //
-//	min_features: The float value that the lowest quantized value represents.
-//	max_features: The float value that the highest quantized value represents.
+// If the result is midway between two representable values,
+// the even representable is chosen.
+// For example:
 //
-// Returns Has the same output shape as "features".The float value that the lowest quantized value represents.The float value that the highest quantized value represents.
-func QuantizedReluX(scope *Scope, features tf.Output, max_value tf.Output, min_features tf.Output, max_features tf.Output, optional ...QuantizedReluXAttr) (activations tf.Output, min_activations tf.Output, max_activations tf.Output) {
+// ```
+// rint(-1.5) ==> -2.0
+// rint(0.5000001) ==> 1.0
+// rint([-1.7, -1.5, -0.2, 0.2, 1.5, 1.7, 2.0]) ==> [-2., -2., -0., 0., 2., 2., 2.]
+// ```
+func Rint(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "QuantizedReluX",
+		Type: "Rint",
 		Input: []tf.Input{
-			features, max_value, min_features, max_features,
+			x,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// WholeFileReaderV2Attr is an optional argument to WholeFileReaderV2.
-type WholeFileReaderV2Attr func(optionalAttr)
+// OrderedMapUnstageNoKeyAttr is an optional argument to OrderedMapUnstageNoKey.
+type OrderedMapUnstageNoKeyAttr func(optionalAttr)
 
-// WholeFileReaderV2Container sets the optional container attribute to value.
+// OrderedMapUnstageNoKeyCapacity sets the optional capacity attribute to value.
+// If not specified, defaults to 0
 //
-// value: If non-empty, this reader is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func WholeFileReaderV2Container(value string) WholeFileReaderV2Attr {
+// REQUIRES: value >= 0
+func OrderedMapUnstageNoKeyCapacity(value int64) OrderedMapUnstageNoKeyAttr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["capacity"] = value
 	}
 }
 
-// WholeFileReaderV2SharedName sets the optional shared_name attribute to value.
+// OrderedMapUnstageNoKeyMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
 //
-// value: If non-empty, this reader is named in the given bucket
-// with this shared_name. Otherwise, the node name is used instead.
-// If not specified, defaults to ""
-func WholeFileReaderV2SharedName(value string) WholeFileReaderV2Attr {
+// REQUIRES: value >= 0
+func OrderedMapUnstageNoKeyMemoryLimit(value int64) OrderedMapUnstageNoKeyAttr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["memory_limit"] = value
 	}
 }
 
-// A Reader that outputs the entire contents of a file as a value.
-//
-// To use, enqueue filenames in a Queue.  The output of ReaderRead will
-// be a filename (key) and the contents of that file (value).
-//
-// Returns The handle to reference the Reader.
-func WholeFileReaderV2(scope *Scope, optional ...WholeFileReaderV2Attr) (reader_handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
+// OrderedMapUnstageNoKeyContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func OrderedMapUnstageNoKeyContainer(value string) OrderedMapUnstageNoKeyAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "WholeFileReaderV2",
+}
 
-		Attrs: attrs,
+// OrderedMapUnstageNoKeySharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func OrderedMapUnstageNoKeySharedName(value string) OrderedMapUnstageNoKeyAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Transforms a tf.Example proto (as a string) into typed tensors.
+// Op removes and returns the (key, value) element with the smallest
 //
-// Arguments:
-//	serialized: A vector containing a batch of binary serialized Example protos.
-//	dense_defaults: A list of Tensors (some may be empty), whose length matches
-// the length of `dense_keys`. dense_defaults[j] provides default values
-// when the example's feature_map lacks dense_key[j].  If an empty Tensor is
-// provided for dense_defaults[j], then the Feature dense_keys[j] is required.
-// The input type is inferred from dense_defaults[j], even when it's empty.
-// If dense_defaults[j] is not empty, and dense_shapes[j] is fully defined,
-// then the shape of dense_defaults[j] must match that of dense_shapes[j].
-// If dense_shapes[j] has an undefined major dimension (variable strides dense
-// feature), dense_defaults[j] must contain a single element:
-// the padding element.
-//	num_sparse: The number of sparse features to be parsed from the example. This
-// must match the lengths of `sparse_keys` and `sparse_types`.
-//	sparse_keys: A list of `num_sparse` strings.
-// The keys expected in the Examples' features associated with sparse values.
-//	dense_keys: The keys expected in the Examples' features associated with dense
-// values.
-//	sparse_types: A list of `num_sparse` types; the data types of data in each
-// Feature given in sparse_keys.
-// Currently the ParseSingleExample op supports DT_FLOAT (FloatList),
-// DT_INT64 (Int64List), and DT_STRING (BytesList).
-//	dense_shapes: The shapes of data in each Feature given in dense_keys.
-// The length of this list must match the length of `dense_keys`.  The
-// number of elements in the Feature corresponding to dense_key[j] must
-// always equal dense_shapes[j].NumEntries().  If dense_shapes[j] ==
-// (D0, D1, ..., DN) then the shape of output Tensor dense_values[j]
-// will be (D0, D1, ..., DN): In the case dense_shapes[j] = (-1, D1,
-// ..., DN), the shape of the output Tensor dense_values[j] will be (M,
-// D1, .., DN), where M is the number of blocks of elements of length
-// D1 * .... * DN, in the input.
-func ParseSingleExample(scope *Scope, serialized tf.Output, dense_defaults []tf.Output, num_sparse int64, sparse_keys []string, dense_keys []string, sparse_types []tf.DataType, dense_shapes []tf.Shape) (sparse_indices []tf.Output, sparse_values []tf.Output, sparse_shapes []tf.Output, dense_values []tf.Output) {
+// key from the underlying container.   If the underlying container
+// does not contain elements, the op will block until it does.
+func OrderedMapUnstageNoKey(scope *Scope, indices tf.Output, dtypes []tf.DataType, optional ...OrderedMapUnstageNoKeyAttr) (key tf.Output, values []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_sparse": num_sparse, "sparse_keys": sparse_keys, "dense_keys": dense_keys, "sparse_types": sparse_types, "dense_shapes": dense_shapes}
+	attrs := map[string]interface{}{"dtypes": dtypes}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "ParseSingleExample",
+		Type: "OrderedMapUnstageNoKey",
 		Input: []tf.Input{
-			serialized, tf.OutputList(dense_defaults),
+			indices,
 		},
 		Attrs: attrs,
 	}
@@ -14033,126 +13566,119 @@ func ParseSingleExample(scope *Scope, serialized tf.Output, dense_defaults []tf.
 	}
 	var idx int
 	var err error
-	if sparse_indices, idx, err = makeOutputList(op, idx, "sparse_indices"); err != nil {
-		scope.UpdateErr("ParseSingleExample", err)
-		return
-	}
-	if sparse_values, idx, err = makeOutputList(op, idx, "sparse_values"); err != nil {
-		scope.UpdateErr("ParseSingleExample", err)
-		return
-	}
-	if sparse_shapes, idx, err = makeOutputList(op, idx, "sparse_shapes"); err != nil {
-		scope.UpdateErr("ParseSingleExample", err)
-		return
-	}
-	if dense_values, idx, err = makeOutputList(op, idx, "dense_values"); err != nil {
-		scope.UpdateErr("ParseSingleExample", err)
+	key = op.Output(idx)
+	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
+		scope.UpdateErr("OrderedMapUnstageNoKey", err)
 		return
 	}
-	return sparse_indices, sparse_values, sparse_shapes, dense_values
+	return key, values
 }
 
-// QuantizedConv2DAttr is an optional argument to QuantizedConv2D.
-type QuantizedConv2DAttr func(optionalAttr)
-
-// QuantizedConv2DOutType sets the optional out_type attribute to value.
-// If not specified, defaults to DT_QINT32
-func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr {
-	return func(m optionalAttr) {
-		m["out_type"] = value
-	}
-}
+// MaxPool3DGradGradAttr is an optional argument to MaxPool3DGradGrad.
+type MaxPool3DGradGradAttr func(optionalAttr)
 
-// QuantizedConv2DDilations sets the optional dilations attribute to value.
+// MaxPool3DGradGradDataFormat sets the optional data_format attribute to value.
 //
-// value: 1-D tensor of length 4.  The dilation factor for each dimension of
-// `input`. If set to k > 1, there will be k-1 skipped cells between each
-// filter element on that dimension. The dimension order is determined by the
-// value of `data_format`, see above for details. Dilations in the batch and
-// depth dimensions must be 1.
-// If not specified, defaults to <i:1 i:1 i:1 i:1 >
-func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr {
+// value: The data format of the input and output data. With the
+// default format "NDHWC", the data is stored in the order of:
+//     [batch, in_depth, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCDHW", the data storage order is:
+//     [batch, in_channels, in_depth, in_height, in_width].
+// If not specified, defaults to "NDHWC"
+func MaxPool3DGradGradDataFormat(value string) MaxPool3DGradGradAttr {
 	return func(m optionalAttr) {
-		m["dilations"] = value
+		m["data_format"] = value
 	}
 }
 
-// Computes a 2D convolution given quantized 4D input and filter tensors.
-//
-// The inputs are quantized tensors where the lowest value represents the real
-// number of the associated minimum, and the highest represents the maximum.
-// This means that you can only interpret the quantized output in the same way, by
-// taking the returned minimum and maximum values into account.
+// Computes second-order gradients of the maxpooling function.
 //
 // Arguments:
-//
-//	filter: filter's input_depth dimension must match input's depth dimensions.
-//	min_input: The float value that the lowest quantized input value represents.
-//	max_input: The float value that the highest quantized input value represents.
-//	min_filter: The float value that the lowest quantized filter value represents.
-//	max_filter: The float value that the highest quantized filter value represents.
-//	strides: The stride of the sliding window for each dimension of the input
-// tensor.
+//	orig_input: The original input tensor.
+//	orig_output: The original output tensor.
+//	grad: Output backprop of shape `[batch, depth, rows, cols, channels]`.
+//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
+// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
+//	strides: 1-D tensor of length 5. The stride of the sliding window for each
+// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
 //	padding: The type of padding algorithm to use.
 //
-// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
-func QuantizedConv2D(scope *Scope, input tf.Output, filter tf.Output, min_input tf.Output, max_input tf.Output, min_filter tf.Output, max_filter tf.Output, strides []int64, padding string, optional ...QuantizedConv2DAttr) (output tf.Output, min_output tf.Output, max_output tf.Output) {
+// Returns Gradients of gradients w.r.t. the input to `max_pool`.
+func MaxPool3DGradGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPool3DGradGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "QuantizedConv2D",
+		Type: "MaxPool3DGradGrad",
 		Input: []tf.Input{
-			input, filter, min_input, max_input, min_filter, max_filter,
+			orig_input, orig_output, grad,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// ResourceGatherAttr is an optional argument to ResourceGather.
-type ResourceGatherAttr func(optionalAttr)
+// Conv3DBackpropFilterV2Attr is an optional argument to Conv3DBackpropFilterV2.
+type Conv3DBackpropFilterV2Attr func(optionalAttr)
 
-// ResourceGatherValidateIndices sets the optional validate_indices attribute to value.
-// If not specified, defaults to true
-func ResourceGatherValidateIndices(value bool) ResourceGatherAttr {
+// Conv3DBackpropFilterV2DataFormat sets the optional data_format attribute to value.
+//
+// value: The data format of the input and output data. With the
+// default format "NDHWC", the data is stored in the order of:
+//     [batch, in_depth, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCDHW", the data storage order is:
+//     [batch, in_channels, in_depth, in_height, in_width].
+// If not specified, defaults to "NDHWC"
+func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr {
 	return func(m optionalAttr) {
-		m["validate_indices"] = value
+		m["data_format"] = value
 	}
 }
 
-// Gather slices from the variable pointed to by `resource` according to `indices`.
-//
-// `indices` must be an integer tensor of any dimension (usually 0-D or 1-D).
-// Produces an output tensor with shape `indices.shape + params.shape[1:]` where:
-//
-// ```python
-//     # Scalar indices
-//     output[:, ..., :] = params[indices, :, ... :]
+// Conv3DBackpropFilterV2Dilations sets the optional dilations attribute to value.
 //
-//     # Vector indices
-//     output[i, :, ..., :] = params[indices[i], :, ... :]
+// value: 1-D tensor of length 5.  The dilation factor for each dimension of
+// `input`. If set to k > 1, there will be k-1 skipped cells between each
+// filter element on that dimension. The dimension order is determined by the
+// value of `data_format`, see above for details. Dilations in the batch and
+// depth dimensions must be 1.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 i:1 >
+func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr {
+	return func(m optionalAttr) {
+		m["dilations"] = value
+	}
+}
+
+// Computes the gradients of 3-D convolution with respect to the filter.
 //
-//     # Higher rank indices
-//     output[i, ..., j, :, ... :] = params[indices[i, ..., j], :, ..., :]
-// ```
-func ResourceGather(scope *Scope, resource tf.Output, indices tf.Output, dtype tf.DataType, optional ...ResourceGatherAttr) (output tf.Output) {
+// Arguments:
+//	input: Shape `[batch, depth, rows, cols, in_channels]`.
+//	filter_sizes: An integer vector representing the tensor shape of `filter`,
+// where `filter` is a 5-D
+// `[filter_depth, filter_height, filter_width, in_channels, out_channels]`
+// tensor.
+//	out_backprop: Backprop signal of shape `[batch, out_depth, out_rows, out_cols,
+// out_channels]`.
+//	strides: 1-D tensor of length 5. The stride of the sliding window for each
+// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+//	padding: The type of padding algorithm to use.
+func Conv3DBackpropFilterV2(scope *Scope, input tf.Output, filter_sizes tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv3DBackpropFilterV2Attr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceGather",
+		Type: "Conv3DBackpropFilterV2",
 		Input: []tf.Input{
-			resource, indices,
+			input, filter_sizes, out_backprop,
 		},
 		Attrs: attrs,
 	}
@@ -14160,237 +13686,224 @@ func ResourceGather(scope *Scope, resource tf.Output, indices tf.Output, dtype t
 	return op.Output(0)
 }
 
-// Delete the TensorArray from its resource container.
+// Execute a sub graph on a remote processor.
 //
-// This enables the user to close and release the resource in the middle
-// of a step/run.
+// The graph specifications(such as graph itself, input tensors and output names)
+// are stored as a serialized protocol buffer of RemoteFusedGraphExecuteInfo
+// as serialized_remote_fused_graph_execute_info.
+// The specifications will be passed to a dedicated registered
+// remote fused graph executor.  The executor will send the graph specifications
+// to a remote processor and execute that graph.  The execution results
+// will be passed to consumer nodes as outputs of this node.
 //
 // Arguments:
-//	handle: The handle to a TensorArray (output of TensorArray or TensorArrayGrad).
+//	inputs: Arbitrary number of tensors with arbitrary data types
 //
-// Returns the created operation.
-func TensorArrayCloseV3(scope *Scope, handle tf.Output) (o *tf.Operation) {
+//	serialized_remote_fused_graph_execute_info: Serialized protocol buffer
+// of RemoteFusedGraphExecuteInfo which contains graph specifications.
+//
+// Returns Arbitrary number of tensors with arbitrary data types
+func RemoteFusedGraphExecute(scope *Scope, inputs []tf.Output, Toutputs []tf.DataType, serialized_remote_fused_graph_execute_info string) (outputs []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"Toutputs": Toutputs, "serialized_remote_fused_graph_execute_info": serialized_remote_fused_graph_execute_info}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayCloseV3",
+		Type: "RemoteFusedGraphExecute",
 		Input: []tf.Input{
-			handle,
+			tf.OutputList(inputs),
 		},
+		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
-}
-
-// Adds two `SparseTensor` objects to produce another `SparseTensor`.
-//
-// The input `SparseTensor` objects' indices are assumed ordered in standard
-// lexicographic order.  If this is not the case, before this step run
-// `SparseReorder` to restore index ordering.
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if outputs, idx, err = makeOutputList(op, idx, "outputs"); err != nil {
+		scope.UpdateErr("RemoteFusedGraphExecute", err)
+		return
+	}
+	return outputs
+}
+
+// SerializeManySparseAttr is an optional argument to SerializeManySparse.
+type SerializeManySparseAttr func(optionalAttr)
+
+// SerializeManySparseOutType sets the optional out_type attribute to value.
 //
-// By default, if two values sum to zero at some index, the output `SparseTensor`
-// would still include that particular location in its index, storing a zero in the
-// corresponding value slot.  To override this, callers can specify `thresh`,
-// indicating that if the sum has a magnitude strictly smaller than `thresh`, its
-// corresponding value and index would then not be included.  In particular,
-// `thresh == 0` (default) means everything is kept and actual thresholding happens
-// only for a positive value.
+// value: The `dtype` to use for serialization; the supported types are `string`
+// (default) and `variant`.
+// If not specified, defaults to DT_STRING
+func SerializeManySparseOutType(value tf.DataType) SerializeManySparseAttr {
+	return func(m optionalAttr) {
+		m["out_type"] = value
+	}
+}
+
+// Serialize an `N`-minibatch `SparseTensor` into an `[N, 3]` `Tensor` object.
 //
-// In the following shapes, `nnz` is the count after taking `thresh` into account.
+// The `SparseTensor` must have rank `R` greater than 1, and the first dimension
+// is treated as the minibatch dimension.  Elements of the `SparseTensor`
+// must be sorted in increasing order of this first dimension.  The serialized
+// `SparseTensor` objects going into each row of `serialized_sparse` will have
+// rank `R-1`.
+//
+// The minibatch size `N` is extracted from `sparse_shape[0]`.
 //
 // Arguments:
-//	a_indices: 2-D.  The `indices` of the first `SparseTensor`, size `[nnz, ndims]` Matrix.
-//	a_values: 1-D.  The `values` of the first `SparseTensor`, size `[nnz]` Vector.
-//	a_shape: 1-D.  The `shape` of the first `SparseTensor`, size `[ndims]` Vector.
-//	b_indices: 2-D.  The `indices` of the second `SparseTensor`, size `[nnz, ndims]` Matrix.
-//	b_values: 1-D.  The `values` of the second `SparseTensor`, size `[nnz]` Vector.
-//	b_shape: 1-D.  The `shape` of the second `SparseTensor`, size `[ndims]` Vector.
-//	thresh: 0-D.  The magnitude threshold that determines if an output value/index
-// pair takes space.
-func SparseAdd(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b_indices tf.Output, b_values tf.Output, b_shape tf.Output, thresh tf.Output) (sum_indices tf.Output, sum_values tf.Output, sum_shape tf.Output) {
+//	sparse_indices: 2-D.  The `indices` of the minibatch `SparseTensor`.
+//	sparse_values: 1-D.  The `values` of the minibatch `SparseTensor`.
+//	sparse_shape: 1-D.  The `shape` of the minibatch `SparseTensor`.
+func SerializeManySparse(scope *Scope, sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output, optional ...SerializeManySparseAttr) (serialized_sparse tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SparseAdd",
+		Type: "SerializeManySparse",
 		Input: []tf.Input{
-			a_indices, a_values, a_shape, b_indices, b_values, b_shape, thresh,
+			sparse_indices, sparse_values, sparse_shape,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// OrderedMapPeekAttr is an optional argument to OrderedMapPeek.
-type OrderedMapPeekAttr func(optionalAttr)
+// Computes inverse hyperbolic cosine of x element-wise.
+func Acosh(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Acosh",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
 
-// OrderedMapPeekCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func OrderedMapPeekCapacity(value int64) OrderedMapPeekAttr {
+// TensorArrayV2Attr is an optional argument to TensorArrayV2.
+type TensorArrayV2Attr func(optionalAttr)
+
+// TensorArrayV2ElementShape sets the optional element_shape attribute to value.
+// If not specified, defaults to <unknown_rank:true >
+func TensorArrayV2ElementShape(value tf.Shape) TensorArrayV2Attr {
 	return func(m optionalAttr) {
-		m["capacity"] = value
+		m["element_shape"] = value
 	}
 }
 
-// OrderedMapPeekMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func OrderedMapPeekMemoryLimit(value int64) OrderedMapPeekAttr {
+// TensorArrayV2DynamicSize sets the optional dynamic_size attribute to value.
+// If not specified, defaults to false
+func TensorArrayV2DynamicSize(value bool) TensorArrayV2Attr {
 	return func(m optionalAttr) {
-		m["memory_limit"] = value
+		m["dynamic_size"] = value
 	}
 }
 
-// OrderedMapPeekContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func OrderedMapPeekContainer(value string) OrderedMapPeekAttr {
+// TensorArrayV2ClearAfterRead sets the optional clear_after_read attribute to value.
+// If not specified, defaults to true
+func TensorArrayV2ClearAfterRead(value bool) TensorArrayV2Attr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["clear_after_read"] = value
 	}
 }
 
-// OrderedMapPeekSharedName sets the optional shared_name attribute to value.
+// TensorArrayV2TensorArrayName sets the optional tensor_array_name attribute to value.
 // If not specified, defaults to ""
-func OrderedMapPeekSharedName(value string) OrderedMapPeekAttr {
+func TensorArrayV2TensorArrayName(value string) TensorArrayV2Attr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["tensor_array_name"] = value
 	}
 }
 
-// Op peeks at the values at the specified key.  If the
+// Deprecated. Use TensorArrayV3
 //
-// underlying container does not contain this key
-// this op will block until it does.   This Op is optimized for
-// performance.
-func OrderedMapPeek(scope *Scope, key tf.Output, indices tf.Output, dtypes []tf.DataType, optional ...OrderedMapPeekAttr) (values []tf.Output) {
+// DEPRECATED at GraphDef version 26: Use TensorArrayV3
+func TensorArrayV2(scope *Scope, size tf.Output, dtype tf.DataType, optional ...TensorArrayV2Attr) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
+	attrs := map[string]interface{}{"dtype": dtype}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "OrderedMapPeek",
+		Type: "TensorArrayV2",
 		Input: []tf.Input{
-			key, indices,
+			size,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
-		scope.UpdateErr("OrderedMapPeek", err)
-		return
-	}
-	return values
+	return op.Output(0)
 }
 
-// DecodeAndCropJpegAttr is an optional argument to DecodeAndCropJpeg.
-type DecodeAndCropJpegAttr func(optionalAttr)
-
-// DecodeAndCropJpegChannels sets the optional channels attribute to value.
-//
-// value: Number of color channels for the decoded image.
-// If not specified, defaults to 0
-func DecodeAndCropJpegChannels(value int64) DecodeAndCropJpegAttr {
-	return func(m optionalAttr) {
-		m["channels"] = value
-	}
-}
+// DecodeCSVAttr is an optional argument to DecodeCSV.
+type DecodeCSVAttr func(optionalAttr)
 
-// DecodeAndCropJpegRatio sets the optional ratio attribute to value.
+// DecodeCSVFieldDelim sets the optional field_delim attribute to value.
 //
-// value: Downscaling ratio.
-// If not specified, defaults to 1
-func DecodeAndCropJpegRatio(value int64) DecodeAndCropJpegAttr {
+// value: char delimiter to separate fields in a record.
+// If not specified, defaults to ","
+func DecodeCSVFieldDelim(value string) DecodeCSVAttr {
 	return func(m optionalAttr) {
-		m["ratio"] = value
+		m["field_delim"] = value
 	}
 }
 
-// DecodeAndCropJpegFancyUpscaling sets the optional fancy_upscaling attribute to value.
+// DecodeCSVUseQuoteDelim sets the optional use_quote_delim attribute to value.
 //
-// value: If true use a slower but nicer upscaling of the
-// chroma planes (yuv420/422 only).
+// value: If false, treats double quotation marks as regular
+// characters inside of the string fields (ignoring RFC 4180, Section 2,
+// Bullet 5).
 // If not specified, defaults to true
-func DecodeAndCropJpegFancyUpscaling(value bool) DecodeAndCropJpegAttr {
-	return func(m optionalAttr) {
-		m["fancy_upscaling"] = value
-	}
-}
-
-// DecodeAndCropJpegTryRecoverTruncated sets the optional try_recover_truncated attribute to value.
-//
-// value: If true try to recover an image from truncated input.
-// If not specified, defaults to false
-func DecodeAndCropJpegTryRecoverTruncated(value bool) DecodeAndCropJpegAttr {
+func DecodeCSVUseQuoteDelim(value bool) DecodeCSVAttr {
 	return func(m optionalAttr) {
-		m["try_recover_truncated"] = value
+		m["use_quote_delim"] = value
 	}
 }
 
-// DecodeAndCropJpegAcceptableFraction sets the optional acceptable_fraction attribute to value.
+// DecodeCSVNaValue sets the optional na_value attribute to value.
 //
-// value: The minimum required fraction of lines before a truncated
-// input is accepted.
-// If not specified, defaults to 1
-func DecodeAndCropJpegAcceptableFraction(value float32) DecodeAndCropJpegAttr {
+// value: Additional string to recognize as NA/NaN.
+// If not specified, defaults to ""
+func DecodeCSVNaValue(value string) DecodeCSVAttr {
 	return func(m optionalAttr) {
-		m["acceptable_fraction"] = value
+		m["na_value"] = value
 	}
 }
 
-// DecodeAndCropJpegDctMethod sets the optional dct_method attribute to value.
-//
-// value: string specifying a hint about the algorithm used for
-// decompression.  Defaults to "" which maps to a system-specific
-// default.  Currently valid values are ["INTEGER_FAST",
-// "INTEGER_ACCURATE"].  The hint may be ignored (e.g., the internal
-// jpeg library changes to a version that does not have that specific
-// option.)
-// If not specified, defaults to ""
-func DecodeAndCropJpegDctMethod(value string) DecodeAndCropJpegAttr {
+// DecodeCSVSelectCols sets the optional select_cols attribute to value.
+// If not specified, defaults to <>
+func DecodeCSVSelectCols(value []int64) DecodeCSVAttr {
 	return func(m optionalAttr) {
-		m["dct_method"] = value
+		m["select_cols"] = value
 	}
 }
 
-// Decode and Crop a JPEG-encoded image to a uint8 tensor.
-//
-// The attr `channels` indicates the desired number of color channels for the
-// decoded image.
-//
-// Accepted values are:
-//
-// *   0: Use the number of channels in the JPEG-encoded image.
-// *   1: output a grayscale image.
-// *   3: output an RGB image.
-//
-// If needed, the JPEG-encoded image is transformed to match the requested number
-// of color channels.
-//
-// The attr `ratio` allows downscaling the image by an integer factor during
-// decoding.  Allowed values are: 1, 2, 4, and 8.  This is much faster than
-// downscaling the image later.
-//
+// Convert CSV records to tensors. Each column maps to one tensor.
 //
-// It is equivalent to a combination of decode and crop, but much faster by only
-// decoding partial jpeg image.
+// RFC 4180 format is expected for the CSV records.
+// (https://tools.ietf.org/html/rfc4180)
+// Note that we allow leading and trailing spaces with int or float field.
 //
 // Arguments:
-//	contents: 0-D.  The JPEG-encoded image.
-//	crop_window: 1-D.  The crop window: [crop_y, crop_x, crop_height, crop_width].
+//	records: Each string is a record/row in the csv and all records should have
+// the same format.
+//	record_defaults: One tensor per column of the input record, with either a
+// scalar default value for that column or empty if the column is required.
 //
-// Returns 3-D with shape `[height, width, channels]`..
-func DecodeAndCropJpeg(scope *Scope, contents tf.Output, crop_window tf.Output, optional ...DecodeAndCropJpegAttr) (image tf.Output) {
+// Returns Each tensor will have the same shape as records.
+func DecodeCSV(scope *Scope, records tf.Output, record_defaults []tf.Output, optional ...DecodeCSVAttr) (output []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -14399,36 +13912,103 @@ func DecodeAndCropJpeg(scope *Scope, contents tf.Output, crop_window tf.Output,
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "DecodeAndCropJpeg",
+		Type: "DecodeCSV",
 		Input: []tf.Input{
-			contents, crop_window,
+			records, tf.OutputList(record_defaults),
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// AllCandidateSamplerAttr is an optional argument to AllCandidateSampler.
-type AllCandidateSamplerAttr func(optionalAttr)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
+		scope.UpdateErr("DecodeCSV", err)
+		return
+	}
+	return output
+}
 
-// AllCandidateSamplerSeed sets the optional seed attribute to value.
+// MapClearAttr is an optional argument to MapClear.
+type MapClearAttr func(optionalAttr)
+
+// MapClearCapacity sets the optional capacity attribute to value.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func MapClearCapacity(value int64) MapClearAttr {
+	return func(m optionalAttr) {
+		m["capacity"] = value
+	}
+}
+
+// MapClearMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func MapClearMemoryLimit(value int64) MapClearAttr {
+	return func(m optionalAttr) {
+		m["memory_limit"] = value
+	}
+}
+
+// MapClearContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func MapClearContainer(value string) MapClearAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// MapClearSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func MapClearSharedName(value string) MapClearAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Op removes all elements in the underlying container.
+//
+// Returns the created operation.
+func MapClear(scope *Scope, dtypes []tf.DataType, optional ...MapClearAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dtypes": dtypes}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "MapClear",
+
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
+// ThreadUnsafeUnigramCandidateSamplerAttr is an optional argument to ThreadUnsafeUnigramCandidateSampler.
+type ThreadUnsafeUnigramCandidateSamplerAttr func(optionalAttr)
+
+// ThreadUnsafeUnigramCandidateSamplerSeed sets the optional seed attribute to value.
 //
 // value: If either seed or seed2 are set to be non-zero, the random number
 // generator is seeded by the given seed.  Otherwise, it is seeded by a
 // random seed.
 // If not specified, defaults to 0
-func AllCandidateSamplerSeed(value int64) AllCandidateSamplerAttr {
+func ThreadUnsafeUnigramCandidateSamplerSeed(value int64) ThreadUnsafeUnigramCandidateSamplerAttr {
 	return func(m optionalAttr) {
 		m["seed"] = value
 	}
 }
 
-// AllCandidateSamplerSeed2 sets the optional seed2 attribute to value.
+// ThreadUnsafeUnigramCandidateSamplerSeed2 sets the optional seed2 attribute to value.
 //
 // value: An second seed to avoid seed collision.
 // If not specified, defaults to 0
-func AllCandidateSamplerSeed2(value int64) AllCandidateSamplerAttr {
+func ThreadUnsafeUnigramCandidateSamplerSeed2(value int64) ThreadUnsafeUnigramCandidateSamplerAttr {
 	return func(m optionalAttr) {
 		m["seed2"] = value
 	}
@@ -14450,10 +14030,11 @@ func AllCandidateSamplerSeed2(value int64) AllCandidateSamplerAttr {
 //	true_classes: A batch_size * num_true matrix, in which each row contains the
 // IDs of the num_true target_classes in the corresponding original label.
 //	num_true: Number of true labels per context.
-//	num_sampled: Number of candidates to produce.
+//	num_sampled: Number of candidates to randomly sample.
 //	unique: If unique is true, we sample with rejection, so that all sampled
 // candidates in a batch are unique. This requires some approximation to
 // estimate the post-rejection sampling probabilities.
+//	range_max: The sampler will sample integers from the interval [0, range_max).
 //
 // Returns A vector of length num_sampled, in which each element is
 // the ID of a sampled candidate.A batch_size * num_true matrix, representing
@@ -14462,16 +14043,16 @@ func AllCandidateSamplerSeed2(value int64) AllCandidateSamplerAttr {
 // candidate representing the number of times the candidate is expected
 // to occur in a batch of sampled candidates.  If unique=true, then this is a
 // probability.
-func AllCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, optional ...AllCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) {
+func ThreadUnsafeUnigramCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, range_max int64, optional ...ThreadUnsafeUnigramCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique}
+	attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique, "range_max": range_max}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "AllCandidateSampler",
+		Type: "ThreadUnsafeUnigramCandidateSampler",
 		Input: []tf.Input{
 			true_classes,
 		},
@@ -14481,108 +14062,144 @@ func AllCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, n
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Saves the input tensors to disk.
-//
-// The size of `tensor_names` must match the number of tensors in `data`. `data[i]`
-// is written to `filename` with name `tensor_names[i]`.
+// MaxPoolV2Attr is an optional argument to MaxPoolV2.
+type MaxPoolV2Attr func(optionalAttr)
+
+// MaxPoolV2DataFormat sets the optional data_format attribute to value.
 //
-// See also `SaveSlices`.
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// If not specified, defaults to "NHWC"
+func MaxPoolV2DataFormat(value string) MaxPoolV2Attr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// Performs max pooling on the input.
 //
 // Arguments:
-//	filename: Must have a single element. The name of the file to which we write
-// the tensor.
-//	tensor_names: Shape `[N]`. The names of the tensors to be saved.
-//	data: `N` tensors to save.
+//	input: 4-D input to pool over.
+//	ksize: The size of the window for each dimension of the input tensor.
+//	strides: The stride of the sliding window for each dimension of the
+// input tensor.
+//	padding: The type of padding algorithm to use.
 //
-// Returns the created operation.
-func Save(scope *Scope, filename tf.Output, tensor_names tf.Output, data []tf.Output) (o *tf.Operation) {
+// Returns The max pooled output tensor.
+func MaxPoolV2(scope *Scope, input tf.Output, ksize tf.Output, strides tf.Output, padding string, optional ...MaxPoolV2Attr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Save",
+		Type: "MaxPoolV2",
 		Input: []tf.Input{
-			filename, tensor_names, tf.OutputList(data),
+			input, ksize, strides,
 		},
+		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Returns element-wise remainder of division. When `x < 0` xor `y < 0` is
+// MutableDenseHashTableV2Attr is an optional argument to MutableDenseHashTableV2.
+type MutableDenseHashTableV2Attr func(optionalAttr)
+
+// MutableDenseHashTableV2Container sets the optional container attribute to value.
 //
-// true, this follows Python semantics in that the result here is consistent
-// with a flooring divide. E.g. `floor(x / y) * y + mod(x, y) = x`.
+// value: If non-empty, this table is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func MutableDenseHashTableV2Container(value string) MutableDenseHashTableV2Attr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// MutableDenseHashTableV2SharedName sets the optional shared_name attribute to value.
 //
-// *NOTE*: `FloorMod` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func FloorMod(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
+// value: If non-empty, this table is shared under the given name across
+// multiple sessions.
+// If not specified, defaults to ""
+func MutableDenseHashTableV2SharedName(value string) MutableDenseHashTableV2Attr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "FloorMod",
-		Input: []tf.Input{
-			x, y,
-		},
+}
+
+// MutableDenseHashTableV2UseNodeNameSharing sets the optional use_node_name_sharing attribute to value.
+// If not specified, defaults to false
+func MutableDenseHashTableV2UseNodeNameSharing(value bool) MutableDenseHashTableV2Attr {
+	return func(m optionalAttr) {
+		m["use_node_name_sharing"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// SparseTensorDenseMatMulAttr is an optional argument to SparseTensorDenseMatMul.
-type SparseTensorDenseMatMulAttr func(optionalAttr)
+// MutableDenseHashTableV2ValueShape sets the optional value_shape attribute to value.
+//
+// value: The shape of each value.
+// If not specified, defaults to <>
+func MutableDenseHashTableV2ValueShape(value tf.Shape) MutableDenseHashTableV2Attr {
+	return func(m optionalAttr) {
+		m["value_shape"] = value
+	}
+}
 
-// SparseTensorDenseMatMulAdjointA sets the optional adjoint_a attribute to value.
+// MutableDenseHashTableV2InitialNumBuckets sets the optional initial_num_buckets attribute to value.
 //
-// value: Use the adjoint of A in the matrix multiply.  If A is complex, this
-// is transpose(conj(A)).  Otherwise it's transpose(A).
-// If not specified, defaults to false
-func SparseTensorDenseMatMulAdjointA(value bool) SparseTensorDenseMatMulAttr {
+// value: The initial number of hash table buckets. Must be a power
+// to 2.
+// If not specified, defaults to 131072
+func MutableDenseHashTableV2InitialNumBuckets(value int64) MutableDenseHashTableV2Attr {
 	return func(m optionalAttr) {
-		m["adjoint_a"] = value
+		m["initial_num_buckets"] = value
 	}
 }
 
-// SparseTensorDenseMatMulAdjointB sets the optional adjoint_b attribute to value.
+// MutableDenseHashTableV2MaxLoadFactor sets the optional max_load_factor attribute to value.
 //
-// value: Use the adjoint of B in the matrix multiply.  If B is complex, this
-// is transpose(conj(B)).  Otherwise it's transpose(B).
-// If not specified, defaults to false
-func SparseTensorDenseMatMulAdjointB(value bool) SparseTensorDenseMatMulAttr {
+// value: The maximum ratio between number of entries and number of
+// buckets before growing the table. Must be between 0 and 1.
+// If not specified, defaults to 0.8
+func MutableDenseHashTableV2MaxLoadFactor(value float32) MutableDenseHashTableV2Attr {
 	return func(m optionalAttr) {
-		m["adjoint_b"] = value
+		m["max_load_factor"] = value
 	}
 }
 
-// Multiply SparseTensor (of rank 2) "A" by dense matrix "B".
+// Creates an empty hash table that uses tensors as the backing store.
 //
-// No validity checking is performed on the indices of A.  However, the following
-// input format is recommended for optimal behavior:
+// It uses "open addressing" with quadratic reprobing to resolve
+// collisions.
 //
-// if adjoint_a == false:
-//   A should be sorted in lexicographically increasing order.  Use SparseReorder
-//   if you're not sure.
-// if adjoint_a == true:
-//   A should be sorted in order of increasing dimension 1 (i.e., "column major"
-//   order instead of "row major" order).
+// This op creates a mutable hash table, specifying the type of its keys and
+// values. Each value must be a scalar. Data can be inserted into the table using
+// the insert operations. It does not support the initialization operation.
 //
 // Arguments:
-//	a_indices: 2-D.  The `indices` of the `SparseTensor`, size `[nnz, 2]` Matrix.
-//	a_values: 1-D.  The `values` of the `SparseTensor`, size `[nnz]` Vector.
-//	a_shape: 1-D.  The `shape` of the `SparseTensor`, size `[2]` Vector.
-//	b: 2-D.  A dense Matrix.
-func SparseTensorDenseMatMul(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b tf.Output, optional ...SparseTensorDenseMatMulAttr) (product tf.Output) {
+//	empty_key: The key used to represent empty key buckets internally. Must not
+// be used in insert or lookup operations.
+//	value_dtype: Type of the table values.
+//
+// Returns Handle to a table.
+func MutableDenseHashTableV2(scope *Scope, empty_key tf.Output, value_dtype tf.DataType, optional ...MutableDenseHashTableV2Attr) (table_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"value_dtype": value_dtype}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseTensorDenseMatMul",
+		Type: "MutableDenseHashTableV2",
 		Input: []tf.Input{
-			a_indices, a_values, a_shape, b,
+			empty_key,
 		},
 		Attrs: attrs,
 	}
@@ -14590,500 +14207,303 @@ func SparseTensorDenseMatMul(scope *Scope, a_indices tf.Output, a_values tf.Outp
 	return op.Output(0)
 }
 
-// Deserialize and concatenate `SparseTensors` from a serialized minibatch.
+// StageSizeAttr is an optional argument to StageSize.
+type StageSizeAttr func(optionalAttr)
+
+// StageSizeCapacity sets the optional capacity attribute to value.
+// If not specified, defaults to 0
 //
-// The input `serialized_sparse` must be a string matrix of shape `[N x 3]` where
-// `N` is the minibatch size and the rows correspond to packed outputs of
-// `SerializeSparse`.  The ranks of the original `SparseTensor` objects
-// must all match.  When the final `SparseTensor` is created, it has rank one
-// higher than the ranks of the incoming `SparseTensor` objects
-// (they have been concatenated along a new row dimension).
-//
-// The output `SparseTensor` object's shape values for all dimensions but the
-// first are the max across the input `SparseTensor` objects' shape values
-// for the corresponding dimensions.  Its first shape value is `N`, the minibatch
-// size.
-//
-// The input `SparseTensor` objects' indices are assumed ordered in
-// standard lexicographic order.  If this is not the case, after this
-// step run `SparseReorder` to restore index ordering.
-//
-// For example, if the serialized input is a `[2 x 3]` matrix representing two
-// original `SparseTensor` objects:
-//
-//     index = [ 0]
-//             [10]
-//             [20]
-//     values = [1, 2, 3]
-//     shape = [50]
-//
-// and
-//
-//     index = [ 2]
-//             [10]
-//     values = [4, 5]
-//     shape = [30]
-//
-// then the final deserialized `SparseTensor` will be:
-//
-//     index = [0  0]
-//             [0 10]
-//             [0 20]
-//             [1  2]
-//             [1 10]
-//     values = [1, 2, 3, 4, 5]
-//     shape = [2 50]
-//
-// Arguments:
-//	serialized_sparse: 2-D, The `N` serialized `SparseTensor` objects.
-// Must have 3 columns.
-//	dtype: The `dtype` of the serialized `SparseTensor` objects.
-func DeserializeManySparse(scope *Scope, serialized_sparse tf.Output, dtype tf.DataType) (sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output) {
-	if scope.Err() != nil {
-		return
+// REQUIRES: value >= 0
+func StageSizeCapacity(value int64) StageSizeAttr {
+	return func(m optionalAttr) {
+		m["capacity"] = value
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
-	opspec := tf.OpSpec{
-		Type: "DeserializeManySparse",
-		Input: []tf.Input{
-			serialized_sparse,
-		},
-		Attrs: attrs,
+}
+
+// StageSizeMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func StageSizeMemoryLimit(value int64) StageSizeAttr {
+	return func(m optionalAttr) {
+		m["memory_limit"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// StringJoinAttr is an optional argument to StringJoin.
-type StringJoinAttr func(optionalAttr)
+// StageSizeContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func StageSizeContainer(value string) StageSizeAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
 
-// StringJoinSeparator sets the optional separator attribute to value.
-//
-// value: string, an optional join separator.
+// StageSizeSharedName sets the optional shared_name attribute to value.
 // If not specified, defaults to ""
-func StringJoinSeparator(value string) StringJoinAttr {
+func StageSizeSharedName(value string) StageSizeAttr {
 	return func(m optionalAttr) {
-		m["separator"] = value
+		m["shared_name"] = value
 	}
 }
 
-// Joins the strings in the given list of string tensors into one tensor;
-//
-// with the given separator (default is an empty separator).
-//
-// Arguments:
-//	inputs: A list of string tensors.  The tensors must all have the same shape,
-// or be scalars.  Scalars may be mixed in; these will be broadcast to the shape
-// of non-scalar inputs.
-func StringJoin(scope *Scope, inputs []tf.Output, optional ...StringJoinAttr) (output tf.Output) {
+// Op returns the number of elements in the underlying container.
+func StageSize(scope *Scope, dtypes []tf.DataType, optional ...StageSizeAttr) (size tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"dtypes": dtypes}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "StringJoin",
-		Input: []tf.Input{
-			tf.OutputList(inputs),
-		},
+		Type: "StageSize",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns immutable tensor from memory region.
-//
-// The current implementation memmaps the tensor from a file.
+// Produces the max pool of the input tensor for quantized types.
 //
 // Arguments:
-//	dtype: Type of the returned tensor.
-//	shape: Shape of the returned tensor.
-//	memory_region_name: Name of readonly memory region used by the tensor, see
-// NewReadOnlyMemoryRegionFromFile in tensorflow::Env.
-func ImmutableConst(scope *Scope, dtype tf.DataType, shape tf.Shape, memory_region_name string) (tensor tf.Output) {
+//	input: The 4D (batch x rows x cols x depth) Tensor to MaxReduce over.
+//	min_input: The float value that the lowest quantized input value represents.
+//	max_input: The float value that the highest quantized input value represents.
+//	ksize: The size of the window for each dimension of the input tensor.
+// The length must be 4 to match the number of dimensions of the input.
+//	strides: The stride of the sliding window for each dimension of the input
+// tensor. The length must be 4 to match the number of dimensions of the input.
+//	padding: The type of padding algorithm to use.
+//
+// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
+func QuantizedMaxPool(scope *Scope, input tf.Output, min_input tf.Output, max_input tf.Output, ksize []int64, strides []int64, padding string) (output tf.Output, min_output tf.Output, max_output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype, "shape": shape, "memory_region_name": memory_region_name}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "ImmutableConst",
-
+		Type: "QuantizedMaxPool",
+		Input: []tf.Input{
+			input, min_input, max_input,
+		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// Computes softplus: `log(exp(features) + 1)`.
+func Softplus(scope *Scope, features tf.Output) (activations tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Softplus",
+		Input: []tf.Input{
+			features,
+		},
+	}
+	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Inverse real-valued fast Fourier transform.
-//
-// Computes the inverse 1-dimensional discrete Fourier transform of a real-valued
-// signal over the inner-most dimension of `input`.
-//
-// The inner-most dimension of `input` is assumed to be the result of `RFFT`: the
-// `fft_length / 2 + 1` unique components of the DFT of a real-valued signal. If
-// `fft_length` is not provided, it is computed from the size of the inner-most
-// dimension of `input` (`fft_length = 2 * (inner - 1)`). If the FFT length used to
-// compute `input` is odd, it should be provided since it cannot be inferred
-// properly.
-//
-// Along the axis `IRFFT` is computed on, if `fft_length / 2 + 1` is smaller
-// than the corresponding dimension of `input`, the dimension is cropped. If it is
-// larger, the dimension is padded with zeros.
-//
-// Arguments:
-//	input: A complex64 tensor.
-//	fft_length: An int32 tensor of shape [1]. The FFT length.
-//
-// Returns A float32 tensor of the same rank as `input`. The inner-most
-//   dimension of `input` is replaced with the `fft_length` samples of its inverse
-//   1D Fourier transform.
+// Computes exponential of x - 1 element-wise.
 //
-// @compatibility(numpy)
-// Equivalent to np.fft.irfft
-// @end_compatibility
-func IRFFT(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
+// I.e., \\(y = (\exp x) - 1\\).
+func Expm1(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "IRFFT",
+		Type: "Expm1",
 		Input: []tf.Input{
-			input, fft_length,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Concatenates a list of `SparseTensor` along the specified dimension.
-//
-// Concatenation is with respect to the dense versions of these sparse tensors.
-// It is assumed that each input is a `SparseTensor` whose elements are ordered
-// along increasing dimension number.
-//
-// All inputs' shapes must match, except for the concat dimension.  The
-// `indices`, `values`, and `shapes` lists must have the same length.
-//
-// The output shape is identical to the inputs', except along the concat
-// dimension, where it is the sum of the inputs' sizes along that dimension.
-//
-// The output elements will be resorted to preserve the sort order along
-// increasing dimension number.
-//
-// This op runs in `O(M log M)` time, where `M` is the total number of non-empty
-// values across all inputs. This is due to the need for an internal sort in
-// order to concatenate efficiently across an arbitrary dimension.
-//
-// For example, if `concat_dim = 1` and the inputs are
-//
-//     sp_inputs[0]: shape = [2, 3]
-//     [0, 2]: "a"
-//     [1, 0]: "b"
-//     [1, 1]: "c"
-//
-//     sp_inputs[1]: shape = [2, 4]
-//     [0, 1]: "d"
-//     [0, 2]: "e"
-//
-// then the output will be
-//
-//     shape = [2, 7]
-//     [0, 2]: "a"
-//     [0, 4]: "d"
-//     [0, 5]: "e"
-//     [1, 0]: "b"
-//     [1, 1]: "c"
-//
-// Graphically this is equivalent to doing
+// Returns the number of records this Reader has produced.
 //
-//     [    a] concat [  d e  ] = [    a   d e  ]
-//     [b c  ]        [       ]   [b c          ]
+// This is the same as the number of ReaderRead executions that have
+// succeeded.
 //
 // Arguments:
-//	indices: 2-D.  Indices of each input `SparseTensor`.
-//	values: 1-D.  Non-empty values of each `SparseTensor`.
-//	shapes: 1-D.  Shapes of each `SparseTensor`.
-//	concat_dim: Dimension to concatenate along. Must be in range [-rank, rank),
-// where rank is the number of dimensions in each input `SparseTensor`.
-//
-// Returns 2-D.  Indices of the concatenated `SparseTensor`.1-D.  Non-empty values of the concatenated `SparseTensor`.1-D.  Shape of the concatenated `SparseTensor`.
-func SparseConcat(scope *Scope, indices []tf.Output, values []tf.Output, shapes []tf.Output, concat_dim int64) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
+//	reader_handle: Handle to a Reader.
+func ReaderNumRecordsProducedV2(scope *Scope, reader_handle tf.Output) (records_produced tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"concat_dim": concat_dim}
 	opspec := tf.OpSpec{
-		Type: "SparseConcat",
+		Type: "ReaderNumRecordsProducedV2",
 		Input: []tf.Input{
-			tf.OutputList(indices), tf.OutputList(values), tf.OutputList(shapes),
+			reader_handle,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Generates sparse cross from a list of sparse and dense tensors.
+// Computes the sum along segments of a tensor.
 //
-// The op takes two lists, one of 2D `SparseTensor` and one of 2D `Tensor`, each
-// representing features of one feature column. It outputs a 2D `SparseTensor` with
-// the batchwise crosses of these features.
+// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+// segments.
 //
-// For example, if the inputs are
+// Computes a tensor such that
+// \\(output_i = \sum_j data_j\\) where sum is over `j` such
+// that `segment_ids[j] == i`.
 //
-//     inputs[0]: SparseTensor with shape = [2, 2]
-//     [0, 0]: "a"
-//     [1, 0]: "b"
-//     [1, 1]: "c"
-//
-//     inputs[1]: SparseTensor with shape = [2, 1]
-//     [0, 0]: "d"
-//     [1, 0]: "e"
-//
-//     inputs[2]: Tensor [["f"], ["g"]]
-//
-// then the output will be
-//
-//     shape = [2, 2]
-//     [0, 0]: "a_X_d_X_f"
-//     [1, 0]: "b_X_e_X_g"
-//     [1, 1]: "c_X_e_X_g"
-//
-// if hashed_output=true then the output will be
+// If the sum is empty for a given segment ID `i`, `output[i] = 0`.
 //
-//     shape = [2, 2]
-//     [0, 0]: FingerprintCat64(
-//                 Fingerprint64("f"), FingerprintCat64(
-//                     Fingerprint64("d"), Fingerprint64("a")))
-//     [1, 0]: FingerprintCat64(
-//                 Fingerprint64("g"), FingerprintCat64(
-//                     Fingerprint64("e"), Fingerprint64("b")))
-//     [1, 1]: FingerprintCat64(
-//                 Fingerprint64("g"), FingerprintCat64(
-//                     Fingerprint64("e"), Fingerprint64("c")))
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentSum.png" alt>
+// </div>
 //
 // Arguments:
-//	indices: 2-D.  Indices of each input `SparseTensor`.
-//	values: 1-D.   values of each `SparseTensor`.
-//	shapes: 1-D.   Shapes of each `SparseTensor`.
-//	dense_inputs: 2-D.    Columns represented by dense `Tensor`.
-//	hashed_output: If true, returns the hash of the cross instead of the string.
-// This will allow us avoiding string manipulations.
-//	num_buckets: It is used if hashed_output is true.
-// output = hashed_value%num_buckets if num_buckets > 0 else hashed_value.
-//	hash_key: Specify the hash_key that will be used by the `FingerprintCat64`
-// function to combine the crosses fingerprints.
-//
 //
+//	segment_ids: A 1-D tensor whose rank is equal to the rank of `data`'s
+// first dimension.  Values should be sorted and can be repeated.
 //
-// Returns 2-D.  Indices of the concatenated `SparseTensor`.1-D.  Non-empty values of the concatenated or hashed
-// `SparseTensor`.1-D.  Shape of the concatenated `SparseTensor`.
-func SparseCross(scope *Scope, indices []tf.Output, values []tf.Output, shapes []tf.Output, dense_inputs []tf.Output, hashed_output bool, num_buckets int64, hash_key int64, out_type tf.DataType, internal_type tf.DataType) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SegmentSum(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"hashed_output": hashed_output, "num_buckets": num_buckets, "hash_key": hash_key, "out_type": out_type, "internal_type": internal_type}
 	opspec := tf.OpSpec{
-		Type: "SparseCross",
+		Type: "SegmentSum",
 		Input: []tf.Input{
-			tf.OutputList(indices), tf.OutputList(values), tf.OutputList(shapes), tf.OutputList(dense_inputs),
+			data, segment_ids,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Concatenates quantized tensors along one dimension.
+// Creates a dataset that emits the lines of one or more text files.
 //
 // Arguments:
-//	concat_dim: 0-D.  The dimension along which to concatenate.  Must be in the
-// range [0, rank(values)).
-//	values: The `N` Tensors to concatenate. Their ranks and types must match,
-// and their sizes must match in all dimensions except `concat_dim`.
-//	input_mins: The minimum scalar values for each of the input tensors.
-//	input_maxes: The maximum scalar values for each of the input tensors.
-//
-// Returns A `Tensor` with the concatenation of values stacked along the
-// `concat_dim` dimension.  This tensor's shape matches that of `values` except
-// in `concat_dim` where it has the sum of the sizes.The float value that the minimum quantized output value represents.The float value that the maximum quantized output value represents.
-func QuantizedConcat(scope *Scope, concat_dim tf.Output, values []tf.Output, input_mins []tf.Output, input_maxes []tf.Output) (output tf.Output, output_min tf.Output, output_max tf.Output) {
+//	filenames: A scalar or a vector containing the name(s) of the file(s) to be
+// read.
+//	compression_type: A scalar containing either (i) the empty string (no
+// compression), (ii) "ZLIB", or (iii) "GZIP".
+//	buffer_size: A scalar containing the number of bytes to buffer.
+func TextLineDataset(scope *Scope, filenames tf.Output, compression_type tf.Output, buffer_size tf.Output) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "QuantizedConcat",
+		Type: "TextLineDataset",
 		Input: []tf.Input{
-			concat_dim, tf.OutputList(values), tf.OutputList(input_mins), tf.OutputList(input_maxes),
+			filenames, compression_type, buffer_size,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Slice a `SparseTensor` based on the `start` and `size`.
-//
-// For example, if the input is
-//
-//     input_tensor = shape = [2, 7]
-//     [    a   d e  ]
-//     [b c          ]
-//
-// Graphically the output tensors are:
-//
-//     sparse_slice([0, 0], [2, 4]) = shape = [2, 4]
-//     [    a  ]
-//     [b c    ]
+// Computes gradients for SparseSegmentMean.
 //
-//     sparse_slice([0, 4], [2, 3]) = shape = [2, 3]
-//     [ d e  ]
-//     [      ]
+// Returns tensor "output" with same shape as grad, except for dimension 0 whose
+// value is output_dim0.
 //
 // Arguments:
-//	indices: 2-D tensor represents the indices of the sparse tensor.
-//	values: 1-D tensor represents the values of the sparse tensor.
-//	shape: 1-D. tensor represents the shape of the sparse tensor.
-//	start: 1-D. tensor represents the start of the slice.
-//	size: 1-D. tensor represents the size of the slice.
-// output indices: A list of 1-D tensors represents the indices of the output
-// sparse tensors.
-//
-// Returns A list of 1-D tensors represents the values of the output sparse
-// tensors.A list of 1-D tensors represents the shape of the output sparse
-// tensors.
-func SparseSlice(scope *Scope, indices tf.Output, values tf.Output, shape tf.Output, start tf.Output, size tf.Output) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
+//	grad: gradient propagated to the SparseSegmentMean op.
+//	indices: indices passed to the corresponding SparseSegmentMean op.
+//	segment_ids: segment_ids passed to the corresponding SparseSegmentMean op.
+//	output_dim0: dimension 0 of "data" passed to SparseSegmentMean op.
+func SparseSegmentMeanGrad(scope *Scope, grad tf.Output, indices tf.Output, segment_ids tf.Output, output_dim0 tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseSlice",
+		Type: "SparseSegmentMeanGrad",
 		Input: []tf.Input{
-			indices, values, shape, start, size,
+			grad, indices, segment_ids, output_dim0,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Adds up a `SparseTensor` and a dense `Tensor`, producing a dense `Tensor`.
-//
-// This Op does not require `a_indices` be sorted in standard lexicographic order.
+// Returns the truth value of (x >= y) element-wise.
 //
-// Arguments:
-//	a_indices: 2-D.  The `indices` of the `SparseTensor`, with shape `[nnz, ndims]`.
-//	a_values: 1-D.  The `values` of the `SparseTensor`, with shape `[nnz]`.
-//	a_shape: 1-D.  The `shape` of the `SparseTensor`, with shape `[ndims]`.
-//	b: `ndims`-D Tensor.  With shape `a_shape`.
-func SparseTensorDenseAdd(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b tf.Output) (output tf.Output) {
+// *NOTE*: `GreaterEqual` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func GreaterEqual(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseTensorDenseAdd",
+		Type: "GreaterEqual",
 		Input: []tf.Input{
-			a_indices, a_values, a_shape, b,
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns the set of files matching one or more glob patterns.
-//
-// Note that this routine only supports wildcard characters in the
-// basename portion of the pattern, not in the directory portion.
-//
-// Arguments:
-//	pattern: Shell wildcard pattern(s). Scalar or vector of type string.
+// Conv3DAttr is an optional argument to Conv3D.
+type Conv3DAttr func(optionalAttr)
+
+// Conv3DDataFormat sets the optional data_format attribute to value.
 //
-// Returns A vector of matching filenames.
-func MatchingFiles(scope *Scope, pattern tf.Output) (filenames tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "MatchingFiles",
-		Input: []tf.Input{
-			pattern,
-		},
+// value: The data format of the input and output data. With the
+// default format "NDHWC", the data is stored in the order of:
+//     [batch, in_depth, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCDHW", the data storage order is:
+//     [batch, in_channels, in_depth, in_height, in_width].
+// If not specified, defaults to "NDHWC"
+func Conv3DDataFormat(value string) Conv3DAttr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// MatrixSolveLsAttr is an optional argument to MatrixSolveLs.
-type MatrixSolveLsAttr func(optionalAttr)
-
-// MatrixSolveLsFast sets the optional fast attribute to value.
-// If not specified, defaults to true
-func MatrixSolveLsFast(value bool) MatrixSolveLsAttr {
+// Conv3DDilations sets the optional dilations attribute to value.
+//
+// value: 1-D tensor of length 5.  The dilation factor for each dimension of
+// `input`. If set to k > 1, there will be k-1 skipped cells between each
+// filter element on that dimension. The dimension order is determined by the
+// value of `data_format`, see above for details. Dilations in the batch and
+// depth dimensions must be 1.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 i:1 >
+func Conv3DDilations(value []int64) Conv3DAttr {
 	return func(m optionalAttr) {
-		m["fast"] = value
+		m["dilations"] = value
 	}
 }
 
-// Solves one or more linear least-squares problems.
-//
-// `matrix` is a tensor of shape `[..., M, N]` whose inner-most 2 dimensions
-// form real or complex matrices of size `[M, N]`. `Rhs` is a tensor of the same
-// type as `matrix` and shape `[..., M, K]`.
-// The output is a tensor shape `[..., N, K]` where each output matrix solves
-// each of the equations
-// `matrix[..., :, :]` * `output[..., :, :]` = `rhs[..., :, :]`
-// in the least squares sense.
-//
-// We use the following notation for (complex) matrix and right-hand sides
-// in the batch:
-//
-// `matrix`=\\(A \in \mathbb{C}^{m \times n}\\),
-// `rhs`=\\(B  \in \mathbb{C}^{m \times k}\\),
-// `output`=\\(X  \in \mathbb{C}^{n \times k}\\),
-// `l2_regularizer`=\\(\lambda \in \mathbb{R}\\).
+// Computes a 3-D convolution given 5-D `input` and `filter` tensors.
 //
-// If `fast` is `True`, then the solution is computed by solving the normal
-// equations using Cholesky decomposition. Specifically, if \\(m \ge n\\) then
-// \\(X = (A^H A + \lambda I)^{-1} A^H B\\), which solves the least-squares
-// problem \\(X = \mathrm{argmin}_{Z \in \Re^{n \times k} } ||A Z - B||_F^2 +
-// \lambda ||Z||_F^2\\). If \\(m \lt n\\) then `output` is computed as
-// \\(X = A^H (A A^H + \lambda I)^{-1} B\\), which (for \\(\lambda = 0\\)) is the
-// minimum-norm solution to the under-determined linear system, i.e.
-// \\(X = \mathrm{argmin}_{Z \in \mathbb{C}^{n \times k} } ||Z||_F^2 \\),
-// subject to \\(A Z = B\\). Notice that the fast path is only numerically stable
-// when \\(A\\) is numerically full rank and has a condition number
-// \\(\mathrm{cond}(A) \lt \frac{1}{\sqrt{\epsilon_{mach} } }\\) or\\(\lambda\\) is
-// sufficiently large.
+// In signal processing, cross-correlation is a measure of similarity of
+// two waveforms as a function of a time-lag applied to one of them. This
+// is also known as a sliding dot product or sliding inner-product.
 //
-// If `fast` is `False` an algorithm based on the numerically robust complete
-// orthogonal decomposition is used. This computes the minimum-norm
-// least-squares solution, even when \\(A\\) is rank deficient. This path is
-// typically 6-7 times slower than the fast path. If `fast` is `False` then
-// `l2_regularizer` is ignored.
+// Our Conv3D implements a form of cross-correlation.
 //
 // Arguments:
-//	matrix: Shape is `[..., M, N]`.
-//	rhs: Shape is `[..., M, K]`.
-//	l2_regularizer: Scalar tensor.
-//
-// @compatibility(numpy)
-// Equivalent to np.linalg.lstsq
-// @end_compatibility
-//
-// Returns Shape is `[..., N, K]`.
-func MatrixSolveLs(scope *Scope, matrix tf.Output, rhs tf.Output, l2_regularizer tf.Output, optional ...MatrixSolveLsAttr) (output tf.Output) {
+//	input: Shape `[batch, in_depth, in_height, in_width, in_channels]`.
+//	filter: Shape `[filter_depth, filter_height, filter_width, in_channels,
+// out_channels]`. `in_channels` must match between `input` and `filter`.
+//	strides: 1-D tensor of length 5. The stride of the sliding window for each
+// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+//	padding: The type of padding algorithm to use.
+func Conv3D(scope *Scope, input tf.Output, filter tf.Output, strides []int64, padding string, optional ...Conv3DAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MatrixSolveLs",
+		Type: "Conv3D",
 		Input: []tf.Input{
-			matrix, rhs, l2_regularizer,
+			input, filter,
 		},
 		Attrs: attrs,
 	}
@@ -15091,93 +14511,197 @@ func MatrixSolveLs(scope *Scope, matrix tf.Output, rhs tf.Output, l2_regularizer
 	return op.Output(0)
 }
 
-// Elementwise computes the bitwise OR of `x` and `y`.
+// Adds up a SparseTensor and a dense Tensor, using these special rules:
 //
-// The result will have those bits set, that are set in `x`, `y` or both. The
-// computation is performed on the underlying representations of `x` and `y`.
-func BitwiseOr(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// (1) Broadcasts the dense side to have the same shape as the sparse side, if
+//     eligible;
+// (2) Then, only the dense values pointed to by the indices of the SparseTensor
+//     participate in the cwise addition.
+//
+// By these rules, the result is a logical SparseTensor with exactly the same
+// indices and shape, but possibly with different non-zero values.  The output of
+// this Op is the resultant non-zero values.
+//
+// Arguments:
+//	sp_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, possibly not in canonical ordering.
+//	sp_values: 1-D.  `N` non-empty values corresponding to `sp_indices`.
+//	sp_shape: 1-D.  Shape of the input SparseTensor.
+//	dense: `R`-D.  The dense Tensor operand.
+//
+// Returns 1-D.  The `N` values that are operated on.
+func SparseDenseCwiseAdd(scope *Scope, sp_indices tf.Output, sp_values tf.Output, sp_shape tf.Output, dense tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "BitwiseOr",
+		Type: "SparseDenseCwiseAdd",
 		Input: []tf.Input{
-			x, y,
+			sp_indices, sp_values, sp_shape, dense,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// SparseToSparseSetOperationAttr is an optional argument to SparseToSparseSetOperation.
-type SparseToSparseSetOperationAttr func(optionalAttr)
+// Read an element from the TensorArray into output `value`.
+//
+// Arguments:
+//	handle: The handle to a TensorArray.
+//
+//	flow_in: A float scalar that enforces proper chaining of operations.
+//	dtype: The type of the elem that is returned.
+//
+// Returns The tensor that is read from the TensorArray.
+func TensorArrayReadV3(scope *Scope, handle tf.Output, index tf.Output, flow_in tf.Output, dtype tf.DataType) (value tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dtype": dtype}
+	opspec := tf.OpSpec{
+		Type: "TensorArrayReadV3",
+		Input: []tf.Input{
+			handle, index, flow_in,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
 
-// SparseToSparseSetOperationValidateIndices sets the optional validate_indices attribute to value.
-// If not specified, defaults to true
-func SparseToSparseSetOperationValidateIndices(value bool) SparseToSparseSetOperationAttr {
+// QuantizeV2Attr is an optional argument to QuantizeV2.
+type QuantizeV2Attr func(optionalAttr)
+
+// QuantizeV2Mode sets the optional mode attribute to value.
+// If not specified, defaults to "MIN_COMBINED"
+func QuantizeV2Mode(value string) QuantizeV2Attr {
 	return func(m optionalAttr) {
-		m["validate_indices"] = value
+		m["mode"] = value
 	}
 }
 
-// Applies set operation along last dimension of 2 `SparseTensor` inputs.
+// QuantizeV2RoundMode sets the optional round_mode attribute to value.
+// If not specified, defaults to "HALF_AWAY_FROM_ZERO"
+func QuantizeV2RoundMode(value string) QuantizeV2Attr {
+	return func(m optionalAttr) {
+		m["round_mode"] = value
+	}
+}
+
+// Quantize the 'input' tensor of type float to 'output' tensor of type 'T'.
 //
-// See SetOperationOp::SetOperationFromContext for values of `set_operation`.
+// [min_range, max_range] are scalar floats that specify the range for
+// the 'input' data. The 'mode' attribute controls exactly which calculations are
+// used to convert the float values to their quantized equivalents.  The
+// 'round_mode' attribute controls which rounding tie-breaking algorithm is used
+// when rounding float values to their quantized equivalents.
 //
-// If `validate_indices` is `True`, `SparseToSparseSetOperation` validates the
-// order and range of `set1` and `set2` indices.
+// In 'MIN_COMBINED' mode, each value of the tensor will undergo the following:
 //
-// Input `set1` is a `SparseTensor` represented by `set1_indices`, `set1_values`,
-// and `set1_shape`. For `set1` ranked `n`, 1st `n-1` dimensions must be the same
-// as `set2`. Dimension `n` contains values in a set, duplicates are allowed but
-// ignored.
+// ```
+// out[i] = (in[i] - min_range) * range(T) / (max_range - min_range)
+// if T == qint8, out[i] -= (range(T) + 1) / 2.0
+// ```
+// here `range(T) = numeric_limits<T>::max() - numeric_limits<T>::min()`
 //
-// Input `set2` is a `SparseTensor` represented by `set2_indices`, `set2_values`,
-// and `set2_shape`. For `set2` ranked `n`, 1st `n-1` dimensions must be the same
-// as `set1`. Dimension `n` contains values in a set, duplicates are allowed but
-// ignored.
+// *MIN_COMBINED Mode Example*
 //
-// If `validate_indices` is `True`, this op validates the order and range of `set1`
-// and `set2` indices.
+// Assume the input is type float and has a possible range of [0.0, 6.0] and the
+// output type is quint8 ([0, 255]). The min_range and max_range values should be
+// specified as 0.0 and 6.0. Quantizing from float to quint8 will multiply each
+// value of the input by 255/6 and cast to quint8.
 //
-// Output `result` is a `SparseTensor` represented by `result_indices`,
-// `result_values`, and `result_shape`. For `set1` and `set2` ranked `n`, this
-// has rank `n` and the same 1st `n-1` dimensions as `set1` and `set2`. The `nth`
-// dimension contains the result of `set_operation` applied to the corresponding
-// `[0...n-1]` dimension of `set`.
+// If the output type was qint8 ([-128, 127]), the operation will additionally
+// subtract each value by 128 prior to casting, so that the range of values aligns
+// with the range of qint8.
+//
+// If the mode is 'MIN_FIRST', then this approach is used:
+//
+// ```
+// num_discrete_values = 1 << (# of bits in T)
+// range_adjust = num_discrete_values / (num_discrete_values - 1)
+// range = (range_max - range_min) * range_adjust
+// range_scale = num_discrete_values / range
+// quantized = round(input * range_scale) - round(range_min * range_scale) +
+//   numeric_limits<T>::min()
+// quantized = max(quantized, numeric_limits<T>::min())
+// quantized = min(quantized, numeric_limits<T>::max())
+// ```
+//
+// The biggest difference between this and MIN_COMBINED is that the minimum range
+// is rounded first, before it's subtracted from the rounded value. With
+// MIN_COMBINED, a small bias is introduced where repeated iterations of quantizing
+// and dequantizing will introduce a larger and larger error.
+//
+// *SCALED mode Example*
+//
+// `SCALED` mode matches the quantization approach used in
+// `QuantizeAndDequantize{V2|V3}`.
+//
+// If the mode is `SCALED`, we do not use the full range of the output type,
+// choosing to elide the lowest possible value for symmetry (e.g., output range is
+// -127 to 127, not -128 to 127 for signed 8 bit quantization), so that 0.0 maps to
+// 0.
+//
+// We first find the range of values in our tensor. The
+// range we use is always centered on 0, so we find m such that
+// ```c++
+//   m = max(abs(input_min), abs(input_max))
+// ```
+//
+// Our input tensor range is then `[-m, m]`.
+//
+// Next, we choose our fixed-point quantization buckets, `[min_fixed, max_fixed]`.
+// If T is signed, this is
+// ```
+//   num_bits = sizeof(T) * 8
+//   [min_fixed, max_fixed] =
+//       [-(1 << (num_bits - 1) - 1), (1 << (num_bits - 1)) - 1]
+// ```
+//
+// Otherwise, if T is unsigned, the fixed-point range is
+// ```
+//   [min_fixed, max_fixed] = [0, (1 << num_bits) - 1]
+// ```
+//
+// From this we compute our scaling factor, s:
+// ```c++
+//   s = (max_fixed - min_fixed) / (2 * m)
+// ```
+//
+// Now we can quantize the elements of our tensor:
+// ```c++
+// result = round(input * s)
+// ```
+//
+// One thing to watch out for is that the operator may choose to adjust the
+// requested minimum and maximum values slightly during the quantization process,
+// so you should always use the output ports as the range for further calculations.
+// For example, if the requested minimum and maximum values are close to equal,
+// they will be separated by a small epsilon value to prevent ill-formed quantized
+// buffers from being created. Otherwise, you can end up with buffers where all the
+// quantized values map to the same float value, which causes problems for
+// operations that have to perform further calculations on them.
 //
 // Arguments:
-//	set1_indices: 2D `Tensor`, indices of a `SparseTensor`. Must be in row-major
-// order.
-//	set1_values: 1D `Tensor`, values of a `SparseTensor`. Must be in row-major
-// order.
-//	set1_shape: 1D `Tensor`, shape of a `SparseTensor`. `set1_shape[0...n-1]` must
-// be the same as `set2_shape[0...n-1]`, `set1_shape[n]` is the
-// max set size across `0...n-1` dimensions.
-//	set2_indices: 2D `Tensor`, indices of a `SparseTensor`. Must be in row-major
-// order.
-//	set2_values: 1D `Tensor`, values of a `SparseTensor`. Must be in row-major
-// order.
-//	set2_shape: 1D `Tensor`, shape of a `SparseTensor`. `set2_shape[0...n-1]` must
-// be the same as `set1_shape[0...n-1]`, `set2_shape[n]` is the
-// max set size across `0...n-1` dimensions.
+//
+//	min_range: The minimum scalar value possibly produced for the input.
+//	max_range: The maximum scalar value possibly produced for the input.
 //
 //
-// Returns 2D indices of a `SparseTensor`.1D values of a `SparseTensor`.1D `Tensor` shape of a `SparseTensor`. `result_shape[0...n-1]` is
-// the same as the 1st `n-1` dimensions of `set1` and `set2`, `result_shape[n]`
-// is the max result set size across all `0...n-1` dimensions.
-func SparseToSparseSetOperation(scope *Scope, set1_indices tf.Output, set1_values tf.Output, set1_shape tf.Output, set2_indices tf.Output, set2_values tf.Output, set2_shape tf.Output, set_operation string, optional ...SparseToSparseSetOperationAttr) (result_indices tf.Output, result_values tf.Output, result_shape tf.Output) {
+// Returns The quantized data produced from the float input.The actual minimum scalar value used for the output.The actual maximum scalar value used for the output.
+func QuantizeV2(scope *Scope, input tf.Output, min_range tf.Output, max_range tf.Output, T tf.DataType, optional ...QuantizeV2Attr) (output tf.Output, output_min tf.Output, output_max tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"set_operation": set_operation}
+	attrs := map[string]interface{}{"T": T}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseToSparseSetOperation",
+		Type: "QuantizeV2",
 		Input: []tf.Input{
-			set1_indices, set1_values, set1_shape, set2_indices, set2_values, set2_shape,
+			input, min_range, max_range,
 		},
 		Attrs: attrs,
 	}
@@ -15185,55 +14709,45 @@ func SparseToSparseSetOperation(scope *Scope, set1_indices tf.Output, set1_value
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Computes numerical negative value element-wise.
+// Returns the truth value of (x < y) element-wise.
 //
-// I.e., \\(y = -x\\).
-func Neg(scope *Scope, x tf.Output) (y tf.Output) {
+// *NOTE*: `Less` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Less(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Neg",
+		Type: "Less",
 		Input: []tf.Input{
-			x,
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// FakeQuantWithMinMaxVarsAttr is an optional argument to FakeQuantWithMinMaxVars.
-type FakeQuantWithMinMaxVarsAttr func(optionalAttr)
-
-// FakeQuantWithMinMaxVarsNumBits sets the optional num_bits attribute to value.
-// If not specified, defaults to 8
-func FakeQuantWithMinMaxVarsNumBits(value int64) FakeQuantWithMinMaxVarsAttr {
-	return func(m optionalAttr) {
-		m["num_bits"] = value
-	}
-}
+// QuantizedReluXAttr is an optional argument to QuantizedReluX.
+type QuantizedReluXAttr func(optionalAttr)
 
-// FakeQuantWithMinMaxVarsNarrowRange sets the optional narrow_range attribute to value.
-// If not specified, defaults to false
-func FakeQuantWithMinMaxVarsNarrowRange(value bool) FakeQuantWithMinMaxVarsAttr {
+// QuantizedReluXOutType sets the optional out_type attribute to value.
+// If not specified, defaults to DT_QUINT8
+func QuantizedReluXOutType(value tf.DataType) QuantizedReluXAttr {
 	return func(m optionalAttr) {
-		m["narrow_range"] = value
+		m["out_type"] = value
 	}
 }
 
-// Fake-quantize the 'inputs' tensor of type float via global float scalars `min`
+// Computes Quantized Rectified Linear X: `min(max(features, 0), max_value)`
 //
-// and `max` to 'outputs' tensor of same shape as `inputs`.
+// Arguments:
 //
-// `[min; max]` define the clamping range for the `inputs` data.
-// `inputs` values are quantized into the quantization range (`[0; 2^num_bits - 1]`
-// when `narrow_range` is false and `[1; 2^num_bits - 1]` when it is true) and
-// then de-quantized and output as floats in `[min; max]` interval.
-// `num_bits` is the bitwidth of the quantization; between 2 and 8, inclusive.
 //
-// This operation has a gradient and thus allows for training `min` and `max`
-// values.
-func FakeQuantWithMinMaxVars(scope *Scope, inputs tf.Output, min tf.Output, max tf.Output, optional ...FakeQuantWithMinMaxVarsAttr) (outputs tf.Output) {
+//	min_features: The float value that the lowest quantized value represents.
+//	max_features: The float value that the highest quantized value represents.
+//
+// Returns Has the same output shape as "features".The float value that the lowest quantized value represents.The float value that the highest quantized value represents.
+func QuantizedReluX(scope *Scope, features tf.Output, max_value tf.Output, min_features tf.Output, max_features tf.Output, optional ...QuantizedReluXAttr) (activations tf.Output, min_activations tf.Output, max_activations tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -15242,164 +14756,72 @@ func FakeQuantWithMinMaxVars(scope *Scope, inputs tf.Output, min tf.Output, max
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "FakeQuantWithMinMaxVars",
+		Type: "QuantizedReluX",
 		Input: []tf.Input{
-			inputs, min, max,
+			features, max_value, min_features, max_features,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Returns the element-wise min of two SparseTensors.
-//
-// Assumes the two SparseTensors have the same shape, i.e., no broadcasting.
-//
-// Arguments:
-//	a_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, in the canonical lexicographic ordering.
-//	a_values: 1-D.  `N` non-empty values corresponding to `a_indices`.
-//	a_shape: 1-D.  Shape of the input SparseTensor.
-//	b_indices: counterpart to `a_indices` for the other operand.
-//	b_values: counterpart to `a_values` for the other operand; must be of the same dtype.
-//	b_shape: counterpart to `a_shape` for the other operand; the two shapes must be equal.
-//
-// Returns 2-D.  The indices of the output SparseTensor.1-D.  The values of the output SparseTensor.
-func SparseSparseMinimum(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b_indices tf.Output, b_values tf.Output, b_shape tf.Output) (output_indices tf.Output, output_values tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SparseSparseMinimum",
-		Input: []tf.Input{
-			a_indices, a_values, a_shape, b_indices, b_values, b_shape,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
-}
-
-// Constructs a tensor by tiling a given tensor.
-//
-// This operation creates a new tensor by replicating `input` `multiples` times.
-// The output tensor's i'th dimension has `input.dims(i) * multiples[i]` elements,
-// and the values of `input` are replicated `multiples[i]` times along the 'i'th
-// dimension. For example, tiling `[a b c d]` by `[2]` produces
-// `[a b c d a b c d]`.
-//
-// Arguments:
-//	input: 1-D or higher.
-//	multiples: 1-D. Length must be the same as the number of dimensions in `input`
-func Tile(scope *Scope, input tf.Output, multiples tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Tile",
-		Input: []tf.Input{
-			input, multiples,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// TakeManySparseFromTensorsMapAttr is an optional argument to TakeManySparseFromTensorsMap.
-type TakeManySparseFromTensorsMapAttr func(optionalAttr)
+// QuantizedConv2DAttr is an optional argument to QuantizedConv2D.
+type QuantizedConv2DAttr func(optionalAttr)
 
-// TakeManySparseFromTensorsMapContainer sets the optional container attribute to value.
-//
-// value: The container name for the `SparseTensorsMap` read by this op.
-// If not specified, defaults to ""
-func TakeManySparseFromTensorsMapContainer(value string) TakeManySparseFromTensorsMapAttr {
+// QuantizedConv2DOutType sets the optional out_type attribute to value.
+// If not specified, defaults to DT_QINT32
+func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["out_type"] = value
 	}
 }
 
-// TakeManySparseFromTensorsMapSharedName sets the optional shared_name attribute to value.
+// QuantizedConv2DDilations sets the optional dilations attribute to value.
 //
-// value: The shared name for the `SparseTensorsMap` read by this op.
-// It should not be blank; rather the `shared_name` or unique Operation name
-// of the Op that created the original `SparseTensorsMap` should be used.
-// If not specified, defaults to ""
-func TakeManySparseFromTensorsMapSharedName(value string) TakeManySparseFromTensorsMapAttr {
+// value: 1-D tensor of length 4.  The dilation factor for each dimension of
+// `input`. If set to k > 1, there will be k-1 skipped cells between each
+// filter element on that dimension. The dimension order is determined by the
+// value of `data_format`, see above for details. Dilations in the batch and
+// depth dimensions must be 1.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 >
+func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["dilations"] = value
 	}
 }
 
-// Read `SparseTensors` from a `SparseTensorsMap` and concatenate them.
-//
-// The input `sparse_handles` must be an `int64` matrix of shape `[N, 1]` where
-// `N` is the minibatch size and the rows correspond to the output handles of
-// `AddSparseToTensorsMap` or `AddManySparseToTensorsMap`.  The ranks of the
-// original `SparseTensor` objects that went into the given input ops must all
-// match.  When the final `SparseTensor` is created, it has rank one
-// higher than the ranks of the incoming `SparseTensor` objects
-// (they have been concatenated along a new row dimension on the left).
-//
-// The output `SparseTensor` object's shape values for all dimensions but the
-// first are the max across the input `SparseTensor` objects' shape values
-// for the corresponding dimensions.  Its first shape value is `N`, the minibatch
-// size.
-//
-// The input `SparseTensor` objects' indices are assumed ordered in
-// standard lexicographic order.  If this is not the case, after this
-// step run `SparseReorder` to restore index ordering.
-//
-// For example, if the handles represent an input, which is a `[2, 3]` matrix
-// representing two original `SparseTensor` objects:
-//
-// ```
-//     index = [ 0]
-//             [10]
-//             [20]
-//     values = [1, 2, 3]
-//     shape = [50]
-// ```
-//
-// and
-//
-// ```
-//     index = [ 2]
-//             [10]
-//     values = [4, 5]
-//     shape = [30]
-// ```
-//
-// then the final `SparseTensor` will be:
+// Computes a 2D convolution given quantized 4D input and filter tensors.
 //
-// ```
-//     index = [0  0]
-//             [0 10]
-//             [0 20]
-//             [1  2]
-//             [1 10]
-//     values = [1, 2, 3, 4, 5]
-//     shape = [2 50]
-// ```
+// The inputs are quantized tensors where the lowest value represents the real
+// number of the associated minimum, and the highest represents the maximum.
+// This means that you can only interpret the quantized output in the same way, by
+// taking the returned minimum and maximum values into account.
 //
 // Arguments:
-//	sparse_handles: 1-D, The `N` serialized `SparseTensor` objects.
-// Shape: `[N]`.
-//	dtype: The `dtype` of the `SparseTensor` objects stored in the
-// `SparseTensorsMap`.
 //
-// Returns 2-D.  The `indices` of the minibatch `SparseTensor`.1-D.  The `values` of the minibatch `SparseTensor`.1-D.  The `shape` of the minibatch `SparseTensor`.
-func TakeManySparseFromTensorsMap(scope *Scope, sparse_handles tf.Output, dtype tf.DataType, optional ...TakeManySparseFromTensorsMapAttr) (sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output) {
+//	filter: filter's input_depth dimension must match input's depth dimensions.
+//	min_input: The float value that the lowest quantized input value represents.
+//	max_input: The float value that the highest quantized input value represents.
+//	min_filter: The float value that the lowest quantized filter value represents.
+//	max_filter: The float value that the highest quantized filter value represents.
+//	strides: The stride of the sliding window for each dimension of the input
+// tensor.
+//	padding: The type of padding algorithm to use.
+//
+// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
+func QuantizedConv2D(scope *Scope, input tf.Output, filter tf.Output, min_input tf.Output, max_input tf.Output, min_filter tf.Output, max_filter tf.Output, strides []int64, padding string, optional ...QuantizedConv2DAttr) (output tf.Output, min_output tf.Output, max_output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "TakeManySparseFromTensorsMap",
+		Type: "QuantizedConv2D",
 		Input: []tf.Input{
-			sparse_handles,
+			input, filter, min_input, max_input, min_filter, max_filter,
 		},
 		Attrs: attrs,
 	}
@@ -15407,45 +14829,44 @@ func TakeManySparseFromTensorsMap(scope *Scope, sparse_handles tf.Output, dtype
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// MaxPoolAttr is an optional argument to MaxPool.
-type MaxPoolAttr func(optionalAttr)
+// ResourceGatherAttr is an optional argument to ResourceGather.
+type ResourceGatherAttr func(optionalAttr)
 
-// MaxPoolDataFormat sets the optional data_format attribute to value.
-//
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// If not specified, defaults to "NHWC"
-func MaxPoolDataFormat(value string) MaxPoolAttr {
+// ResourceGatherValidateIndices sets the optional validate_indices attribute to value.
+// If not specified, defaults to true
+func ResourceGatherValidateIndices(value bool) ResourceGatherAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["validate_indices"] = value
 	}
 }
 
-// Performs max pooling on the input.
+// Gather slices from the variable pointed to by `resource` according to `indices`.
 //
-// Arguments:
-//	input: 4-D input to pool over.
-//	ksize: The size of the window for each dimension of the input tensor.
-//	strides: The stride of the sliding window for each dimension of the
-// input tensor.
-//	padding: The type of padding algorithm to use.
+// `indices` must be an integer tensor of any dimension (usually 0-D or 1-D).
+// Produces an output tensor with shape `indices.shape + params.shape[1:]` where:
 //
-// Returns The max pooled output tensor.
-func MaxPool(scope *Scope, input tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPoolAttr) (output tf.Output) {
+// ```python
+//     # Scalar indices
+//     output[:, ..., :] = params[indices, :, ... :]
+//
+//     # Vector indices
+//     output[i, :, ..., :] = params[indices[i], :, ... :]
+//
+//     # Higher rank indices
+//     output[i, ..., j, :, ... :] = params[indices[i, ..., j], :, ..., :]
+// ```
+func ResourceGather(scope *Scope, resource tf.Output, indices tf.Output, dtype tf.DataType, optional ...ResourceGatherAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	attrs := map[string]interface{}{"dtype": dtype}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MaxPool",
+		Type: "ResourceGather",
 		Input: []tf.Input{
-			input,
+			resource, indices,
 		},
 		Attrs: attrs,
 	}
@@ -15453,357 +14874,320 @@ func MaxPool(scope *Scope, input tf.Output, ksize []int64, strides []int64, padd
 	return op.Output(0)
 }
 
-// Says whether the targets are in the top `K` predictions.
-//
-// This outputs a `batch_size` bool array, an entry `out[i]` is `true` if the
-// prediction for the target class is among the top `k` predictions among
-// all predictions for example `i`. Note that the behavior of `InTopK` differs
-// from the `TopK` op in its handling of ties; if multiple classes have the
-// same prediction value and straddle the top-`k` boundary, all of those
-// classes are considered to be in the top `k`.
-//
-// More formally, let
-//
-//   \\(predictions_i\\) be the predictions for all classes for example `i`,
-//   \\(targets_i\\) be the target class for example `i`,
-//   \\(out_i\\) be the output for example `i`,
+// Delete the TensorArray from its resource container.
 //
-// $$out_i = predictions_{i, targets_i} \in TopKIncludingTies(predictions_i)$$
+// This enables the user to close and release the resource in the middle
+// of a step/run.
 //
 // Arguments:
-//	predictions: A `batch_size` x `classes` tensor.
-//	targets: A `batch_size` vector of class ids.
-//	k: Number of top elements to look at for computing precision.
+//	handle: The handle to a TensorArray (output of TensorArray or TensorArrayGrad).
 //
-// Returns Computed precision at `k` as a `bool Tensor`.
-func InTopKV2(scope *Scope, predictions tf.Output, targets tf.Output, k tf.Output) (precision tf.Output) {
+// Returns the created operation.
+func TensorArrayCloseV3(scope *Scope, handle tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "InTopKV2",
+		Type: "TensorArrayCloseV3",
 		Input: []tf.Input{
-			predictions, targets, k,
+			handle,
 		},
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Assigns a new value to a variable.
-//
-// Any ReadVariableOp with a control dependency on this op is guaranteed to return
-// this value or a subsequent newer value of the variable.
+// Adds two `SparseTensor` objects to produce another `SparseTensor`.
 //
-// Arguments:
-//	resource: handle to the resource in which to store the variable.
-//	value: the value to set the new tensor to use.
+// The input `SparseTensor` objects' indices are assumed ordered in standard
+// lexicographic order.  If this is not the case, before this step run
+// `SparseReorder` to restore index ordering.
 //
-// Returns the created operation.
-func AssignVariableOp(scope *Scope, resource tf.Output, value tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "AssignVariableOp",
-		Input: []tf.Input{
-			resource, value,
-		},
-	}
-	return scope.AddOperation(opspec)
-}
-
-// Returns a tensor of ones with the same shape and type as x.
+// By default, if two values sum to zero at some index, the output `SparseTensor`
+// would still include that particular location in its index, storing a zero in the
+// corresponding value slot.  To override this, callers can specify `thresh`,
+// indicating that if the sum has a magnitude strictly smaller than `thresh`, its
+// corresponding value and index would then not be included.  In particular,
+// `thresh == 0` (default) means everything is kept and actual thresholding happens
+// only for a positive value.
 //
-// Arguments:
-//	x: a tensor of type T.
+// In the following shapes, `nnz` is the count after taking `thresh` into account.
 //
-// Returns a tensor of the same shape and type as x but filled with ones.
-func OnesLike(scope *Scope, x tf.Output) (y tf.Output) {
+// Arguments:
+//	a_indices: 2-D.  The `indices` of the first `SparseTensor`, size `[nnz, ndims]` Matrix.
+//	a_values: 1-D.  The `values` of the first `SparseTensor`, size `[nnz]` Vector.
+//	a_shape: 1-D.  The `shape` of the first `SparseTensor`, size `[ndims]` Vector.
+//	b_indices: 2-D.  The `indices` of the second `SparseTensor`, size `[nnz, ndims]` Matrix.
+//	b_values: 1-D.  The `values` of the second `SparseTensor`, size `[nnz]` Vector.
+//	b_shape: 1-D.  The `shape` of the second `SparseTensor`, size `[ndims]` Vector.
+//	thresh: 0-D.  The magnitude threshold that determines if an output value/index
+// pair takes space.
+func SparseAdd(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b_indices tf.Output, b_values tf.Output, b_shape tf.Output, thresh tf.Output) (sum_indices tf.Output, sum_values tf.Output, sum_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "OnesLike",
+		Type: "SparseAdd",
 		Input: []tf.Input{
-			x,
+			a_indices, a_values, a_shape, b_indices, b_values, b_shape, thresh,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// The gradient of SparseFillEmptyRows.
-//
-// Takes vectors reverse_index_map, shaped `[N]`, and grad_values,
-// shaped `[N_full]`, where `N_full >= N` and copies data into either
-// `d_values` or `d_default_value`.  Here `d_values` is shaped `[N]` and
-// `d_default_value` is a scalar.
-//
-//   d_values[j] = grad_values[reverse_index_map[j]]
-//   d_default_value = sum_{k : 0 .. N_full - 1} (
-//      grad_values[k] * 1{k not in reverse_index_map})
-//
-// Arguments:
-//	reverse_index_map: 1-D.  The reverse index map from SparseFillEmptyRows.
-//	grad_values: 1-D.  The gradients from backprop.
+// OrderedMapPeekAttr is an optional argument to OrderedMapPeek.
+type OrderedMapPeekAttr func(optionalAttr)
+
+// OrderedMapPeekCapacity sets the optional capacity attribute to value.
+// If not specified, defaults to 0
 //
-// Returns 1-D.  The backprop into values.0-D.  The backprop into default_value.
-func SparseFillEmptyRowsGrad(scope *Scope, reverse_index_map tf.Output, grad_values tf.Output) (d_values tf.Output, d_default_value tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SparseFillEmptyRowsGrad",
-		Input: []tf.Input{
-			reverse_index_map, grad_values,
-		},
+// REQUIRES: value >= 0
+func OrderedMapPeekCapacity(value int64) OrderedMapPeekAttr {
+	return func(m optionalAttr) {
+		m["capacity"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
 }
 
-// Computes scaled exponential linear: `scale * alpha * (exp(features) - 1)`
-//
-// if < 0, `scale * features` otherwise.
+// OrderedMapPeekMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
 //
-// See [Self-Normalizing Neural Networks](https://arxiv.org/abs/1706.02515)
-func Selu(scope *Scope, features tf.Output) (activations tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Selu",
-		Input: []tf.Input{
-			features,
-		},
+// REQUIRES: value >= 0
+func OrderedMapPeekMemoryLimit(value int64) OrderedMapPeekAttr {
+	return func(m optionalAttr) {
+		m["memory_limit"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// SetSizeAttr is an optional argument to SetSize.
-type SetSizeAttr func(optionalAttr)
+// OrderedMapPeekContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func OrderedMapPeekContainer(value string) OrderedMapPeekAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
 
-// SetSizeValidateIndices sets the optional validate_indices attribute to value.
-// If not specified, defaults to true
-func SetSizeValidateIndices(value bool) SetSizeAttr {
+// OrderedMapPeekSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func OrderedMapPeekSharedName(value string) OrderedMapPeekAttr {
 	return func(m optionalAttr) {
-		m["validate_indices"] = value
+		m["shared_name"] = value
 	}
 }
 
-// Number of unique elements along last dimension of input `set`.
-//
-// Input `set` is a `SparseTensor` represented by `set_indices`, `set_values`,
-// and `set_shape`. The last dimension contains values in a set, duplicates are
-// allowed but ignored.
-//
-// If `validate_indices` is `True`, this op validates the order and range of `set`
-// indices.
-//
-// Arguments:
-//	set_indices: 2D `Tensor`, indices of a `SparseTensor`.
-//	set_values: 1D `Tensor`, values of a `SparseTensor`.
-//	set_shape: 1D `Tensor`, shape of a `SparseTensor`.
+// Op peeks at the values at the specified key.  If the
 //
-// Returns For `set` ranked `n`, this is a `Tensor` with rank `n-1`, and the same 1st
-// `n-1` dimensions as `set`. Each value is the number of unique elements in
-// the corresponding `[0...n-1]` dimension of `set`.
-func SetSize(scope *Scope, set_indices tf.Output, set_values tf.Output, set_shape tf.Output, optional ...SetSizeAttr) (size tf.Output) {
+// underlying container does not contain this key
+// this op will block until it does.   This Op is optimized for
+// performance.
+func OrderedMapPeek(scope *Scope, key tf.Output, indices tf.Output, dtypes []tf.DataType, optional ...OrderedMapPeekAttr) (values []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"dtypes": dtypes}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SetSize",
+		Type: "OrderedMapPeek",
 		Input: []tf.Input{
-			set_indices, set_values, set_shape,
+			key, indices,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes the sign and the log of the absolute value of the determinant of
-//
-// one or more square matrices.
-//
-// The input is a tensor of shape `[N, M, M]` whose inner-most 2 dimensions
-// form square matrices. The outputs are two tensors containing the signs and
-// absolute values of the log determinants for all N input submatrices
-// `[..., :, :]` such that the determinant = sign*exp(log_abs_determinant).
-// The log_abs_determinant is computed as det(P)*sum(log(diag(LU))) where LU
-// is the LU decomposition of the input and P is the corresponding
-// permutation matrix.
-//
-// Arguments:
-//	input: Shape is `[N, M, M]`.
-//
-// Returns The signs of the log determinants of the inputs. Shape is `[N]`.The logs of the absolute values of the determinants
-// of the N input matrices.  Shape is `[N]`.
-func LogMatrixDeterminant(scope *Scope, input tf.Output) (sign tf.Output, log_abs_determinant tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	opspec := tf.OpSpec{
-		Type: "LogMatrixDeterminant",
-		Input: []tf.Input{
-			input,
-		},
+	var idx int
+	var err error
+	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
+		scope.UpdateErr("OrderedMapPeek", err)
+		return
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return values
 }
 
-// SumAttr is an optional argument to Sum.
-type SumAttr func(optionalAttr)
+// DecodeAndCropJpegAttr is an optional argument to DecodeAndCropJpeg.
+type DecodeAndCropJpegAttr func(optionalAttr)
 
-// SumKeepDims sets the optional keep_dims attribute to value.
+// DecodeAndCropJpegChannels sets the optional channels attribute to value.
 //
-// value: If true, retain reduced dimensions with length 1.
-// If not specified, defaults to false
-func SumKeepDims(value bool) SumAttr {
+// value: Number of color channels for the decoded image.
+// If not specified, defaults to 0
+func DecodeAndCropJpegChannels(value int64) DecodeAndCropJpegAttr {
 	return func(m optionalAttr) {
-		m["keep_dims"] = value
+		m["channels"] = value
 	}
 }
 
-// Computes the sum of elements across dimensions of a tensor.
-//
-// Reduces `input` along the dimensions given in `axis`. Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `axis`. If `keep_dims` is true, the reduced dimensions are
-// retained with length 1.
-//
-// Arguments:
-//	input: The tensor to reduce.
-//	axis: The dimensions to reduce. Must be in the range
-// `[-rank(input), rank(input))`.
+// DecodeAndCropJpegRatio sets the optional ratio attribute to value.
 //
-// Returns The reduced tensor.
-func Sum(scope *Scope, input tf.Output, axis tf.Output, optional ...SumAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "Sum",
-		Input: []tf.Input{
-			input, axis,
-		},
-		Attrs: attrs,
+// value: Downscaling ratio.
+// If not specified, defaults to 1
+func DecodeAndCropJpegRatio(value int64) DecodeAndCropJpegAttr {
+	return func(m optionalAttr) {
+		m["ratio"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Delete the tensor specified by its handle in the session.
-//
-// Arguments:
-//	handle: The handle for a tensor stored in the session state.
+// DecodeAndCropJpegFancyUpscaling sets the optional fancy_upscaling attribute to value.
 //
-// Returns the created operation.
-func DeleteSessionTensor(scope *Scope, handle tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
+// value: If true use a slower but nicer upscaling of the
+// chroma planes (yuv420/422 only).
+// If not specified, defaults to true
+func DecodeAndCropJpegFancyUpscaling(value bool) DecodeAndCropJpegAttr {
+	return func(m optionalAttr) {
+		m["fancy_upscaling"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "DeleteSessionTensor",
-		Input: []tf.Input{
-			handle,
-		},
+}
+
+// DecodeAndCropJpegTryRecoverTruncated sets the optional try_recover_truncated attribute to value.
+//
+// value: If true try to recover an image from truncated input.
+// If not specified, defaults to false
+func DecodeAndCropJpegTryRecoverTruncated(value bool) DecodeAndCropJpegAttr {
+	return func(m optionalAttr) {
+		m["try_recover_truncated"] = value
 	}
-	return scope.AddOperation(opspec)
 }
 
-// L2 Loss.
+// DecodeAndCropJpegAcceptableFraction sets the optional acceptable_fraction attribute to value.
 //
-// Computes half the L2 norm of a tensor without the `sqrt`:
+// value: The minimum required fraction of lines before a truncated
+// input is accepted.
+// If not specified, defaults to 1
+func DecodeAndCropJpegAcceptableFraction(value float32) DecodeAndCropJpegAttr {
+	return func(m optionalAttr) {
+		m["acceptable_fraction"] = value
+	}
+}
+
+// DecodeAndCropJpegDctMethod sets the optional dct_method attribute to value.
 //
-//     output = sum(t ** 2) / 2
+// value: string specifying a hint about the algorithm used for
+// decompression.  Defaults to "" which maps to a system-specific
+// default.  Currently valid values are ["INTEGER_FAST",
+// "INTEGER_ACCURATE"].  The hint may be ignored (e.g., the internal
+// jpeg library changes to a version that does not have that specific
+// option.)
+// If not specified, defaults to ""
+func DecodeAndCropJpegDctMethod(value string) DecodeAndCropJpegAttr {
+	return func(m optionalAttr) {
+		m["dct_method"] = value
+	}
+}
+
+// Decode and Crop a JPEG-encoded image to a uint8 tensor.
+//
+// The attr `channels` indicates the desired number of color channels for the
+// decoded image.
+//
+// Accepted values are:
+//
+// *   0: Use the number of channels in the JPEG-encoded image.
+// *   1: output a grayscale image.
+// *   3: output an RGB image.
+//
+// If needed, the JPEG-encoded image is transformed to match the requested number
+// of color channels.
+//
+// The attr `ratio` allows downscaling the image by an integer factor during
+// decoding.  Allowed values are: 1, 2, 4, and 8.  This is much faster than
+// downscaling the image later.
+//
+//
+// It is equivalent to a combination of decode and crop, but much faster by only
+// decoding partial jpeg image.
 //
 // Arguments:
-//	t: Typically 2-D, but may have any dimensions.
+//	contents: 0-D.  The JPEG-encoded image.
+//	crop_window: 1-D.  The crop window: [crop_y, crop_x, crop_height, crop_width].
 //
-// Returns 0-D.
-func L2Loss(scope *Scope, t tf.Output) (output tf.Output) {
+// Returns 3-D with shape `[height, width, channels]`..
+func DecodeAndCropJpeg(scope *Scope, contents tf.Output, crop_window tf.Output, optional ...DecodeAndCropJpegAttr) (image tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "L2Loss",
+		Type: "DecodeAndCropJpeg",
 		Input: []tf.Input{
-			t,
+			contents, crop_window,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// DenseToSparseSetOperationAttr is an optional argument to DenseToSparseSetOperation.
-type DenseToSparseSetOperationAttr func(optionalAttr)
+// AllCandidateSamplerAttr is an optional argument to AllCandidateSampler.
+type AllCandidateSamplerAttr func(optionalAttr)
 
-// DenseToSparseSetOperationValidateIndices sets the optional validate_indices attribute to value.
-// If not specified, defaults to true
-func DenseToSparseSetOperationValidateIndices(value bool) DenseToSparseSetOperationAttr {
+// AllCandidateSamplerSeed sets the optional seed attribute to value.
+//
+// value: If either seed or seed2 are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func AllCandidateSamplerSeed(value int64) AllCandidateSamplerAttr {
 	return func(m optionalAttr) {
-		m["validate_indices"] = value
+		m["seed"] = value
 	}
 }
 
-// Applies set operation along last dimension of `Tensor` and `SparseTensor`.
+// AllCandidateSamplerSeed2 sets the optional seed2 attribute to value.
 //
-// See SetOperationOp::SetOperationFromContext for values of `set_operation`.
+// value: An second seed to avoid seed collision.
+// If not specified, defaults to 0
+func AllCandidateSamplerSeed2(value int64) AllCandidateSamplerAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Generates labels for candidate sampling with a learned unigram distribution.
 //
-// Input `set2` is a `SparseTensor` represented by `set2_indices`, `set2_values`,
-// and `set2_shape`. For `set2` ranked `n`, 1st `n-1` dimensions must be the same
-// as `set1`. Dimension `n` contains values in a set, duplicates are allowed but
-// ignored.
+// See explanations of candidate sampling and the data formats at
+// go/candidate-sampling.
 //
-// If `validate_indices` is `True`, this op validates the order and range of `set2`
-// indices.
+// For each batch, this op picks a single set of sampled candidate labels.
 //
-// Output `result` is a `SparseTensor` represented by `result_indices`,
-// `result_values`, and `result_shape`. For `set1` and `set2` ranked `n`, this
-// has rank `n` and the same 1st `n-1` dimensions as `set1` and `set2`. The `nth`
-// dimension contains the result of `set_operation` applied to the corresponding
-// `[0...n-1]` dimension of `set`.
+// The advantages of sampling candidates per-batch are simplicity and the
+// possibility of efficient dense matrix multiplication. The disadvantage is that
+// the sampled candidates must be chosen independently of the context and of the
+// true labels.
 //
 // Arguments:
-//	set1: `Tensor` with rank `n`. 1st `n-1` dimensions must be the same as `set2`.
-// Dimension `n` contains values in a set, duplicates are allowed but ignored.
-//	set2_indices: 2D `Tensor`, indices of a `SparseTensor`. Must be in row-major
-// order.
-//	set2_values: 1D `Tensor`, values of a `SparseTensor`. Must be in row-major
-// order.
-//	set2_shape: 1D `Tensor`, shape of a `SparseTensor`. `set2_shape[0...n-1]` must
-// be the same as the 1st `n-1` dimensions of `set1`, `result_shape[n]` is the
-// max set size across `n-1` dimensions.
-//
+//	true_classes: A batch_size * num_true matrix, in which each row contains the
+// IDs of the num_true target_classes in the corresponding original label.
+//	num_true: Number of true labels per context.
+//	num_sampled: Number of candidates to produce.
+//	unique: If unique is true, we sample with rejection, so that all sampled
+// candidates in a batch are unique. This requires some approximation to
+// estimate the post-rejection sampling probabilities.
 //
-// Returns 2D indices of a `SparseTensor`.1D values of a `SparseTensor`.1D `Tensor` shape of a `SparseTensor`. `result_shape[0...n-1]` is
-// the same as the 1st `n-1` dimensions of `set1` and `set2`, `result_shape[n]`
-// is the max result set size across all `0...n-1` dimensions.
-func DenseToSparseSetOperation(scope *Scope, set1 tf.Output, set2_indices tf.Output, set2_values tf.Output, set2_shape tf.Output, set_operation string, optional ...DenseToSparseSetOperationAttr) (result_indices tf.Output, result_values tf.Output, result_shape tf.Output) {
+// Returns A vector of length num_sampled, in which each element is
+// the ID of a sampled candidate.A batch_size * num_true matrix, representing
+// the number of times each candidate is expected to occur in a batch
+// of sampled candidates. If unique=true, then this is a probability.A vector of length num_sampled, for each sampled
+// candidate representing the number of times the candidate is expected
+// to occur in a batch of sampled candidates.  If unique=true, then this is a
+// probability.
+func AllCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, optional ...AllCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"set_operation": set_operation}
+	attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "DenseToSparseSetOperation",
+		Type: "AllCandidateSampler",
 		Input: []tf.Input{
-			set1, set2_indices, set2_values, set2_shape,
+			true_classes,
 		},
 		Attrs: attrs,
 	}
@@ -15811,85 +15195,108 @@ func DenseToSparseSetOperation(scope *Scope, set1 tf.Output, set2_indices tf.Out
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Subtracts a value from the current value of a variable.
+// Saves the input tensors to disk.
 //
-// Any ReadVariableOp which depends directly or indirectly on this assign is
-// guaranteed to see the incremented value or a subsequent newer one.
+// The size of `tensor_names` must match the number of tensors in `data`. `data[i]`
+// is written to `filename` with name `tensor_names[i]`.
 //
-// Outputs the incremented value, which can be used to totally order the
-// increments to this variable.
+// See also `SaveSlices`.
 //
 // Arguments:
-//	resource: handle to the resource in which to store the variable.
-//	value: the value by which the variable will be incremented.
+//	filename: Must have a single element. The name of the file to which we write
+// the tensor.
+//	tensor_names: Shape `[N]`. The names of the tensors to be saved.
+//	data: `N` tensors to save.
 //
 // Returns the created operation.
-func AssignSubVariableOp(scope *Scope, resource tf.Output, value tf.Output) (o *tf.Operation) {
+func Save(scope *Scope, filename tf.Output, tensor_names tf.Output, data []tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "AssignSubVariableOp",
+		Type: "Save",
 		Input: []tf.Input{
-			resource, value,
+			filename, tensor_names, tf.OutputList(data),
 		},
 	}
 	return scope.AddOperation(opspec)
 }
 
-// RestoreAttr is an optional argument to Restore.
-type RestoreAttr func(optionalAttr)
+// Returns element-wise remainder of division. When `x < 0` xor `y < 0` is
+//
+// true, this follows Python semantics in that the result here is consistent
+// with a flooring divide. E.g. `floor(x / y) * y + mod(x, y) = x`.
+//
+// *NOTE*: `FloorMod` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func FloorMod(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "FloorMod",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
 
-// RestorePreferredShard sets the optional preferred_shard attribute to value.
+// SparseTensorDenseMatMulAttr is an optional argument to SparseTensorDenseMatMul.
+type SparseTensorDenseMatMulAttr func(optionalAttr)
+
+// SparseTensorDenseMatMulAdjointA sets the optional adjoint_a attribute to value.
 //
-// value: Index of file to open first if multiple files match
-// `file_pattern`.
-// If not specified, defaults to -1
-func RestorePreferredShard(value int64) RestoreAttr {
+// value: Use the adjoint of A in the matrix multiply.  If A is complex, this
+// is transpose(conj(A)).  Otherwise it's transpose(A).
+// If not specified, defaults to false
+func SparseTensorDenseMatMulAdjointA(value bool) SparseTensorDenseMatMulAttr {
 	return func(m optionalAttr) {
-		m["preferred_shard"] = value
+		m["adjoint_a"] = value
 	}
 }
 
-// Restores a tensor from checkpoint files.
+// SparseTensorDenseMatMulAdjointB sets the optional adjoint_b attribute to value.
 //
-// Reads a tensor stored in one or several files. If there are several files (for
-// instance because a tensor was saved as slices), `file_pattern` may contain
-// wildcard symbols (`*` and `?`) in the filename portion only, not in the
-// directory portion.
+// value: Use the adjoint of B in the matrix multiply.  If B is complex, this
+// is transpose(conj(B)).  Otherwise it's transpose(B).
+// If not specified, defaults to false
+func SparseTensorDenseMatMulAdjointB(value bool) SparseTensorDenseMatMulAttr {
+	return func(m optionalAttr) {
+		m["adjoint_b"] = value
+	}
+}
+
+// Multiply SparseTensor (of rank 2) "A" by dense matrix "B".
 //
-// If a `file_pattern` matches several files, `preferred_shard` can be used to hint
-// in which file the requested tensor is likely to be found. This op will first
-// open the file at index `preferred_shard` in the list of matching files and try
-// to restore tensors from that file.  Only if some tensors or tensor slices are
-// not found in that first file, then the Op opens all the files. Setting
-// `preferred_shard` to match the value passed as the `shard` input
-// of a matching `Save` Op may speed up Restore.  This attribute only affects
-// performance, not correctness.  The default value -1 means files are processed in
-// order.
+// No validity checking is performed on the indices of A.  However, the following
+// input format is recommended for optimal behavior:
 //
-// See also `RestoreSlice`.
+// if adjoint_a == false:
+//   A should be sorted in lexicographically increasing order.  Use SparseReorder
+//   if you're not sure.
+// if adjoint_a == true:
+//   A should be sorted in order of increasing dimension 1 (i.e., "column major"
+//   order instead of "row major" order).
 //
 // Arguments:
-//	file_pattern: Must have a single element. The pattern of the files from
-// which we read the tensor.
-//	tensor_name: Must have a single element. The name of the tensor to be
-// restored.
-//	dt: The type of the tensor to be restored.
-//
-// Returns The restored tensor.
-func Restore(scope *Scope, file_pattern tf.Output, tensor_name tf.Output, dt tf.DataType, optional ...RestoreAttr) (tensor tf.Output) {
+//	a_indices: 2-D.  The `indices` of the `SparseTensor`, size `[nnz, 2]` Matrix.
+//	a_values: 1-D.  The `values` of the `SparseTensor`, size `[nnz]` Vector.
+//	a_shape: 1-D.  The `shape` of the `SparseTensor`, size `[2]` Vector.
+//	b: 2-D.  A dense Matrix.
+func SparseTensorDenseMatMul(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b tf.Output, optional ...SparseTensorDenseMatMulAttr) (product tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dt": dt}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Restore",
+		Type: "SparseTensorDenseMatMul",
 		Input: []tf.Input{
-			file_pattern, tensor_name,
+			a_indices, a_values, a_shape, b,
 		},
 		Attrs: attrs,
 	}
@@ -15897,578 +15304,500 @@ func Restore(scope *Scope, file_pattern tf.Output, tensor_name tf.Output, dt tf.
 	return op.Output(0)
 }
 
-// QuantizedResizeBilinearAttr is an optional argument to QuantizedResizeBilinear.
-type QuantizedResizeBilinearAttr func(optionalAttr)
-
-// QuantizedResizeBilinearAlignCorners sets the optional align_corners attribute to value.
-//
-// value: If true, rescale input by (new_height - 1) / (height - 1), which
-// exactly aligns the 4 corners of images and resized images. If false, rescale
-// by new_height / height. Treat similarly the width dimension.
-// If not specified, defaults to false
-func QuantizedResizeBilinearAlignCorners(value bool) QuantizedResizeBilinearAttr {
-	return func(m optionalAttr) {
-		m["align_corners"] = value
-	}
-}
-
-// Resize quantized `images` to `size` using quantized bilinear interpolation.
+// Deserialize and concatenate `SparseTensors` from a serialized minibatch.
 //
-// Input images and output images must be quantized types.
+// The input `serialized_sparse` must be a string matrix of shape `[N x 3]` where
+// `N` is the minibatch size and the rows correspond to packed outputs of
+// `SerializeSparse`.  The ranks of the original `SparseTensor` objects
+// must all match.  When the final `SparseTensor` is created, it has rank one
+// higher than the ranks of the incoming `SparseTensor` objects
+// (they have been concatenated along a new row dimension).
 //
-// Arguments:
-//	images: 4-D with shape `[batch, height, width, channels]`.
-//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
-// new size for the images.
+// The output `SparseTensor` object's shape values for all dimensions but the
+// first are the max across the input `SparseTensor` objects' shape values
+// for the corresponding dimensions.  Its first shape value is `N`, the minibatch
+// size.
 //
+// The input `SparseTensor` objects' indices are assumed ordered in
+// standard lexicographic order.  If this is not the case, after this
+// step run `SparseReorder` to restore index ordering.
 //
+// For example, if the serialized input is a `[2 x 3]` matrix representing two
+// original `SparseTensor` objects:
 //
-// Returns 4-D with shape
-// `[batch, new_height, new_width, channels]`.
-func QuantizedResizeBilinear(scope *Scope, images tf.Output, size tf.Output, min tf.Output, max tf.Output, optional ...QuantizedResizeBilinearAttr) (resized_images tf.Output, out_min tf.Output, out_max tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "QuantizedResizeBilinear",
-		Input: []tf.Input{
-			images, size, min, max,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// Computes the minimum along segments of a tensor.
+//     index = [ 0]
+//             [10]
+//             [20]
+//     values = [1, 2, 3]
+//     shape = [50]
 //
-// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
-// segments.
+// and
 //
-// Computes a tensor such that
-// \\(output_i = \min_j(data_j)\\) where `min` is over `j` such
-// that `segment_ids[j] == i`.
+//     index = [ 2]
+//             [10]
+//     values = [4, 5]
+//     shape = [30]
 //
-// If the min is empty for a given segment ID `i`, `output[i] = 0`.
+// then the final deserialized `SparseTensor` will be:
 //
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentMin.png" alt>
-// </div>
+//     index = [0  0]
+//             [0 10]
+//             [0 20]
+//             [1  2]
+//             [1 10]
+//     values = [1, 2, 3, 4, 5]
+//     shape = [2 50]
 //
 // Arguments:
-//
-//	segment_ids: A 1-D tensor whose rank is equal to the rank of `data`'s
-// first dimension.  Values should be sorted and can be repeated.
-//
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SegmentMin(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
+//	serialized_sparse: 2-D, The `N` serialized `SparseTensor` objects.
+// Must have 3 columns.
+//	dtype: The `dtype` of the serialized `SparseTensor` objects.
+func DeserializeManySparse(scope *Scope, serialized_sparse tf.Output, dtype tf.DataType) (sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"dtype": dtype}
 	opspec := tf.OpSpec{
-		Type: "SegmentMin",
+		Type: "DeserializeManySparse",
 		Input: []tf.Input{
-			data, segment_ids,
+			serialized_sparse,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// SdcaOptimizerAttr is an optional argument to SdcaOptimizer.
-type SdcaOptimizerAttr func(optionalAttr)
+// StringJoinAttr is an optional argument to StringJoin.
+type StringJoinAttr func(optionalAttr)
 
-// SdcaOptimizerAdaptative sets the optional adaptative attribute to value.
+// StringJoinSeparator sets the optional separator attribute to value.
 //
-// value: Whether to use Adapative SDCA for the inner loop.
-// If not specified, defaults to false
-func SdcaOptimizerAdaptative(value bool) SdcaOptimizerAttr {
+// value: string, an optional join separator.
+// If not specified, defaults to ""
+func StringJoinSeparator(value string) StringJoinAttr {
 	return func(m optionalAttr) {
-		m["adaptative"] = value
+		m["separator"] = value
 	}
 }
 
-// Distributed version of Stochastic Dual Coordinate Ascent (SDCA) optimizer for
-//
-// linear models with L1 + L2 regularization. As global optimization objective is
-// strongly-convex, the optimizer optimizes the dual objective at each step. The
-// optimizer applies each update one example at a time. Examples are sampled
-// uniformly, and the optimizer is learning rate free and enjoys linear convergence
-// rate.
-//
-// [Proximal Stochastic Dual Coordinate Ascent](http://arxiv.org/pdf/1211.2717v1.pdf).<br>
-// Shai Shalev-Shwartz, Tong Zhang. 2012
-//
-// $$Loss Objective = \sum f_{i} (wx_{i}) + (l2 / 2) * |w|^2 + l1 * |w|$$
-//
-// [Adding vs. Averaging in Distributed Primal-Dual Optimization](http://arxiv.org/abs/1502.03508).<br>
-// Chenxin Ma, Virginia Smith, Martin Jaggi, Michael I. Jordan,
-// Peter Richtarik, Martin Takac. 2015
+// Joins the strings in the given list of string tensors into one tensor;
 //
-// [Stochastic Dual Coordinate Ascent with Adaptive Probabilities](https://arxiv.org/abs/1502.08053).<br>
-// Dominik Csiba, Zheng Qu, Peter Richtarik. 2015
+// with the given separator (default is an empty separator).
 //
 // Arguments:
-//	sparse_example_indices: a list of vectors which contain example indices.
-//	sparse_feature_indices: a list of vectors which contain feature indices.
-//	sparse_feature_values: a list of vectors which contains feature value
-// associated with each feature group.
-//	dense_features: a list of matrices which contains the dense feature values.
-//	example_weights: a vector which contains the weight associated with each
-// example.
-//	example_labels: a vector which contains the label/target associated with each
-// example.
-//	sparse_indices: a list of vectors where each value is the indices which has
-// corresponding weights in sparse_weights. This field maybe omitted for the
-// dense approach.
-//	sparse_weights: a list of vectors where each value is the weight associated with
-// a sparse feature group.
-//	dense_weights: a list of vectors where the values are the weights associated
-// with a dense feature group.
-//	example_state_data: a list of vectors containing the example state data.
-//	loss_type: Type of the primal loss. Currently SdcaSolver supports logistic,
-// squared and hinge losses.
-//	l1: Symmetric l1 regularization strength.
-//	l2: Symmetric l2 regularization strength.
-//	num_loss_partitions: Number of partitions of the global loss function.
-//	num_inner_iterations: Number of iterations per mini-batch.
-//
-// Returns a list of vectors containing the updated example state
-// data.a list of vectors where each value is the delta
-// weights associated with a sparse feature group.a list of vectors where the values are the delta
-// weights associated with a dense feature group.
-func SdcaOptimizer(scope *Scope, sparse_example_indices []tf.Output, sparse_feature_indices []tf.Output, sparse_feature_values []tf.Output, dense_features []tf.Output, example_weights tf.Output, example_labels tf.Output, sparse_indices []tf.Output, sparse_weights []tf.Output, dense_weights []tf.Output, example_state_data tf.Output, loss_type string, l1 float32, l2 float32, num_loss_partitions int64, num_inner_iterations int64, optional ...SdcaOptimizerAttr) (out_example_state_data tf.Output, out_delta_sparse_weights []tf.Output, out_delta_dense_weights []tf.Output) {
+//	inputs: A list of string tensors.  The tensors must all have the same shape,
+// or be scalars.  Scalars may be mixed in; these will be broadcast to the shape
+// of non-scalar inputs.
+func StringJoin(scope *Scope, inputs []tf.Output, optional ...StringJoinAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"loss_type": loss_type, "l1": l1, "l2": l2, "num_loss_partitions": num_loss_partitions, "num_inner_iterations": num_inner_iterations}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SdcaOptimizer",
+		Type: "StringJoin",
 		Input: []tf.Input{
-			tf.OutputList(sparse_example_indices), tf.OutputList(sparse_feature_indices), tf.OutputList(sparse_feature_values), tf.OutputList(dense_features), example_weights, example_labels, tf.OutputList(sparse_indices), tf.OutputList(sparse_weights), tf.OutputList(dense_weights), example_state_data,
+			tf.OutputList(inputs),
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	out_example_state_data = op.Output(idx)
-	if out_delta_sparse_weights, idx, err = makeOutputList(op, idx, "out_delta_sparse_weights"); err != nil {
-		scope.UpdateErr("SdcaOptimizer", err)
-		return
-	}
-	if out_delta_dense_weights, idx, err = makeOutputList(op, idx, "out_delta_dense_weights"); err != nil {
-		scope.UpdateErr("SdcaOptimizer", err)
-		return
-	}
-	return out_example_state_data, out_delta_sparse_weights, out_delta_dense_weights
-}
-
-// SparseMatMulAttr is an optional argument to SparseMatMul.
-type SparseMatMulAttr func(optionalAttr)
-
-// SparseMatMulTransposeA sets the optional transpose_a attribute to value.
-// If not specified, defaults to false
-func SparseMatMulTransposeA(value bool) SparseMatMulAttr {
-	return func(m optionalAttr) {
-		m["transpose_a"] = value
-	}
-}
-
-// SparseMatMulTransposeB sets the optional transpose_b attribute to value.
-// If not specified, defaults to false
-func SparseMatMulTransposeB(value bool) SparseMatMulAttr {
-	return func(m optionalAttr) {
-		m["transpose_b"] = value
-	}
-}
-
-// SparseMatMulAIsSparse sets the optional a_is_sparse attribute to value.
-// If not specified, defaults to false
-func SparseMatMulAIsSparse(value bool) SparseMatMulAttr {
-	return func(m optionalAttr) {
-		m["a_is_sparse"] = value
-	}
-}
-
-// SparseMatMulBIsSparse sets the optional b_is_sparse attribute to value.
-// If not specified, defaults to false
-func SparseMatMulBIsSparse(value bool) SparseMatMulAttr {
-	return func(m optionalAttr) {
-		m["b_is_sparse"] = value
-	}
+	return op.Output(0)
 }
 
-// Multiply matrix "a" by matrix "b".
+// Returns immutable tensor from memory region.
 //
-// The inputs must be two-dimensional matrices and the inner dimension of "a" must
-// match the outer dimension of "b". This op is optimized for the case where at
-// least one of "a" or "b" is sparse. The breakeven for using this versus a dense
-// matrix multiply on one platform was 30% zero values in the sparse matrix.
+// The current implementation memmaps the tensor from a file.
 //
-// The gradient computation of this operation will only take advantage of sparsity
-// in the input gradient when that gradient comes from a Relu.
-func SparseMatMul(scope *Scope, a tf.Output, b tf.Output, optional ...SparseMatMulAttr) (product tf.Output) {
+// Arguments:
+//	dtype: Type of the returned tensor.
+//	shape: Shape of the returned tensor.
+//	memory_region_name: Name of readonly memory region used by the tensor, see
+// NewReadOnlyMemoryRegionFromFile in tensorflow::Env.
+func ImmutableConst(scope *Scope, dtype tf.DataType, shape tf.Shape, memory_region_name string) (tensor tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"dtype": dtype, "shape": shape, "memory_region_name": memory_region_name}
 	opspec := tf.OpSpec{
-		Type: "SparseMatMul",
-		Input: []tf.Input{
-			a, b,
-		},
+		Type: "ImmutableConst",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the power of one value to another.
+// Inverse real-valued fast Fourier transform.
 //
-// Given a tensor `x` and a tensor `y`, this operation computes \\(x^y\\) for
-// corresponding elements in `x` and `y`. For example:
+// Computes the inverse 1-dimensional discrete Fourier transform of a real-valued
+// signal over the inner-most dimension of `input`.
 //
-// ```
-// # tensor 'x' is [[2, 2]], [3, 3]]
-// # tensor 'y' is [[8, 16], [2, 3]]
-// tf.pow(x, y) ==> [[256, 65536], [9, 27]]
-// ```
-func Pow(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// The inner-most dimension of `input` is assumed to be the result of `RFFT`: the
+// `fft_length / 2 + 1` unique components of the DFT of a real-valued signal. If
+// `fft_length` is not provided, it is computed from the size of the inner-most
+// dimension of `input` (`fft_length = 2 * (inner - 1)`). If the FFT length used to
+// compute `input` is odd, it should be provided since it cannot be inferred
+// properly.
+//
+// Along the axis `IRFFT` is computed on, if `fft_length / 2 + 1` is smaller
+// than the corresponding dimension of `input`, the dimension is cropped. If it is
+// larger, the dimension is padded with zeros.
+//
+// Arguments:
+//	input: A complex64 tensor.
+//	fft_length: An int32 tensor of shape [1]. The FFT length.
+//
+// Returns A float32 tensor of the same rank as `input`. The inner-most
+//   dimension of `input` is replaced with the `fft_length` samples of its inverse
+//   1D Fourier transform.
+//
+// @compatibility(numpy)
+// Equivalent to np.fft.irfft
+// @end_compatibility
+func IRFFT(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Pow",
+		Type: "IRFFT",
 		Input: []tf.Input{
-			x, y,
+			input, fft_length,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ShapeAttr is an optional argument to Shape.
-type ShapeAttr func(optionalAttr)
-
-// ShapeOutType sets the optional out_type attribute to value.
-// If not specified, defaults to DT_INT32
-func ShapeOutType(value tf.DataType) ShapeAttr {
-	return func(m optionalAttr) {
-		m["out_type"] = value
-	}
-}
-
-// Returns the shape of a tensor.
+// Concatenates a list of `SparseTensor` along the specified dimension.
 //
-// This operation returns a 1-D integer tensor representing the shape of `input`.
+// Concatenation is with respect to the dense versions of these sparse tensors.
+// It is assumed that each input is a `SparseTensor` whose elements are ordered
+// along increasing dimension number.
 //
-// For example:
+// All inputs' shapes must match, except for the concat dimension.  The
+// `indices`, `values`, and `shapes` lists must have the same length.
 //
-// ```
-// # 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]
-// shape(t) ==> [2, 2, 3]
-// ```
-func Shape(scope *Scope, input tf.Output, optional ...ShapeAttr) (output tf.Output) {
+// The output shape is identical to the inputs', except along the concat
+// dimension, where it is the sum of the inputs' sizes along that dimension.
+//
+// The output elements will be resorted to preserve the sort order along
+// increasing dimension number.
+//
+// This op runs in `O(M log M)` time, where `M` is the total number of non-empty
+// values across all inputs. This is due to the need for an internal sort in
+// order to concatenate efficiently across an arbitrary dimension.
+//
+// For example, if `concat_dim = 1` and the inputs are
+//
+//     sp_inputs[0]: shape = [2, 3]
+//     [0, 2]: "a"
+//     [1, 0]: "b"
+//     [1, 1]: "c"
+//
+//     sp_inputs[1]: shape = [2, 4]
+//     [0, 1]: "d"
+//     [0, 2]: "e"
+//
+// then the output will be
+//
+//     shape = [2, 7]
+//     [0, 2]: "a"
+//     [0, 4]: "d"
+//     [0, 5]: "e"
+//     [1, 0]: "b"
+//     [1, 1]: "c"
+//
+// Graphically this is equivalent to doing
+//
+//     [    a] concat [  d e  ] = [    a   d e  ]
+//     [b c  ]        [       ]   [b c          ]
+//
+// Arguments:
+//	indices: 2-D.  Indices of each input `SparseTensor`.
+//	values: 1-D.  Non-empty values of each `SparseTensor`.
+//	shapes: 1-D.  Shapes of each `SparseTensor`.
+//	concat_dim: Dimension to concatenate along. Must be in range [-rank, rank),
+// where rank is the number of dimensions in each input `SparseTensor`.
+//
+// Returns 2-D.  Indices of the concatenated `SparseTensor`.1-D.  Non-empty values of the concatenated `SparseTensor`.1-D.  Shape of the concatenated `SparseTensor`.
+func SparseConcat(scope *Scope, indices []tf.Output, values []tf.Output, shapes []tf.Output, concat_dim int64) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"concat_dim": concat_dim}
 	opspec := tf.OpSpec{
-		Type: "Shape",
+		Type: "SparseConcat",
 		Input: []tf.Input{
-			input,
+			tf.OutputList(indices), tf.OutputList(values), tf.OutputList(shapes),
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Computes fingerprints of the input strings.
+// Generates sparse cross from a list of sparse and dense tensors.
+//
+// The op takes two lists, one of 2D `SparseTensor` and one of 2D `Tensor`, each
+// representing features of one feature column. It outputs a 2D `SparseTensor` with
+// the batchwise crosses of these features.
+//
+// For example, if the inputs are
+//
+//     inputs[0]: SparseTensor with shape = [2, 2]
+//     [0, 0]: "a"
+//     [1, 0]: "b"
+//     [1, 1]: "c"
+//
+//     inputs[1]: SparseTensor with shape = [2, 1]
+//     [0, 0]: "d"
+//     [1, 0]: "e"
+//
+//     inputs[2]: Tensor [["f"], ["g"]]
+//
+// then the output will be
+//
+//     shape = [2, 2]
+//     [0, 0]: "a_X_d_X_f"
+//     [1, 0]: "b_X_e_X_g"
+//     [1, 1]: "c_X_e_X_g"
+//
+// if hashed_output=true then the output will be
+//
+//     shape = [2, 2]
+//     [0, 0]: FingerprintCat64(
+//                 Fingerprint64("f"), FingerprintCat64(
+//                     Fingerprint64("d"), Fingerprint64("a")))
+//     [1, 0]: FingerprintCat64(
+//                 Fingerprint64("g"), FingerprintCat64(
+//                     Fingerprint64("e"), Fingerprint64("b")))
+//     [1, 1]: FingerprintCat64(
+//                 Fingerprint64("g"), FingerprintCat64(
+//                     Fingerprint64("e"), Fingerprint64("c")))
 //
 // Arguments:
-//	input: vector of strings to compute fingerprints on.
+//	indices: 2-D.  Indices of each input `SparseTensor`.
+//	values: 1-D.   values of each `SparseTensor`.
+//	shapes: 1-D.   Shapes of each `SparseTensor`.
+//	dense_inputs: 2-D.    Columns represented by dense `Tensor`.
+//	hashed_output: If true, returns the hash of the cross instead of the string.
+// This will allow us avoiding string manipulations.
+//	num_buckets: It is used if hashed_output is true.
+// output = hashed_value%num_buckets if num_buckets > 0 else hashed_value.
+//	hash_key: Specify the hash_key that will be used by the `FingerprintCat64`
+// function to combine the crosses fingerprints.
 //
-// Returns a (N,2) shaped matrix where N is the number of elements in the input
-// vector. Each row contains the low and high parts of the fingerprint.
-func SdcaFprint(scope *Scope, input tf.Output) (output tf.Output) {
+//
+//
+// Returns 2-D.  Indices of the concatenated `SparseTensor`.1-D.  Non-empty values of the concatenated or hashed
+// `SparseTensor`.1-D.  Shape of the concatenated `SparseTensor`.
+func SparseCross(scope *Scope, indices []tf.Output, values []tf.Output, shapes []tf.Output, dense_inputs []tf.Output, hashed_output bool, num_buckets int64, hash_key int64, out_type tf.DataType, internal_type tf.DataType) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"hashed_output": hashed_output, "num_buckets": num_buckets, "hash_key": hash_key, "out_type": out_type, "internal_type": internal_type}
 	opspec := tf.OpSpec{
-		Type: "SdcaFprint",
+		Type: "SparseCross",
 		Input: []tf.Input{
-			input,
+			tf.OutputList(indices), tf.OutputList(values), tf.OutputList(shapes), tf.OutputList(dense_inputs),
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// RandomPoissonV2Attr is an optional argument to RandomPoissonV2.
-type RandomPoissonV2Attr func(optionalAttr)
-
-// RandomPoissonV2Seed sets the optional seed attribute to value.
-//
-// value: If either `seed` or `seed2` are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func RandomPoissonV2Seed(value int64) RandomPoissonV2Attr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
-
-// RandomPoissonV2Seed2 sets the optional seed2 attribute to value.
-//
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func RandomPoissonV2Seed2(value int64) RandomPoissonV2Attr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
-	}
-}
-
-// RandomPoissonV2Dtype sets the optional dtype attribute to value.
-// If not specified, defaults to DT_INT64
-func RandomPoissonV2Dtype(value tf.DataType) RandomPoissonV2Attr {
-	return func(m optionalAttr) {
-		m["dtype"] = value
-	}
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Outputs random values from the Poisson distribution(s) described by rate.
-//
-// This op uses two algorithms, depending on rate. If rate >= 10, then
-// the algorithm by Hormann is used to acquire samples via
-// transformation-rejection.
-// See http://www.sciencedirect.com/science/article/pii/0167668793909974.
-//
-// Otherwise, Knuth's algorithm is used to acquire samples via multiplying uniform
-// random variables.
-// See Donald E. Knuth (1969). Seminumerical Algorithms. The Art of Computer
-// Programming, Volume 2. Addison Wesley
+// Concatenates quantized tensors along one dimension.
 //
 // Arguments:
-//	shape: 1-D integer tensor. Shape of independent samples to draw from each
-// distribution described by the shape parameters given in rate.
-//	rate: A tensor in which each scalar is a "rate" parameter describing the
-// associated poisson distribution.
+//	concat_dim: 0-D.  The dimension along which to concatenate.  Must be in the
+// range [0, rank(values)).
+//	values: The `N` Tensors to concatenate. Their ranks and types must match,
+// and their sizes must match in all dimensions except `concat_dim`.
+//	input_mins: The minimum scalar values for each of the input tensors.
+//	input_maxes: The maximum scalar values for each of the input tensors.
 //
-// Returns A tensor with shape `shape + shape(rate)`. Each slice
-// `[:, ..., :, i0, i1, ...iN]` contains the samples drawn for
-// `rate[i0, i1, ...iN]`.
-func RandomPoissonV2(scope *Scope, shape tf.Output, rate tf.Output, optional ...RandomPoissonV2Attr) (output tf.Output) {
+// Returns A `Tensor` with the concatenation of values stacked along the
+// `concat_dim` dimension.  This tensor's shape matches that of `values` except
+// in `concat_dim` where it has the sum of the sizes.The float value that the minimum quantized output value represents.The float value that the maximum quantized output value represents.
+func QuantizedConcat(scope *Scope, concat_dim tf.Output, values []tf.Output, input_mins []tf.Output, input_maxes []tf.Output) (output tf.Output, output_min tf.Output, output_max tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "RandomPoissonV2",
+		Type: "QuantizedConcat",
 		Input: []tf.Input{
-			shape, rate,
+			concat_dim, tf.OutputList(values), tf.OutputList(input_mins), tf.OutputList(input_maxes),
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// MatrixTriangularSolveAttr is an optional argument to MatrixTriangularSolve.
-type MatrixTriangularSolveAttr func(optionalAttr)
-
-// MatrixTriangularSolveLower sets the optional lower attribute to value.
-//
-// value: Boolean indicating whether the innermost matrices in `matrix` are
-// lower or upper triangular.
-// If not specified, defaults to true
-func MatrixTriangularSolveLower(value bool) MatrixTriangularSolveAttr {
-	return func(m optionalAttr) {
-		m["lower"] = value
-	}
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// MatrixTriangularSolveAdjoint sets the optional adjoint attribute to value.
+// Slice a `SparseTensor` based on the `start` and `size`.
 //
-// value: Boolean indicating whether to solve with `matrix` or its (block-wise)
-//          adjoint.
+// For example, if the input is
 //
-// @compatibility(numpy)
-// Equivalent to np.linalg.triangular_solve
-// @end_compatibility
-// If not specified, defaults to false
-func MatrixTriangularSolveAdjoint(value bool) MatrixTriangularSolveAttr {
-	return func(m optionalAttr) {
-		m["adjoint"] = value
-	}
-}
-
-// Solves systems of linear equations with upper or lower triangular matrices by
+//     input_tensor = shape = [2, 7]
+//     [    a   d e  ]
+//     [b c          ]
 //
-// backsubstitution.
+// Graphically the output tensors are:
 //
-// `matrix` is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions form
-// square matrices. If `lower` is `True` then the strictly upper triangular part
-// of each inner-most matrix is assumed to be zero and not accessed.
-// If `lower` is False then the strictly lower triangular part of each inner-most
-// matrix is assumed to be zero and not accessed.
-// `rhs` is a tensor of shape `[..., M, K]`.
+//     sparse_slice([0, 0], [2, 4]) = shape = [2, 4]
+//     [    a  ]
+//     [b c    ]
 //
-// The output is a tensor of shape `[..., M, K]`. If `adjoint` is
-// `True` then the innermost matrices in `output` satisfy matrix equations
-// `matrix[..., :, :] * output[..., :, :] = rhs[..., :, :]`.
-// If `adjoint` is `False` then the strictly then the  innermost matrices in
-// `output` satisfy matrix equations
-// `adjoint(matrix[..., i, k]) * output[..., k, j] = rhs[..., i, j]`.
+//     sparse_slice([0, 4], [2, 3]) = shape = [2, 3]
+//     [ d e  ]
+//     [      ]
 //
 // Arguments:
-//	matrix: Shape is `[..., M, M]`.
-//	rhs: Shape is `[..., M, K]`.
+//	indices: 2-D tensor represents the indices of the sparse tensor.
+//	values: 1-D tensor represents the values of the sparse tensor.
+//	shape: 1-D. tensor represents the shape of the sparse tensor.
+//	start: 1-D. tensor represents the start of the slice.
+//	size: 1-D. tensor represents the size of the slice.
+// output indices: A list of 1-D tensors represents the indices of the output
+// sparse tensors.
 //
-// Returns Shape is `[..., M, K]`.
-func MatrixTriangularSolve(scope *Scope, matrix tf.Output, rhs tf.Output, optional ...MatrixTriangularSolveAttr) (output tf.Output) {
+// Returns A list of 1-D tensors represents the values of the output sparse
+// tensors.A list of 1-D tensors represents the shape of the output sparse
+// tensors.
+func SparseSlice(scope *Scope, indices tf.Output, values tf.Output, shape tf.Output, start tf.Output, size tf.Output) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "MatrixTriangularSolve",
+		Type: "SparseSlice",
 		Input: []tf.Input{
-			matrix, rhs,
+			indices, values, shape, start, size,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Computes inverse hyperbolic sine of x element-wise.
-func Asinh(scope *Scope, x tf.Output) (y tf.Output) {
+// Adds up a `SparseTensor` and a dense `Tensor`, producing a dense `Tensor`.
+//
+// This Op does not require `a_indices` be sorted in standard lexicographic order.
+//
+// Arguments:
+//	a_indices: 2-D.  The `indices` of the `SparseTensor`, with shape `[nnz, ndims]`.
+//	a_values: 1-D.  The `values` of the `SparseTensor`, with shape `[nnz]`.
+//	a_shape: 1-D.  The `shape` of the `SparseTensor`, with shape `[ndims]`.
+//	b: `ndims`-D Tensor.  With shape `a_shape`.
+func SparseTensorDenseAdd(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Asinh",
+		Type: "SparseTensorDenseAdd",
 		Input: []tf.Input{
-			x,
+			a_indices, a_values, a_shape, b,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Creates a dataset with a range of values. Corresponds to python's xrange.
+// Returns the set of files matching one or more glob patterns.
 //
-// Arguments:
-//	start: corresponds to start in python's xrange().
-//	stop: corresponds to stop in python's xrange().
-//	step: corresponds to step in python's xrange().
+// Note that this routine only supports wildcard characters in the
+// basename portion of the pattern, not in the directory portion.
 //
+// Arguments:
+//	pattern: Shell wildcard pattern(s). Scalar or vector of type string.
 //
-func RangeDataset(scope *Scope, start tf.Output, stop tf.Output, step tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// Returns A vector of matching filenames.
+func MatchingFiles(scope *Scope, pattern tf.Output) (filenames tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "RangeDataset",
+		Type: "MatchingFiles",
 		Input: []tf.Input{
-			start, stop, step,
+			pattern,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// DepthwiseConv2dNativeBackpropInputAttr is an optional argument to DepthwiseConv2dNativeBackpropInput.
-type DepthwiseConv2dNativeBackpropInputAttr func(optionalAttr)
+// MatrixSolveLsAttr is an optional argument to MatrixSolveLs.
+type MatrixSolveLsAttr func(optionalAttr)
 
-// DepthwiseConv2dNativeBackpropInputDataFormat sets the optional data_format attribute to value.
-//
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, height, width, channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, channels, height, width].
-// If not specified, defaults to "NHWC"
-func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dNativeBackpropInputAttr {
+// MatrixSolveLsFast sets the optional fast attribute to value.
+// If not specified, defaults to true
+func MatrixSolveLsFast(value bool) MatrixSolveLsAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["fast"] = value
 	}
 }
 
-// DepthwiseConv2dNativeBackpropInputDilations sets the optional dilations attribute to value.
+// Solves one or more linear least-squares problems.
 //
-// value: 1-D tensor of length 4.  The dilation factor for each dimension of
-// `input`. If set to k > 1, there will be k-1 skipped cells between each filter
-// element on that dimension. The dimension order is determined by the value of
-// `data_format`, see above for details. Dilations in the batch and depth
-// dimensions must be 1.
-// If not specified, defaults to <i:1 i:1 i:1 i:1 >
-func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr {
-	return func(m optionalAttr) {
-		m["dilations"] = value
-	}
-}
-
-// Computes the gradients of depthwise convolution with respect to the input.
+// `matrix` is a tensor of shape `[..., M, N]` whose inner-most 2 dimensions
+// form real or complex matrices of size `[M, N]`. `Rhs` is a tensor of the same
+// type as `matrix` and shape `[..., M, K]`.
+// The output is a tensor shape `[..., N, K]` where each output matrix solves
+// each of the equations
+// `matrix[..., :, :]` * `output[..., :, :]` = `rhs[..., :, :]`
+// in the least squares sense.
+//
+// We use the following notation for (complex) matrix and right-hand sides
+// in the batch:
+//
+// `matrix`=\\(A \in \mathbb{C}^{m \times n}\\),
+// `rhs`=\\(B  \in \mathbb{C}^{m \times k}\\),
+// `output`=\\(X  \in \mathbb{C}^{n \times k}\\),
+// `l2_regularizer`=\\(\lambda \in \mathbb{R}\\).
+//
+// If `fast` is `True`, then the solution is computed by solving the normal
+// equations using Cholesky decomposition. Specifically, if \\(m \ge n\\) then
+// \\(X = (A^H A + \lambda I)^{-1} A^H B\\), which solves the least-squares
+// problem \\(X = \mathrm{argmin}_{Z \in \Re^{n \times k} } ||A Z - B||_F^2 +
+// \lambda ||Z||_F^2\\). If \\(m \lt n\\) then `output` is computed as
+// \\(X = A^H (A A^H + \lambda I)^{-1} B\\), which (for \\(\lambda = 0\\)) is the
+// minimum-norm solution to the under-determined linear system, i.e.
+// \\(X = \mathrm{argmin}_{Z \in \mathbb{C}^{n \times k} } ||Z||_F^2 \\),
+// subject to \\(A Z = B\\). Notice that the fast path is only numerically stable
+// when \\(A\\) is numerically full rank and has a condition number
+// \\(\mathrm{cond}(A) \lt \frac{1}{\sqrt{\epsilon_{mach} } }\\) or\\(\lambda\\) is
+// sufficiently large.
+//
+// If `fast` is `False` an algorithm based on the numerically robust complete
+// orthogonal decomposition is used. This computes the minimum-norm
+// least-squares solution, even when \\(A\\) is rank deficient. This path is
+// typically 6-7 times slower than the fast path. If `fast` is `False` then
+// `l2_regularizer` is ignored.
 //
 // Arguments:
-//	input_sizes: An integer vector representing the shape of `input`, based
-// on `data_format`.  For example, if `data_format` is 'NHWC' then
-//  `input` is a 4-D `[batch, height, width, channels]` tensor.
-//	filter: 4-D with shape
-// `[filter_height, filter_width, in_channels, depthwise_multiplier]`.
-//	out_backprop: 4-D with shape  based on `data_format`.
-// For example, if `data_format` is 'NHWC' then
-// out_backprop shape is `[batch, out_height, out_width, out_channels]`.
-// Gradients w.r.t. the output of the convolution.
-//	strides: The stride of the sliding window for each dimension of the input
-// of the convolution.
-//	padding: The type of padding algorithm to use.
+//	matrix: Shape is `[..., M, N]`.
+//	rhs: Shape is `[..., M, K]`.
+//	l2_regularizer: Scalar tensor.
 //
-// Returns 4-D with shape according to `data_format`.  For example, if
-// `data_format` is 'NHWC', output shape is `[batch, in_height,
-// in_width, in_channels]`.  Gradient w.r.t. the input of the
-// convolution.
-func DepthwiseConv2dNativeBackpropInput(scope *Scope, input_sizes tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...DepthwiseConv2dNativeBackpropInputAttr) (output tf.Output) {
+// @compatibility(numpy)
+// Equivalent to np.linalg.lstsq
+// @end_compatibility
+//
+// Returns Shape is `[..., N, K]`.
+func MatrixSolveLs(scope *Scope, matrix tf.Output, rhs tf.Output, l2_regularizer tf.Output, optional ...MatrixSolveLsAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "DepthwiseConv2dNativeBackpropInput",
+		Type: "MatrixSolveLs",
 		Input: []tf.Input{
-			input_sizes, filter, out_backprop,
+			matrix, rhs, l2_regularizer,
 		},
 		Attrs: attrs,
 	}
@@ -16476,145 +15805,160 @@ func DepthwiseConv2dNativeBackpropInput(scope *Scope, input_sizes tf.Output, fil
 	return op.Output(0)
 }
 
-// Adds sparse updates to the variable referenced by `resource`.
-//
-// This operation computes
-//
-//     # Scalar indices
-//     ref[indices, ...] += updates[...]
-//
-//     # Vector indices (for each i)
-//     ref[indices[i], ...] += updates[i, ...]
-//
-//     # High rank indices (for each i, ..., j)
-//     ref[indices[i, ..., j], ...] += updates[i, ..., j, ...]
-//
-// Duplicate entries are handled correctly: if multiple `indices` reference
-// the same location, their contributions add.
-//
-// Requires `updates.shape = indices.shape + ref.shape[1:]`.
-//
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt>
-// </div>
-//
-// Arguments:
-//	resource: Should be from a `Variable` node.
-//	indices: A tensor of indices into the first dimension of `ref`.
-//	updates: A tensor of updated values to add to `ref`.
+// Elementwise computes the bitwise OR of `x` and `y`.
 //
-// Returns the created operation.
-func ResourceScatterAdd(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
+// The result will have those bits set, that are set in `x`, `y` or both. The
+// computation is performed on the underlying representations of `x` and `y`.
+func BitwiseOr(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceScatterAdd",
+		Type: "BitwiseOr",
 		Input: []tf.Input{
-			resource, indices, updates,
+			x, y,
 		},
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Says whether the targets are in the top `K` predictions.
-//
-// This outputs a `batch_size` bool array, an entry `out[i]` is `true` if the
-// prediction for the target class is among the top `k` predictions among
-// all predictions for example `i`. Note that the behavior of `InTopK` differs
-// from the `TopK` op in its handling of ties; if multiple classes have the
-// same prediction value and straddle the top-`k` boundary, all of those
-// classes are considered to be in the top `k`.
+// SparseToSparseSetOperationAttr is an optional argument to SparseToSparseSetOperation.
+type SparseToSparseSetOperationAttr func(optionalAttr)
+
+// SparseToSparseSetOperationValidateIndices sets the optional validate_indices attribute to value.
+// If not specified, defaults to true
+func SparseToSparseSetOperationValidateIndices(value bool) SparseToSparseSetOperationAttr {
+	return func(m optionalAttr) {
+		m["validate_indices"] = value
+	}
+}
+
+// Applies set operation along last dimension of 2 `SparseTensor` inputs.
 //
-// More formally, let
+// See SetOperationOp::SetOperationFromContext for values of `set_operation`.
 //
-//   \\(predictions_i\\) be the predictions for all classes for example `i`,
-//   \\(targets_i\\) be the target class for example `i`,
-//   \\(out_i\\) be the output for example `i`,
+// If `validate_indices` is `True`, `SparseToSparseSetOperation` validates the
+// order and range of `set1` and `set2` indices.
 //
-// $$out_i = predictions_{i, targets_i} \in TopKIncludingTies(predictions_i)$$
+// Input `set1` is a `SparseTensor` represented by `set1_indices`, `set1_values`,
+// and `set1_shape`. For `set1` ranked `n`, 1st `n-1` dimensions must be the same
+// as `set2`. Dimension `n` contains values in a set, duplicates are allowed but
+// ignored.
+//
+// Input `set2` is a `SparseTensor` represented by `set2_indices`, `set2_values`,
+// and `set2_shape`. For `set2` ranked `n`, 1st `n-1` dimensions must be the same
+// as `set1`. Dimension `n` contains values in a set, duplicates are allowed but
+// ignored.
+//
+// If `validate_indices` is `True`, this op validates the order and range of `set1`
+// and `set2` indices.
+//
+// Output `result` is a `SparseTensor` represented by `result_indices`,
+// `result_values`, and `result_shape`. For `set1` and `set2` ranked `n`, this
+// has rank `n` and the same 1st `n-1` dimensions as `set1` and `set2`. The `nth`
+// dimension contains the result of `set_operation` applied to the corresponding
+// `[0...n-1]` dimension of `set`.
 //
 // Arguments:
-//	predictions: A `batch_size` x `classes` tensor.
-//	targets: A `batch_size` vector of class ids.
-//	k: Number of top elements to look at for computing precision.
+//	set1_indices: 2D `Tensor`, indices of a `SparseTensor`. Must be in row-major
+// order.
+//	set1_values: 1D `Tensor`, values of a `SparseTensor`. Must be in row-major
+// order.
+//	set1_shape: 1D `Tensor`, shape of a `SparseTensor`. `set1_shape[0...n-1]` must
+// be the same as `set2_shape[0...n-1]`, `set1_shape[n]` is the
+// max set size across `0...n-1` dimensions.
+//	set2_indices: 2D `Tensor`, indices of a `SparseTensor`. Must be in row-major
+// order.
+//	set2_values: 1D `Tensor`, values of a `SparseTensor`. Must be in row-major
+// order.
+//	set2_shape: 1D `Tensor`, shape of a `SparseTensor`. `set2_shape[0...n-1]` must
+// be the same as `set1_shape[0...n-1]`, `set2_shape[n]` is the
+// max set size across `0...n-1` dimensions.
 //
-// Returns Computed Precision at `k` as a `bool Tensor`.
-func InTopK(scope *Scope, predictions tf.Output, targets tf.Output, k int64) (precision tf.Output) {
+//
+// Returns 2D indices of a `SparseTensor`.1D values of a `SparseTensor`.1D `Tensor` shape of a `SparseTensor`. `result_shape[0...n-1]` is
+// the same as the 1st `n-1` dimensions of `set1` and `set2`, `result_shape[n]`
+// is the max result set size across all `0...n-1` dimensions.
+func SparseToSparseSetOperation(scope *Scope, set1_indices tf.Output, set1_values tf.Output, set1_shape tf.Output, set2_indices tf.Output, set2_values tf.Output, set2_shape tf.Output, set_operation string, optional ...SparseToSparseSetOperationAttr) (result_indices tf.Output, result_values tf.Output, result_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"k": k}
+	attrs := map[string]interface{}{"set_operation": set_operation}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "InTopK",
+		Type: "SparseToSparseSetOperation",
 		Input: []tf.Input{
-			predictions, targets,
+			set1_indices, set1_values, set1_shape, set2_indices, set2_values, set2_shape,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Computes the gradient for the inverse of `x` wrt its input.
+// Computes numerical negative value element-wise.
 //
-// Specifically, `grad = -dy * y*y`, where `y = 1/x`, and `dy`
-// is the corresponding input gradient.
-func ReciprocalGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
+// I.e., \\(y = -x\\).
+func Neg(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ReciprocalGrad",
+		Type: "Neg",
 		Input: []tf.Input{
-			y, dy,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns the min of x and y (i.e. x < y ? x : y) element-wise.
-//
-// *NOTE*: `Minimum` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Minimum(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
+// FakeQuantWithMinMaxVarsAttr is an optional argument to FakeQuantWithMinMaxVars.
+type FakeQuantWithMinMaxVarsAttr func(optionalAttr)
+
+// FakeQuantWithMinMaxVarsNumBits sets the optional num_bits attribute to value.
+// If not specified, defaults to 8
+func FakeQuantWithMinMaxVarsNumBits(value int64) FakeQuantWithMinMaxVarsAttr {
+	return func(m optionalAttr) {
+		m["num_bits"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "Minimum",
-		Input: []tf.Input{
-			x, y,
-		},
+}
+
+// FakeQuantWithMinMaxVarsNarrowRange sets the optional narrow_range attribute to value.
+// If not specified, defaults to false
+func FakeQuantWithMinMaxVarsNarrowRange(value bool) FakeQuantWithMinMaxVarsAttr {
+	return func(m optionalAttr) {
+		m["narrow_range"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Returns the element-wise sum of a list of tensors.
-//
-// `tf.accumulate_n_v2` performs the same operation as `tf.add_n`, but does not
-// wait for all of its inputs to be ready before beginning to sum. This can
-// save memory if inputs are ready at different times, since minimum temporary
-// storage is proportional to the output size rather than the inputs size.
+// Fake-quantize the 'inputs' tensor of type float via global float scalars `min`
 //
-// Unlike the original `accumulate_n`, `accumulate_n_v2` is differentiable.
+// and `max` to 'outputs' tensor of same shape as `inputs`.
 //
-// Returns a `Tensor` of same shape and type as the elements of `inputs`.
+// `[min; max]` define the clamping range for the `inputs` data.
+// `inputs` values are quantized into the quantization range (`[0; 2^num_bits - 1]`
+// when `narrow_range` is false and `[1; 2^num_bits - 1]` when it is true) and
+// then de-quantized and output as floats in `[min; max]` interval.
+// `num_bits` is the bitwidth of the quantization; between 2 and 8, inclusive.
 //
-// Arguments:
-//	inputs: A list of `Tensor` objects, each with same shape and type.
-//	shape: Shape of elements of `inputs`.
-func AccumulateNV2(scope *Scope, inputs []tf.Output, shape tf.Shape) (sum tf.Output) {
+// This operation has a gradient and thus allows for training `min` and `max`
+// values.
+func FakeQuantWithMinMaxVars(scope *Scope, inputs tf.Output, min tf.Output, max tf.Output, optional ...FakeQuantWithMinMaxVarsAttr) (outputs tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"shape": shape}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "AccumulateNV2",
+		Type: "FakeQuantWithMinMaxVars",
 		Input: []tf.Input{
-			tf.OutputList(inputs),
+			inputs, min, max,
 		},
 		Attrs: attrs,
 	}
@@ -16622,426 +15966,340 @@ func AccumulateNV2(scope *Scope, inputs []tf.Output, shape tf.Shape) (sum tf.Out
 	return op.Output(0)
 }
 
-// Convert the quantized 'input' tensor into a lower-precision 'output', using the
-//
-// actual distribution of the values to maximize the usage of the lower bit depth
-// and adjusting the output min and max ranges accordingly.
-//
-// [input_min, input_max] are scalar floats that specify the range for the float
-// interpretation of the 'input' data. For example, if input_min is -1.0f and
-// input_max is 1.0f, and we are dealing with quint16 quantized data, then a 0
-// value in the 16-bit data should be interpreted as -1.0f, and a 65535 means 1.0f.
-//
-// This operator tries to squeeze as much precision as possible into an output with
-// a lower bit depth by calculating the actual min and max values found in the
-// data. For example, maybe that quint16 input has no values lower than 16,384 and
-// none higher than 49,152. That means only half the range is actually needed, all
-// the float interpretations are between -0.5f and 0.5f, so if we want to compress
-// the data into a quint8 output, we can use that range rather than the theoretical
-// -1.0f to 1.0f that is suggested by the input min and max.
+// Returns the element-wise min of two SparseTensors.
 //
-// In practice, this is most useful for taking output from operations like
-// QuantizedMatMul that can produce higher bit-depth outputs than their inputs and
-// may have large potential output ranges, but in practice have a distribution of
-// input values that only uses a small fraction of the possible range. By feeding
-// that output into this operator, we can reduce it from 32 bits down to 8 with
-// minimal loss of accuracy.
+// Assumes the two SparseTensors have the same shape, i.e., no broadcasting.
 //
 // Arguments:
+//	a_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, in the canonical lexicographic ordering.
+//	a_values: 1-D.  `N` non-empty values corresponding to `a_indices`.
+//	a_shape: 1-D.  Shape of the input SparseTensor.
+//	b_indices: counterpart to `a_indices` for the other operand.
+//	b_values: counterpart to `a_values` for the other operand; must be of the same dtype.
+//	b_shape: counterpart to `a_shape` for the other operand; the two shapes must be equal.
 //
-//	input_min: The float value that the minimum quantized input value represents.
-//	input_max: The float value that the maximum quantized input value represents.
-//	out_type: The type of the output. Should be a lower bit depth than Tinput.
-//
-// Returns The float value that the minimum quantized output value represents.The float value that the maximum quantized output value represents.
-func QuantizeDownAndShrinkRange(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output, out_type tf.DataType) (output tf.Output, output_min tf.Output, output_max tf.Output) {
+// Returns 2-D.  The indices of the output SparseTensor.1-D.  The values of the output SparseTensor.
+func SparseSparseMinimum(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b_indices tf.Output, b_values tf.Output, b_shape tf.Output) (output_indices tf.Output, output_values tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"out_type": out_type}
 	opspec := tf.OpSpec{
-		Type: "QuantizeDownAndShrinkRange",
+		Type: "SparseSparseMinimum",
 		Input: []tf.Input{
-			input, input_min, input_max,
+			a_indices, a_values, a_shape, b_indices, b_values, b_shape,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0), op.Output(1)
 }
 
-// RandomGammaAttr is an optional argument to RandomGamma.
-type RandomGammaAttr func(optionalAttr)
+// TakeManySparseFromTensorsMapAttr is an optional argument to TakeManySparseFromTensorsMap.
+type TakeManySparseFromTensorsMapAttr func(optionalAttr)
 
-// RandomGammaSeed sets the optional seed attribute to value.
+// TakeManySparseFromTensorsMapContainer sets the optional container attribute to value.
 //
-// value: If either `seed` or `seed2` are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func RandomGammaSeed(value int64) RandomGammaAttr {
+// value: The container name for the `SparseTensorsMap` read by this op.
+// If not specified, defaults to ""
+func TakeManySparseFromTensorsMapContainer(value string) TakeManySparseFromTensorsMapAttr {
 	return func(m optionalAttr) {
-		m["seed"] = value
+		m["container"] = value
 	}
 }
 
-// RandomGammaSeed2 sets the optional seed2 attribute to value.
+// TakeManySparseFromTensorsMapSharedName sets the optional shared_name attribute to value.
 //
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func RandomGammaSeed2(value int64) RandomGammaAttr {
+// value: The shared name for the `SparseTensorsMap` read by this op.
+// It should not be blank; rather the `shared_name` or unique Operation name
+// of the Op that created the original `SparseTensorsMap` should be used.
+// If not specified, defaults to ""
+func TakeManySparseFromTensorsMapSharedName(value string) TakeManySparseFromTensorsMapAttr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["shared_name"] = value
 	}
 }
 
-// Outputs random values from the Gamma distribution(s) described by alpha.
+// Read `SparseTensors` from a `SparseTensorsMap` and concatenate them.
 //
-// This op uses the algorithm by Marsaglia et al. to acquire samples via
-// transformation-rejection from pairs of uniform and normal random variables.
-// See http://dl.acm.org/citation.cfm?id=358414
+// The input `sparse_handles` must be an `int64` matrix of shape `[N, 1]` where
+// `N` is the minibatch size and the rows correspond to the output handles of
+// `AddSparseToTensorsMap` or `AddManySparseToTensorsMap`.  The ranks of the
+// original `SparseTensor` objects that went into the given input ops must all
+// match.  When the final `SparseTensor` is created, it has rank one
+// higher than the ranks of the incoming `SparseTensor` objects
+// (they have been concatenated along a new row dimension on the left).
 //
-// Arguments:
-//	shape: 1-D integer tensor. Shape of independent samples to draw from each
-// distribution described by the shape parameters given in alpha.
-//	alpha: A tensor in which each scalar is a "shape" parameter describing the
-// associated gamma distribution.
+// The output `SparseTensor` object's shape values for all dimensions but the
+// first are the max across the input `SparseTensor` objects' shape values
+// for the corresponding dimensions.  Its first shape value is `N`, the minibatch
+// size.
 //
-// Returns A tensor with shape `shape + shape(alpha)`. Each slice
-// `[:, ..., :, i0, i1, ...iN]` contains the samples drawn for
-// `alpha[i0, i1, ...iN]`. The dtype of the output matches the dtype of alpha.
-func RandomGamma(scope *Scope, shape tf.Output, alpha tf.Output, optional ...RandomGammaAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "RandomGamma",
-		Input: []tf.Input{
-			shape, alpha,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// RandomUniformIntAttr is an optional argument to RandomUniformInt.
-type RandomUniformIntAttr func(optionalAttr)
-
-// RandomUniformIntSeed sets the optional seed attribute to value.
+// The input `SparseTensor` objects' indices are assumed ordered in
+// standard lexicographic order.  If this is not the case, after this
+// step run `SparseReorder` to restore index ordering.
 //
-// value: If either `seed` or `seed2` are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func RandomUniformIntSeed(value int64) RandomUniformIntAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
-
-// RandomUniformIntSeed2 sets the optional seed2 attribute to value.
+// For example, if the handles represent an input, which is a `[2, 3]` matrix
+// representing two original `SparseTensor` objects:
 //
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func RandomUniformIntSeed2(value int64) RandomUniformIntAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
-	}
-}
-
-// Outputs random integers from a uniform distribution.
+// ```
+//     index = [ 0]
+//             [10]
+//             [20]
+//     values = [1, 2, 3]
+//     shape = [50]
+// ```
 //
-// The generated values are uniform integers in the range `[minval, maxval)`.
-// The lower bound `minval` is included in the range, while the upper bound
-// `maxval` is excluded.
+// and
 //
-// The random integers are slightly biased unless `maxval - minval` is an exact
-// power of two.  The bias is small for values of `maxval - minval` significantly
-// smaller than the range of the output (either `2^32` or `2^64`).
+// ```
+//     index = [ 2]
+//             [10]
+//     values = [4, 5]
+//     shape = [30]
+// ```
+//
+// then the final `SparseTensor` will be:
+//
+// ```
+//     index = [0  0]
+//             [0 10]
+//             [0 20]
+//             [1  2]
+//             [1 10]
+//     values = [1, 2, 3, 4, 5]
+//     shape = [2 50]
+// ```
 //
 // Arguments:
-//	shape: The shape of the output tensor.
-//	minval: 0-D.  Inclusive lower bound on the generated integers.
-//	maxval: 0-D.  Exclusive upper bound on the generated integers.
+//	sparse_handles: 1-D, The `N` serialized `SparseTensor` objects.
+// Shape: `[N]`.
+//	dtype: The `dtype` of the `SparseTensor` objects stored in the
+// `SparseTensorsMap`.
 //
-// Returns A tensor of the specified shape filled with uniform random integers.
-func RandomUniformInt(scope *Scope, shape tf.Output, minval tf.Output, maxval tf.Output, optional ...RandomUniformIntAttr) (output tf.Output) {
+// Returns 2-D.  The `indices` of the minibatch `SparseTensor`.1-D.  The `values` of the minibatch `SparseTensor`.1-D.  The `shape` of the minibatch `SparseTensor`.
+func TakeManySparseFromTensorsMap(scope *Scope, sparse_handles tf.Output, dtype tf.DataType, optional ...TakeManySparseFromTensorsMapAttr) (sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"dtype": dtype}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "RandomUniformInt",
+		Type: "TakeManySparseFromTensorsMap",
 		Input: []tf.Input{
-			shape, minval, maxval,
+			sparse_handles,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// SkipgramAttr is an optional argument to Skipgram.
-type SkipgramAttr func(optionalAttr)
-
-// SkipgramWindowSize sets the optional window_size attribute to value.
-//
-// value: The number of words to predict to the left and right of the target.
-// If not specified, defaults to 5
-func SkipgramWindowSize(value int64) SkipgramAttr {
-	return func(m optionalAttr) {
-		m["window_size"] = value
-	}
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// SkipgramMinCount sets the optional min_count attribute to value.
-//
-// value: The minimum number of word occurrences for it to be included in the
-// vocabulary.
-// If not specified, defaults to 5
-func SkipgramMinCount(value int64) SkipgramAttr {
-	return func(m optionalAttr) {
-		m["min_count"] = value
-	}
-}
+// MaxPoolAttr is an optional argument to MaxPool.
+type MaxPoolAttr func(optionalAttr)
 
-// SkipgramSubsample sets the optional subsample attribute to value.
+// MaxPoolDataFormat sets the optional data_format attribute to value.
 //
-// value: Threshold for word occurrence. Words that appear with higher
-// frequency will be randomly down-sampled. Set to 0 to disable.
-// If not specified, defaults to 0.001
-func SkipgramSubsample(value float32) SkipgramAttr {
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// If not specified, defaults to "NHWC"
+func MaxPoolDataFormat(value string) MaxPoolAttr {
 	return func(m optionalAttr) {
-		m["subsample"] = value
+		m["data_format"] = value
 	}
 }
 
-// Parses a text file and creates a batch of examples.
-//
-// DEPRECATED at GraphDef version 19: Moving word2vec into tensorflow_models/tutorials and deprecating its ops here as a result
+// Performs max pooling on the input.
 //
 // Arguments:
-//	filename: The corpus's text file name.
-//	batch_size: The size of produced batch.
+//	input: 4-D input to pool over.
+//	ksize: The size of the window for each dimension of the input tensor.
+//	strides: The stride of the sliding window for each dimension of the
+// input tensor.
+//	padding: The type of padding algorithm to use.
 //
-// Returns A vector of words in the corpus.Frequencies of words. Sorted in the non-ascending order.Number of words per epoch in the data file.The current epoch number.The total number of words processed so far.A vector of word ids.A vector of word ids.
-func Skipgram(scope *Scope, filename string, batch_size int64, optional ...SkipgramAttr) (vocab_word tf.Output, vocab_freq tf.Output, words_per_epoch tf.Output, current_epoch tf.Output, total_words_processed tf.Output, examples tf.Output, labels tf.Output) {
+// Returns The max pooled output tensor.
+func MaxPool(scope *Scope, input tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPoolAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"filename": filename, "batch_size": batch_size}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Skipgram",
-
+		Type: "MaxPool",
+		Input: []tf.Input{
+			input,
+		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4), op.Output(5), op.Output(6)
+	return op.Output(0)
 }
 
-// StringToNumberAttr is an optional argument to StringToNumber.
-type StringToNumberAttr func(optionalAttr)
-
-// StringToNumberOutType sets the optional out_type attribute to value.
+// Says whether the targets are in the top `K` predictions.
 //
-// value: The numeric type to interpret each string in `string_tensor` as.
-// If not specified, defaults to DT_FLOAT
-func StringToNumberOutType(value tf.DataType) StringToNumberAttr {
-	return func(m optionalAttr) {
-		m["out_type"] = value
-	}
-}
-
-// Converts each string in the input Tensor to the specified numeric type.
+// This outputs a `batch_size` bool array, an entry `out[i]` is `true` if the
+// prediction for the target class is among the top `k` predictions among
+// all predictions for example `i`. Note that the behavior of `InTopK` differs
+// from the `TopK` op in its handling of ties; if multiple classes have the
+// same prediction value and straddle the top-`k` boundary, all of those
+// classes are considered to be in the top `k`.
 //
-// (Note that int32 overflow results in an error while float overflow
-// results in a rounded value.)
+// More formally, let
 //
-// Returns A Tensor of the same shape as the input `string_tensor`.
-func StringToNumber(scope *Scope, string_tensor tf.Output, optional ...StringToNumberAttr) (output tf.Output) {
+//   \\(predictions_i\\) be the predictions for all classes for example `i`,
+//   \\(targets_i\\) be the target class for example `i`,
+//   \\(out_i\\) be the output for example `i`,
+//
+// $$out_i = predictions_{i, targets_i} \in TopKIncludingTies(predictions_i)$$
+//
+// Arguments:
+//	predictions: A `batch_size` x `classes` tensor.
+//	targets: A `batch_size` vector of class ids.
+//	k: Number of top elements to look at for computing precision.
+//
+// Returns Computed precision at `k` as a `bool Tensor`.
+func InTopKV2(scope *Scope, predictions tf.Output, targets tf.Output, k tf.Output) (precision tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "StringToNumber",
+		Type: "InTopKV2",
 		Input: []tf.Input{
-			string_tensor,
+			predictions, targets, k,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResourceApplyFtrlV2Attr is an optional argument to ResourceApplyFtrlV2.
-type ResourceApplyFtrlV2Attr func(optionalAttr)
-
-// ResourceApplyFtrlV2UseLocking sets the optional use_locking attribute to value.
-//
-// value: If `True`, updating of the var and accum tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceApplyFtrlV2UseLocking(value bool) ResourceApplyFtrlV2Attr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// Update '*var' according to the Ftrl-proximal scheme.
+// Assigns a new value to a variable.
 //
-// grad_with_shrinkage = grad + 2 * l2_shrinkage * var
-// accum_new = accum + grad_with_shrinkage * grad_with_shrinkage
-// linear += grad_with_shrinkage +
-//     (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
-// quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
-// var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
-// accum = accum_new
+// Any ReadVariableOp with a control dependency on this op is guaranteed to return
+// this value or a subsequent newer value of the variable.
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	linear: Should be from a Variable().
-//	grad: The gradient.
-//	lr: Scaling factor. Must be a scalar.
-//	l1: L1 regulariation. Must be a scalar.
-//	l2: L2 shrinkage regulariation. Must be a scalar.
-//
-//	lr_power: Scaling factor. Must be a scalar.
+//	resource: handle to the resource in which to store the variable.
+//	value: the value to set the new tensor to use.
 //
 // Returns the created operation.
-func ResourceApplyFtrlV2(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, l2_shrinkage tf.Output, lr_power tf.Output, optional ...ResourceApplyFtrlV2Attr) (o *tf.Operation) {
+func AssignVariableOp(scope *Scope, resource tf.Output, value tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyFtrlV2",
+		Type: "AssignVariableOp",
 		Input: []tf.Input{
-			var_, accum, linear, grad, lr, l1, l2, l2_shrinkage, lr_power,
+			resource, value,
 		},
-		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// TruncatedNormalAttr is an optional argument to TruncatedNormal.
-type TruncatedNormalAttr func(optionalAttr)
-
-// TruncatedNormalSeed sets the optional seed attribute to value.
+// Returns a tensor of ones with the same shape and type as x.
 //
-// value: If either `seed` or `seed2` are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func TruncatedNormalSeed(value int64) TruncatedNormalAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
-
-// TruncatedNormalSeed2 sets the optional seed2 attribute to value.
+// Arguments:
+//	x: a tensor of type T.
 //
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func TruncatedNormalSeed2(value int64) TruncatedNormalAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
+// Returns a tensor of the same shape and type as x but filled with ones.
+func OnesLike(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "OnesLike",
+		Input: []tf.Input{
+			x,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Outputs random values from a truncated normal distribution.
+// The gradient of SparseFillEmptyRows.
 //
-// The generated values follow a normal distribution with mean 0 and standard
-// deviation 1, except that values whose magnitude is more than 2 standard
-// deviations from the mean are dropped and re-picked.
+// Takes vectors reverse_index_map, shaped `[N]`, and grad_values,
+// shaped `[N_full]`, where `N_full >= N` and copies data into either
+// `d_values` or `d_default_value`.  Here `d_values` is shaped `[N]` and
+// `d_default_value` is a scalar.
+//
+//   d_values[j] = grad_values[reverse_index_map[j]]
+//   d_default_value = sum_{k : 0 .. N_full - 1} (
+//      grad_values[k] * 1{k not in reverse_index_map})
 //
 // Arguments:
-//	shape: The shape of the output tensor.
-//	dtype: The type of the output.
+//	reverse_index_map: 1-D.  The reverse index map from SparseFillEmptyRows.
+//	grad_values: 1-D.  The gradients from backprop.
 //
-// Returns A tensor of the specified shape filled with random truncated normal
-// values.
-func TruncatedNormal(scope *Scope, shape tf.Output, dtype tf.DataType, optional ...TruncatedNormalAttr) (output tf.Output) {
+// Returns 1-D.  The backprop into values.0-D.  The backprop into default_value.
+func SparseFillEmptyRowsGrad(scope *Scope, reverse_index_map tf.Output, grad_values tf.Output) (d_values tf.Output, d_default_value tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "TruncatedNormal",
+		Type: "SparseFillEmptyRowsGrad",
 		Input: []tf.Input{
-			shape,
+			reverse_index_map, grad_values,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// RandomShuffleAttr is an optional argument to RandomShuffle.
-type RandomShuffleAttr func(optionalAttr)
-
-// RandomShuffleSeed sets the optional seed attribute to value.
+// Computes scaled exponential linear: `scale * alpha * (exp(features) - 1)`
 //
-// value: If either `seed` or `seed2` are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func RandomShuffleSeed(value int64) RandomShuffleAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
+// if < 0, `scale * features` otherwise.
+//
+// See [Self-Normalizing Neural Networks](https://arxiv.org/abs/1706.02515)
+func Selu(scope *Scope, features tf.Output) (activations tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Selu",
+		Input: []tf.Input{
+			features,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// RandomShuffleSeed2 sets the optional seed2 attribute to value.
-//
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func RandomShuffleSeed2(value int64) RandomShuffleAttr {
+// SetSizeAttr is an optional argument to SetSize.
+type SetSizeAttr func(optionalAttr)
+
+// SetSizeValidateIndices sets the optional validate_indices attribute to value.
+// If not specified, defaults to true
+func SetSizeValidateIndices(value bool) SetSizeAttr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["validate_indices"] = value
 	}
 }
 
-// Randomly shuffles a tensor along its first dimension.
+// Number of unique elements along last dimension of input `set`.
 //
-//   The tensor is shuffled along dimension 0, such that each `value[j]` is mapped
-//   to one and only one `output[i]`. For example, a mapping that might occur for a
-//   3x2 tensor is:
+// Input `set` is a `SparseTensor` represented by `set_indices`, `set_values`,
+// and `set_shape`. The last dimension contains values in a set, duplicates are
+// allowed but ignored.
 //
-// ```
-// [[1, 2],       [[5, 6],
-//  [3, 4],  ==>   [1, 2],
-//  [5, 6]]        [3, 4]]
-// ```
+// If `validate_indices` is `True`, this op validates the order and range of `set`
+// indices.
 //
 // Arguments:
-//	value: The tensor to be shuffled.
+//	set_indices: 2D `Tensor`, indices of a `SparseTensor`.
+//	set_values: 1D `Tensor`, values of a `SparseTensor`.
+//	set_shape: 1D `Tensor`, shape of a `SparseTensor`.
 //
-// Returns A tensor of same shape and type as `value`, shuffled along its first
-// dimension.
-func RandomShuffle(scope *Scope, value tf.Output, optional ...RandomShuffleAttr) (output tf.Output) {
+// Returns For `set` ranked `n`, this is a `Tensor` with rank `n-1`, and the same 1st
+// `n-1` dimensions as `set`. Each value is the number of unique elements in
+// the corresponding `[0...n-1]` dimension of `set`.
+func SetSize(scope *Scope, set_indices tf.Output, set_values tf.Output, set_shape tf.Output, optional ...SetSizeAttr) (size tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -17050,9 +16308,9 @@ func RandomShuffle(scope *Scope, value tf.Output, optional ...RandomShuffleAttr)
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "RandomShuffle",
+		Type: "SetSize",
 		Input: []tf.Input{
-			value,
+			set_indices, set_values, set_shape,
 		},
 		Attrs: attrs,
 	}
@@ -17060,99 +16318,75 @@ func RandomShuffle(scope *Scope, value tf.Output, optional ...RandomShuffleAttr)
 	return op.Output(0)
 }
 
-// OrderedMapIncompleteSizeAttr is an optional argument to OrderedMapIncompleteSize.
-type OrderedMapIncompleteSizeAttr func(optionalAttr)
-
-// OrderedMapIncompleteSizeCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
+// Computes the sign and the log of the absolute value of the determinant of
 //
-// REQUIRES: value >= 0
-func OrderedMapIncompleteSizeCapacity(value int64) OrderedMapIncompleteSizeAttr {
-	return func(m optionalAttr) {
-		m["capacity"] = value
-	}
-}
-
-// OrderedMapIncompleteSizeMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
+// one or more square matrices.
 //
-// REQUIRES: value >= 0
-func OrderedMapIncompleteSizeMemoryLimit(value int64) OrderedMapIncompleteSizeAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
-	}
-}
-
-// OrderedMapIncompleteSizeContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func OrderedMapIncompleteSizeContainer(value string) OrderedMapIncompleteSizeAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// OrderedMapIncompleteSizeSharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func OrderedMapIncompleteSizeSharedName(value string) OrderedMapIncompleteSizeAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// Op returns the number of incomplete elements in the underlying container.
-func OrderedMapIncompleteSize(scope *Scope, dtypes []tf.DataType, optional ...OrderedMapIncompleteSizeAttr) (size tf.Output) {
+// The input is a tensor of shape `[N, M, M]` whose inner-most 2 dimensions
+// form square matrices. The outputs are two tensors containing the signs and
+// absolute values of the log determinants for all N input submatrices
+// `[..., :, :]` such that the determinant = sign*exp(log_abs_determinant).
+// The log_abs_determinant is computed as det(P)*sum(log(diag(LU))) where LU
+// is the LU decomposition of the input and P is the corresponding
+// permutation matrix.
+//
+// Arguments:
+//	input: Shape is `[N, M, M]`.
+//
+// Returns The signs of the log determinants of the inputs. Shape is `[N]`.The logs of the absolute values of the determinants
+// of the N input matrices.  Shape is `[N]`.
+func LogMatrixDeterminant(scope *Scope, input tf.Output) (sign tf.Output, log_abs_determinant tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "OrderedMapIncompleteSize",
-
-		Attrs: attrs,
+		Type: "LogMatrixDeterminant",
+		Input: []tf.Input{
+			input,
+		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// DecodeRawAttr is an optional argument to DecodeRaw.
-type DecodeRawAttr func(optionalAttr)
+// SumAttr is an optional argument to Sum.
+type SumAttr func(optionalAttr)
 
-// DecodeRawLittleEndian sets the optional little_endian attribute to value.
+// SumKeepDims sets the optional keep_dims attribute to value.
 //
-// value: Whether the input `bytes` are in little-endian order.
-// Ignored for `out_type` values that are stored in a single byte like
-// `uint8`.
-// If not specified, defaults to true
-func DecodeRawLittleEndian(value bool) DecodeRawAttr {
+// value: If true, retain reduced dimensions with length 1.
+// If not specified, defaults to false
+func SumKeepDims(value bool) SumAttr {
 	return func(m optionalAttr) {
-		m["little_endian"] = value
+		m["keep_dims"] = value
 	}
 }
 
-// Reinterpret the bytes of a string as a vector of numbers.
+// Computes the sum of elements across dimensions of a tensor.
 //
-// Arguments:
-//	bytes: All the elements must have the same length.
+// Reduces `input` along the dimensions given in `axis`. Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `axis`. If `keep_dims` is true, the reduced dimensions are
+// retained with length 1.
 //
+// Arguments:
+//	input: The tensor to reduce.
+//	axis: The dimensions to reduce. Must be in the range
+// `[-rank(input), rank(input))`.
 //
-// Returns A Tensor with one more dimension than the input `bytes`.  The
-// added dimension will have size equal to the length of the elements
-// of `bytes` divided by the number of bytes to represent `out_type`.
-func DecodeRaw(scope *Scope, bytes tf.Output, out_type tf.DataType, optional ...DecodeRawAttr) (output tf.Output) {
+// Returns The reduced tensor.
+func Sum(scope *Scope, input tf.Output, axis tf.Output, optional ...SumAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"out_type": out_type}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "DecodeRaw",
+		Type: "Sum",
 		Input: []tf.Input{
-			bytes,
+			input, axis,
 		},
 		Attrs: attrs,
 	}
@@ -17160,140 +16394,105 @@ func DecodeRaw(scope *Scope, bytes tf.Output, out_type tf.DataType, optional ...
 	return op.Output(0)
 }
 
-// Copy a tensor setting everything outside a central band in each innermost matrix
-//
-// to zero.
-//
-// The `band` part is computed as follows:
-// Assume `input` has `k` dimensions `[I, J, K, ..., M, N]`, then the output is a
-// tensor with the same shape where
-//
-// `band[i, j, k, ..., m, n] = in_band(m, n) * input[i, j, k, ..., m, n]`.
+// Delete the tensor specified by its handle in the session.
 //
-// The indicator function
+// Arguments:
+//	handle: The handle for a tensor stored in the session state.
 //
-// `in_band(m, n) = (num_lower < 0 || (m-n) <= num_lower)) &&
-//                  (num_upper < 0 || (n-m) <= num_upper)`.
-//
-// For example:
-//
-// ```
-// # if 'input' is [[ 0,  1,  2, 3]
-//                  [-1,  0,  1, 2]
-//                  [-2, -1,  0, 1]
-//                  [-3, -2, -1, 0]],
-//
-// tf.matrix_band_part(input, 1, -1) ==> [[ 0,  1,  2, 3]
-//                                        [-1,  0,  1, 2]
-//                                        [ 0, -1,  0, 1]
-//                                        [ 0,  0, -1, 0]],
-//
-// tf.matrix_band_part(input, 2, 1) ==> [[ 0,  1,  0, 0]
-//                                       [-1,  0,  1, 0]
-//                                       [-2, -1,  0, 1]
-//                                       [ 0, -2, -1, 0]]
-// ```
+// Returns the created operation.
+func DeleteSessionTensor(scope *Scope, handle tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "DeleteSessionTensor",
+		Input: []tf.Input{
+			handle,
+		},
+	}
+	return scope.AddOperation(opspec)
+}
+
+// L2 Loss.
 //
-// Useful special cases:
+// Computes half the L2 norm of a tensor without the `sqrt`:
 //
-// ```
-//  tf.matrix_band_part(input, 0, -1) ==> Upper triangular part.
-//  tf.matrix_band_part(input, -1, 0) ==> Lower triangular part.
-//  tf.matrix_band_part(input, 0, 0) ==> Diagonal.
-// ```
+//     output = sum(t ** 2) / 2
 //
 // Arguments:
-//	input: Rank `k` tensor.
-//	num_lower: 0-D tensor. Number of subdiagonals to keep. If negative, keep entire
-// lower triangle.
-//	num_upper: 0-D tensor. Number of superdiagonals to keep. If negative, keep
-// entire upper triangle.
+//	t: Typically 2-D, but may have any dimensions.
 //
-// Returns Rank `k` tensor of the same shape as input. The extracted banded tensor.
-func MatrixBandPart(scope *Scope, input tf.Output, num_lower tf.Output, num_upper tf.Output) (band tf.Output) {
+// Returns 0-D.
+func L2Loss(scope *Scope, t tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "MatrixBandPart",
+		Type: "L2Loss",
 		Input: []tf.Input{
-			input, num_lower, num_upper,
+			t,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// QuantizedMatMulAttr is an optional argument to QuantizedMatMul.
-type QuantizedMatMulAttr func(optionalAttr)
+// DenseToSparseSetOperationAttr is an optional argument to DenseToSparseSetOperation.
+type DenseToSparseSetOperationAttr func(optionalAttr)
 
-// QuantizedMatMulToutput sets the optional Toutput attribute to value.
-// If not specified, defaults to DT_QINT32
-func QuantizedMatMulToutput(value tf.DataType) QuantizedMatMulAttr {
+// DenseToSparseSetOperationValidateIndices sets the optional validate_indices attribute to value.
+// If not specified, defaults to true
+func DenseToSparseSetOperationValidateIndices(value bool) DenseToSparseSetOperationAttr {
 	return func(m optionalAttr) {
-		m["Toutput"] = value
+		m["validate_indices"] = value
 	}
 }
 
-// QuantizedMatMulTransposeA sets the optional transpose_a attribute to value.
+// Applies set operation along last dimension of `Tensor` and `SparseTensor`.
 //
-// value: If true, `a` is transposed before multiplication.
-// If not specified, defaults to false
-func QuantizedMatMulTransposeA(value bool) QuantizedMatMulAttr {
-	return func(m optionalAttr) {
-		m["transpose_a"] = value
-	}
-}
-
-// QuantizedMatMulTransposeB sets the optional transpose_b attribute to value.
+// See SetOperationOp::SetOperationFromContext for values of `set_operation`.
 //
-// value: If true, `b` is transposed before multiplication.
-// If not specified, defaults to false
-func QuantizedMatMulTransposeB(value bool) QuantizedMatMulAttr {
-	return func(m optionalAttr) {
-		m["transpose_b"] = value
-	}
-}
-
-// QuantizedMatMulTactivation sets the optional Tactivation attribute to value.
+// Input `set2` is a `SparseTensor` represented by `set2_indices`, `set2_values`,
+// and `set2_shape`. For `set2` ranked `n`, 1st `n-1` dimensions must be the same
+// as `set1`. Dimension `n` contains values in a set, duplicates are allowed but
+// ignored.
 //
-// value: The type of output produced by activation function
-// following this operation.
-// If not specified, defaults to DT_QUINT8
-func QuantizedMatMulTactivation(value tf.DataType) QuantizedMatMulAttr {
-	return func(m optionalAttr) {
-		m["Tactivation"] = value
-	}
-}
-
-// Perform a quantized matrix multiplication of  `a` by the matrix `b`.
+// If `validate_indices` is `True`, this op validates the order and range of `set2`
+// indices.
 //
-// The inputs must be two-dimensional matrices and the inner dimension of
-// `a` (after being transposed if `transpose_a` is non-zero) must match the
-// outer dimension of `b` (after being transposed if `transposed_b` is
-// non-zero).
+// Output `result` is a `SparseTensor` represented by `result_indices`,
+// `result_values`, and `result_shape`. For `set1` and `set2` ranked `n`, this
+// has rank `n` and the same 1st `n-1` dimensions as `set1` and `set2`. The `nth`
+// dimension contains the result of `set_operation` applied to the corresponding
+// `[0...n-1]` dimension of `set`.
 //
 // Arguments:
-//	a: Must be a two-dimensional tensor.
-//	b: Must be a two-dimensional tensor.
-//	min_a: The float value that the lowest quantized `a` value represents.
-//	max_a: The float value that the highest quantized `a` value represents.
-//	min_b: The float value that the lowest quantized `b` value represents.
-//	max_b: The float value that the highest quantized `b` value represents.
+//	set1: `Tensor` with rank `n`. 1st `n-1` dimensions must be the same as `set2`.
+// Dimension `n` contains values in a set, duplicates are allowed but ignored.
+//	set2_indices: 2D `Tensor`, indices of a `SparseTensor`. Must be in row-major
+// order.
+//	set2_values: 1D `Tensor`, values of a `SparseTensor`. Must be in row-major
+// order.
+//	set2_shape: 1D `Tensor`, shape of a `SparseTensor`. `set2_shape[0...n-1]` must
+// be the same as the 1st `n-1` dimensions of `set1`, `result_shape[n]` is the
+// max set size across `n-1` dimensions.
 //
-// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
-func QuantizedMatMul(scope *Scope, a tf.Output, b tf.Output, min_a tf.Output, max_a tf.Output, min_b tf.Output, max_b tf.Output, optional ...QuantizedMatMulAttr) (out tf.Output, min_out tf.Output, max_out tf.Output) {
+//
+// Returns 2D indices of a `SparseTensor`.1D values of a `SparseTensor`.1D `Tensor` shape of a `SparseTensor`. `result_shape[0...n-1]` is
+// the same as the 1st `n-1` dimensions of `set1` and `set2`, `result_shape[n]`
+// is the max result set size across all `0...n-1` dimensions.
+func DenseToSparseSetOperation(scope *Scope, set1 tf.Output, set2_indices tf.Output, set2_values tf.Output, set2_shape tf.Output, set_operation string, optional ...DenseToSparseSetOperationAttr) (result_indices tf.Output, result_values tf.Output, result_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"set_operation": set_operation}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "QuantizedMatMul",
+		Type: "DenseToSparseSetOperation",
 		Input: []tf.Input{
-			a, b, min_a, max_a, min_b, max_b,
+			set1, set2_indices, set2_values, set2_shape,
 		},
 		Attrs: attrs,
 	}
@@ -17301,52 +16500,85 @@ func QuantizedMatMul(scope *Scope, a tf.Output, b tf.Output, min_a tf.Output, ma
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Does nothing. Serves as a control trigger for scheduling.
+// Subtracts a value from the current value of a variable.
 //
-// Only useful as a placeholder for control edges.
+// Any ReadVariableOp which depends directly or indirectly on this assign is
+// guaranteed to see the incremented value or a subsequent newer one.
+//
+// Outputs the incremented value, which can be used to totally order the
+// increments to this variable.
+//
+// Arguments:
+//	resource: handle to the resource in which to store the variable.
+//	value: the value by which the variable will be incremented.
 //
 // Returns the created operation.
-func ControlTrigger(scope *Scope) (o *tf.Operation) {
+func AssignSubVariableOp(scope *Scope, resource tf.Output, value tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ControlTrigger",
+		Type: "AssignSubVariableOp",
+		Input: []tf.Input{
+			resource, value,
+		},
 	}
 	return scope.AddOperation(opspec)
 }
 
-// Batch normalization.
+// RestoreAttr is an optional argument to Restore.
+type RestoreAttr func(optionalAttr)
+
+// RestorePreferredShard sets the optional preferred_shard attribute to value.
 //
-// DEPRECATED at GraphDef version 9: Use tf.nn.batch_normalization()
+// value: Index of file to open first if multiple files match
+// `file_pattern`.
+// If not specified, defaults to -1
+func RestorePreferredShard(value int64) RestoreAttr {
+	return func(m optionalAttr) {
+		m["preferred_shard"] = value
+	}
+}
+
+// Restores a tensor from checkpoint files.
 //
-// This op is deprecated. Prefer `tf.nn.batch_normalization`.
+// Reads a tensor stored in one or several files. If there are several files (for
+// instance because a tensor was saved as slices), `file_pattern` may contain
+// wildcard symbols (`*` and `?`) in the filename portion only, not in the
+// directory portion.
+//
+// If a `file_pattern` matches several files, `preferred_shard` can be used to hint
+// in which file the requested tensor is likely to be found. This op will first
+// open the file at index `preferred_shard` in the list of matching files and try
+// to restore tensors from that file.  Only if some tensors or tensor slices are
+// not found in that first file, then the Op opens all the files. Setting
+// `preferred_shard` to match the value passed as the `shard` input
+// of a matching `Save` Op may speed up Restore.  This attribute only affects
+// performance, not correctness.  The default value -1 means files are processed in
+// order.
+//
+// See also `RestoreSlice`.
 //
 // Arguments:
-//	t: A 4D input Tensor.
-//	m: A 1D mean Tensor with size matching the last dimension of t.
-// This is the first output from tf.nn.moments,
-// or a saved moving average thereof.
-//	v: A 1D variance Tensor with size matching the last dimension of t.
-// This is the second output from tf.nn.moments,
-// or a saved moving average thereof.
-//	beta: A 1D beta Tensor with size matching the last dimension of t.
-// An offset to be added to the normalized tensor.
-//	gamma: A 1D gamma Tensor with size matching the last dimension of t.
-// If "scale_after_normalization" is true, this tensor will be multiplied
-// with the normalized tensor.
-//	variance_epsilon: A small float number to avoid dividing by 0.
-//	scale_after_normalization: A bool indicating whether the resulted tensor
-// needs to be multiplied with gamma.
-func BatchNormWithGlobalNormalization(scope *Scope, t tf.Output, m tf.Output, v tf.Output, beta tf.Output, gamma tf.Output, variance_epsilon float32, scale_after_normalization bool) (result tf.Output) {
+//	file_pattern: Must have a single element. The pattern of the files from
+// which we read the tensor.
+//	tensor_name: Must have a single element. The name of the tensor to be
+// restored.
+//	dt: The type of the tensor to be restored.
+//
+// Returns The restored tensor.
+func Restore(scope *Scope, file_pattern tf.Output, tensor_name tf.Output, dt tf.DataType, optional ...RestoreAttr) (tensor tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"variance_epsilon": variance_epsilon, "scale_after_normalization": scale_after_normalization}
+	attrs := map[string]interface{}{"dt": dt}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "BatchNormWithGlobalNormalization",
+		Type: "Restore",
 		Input: []tf.Input{
-			t, m, v, beta, gamma,
+			file_pattern, tensor_name,
 		},
 		Attrs: attrs,
 	}
@@ -17354,51 +16586,35 @@ func BatchNormWithGlobalNormalization(scope *Scope, t tf.Output, m tf.Output, v
 	return op.Output(0)
 }
 
-// Deprecated. Use TensorArrayReadV3
+// QuantizedResizeBilinearAttr is an optional argument to QuantizedResizeBilinear.
+type QuantizedResizeBilinearAttr func(optionalAttr)
+
+// QuantizedResizeBilinearAlignCorners sets the optional align_corners attribute to value.
 //
-// DEPRECATED at GraphDef version 26: Use TensorArrayReadV3
-func TensorArrayReadV2(scope *Scope, handle tf.Output, index tf.Output, flow_in tf.Output, dtype tf.DataType) (value tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"dtype": dtype}
-	opspec := tf.OpSpec{
-		Type: "TensorArrayReadV2",
-		Input: []tf.Input{
-			handle, index, flow_in,
-		},
-		Attrs: attrs,
+// value: If true, rescale input by (new_height - 1) / (height - 1), which
+// exactly aligns the 4 corners of images and resized images. If false, rescale
+// by new_height / height. Treat similarly the width dimension.
+// If not specified, defaults to false
+func QuantizedResizeBilinearAlignCorners(value bool) QuantizedResizeBilinearAttr {
+	return func(m optionalAttr) {
+		m["align_corners"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// QuantizedMulAttr is an optional argument to QuantizedMul.
-type QuantizedMulAttr func(optionalAttr)
-
-// QuantizedMulToutput sets the optional Toutput attribute to value.
-// If not specified, defaults to DT_QINT32
-func QuantizedMulToutput(value tf.DataType) QuantizedMulAttr {
-	return func(m optionalAttr) {
-		m["Toutput"] = value
-	}
-}
-
-// Returns x * y element-wise, working on quantized buffers.
+// Resize quantized `images` to `size` using quantized bilinear interpolation.
 //
-// Arguments:
+// Input images and output images must be quantized types.
 //
+// Arguments:
+//	images: 4-D with shape `[batch, height, width, channels]`.
+//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
+// new size for the images.
 //
-//	min_x: The float value that the lowest quantized `x` value represents.
-//	max_x: The float value that the highest quantized `x` value represents.
-//	min_y: The float value that the lowest quantized `y` value represents.
-//	max_y: The float value that the highest quantized `y` value represents.
 //
-// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
 //
-// *NOTE*: `QuantizedMul` supports limited forms of broadcasting. More about
-// broadcasting [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func QuantizedMul(scope *Scope, x tf.Output, y tf.Output, min_x tf.Output, max_x tf.Output, min_y tf.Output, max_y tf.Output, optional ...QuantizedMulAttr) (z tf.Output, min_z tf.Output, max_z tf.Output) {
+// Returns 4-D with shape
+// `[batch, new_height, new_width, channels]`.
+func QuantizedResizeBilinear(scope *Scope, images tf.Output, size tf.Output, min tf.Output, max tf.Output, optional ...QuantizedResizeBilinearAttr) (resized_images tf.Output, out_min tf.Output, out_max tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -17407,9 +16623,9 @@ func QuantizedMul(scope *Scope, x tf.Output, y tf.Output, min_x tf.Output, max_x
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "QuantizedMul",
+		Type: "QuantizedResizeBilinear",
 		Input: []tf.Input{
-			x, y, min_x, max_x, min_y, max_y,
+			images, size, min, max,
 		},
 		Attrs: attrs,
 	}
@@ -17417,109 +16633,182 @@ func QuantizedMul(scope *Scope, x tf.Output, y tf.Output, min_x tf.Output, max_x
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// QuantizedAddAttr is an optional argument to QuantizedAdd.
-type QuantizedAddAttr func(optionalAttr)
+// Computes the minimum along segments of a tensor.
+//
+// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+// segments.
+//
+// Computes a tensor such that
+// \\(output_i = \min_j(data_j)\\) where `min` is over `j` such
+// that `segment_ids[j] == i`.
+//
+// If the min is empty for a given segment ID `i`, `output[i] = 0`.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentMin.png" alt>
+// </div>
+//
+// Arguments:
+//
+//	segment_ids: A 1-D tensor whose rank is equal to the rank of `data`'s
+// first dimension.  Values should be sorted and can be repeated.
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SegmentMin(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SegmentMin",
+		Input: []tf.Input{
+			data, segment_ids,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
 
-// QuantizedAddToutput sets the optional Toutput attribute to value.
-// If not specified, defaults to DT_QINT32
-func QuantizedAddToutput(value tf.DataType) QuantizedAddAttr {
+// SdcaOptimizerAttr is an optional argument to SdcaOptimizer.
+type SdcaOptimizerAttr func(optionalAttr)
+
+// SdcaOptimizerAdaptative sets the optional adaptative attribute to value.
+//
+// value: Whether to use Adapative SDCA for the inner loop.
+// If not specified, defaults to false
+func SdcaOptimizerAdaptative(value bool) SdcaOptimizerAttr {
 	return func(m optionalAttr) {
-		m["Toutput"] = value
+		m["adaptative"] = value
 	}
 }
 
-// Returns x + y element-wise, working on quantized buffers.
+// Distributed version of Stochastic Dual Coordinate Ascent (SDCA) optimizer for
 //
-// Arguments:
+// linear models with L1 + L2 regularization. As global optimization objective is
+// strongly-convex, the optimizer optimizes the dual objective at each step. The
+// optimizer applies each update one example at a time. Examples are sampled
+// uniformly, and the optimizer is learning rate free and enjoys linear convergence
+// rate.
 //
+// [Proximal Stochastic Dual Coordinate Ascent](http://arxiv.org/pdf/1211.2717v1.pdf).<br>
+// Shai Shalev-Shwartz, Tong Zhang. 2012
 //
-//	min_x: The float value that the lowest quantized `x` value represents.
-//	max_x: The float value that the highest quantized `x` value represents.
-//	min_y: The float value that the lowest quantized `y` value represents.
-//	max_y: The float value that the highest quantized `y` value represents.
+// $$Loss Objective = \sum f_{i} (wx_{i}) + (l2 / 2) * |w|^2 + l1 * |w|$$
 //
-// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
+// [Adding vs. Averaging in Distributed Primal-Dual Optimization](http://arxiv.org/abs/1502.03508).<br>
+// Chenxin Ma, Virginia Smith, Martin Jaggi, Michael I. Jordan,
+// Peter Richtarik, Martin Takac. 2015
 //
-// *NOTE*: `QuantizedAdd` supports limited forms of broadcasting. More about
-// broadcasting [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func QuantizedAdd(scope *Scope, x tf.Output, y tf.Output, min_x tf.Output, max_x tf.Output, min_y tf.Output, max_y tf.Output, optional ...QuantizedAddAttr) (z tf.Output, min_z tf.Output, max_z tf.Output) {
+// [Stochastic Dual Coordinate Ascent with Adaptive Probabilities](https://arxiv.org/abs/1502.08053).<br>
+// Dominik Csiba, Zheng Qu, Peter Richtarik. 2015
+//
+// Arguments:
+//	sparse_example_indices: a list of vectors which contain example indices.
+//	sparse_feature_indices: a list of vectors which contain feature indices.
+//	sparse_feature_values: a list of vectors which contains feature value
+// associated with each feature group.
+//	dense_features: a list of matrices which contains the dense feature values.
+//	example_weights: a vector which contains the weight associated with each
+// example.
+//	example_labels: a vector which contains the label/target associated with each
+// example.
+//	sparse_indices: a list of vectors where each value is the indices which has
+// corresponding weights in sparse_weights. This field maybe omitted for the
+// dense approach.
+//	sparse_weights: a list of vectors where each value is the weight associated with
+// a sparse feature group.
+//	dense_weights: a list of vectors where the values are the weights associated
+// with a dense feature group.
+//	example_state_data: a list of vectors containing the example state data.
+//	loss_type: Type of the primal loss. Currently SdcaSolver supports logistic,
+// squared and hinge losses.
+//	l1: Symmetric l1 regularization strength.
+//	l2: Symmetric l2 regularization strength.
+//	num_loss_partitions: Number of partitions of the global loss function.
+//	num_inner_iterations: Number of iterations per mini-batch.
+//
+// Returns a list of vectors containing the updated example state
+// data.a list of vectors where each value is the delta
+// weights associated with a sparse feature group.a list of vectors where the values are the delta
+// weights associated with a dense feature group.
+func SdcaOptimizer(scope *Scope, sparse_example_indices []tf.Output, sparse_feature_indices []tf.Output, sparse_feature_values []tf.Output, dense_features []tf.Output, example_weights tf.Output, example_labels tf.Output, sparse_indices []tf.Output, sparse_weights []tf.Output, dense_weights []tf.Output, example_state_data tf.Output, loss_type string, l1 float32, l2 float32, num_loss_partitions int64, num_inner_iterations int64, optional ...SdcaOptimizerAttr) (out_example_state_data tf.Output, out_delta_sparse_weights []tf.Output, out_delta_dense_weights []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"loss_type": loss_type, "l1": l1, "l2": l2, "num_loss_partitions": num_loss_partitions, "num_inner_iterations": num_inner_iterations}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "QuantizedAdd",
+		Type: "SdcaOptimizer",
 		Input: []tf.Input{
-			x, y, min_x, max_x, min_y, max_y,
+			tf.OutputList(sparse_example_indices), tf.OutputList(sparse_feature_indices), tf.OutputList(sparse_feature_values), tf.OutputList(dense_features), example_weights, example_labels, tf.OutputList(sparse_indices), tf.OutputList(sparse_weights), tf.OutputList(dense_weights), example_state_data,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	out_example_state_data = op.Output(idx)
+	if out_delta_sparse_weights, idx, err = makeOutputList(op, idx, "out_delta_sparse_weights"); err != nil {
+		scope.UpdateErr("SdcaOptimizer", err)
+		return
+	}
+	if out_delta_dense_weights, idx, err = makeOutputList(op, idx, "out_delta_dense_weights"); err != nil {
+		scope.UpdateErr("SdcaOptimizer", err)
+		return
+	}
+	return out_example_state_data, out_delta_sparse_weights, out_delta_dense_weights
 }
 
-// MfccAttr is an optional argument to Mfcc.
-type MfccAttr func(optionalAttr)
+// SparseMatMulAttr is an optional argument to SparseMatMul.
+type SparseMatMulAttr func(optionalAttr)
 
-// MfccUpperFrequencyLimit sets the optional upper_frequency_limit attribute to value.
-//
-// value: The highest frequency to use when calculating the
-// ceptstrum.
-// If not specified, defaults to 4000
-func MfccUpperFrequencyLimit(value float32) MfccAttr {
+// SparseMatMulTransposeA sets the optional transpose_a attribute to value.
+// If not specified, defaults to false
+func SparseMatMulTransposeA(value bool) SparseMatMulAttr {
 	return func(m optionalAttr) {
-		m["upper_frequency_limit"] = value
+		m["transpose_a"] = value
 	}
 }
 
-// MfccLowerFrequencyLimit sets the optional lower_frequency_limit attribute to value.
-//
-// value: The lowest frequency to use when calculating the
-// ceptstrum.
-// If not specified, defaults to 20
-func MfccLowerFrequencyLimit(value float32) MfccAttr {
+// SparseMatMulTransposeB sets the optional transpose_b attribute to value.
+// If not specified, defaults to false
+func SparseMatMulTransposeB(value bool) SparseMatMulAttr {
 	return func(m optionalAttr) {
-		m["lower_frequency_limit"] = value
+		m["transpose_b"] = value
 	}
 }
 
-// MfccFilterbankChannelCount sets the optional filterbank_channel_count attribute to value.
-//
-// value: Resolution of the Mel bank used internally.
-// If not specified, defaults to 40
-func MfccFilterbankChannelCount(value int64) MfccAttr {
+// SparseMatMulAIsSparse sets the optional a_is_sparse attribute to value.
+// If not specified, defaults to false
+func SparseMatMulAIsSparse(value bool) SparseMatMulAttr {
 	return func(m optionalAttr) {
-		m["filterbank_channel_count"] = value
+		m["a_is_sparse"] = value
 	}
 }
 
-// MfccDctCoefficientCount sets the optional dct_coefficient_count attribute to value.
-//
-// value: How many output channels to produce per time slice.
-// If not specified, defaults to 13
-func MfccDctCoefficientCount(value int64) MfccAttr {
+// SparseMatMulBIsSparse sets the optional b_is_sparse attribute to value.
+// If not specified, defaults to false
+func SparseMatMulBIsSparse(value bool) SparseMatMulAttr {
 	return func(m optionalAttr) {
-		m["dct_coefficient_count"] = value
+		m["b_is_sparse"] = value
 	}
 }
 
-// Transforms a spectrogram into a form that's useful for speech recognition.
+// Multiply matrix "a" by matrix "b".
 //
-// Mel Frequency Cepstral Coefficients are a way of representing audio data that's
-// been effective as an input feature for machine learning. They are created by
-// taking the spectrum of a spectrogram (a 'cepstrum'), and discarding some of the
-// higher frequencies that are less significant to the human ear. They have a long
-// history in the speech recognition world, and https://en.wikipedia.org/wiki/Mel-frequency_cepstrum
-// is a good resource to learn more.
+// The inputs must be two-dimensional matrices and the inner dimension of "a" must
+// match the outer dimension of "b". This op is optimized for the case where at
+// least one of "a" or "b" is sparse. The breakeven for using this versus a dense
+// matrix multiply on one platform was 30% zero values in the sparse matrix.
 //
-// Arguments:
-//	spectrogram: Typically produced by the Spectrogram op, with magnitude_squared
-// set to true.
-//	sample_rate: How many samples per second the source audio used.
-func Mfcc(scope *Scope, spectrogram tf.Output, sample_rate tf.Output, optional ...MfccAttr) (output tf.Output) {
+// The gradient computation of this operation will only take advantage of sparsity
+// in the input gradient when that gradient comes from a Relu.
+func SparseMatMul(scope *Scope, a tf.Output, b tf.Output, optional ...SparseMatMulAttr) (product tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -17528,9 +16817,9 @@ func Mfcc(scope *Scope, spectrogram tf.Output, sample_rate tf.Output, optional .
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Mfcc",
+		Type: "SparseMatMul",
 		Input: []tf.Input{
-			spectrogram, sample_rate,
+			a, b,
 		},
 		Attrs: attrs,
 	}
@@ -17538,202 +16827,146 @@ func Mfcc(scope *Scope, spectrogram tf.Output, sample_rate tf.Output, optional .
 	return op.Output(0)
 }
 
-// Given a quantized tensor described by (input, input_min, input_max), outputs a
-//
-// range that covers the actual values present in that tensor.  This op is
-// typically used to produce the requested_output_min and requested_output_max for
-// Requantize.
+// ShapeAttr is an optional argument to Shape.
+type ShapeAttr func(optionalAttr)
+
+// ShapeOutType sets the optional out_type attribute to value.
+// If not specified, defaults to DT_INT32
+func ShapeOutType(value tf.DataType) ShapeAttr {
+	return func(m optionalAttr) {
+		m["out_type"] = value
+	}
+}
+
+// Returns the shape of a tensor.
 //
-// Arguments:
+// This operation returns a 1-D integer tensor representing the shape of `input`.
 //
-//	input_min: The float value that the minimum quantized input value represents.
-//	input_max: The float value that the maximum quantized input value represents.
+// For example:
 //
-// Returns The computed min output.the computed max output.
-func RequantizationRange(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output) (output_min tf.Output, output_max tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "RequantizationRange",
-		Input: []tf.Input{
-			input, input_min, input_max,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
-}
-
-// MapPeekAttr is an optional argument to MapPeek.
-type MapPeekAttr func(optionalAttr)
-
-// MapPeekCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func MapPeekCapacity(value int64) MapPeekAttr {
-	return func(m optionalAttr) {
-		m["capacity"] = value
-	}
-}
-
-// MapPeekMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func MapPeekMemoryLimit(value int64) MapPeekAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
-	}
-}
-
-// MapPeekContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func MapPeekContainer(value string) MapPeekAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// MapPeekSharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func MapPeekSharedName(value string) MapPeekAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// Op peeks at the values at the specified key.  If the
-//
-// underlying container does not contain this key
-// this op will block until it does.
-func MapPeek(scope *Scope, key tf.Output, indices tf.Output, dtypes []tf.DataType, optional ...MapPeekAttr) (values []tf.Output) {
+// ```
+// # 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]
+// shape(t) ==> [2, 2, 3]
+// ```
+func Shape(scope *Scope, input tf.Output, optional ...ShapeAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MapPeek",
+		Type: "Shape",
 		Input: []tf.Input{
-			key, indices,
+			input,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
-		scope.UpdateErr("MapPeek", err)
-		return
-	}
-	return values
+	return op.Output(0)
 }
 
-// Looks up keys in a table, outputs the corresponding values.
-//
-// The tensor `keys` must of the same type as the keys of the table.
-// The output `values` is of the type of the table values.
-//
-// The scalar `default_value` is the value output for keys not present in the
-// table. It must also be of the same type as the table values.
-//
-// Arguments:
-//	table_handle: Handle to the table.
-//	keys: Any shape.  Keys to look up.
+// Computes the power of one value to another.
 //
+// Given a tensor `x` and a tensor `y`, this operation computes \\(x^y\\) for
+// corresponding elements in `x` and `y`. For example:
 //
-// Returns Same shape as `keys`.  Values found in the table, or `default_values`
-// for missing keys.
-func LookupTableFindV2(scope *Scope, table_handle tf.Output, keys tf.Output, default_value tf.Output) (values tf.Output) {
+// ```
+// # tensor 'x' is [[2, 2]], [3, 3]]
+// # tensor 'y' is [[8, 16], [2, 3]]
+// tf.pow(x, y) ==> [[256, 65536], [9, 27]]
+// ```
+func Pow(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "LookupTableFindV2",
+		Type: "Pow",
 		Input: []tf.Input{
-			table_handle, keys, default_value,
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Bucketizes 'input' based on 'boundaries'.
-//
-// For example, if the inputs are
-//     boundaries = [0, 10, 100]
-//     input = [[-5, 10000]
-//              [150,   10]
-//              [5,    100]]
-//
-// then the output will be
-//     output = [[0, 3]
-//               [3, 2]
-//               [1, 3]]
+// Computes fingerprints of the input strings.
 //
 // Arguments:
-//	input: Any shape of Tensor contains with int or float type.
-//	boundaries: A sorted list of floats gives the boundary of the buckets.
-//
-// Returns Same shape with 'input', each value of input replaced with bucket index.
+//	input: vector of strings to compute fingerprints on.
 //
-// @compatibility(numpy)
-// Equivalent to np.digitize.
-// @end_compatibility
-func Bucketize(scope *Scope, input tf.Output, boundaries []float32) (output tf.Output) {
+// Returns a (N,2) shaped matrix where N is the number of elements in the input
+// vector. Each row contains the low and high parts of the fingerprint.
+func SdcaFprint(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"boundaries": boundaries}
 	opspec := tf.OpSpec{
-		Type: "Bucketize",
+		Type: "SdcaFprint",
 		Input: []tf.Input{
 			input,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// EncodePngAttr is an optional argument to EncodePng.
-type EncodePngAttr func(optionalAttr)
+// RandomPoissonV2Attr is an optional argument to RandomPoissonV2.
+type RandomPoissonV2Attr func(optionalAttr)
 
-// EncodePngCompression sets the optional compression attribute to value.
+// RandomPoissonV2Seed sets the optional seed attribute to value.
 //
-// value: Compression level.
-// If not specified, defaults to -1
-func EncodePngCompression(value int64) EncodePngAttr {
+// value: If either `seed` or `seed2` are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func RandomPoissonV2Seed(value int64) RandomPoissonV2Attr {
 	return func(m optionalAttr) {
-		m["compression"] = value
+		m["seed"] = value
 	}
 }
 
-// PNG-encode an image.
+// RandomPoissonV2Seed2 sets the optional seed2 attribute to value.
 //
-// `image` is a 3-D uint8 or uint16 Tensor of shape `[height, width, channels]`
-// where `channels` is:
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func RandomPoissonV2Seed2(value int64) RandomPoissonV2Attr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// RandomPoissonV2Dtype sets the optional dtype attribute to value.
+// If not specified, defaults to DT_INT64
+func RandomPoissonV2Dtype(value tf.DataType) RandomPoissonV2Attr {
+	return func(m optionalAttr) {
+		m["dtype"] = value
+	}
+}
+
+// Outputs random values from the Poisson distribution(s) described by rate.
 //
-// *   1: for grayscale.
-// *   2: for grayscale + alpha.
-// *   3: for RGB.
-// *   4: for RGBA.
+// This op uses two algorithms, depending on rate. If rate >= 10, then
+// the algorithm by Hormann is used to acquire samples via
+// transformation-rejection.
+// See http://www.sciencedirect.com/science/article/pii/0167668793909974.
 //
-// The ZLIB compression level, `compression`, can be -1 for the PNG-encoder
-// default or a value from 0 to 9.  9 is the highest compression level, generating
-// the smallest output, but is slower.
+// Otherwise, Knuth's algorithm is used to acquire samples via multiplying uniform
+// random variables.
+// See Donald E. Knuth (1969). Seminumerical Algorithms. The Art of Computer
+// Programming, Volume 2. Addison Wesley
 //
 // Arguments:
-//	image: 3-D with shape `[height, width, channels]`.
+//	shape: 1-D integer tensor. Shape of independent samples to draw from each
+// distribution described by the shape parameters given in rate.
+//	rate: A tensor in which each scalar is a "rate" parameter describing the
+// associated poisson distribution.
 //
-// Returns 0-D. PNG-encoded image.
-func EncodePng(scope *Scope, image tf.Output, optional ...EncodePngAttr) (contents tf.Output) {
+// Returns A tensor with shape `shape + shape(rate)`. Each slice
+// `[:, ..., :, i0, i1, ...iN]` contains the samples drawn for
+// `rate[i0, i1, ...iN]`.
+func RandomPoissonV2(scope *Scope, shape tf.Output, rate tf.Output, optional ...RandomPoissonV2Attr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -17742,9 +16975,9 @@ func EncodePng(scope *Scope, image tf.Output, optional ...EncodePngAttr) (conten
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "EncodePng",
+		Type: "RandomPoissonV2",
 		Input: []tf.Input{
-			image,
+			shape, rate,
 		},
 		Attrs: attrs,
 	}
@@ -17752,91 +16985,59 @@ func EncodePng(scope *Scope, image tf.Output, optional ...EncodePngAttr) (conten
 	return op.Output(0)
 }
 
-// Updates the table to associates keys with values.
-//
-// The tensor `keys` must be of the same type as the keys of the table.
-// The tensor `values` must be of the type of the table values.
-//
-// Arguments:
-//	table_handle: Handle to the table.
-//	keys: Any shape.  Keys to look up.
-//	values: Values to associate with keys.
+// MatrixTriangularSolveAttr is an optional argument to MatrixTriangularSolve.
+type MatrixTriangularSolveAttr func(optionalAttr)
+
+// MatrixTriangularSolveLower sets the optional lower attribute to value.
 //
-// Returns the created operation.
-func LookupTableInsertV2(scope *Scope, table_handle tf.Output, keys tf.Output, values tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "LookupTableInsertV2",
-		Input: []tf.Input{
-			table_handle, keys, values,
-		},
+// value: Boolean indicating whether the innermost matrices in `matrix` are
+// lower or upper triangular.
+// If not specified, defaults to true
+func MatrixTriangularSolveLower(value bool) MatrixTriangularSolveAttr {
+	return func(m optionalAttr) {
+		m["lower"] = value
 	}
-	return scope.AddOperation(opspec)
 }
 
-// Returns element-wise smallest integer in not less than x.
-func Ceil(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Ceil",
-		Input: []tf.Input{
-			x,
-		},
+// MatrixTriangularSolveAdjoint sets the optional adjoint attribute to value.
+//
+// value: Boolean indicating whether to solve with `matrix` or its (block-wise)
+//          adjoint.
+//
+// @compatibility(numpy)
+// Equivalent to np.linalg.triangular_solve
+// @end_compatibility
+// If not specified, defaults to false
+func MatrixTriangularSolveAdjoint(value bool) MatrixTriangularSolveAttr {
+	return func(m optionalAttr) {
+		m["adjoint"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Computes the number of elements in the given table.
+// Solves systems of linear equations with upper or lower triangular matrices by
 //
-// Arguments:
-//	table_handle: Handle to the table.
+// backsubstitution.
 //
-// Returns Scalar that contains number of elements in the table.
-func LookupTableSizeV2(scope *Scope, table_handle tf.Output) (size tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "LookupTableSizeV2",
-		Input: []tf.Input{
-			table_handle,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// ResizeBilinearGradAttr is an optional argument to ResizeBilinearGrad.
-type ResizeBilinearGradAttr func(optionalAttr)
-
-// ResizeBilinearGradAlignCorners sets the optional align_corners attribute to value.
+// `matrix` is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions form
+// square matrices. If `lower` is `True` then the strictly upper triangular part
+// of each inner-most matrix is assumed to be zero and not accessed.
+// If `lower` is False then the strictly lower triangular part of each inner-most
+// matrix is assumed to be zero and not accessed.
+// `rhs` is a tensor of shape `[..., M, K]`.
 //
-// value: If true, rescale grads by (orig_height - 1) / (height - 1), which
-// exactly aligns the 4 corners of grads and original_image. If false, rescale by
-// orig_height / height. Treat similarly the width dimension.
-// If not specified, defaults to false
-func ResizeBilinearGradAlignCorners(value bool) ResizeBilinearGradAttr {
-	return func(m optionalAttr) {
-		m["align_corners"] = value
-	}
-}
-
-// Computes the gradient of bilinear interpolation.
+// The output is a tensor of shape `[..., M, K]`. If `adjoint` is
+// `True` then the innermost matrices in `output` satisfy matrix equations
+// `matrix[..., :, :] * output[..., :, :] = rhs[..., :, :]`.
+// If `adjoint` is `False` then the strictly then the  innermost matrices in
+// `output` satisfy matrix equations
+// `adjoint(matrix[..., i, k]) * output[..., k, j] = rhs[..., i, j]`.
 //
 // Arguments:
-//	grads: 4-D with shape `[batch, height, width, channels]`.
-//	original_image: 4-D with shape `[batch, orig_height, orig_width, channels]`,
-// The image tensor that was resized.
+//	matrix: Shape is `[..., M, M]`.
+//	rhs: Shape is `[..., M, K]`.
 //
-// Returns 4-D with shape `[batch, orig_height, orig_width, channels]`.
-// Gradients with respect to the input image. Input image must have been
-// float or double.
-func ResizeBilinearGrad(scope *Scope, grads tf.Output, original_image tf.Output, optional ...ResizeBilinearGradAttr) (output tf.Output) {
+// Returns Shape is `[..., M, K]`.
+func MatrixTriangularSolve(scope *Scope, matrix tf.Output, rhs tf.Output, optional ...MatrixTriangularSolveAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -17845,9 +17046,9 @@ func ResizeBilinearGrad(scope *Scope, grads tf.Output, original_image tf.Output,
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResizeBilinearGrad",
+		Type: "MatrixTriangularSolve",
 		Input: []tf.Input{
-			grads, original_image,
+			matrix, rhs,
 		},
 		Attrs: attrs,
 	}
@@ -17855,109 +17056,163 @@ func ResizeBilinearGrad(scope *Scope, grads tf.Output, original_image tf.Output,
 	return op.Output(0)
 }
 
-// Outputs all keys and values in the table.
-//
-// Arguments:
-//	table_handle: Handle to the table.
-//
-//
-//
-// Returns Vector of all keys present in the table.Tensor of all values in the table. Indexed in parallel with `keys`.
-func LookupTableExportV2(scope *Scope, table_handle tf.Output, Tkeys tf.DataType, Tvalues tf.DataType) (keys tf.Output, values tf.Output) {
+// Computes inverse hyperbolic sine of x element-wise.
+func Asinh(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"Tkeys": Tkeys, "Tvalues": Tvalues}
 	opspec := tf.OpSpec{
-		Type: "LookupTableExportV2",
+		Type: "Asinh",
 		Input: []tf.Input{
-			table_handle,
+			x,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// Replaces the contents of the table with the specified keys and values.
-//
-// The tensor `keys` must be of the same type as the keys of the table.
-// The tensor `values` must be of the type of the table values.
+// Creates a dataset with a range of values. Corresponds to python's xrange.
 //
 // Arguments:
-//	table_handle: Handle to the table.
-//	keys: Any shape.  Keys to look up.
-//	values: Values to associate with keys.
+//	start: corresponds to start in python's xrange().
+//	stop: corresponds to stop in python's xrange().
+//	step: corresponds to step in python's xrange().
 //
-// Returns the created operation.
-func LookupTableImportV2(scope *Scope, table_handle tf.Output, keys tf.Output, values tf.Output) (o *tf.Operation) {
+//
+func RangeDataset(scope *Scope, start tf.Output, stop tf.Output, step tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "LookupTableImportV2",
+		Type: "RangeDataset",
 		Input: []tf.Input{
-			table_handle, keys, values,
+			start, stop, step,
 		},
+		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// MapUnstageNoKeyAttr is an optional argument to MapUnstageNoKey.
-type MapUnstageNoKeyAttr func(optionalAttr)
+// DepthwiseConv2dNativeBackpropInputAttr is an optional argument to DepthwiseConv2dNativeBackpropInput.
+type DepthwiseConv2dNativeBackpropInputAttr func(optionalAttr)
 
-// MapUnstageNoKeyCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
+// DepthwiseConv2dNativeBackpropInputDataFormat sets the optional data_format attribute to value.
 //
-// REQUIRES: value >= 0
-func MapUnstageNoKeyCapacity(value int64) MapUnstageNoKeyAttr {
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, height, width, channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, channels, height, width].
+// If not specified, defaults to "NHWC"
+func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dNativeBackpropInputAttr {
 	return func(m optionalAttr) {
-		m["capacity"] = value
+		m["data_format"] = value
 	}
 }
 
-// MapUnstageNoKeyMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
+// DepthwiseConv2dNativeBackpropInputDilations sets the optional dilations attribute to value.
 //
-// REQUIRES: value >= 0
-func MapUnstageNoKeyMemoryLimit(value int64) MapUnstageNoKeyAttr {
+// value: 1-D tensor of length 4.  The dilation factor for each dimension of
+// `input`. If set to k > 1, there will be k-1 skipped cells between each filter
+// element on that dimension. The dimension order is determined by the value of
+// `data_format`, see above for details. Dilations in the batch and depth
+// dimensions must be 1.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 >
+func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr {
 	return func(m optionalAttr) {
-		m["memory_limit"] = value
+		m["dilations"] = value
 	}
 }
 
-// MapUnstageNoKeyContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func MapUnstageNoKeyContainer(value string) MapUnstageNoKeyAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
+// Computes the gradients of depthwise convolution with respect to the input.
+//
+// Arguments:
+//	input_sizes: An integer vector representing the shape of `input`, based
+// on `data_format`.  For example, if `data_format` is 'NHWC' then
+//  `input` is a 4-D `[batch, height, width, channels]` tensor.
+//	filter: 4-D with shape
+// `[filter_height, filter_width, in_channels, depthwise_multiplier]`.
+//	out_backprop: 4-D with shape  based on `data_format`.
+// For example, if `data_format` is 'NHWC' then
+// out_backprop shape is `[batch, out_height, out_width, out_channels]`.
+// Gradients w.r.t. the output of the convolution.
+//	strides: The stride of the sliding window for each dimension of the input
+// of the convolution.
+//	padding: The type of padding algorithm to use.
+//
+// Returns 4-D with shape according to `data_format`.  For example, if
+// `data_format` is 'NHWC', output shape is `[batch, in_height,
+// in_width, in_channels]`.  Gradient w.r.t. the input of the
+// convolution.
+func DepthwiseConv2dNativeBackpropInput(scope *Scope, input_sizes tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...DepthwiseConv2dNativeBackpropInputAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "DepthwiseConv2dNativeBackpropInput",
+		Input: []tf.Input{
+			input_sizes, filter, out_backprop,
+		},
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// MapUnstageNoKeySharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func MapUnstageNoKeySharedName(value string) MapUnstageNoKeyAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
+// Stops gradient computation.
+//
+// When executed in a graph, this op outputs its input tensor as-is.
+//
+// When building ops to compute gradients, this op prevents the contribution of
+// its inputs to be taken into account.  Normally, the gradient generator adds ops
+// to a graph to compute the derivatives of a specified 'loss' by recursively
+// finding out inputs that contributed to its computation.  If you insert this op
+// in the graph it inputs are masked from the gradient generator.  They are not
+// taken into account for computing gradients.
+//
+// This is useful any time you want to compute a value with TensorFlow but need
+// to pretend that the value was a constant. Some examples include:
+//
+// *  The *EM* algorithm where the *M-step* should not involve backpropagation
+//    through the output of the *E-step*.
+// *  Contrastive divergence training of Boltzmann machines where, when
+//    differentiating the energy function, the training must not backpropagate
+//    through the graph that generated the samples from the model.
+// *  Adversarial training, where no backprop should happen through the adversarial
+//    example generation process.
+func StopGradient(scope *Scope, input tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "StopGradient",
+		Input: []tf.Input{
+			input,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Op removes and returns a random (key, value)
+// Eagerly executes a python function to compute func(input)->output. The
 //
-// from the underlying container.   If the underlying container
-// does not contain elements, the op will block until it does.
-func MapUnstageNoKey(scope *Scope, indices tf.Output, dtypes []tf.DataType, optional ...MapUnstageNoKeyAttr) (key tf.Output, values []tf.Output) {
+// semantics of the input, output, and attributes are the same as those for
+// PyFunc.
+func EagerPyFunc(scope *Scope, input []tf.Output, token string, Tout []tf.DataType) (output []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"token": token, "Tout": Tout}
 	opspec := tf.OpSpec{
-		Type: "MapUnstageNoKey",
+		Type: "EagerPyFunc",
 		Input: []tf.Input{
-			indices,
+			tf.OutputList(input),
 		},
 		Attrs: attrs,
 	}
@@ -17967,351 +17222,289 @@ func MapUnstageNoKey(scope *Scope, indices tf.Output, dtypes []tf.DataType, opti
 	}
 	var idx int
 	var err error
-	key = op.Output(idx)
-	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
-		scope.UpdateErr("MapUnstageNoKey", err)
+	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
+		scope.UpdateErr("EagerPyFunc", err)
 		return
 	}
-	return key, values
+	return output
 }
 
-// HashTableV2Attr is an optional argument to HashTableV2.
-type HashTableV2Attr func(optionalAttr)
-
-// HashTableV2Container sets the optional container attribute to value.
+// Adds sparse updates to the variable referenced by `resource`.
 //
-// value: If non-empty, this table is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func HashTableV2Container(value string) HashTableV2Attr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// HashTableV2SharedName sets the optional shared_name attribute to value.
+// This operation computes
 //
-// value: If non-empty, this table is shared under the given name across
-// multiple sessions.
-// If not specified, defaults to ""
-func HashTableV2SharedName(value string) HashTableV2Attr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// HashTableV2UseNodeNameSharing sets the optional use_node_name_sharing attribute to value.
+//     # Scalar indices
+//     ref[indices, ...] += updates[...]
 //
-// value: If true and shared_name is empty, the table is shared
-// using the node name.
-// If not specified, defaults to false
-func HashTableV2UseNodeNameSharing(value bool) HashTableV2Attr {
-	return func(m optionalAttr) {
-		m["use_node_name_sharing"] = value
-	}
-}
-
-// Creates a non-initialized hash table.
+//     # Vector indices (for each i)
+//     ref[indices[i], ...] += updates[i, ...]
 //
-// This op creates a hash table, specifying the type of its keys and values.
-// Before using the table you will have to initialize it.  After initialization the
-// table will be immutable.
+//     # High rank indices (for each i, ..., j)
+//     ref[indices[i, ..., j], ...] += updates[i, ..., j, ...]
+//
+// Duplicate entries are handled correctly: if multiple `indices` reference
+// the same location, their contributions add.
+//
+// Requires `updates.shape = indices.shape + ref.shape[1:]`.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt>
+// </div>
 //
 // Arguments:
-//	key_dtype: Type of the table keys.
-//	value_dtype: Type of the table values.
+//	resource: Should be from a `Variable` node.
+//	indices: A tensor of indices into the first dimension of `ref`.
+//	updates: A tensor of updated values to add to `ref`.
 //
-// Returns Handle to a table.
-func HashTableV2(scope *Scope, key_dtype tf.DataType, value_dtype tf.DataType, optional ...HashTableV2Attr) (table_handle tf.Output) {
+// Returns the created operation.
+func ResourceScatterAdd(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"key_dtype": key_dtype, "value_dtype": value_dtype}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "HashTableV2",
-
-		Attrs: attrs,
+		Type: "ResourceScatterAdd",
+		Input: []tf.Input{
+			resource, indices, updates,
+		},
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// MutableHashTableV2Attr is an optional argument to MutableHashTableV2.
-type MutableHashTableV2Attr func(optionalAttr)
-
-// MutableHashTableV2Container sets the optional container attribute to value.
+// Says whether the targets are in the top `K` predictions.
 //
-// value: If non-empty, this table is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func MutableHashTableV2Container(value string) MutableHashTableV2Attr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// MutableHashTableV2SharedName sets the optional shared_name attribute to value.
+// This outputs a `batch_size` bool array, an entry `out[i]` is `true` if the
+// prediction for the target class is among the top `k` predictions among
+// all predictions for example `i`. Note that the behavior of `InTopK` differs
+// from the `TopK` op in its handling of ties; if multiple classes have the
+// same prediction value and straddle the top-`k` boundary, all of those
+// classes are considered to be in the top `k`.
 //
-// value: If non-empty, this table is shared under the given name across
-// multiple sessions.
-// If not specified, defaults to ""
-func MutableHashTableV2SharedName(value string) MutableHashTableV2Attr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// MutableHashTableV2UseNodeNameSharing sets the optional use_node_name_sharing attribute to value.
+// More formally, let
 //
-// value: If true and shared_name is empty, the table is shared
-// using the node name.
-// If not specified, defaults to false
-func MutableHashTableV2UseNodeNameSharing(value bool) MutableHashTableV2Attr {
-	return func(m optionalAttr) {
-		m["use_node_name_sharing"] = value
-	}
-}
-
-// Creates an empty hash table.
+//   \\(predictions_i\\) be the predictions for all classes for example `i`,
+//   \\(targets_i\\) be the target class for example `i`,
+//   \\(out_i\\) be the output for example `i`,
 //
-// This op creates a mutable hash table, specifying the type of its keys and
-// values. Each value must be a scalar. Data can be inserted into the table using
-// the insert operations. It does not support the initialization operation.
+// $$out_i = predictions_{i, targets_i} \in TopKIncludingTies(predictions_i)$$
 //
 // Arguments:
-//	key_dtype: Type of the table keys.
-//	value_dtype: Type of the table values.
+//	predictions: A `batch_size` x `classes` tensor.
+//	targets: A `batch_size` vector of class ids.
+//	k: Number of top elements to look at for computing precision.
 //
-// Returns Handle to a table.
-func MutableHashTableV2(scope *Scope, key_dtype tf.DataType, value_dtype tf.DataType, optional ...MutableHashTableV2Attr) (table_handle tf.Output) {
+// Returns Computed Precision at `k` as a `bool Tensor`.
+func InTopK(scope *Scope, predictions tf.Output, targets tf.Output, k int64) (precision tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"key_dtype": key_dtype, "value_dtype": value_dtype}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"k": k}
 	opspec := tf.OpSpec{
-		Type: "MutableHashTableV2",
-
+		Type: "InTopK",
+		Input: []tf.Input{
+			predictions, targets,
+		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// DequantizeAttr is an optional argument to Dequantize.
-type DequantizeAttr func(optionalAttr)
-
-// DequantizeMode sets the optional mode attribute to value.
-// If not specified, defaults to "MIN_COMBINED"
-func DequantizeMode(value string) DequantizeAttr {
-	return func(m optionalAttr) {
-		m["mode"] = value
+// Returns (x - y)(x - y) element-wise.
+//
+// *NOTE*: `SquaredDifference` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func SquaredDifference(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SquaredDifference",
+		Input: []tf.Input{
+			x, y,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Dequantize the 'input' tensor into a float Tensor.
-//
-// [min_range, max_range] are scalar floats that specify the range for
-// the 'input' data. The 'mode' attribute controls exactly which calculations are
-// used to convert the float values to their quantized equivalents.
-//
-// In 'MIN_COMBINED' mode, each value of the tensor will undergo the following:
-//
-// ```
-// if T == qint8, in[i] += (range(T) + 1)/ 2.0
-// out[i] = min_range + (in[i]* (max_range - min_range) / range(T))
-// ```
-// here `range(T) = numeric_limits<T>::max() - numeric_limits<T>::min()`
-//
-// *MIN_COMBINED Mode Example*
-//
-// If the input comes from a QuantizedRelu6, the output type is
-// quint8 (range of 0-255) but the possible range of QuantizedRelu6 is
-// 0-6.  The min_range and max_range values are therefore 0.0 and 6.0.
-// Dequantize on quint8 will take each value, cast to float, and multiply
-// by 6 / 255.
-// Note that if quantizedtype is qint8, the operation will additionally add
-// each value by 128 prior to casting.
-//
-// If the mode is 'MIN_FIRST', then this approach is used:
-//
-// ```c++
-// num_discrete_values = 1 << (# of bits in T)
-// range_adjust = num_discrete_values / (num_discrete_values - 1)
-// range = (range_max - range_min) * range_adjust
-// range_scale = range / num_discrete_values
-// const double offset_input = static_cast<double>(input) - lowest_quantized;
-// result = range_min + ((input - numeric_limits<T>::min()) * range_scale)
-// ```
-//
-// *SCALED mode Example*
-//
-// `SCALED` mode matches the quantization approach used in
-// `QuantizeAndDequantize{V2|V3}`.
-//
-// If the mode is `SCALED`, we do not use the full range of the output type,
-// choosing to elide the lowest possible value for symmetry (e.g., output range is
-// -127 to 127, not -128 to 127 for signed 8 bit quantization), so that 0.0 maps to
-// 0.
-//
-// We first find the range of values in our tensor. The
-// range we use is always centered on 0, so we find m such that
-// ```c++
-//   m = max(abs(input_min), abs(input_max))
-// ```
-//
-// Our input tensor range is then `[-m, m]`.
-//
-// Next, we choose our fixed-point quantization buckets, `[min_fixed, max_fixed]`.
-// If T is signed, this is
-// ```
-//   num_bits = sizeof(T) * 8
-//   [min_fixed, max_fixed] =
-//       [-(1 << (num_bits - 1) - 1), (1 << (num_bits - 1)) - 1]
-// ```
-//
-// Otherwise, if T is unsigned, the fixed-point range is
-// ```
-//   [min_fixed, max_fixed] = [0, (1 << num_bits) - 1]
-// ```
-//
-// From this we compute our scaling factor, s:
-// ```c++
-//   s = (2 * m) / (max_fixed - min_fixed)
-// ```
+// Forwards the input to the output.
 //
-// Now we can dequantize the elements of our tensor:
-// ```c++
-// result = input * s
-// ```
+// This operator represents the loop termination condition used by the
+// "pivot" switches of a loop.
 //
 // Arguments:
+//	input: A boolean scalar, representing the branch predicate of the Switch op.
 //
-//	min_range: The minimum scalar value possibly produced for the input.
-//	max_range: The maximum scalar value possibly produced for the input.
-func Dequantize(scope *Scope, input tf.Output, min_range tf.Output, max_range tf.Output, optional ...DequantizeAttr) (output tf.Output) {
+// Returns The same tensor as `input`.
+func LoopCond(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "Dequantize",
+		Type: "LoopCond",
 		Input: []tf.Input{
-			input, min_range, max_range,
+			input,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Flips all bits elementwise.
+// Computes the gradient for the inverse of `x` wrt its input.
 //
-// The result will have exactly those bits set, that are not set in `x`. The
-// computation is performed on the underlying representation of x.
-func Invert(scope *Scope, x tf.Output) (y tf.Output) {
+// Specifically, `grad = -dy * y*y`, where `y = 1/x`, and `dy`
+// is the corresponding input gradient.
+func ReciprocalGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Invert",
+		Type: "ReciprocalGrad",
 		Input: []tf.Input{
-			x,
+			y, dy,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Deprecated. Disallowed in GraphDef version >= 2.
+// Returns the min of x and y (i.e. x < y ? x : y) element-wise.
 //
-// DEPRECATED at GraphDef version 2: Use AdjustContrastv2 instead
-func AdjustContrast(scope *Scope, images tf.Output, contrast_factor tf.Output, min_value tf.Output, max_value tf.Output) (output tf.Output) {
+// *NOTE*: `Minimum` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Minimum(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "AdjustContrast",
+		Type: "Minimum",
 		Input: []tf.Input{
-			images, contrast_factor, min_value, max_value,
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Table initializer that takes two tensors for keys and values respectively.
+// Returns the element-wise sum of a list of tensors.
 //
-// Arguments:
-//	table_handle: Handle to a table which will be initialized.
-//	keys: Keys of type Tkey.
-//	values: Values of type Tval.
+// `tf.accumulate_n_v2` performs the same operation as `tf.add_n`, but does not
+// wait for all of its inputs to be ready before beginning to sum. This can
+// save memory if inputs are ready at different times, since minimum temporary
+// storage is proportional to the output size rather than the inputs size.
 //
-// Returns the created operation.
-func InitializeTableV2(scope *Scope, table_handle tf.Output, keys tf.Output, values tf.Output) (o *tf.Operation) {
+// Unlike the original `accumulate_n`, `accumulate_n_v2` is differentiable.
+//
+// Returns a `Tensor` of same shape and type as the elements of `inputs`.
+//
+// Arguments:
+//	inputs: A list of `Tensor` objects, each with same shape and type.
+//	shape: Shape of elements of `inputs`.
+func AccumulateNV2(scope *Scope, inputs []tf.Output, shape tf.Shape) (sum tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"shape": shape}
 	opspec := tf.OpSpec{
-		Type: "InitializeTableV2",
+		Type: "AccumulateNV2",
 		Input: []tf.Input{
-			table_handle, keys, values,
+			tf.OutputList(inputs),
 		},
+		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// PrintAttr is an optional argument to Print.
-type PrintAttr func(optionalAttr)
-
-// PrintMessage sets the optional message attribute to value.
+// Convert the quantized 'input' tensor into a lower-precision 'output', using the
 //
-// value: A string, prefix of the error message.
-// If not specified, defaults to ""
-func PrintMessage(value string) PrintAttr {
-	return func(m optionalAttr) {
-		m["message"] = value
+// actual distribution of the values to maximize the usage of the lower bit depth
+// and adjusting the output min and max ranges accordingly.
+//
+// [input_min, input_max] are scalar floats that specify the range for the float
+// interpretation of the 'input' data. For example, if input_min is -1.0f and
+// input_max is 1.0f, and we are dealing with quint16 quantized data, then a 0
+// value in the 16-bit data should be interpreted as -1.0f, and a 65535 means 1.0f.
+//
+// This operator tries to squeeze as much precision as possible into an output with
+// a lower bit depth by calculating the actual min and max values found in the
+// data. For example, maybe that quint16 input has no values lower than 16,384 and
+// none higher than 49,152. That means only half the range is actually needed, all
+// the float interpretations are between -0.5f and 0.5f, so if we want to compress
+// the data into a quint8 output, we can use that range rather than the theoretical
+// -1.0f to 1.0f that is suggested by the input min and max.
+//
+// In practice, this is most useful for taking output from operations like
+// QuantizedMatMul that can produce higher bit-depth outputs than their inputs and
+// may have large potential output ranges, but in practice have a distribution of
+// input values that only uses a small fraction of the possible range. By feeding
+// that output into this operator, we can reduce it from 32 bits down to 8 with
+// minimal loss of accuracy.
+//
+// Arguments:
+//
+//	input_min: The float value that the minimum quantized input value represents.
+//	input_max: The float value that the maximum quantized input value represents.
+//	out_type: The type of the output. Should be a lower bit depth than Tinput.
+//
+// Returns The float value that the minimum quantized output value represents.The float value that the maximum quantized output value represents.
+func QuantizeDownAndShrinkRange(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output, out_type tf.DataType) (output tf.Output, output_min tf.Output, output_max tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"out_type": out_type}
+	opspec := tf.OpSpec{
+		Type: "QuantizeDownAndShrinkRange",
+		Input: []tf.Input{
+			input, input_min, input_max,
+		},
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// PrintFirstN sets the optional first_n attribute to value.
+// RandomGammaAttr is an optional argument to RandomGamma.
+type RandomGammaAttr func(optionalAttr)
+
+// RandomGammaSeed sets the optional seed attribute to value.
 //
-// value: Only log `first_n` number of times. -1 disables logging.
-// If not specified, defaults to -1
-func PrintFirstN(value int64) PrintAttr {
+// value: If either `seed` or `seed2` are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func RandomGammaSeed(value int64) RandomGammaAttr {
 	return func(m optionalAttr) {
-		m["first_n"] = value
+		m["seed"] = value
 	}
 }
 
-// PrintSummarize sets the optional summarize attribute to value.
+// RandomGammaSeed2 sets the optional seed2 attribute to value.
 //
-// value: Only print this many entries of each tensor.
-// If not specified, defaults to 3
-func PrintSummarize(value int64) PrintAttr {
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func RandomGammaSeed2(value int64) RandomGammaAttr {
 	return func(m optionalAttr) {
-		m["summarize"] = value
+		m["seed2"] = value
 	}
 }
 
-// Prints a list of tensors.
+// Outputs random values from the Gamma distribution(s) described by alpha.
 //
-// Passes `input` through to `output` and prints `data` when evaluating.
+// This op uses the algorithm by Marsaglia et al. to acquire samples via
+// transformation-rejection from pairs of uniform and normal random variables.
+// See http://dl.acm.org/citation.cfm?id=358414
 //
 // Arguments:
-//	input: The tensor passed to `output`
-//	data: A list of tensors to print out when op is evaluated.
+//	shape: 1-D integer tensor. Shape of independent samples to draw from each
+// distribution described by the shape parameters given in alpha.
+//	alpha: A tensor in which each scalar is a "shape" parameter describing the
+// associated gamma distribution.
 //
-// Returns = The unmodified `input` tensor
-func Print(scope *Scope, input tf.Output, data []tf.Output, optional ...PrintAttr) (output tf.Output) {
+// Returns A tensor with shape `shape + shape(alpha)`. Each slice
+// `[:, ..., :, i0, i1, ...iN]` contains the samples drawn for
+// `alpha[i0, i1, ...iN]`. The dtype of the output matches the dtype of alpha.
+func RandomGamma(scope *Scope, shape tf.Output, alpha tf.Output, optional ...RandomGammaAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -18320,9 +17513,9 @@ func Print(scope *Scope, input tf.Output, data []tf.Output, optional ...PrintAtt
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Print",
+		Type: "RandomGamma",
 		Input: []tf.Input{
-			input, tf.OutputList(data),
+			shape, alpha,
 		},
 		Attrs: attrs,
 	}
@@ -18330,44 +17523,59 @@ func Print(scope *Scope, input tf.Output, data []tf.Output, optional ...PrintAtt
 	return op.Output(0)
 }
 
-// Outputs a `Summary` protocol buffer with a tensor and per-plugin data.
+// RandomUniformIntAttr is an optional argument to RandomUniformInt.
+type RandomUniformIntAttr func(optionalAttr)
+
+// RandomUniformIntSeed sets the optional seed attribute to value.
 //
-// Arguments:
-//	tag: A string attached to this summary. Used for organization in TensorBoard.
-//	tensor: A tensor to serialize.
-//	serialized_summary_metadata: A serialized SummaryMetadata proto. Contains plugin
-// data.
-func TensorSummaryV2(scope *Scope, tag tf.Output, tensor tf.Output, serialized_summary_metadata tf.Output) (summary tf.Output) {
-	if scope.Err() != nil {
-		return
+// value: If either `seed` or `seed2` are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func RandomUniformIntSeed(value int64) RandomUniformIntAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "TensorSummaryV2",
-		Input: []tf.Input{
-			tag, tensor, serialized_summary_metadata,
-		},
+}
+
+// RandomUniformIntSeed2 sets the optional seed2 attribute to value.
+//
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func RandomUniformIntSeed2(value int64) RandomUniformIntAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Creates a dataset that asynchronously prefetches elements from `input_dataset`.
+// Outputs random integers from a uniform distribution.
 //
-// Arguments:
+// The generated values are uniform integers in the range `[minval, maxval)`.
+// The lower bound `minval` is included in the range, while the upper bound
+// `maxval` is excluded.
 //
-//	buffer_size: The maximum number of elements to buffer in an iterator over
-// this dataset.
+// The random integers are slightly biased unless `maxval - minval` is an exact
+// power of two.  The bias is small for values of `maxval - minval` significantly
+// smaller than the range of the output (either `2^32` or `2^64`).
 //
+// Arguments:
+//	shape: The shape of the output tensor.
+//	minval: 0-D.  Inclusive lower bound on the generated integers.
+//	maxval: 0-D.  Exclusive upper bound on the generated integers.
 //
-func PrefetchDataset(scope *Scope, input_dataset tf.Output, buffer_size tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// Returns A tensor of the specified shape filled with uniform random integers.
+func RandomUniformInt(scope *Scope, shape tf.Output, minval tf.Output, maxval tf.Output, optional ...RandomUniformIntAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "PrefetchDataset",
+		Type: "RandomUniformInt",
 		Input: []tf.Input{
-			input_dataset, buffer_size,
+			shape, minval, maxval,
 		},
 		Attrs: attrs,
 	}
@@ -18375,223 +17583,259 @@ func PrefetchDataset(scope *Scope, input_dataset tf.Output, buffer_size tf.Outpu
 	return op.Output(0)
 }
 
-// TensorSummaryAttr is an optional argument to TensorSummary.
-type TensorSummaryAttr func(optionalAttr)
+// SkipgramAttr is an optional argument to Skipgram.
+type SkipgramAttr func(optionalAttr)
 
-// TensorSummaryDescription sets the optional description attribute to value.
+// SkipgramWindowSize sets the optional window_size attribute to value.
 //
-// value: A json-encoded SummaryDescription proto.
-// If not specified, defaults to ""
-func TensorSummaryDescription(value string) TensorSummaryAttr {
+// value: The number of words to predict to the left and right of the target.
+// If not specified, defaults to 5
+func SkipgramWindowSize(value int64) SkipgramAttr {
 	return func(m optionalAttr) {
-		m["description"] = value
+		m["window_size"] = value
 	}
 }
 
-// TensorSummaryLabels sets the optional labels attribute to value.
+// SkipgramMinCount sets the optional min_count attribute to value.
 //
-// value: An unused list of strings.
-// If not specified, defaults to <>
-func TensorSummaryLabels(value []string) TensorSummaryAttr {
+// value: The minimum number of word occurrences for it to be included in the
+// vocabulary.
+// If not specified, defaults to 5
+func SkipgramMinCount(value int64) SkipgramAttr {
 	return func(m optionalAttr) {
-		m["labels"] = value
+		m["min_count"] = value
 	}
 }
 
-// TensorSummaryDisplayName sets the optional display_name attribute to value.
+// SkipgramSubsample sets the optional subsample attribute to value.
 //
-// value: An unused string.
-// If not specified, defaults to ""
-func TensorSummaryDisplayName(value string) TensorSummaryAttr {
+// value: Threshold for word occurrence. Words that appear with higher
+// frequency will be randomly down-sampled. Set to 0 to disable.
+// If not specified, defaults to 0.001
+func SkipgramSubsample(value float32) SkipgramAttr {
 	return func(m optionalAttr) {
-		m["display_name"] = value
+		m["subsample"] = value
 	}
 }
 
-// Outputs a `Summary` protocol buffer with a tensor.
+// Parses a text file and creates a batch of examples.
 //
-// This op is being phased out in favor of TensorSummaryV2, which lets callers pass
-// a tag as well as a serialized SummaryMetadata proto string that contains
-// plugin-specific data. We will keep this op to maintain backwards compatibility.
+// DEPRECATED at GraphDef version 19: Moving word2vec into tensorflow_models/tutorials and deprecating its ops here as a result
 //
 // Arguments:
-//	tensor: A tensor to serialize.
-func TensorSummary(scope *Scope, tensor tf.Output, optional ...TensorSummaryAttr) (summary tf.Output) {
+//	filename: The corpus's text file name.
+//	batch_size: The size of produced batch.
+//
+// Returns A vector of words in the corpus.Frequencies of words. Sorted in the non-ascending order.Number of words per epoch in the data file.The current epoch number.The total number of words processed so far.A vector of word ids.A vector of word ids.
+func Skipgram(scope *Scope, filename string, batch_size int64, optional ...SkipgramAttr) (vocab_word tf.Output, vocab_freq tf.Output, words_per_epoch tf.Output, current_epoch tf.Output, total_words_processed tf.Output, examples tf.Output, labels tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"filename": filename, "batch_size": batch_size}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "TensorSummary",
-		Input: []tf.Input{
-			tensor,
-		},
+		Type: "Skipgram",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4), op.Output(5), op.Output(6)
 }
 
-// Computes the gradient for the tanh of `x` wrt its input.
+// StringToNumberAttr is an optional argument to StringToNumber.
+type StringToNumberAttr func(optionalAttr)
+
+// StringToNumberOutType sets the optional out_type attribute to value.
 //
-// Specifically, `grad = dy * (1 - y*y)`, where `y = tanh(x)`, and `dy`
-// is the corresponding input gradient.
-func TanhGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "TanhGrad",
-		Input: []tf.Input{
-			y, dy,
-		},
+// value: The numeric type to interpret each string in `string_tensor` as.
+// If not specified, defaults to DT_FLOAT
+func StringToNumberOutType(value tf.DataType) StringToNumberAttr {
+	return func(m optionalAttr) {
+		m["out_type"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Outputs a `Summary` protocol buffer with scalar values.
-//
-// The input `tags` and `values` must have the same shape.  The generated summary
-// has a summary value for each tag-value pair in `tags` and `values`.
+// Converts each string in the input Tensor to the specified numeric type.
 //
-// Arguments:
-//	tags: Tags for the summary.
-//	values: Same shape as `tags.  Values for the summary.
+// (Note that int32 overflow results in an error while float overflow
+// results in a rounded value.)
 //
-// Returns Scalar.  Serialized `Summary` protocol buffer.
-func ScalarSummary(scope *Scope, tags tf.Output, values tf.Output) (summary tf.Output) {
+// Returns A Tensor of the same shape as the input `string_tensor`.
+func StringToNumber(scope *Scope, string_tensor tf.Output, optional ...StringToNumberAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "ScalarSummary",
+		Type: "StringToNumber",
 		Input: []tf.Input{
-			tags, values,
+			string_tensor,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Outputs a `Summary` protocol buffer with a histogram.
+// ResourceApplyFtrlV2Attr is an optional argument to ResourceApplyFtrlV2.
+type ResourceApplyFtrlV2Attr func(optionalAttr)
+
+// ResourceApplyFtrlV2UseLocking sets the optional use_locking attribute to value.
 //
-// The generated
-// [`Summary`](https://www.tensorflow.org/code/tensorflow/core/framework/summary.proto)
-// has one summary value containing a histogram for `values`.
+// value: If `True`, updating of the var and accum tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceApplyFtrlV2UseLocking(value bool) ResourceApplyFtrlV2Attr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// Update '*var' according to the Ftrl-proximal scheme.
 //
-// This op reports an `InvalidArgument` error if any value is not finite.
+// grad_with_shrinkage = grad + 2 * l2_shrinkage * var
+// accum_new = accum + grad_with_shrinkage * grad_with_shrinkage
+// linear += grad_with_shrinkage +
+//     (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
+// quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
+// var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
+// accum = accum_new
 //
 // Arguments:
-//	tag: Scalar.  Tag to use for the `Summary.Value`.
-//	values: Any shape. Values to use to build the histogram.
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	linear: Should be from a Variable().
+//	grad: The gradient.
+//	lr: Scaling factor. Must be a scalar.
+//	l1: L1 regulariation. Must be a scalar.
+//	l2: L2 shrinkage regulariation. Must be a scalar.
 //
-// Returns Scalar. Serialized `Summary` protocol buffer.
-func HistogramSummary(scope *Scope, tag tf.Output, values tf.Output) (summary tf.Output) {
+//	lr_power: Scaling factor. Must be a scalar.
+//
+// Returns the created operation.
+func ResourceApplyFtrlV2(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, l2_shrinkage tf.Output, lr_power tf.Output, optional ...ResourceApplyFtrlV2Attr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "HistogramSummary",
+		Type: "ResourceApplyFtrlV2",
 		Input: []tf.Input{
-			tag, values,
+			var_, accum, linear, grad, lr, l1, l2, l2_shrinkage, lr_power,
 		},
+		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Computes the number of elements in the given queue.
+// TruncatedNormalAttr is an optional argument to TruncatedNormal.
+type TruncatedNormalAttr func(optionalAttr)
+
+// TruncatedNormalSeed sets the optional seed attribute to value.
+//
+// value: If either `seed` or `seed2` are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func TruncatedNormalSeed(value int64) TruncatedNormalAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// TruncatedNormalSeed2 sets the optional seed2 attribute to value.
+//
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func TruncatedNormalSeed2(value int64) TruncatedNormalAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Outputs random values from a truncated normal distribution.
+//
+// The generated values follow a normal distribution with mean 0 and standard
+// deviation 1, except that values whose magnitude is more than 2 standard
+// deviations from the mean are dropped and re-picked.
 //
 // Arguments:
-//	handle: The handle to a queue.
+//	shape: The shape of the output tensor.
+//	dtype: The type of the output.
 //
-// Returns The number of elements in the given queue.
-func QueueSizeV2(scope *Scope, handle tf.Output) (size tf.Output) {
+// Returns A tensor of the specified shape filled with random truncated normal
+// values.
+func TruncatedNormal(scope *Scope, shape tf.Output, dtype tf.DataType, optional ...TruncatedNormalAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"dtype": dtype}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "QueueSizeV2",
+		Type: "TruncatedNormal",
 		Input: []tf.Input{
-			handle,
+			shape,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ImageSummaryAttr is an optional argument to ImageSummary.
-type ImageSummaryAttr func(optionalAttr)
+// RandomShuffleAttr is an optional argument to RandomShuffle.
+type RandomShuffleAttr func(optionalAttr)
 
-// ImageSummaryMaxImages sets the optional max_images attribute to value.
-//
-// value: Max number of batch elements to generate images for.
-// If not specified, defaults to 3
+// RandomShuffleSeed sets the optional seed attribute to value.
 //
-// REQUIRES: value >= 1
-func ImageSummaryMaxImages(value int64) ImageSummaryAttr {
+// value: If either `seed` or `seed2` are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func RandomShuffleSeed(value int64) RandomShuffleAttr {
 	return func(m optionalAttr) {
-		m["max_images"] = value
+		m["seed"] = value
 	}
 }
 
-// ImageSummaryBadColor sets the optional bad_color attribute to value.
+// RandomShuffleSeed2 sets the optional seed2 attribute to value.
 //
-// value: Color to use for pixels with non-finite values.
-// If not specified, defaults to <dtype:DT_UINT8 tensor_shape:<dim:<size:4 > > int_val:255 int_val:0 int_val:0 int_val:255 >
-func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr {
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func RandomShuffleSeed2(value int64) RandomShuffleAttr {
 	return func(m optionalAttr) {
-		m["bad_color"] = value
+		m["seed2"] = value
 	}
 }
 
-// Outputs a `Summary` protocol buffer with images.
-//
-// The summary has up to `max_images` summary values containing images. The
-// images are built from `tensor` which must be 4-D with shape `[batch_size,
-// height, width, channels]` and where `channels` can be:
-//
-// *  1: `tensor` is interpreted as Grayscale.
-// *  3: `tensor` is interpreted as RGB.
-// *  4: `tensor` is interpreted as RGBA.
-//
-// The images have the same number of channels as the input tensor. For float
-// input, the values are normalized one image at a time to fit in the range
-// `[0, 255]`.  `uint8` values are unchanged.  The op uses two different
-// normalization algorithms:
-//
-// *  If the input values are all positive, they are rescaled so the largest one
-//    is 255.
-//
-// *  If any input value is negative, the values are shifted so input value 0.0
-//    is at 127.  They are then rescaled so that either the smallest value is 0,
-//    or the largest one is 255.
-//
-// The `tag` argument is a scalar `Tensor` of type `string`.  It is used to
-// build the `tag` of the summary values:
+// Randomly shuffles a tensor along its first dimension.
 //
-// *  If `max_images` is 1, the summary value tag is '*tag*/image'.
-// *  If `max_images` is greater than 1, the summary value tags are
-//    generated sequentially as '*tag*/image/0', '*tag*/image/1', etc.
+//   The tensor is shuffled along dimension 0, such that each `value[j]` is mapped
+//   to one and only one `output[i]`. For example, a mapping that might occur for a
+//   3x2 tensor is:
 //
-// The `bad_color` argument is the color to use in the generated images for
-// non-finite input values.  It is a `unit8` 1-D tensor of length `channels`.
-// Each element must be in the range `[0, 255]` (It represents the value of a
-// pixel in the output image).  Non-finite values in the input tensor are
-// replaced by this tensor in the output image.  The default value is the color
-// red.
+// ```
+// [[1, 2],       [[5, 6],
+//  [3, 4],  ==>   [1, 2],
+//  [5, 6]]        [3, 4]]
+// ```
 //
 // Arguments:
-//	tag: Scalar. Used to build the `tag` attribute of the summary values.
-//	tensor: 4-D of shape `[batch_size, height, width, channels]` where
-// `channels` is 1, 3, or 4.
+//	value: The tensor to be shuffled.
 //
-// Returns Scalar. Serialized `Summary` protocol buffer.
-func ImageSummary(scope *Scope, tag tf.Output, tensor tf.Output, optional ...ImageSummaryAttr) (summary tf.Output) {
+// Returns A tensor of same shape and type as `value`, shuffled along its first
+// dimension.
+func RandomShuffle(scope *Scope, value tf.Output, optional ...RandomShuffleAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -18600,9 +17844,9 @@ func ImageSummary(scope *Scope, tag tf.Output, tensor tf.Output, optional ...Ima
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ImageSummary",
+		Type: "RandomShuffle",
 		Input: []tf.Input{
-			tag, tensor,
+			value,
 		},
 		Attrs: attrs,
 	}
@@ -18610,101 +17854,99 @@ func ImageSummary(scope *Scope, tag tf.Output, tensor tf.Output, optional ...Ima
 	return op.Output(0)
 }
 
-// AudioSummaryV2Attr is an optional argument to AudioSummaryV2.
-type AudioSummaryV2Attr func(optionalAttr)
+// OrderedMapIncompleteSizeAttr is an optional argument to OrderedMapIncompleteSize.
+type OrderedMapIncompleteSizeAttr func(optionalAttr)
 
-// AudioSummaryV2MaxOutputs sets the optional max_outputs attribute to value.
-//
-// value: Max number of batch elements to generate audio for.
-// If not specified, defaults to 3
+// OrderedMapIncompleteSizeCapacity sets the optional capacity attribute to value.
+// If not specified, defaults to 0
 //
-// REQUIRES: value >= 1
-func AudioSummaryV2MaxOutputs(value int64) AudioSummaryV2Attr {
+// REQUIRES: value >= 0
+func OrderedMapIncompleteSizeCapacity(value int64) OrderedMapIncompleteSizeAttr {
 	return func(m optionalAttr) {
-		m["max_outputs"] = value
+		m["capacity"] = value
 	}
 }
 
-// Outputs a `Summary` protocol buffer with audio.
+// OrderedMapIncompleteSizeMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
 //
-// The summary has up to `max_outputs` summary values containing audio. The
-// audio is built from `tensor` which must be 3-D with shape `[batch_size,
-// frames, channels]` or 2-D with shape `[batch_size, frames]`. The values are
-// assumed to be in the range of `[-1.0, 1.0]` with a sample rate of `sample_rate`.
-//
-// The `tag` argument is a scalar `Tensor` of type `string`.  It is used to
-// build the `tag` of the summary values:
-//
-// *  If `max_outputs` is 1, the summary value tag is '*tag*/audio'.
-// *  If `max_outputs` is greater than 1, the summary value tags are
-//    generated sequentially as '*tag*/audio/0', '*tag*/audio/1', etc.
-//
-// Arguments:
-//	tag: Scalar. Used to build the `tag` attribute of the summary values.
-//	tensor: 2-D of shape `[batch_size, frames]`.
-//	sample_rate: The sample rate of the signal in hertz.
-//
-// Returns Scalar. Serialized `Summary` protocol buffer.
-func AudioSummaryV2(scope *Scope, tag tf.Output, tensor tf.Output, sample_rate tf.Output, optional ...AudioSummaryV2Attr) (summary tf.Output) {
+// REQUIRES: value >= 0
+func OrderedMapIncompleteSizeMemoryLimit(value int64) OrderedMapIncompleteSizeAttr {
+	return func(m optionalAttr) {
+		m["memory_limit"] = value
+	}
+}
+
+// OrderedMapIncompleteSizeContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func OrderedMapIncompleteSizeContainer(value string) OrderedMapIncompleteSizeAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// OrderedMapIncompleteSizeSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func OrderedMapIncompleteSizeSharedName(value string) OrderedMapIncompleteSizeAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Op returns the number of incomplete elements in the underlying container.
+func OrderedMapIncompleteSize(scope *Scope, dtypes []tf.DataType, optional ...OrderedMapIncompleteSizeAttr) (size tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"dtypes": dtypes}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "AudioSummaryV2",
-		Input: []tf.Input{
-			tag, tensor, sample_rate,
-		},
+		Type: "OrderedMapIncompleteSize",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// AvgPoolAttr is an optional argument to AvgPool.
-type AvgPoolAttr func(optionalAttr)
+// DecodeRawAttr is an optional argument to DecodeRaw.
+type DecodeRawAttr func(optionalAttr)
 
-// AvgPoolDataFormat sets the optional data_format attribute to value.
+// DecodeRawLittleEndian sets the optional little_endian attribute to value.
 //
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// If not specified, defaults to "NHWC"
-func AvgPoolDataFormat(value string) AvgPoolAttr {
+// value: Whether the input `bytes` are in little-endian order.
+// Ignored for `out_type` values that are stored in a single byte like
+// `uint8`.
+// If not specified, defaults to true
+func DecodeRawLittleEndian(value bool) DecodeRawAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["little_endian"] = value
 	}
 }
 
-// Performs average pooling on the input.
-//
-// Each entry in `output` is the mean of the corresponding size `ksize`
-// window in `value`.
+// Reinterpret the bytes of a string as a vector of numbers.
 //
 // Arguments:
-//	value: 4-D with shape `[batch, height, width, channels]`.
-//	ksize: The size of the sliding window for each dimension of `value`.
-//	strides: The stride of the sliding window for each dimension of `value`.
-//	padding: The type of padding algorithm to use.
+//	bytes: All the elements must have the same length.
 //
-// Returns The average pooled output tensor.
-func AvgPool(scope *Scope, value tf.Output, ksize []int64, strides []int64, padding string, optional ...AvgPoolAttr) (output tf.Output) {
+//
+// Returns A Tensor with one more dimension than the input `bytes`.  The
+// added dimension will have size equal to the length of the elements
+// of `bytes` divided by the number of bytes to represent `out_type`.
+func DecodeRaw(scope *Scope, bytes tf.Output, out_type tf.DataType, optional ...DecodeRawAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	attrs := map[string]interface{}{"out_type": out_type}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "AvgPool",
+		Type: "DecodeRaw",
 		Input: []tf.Input{
-			value,
+			bytes,
 		},
 		Attrs: attrs,
 	}
@@ -18712,113 +17954,131 @@ func AvgPool(scope *Scope, value tf.Output, ksize []int64, strides []int64, padd
 	return op.Output(0)
 }
 
-// Merges summaries.
+// Copy a tensor setting everything outside a central band in each innermost matrix
 //
-// This op creates a
-// [`Summary`](https://www.tensorflow.org/code/tensorflow/core/framework/summary.proto)
-// protocol buffer that contains the union of all the values in the input
-// summaries.
+// to zero.
 //
-// When the Op is run, it reports an `InvalidArgument` error if multiple values
-// in the summaries to merge use the same tag.
+// The `band` part is computed as follows:
+// Assume `input` has `k` dimensions `[I, J, K, ..., M, N]`, then the output is a
+// tensor with the same shape where
 //
-// Arguments:
-//	inputs: Can be of any shape.  Each must contain serialized `Summary` protocol
-// buffers.
+// `band[i, j, k, ..., m, n] = in_band(m, n) * input[i, j, k, ..., m, n]`.
 //
-// Returns Scalar. Serialized `Summary` protocol buffer.
-func MergeSummary(scope *Scope, inputs []tf.Output) (summary tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "MergeSummary",
-		Input: []tf.Input{
-			tf.OutputList(inputs),
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes the gradient of morphological 2-D dilation with respect to the filter.
+// The indicator function
+//
+// `in_band(m, n) = (num_lower < 0 || (m-n) <= num_lower)) &&
+//                  (num_upper < 0 || (n-m) <= num_upper)`.
+//
+// For example:
+//
+// ```
+// # if 'input' is [[ 0,  1,  2, 3]
+//                  [-1,  0,  1, 2]
+//                  [-2, -1,  0, 1]
+//                  [-3, -2, -1, 0]],
+//
+// tf.matrix_band_part(input, 1, -1) ==> [[ 0,  1,  2, 3]
+//                                        [-1,  0,  1, 2]
+//                                        [ 0, -1,  0, 1]
+//                                        [ 0,  0, -1, 0]],
+//
+// tf.matrix_band_part(input, 2, 1) ==> [[ 0,  1,  0, 0]
+//                                       [-1,  0,  1, 0]
+//                                       [-2, -1,  0, 1]
+//                                       [ 0, -2, -1, 0]]
+// ```
+//
+// Useful special cases:
+//
+// ```
+//  tf.matrix_band_part(input, 0, -1) ==> Upper triangular part.
+//  tf.matrix_band_part(input, -1, 0) ==> Lower triangular part.
+//  tf.matrix_band_part(input, 0, 0) ==> Diagonal.
+// ```
 //
 // Arguments:
-//	input: 4-D with shape `[batch, in_height, in_width, depth]`.
-//	filter: 3-D with shape `[filter_height, filter_width, depth]`.
-//	out_backprop: 4-D with shape `[batch, out_height, out_width, depth]`.
-//	strides: 1-D of length 4. The stride of the sliding window for each dimension of
-// the input tensor. Must be: `[1, stride_height, stride_width, 1]`.
-//	rates: 1-D of length 4. The input stride for atrous morphological dilation.
-// Must be: `[1, rate_height, rate_width, 1]`.
-//	padding: The type of padding algorithm to use.
+//	input: Rank `k` tensor.
+//	num_lower: 0-D tensor. Number of subdiagonals to keep. If negative, keep entire
+// lower triangle.
+//	num_upper: 0-D tensor. Number of superdiagonals to keep. If negative, keep
+// entire upper triangle.
 //
-// Returns 3-D with shape `[filter_height, filter_width, depth]`.
-func Dilation2DBackpropFilter(scope *Scope, input tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, rates []int64, padding string) (filter_backprop tf.Output) {
+// Returns Rank `k` tensor of the same shape as input. The extracted banded tensor.
+func MatrixBandPart(scope *Scope, input tf.Output, num_lower tf.Output, num_upper tf.Output) (band tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "rates": rates, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "Dilation2DBackpropFilter",
+		Type: "MatrixBandPart",
 		Input: []tf.Input{
-			input, filter, out_backprop,
+			input, num_lower, num_upper,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// AddSparseToTensorsMapAttr is an optional argument to AddSparseToTensorsMap.
-type AddSparseToTensorsMapAttr func(optionalAttr)
+// CumsumAttr is an optional argument to Cumsum.
+type CumsumAttr func(optionalAttr)
 
-// AddSparseToTensorsMapContainer sets the optional container attribute to value.
+// CumsumExclusive sets the optional exclusive attribute to value.
 //
-// value: The container name for the `SparseTensorsMap` created by this op.
-// If not specified, defaults to ""
-func AddSparseToTensorsMapContainer(value string) AddSparseToTensorsMapAttr {
+// value: If `True`, perform exclusive cumsum.
+// If not specified, defaults to false
+func CumsumExclusive(value bool) CumsumAttr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["exclusive"] = value
 	}
 }
 
-// AddSparseToTensorsMapSharedName sets the optional shared_name attribute to value.
+// CumsumReverse sets the optional reverse attribute to value.
 //
-// value: The shared name for the `SparseTensorsMap` created by this op.
-// If blank, the new Operation's unique name is used.
-// If not specified, defaults to ""
-func AddSparseToTensorsMapSharedName(value string) AddSparseToTensorsMapAttr {
+// value: A `bool` (default: False).
+// If not specified, defaults to false
+func CumsumReverse(value bool) CumsumAttr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["reverse"] = value
 	}
 }
 
-// Add a `SparseTensor` to a `SparseTensorsMap` return its handle.
+// Compute the cumulative sum of the tensor `x` along `axis`.
 //
-// A `SparseTensor` is represented by three tensors: `sparse_indices`,
-// `sparse_values`, and `sparse_shape`.
+// By default, this op performs an inclusive cumsum, which means that the first
+// element of the input is identical to the first element of the output:
 //
-// This operator takes the given `SparseTensor` and adds it to a container
-// object (a `SparseTensorsMap`).  A unique key within this container is generated
-// in the form of an `int64`, and this is the value that is returned.
+// ```python
+// tf.cumsum([a, b, c])  # => [a, a + b, a + b + c]
+// ```
 //
-// The `SparseTensor` can then be read out as part of a minibatch by passing
-// the key as a vector element to `TakeManySparseFromTensorsMap`.  To ensure
-// the correct `SparseTensorsMap` is accessed, ensure that the same
-// `container` and `shared_name` are passed to that Op.  If no `shared_name`
-// is provided here, instead use the *name* of the Operation created by calling
-// `AddSparseToTensorsMap` as the `shared_name` passed to
-// `TakeManySparseFromTensorsMap`.  Ensure the Operations are colocated.
+// By setting the `exclusive` kwarg to `True`, an exclusive cumsum is
+// performed instead:
 //
-// Arguments:
-//	sparse_indices: 2-D.  The `indices` of the `SparseTensor`.
-//	sparse_values: 1-D.  The `values` of the `SparseTensor`.
-//	sparse_shape: 1-D.  The `shape` of the `SparseTensor`.
+// ```python
+// tf.cumsum([a, b, c], exclusive=True)  # => [0, a, a + b]
+// ```
 //
-// Returns 0-D.  The handle of the `SparseTensor` now stored in the
-// `SparseTensorsMap`.
-func AddSparseToTensorsMap(scope *Scope, sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output, optional ...AddSparseToTensorsMapAttr) (sparse_handle tf.Output) {
+// By setting the `reverse` kwarg to `True`, the cumsum is performed in the
+// opposite direction:
+//
+// ```python
+// tf.cumsum([a, b, c], reverse=True)  # => [a + b + c, b + c, c]
+// ```
+//
+// This is more efficient than using separate `tf.reverse` ops.
+//
+// The `reverse` and `exclusive` kwargs can also be combined:
+//
+// ```python
+// tf.cumsum([a, b, c], exclusive=True, reverse=True)  # => [b + c, c, 0]
+// ```
+//
+// Arguments:
+//	x: A `Tensor`. Must be one of the following types: `float32`, `float64`,
+// `int64`, `int32`, `uint8`, `uint16`, `int16`, `int8`, `complex64`,
+// `complex128`, `qint8`, `quint8`, `qint32`, `half`.
+//	axis: A `Tensor` of type `int32` (default: 0). Must be in the range
+// `[-rank(x), rank(x))`.
+func Cumsum(scope *Scope, x tf.Output, axis tf.Output, optional ...CumsumAttr) (out tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -18827,9 +18087,9 @@ func AddSparseToTensorsMap(scope *Scope, sparse_indices tf.Output, sparse_values
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "AddSparseToTensorsMap",
+		Type: "Cumsum",
 		Input: []tf.Input{
-			sparse_indices, sparse_values, sparse_shape,
+			x, axis,
 		},
 		Attrs: attrs,
 	}
@@ -18837,193 +18097,260 @@ func AddSparseToTensorsMap(scope *Scope, sparse_indices tf.Output, sparse_values
 	return op.Output(0)
 }
 
-// Computes the matrix exponential of one or more square matrices:
+// CumprodAttr is an optional argument to Cumprod.
+type CumprodAttr func(optionalAttr)
+
+// CumprodExclusive sets the optional exclusive attribute to value.
 //
-// exp(A) = \sum_{n=0}^\infty A^n/n!
+// value: If `True`, perform exclusive cumprod.
+// If not specified, defaults to false
+func CumprodExclusive(value bool) CumprodAttr {
+	return func(m optionalAttr) {
+		m["exclusive"] = value
+	}
+}
+
+// CumprodReverse sets the optional reverse attribute to value.
 //
-// The exponential is computed using a combination of the scaling and squaring
-// method and the Pade approximation. Details can be founds in:
-// Nicholas J. Higham, "The scaling and squaring method for the matrix exponential
-// revisited," SIAM J. Matrix Anal. Applic., 26:1179-1193, 2005.
+// value: A `bool` (default: False).
+// If not specified, defaults to false
+func CumprodReverse(value bool) CumprodAttr {
+	return func(m optionalAttr) {
+		m["reverse"] = value
+	}
+}
+
+// Compute the cumulative product of the tensor `x` along `axis`.
 //
-// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
-// form square matrices. The output is a tensor of the same shape as the input
-// containing the exponential for all input submatrices `[..., :, :]`.
+// By default, this op performs an inclusive cumprod, which means that the first
+// element of the input is identical to the first element of the output:
 //
-// Arguments:
-//	input: Shape is `[..., M, M]`.
+// ```python
+// tf.cumprod([a, b, c])  # => [a, a * b, a * b * c]
+// ```
 //
-// Returns Shape is `[..., M, M]`.
+// By setting the `exclusive` kwarg to `True`, an exclusive cumprod is
+// performed instead:
 //
-// @compatibility(scipy)
-// Equivalent to scipy.linalg.expm
-// @end_compatibility
-func MatrixExponential(scope *Scope, input tf.Output) (output tf.Output) {
+// ```python
+// tf.cumprod([a, b, c], exclusive=True)  # => [1, a, a * b]
+// ```
+//
+// By setting the `reverse` kwarg to `True`, the cumprod is performed in the
+// opposite direction:
+//
+// ```python
+// tf.cumprod([a, b, c], reverse=True)  # => [a * b * c, b * c, c]
+// ```
+//
+// This is more efficient than using separate `tf.reverse` ops.
+//
+// The `reverse` and `exclusive` kwargs can also be combined:
+//
+// ```python
+// tf.cumprod([a, b, c], exclusive=True, reverse=True)  # => [b * c, c, 1]
+// ```
+//
+// Arguments:
+//	x: A `Tensor`. Must be one of the following types: `float32`, `float64`,
+// `int64`, `int32`, `uint8`, `uint16`, `int16`, `int8`, `complex64`,
+// `complex128`, `qint8`, `quint8`, `qint32`, `half`.
+//	axis: A `Tensor` of type `int32` (default: 0). Must be in the range
+// `[-rank(x), rank(x))`.
+func Cumprod(scope *Scope, x tf.Output, axis tf.Output, optional ...CumprodAttr) (out tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "MatrixExponential",
+		Type: "Cumprod",
 		Input: []tf.Input{
-			input,
+			x, axis,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// QueueDequeueUpToV2Attr is an optional argument to QueueDequeueUpToV2.
-type QueueDequeueUpToV2Attr func(optionalAttr)
+// QuantizedMatMulAttr is an optional argument to QuantizedMatMul.
+type QuantizedMatMulAttr func(optionalAttr)
 
-// QueueDequeueUpToV2TimeoutMs sets the optional timeout_ms attribute to value.
-//
-// value: If the queue has fewer than n elements, this operation
-// will block for up to timeout_ms milliseconds.
-// Note: This option is not supported yet.
-// If not specified, defaults to -1
-func QueueDequeueUpToV2TimeoutMs(value int64) QueueDequeueUpToV2Attr {
+// QuantizedMatMulToutput sets the optional Toutput attribute to value.
+// If not specified, defaults to DT_QINT32
+func QuantizedMatMulToutput(value tf.DataType) QuantizedMatMulAttr {
 	return func(m optionalAttr) {
-		m["timeout_ms"] = value
+		m["Toutput"] = value
 	}
 }
 
-// Dequeues `n` tuples of one or more tensors from the given queue.
+// QuantizedMatMulTransposeA sets the optional transpose_a attribute to value.
 //
-// This operation is not supported by all queues.  If a queue does not support
-// DequeueUpTo, then an Unimplemented error is returned.
+// value: If true, `a` is transposed before multiplication.
+// If not specified, defaults to false
+func QuantizedMatMulTransposeA(value bool) QuantizedMatMulAttr {
+	return func(m optionalAttr) {
+		m["transpose_a"] = value
+	}
+}
+
+// QuantizedMatMulTransposeB sets the optional transpose_b attribute to value.
 //
-// If the queue is closed and there are more than 0 but less than `n`
-// elements remaining, then instead of returning an OutOfRange error like
-// QueueDequeueMany, less than `n` elements are returned immediately.  If
-// the queue is closed and there are 0 elements left in the queue, then
-// an OutOfRange error is returned just like in QueueDequeueMany.
-// Otherwise the behavior is identical to QueueDequeueMany:
+// value: If true, `b` is transposed before multiplication.
+// If not specified, defaults to false
+func QuantizedMatMulTransposeB(value bool) QuantizedMatMulAttr {
+	return func(m optionalAttr) {
+		m["transpose_b"] = value
+	}
+}
+
+// QuantizedMatMulTactivation sets the optional Tactivation attribute to value.
 //
-// This operation concatenates queue-element component tensors along the
-// 0th dimension to make a single component tensor.  All of the components
-// in the dequeued tuple will have size n in the 0th dimension.
+// value: The type of output produced by activation function
+// following this operation.
+// If not specified, defaults to DT_QUINT8
+func QuantizedMatMulTactivation(value tf.DataType) QuantizedMatMulAttr {
+	return func(m optionalAttr) {
+		m["Tactivation"] = value
+	}
+}
+
+// Perform a quantized matrix multiplication of  `a` by the matrix `b`.
 //
-// This operation has `k` outputs, where `k` is the number of components in
-// the tuples stored in the given queue, and output `i` is the ith
-// component of the dequeued tuple.
+// The inputs must be two-dimensional matrices and the inner dimension of
+// `a` (after being transposed if `transpose_a` is non-zero) must match the
+// outer dimension of `b` (after being transposed if `transposed_b` is
+// non-zero).
 //
 // Arguments:
-//	handle: The handle to a queue.
-//	n: The number of tuples to dequeue.
-//	component_types: The type of each component in a tuple.
+//	a: Must be a two-dimensional tensor.
+//	b: Must be a two-dimensional tensor.
+//	min_a: The float value that the lowest quantized `a` value represents.
+//	max_a: The float value that the highest quantized `a` value represents.
+//	min_b: The float value that the lowest quantized `b` value represents.
+//	max_b: The float value that the highest quantized `b` value represents.
 //
-// Returns One or more tensors that were dequeued as a tuple.
-func QueueDequeueUpToV2(scope *Scope, handle tf.Output, n tf.Output, component_types []tf.DataType, optional ...QueueDequeueUpToV2Attr) (components []tf.Output) {
+// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
+func QuantizedMatMul(scope *Scope, a tf.Output, b tf.Output, min_a tf.Output, max_a tf.Output, min_b tf.Output, max_b tf.Output, optional ...QuantizedMatMulAttr) (out tf.Output, min_out tf.Output, max_out tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"component_types": component_types}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "QueueDequeueUpToV2",
+		Type: "QuantizedMatMul",
 		Input: []tf.Input{
-			handle, n,
+			a, b, min_a, max_a, min_b, max_b,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// Does nothing. Serves as a control trigger for scheduling.
+//
+// Only useful as a placeholder for control edges.
+//
+// Returns the created operation.
+func ControlTrigger(scope *Scope) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	var idx int
-	var err error
-	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
-		scope.UpdateErr("QueueDequeueUpToV2", err)
-		return
+	opspec := tf.OpSpec{
+		Type: "ControlTrigger",
 	}
-	return components
+	return scope.AddOperation(opspec)
 }
 
-// Computes the Cholesky decomposition of one or more square matrices.
-//
-// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
-// form square matrices.
-//
-// The input has to be symmetric and positive definite. Only the lower-triangular
-// part of the input will be used for this operation. The upper-triangular part
-// will not be read.
+// Batch normalization.
 //
-// The output is a tensor of the same shape as the input
-// containing the Cholesky decompositions for all input submatrices `[..., :, :]`.
+// DEPRECATED at GraphDef version 9: Use tf.nn.batch_normalization()
 //
-// **Note**: The gradient computation on GPU is faster for large matrices but
-// not for large batch dimensions when the submatrices are small. In this
-// case it might be faster to use the CPU.
+// This op is deprecated. Prefer `tf.nn.batch_normalization`.
 //
 // Arguments:
-//	input: Shape is `[..., M, M]`.
-//
-// Returns Shape is `[..., M, M]`.
-func Cholesky(scope *Scope, input tf.Output) (output tf.Output) {
+//	t: A 4D input Tensor.
+//	m: A 1D mean Tensor with size matching the last dimension of t.
+// This is the first output from tf.nn.moments,
+// or a saved moving average thereof.
+//	v: A 1D variance Tensor with size matching the last dimension of t.
+// This is the second output from tf.nn.moments,
+// or a saved moving average thereof.
+//	beta: A 1D beta Tensor with size matching the last dimension of t.
+// An offset to be added to the normalized tensor.
+//	gamma: A 1D gamma Tensor with size matching the last dimension of t.
+// If "scale_after_normalization" is true, this tensor will be multiplied
+// with the normalized tensor.
+//	variance_epsilon: A small float number to avoid dividing by 0.
+//	scale_after_normalization: A bool indicating whether the resulted tensor
+// needs to be multiplied with gamma.
+func BatchNormWithGlobalNormalization(scope *Scope, t tf.Output, m tf.Output, v tf.Output, beta tf.Output, gamma tf.Output, variance_epsilon float32, scale_after_normalization bool) (result tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"variance_epsilon": variance_epsilon, "scale_after_normalization": scale_after_normalization}
 	opspec := tf.OpSpec{
-		Type: "Cholesky",
+		Type: "BatchNormWithGlobalNormalization",
 		Input: []tf.Input{
-			input,
+			t, m, v, beta, gamma,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Writes contents to the file at input filename. Creates file and recursively
-//
-// creates directory if not existing.
-//
-// Arguments:
-//	filename: scalar. The name of the file to which we write the contents.
-//	contents: scalar. The content to be written to the output file.
+// Deprecated. Use TensorArrayReadV3
 //
-// Returns the created operation.
-func WriteFile(scope *Scope, filename tf.Output, contents tf.Output) (o *tf.Operation) {
+// DEPRECATED at GraphDef version 26: Use TensorArrayReadV3
+func TensorArrayReadV2(scope *Scope, handle tf.Output, index tf.Output, flow_in tf.Output, dtype tf.DataType) (value tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"dtype": dtype}
 	opspec := tf.OpSpec{
-		Type: "WriteFile",
+		Type: "TensorArrayReadV2",
 		Input: []tf.Input{
-			filename, contents,
+			handle, index, flow_in,
 		},
+		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// AllAttr is an optional argument to All.
-type AllAttr func(optionalAttr)
+// QuantizedMulAttr is an optional argument to QuantizedMul.
+type QuantizedMulAttr func(optionalAttr)
 
-// AllKeepDims sets the optional keep_dims attribute to value.
-//
-// value: If true, retain reduced dimensions with length 1.
-// If not specified, defaults to false
-func AllKeepDims(value bool) AllAttr {
+// QuantizedMulToutput sets the optional Toutput attribute to value.
+// If not specified, defaults to DT_QINT32
+func QuantizedMulToutput(value tf.DataType) QuantizedMulAttr {
 	return func(m optionalAttr) {
-		m["keep_dims"] = value
+		m["Toutput"] = value
 	}
 }
 
-// Computes the "logical and" of elements across dimensions of a tensor.
-//
-// Reduces `input` along the dimensions given in `axis`. Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `axis`. If `keep_dims` is true, the reduced dimensions are
-// retained with length 1.
+// Returns x * y element-wise, working on quantized buffers.
 //
 // Arguments:
-//	input: The tensor to reduce.
-//	axis: The dimensions to reduce. Must be in the range
-// `[-rank(input), rank(input))`.
 //
-// Returns The reduced tensor.
-func All(scope *Scope, input tf.Output, axis tf.Output, optional ...AllAttr) (output tf.Output) {
+//
+//	min_x: The float value that the lowest quantized `x` value represents.
+//	max_x: The float value that the highest quantized `x` value represents.
+//	min_y: The float value that the lowest quantized `y` value represents.
+//	max_y: The float value that the highest quantized `y` value represents.
+//
+// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
+//
+// *NOTE*: `QuantizedMul` supports limited forms of broadcasting. More about
+// broadcasting [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func QuantizedMul(scope *Scope, x tf.Output, y tf.Output, min_x tf.Output, max_x tf.Output, min_y tf.Output, max_y tf.Output, optional ...QuantizedMulAttr) (z tf.Output, min_z tf.Output, max_z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -19032,98 +18359,119 @@ func All(scope *Scope, input tf.Output, axis tf.Output, optional ...AllAttr) (ou
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "All",
+		Type: "QuantizedMul",
 		Input: []tf.Input{
-			input, axis,
+			x, y, min_x, max_x, min_y, max_y,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Computes the Eigen Decomposition of a batch of square self-adjoint matrices.
+// QuantizedAddAttr is an optional argument to QuantizedAdd.
+type QuantizedAddAttr func(optionalAttr)
+
+// QuantizedAddToutput sets the optional Toutput attribute to value.
+// If not specified, defaults to DT_QINT32
+func QuantizedAddToutput(value tf.DataType) QuantizedAddAttr {
+	return func(m optionalAttr) {
+		m["Toutput"] = value
+	}
+}
+
+// Returns x + y element-wise, working on quantized buffers.
 //
-// DEPRECATED at GraphDef version 11: Use SelfAdjointEigV2 instead.
+// Arguments:
 //
-// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
-// form square matrices, with the same constraints as the single matrix
-// SelfAdjointEig.
 //
-// The result is a [..., M+1, M] matrix with [..., 0,:] containing the
-// eigenvalues, and subsequent [...,1:, :] containing the eigenvectors.
+//	min_x: The float value that the lowest quantized `x` value represents.
+//	max_x: The float value that the highest quantized `x` value represents.
+//	min_y: The float value that the lowest quantized `y` value represents.
+//	max_y: The float value that the highest quantized `y` value represents.
 //
-// Arguments:
-//	input: Shape is `[..., M, M]`.
+// Returns The float value that the lowest quantized output value represents.The float value that the highest quantized output value represents.
 //
-// Returns Shape is `[..., M+1, M]`.
-func SelfAdjointEig(scope *Scope, input tf.Output) (output tf.Output) {
+// *NOTE*: `QuantizedAdd` supports limited forms of broadcasting. More about
+// broadcasting [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func QuantizedAdd(scope *Scope, x tf.Output, y tf.Output, min_x tf.Output, max_x tf.Output, min_y tf.Output, max_y tf.Output, optional ...QuantizedAddAttr) (z tf.Output, min_z tf.Output, max_z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SelfAdjointEig",
+		Type: "QuantizedAdd",
 		Input: []tf.Input{
-			input,
+			x, y, min_x, max_x, min_y, max_y,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Computes softplus gradients for a softplus operation.
-//
-// Arguments:
-//	gradients: The backpropagated gradients to the corresponding softplus operation.
-//	features: The features passed as input to the corresponding softplus operation.
+// MfccAttr is an optional argument to Mfcc.
+type MfccAttr func(optionalAttr)
+
+// MfccUpperFrequencyLimit sets the optional upper_frequency_limit attribute to value.
 //
-// Returns The gradients: `gradients / (1 + exp(-features))`.
-func SoftplusGrad(scope *Scope, gradients tf.Output, features tf.Output) (backprops tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SoftplusGrad",
-		Input: []tf.Input{
-			gradients, features,
-		},
+// value: The highest frequency to use when calculating the
+// ceptstrum.
+// If not specified, defaults to 4000
+func MfccUpperFrequencyLimit(value float32) MfccAttr {
+	return func(m optionalAttr) {
+		m["upper_frequency_limit"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// SelfAdjointEigV2Attr is an optional argument to SelfAdjointEigV2.
-type SelfAdjointEigV2Attr func(optionalAttr)
+// MfccLowerFrequencyLimit sets the optional lower_frequency_limit attribute to value.
+//
+// value: The lowest frequency to use when calculating the
+// ceptstrum.
+// If not specified, defaults to 20
+func MfccLowerFrequencyLimit(value float32) MfccAttr {
+	return func(m optionalAttr) {
+		m["lower_frequency_limit"] = value
+	}
+}
 
-// SelfAdjointEigV2ComputeV sets the optional compute_v attribute to value.
+// MfccFilterbankChannelCount sets the optional filterbank_channel_count attribute to value.
 //
-// value: If `True` then eigenvectors will be computed and returned in `v`.
-// Otherwise, only the eigenvalues will be computed.
-// If not specified, defaults to true
-func SelfAdjointEigV2ComputeV(value bool) SelfAdjointEigV2Attr {
+// value: Resolution of the Mel bank used internally.
+// If not specified, defaults to 40
+func MfccFilterbankChannelCount(value int64) MfccAttr {
 	return func(m optionalAttr) {
-		m["compute_v"] = value
+		m["filterbank_channel_count"] = value
 	}
 }
 
-// Computes the eigen decomposition of one or more square self-adjoint matrices.
+// MfccDctCoefficientCount sets the optional dct_coefficient_count attribute to value.
 //
-// Computes the eigenvalues and (optionally) eigenvectors of each inner matrix in
-// `input` such that `input[..., :, :] = v[..., :, :] * diag(e[..., :])`.
+// value: How many output channels to produce per time slice.
+// If not specified, defaults to 13
+func MfccDctCoefficientCount(value int64) MfccAttr {
+	return func(m optionalAttr) {
+		m["dct_coefficient_count"] = value
+	}
+}
+
+// Transforms a spectrogram into a form that's useful for speech recognition.
 //
-// ```python
-// # a is a tensor.
-// # e is a tensor of eigenvalues.
-// # v is a tensor of eigenvectors.
-// e, v = self_adjoint_eig(a)
-// e = self_adjoint_eig(a, compute_v=False)
-// ```
+// Mel Frequency Cepstral Coefficients are a way of representing audio data that's
+// been effective as an input feature for machine learning. They are created by
+// taking the spectrum of a spectrogram (a 'cepstrum'), and discarding some of the
+// higher frequencies that are less significant to the human ear. They have a long
+// history in the speech recognition world, and https://en.wikipedia.org/wiki/Mel-frequency_cepstrum
+// is a good resource to learn more.
 //
 // Arguments:
-//	input: `Tensor` input of shape `[N, N]`.
-//
-// Returns Eigenvalues. Shape is `[N]`.Eigenvectors. Shape is `[N, N]`.
-func SelfAdjointEigV2(scope *Scope, input tf.Output, optional ...SelfAdjointEigV2Attr) (e tf.Output, v tf.Output) {
+//	spectrogram: Typically produced by the Spectrogram op, with magnitude_squared
+// set to true.
+//	sample_rate: How many samples per second the source audio used.
+func Mfcc(scope *Scope, spectrogram tf.Output, sample_rate tf.Output, optional ...MfccAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -19132,272 +18480,212 @@ func SelfAdjointEigV2(scope *Scope, input tf.Output, optional ...SelfAdjointEigV
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SelfAdjointEigV2",
+		Type: "Mfcc",
 		Input: []tf.Input{
-			input,
+			spectrogram, sample_rate,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// Adjust the saturation of one or more images.
-//
-// `images` is a tensor of at least 3 dimensions.  The last dimension is
-// interpretted as channels, and must be three.
+// Given a quantized tensor described by (input, input_min, input_max), outputs a
 //
-// The input image is considered in the RGB colorspace. Conceptually, the RGB
-// colors are first mapped into HSV. A scale is then applied all the saturation
-// values, and then remapped back to RGB colorspace.
+// range that covers the actual values present in that tensor.  This op is
+// typically used to produce the requested_output_min and requested_output_max for
+// Requantize.
 //
 // Arguments:
-//	images: Images to adjust.  At least 3-D.
-//	scale: A float scale to add to the saturation.
 //
-// Returns The hue-adjusted image or images.
-func AdjustSaturation(scope *Scope, images tf.Output, scale tf.Output) (output tf.Output) {
+//	input_min: The float value that the minimum quantized input value represents.
+//	input_max: The float value that the maximum quantized input value represents.
+//
+// Returns The computed min output.the computed max output.
+func RequantizationRange(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output) (output_min tf.Output, output_max tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "AdjustSaturation",
+		Type: "RequantizationRange",
 		Input: []tf.Input{
-			images, scale,
+			input, input_min, input_max,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// SvdAttr is an optional argument to Svd.
-type SvdAttr func(optionalAttr)
+// MapPeekAttr is an optional argument to MapPeek.
+type MapPeekAttr func(optionalAttr)
 
-// SvdComputeUv sets the optional compute_uv attribute to value.
+// MapPeekCapacity sets the optional capacity attribute to value.
+// If not specified, defaults to 0
 //
-// value: If true, left and right singular vectors will be
-// computed and returned in `u` and `v`, respectively.
-// If false, `u` and `v` are not set and should never referenced.
-// If not specified, defaults to true
-func SvdComputeUv(value bool) SvdAttr {
+// REQUIRES: value >= 0
+func MapPeekCapacity(value int64) MapPeekAttr {
 	return func(m optionalAttr) {
-		m["compute_uv"] = value
+		m["capacity"] = value
 	}
 }
 
-// SvdFullMatrices sets the optional full_matrices attribute to value.
+// MapPeekMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
 //
-// value: If true, compute full-sized `u` and `v`. If false
-// (the default), compute only the leading `P` singular vectors.
-// Ignored if `compute_uv` is `False`.
-// If not specified, defaults to false
-func SvdFullMatrices(value bool) SvdAttr {
+// REQUIRES: value >= 0
+func MapPeekMemoryLimit(value int64) MapPeekAttr {
 	return func(m optionalAttr) {
-		m["full_matrices"] = value
+		m["memory_limit"] = value
 	}
 }
 
-// Computes the singular value decompositions of one or more matrices.
-//
-// Computes the SVD of each inner matrix in `input` such that
-// `input[..., :, :] = u[..., :, :] * diag(s[..., :, :]) * transpose(v[..., :, :])`
-//
-// ```python
-// # a is a tensor containing a batch of matrices.
-// # s is a tensor of singular values for each matrix.
-// # u is the tensor containing of left singular vectors for each matrix.
-// # v is the tensor containing of right singular vectors for each matrix.
-// s, u, v = svd(a)
-// s, _, _ = svd(a, compute_uv=False)
-// ```
-//
-// Arguments:
-//	input: A tensor of shape `[..., M, N]` whose inner-most 2 dimensions
-// form matrices of size `[M, N]`. Let `P` be the minimum of `M` and `N`.
-//
-// Returns Singular values. Shape is `[..., P]`.Left singular vectors. If `full_matrices` is `False` then shape is
-// `[..., M, P]`; if `full_matrices` is `True` then shape is
-// `[..., M, M]`. Undefined if `compute_uv` is `False`.Left singular vectors. If `full_matrices` is `False` then shape is
-// `[..., N, P]`. If `full_matrices` is `True` then shape is `[..., N, N]`.
-// Undefined if `compute_uv` is false.
-func Svd(scope *Scope, input tf.Output, optional ...SvdAttr) (s tf.Output, u tf.Output, v tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
+// MapPeekContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func MapPeekContainer(value string) MapPeekAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "Svd",
-		Input: []tf.Input{
-			input,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// QueueEnqueueManyV2Attr is an optional argument to QueueEnqueueManyV2.
-type QueueEnqueueManyV2Attr func(optionalAttr)
-
-// QueueEnqueueManyV2TimeoutMs sets the optional timeout_ms attribute to value.
-//
-// value: If the queue is too full, this operation will block for up
-// to timeout_ms milliseconds.
-// Note: This option is not supported yet.
-// If not specified, defaults to -1
-func QueueEnqueueManyV2TimeoutMs(value int64) QueueEnqueueManyV2Attr {
+// MapPeekSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func MapPeekSharedName(value string) MapPeekAttr {
 	return func(m optionalAttr) {
-		m["timeout_ms"] = value
+		m["shared_name"] = value
 	}
 }
 
-// Enqueues zero or more tuples of one or more tensors in the given queue.
-//
-// This operation slices each component tensor along the 0th dimension to
-// make multiple queue elements. All of the tuple components must have the
-// same size in the 0th dimension.
-//
-// The components input has k elements, which correspond to the components of
-// tuples stored in the given queue.
-//
-// N.B. If the queue is full, this operation will block until the given
-// elements have been enqueued (or 'timeout_ms' elapses, if specified).
-//
-// Arguments:
-//	handle: The handle to a queue.
-//	components: One or more tensors from which the enqueued tensors should
-// be taken.
+// Op peeks at the values at the specified key.  If the
 //
-// Returns the created operation.
-func QueueEnqueueManyV2(scope *Scope, handle tf.Output, components []tf.Output, optional ...QueueEnqueueManyV2Attr) (o *tf.Operation) {
+// underlying container does not contain this key
+// this op will block until it does.
+func MapPeek(scope *Scope, key tf.Output, indices tf.Output, dtypes []tf.DataType, optional ...MapPeekAttr) (values []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"dtypes": dtypes}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "QueueEnqueueManyV2",
+		Type: "MapPeek",
 		Input: []tf.Input{
-			handle, tf.OutputList(components),
+			key, indices,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
+		scope.UpdateErr("MapPeek", err)
+		return
+	}
+	return values
 }
 
-// Computes the product along segments of a tensor.
-//
-// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
-// segments.
-//
-// Computes a tensor such that
-// \\(output_i = \prod_j data_j\\) where the product is over `j` such
-// that `segment_ids[j] == i`.
+// Looks up keys in a table, outputs the corresponding values.
 //
-// If the product is empty for a given segment ID `i`, `output[i] = 1`.
+// The tensor `keys` must of the same type as the keys of the table.
+// The output `values` is of the type of the table values.
 //
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentProd.png" alt>
-// </div>
+// The scalar `default_value` is the value output for keys not present in the
+// table. It must also be of the same type as the table values.
 //
 // Arguments:
+//	table_handle: Handle to the table.
+//	keys: Any shape.  Keys to look up.
 //
-//	segment_ids: A 1-D tensor whose rank is equal to the rank of `data`'s
-// first dimension.  Values should be sorted and can be repeated.
 //
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SegmentProd(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
+// Returns Same shape as `keys`.  Values found in the table, or `default_values`
+// for missing keys.
+func LookupTableFindV2(scope *Scope, table_handle tf.Output, keys tf.Output, default_value tf.Output) (values tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SegmentProd",
+		Type: "LookupTableFindV2",
 		Input: []tf.Input{
-			data, segment_ids,
+			table_handle, keys, default_value,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Converts one or more images from RGB to HSV.
+// Bucketizes 'input' based on 'boundaries'.
 //
-// Outputs a tensor of the same shape as the `images` tensor, containing the HSV
-// value of the pixels. The output is only well defined if the value in `images`
-// are in `[0,1]`.
+// For example, if the inputs are
+//     boundaries = [0, 10, 100]
+//     input = [[-5, 10000]
+//              [150,   10]
+//              [5,    100]]
 //
-// `output[..., 0]` contains hue, `output[..., 1]` contains saturation, and
-// `output[..., 2]` contains value. All HSV values are in `[0,1]`. A hue of 0
-// corresponds to pure red, hue 1/3 is pure green, and 2/3 is pure blue.
+// then the output will be
+//     output = [[0, 3]
+//               [3, 2]
+//               [1, 3]]
 //
 // Arguments:
-//	images: 1-D or higher rank. RGB data to convert. Last dimension must be size 3.
+//	input: Any shape of Tensor contains with int or float type.
+//	boundaries: A sorted list of floats gives the boundary of the buckets.
 //
-// Returns `images` converted to HSV.
-func RGBToHSV(scope *Scope, images tf.Output) (output tf.Output) {
+// Returns Same shape with 'input', each value of input replaced with bucket index.
+//
+// @compatibility(numpy)
+// Equivalent to np.digitize.
+// @end_compatibility
+func Bucketize(scope *Scope, input tf.Output, boundaries []float32) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"boundaries": boundaries}
 	opspec := tf.OpSpec{
-		Type: "RGBToHSV",
+		Type: "Bucketize",
 		Input: []tf.Input{
-			images,
+			input,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Does nothing. Only useful as a placeholder for control edges.
-//
-// Returns the created operation.
-func NoOp(scope *Scope) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "NoOp",
-	}
-	return scope.AddOperation(opspec)
-}
-
-// MergeV2CheckpointsAttr is an optional argument to MergeV2Checkpoints.
-type MergeV2CheckpointsAttr func(optionalAttr)
+// EncodePngAttr is an optional argument to EncodePng.
+type EncodePngAttr func(optionalAttr)
 
-// MergeV2CheckpointsDeleteOldDirs sets the optional delete_old_dirs attribute to value.
+// EncodePngCompression sets the optional compression attribute to value.
 //
-// value: see above.
-// If not specified, defaults to true
-func MergeV2CheckpointsDeleteOldDirs(value bool) MergeV2CheckpointsAttr {
+// value: Compression level.
+// If not specified, defaults to -1
+func EncodePngCompression(value int64) EncodePngAttr {
 	return func(m optionalAttr) {
-		m["delete_old_dirs"] = value
+		m["compression"] = value
 	}
 }
 
-// V2 format specific: merges the metadata files of sharded checkpoints.  The
+// PNG-encode an image.
 //
-// result is one logical checkpoint, with one physical metadata file and renamed
-// data files.
+// `image` is a 3-D uint8 or uint16 Tensor of shape `[height, width, channels]`
+// where `channels` is:
 //
-// Intended for "grouping" multiple checkpoints in a sharded checkpoint setup.
+// *   1: for grayscale.
+// *   2: for grayscale + alpha.
+// *   3: for RGB.
+// *   4: for RGBA.
 //
-// If delete_old_dirs is true, attempts to delete recursively the dirname of each
-// path in the input checkpoint_prefixes.  This is useful when those paths are non
-// user-facing temporary locations.
+// The ZLIB compression level, `compression`, can be -1 for the PNG-encoder
+// default or a value from 0 to 9.  9 is the highest compression level, generating
+// the smallest output, but is slower.
 //
 // Arguments:
-//	checkpoint_prefixes: prefixes of V2 checkpoints to merge.
-//	destination_prefix: scalar.  The desired final prefix.  Allowed to be the same
-// as one of the checkpoint_prefixes.
+//	image: 3-D with shape `[height, width, channels]`.
 //
-// Returns the created operation.
-func MergeV2Checkpoints(scope *Scope, checkpoint_prefixes tf.Output, destination_prefix tf.Output, optional ...MergeV2CheckpointsAttr) (o *tf.Operation) {
+// Returns 0-D. PNG-encoded image.
+func EncodePng(scope *Scope, image tf.Output, optional ...EncodePngAttr) (contents tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -19406,697 +18694,459 @@ func MergeV2Checkpoints(scope *Scope, checkpoint_prefixes tf.Output, destination
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MergeV2Checkpoints",
+		Type: "EncodePng",
 		Input: []tf.Input{
-			checkpoint_prefixes, destination_prefix,
+			image,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Saves input tensors slices to disk.
-//
-// This is like `Save` except that tensors can be listed in the saved file as being
-// a slice of a larger tensor.  `shapes_and_slices` specifies the shape of the
-// larger tensor and the slice that this tensor covers. `shapes_and_slices` must
-// have as many elements as `tensor_names`.
-//
-// Elements of the `shapes_and_slices` input must either be:
-//
-// *  The empty string, in which case the corresponding tensor is
-//    saved normally.
-// *  A string of the form `dim0 dim1 ... dimN-1 slice-spec` where the
-//    `dimI` are the dimensions of the larger tensor and `slice-spec`
-//    specifies what part is covered by the tensor to save.
-//
-// `slice-spec` itself is a `:`-separated list: `slice0:slice1:...:sliceN-1`
-// where each `sliceI` is either:
-//
-// *  The string `-` meaning that the slice covers all indices of this dimension
-// *  `start,length` where `start` and `length` are integers.  In that
-//    case the slice covers `length` indices starting at `start`.
+// Updates the table to associates keys with values.
 //
-// See also `Save`.
+// The tensor `keys` must be of the same type as the keys of the table.
+// The tensor `values` must be of the type of the table values.
 //
 // Arguments:
-//	filename: Must have a single element. The name of the file to which we write the
-// tensor.
-//	tensor_names: Shape `[N]`. The names of the tensors to be saved.
-//	shapes_and_slices: Shape `[N]`.  The shapes and slice specifications to use when
-// saving the tensors.
-//	data: `N` tensors to save.
+//	table_handle: Handle to the table.
+//	keys: Any shape.  Keys to look up.
+//	values: Values to associate with keys.
 //
 // Returns the created operation.
-func SaveSlices(scope *Scope, filename tf.Output, tensor_names tf.Output, shapes_and_slices tf.Output, data []tf.Output) (o *tf.Operation) {
+func LookupTableInsertV2(scope *Scope, table_handle tf.Output, keys tf.Output, values tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SaveSlices",
+		Type: "LookupTableInsertV2",
 		Input: []tf.Input{
-			filename, tensor_names, shapes_and_slices, tf.OutputList(data),
+			table_handle, keys, values,
 		},
 	}
 	return scope.AddOperation(opspec)
 }
 
-// DenseToDenseSetOperationAttr is an optional argument to DenseToDenseSetOperation.
-type DenseToDenseSetOperationAttr func(optionalAttr)
-
-// DenseToDenseSetOperationValidateIndices sets the optional validate_indices attribute to value.
-// If not specified, defaults to true
-func DenseToDenseSetOperationValidateIndices(value bool) DenseToDenseSetOperationAttr {
-	return func(m optionalAttr) {
-		m["validate_indices"] = value
+// Returns element-wise smallest integer in not less than x.
+func Ceil(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
+	opspec := tf.OpSpec{
+		Type: "Ceil",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Applies set operation along last dimension of 2 `Tensor` inputs.
+// Computes the number of elements in the given table.
 //
-// See SetOperationOp::SetOperationFromContext for values of `set_operation`.
+// Arguments:
+//	table_handle: Handle to the table.
 //
-// Output `result` is a `SparseTensor` represented by `result_indices`,
-// `result_values`, and `result_shape`. For `set1` and `set2` ranked `n`, this
-// has rank `n` and the same 1st `n-1` dimensions as `set1` and `set2`. The `nth`
-// dimension contains the result of `set_operation` applied to the corresponding
-// `[0...n-1]` dimension of `set`.
-//
-// Arguments:
-//	set1: `Tensor` with rank `n`. 1st `n-1` dimensions must be the same as `set2`.
-// Dimension `n` contains values in a set, duplicates are allowed but ignored.
-//	set2: `Tensor` with rank `n`. 1st `n-1` dimensions must be the same as `set1`.
-// Dimension `n` contains values in a set, duplicates are allowed but ignored.
-//
-//
-// Returns 2D indices of a `SparseTensor`.1D values of a `SparseTensor`.1D `Tensor` shape of a `SparseTensor`. `result_shape[0...n-1]` is
-// the same as the 1st `n-1` dimensions of `set1` and `set2`, `result_shape[n]`
-// is the max result set size across all `0...n-1` dimensions.
-func DenseToDenseSetOperation(scope *Scope, set1 tf.Output, set2 tf.Output, set_operation string, optional ...DenseToDenseSetOperationAttr) (result_indices tf.Output, result_values tf.Output, result_shape tf.Output) {
+// Returns Scalar that contains number of elements in the table.
+func LookupTableSizeV2(scope *Scope, table_handle tf.Output) (size tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"set_operation": set_operation}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "DenseToDenseSetOperation",
+		Type: "LookupTableSizeV2",
 		Input: []tf.Input{
-			set1, set2,
+			table_handle,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Generate a sharded filename. The filename is printf formatted as
+// ResizeBilinearGradAttr is an optional argument to ResizeBilinearGrad.
+type ResizeBilinearGradAttr func(optionalAttr)
+
+// ResizeBilinearGradAlignCorners sets the optional align_corners attribute to value.
 //
-//    %s-%05d-of-%05d, basename, shard, num_shards.
-func ShardedFilename(scope *Scope, basename tf.Output, shard tf.Output, num_shards tf.Output) (filename tf.Output) {
+// value: If true, rescale grads by (orig_height - 1) / (height - 1), which
+// exactly aligns the 4 corners of grads and original_image. If false, rescale by
+// orig_height / height. Treat similarly the width dimension.
+// If not specified, defaults to false
+func ResizeBilinearGradAlignCorners(value bool) ResizeBilinearGradAttr {
+	return func(m optionalAttr) {
+		m["align_corners"] = value
+	}
+}
+
+// Computes the gradient of bilinear interpolation.
+//
+// Arguments:
+//	grads: 4-D with shape `[batch, height, width, channels]`.
+//	original_image: 4-D with shape `[batch, orig_height, orig_width, channels]`,
+// The image tensor that was resized.
+//
+// Returns 4-D with shape `[batch, orig_height, orig_width, channels]`.
+// Gradients with respect to the input image. Input image must have been
+// float or double.
+func ResizeBilinearGrad(scope *Scope, grads tf.Output, original_image tf.Output, optional ...ResizeBilinearGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "ShardedFilename",
+		Type: "ResizeBilinearGrad",
 		Input: []tf.Input{
-			basename, shard, num_shards,
+			grads, original_image,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// BatchToSpace for N-D tensors of type T.
-//
-// This operation reshapes the "batch" dimension 0 into `M + 1` dimensions of shape
-// `block_shape + [batch]`, interleaves these blocks back into the grid defined by
-// the spatial dimensions `[1, ..., M]`, to obtain a result with the same rank as
-// the input.  The spatial dimensions of this intermediate result are then
-// optionally cropped according to `crops` to produce the output.  This is the
-// reverse of SpaceToBatch.  See below for a precise description.
+// Outputs all keys and values in the table.
 //
 // Arguments:
-//	input: N-D with shape `input_shape = [batch] + spatial_shape + remaining_shape`,
-// where spatial_shape has M dimensions.
-//	block_shape: 1-D with shape `[M]`, all values must be >= 1.
-//	crops: 2-D with shape `[M, 2]`, all values must be >= 0.
-//   `crops[i] = [crop_start, crop_end]` specifies the amount to crop from input
-//   dimension `i + 1`, which corresponds to spatial dimension `i`.  It is
-//   required that
-//   `crop_start[i] + crop_end[i] <= block_shape[i] * input_shape[i + 1]`.
-//
-// This operation is equivalent to the following steps:
-//
-// 1. Reshape `input` to `reshaped` of shape:
-//      [block_shape[0], ..., block_shape[M-1],
-//       batch / prod(block_shape),
-//       input_shape[1], ..., input_shape[N-1]]
-//
-// 2. Permute dimensions of `reshaped` to produce `permuted` of shape
-//      [batch / prod(block_shape),
-//
-//       input_shape[1], block_shape[0],
-//       ...,
-//       input_shape[M], block_shape[M-1],
-//
-//       input_shape[M+1], ..., input_shape[N-1]]
-//
-// 3. Reshape `permuted` to produce `reshaped_permuted` of shape
-//      [batch / prod(block_shape),
-//
-//       input_shape[1] * block_shape[0],
-//       ...,
-//       input_shape[M] * block_shape[M-1],
-//
-//       input_shape[M+1],
-//       ...,
-//       input_shape[N-1]]
-//
-// 4. Crop the start and end of dimensions `[1, ..., M]` of
-//    `reshaped_permuted` according to `crops` to produce the output of shape:
-//      [batch / prod(block_shape),
-//
-//       input_shape[1] * block_shape[0] - crops[0,0] - crops[0,1],
-//       ...,
-//       input_shape[M] * block_shape[M-1] - crops[M-1,0] - crops[M-1,1],
-//
-//       input_shape[M+1], ..., input_shape[N-1]]
-//
-// Some examples:
-//
-// (1) For the following input of shape `[4, 1, 1, 1]`, `block_shape = [2, 2]`, and
-//     `crops = [[0, 0], [0, 0]]`:
-//
-// ```
-// [[[[1]]], [[[2]]], [[[3]]], [[[4]]]]
-// ```
-//
-// The output tensor has shape `[1, 2, 2, 1]` and value:
-//
-// ```
-// x = [[[[1], [2]], [[3], [4]]]]
-// ```
-//
-// (2) For the following input of shape `[4, 1, 1, 3]`, `block_shape = [2, 2]`, and
-//     `crops = [[0, 0], [0, 0]]`:
-//
-// ```
-// [[[1, 2, 3]], [[4, 5, 6]], [[7, 8, 9]], [[10, 11, 12]]]
-// ```
-//
-// The output tensor has shape `[1, 2, 2, 3]` and value:
-//
-// ```
-// x = [[[[1, 2, 3], [4, 5, 6]],
-//       [[7, 8, 9], [10, 11, 12]]]]
-// ```
-//
-// (3) For the following input of shape `[4, 2, 2, 1]`, `block_shape = [2, 2]`, and
-//     `crops = [[0, 0], [0, 0]]`:
-//
-// ```
-// x = [[[[1], [3]], [[9], [11]]],
-//      [[[2], [4]], [[10], [12]]],
-//      [[[5], [7]], [[13], [15]]],
-//      [[[6], [8]], [[14], [16]]]]
-// ```
-//
-// The output tensor has shape `[1, 4, 4, 1]` and value:
-//
-// ```
-// x = [[[1],   [2],  [3],  [4]],
-//      [[5],   [6],  [7],  [8]],
-//      [[9],  [10], [11],  [12]],
-//      [[13], [14], [15],  [16]]]
-// ```
-//
-// (4) For the following input of shape `[8, 1, 3, 1]`, `block_shape = [2, 2]`, and
-//     `crops = [[0, 0], [2, 0]]`:
+//	table_handle: Handle to the table.
 //
-// ```
-// x = [[[[0], [1], [3]]], [[[0], [9], [11]]],
-//      [[[0], [2], [4]]], [[[0], [10], [12]]],
-//      [[[0], [5], [7]]], [[[0], [13], [15]]],
-//      [[[0], [6], [8]]], [[[0], [14], [16]]]]
-// ```
 //
-// The output tensor has shape `[2, 2, 4, 1]` and value:
 //
-// ```
-// x = [[[[1],   [2],  [3],  [4]],
-//       [[5],   [6],  [7],  [8]]],
-//      [[[9],  [10], [11],  [12]],
-//       [[13], [14], [15],  [16]]]]
-// ```
-func BatchToSpaceND(scope *Scope, input tf.Output, block_shape tf.Output, crops tf.Output) (output tf.Output) {
+// Returns Vector of all keys present in the table.Tensor of all values in the table. Indexed in parallel with `keys`.
+func LookupTableExportV2(scope *Scope, table_handle tf.Output, Tkeys tf.DataType, Tvalues tf.DataType) (keys tf.Output, values tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"Tkeys": Tkeys, "Tvalues": Tvalues}
 	opspec := tf.OpSpec{
-		Type: "BatchToSpaceND",
+		Type: "LookupTableExportV2",
 		Input: []tf.Input{
-			input, block_shape, crops,
+			table_handle,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// UnpackAttr is an optional argument to Unpack.
-type UnpackAttr func(optionalAttr)
-
-// UnpackAxis sets the optional axis attribute to value.
-//
-// value: Dimension along which to unpack.  Negative values wrap around, so the
-// valid range is `[-R, R)`.
-// If not specified, defaults to 0
-func UnpackAxis(value int64) UnpackAttr {
-	return func(m optionalAttr) {
-		m["axis"] = value
-	}
+	return op.Output(0), op.Output(1)
 }
 
-// Unpacks a given dimension of a rank-`R` tensor into `num` rank-`(R-1)` tensors.
-//
-// Unpacks `num` tensors from `value` by chipping it along the `axis` dimension.
-// For example, given a tensor of shape `(A, B, C, D)`;
-//
-// If `axis == 0` then the i'th tensor in `output` is the slice `value[i, :, :, :]`
-//   and each tensor in `output` will have shape `(B, C, D)`. (Note that the
-//   dimension unpacked along is gone, unlike `split`).
-//
-// If `axis == 1` then the i'th tensor in `output` is the slice `value[:, i, :, :]`
-//   and each tensor in `output` will have shape `(A, C, D)`.
-// Etc.
+// Replaces the contents of the table with the specified keys and values.
 //
-// This is the opposite of `pack`.
+// The tensor `keys` must be of the same type as the keys of the table.
+// The tensor `values` must be of the type of the table values.
 //
 // Arguments:
-//	value: 1-D or higher, with `axis` dimension size equal to `num`.
-//
+//	table_handle: Handle to the table.
+//	keys: Any shape.  Keys to look up.
+//	values: Values to associate with keys.
 //
-// Returns The list of tensors unpacked from `value`.
-func Unpack(scope *Scope, value tf.Output, num int64, optional ...UnpackAttr) (output []tf.Output) {
+// Returns the created operation.
+func LookupTableImportV2(scope *Scope, table_handle tf.Output, keys tf.Output, values tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num": num}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "Unpack",
+		Type: "LookupTableImportV2",
 		Input: []tf.Input{
-			value,
+			table_handle, keys, values,
 		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
-		scope.UpdateErr("Unpack", err)
-		return
 	}
-	return output
+	return scope.AddOperation(opspec)
 }
 
-// Increments variable pointed to by 'resource' until it reaches 'limit'.
-//
-// Arguments:
-//	resource: Should be from a scalar `Variable` node.
-//	limit: If incrementing ref would bring it above limit, instead generates an
-// 'OutOfRange' error.
-//
-//
-// Returns A copy of the input before increment. If nothing else modifies the
-// input, the values produced will all be distinct.
-func ResourceCountUpTo(scope *Scope, resource tf.Output, limit int64, T tf.DataType) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"limit": limit, "T": T}
-	opspec := tf.OpSpec{
-		Type: "ResourceCountUpTo",
-		Input: []tf.Input{
-			resource,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
+// MapUnstageNoKeyAttr is an optional argument to MapUnstageNoKey.
+type MapUnstageNoKeyAttr func(optionalAttr)
 
-// Delete the stack from its resource container.
-//
-// Arguments:
-//	handle: The handle to a stack.
+// MapUnstageNoKeyCapacity sets the optional capacity attribute to value.
+// If not specified, defaults to 0
 //
-// Returns the created operation.
-func StackCloseV2(scope *Scope, handle tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "StackCloseV2",
-		Input: []tf.Input{
-			handle,
-		},
-	}
-	return scope.AddOperation(opspec)
-}
-
-// Generate a glob pattern matching all sharded file names.
-func ShardedFilespec(scope *Scope, basename tf.Output, num_shards tf.Output) (filename tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "ShardedFilespec",
-		Input: []tf.Input{
-			basename, num_shards,
-		},
+// REQUIRES: value >= 0
+func MapUnstageNoKeyCapacity(value int64) MapUnstageNoKeyAttr {
+	return func(m optionalAttr) {
+		m["capacity"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// TextLineReaderV2Attr is an optional argument to TextLineReaderV2.
-type TextLineReaderV2Attr func(optionalAttr)
-
-// TextLineReaderV2SkipHeaderLines sets the optional skip_header_lines attribute to value.
-//
-// value: Number of lines to skip from the beginning of every file.
+// MapUnstageNoKeyMemoryLimit sets the optional memory_limit attribute to value.
 // If not specified, defaults to 0
-func TextLineReaderV2SkipHeaderLines(value int64) TextLineReaderV2Attr {
+//
+// REQUIRES: value >= 0
+func MapUnstageNoKeyMemoryLimit(value int64) MapUnstageNoKeyAttr {
 	return func(m optionalAttr) {
-		m["skip_header_lines"] = value
+		m["memory_limit"] = value
 	}
 }
 
-// TextLineReaderV2Container sets the optional container attribute to value.
-//
-// value: If non-empty, this reader is placed in the given container.
-// Otherwise, a default container is used.
+// MapUnstageNoKeyContainer sets the optional container attribute to value.
 // If not specified, defaults to ""
-func TextLineReaderV2Container(value string) TextLineReaderV2Attr {
+func MapUnstageNoKeyContainer(value string) MapUnstageNoKeyAttr {
 	return func(m optionalAttr) {
 		m["container"] = value
 	}
 }
 
-// TextLineReaderV2SharedName sets the optional shared_name attribute to value.
-//
-// value: If non-empty, this reader is named in the given bucket
-// with this shared_name. Otherwise, the node name is used instead.
+// MapUnstageNoKeySharedName sets the optional shared_name attribute to value.
 // If not specified, defaults to ""
-func TextLineReaderV2SharedName(value string) TextLineReaderV2Attr {
+func MapUnstageNoKeySharedName(value string) MapUnstageNoKeyAttr {
 	return func(m optionalAttr) {
 		m["shared_name"] = value
 	}
 }
 
-// A Reader that outputs the lines of a file delimited by '\n'.
+// Op removes and returns a random (key, value)
 //
-// Returns The handle to reference the Reader.
-func TextLineReaderV2(scope *Scope, optional ...TextLineReaderV2Attr) (reader_handle tf.Output) {
+// from the underlying container.   If the underlying container
+// does not contain elements, the op will block until it does.
+func MapUnstageNoKey(scope *Scope, indices tf.Output, dtypes []tf.DataType, optional ...MapUnstageNoKeyAttr) (key tf.Output, values []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"dtypes": dtypes}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "TextLineReaderV2",
-
+		Type: "MapUnstageNoKey",
+		Input: []tf.Input{
+			indices,
+		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	key = op.Output(idx)
+	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
+		scope.UpdateErr("MapUnstageNoKey", err)
+		return
+	}
+	return key, values
 }
 
-// LoadAndRemapMatrixAttr is an optional argument to LoadAndRemapMatrix.
-type LoadAndRemapMatrixAttr func(optionalAttr)
+// HashTableV2Attr is an optional argument to HashTableV2.
+type HashTableV2Attr func(optionalAttr)
 
-// LoadAndRemapMatrixMaxRowsInMemory sets the optional max_rows_in_memory attribute to value.
+// HashTableV2Container sets the optional container attribute to value.
 //
-// value: The maximum number of rows to load from the checkpoint at
-// once. If less than or equal to 0, the entire matrix will be loaded into
-// memory. Setting this arg trades increased disk reads for lower memory usage.
-// If not specified, defaults to -1
-func LoadAndRemapMatrixMaxRowsInMemory(value int64) LoadAndRemapMatrixAttr {
+// value: If non-empty, this table is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func HashTableV2Container(value string) HashTableV2Attr {
 	return func(m optionalAttr) {
-		m["max_rows_in_memory"] = value
+		m["container"] = value
 	}
 }
 
-// Loads a 2-D (matrix) `Tensor` with name `old_tensor_name` from the checkpoint
-//
-// at `ckpt_path` and potentially reorders its rows and columns using the
-// specified remappings.
-//
-// Most users should use one of the wrapper initializers (such as
-// `tf.contrib.framework.load_and_remap_matrix_initializer`) instead of this
-// function directly.
-//
-// The remappings are 1-D tensors with the following properties:
-//
-// * `row_remapping` must have exactly `num_rows` entries. Row `i` of the output
-//   matrix will be initialized from the row corresponding to index
-//   `row_remapping[i]` in the old `Tensor` from the checkpoint.
-// * `col_remapping` must have either 0 entries (indicating that no column
-//   reordering is needed) or `num_cols` entries. If specified, column `j` of the
-//   output matrix will be initialized from the column corresponding to index
-//   `col_remapping[j]` in the old `Tensor` from the checkpoint.
-// * A value of -1 in either of the remappings signifies a "missing" entry. In that
-//   case, values from the `initializing_values` tensor will be used to fill that
-//   missing row or column. If `row_remapping` has `r` missing entries and
-//   `col_remapping` has `c` missing entries, then the following condition must be
-//   true:
-//
-// `(r * num_cols) + (c * num_rows) - (r * c) == len(initializing_values)`
+// HashTableV2SharedName sets the optional shared_name attribute to value.
 //
-// The remapping tensors can be generated using the GenerateVocabRemapping op.
+// value: If non-empty, this table is shared under the given name across
+// multiple sessions.
+// If not specified, defaults to ""
+func HashTableV2SharedName(value string) HashTableV2Attr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// HashTableV2UseNodeNameSharing sets the optional use_node_name_sharing attribute to value.
 //
-// As an example, with row_remapping = [1, 0, -1], col_remapping = [0, 2, -1],
-// initializing_values = [0.5, -0.5, 0.25, -0.25, 42], and w(i, j) representing
-// the value from row i, column j of the old tensor in the checkpoint, the output
-// matrix will look like the following:
+// value: If true and shared_name is empty, the table is shared
+// using the node name.
+// If not specified, defaults to false
+func HashTableV2UseNodeNameSharing(value bool) HashTableV2Attr {
+	return func(m optionalAttr) {
+		m["use_node_name_sharing"] = value
+	}
+}
+
+// Creates a non-initialized hash table.
 //
-// [[w(1, 0),  w(1, 2),  0.5],
-//  [w(0, 0),  w(0, 2), -0.5],
-//  [0.25,    -0.25,      42]]
+// This op creates a hash table, specifying the type of its keys and values.
+// Before using the table you will have to initialize it.  After initialization the
+// table will be immutable.
 //
 // Arguments:
-//	ckpt_path: Path to the TensorFlow checkpoint (version 2, `TensorBundle`) from
-// which the old matrix `Tensor` will be loaded.
-//	old_tensor_name: Name of the 2-D `Tensor` to load from checkpoint.
-//	row_remapping: An int `Tensor` of row remappings (generally created by
-// `generate_vocab_remapping`).  Even if no row remapping is needed, this must
-// still be an index-valued Tensor (e.g. [0, 1, 2, ...]), or a shifted
-// index-valued `Tensor` (e.g. [8, 9, 10, ...], for partitioned `Variables`).
-//	col_remapping: An int `Tensor` of column remappings (generally created by
-// `generate_vocab_remapping`).  May be a size-0 `Tensor` if only row remapping
-// is to be done (e.g. column ordering is the same).
-//	initializing_values: A float `Tensor` containing  values to fill in for cells
-// in the output matrix that are not loaded from the checkpoint. Length must be
-// exactly the same as the number of missing / new cells.
-//	num_rows: Number of rows (length of the 1st dimension) in the output matrix.
-//	num_cols: Number of columns (length of the 2nd dimension) in the output matrix.
+//	key_dtype: Type of the table keys.
+//	value_dtype: Type of the table values.
 //
-// Returns Output matrix containing existing values loaded from the
-// checkpoint, and with any missing values filled in from initializing_values.
-func LoadAndRemapMatrix(scope *Scope, ckpt_path tf.Output, old_tensor_name tf.Output, row_remapping tf.Output, col_remapping tf.Output, initializing_values tf.Output, num_rows int64, num_cols int64, optional ...LoadAndRemapMatrixAttr) (output_matrix tf.Output) {
+// Returns Handle to a table.
+func HashTableV2(scope *Scope, key_dtype tf.DataType, value_dtype tf.DataType, optional ...HashTableV2Attr) (table_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_rows": num_rows, "num_cols": num_cols}
+	attrs := map[string]interface{}{"key_dtype": key_dtype, "value_dtype": value_dtype}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "LoadAndRemapMatrix",
-		Input: []tf.Input{
-			ckpt_path, old_tensor_name, row_remapping, col_remapping, initializing_values,
-		},
+		Type: "HashTableV2",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// TFRecordReaderV2Attr is an optional argument to TFRecordReaderV2.
-type TFRecordReaderV2Attr func(optionalAttr)
+// MutableHashTableV2Attr is an optional argument to MutableHashTableV2.
+type MutableHashTableV2Attr func(optionalAttr)
 
-// TFRecordReaderV2Container sets the optional container attribute to value.
+// MutableHashTableV2Container sets the optional container attribute to value.
 //
-// value: If non-empty, this reader is placed in the given container.
+// value: If non-empty, this table is placed in the given container.
 // Otherwise, a default container is used.
 // If not specified, defaults to ""
-func TFRecordReaderV2Container(value string) TFRecordReaderV2Attr {
+func MutableHashTableV2Container(value string) MutableHashTableV2Attr {
 	return func(m optionalAttr) {
 		m["container"] = value
 	}
 }
 
-// TFRecordReaderV2SharedName sets the optional shared_name attribute to value.
+// MutableHashTableV2SharedName sets the optional shared_name attribute to value.
 //
-// value: If non-empty, this reader is named in the given bucket
-// with this shared_name. Otherwise, the node name is used instead.
+// value: If non-empty, this table is shared under the given name across
+// multiple sessions.
 // If not specified, defaults to ""
-func TFRecordReaderV2SharedName(value string) TFRecordReaderV2Attr {
+func MutableHashTableV2SharedName(value string) MutableHashTableV2Attr {
 	return func(m optionalAttr) {
 		m["shared_name"] = value
 	}
 }
 
-// TFRecordReaderV2CompressionType sets the optional compression_type attribute to value.
-// If not specified, defaults to ""
-func TFRecordReaderV2CompressionType(value string) TFRecordReaderV2Attr {
+// MutableHashTableV2UseNodeNameSharing sets the optional use_node_name_sharing attribute to value.
+//
+// value: If true and shared_name is empty, the table is shared
+// using the node name.
+// If not specified, defaults to false
+func MutableHashTableV2UseNodeNameSharing(value bool) MutableHashTableV2Attr {
 	return func(m optionalAttr) {
-		m["compression_type"] = value
+		m["use_node_name_sharing"] = value
 	}
 }
 
-// A Reader that outputs the records from a TensorFlow Records file.
+// Creates an empty hash table.
 //
-// Returns The handle to reference the Reader.
-func TFRecordReaderV2(scope *Scope, optional ...TFRecordReaderV2Attr) (reader_handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "TFRecordReaderV2",
-
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// QuantizeAndDequantizeV3Attr is an optional argument to QuantizeAndDequantizeV3.
-type QuantizeAndDequantizeV3Attr func(optionalAttr)
-
-// QuantizeAndDequantizeV3SignedInput sets the optional signed_input attribute to value.
-// If not specified, defaults to true
-func QuantizeAndDequantizeV3SignedInput(value bool) QuantizeAndDequantizeV3Attr {
-	return func(m optionalAttr) {
-		m["signed_input"] = value
-	}
-}
-
-// QuantizeAndDequantizeV3RangeGiven sets the optional range_given attribute to value.
-// If not specified, defaults to true
-func QuantizeAndDequantizeV3RangeGiven(value bool) QuantizeAndDequantizeV3Attr {
-	return func(m optionalAttr) {
-		m["range_given"] = value
-	}
-}
-
-// Quantizes then dequantizes a tensor.
+// This op creates a mutable hash table, specifying the type of its keys and
+// values. Each value must be a scalar. Data can be inserted into the table using
+// the insert operations. It does not support the initialization operation.
 //
-// This is almost identical to QuantizeAndDequantizeV2, except that num_bits is a
-// tensor, so its value can change during training.
-func QuantizeAndDequantizeV3(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output, num_bits tf.Output, optional ...QuantizeAndDequantizeV3Attr) (output tf.Output) {
+// Arguments:
+//	key_dtype: Type of the table keys.
+//	value_dtype: Type of the table values.
+//
+// Returns Handle to a table.
+func MutableHashTableV2(scope *Scope, key_dtype tf.DataType, value_dtype tf.DataType, optional ...MutableHashTableV2Attr) (table_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"key_dtype": key_dtype, "value_dtype": value_dtype}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "QuantizeAndDequantizeV3",
-		Input: []tf.Input{
-			input, input_min, input_max, num_bits,
-		},
+		Type: "MutableHashTableV2",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// IdentityReaderV2Attr is an optional argument to IdentityReaderV2.
-type IdentityReaderV2Attr func(optionalAttr)
+// DequantizeAttr is an optional argument to Dequantize.
+type DequantizeAttr func(optionalAttr)
 
-// IdentityReaderV2Container sets the optional container attribute to value.
-//
-// value: If non-empty, this reader is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func IdentityReaderV2Container(value string) IdentityReaderV2Attr {
+// DequantizeMode sets the optional mode attribute to value.
+// If not specified, defaults to "MIN_COMBINED"
+func DequantizeMode(value string) DequantizeAttr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["mode"] = value
 	}
 }
 
-// IdentityReaderV2SharedName sets the optional shared_name attribute to value.
+// Dequantize the 'input' tensor into a float Tensor.
 //
-// value: If non-empty, this reader is named in the given bucket
-// with this shared_name. Otherwise, the node name is used instead.
-// If not specified, defaults to ""
-func IdentityReaderV2SharedName(value string) IdentityReaderV2Attr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// A Reader that outputs the queued work as both the key and value.
+// [min_range, max_range] are scalar floats that specify the range for
+// the 'input' data. The 'mode' attribute controls exactly which calculations are
+// used to convert the float values to their quantized equivalents.
 //
-// To use, enqueue strings in a Queue.  ReaderRead will take the front
-// work string and output (work, work).
+// In 'MIN_COMBINED' mode, each value of the tensor will undergo the following:
 //
-// Returns The handle to reference the Reader.
-func IdentityReaderV2(scope *Scope, optional ...IdentityReaderV2Attr) (reader_handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "IdentityReaderV2",
-
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// ResourceApplyGradientDescentAttr is an optional argument to ResourceApplyGradientDescent.
-type ResourceApplyGradientDescentAttr func(optionalAttr)
-
-// ResourceApplyGradientDescentUseLocking sets the optional use_locking attribute to value.
+// ```
+// if T == qint8, in[i] += (range(T) + 1)/ 2.0
+// out[i] = min_range + (in[i]* (max_range - min_range) / range(T))
+// ```
+// here `range(T) = numeric_limits<T>::max() - numeric_limits<T>::min()`
 //
-// value: If `True`, the subtraction will be protected by a lock;
-// otherwise the behavior is undefined, but may exhibit less contention.
-// If not specified, defaults to false
-func ResourceApplyGradientDescentUseLocking(value bool) ResourceApplyGradientDescentAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// Update '*var' by subtracting 'alpha' * 'delta' from it.
+// *MIN_COMBINED Mode Example*
+//
+// If the input comes from a QuantizedRelu6, the output type is
+// quint8 (range of 0-255) but the possible range of QuantizedRelu6 is
+// 0-6.  The min_range and max_range values are therefore 0.0 and 6.0.
+// Dequantize on quint8 will take each value, cast to float, and multiply
+// by 6 / 255.
+// Note that if quantizedtype is qint8, the operation will additionally add
+// each value by 128 prior to casting.
+//
+// If the mode is 'MIN_FIRST', then this approach is used:
+//
+// ```c++
+// num_discrete_values = 1 << (# of bits in T)
+// range_adjust = num_discrete_values / (num_discrete_values - 1)
+// range = (range_max - range_min) * range_adjust
+// range_scale = range / num_discrete_values
+// const double offset_input = static_cast<double>(input) - lowest_quantized;
+// result = range_min + ((input - numeric_limits<T>::min()) * range_scale)
+// ```
+//
+// *SCALED mode Example*
+//
+// `SCALED` mode matches the quantization approach used in
+// `QuantizeAndDequantize{V2|V3}`.
+//
+// If the mode is `SCALED`, we do not use the full range of the output type,
+// choosing to elide the lowest possible value for symmetry (e.g., output range is
+// -127 to 127, not -128 to 127 for signed 8 bit quantization), so that 0.0 maps to
+// 0.
+//
+// We first find the range of values in our tensor. The
+// range we use is always centered on 0, so we find m such that
+// ```c++
+//   m = max(abs(input_min), abs(input_max))
+// ```
+//
+// Our input tensor range is then `[-m, m]`.
+//
+// Next, we choose our fixed-point quantization buckets, `[min_fixed, max_fixed]`.
+// If T is signed, this is
+// ```
+//   num_bits = sizeof(T) * 8
+//   [min_fixed, max_fixed] =
+//       [-(1 << (num_bits - 1) - 1), (1 << (num_bits - 1)) - 1]
+// ```
+//
+// Otherwise, if T is unsigned, the fixed-point range is
+// ```
+//   [min_fixed, max_fixed] = [0, (1 << num_bits) - 1]
+// ```
+//
+// From this we compute our scaling factor, s:
+// ```c++
+//   s = (2 * m) / (max_fixed - min_fixed)
+// ```
+//
+// Now we can dequantize the elements of our tensor:
+// ```c++
+// result = input * s
+// ```
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	alpha: Scaling factor. Must be a scalar.
-//	delta: The change.
 //
-// Returns the created operation.
-func ResourceApplyGradientDescent(scope *Scope, var_ tf.Output, alpha tf.Output, delta tf.Output, optional ...ResourceApplyGradientDescentAttr) (o *tf.Operation) {
+//	min_range: The minimum scalar value possibly produced for the input.
+//	max_range: The maximum scalar value possibly produced for the input.
+func Dequantize(scope *Scope, input tf.Output, min_range tf.Output, max_range tf.Output, optional ...DequantizeAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -20105,132 +19155,115 @@ func ResourceApplyGradientDescent(scope *Scope, var_ tf.Output, alpha tf.Output,
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyGradientDescent",
+		Type: "Dequantize",
 		Input: []tf.Input{
-			var_, alpha, delta,
+			input, min_range, max_range,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Returns the next record (key, value pair) produced by a Reader.
-//
-// Will dequeue from the input queue if necessary (e.g. when the
-// Reader needs to start reading from a new file since it has finished
-// with the previous file).
-//
-// Arguments:
-//	reader_handle: Handle to a Reader.
-//	queue_handle: Handle to a Queue, with string work items.
+// Flips all bits elementwise.
 //
-// Returns A scalar.A scalar.
-func ReaderReadV2(scope *Scope, reader_handle tf.Output, queue_handle tf.Output) (key tf.Output, value tf.Output) {
+// The result will have exactly those bits set, that are not set in `x`. The
+// computation is performed on the underlying representation of x.
+func Invert(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ReaderReadV2",
+		Type: "Invert",
 		Input: []tf.Input{
-			reader_handle, queue_handle,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// Returns up to `num_records` (key, value) pairs produced by a Reader.
-//
-// Will dequeue from the input queue if necessary (e.g. when the
-// Reader needs to start reading from a new file since it has finished
-// with the previous file).
-// It may return less than `num_records` even before the last batch.
-//
-// Arguments:
-//	reader_handle: Handle to a `Reader`.
-//	queue_handle: Handle to a `Queue`, with string work items.
-//	num_records: number of records to read from `Reader`.
+// Deprecated. Disallowed in GraphDef version >= 2.
 //
-// Returns A 1-D tensor.A 1-D tensor.
-func ReaderReadUpToV2(scope *Scope, reader_handle tf.Output, queue_handle tf.Output, num_records tf.Output) (keys tf.Output, values tf.Output) {
+// DEPRECATED at GraphDef version 2: Use AdjustContrastv2 instead
+func AdjustContrast(scope *Scope, images tf.Output, contrast_factor tf.Output, min_value tf.Output, max_value tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ReaderReadUpToV2",
+		Type: "AdjustContrast",
 		Input: []tf.Input{
-			reader_handle, queue_handle, num_records,
+			images, contrast_factor, min_value, max_value,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// Restore a Reader to its initial clean state.
+// Table initializer that takes two tensors for keys and values respectively.
 //
 // Arguments:
-//	reader_handle: Handle to a Reader.
+//	table_handle: Handle to a table which will be initialized.
+//	keys: Keys of type Tkey.
+//	values: Values of type Tval.
 //
 // Returns the created operation.
-func ReaderResetV2(scope *Scope, reader_handle tf.Output) (o *tf.Operation) {
+func InitializeTableV2(scope *Scope, table_handle tf.Output, keys tf.Output, values tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ReaderResetV2",
+		Type: "InitializeTableV2",
 		Input: []tf.Input{
-			reader_handle,
+			table_handle, keys, values,
 		},
 	}
 	return scope.AddOperation(opspec)
 }
 
-// ResourceApplyAdamAttr is an optional argument to ResourceApplyAdam.
-type ResourceApplyAdamAttr func(optionalAttr)
-
-// ResourceApplyAdamUseLocking sets the optional use_locking attribute to value.
+// PrintAttr is an optional argument to Print.
+type PrintAttr func(optionalAttr)
+
+// PrintMessage sets the optional message attribute to value.
 //
-// value: If `True`, updating of the var, m, and v tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceApplyAdamUseLocking(value bool) ResourceApplyAdamAttr {
+// value: A string, prefix of the error message.
+// If not specified, defaults to ""
+func PrintMessage(value string) PrintAttr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["message"] = value
 	}
 }
 
-// ResourceApplyAdamUseNesterov sets the optional use_nesterov attribute to value.
+// PrintFirstN sets the optional first_n attribute to value.
 //
-// value: If `True`, uses the nesterov update.
-// If not specified, defaults to false
-func ResourceApplyAdamUseNesterov(value bool) ResourceApplyAdamAttr {
+// value: Only log `first_n` number of times. -1 disables logging.
+// If not specified, defaults to -1
+func PrintFirstN(value int64) PrintAttr {
 	return func(m optionalAttr) {
-		m["use_nesterov"] = value
+		m["first_n"] = value
 	}
 }
 
-// Update '*var' according to the Adam algorithm.
+// PrintSummarize sets the optional summarize attribute to value.
 //
-// lr_t <- learning_rate * sqrt(1 - beta2^t) / (1 - beta1^t)
-// m_t <- beta1 * m_{t-1} + (1 - beta1) * g_t
-// v_t <- beta2 * v_{t-1} + (1 - beta2) * g_t * g_t
-// variable <- variable - lr_t * m_t / (sqrt(v_t) + epsilon)
+// value: Only print this many entries of each tensor.
+// If not specified, defaults to 3
+func PrintSummarize(value int64) PrintAttr {
+	return func(m optionalAttr) {
+		m["summarize"] = value
+	}
+}
+
+// Prints a list of tensors.
+//
+// Passes `input` through to `output` and prints `data` when evaluating.
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	m: Should be from a Variable().
-//	v: Should be from a Variable().
-//	beta1_power: Must be a scalar.
-//	beta2_power: Must be a scalar.
-//	lr: Scaling factor. Must be a scalar.
-//	beta1: Momentum factor. Must be a scalar.
-//	beta2: Momentum factor. Must be a scalar.
-//	epsilon: Ridge term. Must be a scalar.
-//	grad: The gradient.
+//	input: The tensor passed to `output`
+//	data: A list of tensors to print out when op is evaluated.
 //
-// Returns the created operation.
-func ResourceApplyAdam(scope *Scope, var_ tf.Output, m tf.Output, v tf.Output, beta1_power tf.Output, beta2_power tf.Output, lr tf.Output, beta1 tf.Output, beta2 tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyAdamAttr) (o *tf.Operation) {
+// Returns = The unmodified `input` tensor
+func Print(scope *Scope, input tf.Output, data []tf.Output, optional ...PrintAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -20239,73 +19272,54 @@ func ResourceApplyAdam(scope *Scope, var_ tf.Output, m tf.Output, v tf.Output, b
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyAdam",
+		Type: "Print",
 		Input: []tf.Input{
-			var_, m, v, beta1_power, beta2_power, lr, beta1, beta2, epsilon, grad,
+			input, tf.OutputList(data),
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Store the input tensor in the state of the current session.
+// Outputs a `Summary` protocol buffer with a tensor and per-plugin data.
 //
 // Arguments:
-//	value: The tensor to be stored.
-//
-// Returns The handle for the tensor stored in the session state, represented
-// as a ResourceHandle object.
-func GetSessionHandleV2(scope *Scope, value tf.Output) (handle tf.Output) {
+//	tag: A string attached to this summary. Used for organization in TensorBoard.
+//	tensor: A tensor to serialize.
+//	serialized_summary_metadata: A serialized SummaryMetadata proto. Contains plugin
+// data.
+func TensorSummaryV2(scope *Scope, tag tf.Output, tensor tf.Output, serialized_summary_metadata tf.Output) (summary tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "GetSessionHandleV2",
+		Type: "TensorSummaryV2",
 		Input: []tf.Input{
-			value,
+			tag, tensor, serialized_summary_metadata,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResizeBicubicGradAttr is an optional argument to ResizeBicubicGrad.
-type ResizeBicubicGradAttr func(optionalAttr)
-
-// ResizeBicubicGradAlignCorners sets the optional align_corners attribute to value.
-//
-// value: If true, rescale grads by (orig_height - 1) / (height - 1), which
-// exactly aligns the 4 corners of grads and original_image. If false, rescale by
-// orig_height / height. Treat similarly the width dimension.
-// If not specified, defaults to false
-func ResizeBicubicGradAlignCorners(value bool) ResizeBicubicGradAttr {
-	return func(m optionalAttr) {
-		m["align_corners"] = value
-	}
-}
-
-// Computes the gradient of bicubic interpolation.
+// Creates a dataset that asynchronously prefetches elements from `input_dataset`.
 //
 // Arguments:
-//	grads: 4-D with shape `[batch, height, width, channels]`.
-//	original_image: 4-D with shape `[batch, orig_height, orig_width, channels]`,
-// The image tensor that was resized.
 //
-// Returns 4-D with shape `[batch, orig_height, orig_width, channels]`.
-// Gradients with respect to the input image. Input image must have been
-// float or double.
-func ResizeBicubicGrad(scope *Scope, grads tf.Output, original_image tf.Output, optional ...ResizeBicubicGradAttr) (output tf.Output) {
+//	buffer_size: The maximum number of elements to buffer in an iterator over
+// this dataset.
+//
+//
+func PrefetchDataset(scope *Scope, input_dataset tf.Output, buffer_size tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "ResizeBicubicGrad",
+		Type: "PrefetchDataset",
 		Input: []tf.Input{
-			grads, original_image,
+			input_dataset, buffer_size,
 		},
 		Attrs: attrs,
 	}
@@ -20313,74 +19327,48 @@ func ResizeBicubicGrad(scope *Scope, grads tf.Output, original_image tf.Output,
 	return op.Output(0)
 }
 
-// ResizeNearestNeighborAttr is an optional argument to ResizeNearestNeighbor.
-type ResizeNearestNeighborAttr func(optionalAttr)
+// TensorSummaryAttr is an optional argument to TensorSummary.
+type TensorSummaryAttr func(optionalAttr)
 
-// ResizeNearestNeighborAlignCorners sets the optional align_corners attribute to value.
+// TensorSummaryDescription sets the optional description attribute to value.
 //
-// value: If true, rescale input by (new_height - 1) / (height - 1), which
-// exactly aligns the 4 corners of images and resized images. If false, rescale
-// by new_height / height. Treat similarly the width dimension.
-// If not specified, defaults to false
-func ResizeNearestNeighborAlignCorners(value bool) ResizeNearestNeighborAttr {
+// value: A json-encoded SummaryDescription proto.
+// If not specified, defaults to ""
+func TensorSummaryDescription(value string) TensorSummaryAttr {
 	return func(m optionalAttr) {
-		m["align_corners"] = value
+		m["description"] = value
 	}
 }
 
-// Resize `images` to `size` using nearest neighbor interpolation.
-//
-// Arguments:
-//	images: 4-D with shape `[batch, height, width, channels]`.
-//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
-// new size for the images.
+// TensorSummaryLabels sets the optional labels attribute to value.
 //
-// Returns 4-D with shape
-// `[batch, new_height, new_width, channels]`.
-func ResizeNearestNeighbor(scope *Scope, images tf.Output, size tf.Output, optional ...ResizeNearestNeighborAttr) (resized_images tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "ResizeNearestNeighbor",
-		Input: []tf.Input{
-			images, size,
-		},
-		Attrs: attrs,
+// value: An unused list of strings.
+// If not specified, defaults to <>
+func TensorSummaryLabels(value []string) TensorSummaryAttr {
+	return func(m optionalAttr) {
+		m["labels"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// ResizeNearestNeighborGradAttr is an optional argument to ResizeNearestNeighborGrad.
-type ResizeNearestNeighborGradAttr func(optionalAttr)
-
-// ResizeNearestNeighborGradAlignCorners sets the optional align_corners attribute to value.
+// TensorSummaryDisplayName sets the optional display_name attribute to value.
 //
-// value: If true, rescale grads by (orig_height - 1) / (height - 1), which
-// exactly aligns the 4 corners of grads and original_image. If false, rescale by
-// orig_height / height. Treat similarly the width dimension.
-// If not specified, defaults to false
-func ResizeNearestNeighborGradAlignCorners(value bool) ResizeNearestNeighborGradAttr {
+// value: An unused string.
+// If not specified, defaults to ""
+func TensorSummaryDisplayName(value string) TensorSummaryAttr {
 	return func(m optionalAttr) {
-		m["align_corners"] = value
+		m["display_name"] = value
 	}
 }
 
-// Computes the gradient of nearest neighbor interpolation.
+// Outputs a `Summary` protocol buffer with a tensor.
 //
-// Arguments:
-//	grads: 4-D with shape `[batch, height, width, channels]`.
-//	size: = A 1-D int32 Tensor of 2 elements: `orig_height, orig_width`. The
-// original input size.
+// This op is being phased out in favor of TensorSummaryV2, which lets callers pass
+// a tag as well as a serialized SummaryMetadata proto string that contains
+// plugin-specific data. We will keep this op to maintain backwards compatibility.
 //
-// Returns 4-D with shape `[batch, orig_height, orig_width, channels]`. Gradients
-// with respect to the input image.
-func ResizeNearestNeighborGrad(scope *Scope, grads tf.Output, size tf.Output, optional ...ResizeNearestNeighborGradAttr) (output tf.Output) {
+// Arguments:
+//	tensor: A tensor to serialize.
+func TensorSummary(scope *Scope, tensor tf.Output, optional ...TensorSummaryAttr) (summary tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -20389,9 +19377,9 @@ func ResizeNearestNeighborGrad(scope *Scope, grads tf.Output, size tf.Output, op
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResizeNearestNeighborGrad",
+		Type: "TensorSummary",
 		Input: []tf.Input{
-			grads, size,
+			tensor,
 		},
 		Attrs: attrs,
 	}
@@ -20399,173 +19387,217 @@ func ResizeNearestNeighborGrad(scope *Scope, grads tf.Output, size tf.Output, op
 	return op.Output(0)
 }
 
-// ExtractJpegShapeAttr is an optional argument to ExtractJpegShape.
-type ExtractJpegShapeAttr func(optionalAttr)
-
-// ExtractJpegShapeOutputType sets the optional output_type attribute to value.
+// Computes the gradient for the tanh of `x` wrt its input.
 //
-// value: (Optional) The output type of the operation (int32 or int64).
-// Defaults to int32.
-// If not specified, defaults to DT_INT32
-func ExtractJpegShapeOutputType(value tf.DataType) ExtractJpegShapeAttr {
-	return func(m optionalAttr) {
-		m["output_type"] = value
+// Specifically, `grad = dy * (1 - y*y)`, where `y = tanh(x)`, and `dy`
+// is the corresponding input gradient.
+func TanhGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "TanhGrad",
+		Input: []tf.Input{
+			y, dy,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Extract the shape information of a JPEG-encoded image.
+// Outputs a `Summary` protocol buffer with scalar values.
 //
-// This op only parses the image header, so it is much faster than DecodeJpeg.
+// The input `tags` and `values` must have the same shape.  The generated summary
+// has a summary value for each tag-value pair in `tags` and `values`.
 //
 // Arguments:
-//	contents: 0-D. The JPEG-encoded image.
+//	tags: Tags for the summary.
+//	values: Same shape as `tags.  Values for the summary.
 //
-// Returns 1-D. The image shape with format [height, width, channels].
-func ExtractJpegShape(scope *Scope, contents tf.Output, optional ...ExtractJpegShapeAttr) (image_shape tf.Output) {
+// Returns Scalar.  Serialized `Summary` protocol buffer.
+func ScalarSummary(scope *Scope, tags tf.Output, values tf.Output) (summary tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "ExtractJpegShape",
+		Type: "ScalarSummary",
 		Input: []tf.Input{
-			contents,
+			tags, values,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// PaddingFIFOQueueV2Attr is an optional argument to PaddingFIFOQueueV2.
-type PaddingFIFOQueueV2Attr func(optionalAttr)
-
-// PaddingFIFOQueueV2Shapes sets the optional shapes attribute to value.
+// Outputs a `Summary` protocol buffer with a histogram.
 //
-// value: The shape of each component in a value. The length of this attr must
-// be either 0 or the same as the length of component_types.
-// Shapes of fixed rank but variable size are allowed by setting
-// any shape dimension to -1.  In this case, the inputs' shape may vary along
-// the given dimension, and DequeueMany will pad the given dimension with
-// zeros up to the maximum shape of all elements in the given batch.
-// If the length of this attr is 0, different queue elements may have
-// different ranks and shapes, but only one element may be dequeued at a time.
-// If not specified, defaults to <>
+// The generated
+// [`Summary`](https://www.tensorflow.org/code/tensorflow/core/framework/summary.proto)
+// has one summary value containing a histogram for `values`.
 //
-// REQUIRES: len(value) >= 0
-func PaddingFIFOQueueV2Shapes(value []tf.Shape) PaddingFIFOQueueV2Attr {
-	return func(m optionalAttr) {
-		m["shapes"] = value
+// This op reports an `InvalidArgument` error if any value is not finite.
+//
+// Arguments:
+//	tag: Scalar.  Tag to use for the `Summary.Value`.
+//	values: Any shape. Values to use to build the histogram.
+//
+// Returns Scalar. Serialized `Summary` protocol buffer.
+func HistogramSummary(scope *Scope, tag tf.Output, values tf.Output) (summary tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "HistogramSummary",
+		Input: []tf.Input{
+			tag, values,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// PaddingFIFOQueueV2Capacity sets the optional capacity attribute to value.
+// Computes the number of elements in the given queue.
 //
-// value: The upper bound on the number of elements in this queue.
-// Negative numbers mean no limit.
-// If not specified, defaults to -1
-func PaddingFIFOQueueV2Capacity(value int64) PaddingFIFOQueueV2Attr {
-	return func(m optionalAttr) {
-		m["capacity"] = value
+// Arguments:
+//	handle: The handle to a queue.
+//
+// Returns The number of elements in the given queue.
+func QueueSizeV2(scope *Scope, handle tf.Output) (size tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "QueueSizeV2",
+		Input: []tf.Input{
+			handle,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// PaddingFIFOQueueV2Container sets the optional container attribute to value.
+// ImageSummaryAttr is an optional argument to ImageSummary.
+type ImageSummaryAttr func(optionalAttr)
+
+// ImageSummaryMaxImages sets the optional max_images attribute to value.
 //
-// value: If non-empty, this queue is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func PaddingFIFOQueueV2Container(value string) PaddingFIFOQueueV2Attr {
+// value: Max number of batch elements to generate images for.
+// If not specified, defaults to 3
+//
+// REQUIRES: value >= 1
+func ImageSummaryMaxImages(value int64) ImageSummaryAttr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["max_images"] = value
 	}
 }
 
-// PaddingFIFOQueueV2SharedName sets the optional shared_name attribute to value.
+// ImageSummaryBadColor sets the optional bad_color attribute to value.
 //
-// value: If non-empty, this queue will be shared under the given name
-// across multiple sessions.
-// If not specified, defaults to ""
-func PaddingFIFOQueueV2SharedName(value string) PaddingFIFOQueueV2Attr {
+// value: Color to use for pixels with non-finite values.
+// If not specified, defaults to <dtype:DT_UINT8 tensor_shape:<dim:<size:4 > > int_val:255 int_val:0 int_val:0 int_val:255 >
+func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["bad_color"] = value
 	}
 }
 
-// A queue that produces elements in first-in first-out order.
+// Outputs a `Summary` protocol buffer with images.
 //
-// Variable-size shapes are allowed by setting the corresponding shape dimensions
-// to 0 in the shape attr.  In this case DequeueMany will pad up to the maximum
-// size of any given element in the minibatch.  See below for details.
+// The summary has up to `max_images` summary values containing images. The
+// images are built from `tensor` which must be 4-D with shape `[batch_size,
+// height, width, channels]` and where `channels` can be:
+//
+// *  1: `tensor` is interpreted as Grayscale.
+// *  3: `tensor` is interpreted as RGB.
+// *  4: `tensor` is interpreted as RGBA.
+//
+// The images have the same number of channels as the input tensor. For float
+// input, the values are normalized one image at a time to fit in the range
+// `[0, 255]`.  `uint8` values are unchanged.  The op uses two different
+// normalization algorithms:
+//
+// *  If the input values are all positive, they are rescaled so the largest one
+//    is 255.
+//
+// *  If any input value is negative, the values are shifted so input value 0.0
+//    is at 127.  They are then rescaled so that either the smallest value is 0,
+//    or the largest one is 255.
+//
+// The `tag` argument is a scalar `Tensor` of type `string`.  It is used to
+// build the `tag` of the summary values:
+//
+// *  If `max_images` is 1, the summary value tag is '*tag*/image'.
+// *  If `max_images` is greater than 1, the summary value tags are
+//    generated sequentially as '*tag*/image/0', '*tag*/image/1', etc.
+//
+// The `bad_color` argument is the color to use in the generated images for
+// non-finite input values.  It is a `unit8` 1-D tensor of length `channels`.
+// Each element must be in the range `[0, 255]` (It represents the value of a
+// pixel in the output image).  Non-finite values in the input tensor are
+// replaced by this tensor in the output image.  The default value is the color
+// red.
 //
 // Arguments:
-//	component_types: The type of each component in a value.
+//	tag: Scalar. Used to build the `tag` attribute of the summary values.
+//	tensor: 4-D of shape `[batch_size, height, width, channels]` where
+// `channels` is 1, 3, or 4.
 //
-// Returns The handle to the queue.
-func PaddingFIFOQueueV2(scope *Scope, component_types []tf.DataType, optional ...PaddingFIFOQueueV2Attr) (handle tf.Output) {
+// Returns Scalar. Serialized `Summary` protocol buffer.
+func ImageSummary(scope *Scope, tag tf.Output, tensor tf.Output, optional ...ImageSummaryAttr) (summary tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"component_types": component_types}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "PaddingFIFOQueueV2",
-
+		Type: "ImageSummary",
+		Input: []tf.Input{
+			tag, tensor,
+		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// DecodePngAttr is an optional argument to DecodePng.
-type DecodePngAttr func(optionalAttr)
+// AudioSummaryV2Attr is an optional argument to AudioSummaryV2.
+type AudioSummaryV2Attr func(optionalAttr)
 
-// DecodePngChannels sets the optional channels attribute to value.
+// AudioSummaryV2MaxOutputs sets the optional max_outputs attribute to value.
 //
-// value: Number of color channels for the decoded image.
-// If not specified, defaults to 0
-func DecodePngChannels(value int64) DecodePngAttr {
-	return func(m optionalAttr) {
-		m["channels"] = value
-	}
-}
-
-// DecodePngDtype sets the optional dtype attribute to value.
-// If not specified, defaults to DT_UINT8
-func DecodePngDtype(value tf.DataType) DecodePngAttr {
+// value: Max number of batch elements to generate audio for.
+// If not specified, defaults to 3
+//
+// REQUIRES: value >= 1
+func AudioSummaryV2MaxOutputs(value int64) AudioSummaryV2Attr {
 	return func(m optionalAttr) {
-		m["dtype"] = value
+		m["max_outputs"] = value
 	}
 }
 
-// Decode a PNG-encoded image to a uint8 or uint16 tensor.
-//
-// The attr `channels` indicates the desired number of color channels for the
-// decoded image.
-//
-// Accepted values are:
+// Outputs a `Summary` protocol buffer with audio.
 //
-// *   0: Use the number of channels in the PNG-encoded image.
-// *   1: output a grayscale image.
-// *   3: output an RGB image.
-// *   4: output an RGBA image.
+// The summary has up to `max_outputs` summary values containing audio. The
+// audio is built from `tensor` which must be 3-D with shape `[batch_size,
+// frames, channels]` or 2-D with shape `[batch_size, frames]`. The values are
+// assumed to be in the range of `[-1.0, 1.0]` with a sample rate of `sample_rate`.
 //
-// If needed, the PNG-encoded image is transformed to match the requested number
-// of color channels.
+// The `tag` argument is a scalar `Tensor` of type `string`.  It is used to
+// build the `tag` of the summary values:
 //
-// This op also supports decoding JPEGs and non-animated GIFs since the interface
-// is the same, though it is cleaner to use `tf.image.decode_image`.
+// *  If `max_outputs` is 1, the summary value tag is '*tag*/audio'.
+// *  If `max_outputs` is greater than 1, the summary value tags are
+//    generated sequentially as '*tag*/audio/0', '*tag*/audio/1', etc.
 //
 // Arguments:
-//	contents: 0-D.  The PNG-encoded image.
+//	tag: Scalar. Used to build the `tag` attribute of the summary values.
+//	tensor: 2-D of shape `[batch_size, frames]`.
+//	sample_rate: The sample rate of the signal in hertz.
 //
-// Returns 3-D with shape `[height, width, channels]`.
-func DecodePng(scope *Scope, contents tf.Output, optional ...DecodePngAttr) (image tf.Output) {
+// Returns Scalar. Serialized `Summary` protocol buffer.
+func AudioSummaryV2(scope *Scope, tag tf.Output, tensor tf.Output, sample_rate tf.Output, optional ...AudioSummaryV2Attr) (summary tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -20574,9 +19606,9 @@ func DecodePng(scope *Scope, contents tf.Output, optional ...DecodePngAttr) (ima
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "DecodePng",
+		Type: "AudioSummaryV2",
 		Input: []tf.Input{
-			contents,
+			tag, tensor, sample_rate,
 		},
 		Attrs: attrs,
 	}
@@ -20584,484 +19616,377 @@ func DecodePng(scope *Scope, contents tf.Output, optional ...DecodePngAttr) (ima
 	return op.Output(0)
 }
 
-// Decode the first frame of a GIF-encoded image to a uint8 tensor.
-//
-// GIF with frame or transparency compression are not supported
-// convert animated GIF from compressed to uncompressed by:
+// AvgPoolAttr is an optional argument to AvgPool.
+type AvgPoolAttr func(optionalAttr)
+
+// AvgPoolDataFormat sets the optional data_format attribute to value.
 //
-//     convert $src.gif -coalesce $dst.gif
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// If not specified, defaults to "NHWC"
+func AvgPoolDataFormat(value string) AvgPoolAttr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// Performs average pooling on the input.
 //
-// This op also supports decoding JPEGs and PNGs, though it is cleaner to use
-// `tf.image.decode_image`.
+// Each entry in `output` is the mean of the corresponding size `ksize`
+// window in `value`.
 //
 // Arguments:
-//	contents: 0-D.  The GIF-encoded image.
+//	value: 4-D with shape `[batch, height, width, channels]`.
+//	ksize: The size of the sliding window for each dimension of `value`.
+//	strides: The stride of the sliding window for each dimension of `value`.
+//	padding: The type of padding algorithm to use.
 //
-// Returns 4-D with shape `[num_frames, height, width, 3]`. RGB order
-func DecodeGif(scope *Scope, contents tf.Output) (image tf.Output) {
+// Returns The average pooled output tensor.
+func AvgPool(scope *Scope, value tf.Output, ksize []int64, strides []int64, padding string, optional ...AvgPoolAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "DecodeGif",
+		Type: "AvgPool",
 		Input: []tf.Input{
-			contents,
+			value,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResourceApplyCenteredRMSPropAttr is an optional argument to ResourceApplyCenteredRMSProp.
-type ResourceApplyCenteredRMSPropAttr func(optionalAttr)
-
-// ResourceApplyCenteredRMSPropUseLocking sets the optional use_locking attribute to value.
+// Merges summaries.
 //
-// value: If `True`, updating of the var, mg, ms, and mom tensors is
-// protected by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceApplyCenteredRMSPropUseLocking(value bool) ResourceApplyCenteredRMSPropAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// Update '*var' according to the centered RMSProp algorithm.
+// This op creates a
+// [`Summary`](https://www.tensorflow.org/code/tensorflow/core/framework/summary.proto)
+// protocol buffer that contains the union of all the values in the input
+// summaries.
 //
-// The centered RMSProp algorithm uses an estimate of the centered second moment
-// (i.e., the variance) for normalization, as opposed to regular RMSProp, which
-// uses the (uncentered) second moment. This often helps with training, but is
-// slightly more expensive in terms of computation and memory.
-//
-// Note that in dense implementation of this algorithm, mg, ms, and mom will
-// update even if the grad is zero, but in this sparse implementation, mg, ms,
-// and mom will not update in iterations during which the grad is zero.
-//
-// mean_square = decay * mean_square + (1-decay) * gradient ** 2
-// mean_grad = decay * mean_grad + (1-decay) * gradient
-//
-// Delta = learning_rate * gradient / sqrt(mean_square + epsilon - mean_grad ** 2)
-//
-// mg <- rho * mg_{t-1} + (1-rho) * grad
-// ms <- rho * ms_{t-1} + (1-rho) * grad * grad
-// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms - mg * mg + epsilon)
-// var <- var - mom
+// When the Op is run, it reports an `InvalidArgument` error if multiple values
+// in the summaries to merge use the same tag.
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	mg: Should be from a Variable().
-//	ms: Should be from a Variable().
-//	mom: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	rho: Decay rate. Must be a scalar.
-//
-//	epsilon: Ridge term. Must be a scalar.
-//	grad: The gradient.
+//	inputs: Can be of any shape.  Each must contain serialized `Summary` protocol
+// buffers.
 //
-// Returns the created operation.
-func ResourceApplyCenteredRMSProp(scope *Scope, var_ tf.Output, mg tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyCenteredRMSPropAttr) (o *tf.Operation) {
+// Returns Scalar. Serialized `Summary` protocol buffer.
+func MergeSummary(scope *Scope, inputs []tf.Output) (summary tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyCenteredRMSProp",
+		Type: "MergeSummary",
 		Input: []tf.Input{
-			var_, mg, ms, mom, lr, rho, momentum, epsilon, grad,
+			tf.OutputList(inputs),
 		},
-		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Returns a list of tensors with the same shapes and contents as the input
-//
-// tensors.
-//
-// This op can be used to override the gradient for complicated functions. For
-// example, suppose y = f(x) and we wish to apply a custom function g for backprop
-// such that dx = g(dy). In Python,
+// Computes the gradient of morphological 2-D dilation with respect to the filter.
 //
-// ```python
-// with tf.get_default_graph().gradient_override_map(
-//     {'IdentityN': 'OverrideGradientWithG'}):
-//   y, _ = identity_n([f(x), x])
+// Arguments:
+//	input: 4-D with shape `[batch, in_height, in_width, depth]`.
+//	filter: 3-D with shape `[filter_height, filter_width, depth]`.
+//	out_backprop: 4-D with shape `[batch, out_height, out_width, depth]`.
+//	strides: 1-D of length 4. The stride of the sliding window for each dimension of
+// the input tensor. Must be: `[1, stride_height, stride_width, 1]`.
+//	rates: 1-D of length 4. The input stride for atrous morphological dilation.
+// Must be: `[1, rate_height, rate_width, 1]`.
+//	padding: The type of padding algorithm to use.
 //
-// @tf.RegisterGradient('OverrideGradientWithG')
-// def ApplyG(op, dy, _):
-//   return [None, g(dy)]  # Do not backprop to f(x).
-// ```
-func IdentityN(scope *Scope, input []tf.Output) (output []tf.Output) {
+// Returns 3-D with shape `[filter_height, filter_width, depth]`.
+func Dilation2DBackpropFilter(scope *Scope, input tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, rates []int64, padding string) (filter_backprop tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"strides": strides, "rates": rates, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "IdentityN",
+		Type: "Dilation2DBackpropFilter",
 		Input: []tf.Input{
-			tf.OutputList(input),
+			input, filter, out_backprop,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
+	return op.Output(0)
+}
+
+// AddSparseToTensorsMapAttr is an optional argument to AddSparseToTensorsMap.
+type AddSparseToTensorsMapAttr func(optionalAttr)
+
+// AddSparseToTensorsMapContainer sets the optional container attribute to value.
+//
+// value: The container name for the `SparseTensorsMap` created by this op.
+// If not specified, defaults to ""
+func AddSparseToTensorsMapContainer(value string) AddSparseToTensorsMapAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
 	}
-	var idx int
-	var err error
-	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
-		scope.UpdateErr("IdentityN", err)
-		return
+}
+
+// AddSparseToTensorsMapSharedName sets the optional shared_name attribute to value.
+//
+// value: The shared name for the `SparseTensorsMap` created by this op.
+// If blank, the new Operation's unique name is used.
+// If not specified, defaults to ""
+func AddSparseToTensorsMapSharedName(value string) AddSparseToTensorsMapAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
 	}
-	return output
 }
 
-// Computes the gradient of the sigmoid of `x` wrt its input.
+// Add a `SparseTensor` to a `SparseTensorsMap` return its handle.
 //
-// Specifically, `grad = dy * y * (1 - y)`, where `y = sigmoid(x)`, and
-// `dy` is the corresponding input gradient.
-func SigmoidGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
+// A `SparseTensor` is represented by three tensors: `sparse_indices`,
+// `sparse_values`, and `sparse_shape`.
+//
+// This operator takes the given `SparseTensor` and adds it to a container
+// object (a `SparseTensorsMap`).  A unique key within this container is generated
+// in the form of an `int64`, and this is the value that is returned.
+//
+// The `SparseTensor` can then be read out as part of a minibatch by passing
+// the key as a vector element to `TakeManySparseFromTensorsMap`.  To ensure
+// the correct `SparseTensorsMap` is accessed, ensure that the same
+// `container` and `shared_name` are passed to that Op.  If no `shared_name`
+// is provided here, instead use the *name* of the Operation created by calling
+// `AddSparseToTensorsMap` as the `shared_name` passed to
+// `TakeManySparseFromTensorsMap`.  Ensure the Operations are colocated.
+//
+// Arguments:
+//	sparse_indices: 2-D.  The `indices` of the `SparseTensor`.
+//	sparse_values: 1-D.  The `values` of the `SparseTensor`.
+//	sparse_shape: 1-D.  The `shape` of the `SparseTensor`.
+//
+// Returns 0-D.  The handle of the `SparseTensor` now stored in the
+// `SparseTensorsMap`.
+func AddSparseToTensorsMap(scope *Scope, sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output, optional ...AddSparseToTensorsMapAttr) (sparse_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SigmoidGrad",
+		Type: "AddSparseToTensorsMap",
 		Input: []tf.Input{
-			y, dy,
+			sparse_indices, sparse_values, sparse_shape,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Convert one or more images from HSV to RGB.
+// Computes the matrix exponential of one or more square matrices:
 //
-// Outputs a tensor of the same shape as the `images` tensor, containing the RGB
-// value of the pixels. The output is only well defined if the value in `images`
-// are in `[0,1]`.
+// exp(A) = \sum_{n=0}^\infty A^n/n!
 //
-// See `rgb_to_hsv` for a description of the HSV encoding.
+// The exponential is computed using a combination of the scaling and squaring
+// method and the Pade approximation. Details can be founds in:
+// Nicholas J. Higham, "The scaling and squaring method for the matrix exponential
+// revisited," SIAM J. Matrix Anal. Applic., 26:1179-1193, 2005.
+//
+// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
+// form square matrices. The output is a tensor of the same shape as the input
+// containing the exponential for all input submatrices `[..., :, :]`.
 //
 // Arguments:
-//	images: 1-D or higher rank. HSV data to convert. Last dimension must be size 3.
+//	input: Shape is `[..., M, M]`.
 //
-// Returns `images` converted to RGB.
-func HSVToRGB(scope *Scope, images tf.Output) (output tf.Output) {
+// Returns Shape is `[..., M, M]`.
+//
+// @compatibility(scipy)
+// Equivalent to scipy.linalg.expm
+// @end_compatibility
+func MatrixExponential(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "HSVToRGB",
+		Type: "MatrixExponential",
 		Input: []tf.Input{
-			images,
+			input,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// SampleDistortedBoundingBoxV2Attr is an optional argument to SampleDistortedBoundingBoxV2.
-type SampleDistortedBoundingBoxV2Attr func(optionalAttr)
+// QueueDequeueUpToV2Attr is an optional argument to QueueDequeueUpToV2.
+type QueueDequeueUpToV2Attr func(optionalAttr)
 
-// SampleDistortedBoundingBoxV2Seed sets the optional seed attribute to value.
+// QueueDequeueUpToV2TimeoutMs sets the optional timeout_ms attribute to value.
 //
-// value: If either `seed` or `seed2` are set to non-zero, the random number
-// generator is seeded by the given `seed`.  Otherwise, it is seeded by a random
-// seed.
-// If not specified, defaults to 0
-func SampleDistortedBoundingBoxV2Seed(value int64) SampleDistortedBoundingBoxV2Attr {
+// value: If the queue has fewer than n elements, this operation
+// will block for up to timeout_ms milliseconds.
+// Note: This option is not supported yet.
+// If not specified, defaults to -1
+func QueueDequeueUpToV2TimeoutMs(value int64) QueueDequeueUpToV2Attr {
 	return func(m optionalAttr) {
-		m["seed"] = value
+		m["timeout_ms"] = value
 	}
 }
 
-// SampleDistortedBoundingBoxV2Seed2 sets the optional seed2 attribute to value.
+// Dequeues `n` tuples of one or more tensors from the given queue.
 //
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2Attr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
-	}
-}
-
-// SampleDistortedBoundingBoxV2AspectRatioRange sets the optional aspect_ratio_range attribute to value.
+// This operation is not supported by all queues.  If a queue does not support
+// DequeueUpTo, then an Unimplemented error is returned.
 //
-// value: The cropped area of the image must have an aspect ratio =
-// width / height within this range.
-// If not specified, defaults to <f:0.75 f:1.33 >
-func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr {
-	return func(m optionalAttr) {
-		m["aspect_ratio_range"] = value
-	}
-}
-
-// SampleDistortedBoundingBoxV2AreaRange sets the optional area_range attribute to value.
+// If the queue is closed and there are more than 0 but less than `n`
+// elements remaining, then instead of returning an OutOfRange error like
+// QueueDequeueMany, less than `n` elements are returned immediately.  If
+// the queue is closed and there are 0 elements left in the queue, then
+// an OutOfRange error is returned just like in QueueDequeueMany.
+// Otherwise the behavior is identical to QueueDequeueMany:
 //
-// value: The cropped area of the image must contain a fraction of the
-// supplied image within in this range.
-// If not specified, defaults to <f:0.05 f:1 >
-func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr {
-	return func(m optionalAttr) {
-		m["area_range"] = value
-	}
-}
-
-// SampleDistortedBoundingBoxV2MaxAttempts sets the optional max_attempts attribute to value.
+// This operation concatenates queue-element component tensors along the
+// 0th dimension to make a single component tensor.  All of the components
+// in the dequeued tuple will have size n in the 0th dimension.
 //
-// value: Number of attempts at generating a cropped region of the image
-// of the specified constraints. After `max_attempts` failures, return the entire
-// image.
-// If not specified, defaults to 100
-func SampleDistortedBoundingBoxV2MaxAttempts(value int64) SampleDistortedBoundingBoxV2Attr {
-	return func(m optionalAttr) {
-		m["max_attempts"] = value
-	}
-}
-
-// SampleDistortedBoundingBoxV2UseImageIfNoBoundingBoxes sets the optional use_image_if_no_bounding_boxes attribute to value.
-//
-// value: Controls behavior if no bounding boxes supplied.
-// If true, assume an implicit bounding box covering the whole input. If false,
-// raise an error.
-// If not specified, defaults to false
-func SampleDistortedBoundingBoxV2UseImageIfNoBoundingBoxes(value bool) SampleDistortedBoundingBoxV2Attr {
-	return func(m optionalAttr) {
-		m["use_image_if_no_bounding_boxes"] = value
-	}
-}
-
-// Generate a single randomly distorted bounding box for an image.
-//
-// Bounding box annotations are often supplied in addition to ground-truth labels
-// in image recognition or object localization tasks. A common technique for
-// training such a system is to randomly distort an image while preserving
-// its content, i.e. *data augmentation*. This Op outputs a randomly distorted
-// localization of an object, i.e. bounding box, given an `image_size`,
-// `bounding_boxes` and a series of constraints.
-//
-// The output of this Op is a single bounding box that may be used to crop the
-// original image. The output is returned as 3 tensors: `begin`, `size` and
-// `bboxes`. The first 2 tensors can be fed directly into `tf.slice` to crop the
-// image. The latter may be supplied to `tf.image.draw_bounding_boxes` to visualize
-// what the bounding box looks like.
-//
-// Bounding boxes are supplied and returned as `[y_min, x_min, y_max, x_max]`. The
-// bounding box coordinates are floats in `[0.0, 1.0]` relative to the width and
-// height of the underlying image.
-//
-// For example,
-//
-// ```python
-//     # Generate a single distorted bounding box.
-//     begin, size, bbox_for_draw = tf.image.sample_distorted_bounding_box(
-//         tf.shape(image),
-//         bounding_boxes=bounding_boxes)
-//
-//     # Draw the bounding box in an image summary.
-//     image_with_box = tf.image.draw_bounding_boxes(tf.expand_dims(image, 0),
-//                                                   bbox_for_draw)
-//     tf.summary.image('images_with_box', image_with_box)
-//
-//     # Employ the bounding box to distort the image.
-//     distorted_image = tf.slice(image, begin, size)
-// ```
-//
-// Note that if no bounding box information is available, setting
-// `use_image_if_no_bounding_boxes = true` will assume there is a single implicit
-// bounding box covering the whole image. If `use_image_if_no_bounding_boxes` is
-// false and no bounding boxes are supplied, an error is raised.
+// This operation has `k` outputs, where `k` is the number of components in
+// the tuples stored in the given queue, and output `i` is the ith
+// component of the dequeued tuple.
 //
 // Arguments:
-//	image_size: 1-D, containing `[height, width, channels]`.
-//	bounding_boxes: 3-D with shape `[batch, N, 4]` describing the N bounding boxes
-// associated with the image.
-//	min_object_covered: The cropped area of the image must contain at least this
-// fraction of any bounding box supplied. The value of this parameter should be
-// non-negative. In the case of 0, the cropped area does not need to overlap
-// any of the bounding boxes supplied.
+//	handle: The handle to a queue.
+//	n: The number of tuples to dequeue.
+//	component_types: The type of each component in a tuple.
 //
-// Returns 1-D, containing `[offset_height, offset_width, 0]`. Provide as input to
-// `tf.slice`.1-D, containing `[target_height, target_width, -1]`. Provide as input to
-// `tf.slice`.3-D with shape `[1, 1, 4]` containing the distorted bounding box.
-// Provide as input to `tf.image.draw_bounding_boxes`.
-func SampleDistortedBoundingBoxV2(scope *Scope, image_size tf.Output, bounding_boxes tf.Output, min_object_covered tf.Output, optional ...SampleDistortedBoundingBoxV2Attr) (begin tf.Output, size tf.Output, bboxes tf.Output) {
+// Returns One or more tensors that were dequeued as a tuple.
+func QueueDequeueUpToV2(scope *Scope, handle tf.Output, n tf.Output, component_types []tf.DataType, optional ...QueueDequeueUpToV2Attr) (components []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"component_types": component_types}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SampleDistortedBoundingBoxV2",
+		Type: "QueueDequeueUpToV2",
 		Input: []tf.Input{
-			image_size, bounding_boxes, min_object_covered,
+			handle, n,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// ExtractGlimpseAttr is an optional argument to ExtractGlimpse.
-type ExtractGlimpseAttr func(optionalAttr)
-
-// ExtractGlimpseCentered sets the optional centered attribute to value.
-//
-// value: indicates if the offset coordinates are centered relative to
-// the image, in which case the (0, 0) offset is relative to the center
-// of the input images. If false, the (0,0) offset corresponds to the
-// upper left corner of the input images.
-// If not specified, defaults to true
-func ExtractGlimpseCentered(value bool) ExtractGlimpseAttr {
-	return func(m optionalAttr) {
-		m["centered"] = value
-	}
-}
-
-// ExtractGlimpseNormalized sets the optional normalized attribute to value.
-//
-// value: indicates if the offset coordinates are normalized.
-// If not specified, defaults to true
-func ExtractGlimpseNormalized(value bool) ExtractGlimpseAttr {
-	return func(m optionalAttr) {
-		m["normalized"] = value
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// ExtractGlimpseUniformNoise sets the optional uniform_noise attribute to value.
-//
-// value: indicates if the noise should be generated using a
-// uniform distribution or a Gaussian distribution.
-// If not specified, defaults to true
-func ExtractGlimpseUniformNoise(value bool) ExtractGlimpseAttr {
-	return func(m optionalAttr) {
-		m["uniform_noise"] = value
+	var idx int
+	var err error
+	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
+		scope.UpdateErr("QueueDequeueUpToV2", err)
+		return
 	}
+	return components
 }
 
-// Extracts a glimpse from the input tensor.
+// Computes the Cholesky decomposition of one or more square matrices.
 //
-// Returns a set of windows called glimpses extracted at location
-// `offsets` from the input tensor. If the windows only partially
-// overlaps the inputs, the non overlapping areas will be filled with
-// random noise.
+// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
+// form square matrices.
 //
-// The result is a 4-D tensor of shape `[batch_size, glimpse_height,
-// glimpse_width, channels]`. The channels and batch dimensions are the
-// same as that of the input tensor. The height and width of the output
-// windows are specified in the `size` parameter.
+// The input has to be symmetric and positive definite. Only the lower-triangular
+// part of the input will be used for this operation. The upper-triangular part
+// will not be read.
 //
-// The argument `normalized` and `centered` controls how the windows are built:
+// The output is a tensor of the same shape as the input
+// containing the Cholesky decompositions for all input submatrices `[..., :, :]`.
 //
-// * If the coordinates are normalized but not centered, 0.0 and 1.0
-//   correspond to the minimum and maximum of each height and width
-//   dimension.
-// * If the coordinates are both normalized and centered, they range from
-//   -1.0 to 1.0. The coordinates (-1.0, -1.0) correspond to the upper
-//   left corner, the lower right corner is located at (1.0, 1.0) and the
-//   center is at (0, 0).
-// * If the coordinates are not normalized they are interpreted as
-//   numbers of pixels.
+// **Note**: The gradient computation on GPU is faster for large matrices but
+// not for large batch dimensions when the submatrices are small. In this
+// case it might be faster to use the CPU.
 //
 // Arguments:
-//	input: A 4-D float tensor of shape `[batch_size, height, width, channels]`.
-//	size: A 1-D tensor of 2 elements containing the size of the glimpses
-// to extract.  The glimpse height must be specified first, following
-// by the glimpse width.
-//	offsets: A 2-D integer tensor of shape `[batch_size, 2]` containing
-// the y, x locations of the center of each window.
+//	input: Shape is `[..., M, M]`.
 //
-// Returns A tensor representing the glimpses `[batch_size,
-// glimpse_height, glimpse_width, channels]`.
-func ExtractGlimpse(scope *Scope, input tf.Output, size tf.Output, offsets tf.Output, optional ...ExtractGlimpseAttr) (glimpse tf.Output) {
+// Returns Shape is `[..., M, M]`.
+func Cholesky(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "ExtractGlimpse",
+		Type: "Cholesky",
 		Input: []tf.Input{
-			input, size, offsets,
+			input,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// A container for an iterator resource.
+// Writes contents to the file at input filename. Creates file and recursively
 //
-// Returns A handle to the iterator that can be passed to a "MakeIterator"
-// or "IteratorGetNext" op.
-func Iterator(scope *Scope, shared_name string, container string, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// creates directory if not existing.
+//
+// Arguments:
+//	filename: scalar. The name of the file to which we write the contents.
+//	contents: scalar. The content to be written to the output file.
+//
+// Returns the created operation.
+func WriteFile(scope *Scope, filename tf.Output, contents tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"shared_name": shared_name, "container": container, "output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "Iterator",
-
-		Attrs: attrs,
+		Type: "WriteFile",
+		Input: []tf.Input{
+			filename, contents,
+		},
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// CropAndResizeGradImageAttr is an optional argument to CropAndResizeGradImage.
-type CropAndResizeGradImageAttr func(optionalAttr)
+// AllAttr is an optional argument to All.
+type AllAttr func(optionalAttr)
 
-// CropAndResizeGradImageMethod sets the optional method attribute to value.
+// AllKeepDims sets the optional keep_dims attribute to value.
 //
-// value: A string specifying the interpolation method. Only 'bilinear' is
-// supported for now.
-// If not specified, defaults to "bilinear"
-func CropAndResizeGradImageMethod(value string) CropAndResizeGradImageAttr {
+// value: If true, retain reduced dimensions with length 1.
+// If not specified, defaults to false
+func AllKeepDims(value bool) AllAttr {
 	return func(m optionalAttr) {
-		m["method"] = value
+		m["keep_dims"] = value
 	}
 }
 
-// Computes the gradient of the crop_and_resize op wrt the input image tensor.
+// Computes the "logical and" of elements across dimensions of a tensor.
 //
-// Arguments:
-//	grads: A 4-D tensor of shape `[num_boxes, crop_height, crop_width, depth]`.
-//	boxes: A 2-D tensor of shape `[num_boxes, 4]`. The `i`-th row of the tensor
-// specifies the coordinates of a box in the `box_ind[i]` image and is specified
-// in normalized coordinates `[y1, x1, y2, x2]`. A normalized coordinate value of
-// `y` is mapped to the image coordinate at `y * (image_height - 1)`, so as the
-// `[0, 1]` interval of normalized image height is mapped to
-// `[0, image_height - 1] in image height coordinates. We do allow y1 > y2, in
-// which case the sampled crop is an up-down flipped version of the original
-// image. The width dimension is treated similarly. Normalized coordinates
-// outside the `[0, 1]` range are allowed, in which case we use
-// `extrapolation_value` to extrapolate the input image values.
-//	box_ind: A 1-D tensor of shape `[num_boxes]` with int32 values in `[0, batch)`.
-// The value of `box_ind[i]` specifies the image that the `i`-th box refers to.
-//	image_size: A 1-D tensor with value `[batch, image_height, image_width, depth]`
-// containing the original image size. Both `image_height` and `image_width` need
-// to be positive.
+// Reduces `input` along the dimensions given in `axis`. Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `axis`. If `keep_dims` is true, the reduced dimensions are
+// retained with length 1.
 //
+// Arguments:
+//	input: The tensor to reduce.
+//	axis: The dimensions to reduce. Must be in the range
+// `[-rank(input), rank(input))`.
 //
-// Returns A 4-D tensor of shape `[batch, image_height, image_width, depth]`.
-func CropAndResizeGradImage(scope *Scope, grads tf.Output, boxes tf.Output, box_ind tf.Output, image_size tf.Output, T tf.DataType, optional ...CropAndResizeGradImageAttr) (output tf.Output) {
+// Returns The reduced tensor.
+func All(scope *Scope, input tf.Output, axis tf.Output, optional ...AllAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"T": T}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "CropAndResizeGradImage",
+		Type: "All",
 		Input: []tf.Input{
-			grads, boxes, box_ind, image_size,
+			input, axis,
 		},
 		Attrs: attrs,
 	}
@@ -21069,118 +19994,88 @@ func CropAndResizeGradImage(scope *Scope, grads tf.Output, boxes tf.Output, box_
 	return op.Output(0)
 }
 
-// ShuffleDatasetAttr is an optional argument to ShuffleDataset.
-type ShuffleDatasetAttr func(optionalAttr)
-
-// ShuffleDatasetReshuffleEachIteration sets the optional reshuffle_each_iteration attribute to value.
+// Computes the Eigen Decomposition of a batch of square self-adjoint matrices.
 //
-// value: If true, each iterator over this dataset will be given
-// a different pseudorandomly generated seed, based on a sequence seeded by the
-// `seed` and `seed2` inputs. If false, each iterator will be given the same
-// seed, and repeated iteration over this dataset will yield the exact same
-// sequence of results.
-// If not specified, defaults to true
-func ShuffleDatasetReshuffleEachIteration(value bool) ShuffleDatasetAttr {
-	return func(m optionalAttr) {
-		m["reshuffle_each_iteration"] = value
-	}
-}
-
-// Creates a dataset that shuffles elements from `input_dataset` pseudorandomly.
+// DEPRECATED at GraphDef version 11: Use SelfAdjointEigV2 instead.
 //
-// Arguments:
+// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
+// form square matrices, with the same constraints as the single matrix
+// SelfAdjointEig.
 //
-//	buffer_size: The number of output elements to buffer in an iterator over
-// this dataset. Compare with the `min_after_dequeue` attr when creating a
-// `RandomShuffleQueue`.
-//	seed: A scalar seed for the random number generator. If either `seed` or
-// `seed2` is set to be non-zero, the random number generator is seeded
-// by the given seed.  Otherwise, a random seed is used.
-//	seed2: A second scalar seed to avoid seed collision.
+// The result is a [..., M+1, M] matrix with [..., 0,:] containing the
+// eigenvalues, and subsequent [...,1:, :] containing the eigenvectors.
 //
+// Arguments:
+//	input: Shape is `[..., M, M]`.
 //
-func ShuffleDataset(scope *Scope, input_dataset tf.Output, buffer_size tf.Output, seed tf.Output, seed2 tf.Output, output_types []tf.DataType, output_shapes []tf.Shape, optional ...ShuffleDatasetAttr) (handle tf.Output) {
+// Returns Shape is `[..., M+1, M]`.
+func SelfAdjointEig(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "ShuffleDataset",
+		Type: "SelfAdjointEig",
 		Input: []tf.Input{
-			input_dataset, buffer_size, seed, seed2,
+			input,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// 3D fast Fourier transform.
-//
-// Computes the 3-dimensional discrete Fourier transform over the inner-most 3
-// dimensions of `input`.
+// Computes softplus gradients for a softplus operation.
 //
 // Arguments:
-//	input: A complex64 tensor.
-//
-// Returns A complex64 tensor of the same shape as `input`. The inner-most 3
-//   dimensions of `input` are replaced with their 3D Fourier transform.
+//	gradients: The backpropagated gradients to the corresponding softplus operation.
+//	features: The features passed as input to the corresponding softplus operation.
 //
-// @compatibility(numpy)
-// Equivalent to np.fft.fftn with 3 dimensions.
-// @end_compatibility
-func FFT3D(scope *Scope, input tf.Output) (output tf.Output) {
+// Returns The gradients: `gradients / (1 + exp(-features))`.
+func SoftplusGrad(scope *Scope, gradients tf.Output, features tf.Output) (backprops tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "FFT3D",
+		Type: "SoftplusGrad",
 		Input: []tf.Input{
-			input,
+			gradients, features,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// CropAndResizeGradBoxesAttr is an optional argument to CropAndResizeGradBoxes.
-type CropAndResizeGradBoxesAttr func(optionalAttr)
+// SelfAdjointEigV2Attr is an optional argument to SelfAdjointEigV2.
+type SelfAdjointEigV2Attr func(optionalAttr)
 
-// CropAndResizeGradBoxesMethod sets the optional method attribute to value.
+// SelfAdjointEigV2ComputeV sets the optional compute_v attribute to value.
 //
-// value: A string specifying the interpolation method. Only 'bilinear' is
-// supported for now.
-// If not specified, defaults to "bilinear"
-func CropAndResizeGradBoxesMethod(value string) CropAndResizeGradBoxesAttr {
+// value: If `True` then eigenvectors will be computed and returned in `v`.
+// Otherwise, only the eigenvalues will be computed.
+// If not specified, defaults to true
+func SelfAdjointEigV2ComputeV(value bool) SelfAdjointEigV2Attr {
 	return func(m optionalAttr) {
-		m["method"] = value
+		m["compute_v"] = value
 	}
 }
 
-// Computes the gradient of the crop_and_resize op wrt the input boxes tensor.
+// Computes the eigen decomposition of one or more square self-adjoint matrices.
+//
+// Computes the eigenvalues and (optionally) eigenvectors of each inner matrix in
+// `input` such that `input[..., :, :] = v[..., :, :] * diag(e[..., :])`.
+//
+// ```python
+// # a is a tensor.
+// # e is a tensor of eigenvalues.
+// # v is a tensor of eigenvectors.
+// e, v = self_adjoint_eig(a)
+// e = self_adjoint_eig(a, compute_v=False)
+// ```
 //
 // Arguments:
-//	grads: A 4-D tensor of shape `[num_boxes, crop_height, crop_width, depth]`.
-//	image: A 4-D tensor of shape `[batch, image_height, image_width, depth]`.
-// Both `image_height` and `image_width` need to be positive.
-//	boxes: A 2-D tensor of shape `[num_boxes, 4]`. The `i`-th row of the tensor
-// specifies the coordinates of a box in the `box_ind[i]` image and is specified
-// in normalized coordinates `[y1, x1, y2, x2]`. A normalized coordinate value of
-// `y` is mapped to the image coordinate at `y * (image_height - 1)`, so as the
-// `[0, 1]` interval of normalized image height is mapped to
-// `[0, image_height - 1] in image height coordinates. We do allow y1 > y2, in
-// which case the sampled crop is an up-down flipped version of the original
-// image. The width dimension is treated similarly. Normalized coordinates
-// outside the `[0, 1]` range are allowed, in which case we use
-// `extrapolation_value` to extrapolate the input image values.
-//	box_ind: A 1-D tensor of shape `[num_boxes]` with int32 values in `[0, batch)`.
-// The value of `box_ind[i]` specifies the image that the `i`-th box refers to.
+//	input: `Tensor` input of shape `[N, N]`.
 //
-// Returns A 2-D tensor of shape `[num_boxes, 4]`.
-func CropAndResizeGradBoxes(scope *Scope, grads tf.Output, image tf.Output, boxes tf.Output, box_ind tf.Output, optional ...CropAndResizeGradBoxesAttr) (output tf.Output) {
+// Returns Eigenvalues. Shape is `[N]`.Eigenvectors. Shape is `[N, N]`.
+func SelfAdjointEigV2(scope *Scope, input tf.Output, optional ...SelfAdjointEigV2Attr) (e tf.Output, v tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -21189,65 +20084,95 @@ func CropAndResizeGradBoxes(scope *Scope, grads tf.Output, image tf.Output, boxe
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "CropAndResizeGradBoxes",
+		Type: "SelfAdjointEigV2",
 		Input: []tf.Input{
-			grads, image, boxes, box_ind,
+			input,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// Saves tensors in V2 checkpoint format.
+// Adjust the saturation of one or more images.
 //
-// By default, saves the named tensors in full.  If the caller wishes to save
-// specific slices of full tensors, "shape_and_slices" should be non-empty strings
-// and correspondingly well-formed.
+// `images` is a tensor of at least 3 dimensions.  The last dimension is
+// interpretted as channels, and must be three.
+//
+// The input image is considered in the RGB colorspace. Conceptually, the RGB
+// colors are first mapped into HSV. A scale is then applied all the saturation
+// values, and then remapped back to RGB colorspace.
 //
 // Arguments:
-//	prefix: Must have a single element. The prefix of the V2 checkpoint to which we
-// write the tensors.
-//	tensor_names: shape {N}. The names of the tensors to be saved.
-//	shape_and_slices: shape {N}.  The slice specs of the tensors to be saved.
-// Empty strings indicate that they are non-partitioned tensors.
-//	tensors: `N` tensors to save.
+//	images: Images to adjust.  At least 3-D.
+//	scale: A float scale to add to the saturation.
 //
-// Returns the created operation.
-func SaveV2(scope *Scope, prefix tf.Output, tensor_names tf.Output, shape_and_slices tf.Output, tensors []tf.Output) (o *tf.Operation) {
+// Returns The hue-adjusted image or images.
+func AdjustSaturation(scope *Scope, images tf.Output, scale tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SaveV2",
+		Type: "AdjustSaturation",
 		Input: []tf.Input{
-			prefix, tensor_names, shape_and_slices, tf.OutputList(tensors),
+			images, scale,
 		},
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// StatsAggregatorHandleAttr is an optional argument to StatsAggregatorHandle.
-type StatsAggregatorHandleAttr func(optionalAttr)
+// SvdAttr is an optional argument to Svd.
+type SvdAttr func(optionalAttr)
 
-// StatsAggregatorHandleContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func StatsAggregatorHandleContainer(value string) StatsAggregatorHandleAttr {
+// SvdComputeUv sets the optional compute_uv attribute to value.
+//
+// value: If true, left and right singular vectors will be
+// computed and returned in `u` and `v`, respectively.
+// If false, `u` and `v` are not set and should never referenced.
+// If not specified, defaults to true
+func SvdComputeUv(value bool) SvdAttr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["compute_uv"] = value
 	}
 }
 
-// StatsAggregatorHandleSharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func StatsAggregatorHandleSharedName(value string) StatsAggregatorHandleAttr {
+// SvdFullMatrices sets the optional full_matrices attribute to value.
+//
+// value: If true, compute full-sized `u` and `v`. If false
+// (the default), compute only the leading `P` singular vectors.
+// Ignored if `compute_uv` is `False`.
+// If not specified, defaults to false
+func SvdFullMatrices(value bool) SvdAttr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["full_matrices"] = value
 	}
 }
 
-// Creates a statistics manager resource.
-func StatsAggregatorHandle(scope *Scope, optional ...StatsAggregatorHandleAttr) (handle tf.Output) {
+// Computes the singular value decompositions of one or more matrices.
+//
+// Computes the SVD of each inner matrix in `input` such that
+// `input[..., :, :] = u[..., :, :] * diag(s[..., :, :]) * transpose(v[..., :, :])`
+//
+// ```python
+// # a is a tensor containing a batch of matrices.
+// # s is a tensor of singular values for each matrix.
+// # u is the tensor containing of left singular vectors for each matrix.
+// # v is the tensor containing of right singular vectors for each matrix.
+// s, u, v = svd(a)
+// s, _, _ = svd(a, compute_uv=False)
+// ```
+//
+// Arguments:
+//	input: A tensor of shape `[..., M, N]` whose inner-most 2 dimensions
+// form matrices of size `[M, N]`. Let `P` be the minimum of `M` and `N`.
+//
+// Returns Singular values. Shape is `[..., P]`.Left singular vectors. If `full_matrices` is `False` then shape is
+// `[..., M, P]`; if `full_matrices` is `True` then shape is
+// `[..., M, M]`. Undefined if `compute_uv` is `False`.Left singular vectors. If `full_matrices` is `False` then shape is
+// `[..., N, P]`. If `full_matrices` is `True` then shape is `[..., N, N]`.
+// Undefined if `compute_uv` is false.
+func Svd(scope *Scope, input tf.Output, optional ...SvdAttr) (s tf.Output, u tf.Output, v tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -21256,386 +20181,522 @@ func StatsAggregatorHandle(scope *Scope, optional ...StatsAggregatorHandleAttr)
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "StatsAggregatorHandle",
-
+		Type: "Svd",
+		Input: []tf.Input{
+			input,
+		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Greedily selects a subset of bounding boxes in descending order of score,
+// QueueEnqueueManyV2Attr is an optional argument to QueueEnqueueManyV2.
+type QueueEnqueueManyV2Attr func(optionalAttr)
+
+// QueueEnqueueManyV2TimeoutMs sets the optional timeout_ms attribute to value.
 //
-// pruning away boxes that have high intersection-over-union (IOU) overlap
-// with previously selected boxes.  Bounding boxes are supplied as
-// [y1, x1, y2, x2], where (y1, x1) and (y2, x2) are the coordinates of any
-// diagonal pair of box corners and the coordinates can be provided as normalized
-// (i.e., lying in the interval [0, 1]) or absolute.  Note that this algorithm
-// is agnostic to where the origin is in the coordinate system.  Note that this
-// algorithm is invariant to orthogonal transformations and translations
-// of the coordinate system; thus translating or reflections of the coordinate
-// system result in the same boxes being selected by the algorithm.
+// value: If the queue is too full, this operation will block for up
+// to timeout_ms milliseconds.
+// Note: This option is not supported yet.
+// If not specified, defaults to -1
+func QueueEnqueueManyV2TimeoutMs(value int64) QueueEnqueueManyV2Attr {
+	return func(m optionalAttr) {
+		m["timeout_ms"] = value
+	}
+}
+
+// Enqueues zero or more tuples of one or more tensors in the given queue.
 //
-// The output of this operation is a set of integers indexing into the input
-// collection of bounding boxes representing the selected boxes.  The bounding
-// box coordinates corresponding to the selected indices can then be obtained
-// using the `tf.gather operation`.  For example:
+// This operation slices each component tensor along the 0th dimension to
+// make multiple queue elements. All of the tuple components must have the
+// same size in the 0th dimension.
 //
-//   selected_indices = tf.image.non_max_suppression_v2(
-//       boxes, scores, max_output_size, iou_threshold)
-//   selected_boxes = tf.gather(boxes, selected_indices)
+// The components input has k elements, which correspond to the components of
+// tuples stored in the given queue.
+//
+// N.B. If the queue is full, this operation will block until the given
+// elements have been enqueued (or 'timeout_ms' elapses, if specified).
 //
 // Arguments:
-//	boxes: A 2-D float tensor of shape `[num_boxes, 4]`.
-//	scores: A 1-D float tensor of shape `[num_boxes]` representing a single
-// score corresponding to each box (each row of boxes).
-//	max_output_size: A scalar integer tensor representing the maximum number of
-// boxes to be selected by non max suppression.
-//	iou_threshold: A 0-D float tensor representing the threshold for deciding whether
-// boxes overlap too much with respect to IOU.
+//	handle: The handle to a queue.
+//	components: One or more tensors from which the enqueued tensors should
+// be taken.
 //
-// Returns A 1-D integer tensor of shape `[M]` representing the selected
-// indices from the boxes tensor, where `M <= max_output_size`.
-func NonMaxSuppressionV2(scope *Scope, boxes tf.Output, scores tf.Output, max_output_size tf.Output, iou_threshold tf.Output) (selected_indices tf.Output) {
+// Returns the created operation.
+func QueueEnqueueManyV2(scope *Scope, handle tf.Output, components []tf.Output, optional ...QueueEnqueueManyV2Attr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	opspec := tf.OpSpec{
-		Type: "NonMaxSuppressionV2",
-		Input: []tf.Input{
-			boxes, scores, max_output_size, iou_threshold,
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "QueueEnqueueManyV2",
+		Input: []tf.Input{
+			handle, tf.OutputList(components),
 		},
+		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Reshapes a tensor.
-//
-// Given `tensor`, this operation returns a tensor that has the same values
-// as `tensor` with shape `shape`.
-//
-// If one component of `shape` is the special value -1, the size of that dimension
-// is computed so that the total size remains constant.  In particular, a `shape`
-// of `[-1]` flattens into 1-D.  At most one component of `shape` can be -1.
-//
-// If `shape` is 1-D or higher, then the operation returns a tensor with shape
-// `shape` filled with the values of `tensor`. In this case, the number of elements
-// implied by `shape` must be the same as the number of elements in `tensor`.
-//
-// For example:
-//
-// ```
-// # tensor 't' is [1, 2, 3, 4, 5, 6, 7, 8, 9]
-// # tensor 't' has shape [9]
-// reshape(t, [3, 3]) ==> [[1, 2, 3],
-//                         [4, 5, 6],
-//                         [7, 8, 9]]
-//
-// # tensor 't' is [[[1, 1], [2, 2]],
-// #                [[3, 3], [4, 4]]]
-// # tensor 't' has shape [2, 2, 2]
-// reshape(t, [2, 4]) ==> [[1, 1, 2, 2],
-//                         [3, 3, 4, 4]]
+// Computes the product along segments of a tensor.
 //
-// # tensor 't' is [[[1, 1, 1],
-// #                 [2, 2, 2]],
-// #                [[3, 3, 3],
-// #                 [4, 4, 4]],
-// #                [[5, 5, 5],
-// #                 [6, 6, 6]]]
-// # tensor 't' has shape [3, 2, 3]
-// # pass '[-1]' to flatten 't'
-// reshape(t, [-1]) ==> [1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6]
+// Read @{$math_ops#segmentation$the section on segmentation} for an explanation of
+// segments.
 //
-// # -1 can also be used to infer the shape
+// Computes a tensor such that
+// \\(output_i = \prod_j data_j\\) where the product is over `j` such
+// that `segment_ids[j] == i`.
 //
-// # -1 is inferred to be 9:
-// reshape(t, [2, -1]) ==> [[1, 1, 1, 2, 2, 2, 3, 3, 3],
-//                          [4, 4, 4, 5, 5, 5, 6, 6, 6]]
-// # -1 is inferred to be 2:
-// reshape(t, [-1, 9]) ==> [[1, 1, 1, 2, 2, 2, 3, 3, 3],
-//                          [4, 4, 4, 5, 5, 5, 6, 6, 6]]
-// # -1 is inferred to be 3:
-// reshape(t, [ 2, -1, 3]) ==> [[[1, 1, 1],
-//                               [2, 2, 2],
-//                               [3, 3, 3]],
-//                              [[4, 4, 4],
-//                               [5, 5, 5],
-//                               [6, 6, 6]]]
+// If the product is empty for a given segment ID `i`, `output[i] = 1`.
 //
-// # tensor 't' is [7]
-// # shape `[]` reshapes to a scalar
-// reshape(t, []) ==> 7
-// ```
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentProd.png" alt>
+// </div>
 //
 // Arguments:
 //
-//	shape: Defines the shape of the output tensor.
-func Reshape(scope *Scope, tensor tf.Output, shape tf.Output) (output tf.Output) {
+//	segment_ids: A 1-D tensor whose rank is equal to the rank of `data`'s
+// first dimension.  Values should be sorted and can be repeated.
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SegmentProd(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Reshape",
+		Type: "SegmentProd",
 		Input: []tf.Input{
-			tensor, shape,
+			data, segment_ids,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Creates a dataset that splits a SparseTensor into elements row-wise.
-func SparseTensorSliceDataset(scope *Scope, indices tf.Output, values tf.Output, dense_shape tf.Output) (handle tf.Output) {
+// Converts one or more images from RGB to HSV.
+//
+// Outputs a tensor of the same shape as the `images` tensor, containing the HSV
+// value of the pixels. The output is only well defined if the value in `images`
+// are in `[0,1]`.
+//
+// `output[..., 0]` contains hue, `output[..., 1]` contains saturation, and
+// `output[..., 2]` contains value. All HSV values are in `[0,1]`. A hue of 0
+// corresponds to pure red, hue 1/3 is pure green, and 2/3 is pure blue.
+//
+// Arguments:
+//	images: 1-D or higher rank. RGB data to convert. Last dimension must be size 3.
+//
+// Returns `images` converted to HSV.
+func RGBToHSV(scope *Scope, images tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseTensorSliceDataset",
+		Type: "RGBToHSV",
 		Input: []tf.Input{
-			indices, values, dense_shape,
+			images,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns x / y element-wise for real types.
-//
-// If `x` and `y` are reals, this will return the floating-point division.
+// Does nothing. Only useful as a placeholder for control edges.
 //
-// *NOTE*: `Div` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func RealDiv(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// Returns the created operation.
+func NoOp(scope *Scope) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "RealDiv",
-		Input: []tf.Input{
-			x, y,
-		},
+		Type: "NoOp",
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Creates a dataset that concatenates `input_dataset` with `another_dataset`.
-func ConcatenateDataset(scope *Scope, input_dataset tf.Output, another_dataset tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// MergeV2CheckpointsAttr is an optional argument to MergeV2Checkpoints.
+type MergeV2CheckpointsAttr func(optionalAttr)
+
+// MergeV2CheckpointsDeleteOldDirs sets the optional delete_old_dirs attribute to value.
+//
+// value: see above.
+// If not specified, defaults to true
+func MergeV2CheckpointsDeleteOldDirs(value bool) MergeV2CheckpointsAttr {
+	return func(m optionalAttr) {
+		m["delete_old_dirs"] = value
+	}
+}
+
+// V2 format specific: merges the metadata files of sharded checkpoints.  The
+//
+// result is one logical checkpoint, with one physical metadata file and renamed
+// data files.
+//
+// Intended for "grouping" multiple checkpoints in a sharded checkpoint setup.
+//
+// If delete_old_dirs is true, attempts to delete recursively the dirname of each
+// path in the input checkpoint_prefixes.  This is useful when those paths are non
+// user-facing temporary locations.
+//
+// Arguments:
+//	checkpoint_prefixes: prefixes of V2 checkpoints to merge.
+//	destination_prefix: scalar.  The desired final prefix.  Allowed to be the same
+// as one of the checkpoint_prefixes.
+//
+// Returns the created operation.
+func MergeV2Checkpoints(scope *Scope, checkpoint_prefixes tf.Output, destination_prefix tf.Output, optional ...MergeV2CheckpointsAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "ConcatenateDataset",
+		Type: "MergeV2Checkpoints",
 		Input: []tf.Input{
-			input_dataset, another_dataset,
+			checkpoint_prefixes, destination_prefix,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Adds a value to the current value of a variable.
+// Saves input tensors slices to disk.
 //
-// Any ReadVariableOp which depends directly or indirectly on this assign is
-// guaranteed to see the incremented value or a subsequent newer one.
+// This is like `Save` except that tensors can be listed in the saved file as being
+// a slice of a larger tensor.  `shapes_and_slices` specifies the shape of the
+// larger tensor and the slice that this tensor covers. `shapes_and_slices` must
+// have as many elements as `tensor_names`.
 //
-// Outputs the incremented value, which can be used to totally order the
-// increments to this variable.
+// Elements of the `shapes_and_slices` input must either be:
+//
+// *  The empty string, in which case the corresponding tensor is
+//    saved normally.
+// *  A string of the form `dim0 dim1 ... dimN-1 slice-spec` where the
+//    `dimI` are the dimensions of the larger tensor and `slice-spec`
+//    specifies what part is covered by the tensor to save.
+//
+// `slice-spec` itself is a `:`-separated list: `slice0:slice1:...:sliceN-1`
+// where each `sliceI` is either:
+//
+// *  The string `-` meaning that the slice covers all indices of this dimension
+// *  `start,length` where `start` and `length` are integers.  In that
+//    case the slice covers `length` indices starting at `start`.
+//
+// See also `Save`.
 //
 // Arguments:
-//	resource: handle to the resource in which to store the variable.
-//	value: the value by which the variable will be incremented.
+//	filename: Must have a single element. The name of the file to which we write the
+// tensor.
+//	tensor_names: Shape `[N]`. The names of the tensors to be saved.
+//	shapes_and_slices: Shape `[N]`.  The shapes and slice specifications to use when
+// saving the tensors.
+//	data: `N` tensors to save.
 //
 // Returns the created operation.
-func AssignAddVariableOp(scope *Scope, resource tf.Output, value tf.Output) (o *tf.Operation) {
+func SaveSlices(scope *Scope, filename tf.Output, tensor_names tf.Output, shapes_and_slices tf.Output, data []tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "AssignAddVariableOp",
+		Type: "SaveSlices",
 		Input: []tf.Input{
-			resource, value,
+			filename, tensor_names, shapes_and_slices, tf.OutputList(data),
 		},
 	}
 	return scope.AddOperation(opspec)
 }
 
-// Records the latency of producing `input_dataset` elements in a StatsAggregator.
-func LatencyStatsDataset(scope *Scope, input_dataset tf.Output, tag tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// DenseToDenseSetOperationAttr is an optional argument to DenseToDenseSetOperation.
+type DenseToDenseSetOperationAttr func(optionalAttr)
+
+// DenseToDenseSetOperationValidateIndices sets the optional validate_indices attribute to value.
+// If not specified, defaults to true
+func DenseToDenseSetOperationValidateIndices(value bool) DenseToDenseSetOperationAttr {
+	return func(m optionalAttr) {
+		m["validate_indices"] = value
+	}
+}
+
+// Applies set operation along last dimension of 2 `Tensor` inputs.
+//
+// See SetOperationOp::SetOperationFromContext for values of `set_operation`.
+//
+// Output `result` is a `SparseTensor` represented by `result_indices`,
+// `result_values`, and `result_shape`. For `set1` and `set2` ranked `n`, this
+// has rank `n` and the same 1st `n-1` dimensions as `set1` and `set2`. The `nth`
+// dimension contains the result of `set_operation` applied to the corresponding
+// `[0...n-1]` dimension of `set`.
+//
+// Arguments:
+//	set1: `Tensor` with rank `n`. 1st `n-1` dimensions must be the same as `set2`.
+// Dimension `n` contains values in a set, duplicates are allowed but ignored.
+//	set2: `Tensor` with rank `n`. 1st `n-1` dimensions must be the same as `set1`.
+// Dimension `n` contains values in a set, duplicates are allowed but ignored.
+//
+//
+// Returns 2D indices of a `SparseTensor`.1D values of a `SparseTensor`.1D `Tensor` shape of a `SparseTensor`. `result_shape[0...n-1]` is
+// the same as the 1st `n-1` dimensions of `set1` and `set2`, `result_shape[n]`
+// is the max result set size across all `0...n-1` dimensions.
+func DenseToDenseSetOperation(scope *Scope, set1 tf.Output, set2 tf.Output, set_operation string, optional ...DenseToDenseSetOperationAttr) (result_indices tf.Output, result_values tf.Output, result_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	attrs := map[string]interface{}{"set_operation": set_operation}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "LatencyStatsDataset",
+		Type: "DenseToDenseSetOperation",
 		Input: []tf.Input{
-			input_dataset, tag,
+			set1, set2,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Convert JSON-encoded Example records to binary protocol buffer strings.
-//
-// This op translates a tensor containing Example records, encoded using
-// the [standard JSON
-// mapping](https://developers.google.com/protocol-buffers/docs/proto3#json),
-// into a tensor containing the same records encoded as binary protocol
-// buffers. The resulting tensor can then be fed to any of the other
-// Example-parsing ops.
-//
-// Arguments:
-//	json_examples: Each string is a JSON object serialized according to the JSON
-// mapping of the Example proto.
+// Generate a sharded filename. The filename is printf formatted as
 //
-// Returns Each string is a binary Example protocol buffer corresponding
-// to the respective element of `json_examples`.
-func DecodeJSONExample(scope *Scope, json_examples tf.Output) (binary_examples tf.Output) {
+//    %s-%05d-of-%05d, basename, shard, num_shards.
+func ShardedFilename(scope *Scope, basename tf.Output, shard tf.Output, num_shards tf.Output) (filename tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "DecodeJSONExample",
+		Type: "ShardedFilename",
 		Input: []tf.Input{
-			json_examples,
+			basename, shard, num_shards,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the grayscale dilation of 4-D `input` and 3-D `filter` tensors.
+// BatchToSpace for N-D tensors of type T.
 //
-// The `input` tensor has shape `[batch, in_height, in_width, depth]` and the
-// `filter` tensor has shape `[filter_height, filter_width, depth]`, i.e., each
-// input channel is processed independently of the others with its own structuring
-// function. The `output` tensor has shape
-// `[batch, out_height, out_width, depth]`. The spatial dimensions of the output
-// tensor depend on the `padding` algorithm. We currently only support the default
-// "NHWC" `data_format`.
+// This operation reshapes the "batch" dimension 0 into `M + 1` dimensions of shape
+// `block_shape + [batch]`, interleaves these blocks back into the grid defined by
+// the spatial dimensions `[1, ..., M]`, to obtain a result with the same rank as
+// the input.  The spatial dimensions of this intermediate result are then
+// optionally cropped according to `crops` to produce the output.  This is the
+// reverse of SpaceToBatch.  See below for a precise description.
 //
-// In detail, the grayscale morphological 2-D dilation is the max-sum correlation
-// (for consistency with `conv2d`, we use unmirrored filters):
+// Arguments:
+//	input: N-D with shape `input_shape = [batch] + spatial_shape + remaining_shape`,
+// where spatial_shape has M dimensions.
+//	block_shape: 1-D with shape `[M]`, all values must be >= 1.
+//	crops: 2-D with shape `[M, 2]`, all values must be >= 0.
+//   `crops[i] = [crop_start, crop_end]` specifies the amount to crop from input
+//   dimension `i + 1`, which corresponds to spatial dimension `i`.  It is
+//   required that
+//   `crop_start[i] + crop_end[i] <= block_shape[i] * input_shape[i + 1]`.
 //
-//     output[b, y, x, c] =
-//        max_{dy, dx} input[b,
-//                           strides[1] * y + rates[1] * dy,
-//                           strides[2] * x + rates[2] * dx,
-//                           c] +
-//                     filter[dy, dx, c]
+// This operation is equivalent to the following steps:
 //
-// Max-pooling is a special case when the filter has size equal to the pooling
-// kernel size and contains all zeros.
+// 1. Reshape `input` to `reshaped` of shape:
+//      [block_shape[0], ..., block_shape[M-1],
+//       batch / prod(block_shape),
+//       input_shape[1], ..., input_shape[N-1]]
 //
-// Note on duality: The dilation of `input` by the `filter` is equal to the
-// negation of the erosion of `-input` by the reflected `filter`.
+// 2. Permute dimensions of `reshaped` to produce `permuted` of shape
+//      [batch / prod(block_shape),
 //
-// Arguments:
-//	input: 4-D with shape `[batch, in_height, in_width, depth]`.
-//	filter: 3-D with shape `[filter_height, filter_width, depth]`.
-//	strides: The stride of the sliding window for each dimension of the input
-// tensor. Must be: `[1, stride_height, stride_width, 1]`.
-//	rates: The input stride for atrous morphological dilation. Must be:
-// `[1, rate_height, rate_width, 1]`.
-//	padding: The type of padding algorithm to use.
+//       input_shape[1], block_shape[0],
+//       ...,
+//       input_shape[M], block_shape[M-1],
 //
-// Returns 4-D with shape `[batch, out_height, out_width, depth]`.
-func Dilation2D(scope *Scope, input tf.Output, filter tf.Output, strides []int64, rates []int64, padding string) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"strides": strides, "rates": rates, "padding": padding}
-	opspec := tf.OpSpec{
-		Type: "Dilation2D",
-		Input: []tf.Input{
-			input, filter,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Converts the given variant tensor to an iterator and stores it in the given resource.
+//       input_shape[M+1], ..., input_shape[N-1]]
 //
-// Arguments:
-//	resource_handle: A handle to an iterator resource.
-//	serialized: A variant tensor storing the state of the iterator contained in the
-// resource.
+// 3. Reshape `permuted` to produce `reshaped_permuted` of shape
+//      [batch / prod(block_shape),
 //
-// Returns the created operation.
-func DeserializeIterator(scope *Scope, resource_handle tf.Output, serialized tf.Output) (o *tf.Operation) {
+//       input_shape[1] * block_shape[0],
+//       ...,
+//       input_shape[M] * block_shape[M-1],
+//
+//       input_shape[M+1],
+//       ...,
+//       input_shape[N-1]]
+//
+// 4. Crop the start and end of dimensions `[1, ..., M]` of
+//    `reshaped_permuted` according to `crops` to produce the output of shape:
+//      [batch / prod(block_shape),
+//
+//       input_shape[1] * block_shape[0] - crops[0,0] - crops[0,1],
+//       ...,
+//       input_shape[M] * block_shape[M-1] - crops[M-1,0] - crops[M-1,1],
+//
+//       input_shape[M+1], ..., input_shape[N-1]]
+//
+// Some examples:
+//
+// (1) For the following input of shape `[4, 1, 1, 1]`, `block_shape = [2, 2]`, and
+//     `crops = [[0, 0], [0, 0]]`:
+//
+// ```
+// [[[[1]]], [[[2]]], [[[3]]], [[[4]]]]
+// ```
+//
+// The output tensor has shape `[1, 2, 2, 1]` and value:
+//
+// ```
+// x = [[[[1], [2]], [[3], [4]]]]
+// ```
+//
+// (2) For the following input of shape `[4, 1, 1, 3]`, `block_shape = [2, 2]`, and
+//     `crops = [[0, 0], [0, 0]]`:
+//
+// ```
+// [[[1, 2, 3]], [[4, 5, 6]], [[7, 8, 9]], [[10, 11, 12]]]
+// ```
+//
+// The output tensor has shape `[1, 2, 2, 3]` and value:
+//
+// ```
+// x = [[[[1, 2, 3], [4, 5, 6]],
+//       [[7, 8, 9], [10, 11, 12]]]]
+// ```
+//
+// (3) For the following input of shape `[4, 2, 2, 1]`, `block_shape = [2, 2]`, and
+//     `crops = [[0, 0], [0, 0]]`:
+//
+// ```
+// x = [[[[1], [3]], [[9], [11]]],
+//      [[[2], [4]], [[10], [12]]],
+//      [[[5], [7]], [[13], [15]]],
+//      [[[6], [8]], [[14], [16]]]]
+// ```
+//
+// The output tensor has shape `[1, 4, 4, 1]` and value:
+//
+// ```
+// x = [[[1],   [2],  [3],  [4]],
+//      [[5],   [6],  [7],  [8]],
+//      [[9],  [10], [11],  [12]],
+//      [[13], [14], [15],  [16]]]
+// ```
+//
+// (4) For the following input of shape `[8, 1, 3, 1]`, `block_shape = [2, 2]`, and
+//     `crops = [[0, 0], [2, 0]]`:
+//
+// ```
+// x = [[[[0], [1], [3]]], [[[0], [9], [11]]],
+//      [[[0], [2], [4]]], [[[0], [10], [12]]],
+//      [[[0], [5], [7]]], [[[0], [13], [15]]],
+//      [[[0], [6], [8]]], [[[0], [14], [16]]]]
+// ```
+//
+// The output tensor has shape `[2, 2, 4, 1]` and value:
+//
+// ```
+// x = [[[[1],   [2],  [3],  [4]],
+//       [[5],   [6],  [7],  [8]]],
+//      [[[9],  [10], [11],  [12]],
+//       [[13], [14], [15],  [16]]]]
+// ```
+func BatchToSpaceND(scope *Scope, input tf.Output, block_shape tf.Output, crops tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "DeserializeIterator",
+		Type: "BatchToSpaceND",
 		Input: []tf.Input{
-			resource_handle, serialized,
+			input, block_shape, crops,
 		},
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// TensorArrayConcatV2Attr is an optional argument to TensorArrayConcatV2.
-type TensorArrayConcatV2Attr func(optionalAttr)
+// UnpackAttr is an optional argument to Unpack.
+type UnpackAttr func(optionalAttr)
 
-// TensorArrayConcatV2ElementShapeExcept0 sets the optional element_shape_except0 attribute to value.
-// If not specified, defaults to <unknown_rank:true >
-func TensorArrayConcatV2ElementShapeExcept0(value tf.Shape) TensorArrayConcatV2Attr {
+// UnpackAxis sets the optional axis attribute to value.
+//
+// value: Dimension along which to unpack.  Negative values wrap around, so the
+// valid range is `[-R, R)`.
+// If not specified, defaults to 0
+func UnpackAxis(value int64) UnpackAttr {
 	return func(m optionalAttr) {
-		m["element_shape_except0"] = value
+		m["axis"] = value
 	}
 }
 
-// Deprecated. Use TensorArrayConcatV3
-func TensorArrayConcatV2(scope *Scope, handle tf.Output, flow_in tf.Output, dtype tf.DataType, optional ...TensorArrayConcatV2Attr) (value tf.Output, lengths tf.Output) {
+// Unpacks a given dimension of a rank-`R` tensor into `num` rank-`(R-1)` tensors.
+//
+// Unpacks `num` tensors from `value` by chipping it along the `axis` dimension.
+// For example, given a tensor of shape `(A, B, C, D)`;
+//
+// If `axis == 0` then the i'th tensor in `output` is the slice `value[i, :, :, :]`
+//   and each tensor in `output` will have shape `(B, C, D)`. (Note that the
+//   dimension unpacked along is gone, unlike `split`).
+//
+// If `axis == 1` then the i'th tensor in `output` is the slice `value[:, i, :, :]`
+//   and each tensor in `output` will have shape `(A, C, D)`.
+// Etc.
+//
+// This is the opposite of `pack`.
+//
+// Arguments:
+//	value: 1-D or higher, with `axis` dimension size equal to `num`.
+//
+//
+// Returns The list of tensors unpacked from `value`.
+func Unpack(scope *Scope, value tf.Output, num int64, optional ...UnpackAttr) (output []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
+	attrs := map[string]interface{}{"num": num}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayConcatV2",
+		Type: "Unpack",
 		Input: []tf.Input{
-			handle, flow_in,
+			value,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
+		scope.UpdateErr("Unpack", err)
+		return
+	}
+	return output
 }
 
-// Creates a dataset that batches and pads `batch_size` elements from the input.
+// Increments variable pointed to by 'resource' until it reaches 'limit'.
 //
 // Arguments:
+//	resource: Should be from a scalar `Variable` node.
+//	limit: If incrementing ref would bring it above limit, instead generates an
+// 'OutOfRange' error.
 //
-//	batch_size: A scalar representing the number of elements to accumulate in a
-// batch.
-//	padded_shapes: A list of int64 tensors representing the desired padded shapes
-// of the corresponding output components. These shapes may be partially
-// specified, using `-1` to indicate that a particular dimension should be
-// padded to the maximum size of all batch elements.
-//	padding_values: A list of scalars containing the padding value to use for
-// each of the outputs.
 //
-func PaddedBatchDataset(scope *Scope, input_dataset tf.Output, batch_size tf.Output, padded_shapes []tf.Output, padding_values []tf.Output, output_shapes []tf.Shape) (handle tf.Output) {
+// Returns A copy of the input before increment. If nothing else modifies the
+// input, the values produced will all be distinct.
+func ResourceCountUpTo(scope *Scope, resource tf.Output, limit int64, T tf.DataType) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_shapes": output_shapes}
+	attrs := map[string]interface{}{"limit": limit, "T": T}
 	opspec := tf.OpSpec{
-		Type: "PaddedBatchDataset",
+		Type: "ResourceCountUpTo",
 		Input: []tf.Input{
-			input_dataset, batch_size, tf.OutputList(padded_shapes), tf.OutputList(padding_values),
+			resource,
 		},
 		Attrs: attrs,
 	}
@@ -21643,80 +20704,79 @@ func PaddedBatchDataset(scope *Scope, input_dataset tf.Output, batch_size tf.Out
 	return op.Output(0)
 }
 
-// Creates a dataset that batches input elements into a SparseTensor.
+// Delete the stack from its resource container.
 //
 // Arguments:
-//	input_dataset: A handle to an input dataset. Must have a single component.
-//	batch_size: A scalar representing the number of elements to accumulate in a
-// batch.
-//	row_shape: A vector representing the dense shape of each row in the produced
-// SparseTensor. The shape may be partially specified, using `-1` to indicate
-// that a particular dimension should use the maximum size of all batch elements.
-//
+//	handle: The handle to a stack.
 //
-func DenseToSparseBatchDataset(scope *Scope, input_dataset tf.Output, batch_size tf.Output, row_shape tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// Returns the created operation.
+func StackCloseV2(scope *Scope, handle tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "DenseToSparseBatchDataset",
+		Type: "StackCloseV2",
 		Input: []tf.Input{
-			input_dataset, batch_size, row_shape,
+			handle,
 		},
-		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Deprecated. Use TensorArrayGradV3
-//
-// DEPRECATED at GraphDef version 26: Use TensorArrayGradV3
-func TensorArrayGradV2(scope *Scope, handle tf.Output, flow_in tf.Output, source string) (grad_handle tf.Output) {
+// Generate a glob pattern matching all sharded file names.
+func ShardedFilespec(scope *Scope, basename tf.Output, num_shards tf.Output) (filename tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"source": source}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayGradV2",
+		Type: "ShardedFilespec",
 		Input: []tf.Input{
-			handle, flow_in,
+			basename, num_shards,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResourceSparseApplyAdadeltaAttr is an optional argument to ResourceSparseApplyAdadelta.
-type ResourceSparseApplyAdadeltaAttr func(optionalAttr)
+// TextLineReaderV2Attr is an optional argument to TextLineReaderV2.
+type TextLineReaderV2Attr func(optionalAttr)
 
-// ResourceSparseApplyAdadeltaUseLocking sets the optional use_locking attribute to value.
+// TextLineReaderV2SkipHeaderLines sets the optional skip_header_lines attribute to value.
 //
-// value: If True, updating of the var and accum tensors will be protected by
-// a lock; otherwise the behavior is undefined, but may exhibit less contention.
-// If not specified, defaults to false
-func ResourceSparseApplyAdadeltaUseLocking(value bool) ResourceSparseApplyAdadeltaAttr {
+// value: Number of lines to skip from the beginning of every file.
+// If not specified, defaults to 0
+func TextLineReaderV2SkipHeaderLines(value int64) TextLineReaderV2Attr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["skip_header_lines"] = value
 	}
 }
 
-// var: Should be from a Variable().
+// TextLineReaderV2Container sets the optional container attribute to value.
 //
-// Arguments:
+// value: If non-empty, this reader is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func TextLineReaderV2Container(value string) TextLineReaderV2Attr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// TextLineReaderV2SharedName sets the optional shared_name attribute to value.
 //
-//	accum: Should be from a Variable().
-//	accum_update: : Should be from a Variable().
-//	lr: Learning rate. Must be a scalar.
-//	rho: Decay factor. Must be a scalar.
-//	epsilon: Constant factor. Must be a scalar.
-//	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var and accum.
+// value: If non-empty, this reader is named in the given bucket
+// with this shared_name. Otherwise, the node name is used instead.
+// If not specified, defaults to ""
+func TextLineReaderV2SharedName(value string) TextLineReaderV2Attr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// A Reader that outputs the lines of a file delimited by '\n'.
 //
-// Returns the created operation.
-func ResourceSparseApplyAdadelta(scope *Scope, var_ tf.Output, accum tf.Output, accum_update tf.Output, lr tf.Output, rho tf.Output, epsilon tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyAdadeltaAttr) (o *tf.Operation) {
+// Returns The handle to reference the Reader.
+func TextLineReaderV2(scope *Scope, optional ...TextLineReaderV2Attr) (reader_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -21725,181 +20785,192 @@ func ResourceSparseApplyAdadelta(scope *Scope, var_ tf.Output, accum tf.Output,
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyAdadelta",
-		Input: []tf.Input{
-			var_, accum, accum_update, lr, rho, epsilon, grad, indices,
-		},
+		Type: "TextLineReaderV2",
+
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Identity op for gradient debugging.
+// LoadAndRemapMatrixAttr is an optional argument to LoadAndRemapMatrix.
+type LoadAndRemapMatrixAttr func(optionalAttr)
+
+// LoadAndRemapMatrixMaxRowsInMemory sets the optional max_rows_in_memory attribute to value.
 //
-// This op is hidden from public in Python. It is used by TensorFlow Debugger to
-// register gradient tensors for gradient debugging.
-// This op operates on non-reference-type tensors.
-func DebugGradientIdentity(scope *Scope, input tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "DebugGradientIdentity",
-		Input: []tf.Input{
-			input,
-		},
+// value: The maximum number of rows to load from the checkpoint at
+// once. If less than or equal to 0, the entire matrix will be loaded into
+// memory. Setting this arg trades increased disk reads for lower memory usage.
+// If not specified, defaults to -1
+func LoadAndRemapMatrixMaxRowsInMemory(value int64) LoadAndRemapMatrixAttr {
+	return func(m optionalAttr) {
+		m["max_rows_in_memory"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Return substrings from `Tensor` of strings.
-//
-// For each string in the input `Tensor`, creates a substring starting at index
-// `pos` with a total length of `len`.
-//
-// If `len` defines a substring that would extend beyond the length of the input
-// string, then as many characters as possible are used.
-//
-// If `pos` is negative or specifies a character index larger than any of the input
-// strings, then an `InvalidArgumentError` is thrown.
-//
-// `pos` and `len` must have the same shape, otherwise a `ValueError` is thrown on
-// Op creation.
-//
-// *NOTE*: `Substr` supports broadcasting up to two dimensions. More about
-// broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-//
-// ---
-//
-// Examples
-//
-// Using scalar `pos` and `len`:
-//
-// ```python
-// input = [b'Hello', b'World']
-// position = 1
-// length = 3
-//
-// output = [b'ell', b'orl']
-// ```
-//
-// Using `pos` and `len` with same shape as `input`:
+// Loads a 2-D (matrix) `Tensor` with name `old_tensor_name` from the checkpoint
 //
-// ```python
-// input = [[b'ten', b'eleven', b'twelve'],
-//          [b'thirteen', b'fourteen', b'fifteen'],
-//          [b'sixteen', b'seventeen', b'eighteen']]
-// position = [[1, 2, 3],
-//             [1, 2, 3],
-//             [1, 2, 3]]
-// length =   [[2, 3, 4],
-//             [4, 3, 2],
-//             [5, 5, 5]]
+// at `ckpt_path` and potentially reorders its rows and columns using the
+// specified remappings.
 //
-// output = [[b'en', b'eve', b'lve'],
-//           [b'hirt', b'urt', b'te'],
-//           [b'ixtee', b'vente', b'hteen']]
-// ```
+// Most users should use one of the wrapper initializers (such as
+// `tf.contrib.framework.load_and_remap_matrix_initializer`) instead of this
+// function directly.
 //
-// Broadcasting `pos` and `len` onto `input`:
+// The remappings are 1-D tensors with the following properties:
 //
-// ```
-// input = [[b'ten', b'eleven', b'twelve'],
-//          [b'thirteen', b'fourteen', b'fifteen'],
-//          [b'sixteen', b'seventeen', b'eighteen'],
-//          [b'nineteen', b'twenty', b'twentyone']]
-// position = [1, 2, 3]
-// length =   [1, 2, 3]
+// * `row_remapping` must have exactly `num_rows` entries. Row `i` of the output
+//   matrix will be initialized from the row corresponding to index
+//   `row_remapping[i]` in the old `Tensor` from the checkpoint.
+// * `col_remapping` must have either 0 entries (indicating that no column
+//   reordering is needed) or `num_cols` entries. If specified, column `j` of the
+//   output matrix will be initialized from the column corresponding to index
+//   `col_remapping[j]` in the old `Tensor` from the checkpoint.
+// * A value of -1 in either of the remappings signifies a "missing" entry. In that
+//   case, values from the `initializing_values` tensor will be used to fill that
+//   missing row or column. If `row_remapping` has `r` missing entries and
+//   `col_remapping` has `c` missing entries, then the following condition must be
+//   true:
 //
-// output = [[b'e', b'ev', b'lve'],
-//           [b'h', b'ur', b'tee'],
-//           [b'i', b've', b'hte'],
-//           [b'i', b'en', b'nty']]
-// ```
+// `(r * num_cols) + (c * num_rows) - (r * c) == len(initializing_values)`
 //
-// Broadcasting `input` onto `pos` and `len`:
+// The remapping tensors can be generated using the GenerateVocabRemapping op.
 //
-// ```
-// input = b'thirteen'
-// position = [1, 5, 7]
-// length =   [3, 2, 1]
+// As an example, with row_remapping = [1, 0, -1], col_remapping = [0, 2, -1],
+// initializing_values = [0.5, -0.5, 0.25, -0.25, 42], and w(i, j) representing
+// the value from row i, column j of the old tensor in the checkpoint, the output
+// matrix will look like the following:
 //
-// output = [b'hir', b'ee', b'n']
-// ```
+// [[w(1, 0),  w(1, 2),  0.5],
+//  [w(0, 0),  w(0, 2), -0.5],
+//  [0.25,    -0.25,      42]]
 //
 // Arguments:
-//	input: Tensor of strings
-//	pos: Scalar defining the position of first character in each substring
-//	len: Scalar defining the number of characters to include in each substring
+//	ckpt_path: Path to the TensorFlow checkpoint (version 2, `TensorBundle`) from
+// which the old matrix `Tensor` will be loaded.
+//	old_tensor_name: Name of the 2-D `Tensor` to load from checkpoint.
+//	row_remapping: An int `Tensor` of row remappings (generally created by
+// `generate_vocab_remapping`).  Even if no row remapping is needed, this must
+// still be an index-valued Tensor (e.g. [0, 1, 2, ...]), or a shifted
+// index-valued `Tensor` (e.g. [8, 9, 10, ...], for partitioned `Variables`).
+//	col_remapping: An int `Tensor` of column remappings (generally created by
+// `generate_vocab_remapping`).  May be a size-0 `Tensor` if only row remapping
+// is to be done (e.g. column ordering is the same).
+//	initializing_values: A float `Tensor` containing  values to fill in for cells
+// in the output matrix that are not loaded from the checkpoint. Length must be
+// exactly the same as the number of missing / new cells.
+//	num_rows: Number of rows (length of the 1st dimension) in the output matrix.
+//	num_cols: Number of columns (length of the 2nd dimension) in the output matrix.
 //
-// Returns Tensor of substrings
-func Substr(scope *Scope, input tf.Output, pos tf.Output, len tf.Output) (output tf.Output) {
+// Returns Output matrix containing existing values loaded from the
+// checkpoint, and with any missing values filled in from initializing_values.
+func LoadAndRemapMatrix(scope *Scope, ckpt_path tf.Output, old_tensor_name tf.Output, row_remapping tf.Output, col_remapping tf.Output, initializing_values tf.Output, num_rows int64, num_cols int64, optional ...LoadAndRemapMatrixAttr) (output_matrix tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	opspec := tf.OpSpec{
-		Type: "Substr",
+	attrs := map[string]interface{}{"num_rows": num_rows, "num_cols": num_cols}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "LoadAndRemapMatrix",
 		Input: []tf.Input{
-			input, pos, len,
+			ckpt_path, old_tensor_name, row_remapping, col_remapping, initializing_values,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Creates a Dataset that returns pseudorandom numbers.
+// TFRecordReaderV2Attr is an optional argument to TFRecordReaderV2.
+type TFRecordReaderV2Attr func(optionalAttr)
+
+// TFRecordReaderV2Container sets the optional container attribute to value.
 //
-// Arguments:
-//	seed: A scalar seed for the random number generator. If either seed or
-// seed2 is set to be non-zero, the random number generator is seeded
-// by the given seed.  Otherwise, a random seed is used.
-//	seed2: A second scalar seed to avoid seed collision.
+// value: If non-empty, this reader is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func TFRecordReaderV2Container(value string) TFRecordReaderV2Attr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// TFRecordReaderV2SharedName sets the optional shared_name attribute to value.
 //
+// value: If non-empty, this reader is named in the given bucket
+// with this shared_name. Otherwise, the node name is used instead.
+// If not specified, defaults to ""
+func TFRecordReaderV2SharedName(value string) TFRecordReaderV2Attr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// TFRecordReaderV2CompressionType sets the optional compression_type attribute to value.
+// If not specified, defaults to ""
+func TFRecordReaderV2CompressionType(value string) TFRecordReaderV2Attr {
+	return func(m optionalAttr) {
+		m["compression_type"] = value
+	}
+}
+
+// A Reader that outputs the records from a TensorFlow Records file.
 //
-func RandomDataset(scope *Scope, seed tf.Output, seed2 tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// Returns The handle to reference the Reader.
+func TFRecordReaderV2(scope *Scope, optional ...TFRecordReaderV2Attr) (reader_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "RandomDataset",
-		Input: []tf.Input{
-			seed, seed2,
-		},
+		Type: "TFRecordReaderV2",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Creates a dataset that shuffles and repeats elements from `input_dataset`
-//
-// pseudorandomly.
-//
-// Arguments:
-//
-//	buffer_size: The number of output elements to buffer in an iterator over
-// this dataset. Compare with the `min_after_dequeue` attr when creating a
-// `RandomShuffleQueue`.
-//	seed: A scalar seed for the random number generator. If either `seed` or
-// `seed2` is set to be non-zero, the random number generator is seeded
-// by the given seed.  Otherwise, a random seed is used.
-//	seed2: A second scalar seed to avoid seed collision.
-//	count: A scalar representing the number of times the underlying dataset
-// should be repeated. The default is `-1`, which results in infinite repetition.
-//
+// QuantizeAndDequantizeV3Attr is an optional argument to QuantizeAndDequantizeV3.
+type QuantizeAndDequantizeV3Attr func(optionalAttr)
+
+// QuantizeAndDequantizeV3SignedInput sets the optional signed_input attribute to value.
+// If not specified, defaults to true
+func QuantizeAndDequantizeV3SignedInput(value bool) QuantizeAndDequantizeV3Attr {
+	return func(m optionalAttr) {
+		m["signed_input"] = value
+	}
+}
+
+// QuantizeAndDequantizeV3RangeGiven sets the optional range_given attribute to value.
+// If not specified, defaults to true
+func QuantizeAndDequantizeV3RangeGiven(value bool) QuantizeAndDequantizeV3Attr {
+	return func(m optionalAttr) {
+		m["range_given"] = value
+	}
+}
+
+// Quantizes then dequantizes a tensor.
 //
-func ShuffleAndRepeatDataset(scope *Scope, input_dataset tf.Output, buffer_size tf.Output, seed tf.Output, seed2 tf.Output, count tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// This is almost identical to QuantizeAndDequantizeV2, except that num_bits is a
+// tensor, so its value can change during training.
+func QuantizeAndDequantizeV3(scope *Scope, input tf.Output, input_min tf.Output, input_max tf.Output, num_bits tf.Output, optional ...QuantizeAndDequantizeV3Attr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "ShuffleAndRepeatDataset",
+		Type: "QuantizeAndDequantizeV3",
 		Input: []tf.Input{
-			input_dataset, buffer_size, seed, seed2, count,
+			input, input_min, input_max, num_bits,
 		},
 		Attrs: attrs,
 	}
@@ -21907,416 +20978,361 @@ func ShuffleAndRepeatDataset(scope *Scope, input_dataset tf.Output, buffer_size
 	return op.Output(0)
 }
 
-// Creates a dataset that caches elements from `input_dataset`.
-//
-// A CacheDataset will iterate over the input_dataset, and store tensors. If the
-// cache already exists, the cache will be used. If the cache is inappropriate
-// (e.g. cannot be opened, contains tensors of the wrong shape / size), an error
-// will the returned when used.
+// IdentityReaderV2Attr is an optional argument to IdentityReaderV2.
+type IdentityReaderV2Attr func(optionalAttr)
+
+// IdentityReaderV2Container sets the optional container attribute to value.
 //
-// Arguments:
+// value: If non-empty, this reader is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func IdentityReaderV2Container(value string) IdentityReaderV2Attr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// IdentityReaderV2SharedName sets the optional shared_name attribute to value.
 //
-//	filename: A path on the filesystem where we should cache the dataset. Note: this
-// will be a directory.
+// value: If non-empty, this reader is named in the given bucket
+// with this shared_name. Otherwise, the node name is used instead.
+// If not specified, defaults to ""
+func IdentityReaderV2SharedName(value string) IdentityReaderV2Attr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// A Reader that outputs the queued work as both the key and value.
 //
+// To use, enqueue strings in a Queue.  ReaderRead will take the front
+// work string and output (work, work).
 //
-func CacheDataset(scope *Scope, input_dataset tf.Output, filename tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// Returns The handle to reference the Reader.
+func IdentityReaderV2(scope *Scope, optional ...IdentityReaderV2Attr) (reader_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "CacheDataset",
-		Input: []tf.Input{
-			input_dataset, filename,
-		},
+		Type: "IdentityReaderV2",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Creates a dataset that executes a SQL query and emits rows of the result set.
+// ResourceApplyGradientDescentAttr is an optional argument to ResourceApplyGradientDescent.
+type ResourceApplyGradientDescentAttr func(optionalAttr)
+
+// ResourceApplyGradientDescentUseLocking sets the optional use_locking attribute to value.
 //
-// Arguments:
-//	driver_name: The database type. Currently, the only supported type is 'sqlite'.
-//	data_source_name: A connection string to connect to the database.
-//	query: A SQL query to execute.
+// value: If `True`, the subtraction will be protected by a lock;
+// otherwise the behavior is undefined, but may exhibit less contention.
+// If not specified, defaults to false
+func ResourceApplyGradientDescentUseLocking(value bool) ResourceApplyGradientDescentAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// Update '*var' by subtracting 'alpha' * 'delta' from it.
 //
+// Arguments:
+//	var_: Should be from a Variable().
+//	alpha: Scaling factor. Must be a scalar.
+//	delta: The change.
 //
-func SqlDataset(scope *Scope, driver_name tf.Output, data_source_name tf.Output, query tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// Returns the created operation.
+func ResourceApplyGradientDescent(scope *Scope, var_ tf.Output, alpha tf.Output, delta tf.Output, optional ...ResourceApplyGradientDescentAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SqlDataset",
+		Type: "ResourceApplyGradientDescent",
 		Input: []tf.Input{
-			driver_name, data_source_name, query,
+			var_, alpha, delta,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Creates a dataset that emits the records from one or more binary files.
+// Returns the next record (key, value pair) produced by a Reader.
+//
+// Will dequeue from the input queue if necessary (e.g. when the
+// Reader needs to start reading from a new file since it has finished
+// with the previous file).
 //
 // Arguments:
-//	filenames: A scalar or a vector containing the name(s) of the file(s) to be
-// read.
-//	header_bytes: A scalar representing the number of bytes to skip at the
-// beginning of a file.
-//	record_bytes: A scalar representing the number of bytes in each record.
-//	footer_bytes: A scalar representing the number of bytes to skip at the end
-// of a file.
-//	buffer_size: A scalar representing the number of bytes to buffer. Must be > 0.
-func FixedLengthRecordDataset(scope *Scope, filenames tf.Output, header_bytes tf.Output, record_bytes tf.Output, footer_bytes tf.Output, buffer_size tf.Output) (handle tf.Output) {
+//	reader_handle: Handle to a Reader.
+//	queue_handle: Handle to a Queue, with string work items.
+//
+// Returns A scalar.A scalar.
+func ReaderReadV2(scope *Scope, reader_handle tf.Output, queue_handle tf.Output) (key tf.Output, value tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "FixedLengthRecordDataset",
+		Type: "ReaderReadV2",
 		Input: []tf.Input{
-			filenames, header_bytes, record_bytes, footer_bytes, buffer_size,
+			reader_handle, queue_handle,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// Gradients for batch normalization.
-//
-// DEPRECATED at GraphDef version 9: Use tf.nn.batch_normalization()
+// Returns up to `num_records` (key, value) pairs produced by a Reader.
 //
-// This op is deprecated. See `tf.nn.batch_normalization`.
+// Will dequeue from the input queue if necessary (e.g. when the
+// Reader needs to start reading from a new file since it has finished
+// with the previous file).
+// It may return less than `num_records` even before the last batch.
 //
 // Arguments:
-//	t: A 4D input Tensor.
-//	m: A 1D mean Tensor with size matching the last dimension of t.
-// This is the first output from tf.nn.moments,
-// or a saved moving average thereof.
-//	v: A 1D variance Tensor with size matching the last dimension of t.
-// This is the second output from tf.nn.moments,
-// or a saved moving average thereof.
-//	gamma: A 1D gamma Tensor with size matching the last dimension of t.
-// If "scale_after_normalization" is true, this Tensor will be multiplied
-// with the normalized Tensor.
-//	backprop: 4D backprop Tensor.
-//	variance_epsilon: A small float number to avoid dividing by 0.
-//	scale_after_normalization: A bool indicating whether the resulted tensor
-// needs to be multiplied with gamma.
+//	reader_handle: Handle to a `Reader`.
+//	queue_handle: Handle to a `Queue`, with string work items.
+//	num_records: number of records to read from `Reader`.
 //
-// Returns 4D backprop tensor for input.1D backprop tensor for mean.1D backprop tensor for variance.1D backprop tensor for beta.1D backprop tensor for gamma.
-func BatchNormWithGlobalNormalizationGrad(scope *Scope, t tf.Output, m tf.Output, v tf.Output, gamma tf.Output, backprop tf.Output, variance_epsilon float32, scale_after_normalization bool) (dx tf.Output, dm tf.Output, dv tf.Output, db tf.Output, dg tf.Output) {
+// Returns A 1-D tensor.A 1-D tensor.
+func ReaderReadUpToV2(scope *Scope, reader_handle tf.Output, queue_handle tf.Output, num_records tf.Output) (keys tf.Output, values tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"variance_epsilon": variance_epsilon, "scale_after_normalization": scale_after_normalization}
 	opspec := tf.OpSpec{
-		Type: "BatchNormWithGlobalNormalizationGrad",
+		Type: "ReaderReadUpToV2",
 		Input: []tf.Input{
-			t, m, v, gamma, backprop,
+			reader_handle, queue_handle, num_records,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
+	return op.Output(0), op.Output(1)
 }
 
-// Creates a dataset that emits the records from one or more TFRecord files.
+// Restore a Reader to its initial clean state.
 //
 // Arguments:
-//	filenames: A scalar or vector containing the name(s) of the file(s) to be
-// read.
-//	compression_type: A scalar containing either (i) the empty string (no
-// compression), (ii) "ZLIB", or (iii) "GZIP".
-//	buffer_size: A scalar representing the number of bytes to buffer. A value of
-// 0 means no buffering will be performed.
-func TFRecordDataset(scope *Scope, filenames tf.Output, compression_type tf.Output, buffer_size tf.Output) (handle tf.Output) {
+//	reader_handle: Handle to a Reader.
+//
+// Returns the created operation.
+func ReaderResetV2(scope *Scope, reader_handle tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "TFRecordDataset",
+		Type: "ReaderResetV2",
 		Input: []tf.Input{
-			filenames, compression_type, buffer_size,
+			reader_handle,
 		},
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// BatchToSpace for 4-D tensors of type T.
-//
-// This is a legacy version of the more general BatchToSpaceND.
-//
-// Rearranges (permutes) data from batch into blocks of spatial data, followed by
-// cropping. This is the reverse transformation of SpaceToBatch. More specifically,
-// this op outputs a copy of the input tensor where values from the `batch`
-// dimension are moved in spatial blocks to the `height` and `width` dimensions,
-// followed by cropping along the `height` and `width` dimensions.
-//
-// Arguments:
-//	input: 4-D tensor with shape
-// `[batch*block_size*block_size, height_pad/block_size, width_pad/block_size,
-//   depth]`. Note that the batch size of the input tensor must be divisible by
-// `block_size * block_size`.
-//	crops: 2-D tensor of non-negative integers with shape `[2, 2]`. It specifies
-// how many elements to crop from the intermediate result across the spatial
-// dimensions as follows:
-//
-//     crops = [[crop_top, crop_bottom], [crop_left, crop_right]]
-//
-//
-// Returns 4-D with shape `[batch, height, width, depth]`, where:
-//
-//       height = height_pad - crop_top - crop_bottom
-//       width = width_pad - crop_left - crop_right
-//
-// The attr `block_size` must be greater than one. It indicates the block size.
-//
-// Some examples:
-//
-// (1) For the following input of shape `[4, 1, 1, 1]` and block_size of 2:
-//
-// ```
-// [[[[1]]], [[[2]]], [[[3]]], [[[4]]]]
-// ```
-//
-// The output tensor has shape `[1, 2, 2, 1]` and value:
-//
-// ```
-// x = [[[[1], [2]], [[3], [4]]]]
-// ```
-//
-// (2) For the following input of shape `[4, 1, 1, 3]` and block_size of 2:
-//
-// ```
-// [[[1, 2, 3]], [[4, 5, 6]], [[7, 8, 9]], [[10, 11, 12]]]
-// ```
-//
-// The output tensor has shape `[1, 2, 2, 3]` and value:
-//
-// ```
-// x = [[[[1, 2, 3], [4, 5, 6]],
-//       [[7, 8, 9], [10, 11, 12]]]]
-// ```
-//
-// (3) For the following input of shape `[4, 2, 2, 1]` and block_size of 2:
-//
-// ```
-// x = [[[[1], [3]], [[9], [11]]],
-//      [[[2], [4]], [[10], [12]]],
-//      [[[5], [7]], [[13], [15]]],
-//      [[[6], [8]], [[14], [16]]]]
-// ```
-//
-// The output tensor has shape `[1, 4, 4, 1]` and value:
-//
-// ```
-// x = [[[1],   [2],  [3],  [4]],
-//      [[5],   [6],  [7],  [8]],
-//      [[9],  [10], [11],  [12]],
-//      [[13], [14], [15],  [16]]]
-// ```
-//
-// (4) For the following input of shape `[8, 1, 2, 1]` and block_size of 2:
-//
-// ```
-// x = [[[[1], [3]]], [[[9], [11]]], [[[2], [4]]], [[[10], [12]]],
-//      [[[5], [7]]], [[[13], [15]]], [[[6], [8]]], [[[14], [16]]]]
-// ```
-//
-// The output tensor has shape `[2, 2, 4, 1]` and value:
+// ResourceApplyAdamAttr is an optional argument to ResourceApplyAdam.
+type ResourceApplyAdamAttr func(optionalAttr)
+
+// ResourceApplyAdamUseLocking sets the optional use_locking attribute to value.
 //
-// ```
-// x = [[[[1], [3]], [[5], [7]]],
-//      [[[2], [4]], [[10], [12]]],
-//      [[[5], [7]], [[13], [15]]],
-//      [[[6], [8]], [[14], [16]]]]
-// ```
-func BatchToSpace(scope *Scope, input tf.Output, crops tf.Output, block_size int64) (output tf.Output) {
-	if scope.Err() != nil {
-		return
+// value: If `True`, updating of the var, m, and v tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceApplyAdamUseLocking(value bool) ResourceApplyAdamAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
 	}
-	attrs := map[string]interface{}{"block_size": block_size}
-	opspec := tf.OpSpec{
-		Type: "BatchToSpace",
-		Input: []tf.Input{
-			input, crops,
-		},
-		Attrs: attrs,
+}
+
+// ResourceApplyAdamUseNesterov sets the optional use_nesterov attribute to value.
+//
+// value: If `True`, uses the nesterov update.
+// If not specified, defaults to false
+func ResourceApplyAdamUseNesterov(value bool) ResourceApplyAdamAttr {
+	return func(m optionalAttr) {
+		m["use_nesterov"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Makes a new iterator from the given `dataset` and stores it in `iterator`.
+// Update '*var' according to the Adam algorithm.
 //
-// This operation may be executed multiple times. Each execution will reset the
-// iterator in `iterator` to the first element of `dataset`.
+// lr_t <- learning_rate * sqrt(1 - beta2^t) / (1 - beta1^t)
+// m_t <- beta1 * m_{t-1} + (1 - beta1) * g_t
+// v_t <- beta2 * v_{t-1} + (1 - beta2) * g_t * g_t
+// variable <- variable - lr_t * m_t / (sqrt(v_t) + epsilon)
+//
+// Arguments:
+//	var_: Should be from a Variable().
+//	m: Should be from a Variable().
+//	v: Should be from a Variable().
+//	beta1_power: Must be a scalar.
+//	beta2_power: Must be a scalar.
+//	lr: Scaling factor. Must be a scalar.
+//	beta1: Momentum factor. Must be a scalar.
+//	beta2: Momentum factor. Must be a scalar.
+//	epsilon: Ridge term. Must be a scalar.
+//	grad: The gradient.
 //
 // Returns the created operation.
-func MakeIterator(scope *Scope, dataset tf.Output, iterator tf.Output) (o *tf.Operation) {
+func ResourceApplyAdam(scope *Scope, var_ tf.Output, m tf.Output, v tf.Output, beta1_power tf.Output, beta2_power tf.Output, lr tf.Output, beta1 tf.Output, beta2 tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyAdamAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "MakeIterator",
+		Type: "ResourceApplyAdam",
 		Input: []tf.Input{
-			dataset, iterator,
+			var_, m, v, beta1_power, beta2_power, lr, beta1, beta2, epsilon, grad,
 		},
+		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// Adjust the contrast of one or more images.
-//
-// `images` is a tensor of at least 3 dimensions.  The last 3 dimensions are
-// interpreted as `[height, width, channels]`.  The other dimensions only
-// represent a collection of images, such as `[batch, height, width, channels].`
-//
-// Contrast is adjusted independently for each channel of each image.
-//
-// For each channel, the Op first computes the mean of the image pixels in the
-// channel and then adjusts each component of each pixel to
-// `(x - mean) * contrast_factor + mean`.
+// Store the input tensor in the state of the current session.
 //
 // Arguments:
-//	images: Images to adjust.  At least 3-D.
-//	contrast_factor: A float multiplier for adjusting contrast.
+//	value: The tensor to be stored.
 //
-// Returns The contrast-adjusted image or images.
-func AdjustContrastv2(scope *Scope, images tf.Output, contrast_factor tf.Output) (output tf.Output) {
+// Returns The handle for the tensor stored in the session state, represented
+// as a ResourceHandle object.
+func GetSessionHandleV2(scope *Scope, value tf.Output) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "AdjustContrastv2",
+		Type: "GetSessionHandleV2",
 		Input: []tf.Input{
-			images, contrast_factor,
+			value,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Gets the next output from the given iterator.
-func IteratorGetNext(scope *Scope, iterator tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (components []tf.Output) {
+// ResizeBicubicGradAttr is an optional argument to ResizeBicubicGrad.
+type ResizeBicubicGradAttr func(optionalAttr)
+
+// ResizeBicubicGradAlignCorners sets the optional align_corners attribute to value.
+//
+// value: If true, rescale grads by (orig_height - 1) / (height - 1), which
+// exactly aligns the 4 corners of grads and original_image. If false, rescale by
+// orig_height / height. Treat similarly the width dimension.
+// If not specified, defaults to false
+func ResizeBicubicGradAlignCorners(value bool) ResizeBicubicGradAttr {
+	return func(m optionalAttr) {
+		m["align_corners"] = value
+	}
+}
+
+// Computes the gradient of bicubic interpolation.
+//
+// Arguments:
+//	grads: 4-D with shape `[batch, height, width, channels]`.
+//	original_image: 4-D with shape `[batch, orig_height, orig_width, channels]`,
+// The image tensor that was resized.
+//
+// Returns 4-D with shape `[batch, orig_height, orig_width, channels]`.
+// Gradients with respect to the input image. Input image must have been
+// float or double.
+func ResizeBicubicGrad(scope *Scope, grads tf.Output, original_image tf.Output, optional ...ResizeBicubicGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "IteratorGetNext",
+		Type: "ResizeBicubicGrad",
 		Input: []tf.Input{
-			iterator,
+			grads, original_image,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
-		scope.UpdateErr("IteratorGetNext", err)
-		return
-	}
-	return components
+	return op.Output(0)
 }
 
-// Outputs the single element from the given dataset.
-//
-// Arguments:
-//	dataset: A handle to a dataset that contains a single element.
+// ResizeNearestNeighborAttr is an optional argument to ResizeNearestNeighbor.
+type ResizeNearestNeighborAttr func(optionalAttr)
+
+// ResizeNearestNeighborAlignCorners sets the optional align_corners attribute to value.
 //
-//
-//
-// Returns The components of the single element of `input`.
-func DatasetToSingleElement(scope *Scope, dataset tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (components []tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
-	opspec := tf.OpSpec{
-		Type: "DatasetToSingleElement",
-		Input: []tf.Input{
-			dataset,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
-		scope.UpdateErr("DatasetToSingleElement", err)
-		return
+// value: If true, rescale input by (new_height - 1) / (height - 1), which
+// exactly aligns the 4 corners of images and resized images. If false, rescale
+// by new_height / height. Treat similarly the width dimension.
+// If not specified, defaults to false
+func ResizeNearestNeighborAlignCorners(value bool) ResizeNearestNeighborAttr {
+	return func(m optionalAttr) {
+		m["align_corners"] = value
 	}
-	return components
 }
 
-// Converts the given `resource_handle` representing an iterator to a string.
+// Resize `images` to `size` using nearest neighbor interpolation.
 //
 // Arguments:
-//	resource_handle: A handle to an iterator resource.
+//	images: 4-D with shape `[batch, height, width, channels]`.
+//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
+// new size for the images.
 //
-// Returns A string representation of the given handle.
-func IteratorToStringHandle(scope *Scope, resource_handle tf.Output) (string_handle tf.Output) {
+// Returns 4-D with shape
+// `[batch, new_height, new_width, channels]`.
+func ResizeNearestNeighbor(scope *Scope, images tf.Output, size tf.Output, optional ...ResizeNearestNeighborAttr) (resized_images tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "IteratorToStringHandle",
+		Type: "ResizeNearestNeighbor",
 		Input: []tf.Input{
-			resource_handle,
+			images, size,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// IteratorFromStringHandleAttr is an optional argument to IteratorFromStringHandle.
-type IteratorFromStringHandleAttr func(optionalAttr)
-
-// IteratorFromStringHandleOutputTypes sets the optional output_types attribute to value.
-//
-// value: If specified, defines the type of each tuple component in an
-// element produced by the resulting iterator.
-// If not specified, defaults to <>
-//
-// REQUIRES: len(value) >= 0
-func IteratorFromStringHandleOutputTypes(value []tf.DataType) IteratorFromStringHandleAttr {
-	return func(m optionalAttr) {
-		m["output_types"] = value
-	}
-}
+// ResizeNearestNeighborGradAttr is an optional argument to ResizeNearestNeighborGrad.
+type ResizeNearestNeighborGradAttr func(optionalAttr)
 
-// IteratorFromStringHandleOutputShapes sets the optional output_shapes attribute to value.
-//
-// value: If specified, defines the shape of each tuple component in an
-// element produced by the resulting iterator.
-// If not specified, defaults to <>
+// ResizeNearestNeighborGradAlignCorners sets the optional align_corners attribute to value.
 //
-// REQUIRES: len(value) >= 0
-func IteratorFromStringHandleOutputShapes(value []tf.Shape) IteratorFromStringHandleAttr {
+// value: If true, rescale grads by (orig_height - 1) / (height - 1), which
+// exactly aligns the 4 corners of grads and original_image. If false, rescale by
+// orig_height / height. Treat similarly the width dimension.
+// If not specified, defaults to false
+func ResizeNearestNeighborGradAlignCorners(value bool) ResizeNearestNeighborGradAttr {
 	return func(m optionalAttr) {
-		m["output_shapes"] = value
+		m["align_corners"] = value
 	}
 }
 
-// Converts the given string representing a handle to an iterator to a resource.
+// Computes the gradient of nearest neighbor interpolation.
 //
 // Arguments:
-//	string_handle: A string representation of the given handle.
+//	grads: 4-D with shape `[batch, height, width, channels]`.
+//	size: = A 1-D int32 Tensor of 2 elements: `orig_height, orig_width`. The
+// original input size.
 //
-// Returns A handle to an iterator resource.
-func IteratorFromStringHandle(scope *Scope, string_handle tf.Output, optional ...IteratorFromStringHandleAttr) (resource_handle tf.Output) {
+// Returns 4-D with shape `[batch, orig_height, orig_width, channels]`. Gradients
+// with respect to the input image.
+func ResizeNearestNeighborGrad(scope *Scope, grads tf.Output, size tf.Output, optional ...ResizeNearestNeighborGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -22325,9 +21341,9 @@ func IteratorFromStringHandle(scope *Scope, string_handle tf.Output, optional ..
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "IteratorFromStringHandle",
+		Type: "ResizeNearestNeighborGrad",
 		Input: []tf.Input{
-			string_handle,
+			grads, size,
 		},
 		Attrs: attrs,
 	}
@@ -22335,156 +21351,97 @@ func IteratorFromStringHandle(scope *Scope, string_handle tf.Output, optional ..
 	return op.Output(0)
 }
 
-// Computes arctangent of `y/x` element-wise, respecting signs of the arguments.
-//
-// This is the angle \( \theta \in [-\pi, \pi] \) such that
-// \[ x = r \cos(\theta) \]
-// and
-// \[ y = r \sin(\theta) \]
-// where \(r = \sqrt(x^2 + y^2) \).
-func Atan2(scope *Scope, y tf.Output, x tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Atan2",
-		Input: []tf.Input{
-			y, x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
+// ExtractJpegShapeAttr is an optional argument to ExtractJpegShape.
+type ExtractJpegShapeAttr func(optionalAttr)
 
-// Return a tensor with the same shape and contents as the input tensor or value.
-func Identity(scope *Scope, input tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Identity",
-		Input: []tf.Input{
-			input,
-		},
+// ExtractJpegShapeOutputType sets the optional output_type attribute to value.
+//
+// value: (Optional) The output type of the operation (int32 or int64).
+// Defaults to int32.
+// If not specified, defaults to DT_INT32
+func ExtractJpegShapeOutputType(value tf.DataType) ExtractJpegShapeAttr {
+	return func(m optionalAttr) {
+		m["output_type"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Gather slices from `params` axis `axis` according to `indices`.
-//
-// `indices` must be an integer tensor of any dimension (usually 0-D or 1-D).
-// Produces an output tensor with shape `params.shape[:axis] + indices.shape +
-// params.shape[axis + 1:]` where:
-//
-// ```python
-//     # Scalar indices (output is rank(params) - 1).
-//     output[a_0, ..., a_n, b_0, ..., b_n] =
-//       params[a_0, ..., a_n, indices, b_0, ..., b_n]
-//
-//     # Vector indices (output is rank(params)).
-//     output[a_0, ..., a_n, i, b_0, ..., b_n] =
-//       params[a_0, ..., a_n, indices[i], b_0, ..., b_n]
-//
-//     # Higher rank indices (output is rank(params) + rank(indices) - 1).
-//     output[a_0, ..., a_n, i, ..., j, b_0, ... b_n] =
-//       params[a_0, ..., a_n, indices[i, ..., j], b_0, ..., b_n]
-// ```
+// Extract the shape information of a JPEG-encoded image.
 //
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/Gather.png" alt>
-// </div>
+// This op only parses the image header, so it is much faster than DecodeJpeg.
 //
 // Arguments:
-//	params: The tensor from which to gather values. Must be at least rank
-// `axis + 1`.
-//	indices: Index tensor. Must be in range `[0, params.shape[axis])`.
-//	axis: The axis in `params` to gather `indices` from. Defaults to the first
-// dimension. Supports negative indexes.
+//	contents: 0-D. The JPEG-encoded image.
 //
-// Returns Values from `params` gathered from indices given by `indices`, with
-// shape `params.shape[:axis] + indices.shape + params.shape[axis + 1:]`.
-func GatherV2(scope *Scope, params tf.Output, indices tf.Output, axis tf.Output) (output tf.Output) {
+// Returns 1-D. The image shape with format [height, width, channels].
+func ExtractJpegShape(scope *Scope, contents tf.Output, optional ...ExtractJpegShapeAttr) (image_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	opspec := tf.OpSpec{
-		Type: "GatherV2",
-		Input: []tf.Input{
-			params, indices, axis,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Converts the given `resource_handle` representing an iterator to a variant tensor.
-//
-// Arguments:
-//	resource_handle: A handle to an iterator resource.
-//
-// Returns A variant tensor storing the state of the iterator contained in the
-// resource.
-func SerializeIterator(scope *Scope, resource_handle tf.Output) (serialized tf.Output) {
-	if scope.Err() != nil {
-		return
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SerializeIterator",
+		Type: "ExtractJpegShape",
 		Input: []tf.Input{
-			resource_handle,
+			contents,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// FIFOQueueV2Attr is an optional argument to FIFOQueueV2.
-type FIFOQueueV2Attr func(optionalAttr)
+// PaddingFIFOQueueV2Attr is an optional argument to PaddingFIFOQueueV2.
+type PaddingFIFOQueueV2Attr func(optionalAttr)
 
-// FIFOQueueV2Shapes sets the optional shapes attribute to value.
+// PaddingFIFOQueueV2Shapes sets the optional shapes attribute to value.
 //
 // value: The shape of each component in a value. The length of this attr must
-// be either 0 or the same as the length of component_types. If the length of
-// this attr is 0, the shapes of queue elements are not constrained, and
-// only one element may be dequeued at a time.
+// be either 0 or the same as the length of component_types.
+// Shapes of fixed rank but variable size are allowed by setting
+// any shape dimension to -1.  In this case, the inputs' shape may vary along
+// the given dimension, and DequeueMany will pad the given dimension with
+// zeros up to the maximum shape of all elements in the given batch.
+// If the length of this attr is 0, different queue elements may have
+// different ranks and shapes, but only one element may be dequeued at a time.
 // If not specified, defaults to <>
 //
 // REQUIRES: len(value) >= 0
-func FIFOQueueV2Shapes(value []tf.Shape) FIFOQueueV2Attr {
+func PaddingFIFOQueueV2Shapes(value []tf.Shape) PaddingFIFOQueueV2Attr {
 	return func(m optionalAttr) {
 		m["shapes"] = value
 	}
 }
 
-// FIFOQueueV2Capacity sets the optional capacity attribute to value.
+// PaddingFIFOQueueV2Capacity sets the optional capacity attribute to value.
 //
 // value: The upper bound on the number of elements in this queue.
 // Negative numbers mean no limit.
 // If not specified, defaults to -1
-func FIFOQueueV2Capacity(value int64) FIFOQueueV2Attr {
+func PaddingFIFOQueueV2Capacity(value int64) PaddingFIFOQueueV2Attr {
 	return func(m optionalAttr) {
 		m["capacity"] = value
 	}
 }
 
-// FIFOQueueV2Container sets the optional container attribute to value.
+// PaddingFIFOQueueV2Container sets the optional container attribute to value.
 //
 // value: If non-empty, this queue is placed in the given container.
 // Otherwise, a default container is used.
 // If not specified, defaults to ""
-func FIFOQueueV2Container(value string) FIFOQueueV2Attr {
+func PaddingFIFOQueueV2Container(value string) PaddingFIFOQueueV2Attr {
 	return func(m optionalAttr) {
 		m["container"] = value
 	}
 }
 
-// FIFOQueueV2SharedName sets the optional shared_name attribute to value.
+// PaddingFIFOQueueV2SharedName sets the optional shared_name attribute to value.
 //
 // value: If non-empty, this queue will be shared under the given name
 // across multiple sessions.
 // If not specified, defaults to ""
-func FIFOQueueV2SharedName(value string) FIFOQueueV2Attr {
+func PaddingFIFOQueueV2SharedName(value string) PaddingFIFOQueueV2Attr {
 	return func(m optionalAttr) {
 		m["shared_name"] = value
 	}
@@ -22492,11 +21449,15 @@ func FIFOQueueV2SharedName(value string) FIFOQueueV2Attr {
 
 // A queue that produces elements in first-in first-out order.
 //
+// Variable-size shapes are allowed by setting the corresponding shape dimensions
+// to 0 in the shape attr.  In this case DequeueMany will pad up to the maximum
+// size of any given element in the minibatch.  See below for details.
+//
 // Arguments:
 //	component_types: The type of each component in a value.
 //
 // Returns The handle to the queue.
-func FIFOQueueV2(scope *Scope, component_types []tf.DataType, optional ...FIFOQueueV2Attr) (handle tf.Output) {
+func PaddingFIFOQueueV2(scope *Scope, component_types []tf.DataType, optional ...PaddingFIFOQueueV2Attr) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -22505,7 +21466,7 @@ func FIFOQueueV2(scope *Scope, component_types []tf.DataType, optional ...FIFOQu
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "FIFOQueueV2",
+		Type: "PaddingFIFOQueueV2",
 
 		Attrs: attrs,
 	}
@@ -22513,559 +21474,372 @@ func FIFOQueueV2(scope *Scope, component_types []tf.DataType, optional ...FIFOQu
 	return op.Output(0)
 }
 
-// Produces a summary of any statistics recorded by the given statistics manager.
-func StatsAggregatorSummary(scope *Scope, iterator tf.Output) (summary tf.Output) {
-	if scope.Err() != nil {
-		return
+// DecodePngAttr is an optional argument to DecodePng.
+type DecodePngAttr func(optionalAttr)
+
+// DecodePngChannels sets the optional channels attribute to value.
+//
+// value: Number of color channels for the decoded image.
+// If not specified, defaults to 0
+func DecodePngChannels(value int64) DecodePngAttr {
+	return func(m optionalAttr) {
+		m["channels"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "StatsAggregatorSummary",
-		Input: []tf.Input{
-			iterator,
-		},
+}
+
+// DecodePngDtype sets the optional dtype attribute to value.
+// If not specified, defaults to DT_UINT8
+func DecodePngDtype(value tf.DataType) DecodePngAttr {
+	return func(m optionalAttr) {
+		m["dtype"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Compute the pairwise cross product.
+// Decode a PNG-encoded image to a uint8 or uint16 tensor.
 //
-// `a` and `b` must be the same shape; they can either be simple 3-element vectors,
-// or any shape where the innermost dimension is 3. In the latter case, each pair
-// of corresponding 3-element vectors is cross-multiplied independently.
+// The attr `channels` indicates the desired number of color channels for the
+// decoded image.
+//
+// Accepted values are:
+//
+// *   0: Use the number of channels in the PNG-encoded image.
+// *   1: output a grayscale image.
+// *   3: output an RGB image.
+// *   4: output an RGBA image.
+//
+// If needed, the PNG-encoded image is transformed to match the requested number
+// of color channels.
+//
+// This op also supports decoding JPEGs and non-animated GIFs since the interface
+// is the same, though it is cleaner to use `tf.image.decode_image`.
 //
 // Arguments:
-//	a: A tensor containing 3-element vectors.
-//	b: Another tensor, of same type and shape as `a`.
+//	contents: 0-D.  The PNG-encoded image.
 //
-// Returns Pairwise cross product of the vectors in `a` and `b`.
-func Cross(scope *Scope, a tf.Output, b tf.Output) (product tf.Output) {
+// Returns 3-D with shape `[height, width, channels]`.
+func DecodePng(scope *Scope, contents tf.Output, optional ...DecodePngAttr) (image tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Cross",
+		Type: "DecodePng",
 		Input: []tf.Input{
-			a, b,
+			contents,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Performs a padding as a preprocess during a convolution.
+// Decode the first frame of a GIF-encoded image to a uint8 tensor.
 //
-// Similar to FusedResizeAndPadConv2d, this op allows for an optimized
-// implementation where the spatial padding transformation stage is fused with the
-// im2col lookup, but in this case without the bilinear filtering required for
-// resizing. Fusing the padding prevents the need to write out the intermediate
-// results as whole tensors, reducing memory pressure, and we can get some latency
-// gains by merging the transformation calculations.
-// The data_format attribute for Conv2D isn't supported by this op, and 'NHWC'
-// order is used instead.
-// Internally this op uses a single per-graph scratch buffer, which means that it
-// will block if multiple versions are being run in parallel. This is because this
-// operator is primarily an optimization to minimize memory usage.
+// GIF with frame or transparency compression are not supported
+// convert animated GIF from compressed to uncompressed by:
+//
+//     convert $src.gif -coalesce $dst.gif
+//
+// This op also supports decoding JPEGs and PNGs, though it is cleaner to use
+// `tf.image.decode_image`.
 //
 // Arguments:
-//	input: 4-D with shape `[batch, in_height, in_width, in_channels]`.
-//	paddings: A two-column matrix specifying the padding sizes. The number of
-// rows must be the same as the rank of `input`.
-//	filter: 4-D with shape
-// `[filter_height, filter_width, in_channels, out_channels]`.
+//	contents: 0-D.  The GIF-encoded image.
 //
-//	strides: 1-D of length 4.  The stride of the sliding window for each dimension
-// of `input`. Must be in the same order as the dimension specified with format.
-//	padding: The type of padding algorithm to use.
-func FusedPadConv2D(scope *Scope, input tf.Output, paddings tf.Output, filter tf.Output, mode string, strides []int64, padding string) (output tf.Output) {
+// Returns 4-D with shape `[num_frames, height, width, 3]`. RGB order
+func DecodeGif(scope *Scope, contents tf.Output) (image tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"mode": mode, "strides": strides, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "FusedPadConv2D",
+		Type: "DecodeGif",
 		Input: []tf.Input{
-			input, paddings, filter,
+			contents,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Conv2DBackpropInputAttr is an optional argument to Conv2DBackpropInput.
-type Conv2DBackpropInputAttr func(optionalAttr)
-
-// Conv2DBackpropInputUseCudnnOnGpu sets the optional use_cudnn_on_gpu attribute to value.
-// If not specified, defaults to true
-func Conv2DBackpropInputUseCudnnOnGpu(value bool) Conv2DBackpropInputAttr {
-	return func(m optionalAttr) {
-		m["use_cudnn_on_gpu"] = value
-	}
-}
+// ResourceApplyCenteredRMSPropAttr is an optional argument to ResourceApplyCenteredRMSProp.
+type ResourceApplyCenteredRMSPropAttr func(optionalAttr)
 
-// Conv2DBackpropInputDataFormat sets the optional data_format attribute to value.
+// ResourceApplyCenteredRMSPropUseLocking sets the optional use_locking attribute to value.
 //
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// If not specified, defaults to "NHWC"
-func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr {
+// value: If `True`, updating of the var, mg, ms, and mom tensors is
+// protected by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceApplyCenteredRMSPropUseLocking(value bool) ResourceApplyCenteredRMSPropAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["use_locking"] = value
 	}
 }
 
-// Conv2DBackpropInputDilations sets the optional dilations attribute to value.
+// Update '*var' according to the centered RMSProp algorithm.
 //
-// value: 1-D tensor of length 4.  The dilation factor for each dimension of
-// `input`. If set to k > 1, there will be k-1 skipped cells between each filter
-// element on that dimension. The dimension order is determined by the value of
-// `data_format`, see above for details. Dilations in the batch and depth
-// dimensions must be 1.
-// If not specified, defaults to <i:1 i:1 i:1 i:1 >
-func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr {
-	return func(m optionalAttr) {
-		m["dilations"] = value
-	}
-}
-
-// Computes the gradients of convolution with respect to the input.
+// The centered RMSProp algorithm uses an estimate of the centered second moment
+// (i.e., the variance) for normalization, as opposed to regular RMSProp, which
+// uses the (uncentered) second moment. This often helps with training, but is
+// slightly more expensive in terms of computation and memory.
+//
+// Note that in dense implementation of this algorithm, mg, ms, and mom will
+// update even if the grad is zero, but in this sparse implementation, mg, ms,
+// and mom will not update in iterations during which the grad is zero.
+//
+// mean_square = decay * mean_square + (1-decay) * gradient ** 2
+// mean_grad = decay * mean_grad + (1-decay) * gradient
+//
+// Delta = learning_rate * gradient / sqrt(mean_square + epsilon - mean_grad ** 2)
+//
+// mg <- rho * mg_{t-1} + (1-rho) * grad
+// ms <- rho * ms_{t-1} + (1-rho) * grad * grad
+// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms - mg * mg + epsilon)
+// var <- var - mom
 //
 // Arguments:
-//	input_sizes: An integer vector representing the shape of `input`,
-// where `input` is a 4-D `[batch, height, width, channels]` tensor.
-//	filter: 4-D with shape
-// `[filter_height, filter_width, in_channels, out_channels]`.
-//	out_backprop: 4-D with shape `[batch, out_height, out_width, out_channels]`.
-// Gradients w.r.t. the output of the convolution.
-//	strides: The stride of the sliding window for each dimension of the input
-// of the convolution. Must be in the same order as the dimension specified with
-// format.
-//	padding: The type of padding algorithm to use.
+//	var_: Should be from a Variable().
+//	mg: Should be from a Variable().
+//	ms: Should be from a Variable().
+//	mom: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	rho: Decay rate. Must be a scalar.
 //
-// Returns 4-D with shape `[batch, in_height, in_width, in_channels]`.  Gradient
-// w.r.t. the input of the convolution.
-func Conv2DBackpropInput(scope *Scope, input_sizes tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv2DBackpropInputAttr) (output tf.Output) {
+//	epsilon: Ridge term. Must be a scalar.
+//	grad: The gradient.
+//
+// Returns the created operation.
+func ResourceApplyCenteredRMSProp(scope *Scope, var_ tf.Output, mg tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyCenteredRMSPropAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Conv2DBackpropInput",
+		Type: "ResourceApplyCenteredRMSProp",
 		Input: []tf.Input{
-			input_sizes, filter, out_backprop,
+			var_, mg, ms, mom, lr, rho, momentum, epsilon, grad,
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Interleave the values from the `data` tensors into a single tensor.
-//
-// Builds a merged tensor such that
-//
-// ```python
-//     merged[indices[m][i, ..., j], ...] = data[m][i, ..., j, ...]
-// ```
-//
-// For example, if each `indices[m]` is scalar or vector, we have
-//
-// ```python
-//     # Scalar indices:
-//     merged[indices[m], ...] = data[m][...]
-//
-//     # Vector indices:
-//     merged[indices[m][i], ...] = data[m][i, ...]
-// ```
-//
-// Each `data[i].shape` must start with the corresponding `indices[i].shape`,
-// and the rest of `data[i].shape` must be constant w.r.t. `i`.  That is, we
-// must have `data[i].shape = indices[i].shape + constant`.  In terms of this
-// `constant`, the output shape is
-//
-//     merged.shape = [max(indices)] + constant
+// Returns a list of tensors with the same shapes and contents as the input
 //
-// Values are merged in order, so if an index appears in both `indices[m][i]` and
-// `indices[n][j]` for `(m,i) < (n,j)` the slice `data[n][j]` will appear in the
-// merged result. If you do not need this guarantee, ParallelDynamicStitch might
-// perform better on some devices.
+// tensors.
 //
-// For example:
+// This op can be used to override the gradient for complicated functions. For
+// example, suppose y = f(x) and we wish to apply a custom function g for backprop
+// such that dx = g(dy). In Python,
 //
 // ```python
-//     indices[0] = 6
-//     indices[1] = [4, 1]
-//     indices[2] = [[5, 2], [0, 3]]
-//     data[0] = [61, 62]
-//     data[1] = [[41, 42], [11, 12]]
-//     data[2] = [[[51, 52], [21, 22]], [[1, 2], [31, 32]]]
-//     merged = [[1, 2], [11, 12], [21, 22], [31, 32], [41, 42],
-//               [51, 52], [61, 62]]
-// ```
-//
-// This method can be used to merge partitions created by `dynamic_partition`
-// as illustrated on the following example:
+// with tf.get_default_graph().gradient_override_map(
+//     {'IdentityN': 'OverrideGradientWithG'}):
+//   y, _ = identity_n([f(x), x])
 //
-// ```python
-//     # Apply function (increments x_i) on elements for which a certain condition
-//     # apply (x_i != -1 in this example).
-//     x=tf.constant([0.1, -1., 5.2, 4.3, -1., 7.4])
-//     condition_mask=tf.not_equal(x,tf.constant(-1.))
-//     partitioned_data = tf.dynamic_partition(
-//         x, tf.cast(condition_mask, tf.int32) , 2)
-//     partitioned_data[1] = partitioned_data[1] + 1.0
-//     condition_indices = tf.dynamic_partition(
-//         tf.range(tf.shape(x)[0]), tf.cast(condition_mask, tf.int32) , 2)
-//     x = tf.dynamic_stitch(condition_indices, partitioned_data)
-//     # Here x=[1.1, -1., 6.2, 5.3, -1, 8.4], the -1. values remain
-//     # unchanged.
+// @tf.RegisterGradient('OverrideGradientWithG')
+// def ApplyG(op, dy, _):
+//   return [None, g(dy)]  # Do not backprop to f(x).
 // ```
-//
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/DynamicStitch.png" alt>
-// </div>
-func DynamicStitch(scope *Scope, indices []tf.Output, data []tf.Output) (merged tf.Output) {
+func IdentityN(scope *Scope, input []tf.Output) (output []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "DynamicStitch",
+		Type: "IdentityN",
 		Input: []tf.Input{
-			tf.OutputList(indices), tf.OutputList(data),
+			tf.OutputList(input),
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Returns the truth value of (x == y) element-wise.
-//
-// *NOTE*: `Equal` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Equal(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	opspec := tf.OpSpec{
-		Type: "Equal",
-		Input: []tf.Input{
-			x, y,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// TensorArrayGatherV2Attr is an optional argument to TensorArrayGatherV2.
-type TensorArrayGatherV2Attr func(optionalAttr)
-
-// TensorArrayGatherV2ElementShape sets the optional element_shape attribute to value.
-// If not specified, defaults to <unknown_rank:true >
-func TensorArrayGatherV2ElementShape(value tf.Shape) TensorArrayGatherV2Attr {
-	return func(m optionalAttr) {
-		m["element_shape"] = value
+	var idx int
+	var err error
+	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
+		scope.UpdateErr("IdentityN", err)
+		return
 	}
+	return output
 }
 
-// Deprecated. Use TensorArrayGatherV3
+// Computes the gradient of the sigmoid of `x` wrt its input.
 //
-// DEPRECATED at GraphDef version 26: Use TensorArrayGatherV3
-func TensorArrayGatherV2(scope *Scope, handle tf.Output, indices tf.Output, flow_in tf.Output, dtype tf.DataType, optional ...TensorArrayGatherV2Attr) (value tf.Output) {
+// Specifically, `grad = dy * y * (1 - y)`, where `y = sigmoid(x)`, and
+// `dy` is the corresponding input gradient.
+func SigmoidGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayGatherV2",
+		Type: "SigmoidGrad",
 		Input: []tf.Input{
-			handle, indices, flow_in,
+			y, dy,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Interleave the values from the `data` tensors into a single tensor.
-//
-// Builds a merged tensor such that
-//
-// ```python
-//     merged[indices[m][i, ..., j], ...] = data[m][i, ..., j, ...]
-// ```
-//
-// For example, if each `indices[m]` is scalar or vector, we have
-//
-// ```python
-//     # Scalar indices:
-//     merged[indices[m], ...] = data[m][...]
-//
-//     # Vector indices:
-//     merged[indices[m][i], ...] = data[m][i, ...]
-// ```
-//
-// Each `data[i].shape` must start with the corresponding `indices[i].shape`,
-// and the rest of `data[i].shape` must be constant w.r.t. `i`.  That is, we
-// must have `data[i].shape = indices[i].shape + constant`.  In terms of this
-// `constant`, the output shape is
-//
-//     merged.shape = [max(indices)] + constant
-//
-// Values may be merged in parallel, so if an index appears in both `indices[m][i]`
-// and `indices[n][j]`, the result may be invalid. This differs from the normal
-// DynamicStitch operator that defines the behavior in that case.
-//
-// For example:
+// Convert one or more images from HSV to RGB.
 //
-// ```python
-//     indices[0] = 6
-//     indices[1] = [4, 1]
-//     indices[2] = [[5, 2], [0, 3]]
-//     data[0] = [61, 62]
-//     data[1] = [[41, 42], [11, 12]]
-//     data[2] = [[[51, 52], [21, 22]], [[1, 2], [31, 32]]]
-//     merged = [[1, 2], [11, 12], [21, 22], [31, 32], [41, 42],
-//               [51, 52], [61, 62]]
-// ```
+// Outputs a tensor of the same shape as the `images` tensor, containing the RGB
+// value of the pixels. The output is only well defined if the value in `images`
+// are in `[0,1]`.
 //
-// This method can be used to merge partitions created by `dynamic_partition`
-// as illustrated on the following example:
+// See `rgb_to_hsv` for a description of the HSV encoding.
 //
-// ```python
-//     # Apply function (increments x_i) on elements for which a certain condition
-//     # apply (x_i != -1 in this example).
-//     x=tf.constant([0.1, -1., 5.2, 4.3, -1., 7.4])
-//     condition_mask=tf.not_equal(x,tf.constant(-1.))
-//     partitioned_data = tf.dynamic_partition(
-//         x, tf.cast(condition_mask, tf.int32) , 2)
-//     partitioned_data[1] = partitioned_data[1] + 1.0
-//     condition_indices = tf.dynamic_partition(
-//         tf.range(tf.shape(x)[0]), tf.cast(condition_mask, tf.int32) , 2)
-//     x = tf.dynamic_stitch(condition_indices, partitioned_data)
-//     # Here x=[1.1, -1., 6.2, 5.3, -1, 8.4], the -1. values remain
-//     # unchanged.
-// ```
+// Arguments:
+//	images: 1-D or higher rank. HSV data to convert. Last dimension must be size 3.
 //
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/DynamicStitch.png" alt>
-// </div>
-func ParallelDynamicStitch(scope *Scope, indices []tf.Output, data []tf.Output) (merged tf.Output) {
+// Returns `images` converted to RGB.
+func HSVToRGB(scope *Scope, images tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ParallelDynamicStitch",
+		Type: "HSVToRGB",
 		Input: []tf.Input{
-			tf.OutputList(indices), tf.OutputList(data),
+			images,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the gradient for the inverse of `x` wrt its input.
+// SampleDistortedBoundingBoxV2Attr is an optional argument to SampleDistortedBoundingBoxV2.
+type SampleDistortedBoundingBoxV2Attr func(optionalAttr)
+
+// SampleDistortedBoundingBoxV2Seed sets the optional seed attribute to value.
 //
-// Specifically, `grad = -dy * y*y`, where `y = 1/x`, and `dy`
-// is the corresponding input gradient.
-func InvGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "InvGrad",
-		Input: []tf.Input{
-			y, dy,
-		},
+// value: If either `seed` or `seed2` are set to non-zero, the random number
+// generator is seeded by the given `seed`.  Otherwise, it is seeded by a random
+// seed.
+// If not specified, defaults to 0
+func SampleDistortedBoundingBoxV2Seed(value int64) SampleDistortedBoundingBoxV2Attr {
+	return func(m optionalAttr) {
+		m["seed"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// StridedSliceAttr is an optional argument to StridedSlice.
-type StridedSliceAttr func(optionalAttr)
-
-// StridedSliceBeginMask sets the optional begin_mask attribute to value.
+// SampleDistortedBoundingBoxV2Seed2 sets the optional seed2 attribute to value.
 //
-// value: a bitmask where a bit i being 1 means to ignore the begin
-// value and instead use the largest interval possible. At runtime
-// begin[i] will be replaced with `[0, n-1) if `stride[i] > 0` or
-// `[-1, n-1]` if `stride[i] < 0`
+// value: A second seed to avoid seed collision.
 // If not specified, defaults to 0
-func StridedSliceBeginMask(value int64) StridedSliceAttr {
+func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2Attr {
 	return func(m optionalAttr) {
-		m["begin_mask"] = value
+		m["seed2"] = value
 	}
 }
 
-// StridedSliceEndMask sets the optional end_mask attribute to value.
+// SampleDistortedBoundingBoxV2AspectRatioRange sets the optional aspect_ratio_range attribute to value.
 //
-// value: analogous to `begin_mask`
-// If not specified, defaults to 0
-func StridedSliceEndMask(value int64) StridedSliceAttr {
+// value: The cropped area of the image must have an aspect ratio =
+// width / height within this range.
+// If not specified, defaults to <f:0.75 f:1.33 >
+func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr {
 	return func(m optionalAttr) {
-		m["end_mask"] = value
+		m["aspect_ratio_range"] = value
 	}
 }
 
-// StridedSliceEllipsisMask sets the optional ellipsis_mask attribute to value.
+// SampleDistortedBoundingBoxV2AreaRange sets the optional area_range attribute to value.
 //
-// value: a bitmask where bit `i` being 1 means the `i`th
-// position is actually an ellipsis. One bit at most can be 1.
-// If `ellipsis_mask == 0`, then an implicit ellipsis mask of `1 << (m+1)`
-// is provided. This means that `foo[3:5] == foo[3:5, ...]`. An ellipsis
-// implicitly creates as many range specifications as necessary to fully
-// specify the sliced range for every dimension. For example for a 4-dimensional
-// tensor `foo` the slice `foo[2, ..., 5:8]` implies `foo[2, :, :, 5:8]`.
-// If not specified, defaults to 0
-func StridedSliceEllipsisMask(value int64) StridedSliceAttr {
+// value: The cropped area of the image must contain a fraction of the
+// supplied image within in this range.
+// If not specified, defaults to <f:0.05 f:1 >
+func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr {
 	return func(m optionalAttr) {
-		m["ellipsis_mask"] = value
+		m["area_range"] = value
 	}
 }
 
-// StridedSliceNewAxisMask sets the optional new_axis_mask attribute to value.
+// SampleDistortedBoundingBoxV2MaxAttempts sets the optional max_attempts attribute to value.
 //
-// value: a bitmask where bit `i` being 1 means the `i`th
-// specification creates a new shape 1 dimension. For example
-// `foo[:4, tf.newaxis, :2]` would produce a shape `(4, 1, 2)` tensor.
-// If not specified, defaults to 0
-func StridedSliceNewAxisMask(value int64) StridedSliceAttr {
+// value: Number of attempts at generating a cropped region of the image
+// of the specified constraints. After `max_attempts` failures, return the entire
+// image.
+// If not specified, defaults to 100
+func SampleDistortedBoundingBoxV2MaxAttempts(value int64) SampleDistortedBoundingBoxV2Attr {
 	return func(m optionalAttr) {
-		m["new_axis_mask"] = value
+		m["max_attempts"] = value
 	}
 }
 
-// StridedSliceShrinkAxisMask sets the optional shrink_axis_mask attribute to value.
+// SampleDistortedBoundingBoxV2UseImageIfNoBoundingBoxes sets the optional use_image_if_no_bounding_boxes attribute to value.
 //
-// value: a bitmask where bit `i` implies that the `i`th
-// specification should shrink the dimensionality. begin and end
-// must imply a slice of size 1 in the dimension. For example in
-// python one might do `foo[:, 3, :]` which would result in
-// `shrink_axis_mask` being 2.
-// If not specified, defaults to 0
-func StridedSliceShrinkAxisMask(value int64) StridedSliceAttr {
+// value: Controls behavior if no bounding boxes supplied.
+// If true, assume an implicit bounding box covering the whole input. If false,
+// raise an error.
+// If not specified, defaults to false
+func SampleDistortedBoundingBoxV2UseImageIfNoBoundingBoxes(value bool) SampleDistortedBoundingBoxV2Attr {
 	return func(m optionalAttr) {
-		m["shrink_axis_mask"] = value
+		m["use_image_if_no_bounding_boxes"] = value
 	}
 }
 
-// Return a strided slice from `input`.
+// Generate a single randomly distorted bounding box for an image.
 //
-// Note, most python users will want to use the Python `Tensor.__getitem__`
-// or `Variable.__getitem__` rather than this op directly.
+// Bounding box annotations are often supplied in addition to ground-truth labels
+// in image recognition or object localization tasks. A common technique for
+// training such a system is to randomly distort an image while preserving
+// its content, i.e. *data augmentation*. This Op outputs a randomly distorted
+// localization of an object, i.e. bounding box, given an `image_size`,
+// `bounding_boxes` and a series of constraints.
 //
-// The goal of this op is to produce a new tensor with a subset of
-// the elements from the `n` dimensional `input` tensor. The subset is chosen using
-// a sequence of `m` sparse range specifications encoded into the arguments
-// of this function. Note, in some cases
-// `m` could be equal to `n`, but this need not be the case. Each
-// range specification entry can be one of the following:
+// The output of this Op is a single bounding box that may be used to crop the
+// original image. The output is returned as 3 tensors: `begin`, `size` and
+// `bboxes`. The first 2 tensors can be fed directly into `tf.slice` to crop the
+// image. The latter may be supplied to `tf.image.draw_bounding_boxes` to visualize
+// what the bounding box looks like.
 //
-// - An ellipsis (...). Ellipses are used to imply zero or more
-//   dimensions of full-dimension selection and are produced using
-//   `ellipsis_mask`. For example, `foo[...]` is the identity slice.
+// Bounding boxes are supplied and returned as `[y_min, x_min, y_max, x_max]`. The
+// bounding box coordinates are floats in `[0.0, 1.0]` relative to the width and
+// height of the underlying image.
 //
-// - A new axis. This is used to insert a new shape=1 dimension and is
-//   produced using `new_axis_mask`. For example, `foo[:, ...]` where
-//   `foo` is shape `(3, 4)` produces a `(1, 3, 4)` tensor.
+// For example,
 //
+// ```python
+//     # Generate a single distorted bounding box.
+//     begin, size, bbox_for_draw = tf.image.sample_distorted_bounding_box(
+//         tf.shape(image),
+//         bounding_boxes=bounding_boxes)
 //
-// - A range `begin:end:stride`. This is used to specify how much to choose from
-//   a given dimension. `stride` can be any integer but 0.  `begin` is an integer
-//   which represents the index of the first value to select while `end` represents
-//   the index of the last value to select. The number of values selected in each
-//   dimension is `end - begin` if `stride > 0` and `begin - end` if `stride < 0`.
-//   `begin` and `end` can be negative where `-1` is the last element, `-2` is
-//   the second to last. `begin_mask` controls whether to replace the explicitly
-//   given `begin` with an implicit effective value of `0` if `stride > 0` and
-//   `-1` if `stride < 0`. `end_mask` is analogous but produces the number
-//   required to create the largest open interval. For example, given a shape
-//   `(3,)` tensor `foo[:]`, the effective `begin` and `end` are `0` and `3`. Do
-//   not assume this is equivalent to `foo[0:-1]` which has an effective `begin`
-//   and `end` of `0` and `2`. Another example is `foo[-2::-1]` which reverses the
-//   first dimension of a tensor while dropping the last two (in the original
-//   order elements). For example `foo = [1,2,3,4]; foo[-2::-1]` is `[4,3]`.
-//
-// - A single index. This is used to keep only elements that have a given
-//   index. For example (`foo[2, :]` on a shape `(5,6)` tensor produces a
-//   shape `(6,)` tensor. This is encoded in `begin` and `end` and
-//   `shrink_axis_mask`.
-//
-// Each conceptual range specification is encoded in the op's argument. This
-// encoding is best understand by considering a non-trivial example. In
-// particular,
-// `foo[1, 2:4, None, ..., :-3:-1, :]` will be encoded as
+//     # Draw the bounding box in an image summary.
+//     image_with_box = tf.image.draw_bounding_boxes(tf.expand_dims(image, 0),
+//                                                   bbox_for_draw)
+//     tf.summary.image('images_with_box', image_with_box)
 //
+//     # Employ the bounding box to distort the image.
+//     distorted_image = tf.slice(image, begin, size)
 // ```
-// begin = [1, 2, x, x, 0, x] # x denotes don't care (usually 0)
-// end = [2, 4, x, x, -3, x]
-// strides = [1, 1, x, x, -1, 1]
-// begin_mask = 1<<4 | 1 << 5 = 48
-// end_mask = 1<<5 = 32
-// ellipsis_mask = 1<<3 = 8
-// new_axis_mask = 1<<2 4
-// shrink_axis_mask = 1<<0
-// ```
-//
-// In this case if `foo.shape` is (5, 5, 5, 5, 5, 5) the final shape of
-// the slice becomes (2, 1, 5, 5, 2, 5).
-// Let us walk step by step through each argument specification.
-//
-// 1.  The first argument in the example slice is turned into `begin = 1` and
-// `end = begin + 1 = 2`. To disambiguate from the original spec `2:4` we
-// also set the appropriate bit in `shrink_axis_mask`.
-//
-// 2. `2:4` is contributes 2, 4, 1 to begin, end, and stride. All masks have
-// zero bits contributed.
-//
-// 3. None is a synonym for `tf.newaxis`. This means insert a dimension of size 1
-// dimension in the final shape. Dummy values are contributed to begin,
-// end and stride, while the new_axis_mask bit is set.
-//
-// 4. `...` grab the full ranges from as many dimensions as needed to
-// fully specify a slice for every dimension of the input shape.
-//
-// 5. `:-3:-1` shows the use of negative indices. A negative index `i` associated
-// with a dimension that has shape `s` is converted to a positive index
-// `s + i`. So `-1` becomes `s-1` (i.e. the last element). This conversion
-// is done internally so begin, end and strides receive x, -3, and -1.
-// The appropriate begin_mask bit is set to indicate the start range is the
-// full range (ignoring the x).
 //
-// 6. `:` indicates that the entire contents of the corresponding dimension
-// is selected. This is equivalent to `::` or `0::1`. begin, end, and strides
-// receive 0, 0, and 1, respectively. The appropriate bits in `begin_mask` and
-// `end_mask` are also set.
-//
-// *Requirements*:
-//   `0 != strides[i] for i in [0, m)`
-//   `ellipsis_mask must be a power of two (only one ellipsis)`
+// Note that if no bounding box information is available, setting
+// `use_image_if_no_bounding_boxes = true` will assume there is a single implicit
+// bounding box covering the whole image. If `use_image_if_no_bounding_boxes` is
+// false and no bounding boxes are supplied, an error is raised.
 //
 // Arguments:
+//	image_size: 1-D, containing `[height, width, channels]`.
+//	bounding_boxes: 3-D with shape `[batch, N, 4]` describing the N bounding boxes
+// associated with the image.
+//	min_object_covered: The cropped area of the image must contain at least this
+// fraction of any bounding box supplied. The value of this parameter should be
+// non-negative. In the case of 0, the cropped area does not need to overlap
+// any of the bounding boxes supplied.
 //
-//	begin: `begin[k]` specifies the offset into the `k`th range specification.
-// The exact dimension this corresponds to will be determined by context.
-// Out-of-bounds values will be silently clamped. If the `k`th bit of
-// `begin_mask` then `begin[k]` is ignored and the full range of the
-// appropriate dimension is used instead. Negative values causes indexing
-// to start from the highest element e.g. If `foo==[1,2,3]` then `foo[-1]==3`.
-//	end: `end[i]` is like `begin` with the exception that `end_mask` is
-// used to determine full ranges.
-//	strides: `strides[i]` specifies the increment in the `i`th specification
-// after extracting a given element. Negative indices will reverse
-// the original order. Out or range values are
-// clamped to `[0,dim[i]) if slice[i]>0` or `[-1,dim[i]-1] if slice[i] < 0`
-func StridedSlice(scope *Scope, input tf.Output, begin tf.Output, end tf.Output, strides tf.Output, optional ...StridedSliceAttr) (output tf.Output) {
+// Returns 1-D, containing `[offset_height, offset_width, 0]`. Provide as input to
+// `tf.slice`.1-D, containing `[target_height, target_width, -1]`. Provide as input to
+// `tf.slice`.3-D with shape `[1, 1, 4]` containing the distorted bounding box.
+// Provide as input to `tf.image.draw_bounding_boxes`.
+func SampleDistortedBoundingBoxV2(scope *Scope, image_size tf.Output, bounding_boxes tf.Output, min_object_covered tf.Output, optional ...SampleDistortedBoundingBoxV2Attr) (begin tf.Output, size tf.Output, bboxes tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -23074,197 +21848,221 @@ func StridedSlice(scope *Scope, input tf.Output, begin tf.Output, end tf.Output,
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "StridedSlice",
+		Type: "SampleDistortedBoundingBoxV2",
 		Input: []tf.Input{
-			input, begin, end, strides,
+			image_size, bounding_boxes, min_object_covered,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// PriorityQueueV2Attr is an optional argument to PriorityQueueV2.
-type PriorityQueueV2Attr func(optionalAttr)
+// ExtractGlimpseAttr is an optional argument to ExtractGlimpse.
+type ExtractGlimpseAttr func(optionalAttr)
 
-// PriorityQueueV2ComponentTypes sets the optional component_types attribute to value.
-//
-// value: The type of each component in a value.
-// If not specified, defaults to <>
+// ExtractGlimpseCentered sets the optional centered attribute to value.
 //
-// REQUIRES: len(value) >= 0
-func PriorityQueueV2ComponentTypes(value []tf.DataType) PriorityQueueV2Attr {
+// value: indicates if the offset coordinates are centered relative to
+// the image, in which case the (0, 0) offset is relative to the center
+// of the input images. If false, the (0,0) offset corresponds to the
+// upper left corner of the input images.
+// If not specified, defaults to true
+func ExtractGlimpseCentered(value bool) ExtractGlimpseAttr {
 	return func(m optionalAttr) {
-		m["component_types"] = value
+		m["centered"] = value
 	}
 }
 
-// PriorityQueueV2Capacity sets the optional capacity attribute to value.
+// ExtractGlimpseNormalized sets the optional normalized attribute to value.
 //
-// value: The upper bound on the number of elements in this queue.
-// Negative numbers mean no limit.
-// If not specified, defaults to -1
-func PriorityQueueV2Capacity(value int64) PriorityQueueV2Attr {
+// value: indicates if the offset coordinates are normalized.
+// If not specified, defaults to true
+func ExtractGlimpseNormalized(value bool) ExtractGlimpseAttr {
 	return func(m optionalAttr) {
-		m["capacity"] = value
+		m["normalized"] = value
 	}
 }
 
-// PriorityQueueV2Container sets the optional container attribute to value.
+// ExtractGlimpseUniformNoise sets the optional uniform_noise attribute to value.
 //
-// value: If non-empty, this queue is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func PriorityQueueV2Container(value string) PriorityQueueV2Attr {
+// value: indicates if the noise should be generated using a
+// uniform distribution or a Gaussian distribution.
+// If not specified, defaults to true
+func ExtractGlimpseUniformNoise(value bool) ExtractGlimpseAttr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["uniform_noise"] = value
 	}
 }
 
-// PriorityQueueV2SharedName sets the optional shared_name attribute to value.
+// Extracts a glimpse from the input tensor.
 //
-// value: If non-empty, this queue will be shared under the given name
-// across multiple sessions.
-// If not specified, defaults to ""
-func PriorityQueueV2SharedName(value string) PriorityQueueV2Attr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// A queue that produces elements sorted by the first component value.
+// Returns a set of windows called glimpses extracted at location
+// `offsets` from the input tensor. If the windows only partially
+// overlaps the inputs, the non overlapping areas will be filled with
+// random noise.
 //
-// Note that the PriorityQueue requires the first component of any element
-// to be a scalar int64, in addition to the other elements declared by
-// component_types.  Therefore calls to Enqueue and EnqueueMany (resp. Dequeue
-// and DequeueMany) on a PriorityQueue will all require (resp. output) one extra
-// entry in their input (resp. output) lists.
+// The result is a 4-D tensor of shape `[batch_size, glimpse_height,
+// glimpse_width, channels]`. The channels and batch dimensions are the
+// same as that of the input tensor. The height and width of the output
+// windows are specified in the `size` parameter.
+//
+// The argument `normalized` and `centered` controls how the windows are built:
+//
+// * If the coordinates are normalized but not centered, 0.0 and 1.0
+//   correspond to the minimum and maximum of each height and width
+//   dimension.
+// * If the coordinates are both normalized and centered, they range from
+//   -1.0 to 1.0. The coordinates (-1.0, -1.0) correspond to the upper
+//   left corner, the lower right corner is located at (1.0, 1.0) and the
+//   center is at (0, 0).
+// * If the coordinates are not normalized they are interpreted as
+//   numbers of pixels.
 //
 // Arguments:
-//	shapes: The shape of each component in a value. The length of this attr must
-// be either 0 or the same as the length of component_types. If the length of
-// this attr is 0, the shapes of queue elements are not constrained, and
-// only one element may be dequeued at a time.
+//	input: A 4-D float tensor of shape `[batch_size, height, width, channels]`.
+//	size: A 1-D tensor of 2 elements containing the size of the glimpses
+// to extract.  The glimpse height must be specified first, following
+// by the glimpse width.
+//	offsets: A 2-D integer tensor of shape `[batch_size, 2]` containing
+// the y, x locations of the center of each window.
 //
-// Returns The handle to the queue.
-func PriorityQueueV2(scope *Scope, shapes []tf.Shape, optional ...PriorityQueueV2Attr) (handle tf.Output) {
+// Returns A tensor representing the glimpses `[batch_size,
+// glimpse_height, glimpse_width, channels]`.
+func ExtractGlimpse(scope *Scope, input tf.Output, size tf.Output, offsets tf.Output, optional ...ExtractGlimpseAttr) (glimpse tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"shapes": shapes}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "PriorityQueueV2",
-
+		Type: "ExtractGlimpse",
+		Input: []tf.Input{
+			input, size, offsets,
+		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// UnstageAttr is an optional argument to Unstage.
-type UnstageAttr func(optionalAttr)
-
-// UnstageCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
+// A container for an iterator resource.
 //
-// REQUIRES: value >= 0
-func UnstageCapacity(value int64) UnstageAttr {
-	return func(m optionalAttr) {
-		m["capacity"] = value
+// Returns A handle to the iterator that can be passed to a "MakeIterator"
+// or "IteratorGetNext" op.
+func Iterator(scope *Scope, shared_name string, container string, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
+	attrs := map[string]interface{}{"shared_name": shared_name, "container": container, "output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "Iterator",
 
-// UnstageMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func UnstageMemoryLimit(value int64) UnstageAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// UnstageContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func UnstageContainer(value string) UnstageAttr {
+// CropAndResizeGradImageAttr is an optional argument to CropAndResizeGradImage.
+type CropAndResizeGradImageAttr func(optionalAttr)
+
+// CropAndResizeGradImageMethod sets the optional method attribute to value.
+//
+// value: A string specifying the interpolation method. Only 'bilinear' is
+// supported for now.
+// If not specified, defaults to "bilinear"
+func CropAndResizeGradImageMethod(value string) CropAndResizeGradImageAttr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["method"] = value
 	}
 }
 
-// UnstageSharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func UnstageSharedName(value string) UnstageAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// Op is similar to a lightweight Dequeue.
+// Computes the gradient of the crop_and_resize op wrt the input image tensor.
 //
-// The basic functionality is similar to dequeue with many fewer
-// capabilities and options.  This Op is optimized for performance.
-func Unstage(scope *Scope, dtypes []tf.DataType, optional ...UnstageAttr) (values []tf.Output) {
+// Arguments:
+//	grads: A 4-D tensor of shape `[num_boxes, crop_height, crop_width, depth]`.
+//	boxes: A 2-D tensor of shape `[num_boxes, 4]`. The `i`-th row of the tensor
+// specifies the coordinates of a box in the `box_ind[i]` image and is specified
+// in normalized coordinates `[y1, x1, y2, x2]`. A normalized coordinate value of
+// `y` is mapped to the image coordinate at `y * (image_height - 1)`, so as the
+// `[0, 1]` interval of normalized image height is mapped to
+// `[0, image_height - 1] in image height coordinates. We do allow y1 > y2, in
+// which case the sampled crop is an up-down flipped version of the original
+// image. The width dimension is treated similarly. Normalized coordinates
+// outside the `[0, 1]` range are allowed, in which case we use
+// `extrapolation_value` to extrapolate the input image values.
+//	box_ind: A 1-D tensor of shape `[num_boxes]` with int32 values in `[0, batch)`.
+// The value of `box_ind[i]` specifies the image that the `i`-th box refers to.
+//	image_size: A 1-D tensor with value `[batch, image_height, image_width, depth]`
+// containing the original image size. Both `image_height` and `image_width` need
+// to be positive.
+//
+//
+// Returns A 4-D tensor of shape `[batch, image_height, image_width, depth]`.
+func CropAndResizeGradImage(scope *Scope, grads tf.Output, boxes tf.Output, box_ind tf.Output, image_size tf.Output, T tf.DataType, optional ...CropAndResizeGradImageAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
+	attrs := map[string]interface{}{"T": T}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Unstage",
-
+		Type: "CropAndResizeGradImage",
+		Input: []tf.Input{
+			grads, boxes, box_ind, image_size,
+		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
-		scope.UpdateErr("Unstage", err)
-		return
-	}
-	return values
+	return op.Output(0)
 }
 
-// ArgMaxAttr is an optional argument to ArgMax.
-type ArgMaxAttr func(optionalAttr)
+// ShuffleDatasetAttr is an optional argument to ShuffleDataset.
+type ShuffleDatasetAttr func(optionalAttr)
 
-// ArgMaxOutputType sets the optional output_type attribute to value.
-// If not specified, defaults to DT_INT64
-func ArgMaxOutputType(value tf.DataType) ArgMaxAttr {
+// ShuffleDatasetReshuffleEachIteration sets the optional reshuffle_each_iteration attribute to value.
+//
+// value: If true, each iterator over this dataset will be given
+// a different pseudorandomly generated seed, based on a sequence seeded by the
+// `seed` and `seed2` inputs. If false, each iterator will be given the same
+// seed, and repeated iteration over this dataset will yield the exact same
+// sequence of results.
+// If not specified, defaults to true
+func ShuffleDatasetReshuffleEachIteration(value bool) ShuffleDatasetAttr {
 	return func(m optionalAttr) {
-		m["output_type"] = value
+		m["reshuffle_each_iteration"] = value
 	}
 }
 
-// Returns the index with the largest value across dimensions of a tensor.
-//
-// Note that in case of ties the identity of the return value is not guaranteed.
+// Creates a dataset that shuffles elements from `input_dataset` pseudorandomly.
 //
 // Arguments:
 //
-//	dimension: int32 or int64, must be in the range `[-rank(input), rank(input))`.
-// Describes which dimension of the input Tensor to reduce across. For vectors,
-// use dimension = 0.
-func ArgMax(scope *Scope, input tf.Output, dimension tf.Output, optional ...ArgMaxAttr) (output tf.Output) {
+//	buffer_size: The number of output elements to buffer in an iterator over
+// this dataset. Compare with the `min_after_dequeue` attr when creating a
+// `RandomShuffleQueue`.
+//	seed: A scalar seed for the random number generator. If either `seed` or
+// `seed2` is set to be non-zero, the random number generator is seeded
+// by the given seed.  Otherwise, a random seed is used.
+//	seed2: A second scalar seed to avoid seed collision.
+//
+//
+func ShuffleDataset(scope *Scope, input_dataset tf.Output, buffer_size tf.Output, seed tf.Output, seed2 tf.Output, output_types []tf.DataType, output_shapes []tf.Shape, optional ...ShuffleDatasetAttr) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ArgMax",
+		Type: "ShuffleDataset",
 		Input: []tf.Input{
-			input, dimension,
+			input_dataset, buffer_size, seed, seed2,
 		},
 		Attrs: attrs,
 	}
@@ -23272,60 +22070,69 @@ func ArgMax(scope *Scope, input tf.Output, dimension tf.Output, optional ...ArgM
 	return op.Output(0)
 }
 
-// ResourceStridedSliceAssignAttr is an optional argument to ResourceStridedSliceAssign.
-type ResourceStridedSliceAssignAttr func(optionalAttr)
-
-// ResourceStridedSliceAssignBeginMask sets the optional begin_mask attribute to value.
-// If not specified, defaults to 0
-func ResourceStridedSliceAssignBeginMask(value int64) ResourceStridedSliceAssignAttr {
-	return func(m optionalAttr) {
-		m["begin_mask"] = value
-	}
-}
-
-// ResourceStridedSliceAssignEndMask sets the optional end_mask attribute to value.
-// If not specified, defaults to 0
-func ResourceStridedSliceAssignEndMask(value int64) ResourceStridedSliceAssignAttr {
-	return func(m optionalAttr) {
-		m["end_mask"] = value
+// 3D fast Fourier transform.
+//
+// Computes the 3-dimensional discrete Fourier transform over the inner-most 3
+// dimensions of `input`.
+//
+// Arguments:
+//	input: A complex64 tensor.
+//
+// Returns A complex64 tensor of the same shape as `input`. The inner-most 3
+//   dimensions of `input` are replaced with their 3D Fourier transform.
+//
+// @compatibility(numpy)
+// Equivalent to np.fft.fftn with 3 dimensions.
+// @end_compatibility
+func FFT3D(scope *Scope, input tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// ResourceStridedSliceAssignEllipsisMask sets the optional ellipsis_mask attribute to value.
-// If not specified, defaults to 0
-func ResourceStridedSliceAssignEllipsisMask(value int64) ResourceStridedSliceAssignAttr {
-	return func(m optionalAttr) {
-		m["ellipsis_mask"] = value
+	opspec := tf.OpSpec{
+		Type: "FFT3D",
+		Input: []tf.Input{
+			input,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// ResourceStridedSliceAssignNewAxisMask sets the optional new_axis_mask attribute to value.
-// If not specified, defaults to 0
-func ResourceStridedSliceAssignNewAxisMask(value int64) ResourceStridedSliceAssignAttr {
-	return func(m optionalAttr) {
-		m["new_axis_mask"] = value
-	}
-}
+// CropAndResizeGradBoxesAttr is an optional argument to CropAndResizeGradBoxes.
+type CropAndResizeGradBoxesAttr func(optionalAttr)
 
-// ResourceStridedSliceAssignShrinkAxisMask sets the optional shrink_axis_mask attribute to value.
-// If not specified, defaults to 0
-func ResourceStridedSliceAssignShrinkAxisMask(value int64) ResourceStridedSliceAssignAttr {
+// CropAndResizeGradBoxesMethod sets the optional method attribute to value.
+//
+// value: A string specifying the interpolation method. Only 'bilinear' is
+// supported for now.
+// If not specified, defaults to "bilinear"
+func CropAndResizeGradBoxesMethod(value string) CropAndResizeGradBoxesAttr {
 	return func(m optionalAttr) {
-		m["shrink_axis_mask"] = value
+		m["method"] = value
 	}
 }
 
-// Assign `value` to the sliced l-value reference of `ref`.
-//
-// The values of `value` are assigned to the positions in the variable
-// `ref` that are selected by the slice parameters. The slice parameters
-// `begin, `end`, `strides`, etc. work exactly as in `StridedSlice`.
+// Computes the gradient of the crop_and_resize op wrt the input boxes tensor.
 //
-// NOTE this op currently does not support broadcasting and so `value`'s
-// shape must be exactly the shape produced by the slice of `ref`.
+// Arguments:
+//	grads: A 4-D tensor of shape `[num_boxes, crop_height, crop_width, depth]`.
+//	image: A 4-D tensor of shape `[batch, image_height, image_width, depth]`.
+// Both `image_height` and `image_width` need to be positive.
+//	boxes: A 2-D tensor of shape `[num_boxes, 4]`. The `i`-th row of the tensor
+// specifies the coordinates of a box in the `box_ind[i]` image and is specified
+// in normalized coordinates `[y1, x1, y2, x2]`. A normalized coordinate value of
+// `y` is mapped to the image coordinate at `y * (image_height - 1)`, so as the
+// `[0, 1]` interval of normalized image height is mapped to
+// `[0, image_height - 1] in image height coordinates. We do allow y1 > y2, in
+// which case the sampled crop is an up-down flipped version of the original
+// image. The width dimension is treated similarly. Normalized coordinates
+// outside the `[0, 1]` range are allowed, in which case we use
+// `extrapolation_value` to extrapolate the input image values.
+//	box_ind: A 1-D tensor of shape `[num_boxes]` with int32 values in `[0, batch)`.
+// The value of `box_ind[i]` specifies the image that the `i`-th box refers to.
 //
-// Returns the created operation.
-func ResourceStridedSliceAssign(scope *Scope, ref tf.Output, begin tf.Output, end tf.Output, strides tf.Output, value tf.Output, optional ...ResourceStridedSliceAssignAttr) (o *tf.Operation) {
+// Returns A 2-D tensor of shape `[num_boxes, 4]`.
+func CropAndResizeGradBoxes(scope *Scope, grads tf.Output, image tf.Output, boxes tf.Output, box_ind tf.Output, optional ...CropAndResizeGradBoxesAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -23334,398 +22141,356 @@ func ResourceStridedSliceAssign(scope *Scope, ref tf.Output, begin tf.Output, en
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceStridedSliceAssign",
+		Type: "CropAndResizeGradBoxes",
 		Input: []tf.Input{
-			ref, begin, end, strides, value,
+			grads, image, boxes, box_ind,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
-}
-
-// QueueEnqueueV2Attr is an optional argument to QueueEnqueueV2.
-type QueueEnqueueV2Attr func(optionalAttr)
-
-// QueueEnqueueV2TimeoutMs sets the optional timeout_ms attribute to value.
-//
-// value: If the queue is full, this operation will block for up to
-// timeout_ms milliseconds.
-// Note: This option is not supported yet.
-// If not specified, defaults to -1
-func QueueEnqueueV2TimeoutMs(value int64) QueueEnqueueV2Attr {
-	return func(m optionalAttr) {
-		m["timeout_ms"] = value
-	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Enqueues a tuple of one or more tensors in the given queue.
-//
-// The components input has k elements, which correspond to the components of
-// tuples stored in the given queue.
+// Saves tensors in V2 checkpoint format.
 //
-// N.B. If the queue is full, this operation will block until the given
-// element has been enqueued (or 'timeout_ms' elapses, if specified).
+// By default, saves the named tensors in full.  If the caller wishes to save
+// specific slices of full tensors, "shape_and_slices" should be non-empty strings
+// and correspondingly well-formed.
 //
 // Arguments:
-//	handle: The handle to a queue.
-//	components: One or more tensors from which the enqueued tensors should be taken.
+//	prefix: Must have a single element. The prefix of the V2 checkpoint to which we
+// write the tensors.
+//	tensor_names: shape {N}. The names of the tensors to be saved.
+//	shape_and_slices: shape {N}.  The slice specs of the tensors to be saved.
+// Empty strings indicate that they are non-partitioned tensors.
+//	tensors: `N` tensors to save.
 //
 // Returns the created operation.
-func QueueEnqueueV2(scope *Scope, handle tf.Output, components []tf.Output, optional ...QueueEnqueueV2Attr) (o *tf.Operation) {
+func SaveV2(scope *Scope, prefix tf.Output, tensor_names tf.Output, shape_and_slices tf.Output, tensors []tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "QueueEnqueueV2",
+		Type: "SaveV2",
 		Input: []tf.Input{
-			handle, tf.OutputList(components),
+			prefix, tensor_names, shape_and_slices, tf.OutputList(tensors),
 		},
-		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// QueueDequeueManyV2Attr is an optional argument to QueueDequeueManyV2.
-type QueueDequeueManyV2Attr func(optionalAttr)
+// StatsAggregatorHandleAttr is an optional argument to StatsAggregatorHandle.
+type StatsAggregatorHandleAttr func(optionalAttr)
 
-// QueueDequeueManyV2TimeoutMs sets the optional timeout_ms attribute to value.
-//
-// value: If the queue has fewer than n elements, this operation
-// will block for up to timeout_ms milliseconds.
-// Note: This option is not supported yet.
-// If not specified, defaults to -1
-func QueueDequeueManyV2TimeoutMs(value int64) QueueDequeueManyV2Attr {
+// StatsAggregatorHandleContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func StatsAggregatorHandleContainer(value string) StatsAggregatorHandleAttr {
 	return func(m optionalAttr) {
-		m["timeout_ms"] = value
+		m["container"] = value
 	}
 }
 
-// Dequeues `n` tuples of one or more tensors from the given queue.
-//
-// If the queue is closed and there are fewer than `n` elements, then an
-// OutOfRange error is returned.
-//
-// This operation concatenates queue-element component tensors along the
-// 0th dimension to make a single component tensor.  All of the components
-// in the dequeued tuple will have size `n` in the 0th dimension.
-//
-// This operation has `k` outputs, where `k` is the number of components in
-// the tuples stored in the given queue, and output `i` is the ith
-// component of the dequeued tuple.
-//
-// N.B. If the queue is empty, this operation will block until `n` elements
-// have been dequeued (or 'timeout_ms' elapses, if specified).
-//
-// Arguments:
-//	handle: The handle to a queue.
-//	n: The number of tuples to dequeue.
-//	component_types: The type of each component in a tuple.
-//
-// Returns One or more tensors that were dequeued as a tuple.
-func QueueDequeueManyV2(scope *Scope, handle tf.Output, n tf.Output, component_types []tf.DataType, optional ...QueueDequeueManyV2Attr) (components []tf.Output) {
+// StatsAggregatorHandleSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func StatsAggregatorHandleSharedName(value string) StatsAggregatorHandleAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Creates a statistics manager resource.
+func StatsAggregatorHandle(scope *Scope, optional ...StatsAggregatorHandleAttr) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"component_types": component_types}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "QueueDequeueManyV2",
-		Input: []tf.Input{
-			handle, n,
-		},
+		Type: "StatsAggregatorHandle",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
-		scope.UpdateErr("QueueDequeueManyV2", err)
-		return
-	}
-	return components
+	return op.Output(0)
 }
 
-// EncodeBase64Attr is an optional argument to EncodeBase64.
-type EncodeBase64Attr func(optionalAttr)
-
-// EncodeBase64Pad sets the optional pad attribute to value.
+// Greedily selects a subset of bounding boxes in descending order of score,
 //
-// value: Bool whether padding is applied at the ends.
-// If not specified, defaults to false
-func EncodeBase64Pad(value bool) EncodeBase64Attr {
-	return func(m optionalAttr) {
-		m["pad"] = value
-	}
-}
-
-// Encode strings into web-safe base64 format.
+// pruning away boxes that have high intersection-over-union (IOU) overlap
+// with previously selected boxes.  Bounding boxes are supplied as
+// [y1, x1, y2, x2], where (y1, x1) and (y2, x2) are the coordinates of any
+// diagonal pair of box corners and the coordinates can be provided as normalized
+// (i.e., lying in the interval [0, 1]) or absolute.  Note that this algorithm
+// is agnostic to where the origin is in the coordinate system.  Note that this
+// algorithm is invariant to orthogonal transformations and translations
+// of the coordinate system; thus translating or reflections of the coordinate
+// system result in the same boxes being selected by the algorithm.
 //
-// Refer to the following article for more information on base64 format:
-// en.wikipedia.org/wiki/Base64. Base64 strings may have padding with '=' at the
-// end so that the encoded has length multiple of 4. See Padding section of the
-// link above.
+// The output of this operation is a set of integers indexing into the input
+// collection of bounding boxes representing the selected boxes.  The bounding
+// box coordinates corresponding to the selected indices can then be obtained
+// using the `tf.gather operation`.  For example:
 //
-// Web-safe means that the encoder uses - and _ instead of + and /.
+//   selected_indices = tf.image.non_max_suppression_v2(
+//       boxes, scores, max_output_size, iou_threshold)
+//   selected_boxes = tf.gather(boxes, selected_indices)
 //
 // Arguments:
-//	input: Strings to be encoded.
+//	boxes: A 2-D float tensor of shape `[num_boxes, 4]`.
+//	scores: A 1-D float tensor of shape `[num_boxes]` representing a single
+// score corresponding to each box (each row of boxes).
+//	max_output_size: A scalar integer tensor representing the maximum number of
+// boxes to be selected by non max suppression.
+//	iou_threshold: A 0-D float tensor representing the threshold for deciding whether
+// boxes overlap too much with respect to IOU.
 //
-// Returns Input strings encoded in base64.
-func EncodeBase64(scope *Scope, input tf.Output, optional ...EncodeBase64Attr) (output tf.Output) {
+// Returns A 1-D integer tensor of shape `[M]` representing the selected
+// indices from the boxes tensor, where `M <= max_output_size`.
+func NonMaxSuppressionV2(scope *Scope, boxes tf.Output, scores tf.Output, max_output_size tf.Output, iou_threshold tf.Output) (selected_indices tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "EncodeBase64",
+		Type: "NonMaxSuppressionV2",
 		Input: []tf.Input{
-			input,
+			boxes, scores, max_output_size, iou_threshold,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Deprecated. Use TensorArrayCloseV3
+// Creates a TensorArray for storing the gradients of values in the given handle.
 //
-// DEPRECATED at GraphDef version 26: Use TensorArrayCloseV3
+// If the given TensorArray gradient already exists, returns a reference to it.
 //
-// Returns the created operation.
-func TensorArrayCloseV2(scope *Scope, handle tf.Output) (o *tf.Operation) {
+// Locks the size of the original TensorArray by disabling its dynamic size flag.
+//
+// **A note about the input flow_in:**
+//
+// The handle flow_in forces the execution of the gradient lookup to occur
+// only after certain other operations have occurred.  For example, when
+// the forward TensorArray is dynamically sized, writes to this TensorArray
+// may resize the object.  The gradient TensorArray is statically sized based
+// on the size of the forward TensorArray when this operation executes.
+// Furthermore, the size of the forward TensorArray is frozen by this call.
+// As a result, the flow is used to ensure that the call to generate the gradient
+// TensorArray only happens after all writes are executed.
+//
+// In the case of dynamically sized TensorArrays, gradient computation should
+// only be performed on read operations that have themselves been chained via
+// flow to occur only after all writes have executed. That way the final size
+// of the forward TensorArray is known when this operation is called.
+//
+// **A note about the source attribute:**
+//
+// TensorArray gradient calls use an accumulator TensorArray object.  If
+// multiple gradients are calculated and run in the same session, the multiple
+// gradient nodes may accidentally flow through the same accumulator TensorArray.
+// This double counts and generally breaks the TensorArray gradient flow.
+//
+// The solution is to identify which gradient call this particular
+// TensorArray gradient is being called in.  This is performed by identifying
+// a unique string (e.g. "gradients", "gradients_1", ...) from the input
+// gradient Tensor's name.  This string is used as a suffix when creating
+// the TensorArray gradient object here (the attribute `source`).
+//
+// The attribute `source` is added as a suffix to the forward TensorArray's
+// name when performing the creation / lookup, so that each separate gradient
+// calculation gets its own TensorArray accumulator.
+//
+// Arguments:
+//	handle: The handle to the forward TensorArray.
+//	flow_in: A float scalar that enforces proper chaining of operations.
+//	source: The gradient source string, used to decide which gradient TensorArray
+// to return.
+func TensorArrayGradV3(scope *Scope, handle tf.Output, flow_in tf.Output, source string) (grad_handle tf.Output, flow_out tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"source": source}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayCloseV2",
+		Type: "TensorArrayGradV3",
 		Input: []tf.Input{
-			handle,
+			handle, flow_in,
 		},
+		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
 }
 
-// Forwards the value of an available tensor from `inputs` to `output`.
-//
-// `Merge` waits for at least one of the tensors in `inputs` to become available.
-// It is usually combined with `Switch` to implement branching.
-//
-// `Merge` forwards the first tensor to become available to `output`, and sets
-// `value_index` to its index in `inputs`.
-//
-// Arguments:
-//	inputs: The input tensors, exactly one of which will become available.
-//
-// Returns Will be set to the available input tensor.The index of the chosen input tensor in `inputs`.
-func Merge(scope *Scope, inputs []tf.Output) (output tf.Output, value_index tf.Output) {
+// Creates a dataset that splits a SparseTensor into elements row-wise.
+func SparseTensorSliceDataset(scope *Scope, indices tf.Output, values tf.Output, dense_shape tf.Output) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Merge",
+		Type: "SparseTensorSliceDataset",
 		Input: []tf.Input{
-			tf.OutputList(inputs),
+			indices, values, dense_shape,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
-}
-
-// QueueCloseV2Attr is an optional argument to QueueCloseV2.
-type QueueCloseV2Attr func(optionalAttr)
-
-// QueueCloseV2CancelPendingEnqueues sets the optional cancel_pending_enqueues attribute to value.
-//
-// value: If true, all pending enqueue requests that are
-// blocked on the given queue will be canceled.
-// If not specified, defaults to false
-func QueueCloseV2CancelPendingEnqueues(value bool) QueueCloseV2Attr {
-	return func(m optionalAttr) {
-		m["cancel_pending_enqueues"] = value
-	}
+	return op.Output(0)
 }
 
-// Closes the given queue.
-//
-// This operation signals that no more elements will be enqueued in the
-// given queue. Subsequent Enqueue(Many) operations will fail.
-// Subsequent Dequeue(Many) operations will continue to succeed if
-// sufficient elements remain in the queue. Subsequent Dequeue(Many)
-// operations that would block will fail immediately.
+// Returns x / y element-wise for real types.
 //
-// Arguments:
-//	handle: The handle to a queue.
+// If `x` and `y` are reals, this will return the floating-point division.
 //
-// Returns the created operation.
-func QueueCloseV2(scope *Scope, handle tf.Output, optional ...QueueCloseV2Attr) (o *tf.Operation) {
+// *NOTE*: `Div` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func RealDiv(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "QueueCloseV2",
+		Type: "RealDiv",
 		Input: []tf.Input{
-			handle,
+			x, y,
 		},
-		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Computes inverse hyperbolic tangent of x element-wise.
-func Atanh(scope *Scope, x tf.Output) (y tf.Output) {
+// Creates a dataset that concatenates `input_dataset` with `another_dataset`.
+func ConcatenateDataset(scope *Scope, input_dataset tf.Output, another_dataset tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "Atanh",
+		Type: "ConcatenateDataset",
 		Input: []tf.Input{
-			x,
+			input_dataset, another_dataset,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns true if queue is closed.
+// Adds a value to the current value of a variable.
 //
-// This operation returns true if the queue is closed and false if the queue
-// is open.
+// Any ReadVariableOp which depends directly or indirectly on this assign is
+// guaranteed to see the incremented value or a subsequent newer one.
+//
+// Outputs the incremented value, which can be used to totally order the
+// increments to this variable.
 //
 // Arguments:
-//	handle: The handle to a queue.
-func QueueIsClosedV2(scope *Scope, handle tf.Output) (is_closed tf.Output) {
+//	resource: handle to the resource in which to store the variable.
+//	value: the value by which the variable will be incremented.
+//
+// Returns the created operation.
+func AssignAddVariableOp(scope *Scope, resource tf.Output, value tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "QueueIsClosedV2",
+		Type: "AssignAddVariableOp",
 		Input: []tf.Input{
-			handle,
+			resource, value,
 		},
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Returns the batched diagonal part of a batched tensor.
-//
-// This operation returns a tensor with the `diagonal` part
-// of the batched `input`. The `diagonal` part is computed as follows:
-//
-// Assume `input` has `k` dimensions `[I, J, K, ..., M, N]`, then the output is a
-// tensor of rank `k - 1` with dimensions `[I, J, K, ..., min(M, N)]` where:
-//
-// `diagonal[i, j, k, ..., n] = input[i, j, k, ..., n, n]`.
-//
-// The input must be at least a matrix.
-//
-// For example:
-//
-// ```
-// # 'input' is [[[1, 0, 0, 0]
-//                [0, 2, 0, 0]
-//                [0, 0, 3, 0]
-//                [0, 0, 0, 4]],
-//               [[5, 0, 0, 0]
-//                [0, 6, 0, 0]
-//                [0, 0, 7, 0]
-//                [0, 0, 0, 8]]]
-//
-// and input.shape = (2, 4, 4)
-//
-// tf.matrix_diag_part(input) ==> [[1, 2, 3, 4], [5, 6, 7, 8]]
-//
-// which has shape (2, 4)
-// ```
-//
-// Arguments:
-//	input: Rank `k` tensor where `k >= 2`.
-//
-// Returns The extracted diagonal(s) having shape
-// `diagonal.shape = input.shape[:-2] + [min(input.shape[-2:])]`.
-func MatrixDiagPart(scope *Scope, input tf.Output) (diagonal tf.Output) {
+// Records the latency of producing `input_dataset` elements in a StatsAggregator.
+func LatencyStatsDataset(scope *Scope, input_dataset tf.Output, tag tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "MatrixDiagPart",
+		Type: "LatencyStatsDataset",
 		Input: []tf.Input{
-			input,
+			input_dataset, tag,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes the absolute value of a tensor.
+// Convert JSON-encoded Example records to binary protocol buffer strings.
 //
-// Given a tensor `x`, this operation returns a tensor containing the absolute
-// value of each element in `x`. For example, if x is an input element and y is
-// an output element, this operation computes \\(y = |x|\\).
-func Abs(scope *Scope, x tf.Output) (y tf.Output) {
+// This op translates a tensor containing Example records, encoded using
+// the [standard JSON
+// mapping](https://developers.google.com/protocol-buffers/docs/proto3#json),
+// into a tensor containing the same records encoded as binary protocol
+// buffers. The resulting tensor can then be fed to any of the other
+// Example-parsing ops.
+//
+// Arguments:
+//	json_examples: Each string is a JSON object serialized according to the JSON
+// mapping of the Example proto.
+//
+// Returns Each string is a binary Example protocol buffer corresponding
+// to the respective element of `json_examples`.
+func DecodeJSONExample(scope *Scope, json_examples tf.Output) (binary_examples tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Abs",
+		Type: "DecodeJSONExample",
 		Input: []tf.Input{
-			x,
+			json_examples,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// StackV2Attr is an optional argument to StackV2.
-type StackV2Attr func(optionalAttr)
-
-// StackV2StackName sets the optional stack_name attribute to value.
+// Computes the grayscale dilation of 4-D `input` and 3-D `filter` tensors.
 //
-// value: Overrides the name used for the temporary stack resource. Default
-// value is the name of the 'Stack' op (which is guaranteed unique).
-// If not specified, defaults to ""
-func StackV2StackName(value string) StackV2Attr {
-	return func(m optionalAttr) {
-		m["stack_name"] = value
-	}
-}
-
-// A stack that produces elements in first-in last-out order.
+// The `input` tensor has shape `[batch, in_height, in_width, depth]` and the
+// `filter` tensor has shape `[filter_height, filter_width, depth]`, i.e., each
+// input channel is processed independently of the others with its own structuring
+// function. The `output` tensor has shape
+// `[batch, out_height, out_width, depth]`. The spatial dimensions of the output
+// tensor depend on the `padding` algorithm. We currently only support the default
+// "NHWC" `data_format`.
+//
+// In detail, the grayscale morphological 2-D dilation is the max-sum correlation
+// (for consistency with `conv2d`, we use unmirrored filters):
+//
+//     output[b, y, x, c] =
+//        max_{dy, dx} input[b,
+//                           strides[1] * y + rates[1] * dy,
+//                           strides[2] * x + rates[2] * dx,
+//                           c] +
+//                     filter[dy, dx, c]
+//
+// Max-pooling is a special case when the filter has size equal to the pooling
+// kernel size and contains all zeros.
+//
+// Note on duality: The dilation of `input` by the `filter` is equal to the
+// negation of the erosion of `-input` by the reflected `filter`.
 //
 // Arguments:
-//	max_size: The maximum size of the stack if non-negative. If negative, the stack
-// size is unlimited.
-//	elem_type: The type of the elements on the stack.
+//	input: 4-D with shape `[batch, in_height, in_width, depth]`.
+//	filter: 3-D with shape `[filter_height, filter_width, depth]`.
+//	strides: The stride of the sliding window for each dimension of the input
+// tensor. Must be: `[1, stride_height, stride_width, 1]`.
+//	rates: The input stride for atrous morphological dilation. Must be:
+// `[1, rate_height, rate_width, 1]`.
+//	padding: The type of padding algorithm to use.
 //
-// Returns The handle to the stack.
-func StackV2(scope *Scope, max_size tf.Output, elem_type tf.DataType, optional ...StackV2Attr) (handle tf.Output) {
+// Returns 4-D with shape `[batch, out_height, out_width, depth]`.
+func Dilation2D(scope *Scope, input tf.Output, filter tf.Output, strides []int64, rates []int64, padding string) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"elem_type": elem_type}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"strides": strides, "rates": rates, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "StackV2",
+		Type: "Dilation2D",
 		Input: []tf.Input{
-			max_size,
+			input, filter,
 		},
 		Attrs: attrs,
 	}
@@ -23733,115 +22498,80 @@ func StackV2(scope *Scope, max_size tf.Output, elem_type tf.DataType, optional .
 	return op.Output(0)
 }
 
-// OrderedMapStageAttr is an optional argument to OrderedMapStage.
-type OrderedMapStageAttr func(optionalAttr)
-
-// OrderedMapStageCapacity sets the optional capacity attribute to value.
+// Converts the given variant tensor to an iterator and stores it in the given resource.
 //
-// value: Maximum number of elements in the Staging Area. If > 0, inserts
-// on the container will block when the capacity is reached.
-// If not specified, defaults to 0
+// Arguments:
+//	resource_handle: A handle to an iterator resource.
+//	serialized: A variant tensor storing the state of the iterator contained in the
+// resource.
 //
-// REQUIRES: value >= 0
-func OrderedMapStageCapacity(value int64) OrderedMapStageAttr {
-	return func(m optionalAttr) {
-		m["capacity"] = value
+// Returns the created operation.
+func DeserializeIterator(scope *Scope, resource_handle tf.Output, serialized tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// OrderedMapStageMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func OrderedMapStageMemoryLimit(value int64) OrderedMapStageAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
+	opspec := tf.OpSpec{
+		Type: "DeserializeIterator",
+		Input: []tf.Input{
+			resource_handle, serialized,
+		},
 	}
+	return scope.AddOperation(opspec)
 }
 
-// OrderedMapStageContainer sets the optional container attribute to value.
-//
-// value: If non-empty, this queue is placed in the given container. Otherwise,
-// a default container is used.
-// If not specified, defaults to ""
-func OrderedMapStageContainer(value string) OrderedMapStageAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
+// TensorArrayConcatV2Attr is an optional argument to TensorArrayConcatV2.
+type TensorArrayConcatV2Attr func(optionalAttr)
 
-// OrderedMapStageSharedName sets the optional shared_name attribute to value.
-//
-// value: It is necessary to match this name to the matching Unstage Op.
-// If not specified, defaults to ""
-func OrderedMapStageSharedName(value string) OrderedMapStageAttr {
+// TensorArrayConcatV2ElementShapeExcept0 sets the optional element_shape_except0 attribute to value.
+// If not specified, defaults to <unknown_rank:true >
+func TensorArrayConcatV2ElementShapeExcept0(value tf.Shape) TensorArrayConcatV2Attr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["element_shape_except0"] = value
 	}
 }
 
-// Stage (key, values) in the underlying container which behaves like a ordered
-//
-// associative container.   Elements are ordered by key.
-//
-// Arguments:
-//	key: int64
-//
-//	values: a list of tensors
-// dtypes A list of data types that inserted values should adhere to.
-//
-//
-// Returns the created operation.
-func OrderedMapStage(scope *Scope, key tf.Output, indices tf.Output, values []tf.Output, dtypes []tf.DataType, optional ...OrderedMapStageAttr) (o *tf.Operation) {
+// Deprecated. Use TensorArrayConcatV3
+func TensorArrayConcatV2(scope *Scope, handle tf.Output, flow_in tf.Output, dtype tf.DataType, optional ...TensorArrayConcatV2Attr) (value tf.Output, lengths tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
+	attrs := map[string]interface{}{"dtype": dtype}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "OrderedMapStage",
+		Type: "TensorArrayConcatV2",
 		Input: []tf.Input{
-			key, indices, tf.OutputList(values),
+			handle, flow_in,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
-}
-
-// StackPushV2Attr is an optional argument to StackPushV2.
-type StackPushV2Attr func(optionalAttr)
-
-// StackPushV2SwapMemory sets the optional swap_memory attribute to value.
-//
-// value: Swap `elem` to CPU. Default to false.
-// If not specified, defaults to false
-func StackPushV2SwapMemory(value bool) StackPushV2Attr {
-	return func(m optionalAttr) {
-		m["swap_memory"] = value
-	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
 }
 
-// Push an element onto the stack.
+// Creates a dataset that batches and pads `batch_size` elements from the input.
 //
 // Arguments:
-//	handle: The handle to a stack.
-//	elem: The tensor to be pushed onto the stack.
 //
-// Returns The same tensor as the input 'elem'.
-func StackPushV2(scope *Scope, handle tf.Output, elem tf.Output, optional ...StackPushV2Attr) (output tf.Output) {
+//	batch_size: A scalar representing the number of elements to accumulate in a
+// batch.
+//	padded_shapes: A list of int64 tensors representing the desired padded shapes
+// of the corresponding output components. These shapes may be partially
+// specified, using `-1` to indicate that a particular dimension should be
+// padded to the maximum size of all batch elements.
+//	padding_values: A list of scalars containing the padding value to use for
+// each of the outputs.
+//
+func PaddedBatchDataset(scope *Scope, input_dataset tf.Output, batch_size tf.Output, padded_shapes []tf.Output, padding_values []tf.Output, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "StackPushV2",
+		Type: "PaddedBatchDataset",
 		Input: []tf.Input{
-			handle, elem,
+			input_dataset, batch_size, tf.OutputList(padded_shapes), tf.OutputList(padding_values),
 		},
 		Attrs: attrs,
 	}
@@ -23849,110 +22579,80 @@ func StackPushV2(scope *Scope, handle tf.Output, elem tf.Output, optional ...Sta
 	return op.Output(0)
 }
 
-// FusedBatchNormGradV2Attr is an optional argument to FusedBatchNormGradV2.
-type FusedBatchNormGradV2Attr func(optionalAttr)
-
-// FusedBatchNormGradV2Epsilon sets the optional epsilon attribute to value.
+// Creates a dataset that batches input elements into a SparseTensor.
 //
-// value: A small float number added to the variance of x.
-// If not specified, defaults to 0.0001
-func FusedBatchNormGradV2Epsilon(value float32) FusedBatchNormGradV2Attr {
-	return func(m optionalAttr) {
-		m["epsilon"] = value
-	}
-}
-
-// FusedBatchNormGradV2DataFormat sets the optional data_format attribute to value.
+// Arguments:
+//	input_dataset: A handle to an input dataset. Must have a single component.
+//	batch_size: A scalar representing the number of elements to accumulate in a
+// batch.
+//	row_shape: A vector representing the dense shape of each row in the produced
+// SparseTensor. The shape may be partially specified, using `-1` to indicate
+// that a particular dimension should use the maximum size of all batch elements.
 //
-// value: The data format for y_backprop, x, x_backprop.
-// Either "NHWC" (default) or "NCHW".
-// If not specified, defaults to "NHWC"
-func FusedBatchNormGradV2DataFormat(value string) FusedBatchNormGradV2Attr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// FusedBatchNormGradV2IsTraining sets the optional is_training attribute to value.
 //
-// value: A bool value to indicate the operation is for training (default)
-// or inference.
-// If not specified, defaults to true
-func FusedBatchNormGradV2IsTraining(value bool) FusedBatchNormGradV2Attr {
-	return func(m optionalAttr) {
-		m["is_training"] = value
+func DenseToSparseBatchDataset(scope *Scope, input_dataset tf.Output, batch_size tf.Output, row_shape tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "DenseToSparseBatchDataset",
+		Input: []tf.Input{
+			input_dataset, batch_size, row_shape,
+		},
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Gradient for batch normalization.
-//
-// Note that the size of 4D Tensors are defined by either "NHWC" or "NCHW".
-// The size of 1D Tensors matches the dimension C of the 4D Tensors.
-//
-// Arguments:
-//	y_backprop: A 4D Tensor for the gradient with respect to y.
-//	x: A 4D Tensor for input data.
-//	scale: A 1D Tensor for scaling factor, to scale the normalized x.
-//	reserve_space_1: When is_training is True, a 1D Tensor for the computed batch
-// mean to be reused in gradient computation. When is_training is
-// False, a 1D Tensor for the population mean to be reused in both
-// 1st and 2nd order gradient computation.
-//	reserve_space_2: When is_training is True, a 1D Tensor for the computed batch
-// variance (inverted variance in the cuDNN case) to be reused in
-// gradient computation. When is_training is False, a 1D Tensor
-// for the population variance to be reused in both 1st and 2nd
-// order gradient computation.
+// Deprecated. Use TensorArrayGradV3
 //
-// Returns A 4D Tensor for the gradient with respect to x.A 1D Tensor for the gradient with respect to scale.A 1D Tensor for the gradient with respect to offset.Unused placeholder to match the mean input in FusedBatchNorm.Unused placeholder to match the variance input
-// in FusedBatchNorm.
-func FusedBatchNormGradV2(scope *Scope, y_backprop tf.Output, x tf.Output, scale tf.Output, reserve_space_1 tf.Output, reserve_space_2 tf.Output, optional ...FusedBatchNormGradV2Attr) (x_backprop tf.Output, scale_backprop tf.Output, offset_backprop tf.Output, reserve_space_3 tf.Output, reserve_space_4 tf.Output) {
+// DEPRECATED at GraphDef version 26: Use TensorArrayGradV3
+func TensorArrayGradV2(scope *Scope, handle tf.Output, flow_in tf.Output, source string) (grad_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"source": source}
 	opspec := tf.OpSpec{
-		Type: "FusedBatchNormGradV2",
+		Type: "TensorArrayGradV2",
 		Input: []tf.Input{
-			y_backprop, x, scale, reserve_space_1, reserve_space_2,
+			handle, flow_in,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
+	return op.Output(0)
 }
 
-// DecodeCompressedAttr is an optional argument to DecodeCompressed.
-type DecodeCompressedAttr func(optionalAttr)
+// ResourceSparseApplyAdadeltaAttr is an optional argument to ResourceSparseApplyAdadelta.
+type ResourceSparseApplyAdadeltaAttr func(optionalAttr)
 
-// DecodeCompressedCompressionType sets the optional compression_type attribute to value.
+// ResourceSparseApplyAdadeltaUseLocking sets the optional use_locking attribute to value.
 //
-// value: A scalar containing either (i) the empty string (no
-// compression), (ii) "ZLIB", or (iii) "GZIP".
-// If not specified, defaults to ""
-func DecodeCompressedCompressionType(value string) DecodeCompressedAttr {
+// value: If True, updating of the var and accum tensors will be protected by
+// a lock; otherwise the behavior is undefined, but may exhibit less contention.
+// If not specified, defaults to false
+func ResourceSparseApplyAdadeltaUseLocking(value bool) ResourceSparseApplyAdadeltaAttr {
 	return func(m optionalAttr) {
-		m["compression_type"] = value
+		m["use_locking"] = value
 	}
 }
 
-// Decompress strings.
-//
-// This op decompresses each element of the `bytes` input `Tensor`, which
-// is assumed to be compressed using the given `compression_type`.
-//
-// The `output` is a string `Tensor` of the same shape as `bytes`,
-// each element containing the decompressed data from the corresponding
-// element in `bytes`.
+// var: Should be from a Variable().
 //
 // Arguments:
-//	bytes: A Tensor of string which is compressed.
 //
-// Returns A Tensor with the same shape as input `bytes`, uncompressed
-// from bytes.
-func DecodeCompressed(scope *Scope, bytes tf.Output, optional ...DecodeCompressedAttr) (output tf.Output) {
+//	accum: Should be from a Variable().
+//	accum_update: : Should be from a Variable().
+//	lr: Learning rate. Must be a scalar.
+//	rho: Decay factor. Must be a scalar.
+//	epsilon: Constant factor. Must be a scalar.
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var and accum.
+//
+// Returns the created operation.
+func ResourceSparseApplyAdadelta(scope *Scope, var_ tf.Output, accum tf.Output, accum_update tf.Output, lr tf.Output, rho tf.Output, epsilon tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyAdadeltaAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -23961,1253 +22661,1296 @@ func DecodeCompressed(scope *Scope, bytes tf.Output, optional ...DecodeCompresse
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "DecodeCompressed",
+		Type: "ResourceSparseApplyAdadelta",
 		Input: []tf.Input{
-			bytes,
+			var_, accum, accum_update, lr, rho, epsilon, grad, indices,
 		},
 		Attrs: attrs,
 	}
+	return scope.AddOperation(opspec)
+}
+
+// Identity op for gradient debugging.
+//
+// This op is hidden from public in Python. It is used by TensorFlow Debugger to
+// register gradient tensors for gradient debugging.
+// This op operates on non-reference-type tensors.
+func DebugGradientIdentity(scope *Scope, input tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "DebugGradientIdentity",
+		Input: []tf.Input{
+			input,
+		},
+	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Creates a TensorArray for storing the gradients of values in the given handle.
+// Return substrings from `Tensor` of strings.
 //
-// If the given TensorArray gradient already exists, returns a reference to it.
+// For each string in the input `Tensor`, creates a substring starting at index
+// `pos` with a total length of `len`.
 //
-// Locks the size of the original TensorArray by disabling its dynamic size flag.
+// If `len` defines a substring that would extend beyond the length of the input
+// string, then as many characters as possible are used.
 //
-// **A note about the input flow_in:**
+// If `pos` is negative or specifies a character index larger than any of the input
+// strings, then an `InvalidArgumentError` is thrown.
 //
-// The handle flow_in forces the execution of the gradient lookup to occur
-// only after certain other operations have occurred.  For example, when
-// the forward TensorArray is dynamically sized, writes to this TensorArray
-// may resize the object.  The gradient TensorArray is statically sized based
-// on the size of the forward TensorArray when this operation executes.
-// Furthermore, the size of the forward TensorArray is frozen by this call.
-// As a result, the flow is used to ensure that the call to generate the gradient
-// TensorArray only happens after all writes are executed.
+// `pos` and `len` must have the same shape, otherwise a `ValueError` is thrown on
+// Op creation.
 //
-// In the case of dynamically sized TensorArrays, gradient computation should
-// only be performed on read operations that have themselves been chained via
-// flow to occur only after all writes have executed. That way the final size
-// of the forward TensorArray is known when this operation is called.
+// *NOTE*: `Substr` supports broadcasting up to two dimensions. More about
+// broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
 //
-// **A note about the source attribute:**
+// ---
 //
-// TensorArray gradient calls use an accumulator TensorArray object.  If
-// multiple gradients are calculated and run in the same session, the multiple
-// gradient nodes may accidentally flow through the same accumulator TensorArray.
-// This double counts and generally breaks the TensorArray gradient flow.
+// Examples
 //
-// The solution is to identify which gradient call this particular
-// TensorArray gradient is being called in.  This is performed by identifying
-// a unique string (e.g. "gradients", "gradients_1", ...) from the input
-// gradient Tensor's name.  This string is used as a suffix when creating
-// the TensorArray gradient object here (the attribute `source`).
+// Using scalar `pos` and `len`:
 //
-// The attribute `source` is added as a suffix to the forward TensorArray's
-// name when performing the creation / lookup, so that each separate gradient
-// calculation gets its own TensorArray accumulator.
+// ```python
+// input = [b'Hello', b'World']
+// position = 1
+// length = 3
+//
+// output = [b'ell', b'orl']
+// ```
+//
+// Using `pos` and `len` with same shape as `input`:
+//
+// ```python
+// input = [[b'ten', b'eleven', b'twelve'],
+//          [b'thirteen', b'fourteen', b'fifteen'],
+//          [b'sixteen', b'seventeen', b'eighteen']]
+// position = [[1, 2, 3],
+//             [1, 2, 3],
+//             [1, 2, 3]]
+// length =   [[2, 3, 4],
+//             [4, 3, 2],
+//             [5, 5, 5]]
+//
+// output = [[b'en', b'eve', b'lve'],
+//           [b'hirt', b'urt', b'te'],
+//           [b'ixtee', b'vente', b'hteen']]
+// ```
+//
+// Broadcasting `pos` and `len` onto `input`:
+//
+// ```
+// input = [[b'ten', b'eleven', b'twelve'],
+//          [b'thirteen', b'fourteen', b'fifteen'],
+//          [b'sixteen', b'seventeen', b'eighteen'],
+//          [b'nineteen', b'twenty', b'twentyone']]
+// position = [1, 2, 3]
+// length =   [1, 2, 3]
+//
+// output = [[b'e', b'ev', b'lve'],
+//           [b'h', b'ur', b'tee'],
+//           [b'i', b've', b'hte'],
+//           [b'i', b'en', b'nty']]
+// ```
+//
+// Broadcasting `input` onto `pos` and `len`:
+//
+// ```
+// input = b'thirteen'
+// position = [1, 5, 7]
+// length =   [3, 2, 1]
+//
+// output = [b'hir', b'ee', b'n']
+// ```
 //
 // Arguments:
-//	handle: The handle to the forward TensorArray.
-//	flow_in: A float scalar that enforces proper chaining of operations.
-//	source: The gradient source string, used to decide which gradient TensorArray
-// to return.
-func TensorArrayGradV3(scope *Scope, handle tf.Output, flow_in tf.Output, source string) (grad_handle tf.Output, flow_out tf.Output) {
+//	input: Tensor of strings
+//	pos: Scalar defining the position of first character in each substring
+//	len: Scalar defining the number of characters to include in each substring
+//
+// Returns Tensor of substrings
+func Substr(scope *Scope, input tf.Output, pos tf.Output, len tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"source": source}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayGradV3",
+		Type: "Substr",
 		Input: []tf.Input{
-			handle, flow_in,
+			input, pos, len,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// Compare values of `input` to `threshold` and pack resulting bits into a `uint8`.
-//
-// Each comparison returns a boolean `true` (if `input_value > threshold`)
-// or and `false` otherwise.
-//
-// This operation is useful for Locality-Sensitive-Hashing (LSH) and other
-// algorithms that use hashing approximations of cosine and `L2` distances;
-// codes can be generated from an input via:
-//
-// ```python
-// codebook_size = 50
-// codebook_bits = codebook_size * 32
-// codebook = tf.get_variable('codebook', [x.shape[-1].value, codebook_bits],
-//                            dtype=x.dtype,
-//                            initializer=tf.orthogonal_initializer())
-// codes = compare_and_threshold(tf.matmul(x, codebook), threshold=0.)
-// codes = tf.bitcast(codes, tf.int32)  # go from uint8 to int32
-// # now codes has shape x.shape[:-1] + [codebook_size]
-// ```
-//
-// **NOTE**: Currently, the innermost dimension of the tensor must be divisible
-// by 8.
-//
-// Given an `input` shaped `[s0, s1, ..., s_n]`, the output is
-// a `uint8` tensor shaped `[s0, s1, ..., s_n / 8]`.
+// Creates a Dataset that returns pseudorandom numbers.
 //
 // Arguments:
-//	input: Values to compare against `threshold` and bitpack.
-//	threshold: Threshold to compare against.
+//	seed: A scalar seed for the random number generator. If either seed or
+// seed2 is set to be non-zero, the random number generator is seeded
+// by the given seed.  Otherwise, a random seed is used.
+//	seed2: A second scalar seed to avoid seed collision.
 //
-// Returns The bitpacked comparisons.
-func CompareAndBitpack(scope *Scope, input tf.Output, threshold tf.Output) (output tf.Output) {
+//
+func RandomDataset(scope *Scope, seed tf.Output, seed2 tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "CompareAndBitpack",
+		Type: "RandomDataset",
 		Input: []tf.Input{
-			input, threshold,
+			seed, seed2,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Push an element onto the tensor_array.
+// Creates a dataset that shuffles and repeats elements from `input_dataset`
+//
+// pseudorandomly.
 //
 // Arguments:
-//	handle: The handle to a TensorArray.
-//	index: The position to write to inside the TensorArray.
-//	value: The tensor to write to the TensorArray.
-//	flow_in: A float scalar that enforces proper chaining of operations.
 //
-// Returns A float scalar that enforces proper chaining of operations.
-func TensorArrayWriteV3(scope *Scope, handle tf.Output, index tf.Output, value tf.Output, flow_in tf.Output) (flow_out tf.Output) {
+//	buffer_size: The number of output elements to buffer in an iterator over
+// this dataset. Compare with the `min_after_dequeue` attr when creating a
+// `RandomShuffleQueue`.
+//	seed: A scalar seed for the random number generator. If either `seed` or
+// `seed2` is set to be non-zero, the random number generator is seeded
+// by the given seed.  Otherwise, a random seed is used.
+//	seed2: A second scalar seed to avoid seed collision.
+//	count: A scalar representing the number of times the underlying dataset
+// should be repeated. The default is `-1`, which results in infinite repetition.
+//
+//
+func ShuffleAndRepeatDataset(scope *Scope, input_dataset tf.Output, buffer_size tf.Output, seed tf.Output, seed2 tf.Output, count tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayWriteV3",
+		Type: "ShuffleAndRepeatDataset",
 		Input: []tf.Input{
-			handle, index, value, flow_in,
+			input_dataset, buffer_size, seed, seed2, count,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Scatter the data from the input value into specific TensorArray elements.
+// Creates a dataset that caches elements from `input_dataset`.
 //
-// `indices` must be a vector, its length must match the first dim of `value`.
+// A CacheDataset will iterate over the input_dataset, and store tensors. If the
+// cache already exists, the cache will be used. If the cache is inappropriate
+// (e.g. cannot be opened, contains tensors of the wrong shape / size), an error
+// will the returned when used.
 //
 // Arguments:
-//	handle: The handle to a TensorArray.
-//	indices: The locations at which to write the tensor elements.
-//	value: The concatenated tensor to write to the TensorArray.
-//	flow_in: A float scalar that enforces proper chaining of operations.
 //
-// Returns A float scalar that enforces proper chaining of operations.
-func TensorArrayScatterV3(scope *Scope, handle tf.Output, indices tf.Output, value tf.Output, flow_in tf.Output) (flow_out tf.Output) {
+//	filename: A path on the filesystem where we should cache the dataset. Note: this
+// will be a directory.
+//
+//
+func CacheDataset(scope *Scope, input_dataset tf.Output, filename tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayScatterV3",
+		Type: "CacheDataset",
 		Input: []tf.Input{
-			handle, indices, value, flow_in,
+			input_dataset, filename,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// TensorArrayConcatV3Attr is an optional argument to TensorArrayConcatV3.
-type TensorArrayConcatV3Attr func(optionalAttr)
-
-// TensorArrayConcatV3ElementShapeExcept0 sets the optional element_shape_except0 attribute to value.
-//
-// value: The expected shape of an element, if known,
-// excluding the first dimension. Used to validate the shapes of
-// TensorArray elements. If this shape is not fully specified, concatenating
-// zero-size TensorArrays is an error.
-// If not specified, defaults to <unknown_rank:true >
-func TensorArrayConcatV3ElementShapeExcept0(value tf.Shape) TensorArrayConcatV3Attr {
-	return func(m optionalAttr) {
-		m["element_shape_except0"] = value
-	}
-}
-
-// Concat the elements from the TensorArray into value `value`.
-//
-// Takes `T` elements of shapes
-//
-//   ```
-//   (n0 x d0 x d1 x ...), (n1 x d0 x d1 x ...), ..., (n(T-1) x d0 x d1 x ...)
-//   ```
-//
-// and concatenates them into a Tensor of shape:
-//
-//   ```(n0 + n1 + ... + n(T-1) x d0 x d1 x ...)```
-//
-// All elements must have the same shape (excepting the first dimension).
+// Creates a dataset that executes a SQL query and emits rows of the result set.
 //
 // Arguments:
-//	handle: The handle to a TensorArray.
-//	flow_in: A float scalar that enforces proper chaining of operations.
-//	dtype: The type of the elem that is returned.
+//	driver_name: The database type. Currently, the only supported type is 'sqlite'.
+//	data_source_name: A connection string to connect to the database.
+//	query: A SQL query to execute.
 //
-// Returns All of the elements in the TensorArray, concatenated along the first
-// axis.A vector of the row sizes of the original T elements in the
-// value output.  In the example above, this would be the values:
-// `(n1, n2, ..., n(T-1))`.
-func TensorArrayConcatV3(scope *Scope, handle tf.Output, flow_in tf.Output, dtype tf.DataType, optional ...TensorArrayConcatV3Attr) (value tf.Output, lengths tf.Output) {
+//
+func SqlDataset(scope *Scope, driver_name tf.Output, data_source_name tf.Output, query tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayConcatV3",
+		Type: "SqlDataset",
 		Input: []tf.Input{
-			handle, flow_in,
+			driver_name, data_source_name, query,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
-}
-
-// ParameterizedTruncatedNormalAttr is an optional argument to ParameterizedTruncatedNormal.
-type ParameterizedTruncatedNormalAttr func(optionalAttr)
-
-// ParameterizedTruncatedNormalSeed sets the optional seed attribute to value.
-//
-// value: If either `seed` or `seed2` are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func ParameterizedTruncatedNormalSeed(value int64) ParameterizedTruncatedNormalAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
-
-// ParameterizedTruncatedNormalSeed2 sets the optional seed2 attribute to value.
-//
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func ParameterizedTruncatedNormalSeed2(value int64) ParameterizedTruncatedNormalAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
-	}
+	return op.Output(0)
 }
 
-// Outputs random values from a normal distribution. The parameters may each be a
-//
-// scalar which applies to the entire output, or a vector of length shape[0] which
-// stores the parameters for each batch.
+// Creates a dataset that emits the records from one or more binary files.
 //
 // Arguments:
-//	shape: The shape of the output tensor. Batches are indexed by the 0th dimension.
-//	means: The mean parameter of each batch.
-//	stdevs: The standard deviation parameter of each batch. Must be greater than 0.
-//	minvals: The minimum cutoff. May be -infinity.
-//	maxvals: The maximum cutoff. May be +infinity, and must be more than the minval
-// for each batch.
-//
-// Returns A matrix of shape num_batches x samples_per_batch, filled with random
-// truncated normal values using the parameters for each row.
-func ParameterizedTruncatedNormal(scope *Scope, shape tf.Output, means tf.Output, stdevs tf.Output, minvals tf.Output, maxvals tf.Output, optional ...ParameterizedTruncatedNormalAttr) (output tf.Output) {
+//	filenames: A scalar or a vector containing the name(s) of the file(s) to be
+// read.
+//	header_bytes: A scalar representing the number of bytes to skip at the
+// beginning of a file.
+//	record_bytes: A scalar representing the number of bytes in each record.
+//	footer_bytes: A scalar representing the number of bytes to skip at the end
+// of a file.
+//	buffer_size: A scalar representing the number of bytes to buffer. Must be > 0.
+func FixedLengthRecordDataset(scope *Scope, filenames tf.Output, header_bytes tf.Output, record_bytes tf.Output, footer_bytes tf.Output, buffer_size tf.Output) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "ParameterizedTruncatedNormal",
+		Type: "FixedLengthRecordDataset",
 		Input: []tf.Input{
-			shape, means, stdevs, minvals, maxvals,
+			filenames, header_bytes, record_bytes, footer_bytes, buffer_size,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns a diagonal tensor with a given diagonal values.
-//
-// Given a `diagonal`, this operation returns a tensor with the `diagonal` and
-// everything else padded with zeros. The diagonal is computed as follows:
-//
-// Assume `diagonal` has dimensions [D1,..., Dk], then the output is a tensor of
-// rank 2k with dimensions [D1,..., Dk, D1,..., Dk] where:
-//
-// `output[i1,..., ik, i1,..., ik] = diagonal[i1, ..., ik]` and 0 everywhere else.
+// Gradients for batch normalization.
 //
-// For example:
+// DEPRECATED at GraphDef version 9: Use tf.nn.batch_normalization()
 //
-// ```
-// # 'diagonal' is [1, 2, 3, 4]
-// tf.diag(diagonal) ==> [[1, 0, 0, 0]
-//                        [0, 2, 0, 0]
-//                        [0, 0, 3, 0]
-//                        [0, 0, 0, 4]]
-// ```
+// This op is deprecated. See `tf.nn.batch_normalization`.
 //
 // Arguments:
-//	diagonal: Rank k tensor where k is at most 1.
-func Diag(scope *Scope, diagonal tf.Output) (output tf.Output) {
+//	t: A 4D input Tensor.
+//	m: A 1D mean Tensor with size matching the last dimension of t.
+// This is the first output from tf.nn.moments,
+// or a saved moving average thereof.
+//	v: A 1D variance Tensor with size matching the last dimension of t.
+// This is the second output from tf.nn.moments,
+// or a saved moving average thereof.
+//	gamma: A 1D gamma Tensor with size matching the last dimension of t.
+// If "scale_after_normalization" is true, this Tensor will be multiplied
+// with the normalized Tensor.
+//	backprop: 4D backprop Tensor.
+//	variance_epsilon: A small float number to avoid dividing by 0.
+//	scale_after_normalization: A bool indicating whether the resulted tensor
+// needs to be multiplied with gamma.
+//
+// Returns 4D backprop tensor for input.1D backprop tensor for mean.1D backprop tensor for variance.1D backprop tensor for beta.1D backprop tensor for gamma.
+func BatchNormWithGlobalNormalizationGrad(scope *Scope, t tf.Output, m tf.Output, v tf.Output, gamma tf.Output, backprop tf.Output, variance_epsilon float32, scale_after_normalization bool) (dx tf.Output, dm tf.Output, dv tf.Output, db tf.Output, dg tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"variance_epsilon": variance_epsilon, "scale_after_normalization": scale_after_normalization}
 	opspec := tf.OpSpec{
-		Type: "Diag",
+		Type: "BatchNormWithGlobalNormalizationGrad",
 		Input: []tf.Input{
-			diagonal,
+			t, m, v, gamma, backprop,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
 }
 
-// Split the data from the input value into TensorArray elements.
-//
-// Assuming that `lengths` takes on values
+// Creates a dataset that emits the records from one or more TFRecord files.
 //
-//   ```(n0, n1, ..., n(T-1))```
+// Arguments:
+//	filenames: A scalar or vector containing the name(s) of the file(s) to be
+// read.
+//	compression_type: A scalar containing either (i) the empty string (no
+// compression), (ii) "ZLIB", or (iii) "GZIP".
+//	buffer_size: A scalar representing the number of bytes to buffer. A value of
+// 0 means no buffering will be performed.
+func TFRecordDataset(scope *Scope, filenames tf.Output, compression_type tf.Output, buffer_size tf.Output) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "TFRecordDataset",
+		Input: []tf.Input{
+			filenames, compression_type, buffer_size,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// BatchToSpace for 4-D tensors of type T.
 //
-// and that `value` has shape
+// This is a legacy version of the more general BatchToSpaceND.
 //
-//   ```(n0 + n1 + ... + n(T-1) x d0 x d1 x ...)```,
+// Rearranges (permutes) data from batch into blocks of spatial data, followed by
+// cropping. This is the reverse transformation of SpaceToBatch. More specifically,
+// this op outputs a copy of the input tensor where values from the `batch`
+// dimension are moved in spatial blocks to the `height` and `width` dimensions,
+// followed by cropping along the `height` and `width` dimensions.
 //
-// this splits values into a TensorArray with T tensors.
+// Arguments:
+//	input: 4-D tensor with shape
+// `[batch*block_size*block_size, height_pad/block_size, width_pad/block_size,
+//   depth]`. Note that the batch size of the input tensor must be divisible by
+// `block_size * block_size`.
+//	crops: 2-D tensor of non-negative integers with shape `[2, 2]`. It specifies
+// how many elements to crop from the intermediate result across the spatial
+// dimensions as follows:
 //
-// TensorArray index t will be the subtensor of values with starting position
+//     crops = [[crop_top, crop_bottom], [crop_left, crop_right]]
 //
-//   ```(n0 + n1 + ... + n(t-1), 0, 0, ...)```
 //
-// and having size
+// Returns 4-D with shape `[batch, height, width, depth]`, where:
 //
-//   ```nt x d0 x d1 x ...```
+//       height = height_pad - crop_top - crop_bottom
+//       width = width_pad - crop_left - crop_right
 //
-// Arguments:
-//	handle: The handle to a TensorArray.
-//	value: The concatenated tensor to write to the TensorArray.
-//	lengths: The vector of lengths, how to split the rows of value into the
-// TensorArray.
-//	flow_in: A float scalar that enforces proper chaining of operations.
+// The attr `block_size` must be greater than one. It indicates the block size.
 //
-// Returns A float scalar that enforces proper chaining of operations.
-func TensorArraySplitV3(scope *Scope, handle tf.Output, value tf.Output, lengths tf.Output, flow_in tf.Output) (flow_out tf.Output) {
+// Some examples:
+//
+// (1) For the following input of shape `[4, 1, 1, 1]` and block_size of 2:
+//
+// ```
+// [[[[1]]], [[[2]]], [[[3]]], [[[4]]]]
+// ```
+//
+// The output tensor has shape `[1, 2, 2, 1]` and value:
+//
+// ```
+// x = [[[[1], [2]], [[3], [4]]]]
+// ```
+//
+// (2) For the following input of shape `[4, 1, 1, 3]` and block_size of 2:
+//
+// ```
+// [[[1, 2, 3]], [[4, 5, 6]], [[7, 8, 9]], [[10, 11, 12]]]
+// ```
+//
+// The output tensor has shape `[1, 2, 2, 3]` and value:
+//
+// ```
+// x = [[[[1, 2, 3], [4, 5, 6]],
+//       [[7, 8, 9], [10, 11, 12]]]]
+// ```
+//
+// (3) For the following input of shape `[4, 2, 2, 1]` and block_size of 2:
+//
+// ```
+// x = [[[[1], [3]], [[9], [11]]],
+//      [[[2], [4]], [[10], [12]]],
+//      [[[5], [7]], [[13], [15]]],
+//      [[[6], [8]], [[14], [16]]]]
+// ```
+//
+// The output tensor has shape `[1, 4, 4, 1]` and value:
+//
+// ```
+// x = [[[1],   [2],  [3],  [4]],
+//      [[5],   [6],  [7],  [8]],
+//      [[9],  [10], [11],  [12]],
+//      [[13], [14], [15],  [16]]]
+// ```
+//
+// (4) For the following input of shape `[8, 1, 2, 1]` and block_size of 2:
+//
+// ```
+// x = [[[[1], [3]]], [[[9], [11]]], [[[2], [4]]], [[[10], [12]]],
+//      [[[5], [7]]], [[[13], [15]]], [[[6], [8]]], [[[14], [16]]]]
+// ```
+//
+// The output tensor has shape `[2, 2, 4, 1]` and value:
+//
+// ```
+// x = [[[[1], [3]], [[5], [7]]],
+//      [[[2], [4]], [[10], [12]]],
+//      [[[5], [7]], [[13], [15]]],
+//      [[[6], [8]], [[14], [16]]]]
+// ```
+func BatchToSpace(scope *Scope, input tf.Output, crops tf.Output, block_size int64) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"block_size": block_size}
 	opspec := tf.OpSpec{
-		Type: "TensorArraySplitV3",
+		Type: "BatchToSpace",
 		Input: []tf.Input{
-			handle, value, lengths, flow_in,
+			input, crops,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// SerializeSparseAttr is an optional argument to SerializeSparse.
-type SerializeSparseAttr func(optionalAttr)
-
-// SerializeSparseOutType sets the optional out_type attribute to value.
+// Makes a new iterator from the given `dataset` and stores it in `iterator`.
 //
-// value: The `dtype` to use for serialization; the supported types are `string`
-// (default) and `variant`.
-// If not specified, defaults to DT_STRING
-func SerializeSparseOutType(value tf.DataType) SerializeSparseAttr {
-	return func(m optionalAttr) {
-		m["out_type"] = value
+// This operation may be executed multiple times. Each execution will reset the
+// iterator in `iterator` to the first element of `dataset`.
+//
+// Returns the created operation.
+func MakeIterator(scope *Scope, dataset tf.Output, iterator tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "MakeIterator",
+		Input: []tf.Input{
+			dataset, iterator,
+		},
 	}
+	return scope.AddOperation(opspec)
 }
 
-// Serialize a `SparseTensor` into a `[3]` `Tensor` object.
+// Adjust the contrast of one or more images.
+//
+// `images` is a tensor of at least 3 dimensions.  The last 3 dimensions are
+// interpreted as `[height, width, channels]`.  The other dimensions only
+// represent a collection of images, such as `[batch, height, width, channels].`
+//
+// Contrast is adjusted independently for each channel of each image.
+//
+// For each channel, the Op first computes the mean of the image pixels in the
+// channel and then adjusts each component of each pixel to
+// `(x - mean) * contrast_factor + mean`.
 //
 // Arguments:
-//	sparse_indices: 2-D.  The `indices` of the `SparseTensor`.
-//	sparse_values: 1-D.  The `values` of the `SparseTensor`.
-//	sparse_shape: 1-D.  The `shape` of the `SparseTensor`.
-func SerializeSparse(scope *Scope, sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output, optional ...SerializeSparseAttr) (serialized_sparse tf.Output) {
+//	images: Images to adjust.  At least 3-D.
+//	contrast_factor: A float multiplier for adjusting contrast.
+//
+// Returns The contrast-adjusted image or images.
+func AdjustContrastv2(scope *Scope, images tf.Output, contrast_factor tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "SerializeSparse",
+		Type: "AdjustContrastv2",
 		Input: []tf.Input{
-			sparse_indices, sparse_values, sparse_shape,
+			images, contrast_factor,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// RandomShuffleQueueV2Attr is an optional argument to RandomShuffleQueueV2.
-type RandomShuffleQueueV2Attr func(optionalAttr)
-
-// RandomShuffleQueueV2Shapes sets the optional shapes attribute to value.
-//
-// value: The shape of each component in a value. The length of this attr must
-// be either 0 or the same as the length of component_types. If the length of
-// this attr is 0, the shapes of queue elements are not constrained, and
-// only one element may be dequeued at a time.
-// If not specified, defaults to <>
-//
-// REQUIRES: len(value) >= 0
-func RandomShuffleQueueV2Shapes(value []tf.Shape) RandomShuffleQueueV2Attr {
-	return func(m optionalAttr) {
-		m["shapes"] = value
+// Gets the next output from the given iterator.
+func IteratorGetNext(scope *Scope, iterator tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (components []tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "IteratorGetNext",
+		Input: []tf.Input{
+			iterator,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
+		scope.UpdateErr("IteratorGetNext", err)
+		return
 	}
+	return components
 }
 
-// RandomShuffleQueueV2Capacity sets the optional capacity attribute to value.
+// Outputs the single element from the given dataset.
 //
-// value: The upper bound on the number of elements in this queue.
-// Negative numbers mean no limit.
-// If not specified, defaults to -1
-func RandomShuffleQueueV2Capacity(value int64) RandomShuffleQueueV2Attr {
-	return func(m optionalAttr) {
-		m["capacity"] = value
+// Arguments:
+//	dataset: A handle to a dataset that contains a single element.
+//
+//
+//
+// Returns The components of the single element of `input`.
+func DatasetToSingleElement(scope *Scope, dataset tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (components []tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "DatasetToSingleElement",
+		Input: []tf.Input{
+			dataset,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
+		scope.UpdateErr("DatasetToSingleElement", err)
+		return
 	}
+	return components
 }
 
-// RandomShuffleQueueV2MinAfterDequeue sets the optional min_after_dequeue attribute to value.
+// Converts the given `resource_handle` representing an iterator to a string.
 //
-// value: Dequeue will block unless there would be this
-// many elements after the dequeue or the queue is closed. This
-// ensures a minimum level of mixing of elements.
-// If not specified, defaults to 0
-func RandomShuffleQueueV2MinAfterDequeue(value int64) RandomShuffleQueueV2Attr {
-	return func(m optionalAttr) {
-		m["min_after_dequeue"] = value
-	}
-}
-
-// RandomShuffleQueueV2Seed sets the optional seed attribute to value.
+// Arguments:
+//	resource_handle: A handle to an iterator resource.
 //
-// value: If either seed or seed2 is set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, a random seed is used.
-// If not specified, defaults to 0
-func RandomShuffleQueueV2Seed(value int64) RandomShuffleQueueV2Attr {
-	return func(m optionalAttr) {
-		m["seed"] = value
+// Returns A string representation of the given handle.
+func IteratorToStringHandle(scope *Scope, resource_handle tf.Output) (string_handle tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// RandomShuffleQueueV2Seed2 sets the optional seed2 attribute to value.
-//
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func RandomShuffleQueueV2Seed2(value int64) RandomShuffleQueueV2Attr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
+	opspec := tf.OpSpec{
+		Type: "IteratorToStringHandle",
+		Input: []tf.Input{
+			resource_handle,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// RandomShuffleQueueV2Container sets the optional container attribute to value.
+// IteratorFromStringHandleAttr is an optional argument to IteratorFromStringHandle.
+type IteratorFromStringHandleAttr func(optionalAttr)
+
+// IteratorFromStringHandleOutputTypes sets the optional output_types attribute to value.
 //
-// value: If non-empty, this queue is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func RandomShuffleQueueV2Container(value string) RandomShuffleQueueV2Attr {
+// value: If specified, defines the type of each tuple component in an
+// element produced by the resulting iterator.
+// If not specified, defaults to <>
+//
+// REQUIRES: len(value) >= 0
+func IteratorFromStringHandleOutputTypes(value []tf.DataType) IteratorFromStringHandleAttr {
 	return func(m optionalAttr) {
-		m["container"] = value
+		m["output_types"] = value
 	}
 }
 
-// RandomShuffleQueueV2SharedName sets the optional shared_name attribute to value.
+// IteratorFromStringHandleOutputShapes sets the optional output_shapes attribute to value.
 //
-// value: If non-empty, this queue will be shared under the given name
-// across multiple sessions.
-// If not specified, defaults to ""
-func RandomShuffleQueueV2SharedName(value string) RandomShuffleQueueV2Attr {
+// value: If specified, defines the shape of each tuple component in an
+// element produced by the resulting iterator.
+// If not specified, defaults to <>
+//
+// REQUIRES: len(value) >= 0
+func IteratorFromStringHandleOutputShapes(value []tf.Shape) IteratorFromStringHandleAttr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["output_shapes"] = value
 	}
 }
 
-// A queue that randomizes the order of elements.
+// Converts the given string representing a handle to an iterator to a resource.
 //
 // Arguments:
-//	component_types: The type of each component in a value.
+//	string_handle: A string representation of the given handle.
 //
-// Returns The handle to the queue.
-func RandomShuffleQueueV2(scope *Scope, component_types []tf.DataType, optional ...RandomShuffleQueueV2Attr) (handle tf.Output) {
+// Returns A handle to an iterator resource.
+func IteratorFromStringHandle(scope *Scope, string_handle tf.Output, optional ...IteratorFromStringHandleAttr) (resource_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"component_types": component_types}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "RandomShuffleQueueV2",
-
+		Type: "IteratorFromStringHandle",
+		Input: []tf.Input{
+			string_handle,
+		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Draw bounding boxes on a batch of images.
-//
-// Outputs a copy of `images` but draws on top of the pixels zero or more bounding
-// boxes specified by the locations in `boxes`. The coordinates of the each
-// bounding box in `boxes` are encoded as `[y_min, x_min, y_max, x_max]`. The
-// bounding box coordinates are floats in `[0.0, 1.0]` relative to the width and
-// height of the underlying image.
-//
-// For example, if an image is 100 x 200 pixels (height x width) and the bounding
-// box is `[0.1, 0.2, 0.5, 0.9]`, the upper-left and bottom-right coordinates of
-// the bounding box will be `(40, 10)` to `(100, 50)` (in (x,y) coordinates).
-//
-// Parts of the bounding box may fall outside the image.
-//
-// Arguments:
-//	images: 4-D with shape `[batch, height, width, depth]`. A batch of images.
-//	boxes: 3-D with shape `[batch, num_bounding_boxes, 4]` containing bounding
-// boxes.
+// Computes arctangent of `y/x` element-wise, respecting signs of the arguments.
 //
-// Returns 4-D with the same shape as `images`. The batch of input images with
-// bounding boxes drawn on the images.
-func DrawBoundingBoxes(scope *Scope, images tf.Output, boxes tf.Output) (output tf.Output) {
+// This is the angle \( \theta \in [-\pi, \pi] \) such that
+// \[ x = r \cos(\theta) \]
+// and
+// \[ y = r \sin(\theta) \]
+// where \(r = \sqrt(x^2 + y^2) \).
+func Atan2(scope *Scope, y tf.Output, x tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "DrawBoundingBoxes",
+		Type: "Atan2",
 		Input: []tf.Input{
-			images, boxes,
+			y, x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// LearnedUnigramCandidateSamplerAttr is an optional argument to LearnedUnigramCandidateSampler.
-type LearnedUnigramCandidateSamplerAttr func(optionalAttr)
-
-// LearnedUnigramCandidateSamplerSeed sets the optional seed attribute to value.
-//
-// value: If either seed or seed2 are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func LearnedUnigramCandidateSamplerSeed(value int64) LearnedUnigramCandidateSamplerAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
-
-// LearnedUnigramCandidateSamplerSeed2 sets the optional seed2 attribute to value.
-//
-// value: An second seed to avoid seed collision.
-// If not specified, defaults to 0
-func LearnedUnigramCandidateSamplerSeed2(value int64) LearnedUnigramCandidateSamplerAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
-	}
-}
-
-// Generates labels for candidate sampling with a learned unigram distribution.
-//
-// See explanations of candidate sampling and the data formats at
-// go/candidate-sampling.
-//
-// For each batch, this op picks a single set of sampled candidate labels.
-//
-// The advantages of sampling candidates per-batch are simplicity and the
-// possibility of efficient dense matrix multiplication. The disadvantage is that
-// the sampled candidates must be chosen independently of the context and of the
-// true labels.
-//
-// Arguments:
-//	true_classes: A batch_size * num_true matrix, in which each row contains the
-// IDs of the num_true target_classes in the corresponding original label.
-//	num_true: Number of true labels per context.
-//	num_sampled: Number of candidates to randomly sample.
-//	unique: If unique is true, we sample with rejection, so that all sampled
-// candidates in a batch are unique. This requires some approximation to
-// estimate the post-rejection sampling probabilities.
-//	range_max: The sampler will sample integers from the interval [0, range_max).
-//
-// Returns A vector of length num_sampled, in which each element is
-// the ID of a sampled candidate.A batch_size * num_true matrix, representing
-// the number of times each candidate is expected to occur in a batch
-// of sampled candidates. If unique=true, then this is a probability.A vector of length num_sampled, for each sampled
-// candidate representing the number of times the candidate is expected
-// to occur in a batch of sampled candidates.  If unique=true, then this is a
-// probability.
-func LearnedUnigramCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, range_max int64, optional ...LearnedUnigramCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) {
+// Return a tensor with the same shape and contents as the input tensor or value.
+func Identity(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique, "range_max": range_max}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "LearnedUnigramCandidateSampler",
+		Type: "Identity",
 		Input: []tf.Input{
-			true_classes,
+			input,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Computes gradients for the scaled exponential linear (Selu) operation.
+// Gather slices from `params` axis `axis` according to `indices`.
+//
+// `indices` must be an integer tensor of any dimension (usually 0-D or 1-D).
+// Produces an output tensor with shape `params.shape[:axis] + indices.shape +
+// params.shape[axis + 1:]` where:
+//
+// ```python
+//     # Scalar indices (output is rank(params) - 1).
+//     output[a_0, ..., a_n, b_0, ..., b_n] =
+//       params[a_0, ..., a_n, indices, b_0, ..., b_n]
+//
+//     # Vector indices (output is rank(params)).
+//     output[a_0, ..., a_n, i, b_0, ..., b_n] =
+//       params[a_0, ..., a_n, indices[i], b_0, ..., b_n]
+//
+//     # Higher rank indices (output is rank(params) + rank(indices) - 1).
+//     output[a_0, ..., a_n, i, ..., j, b_0, ... b_n] =
+//       params[a_0, ..., a_n, indices[i, ..., j], b_0, ..., b_n]
+// ```
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/Gather.png" alt>
+// </div>
 //
 // Arguments:
-//	gradients: The backpropagated gradients to the corresponding Selu operation.
-//	outputs: The outputs of the corresponding Selu operation.
+//	params: The tensor from which to gather values. Must be at least rank
+// `axis + 1`.
+//	indices: Index tensor. Must be in range `[0, params.shape[axis])`.
+//	axis: The axis in `params` to gather `indices` from. Defaults to the first
+// dimension. Supports negative indexes.
 //
-// Returns The gradients: `gradients * (outputs + scale * alpha)`
-// if outputs < 0, `scale * gradients` otherwise.
-func SeluGrad(scope *Scope, gradients tf.Output, outputs tf.Output) (backprops tf.Output) {
+// Returns Values from `params` gathered from indices given by `indices`, with
+// shape `params.shape[:axis] + indices.shape + params.shape[axis + 1:]`.
+func GatherV2(scope *Scope, params tf.Output, indices tf.Output, axis tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "SeluGrad",
+		Type: "GatherV2",
 		Input: []tf.Input{
-			gradients, outputs,
+			params, indices, axis,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Get the current size of the TensorArray.
+// Converts the given `resource_handle` representing an iterator to a variant tensor.
 //
 // Arguments:
-//	handle: The handle to a TensorArray (output of TensorArray or TensorArrayGrad).
-//	flow_in: A float scalar that enforces proper chaining of operations.
+//	resource_handle: A handle to an iterator resource.
 //
-// Returns The current size of the TensorArray.
-func TensorArraySizeV3(scope *Scope, handle tf.Output, flow_in tf.Output) (size tf.Output) {
+// Returns A variant tensor storing the state of the iterator contained in the
+// resource.
+func SerializeIterator(scope *Scope, resource_handle tf.Output) (serialized tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "TensorArraySizeV3",
+		Type: "SerializeIterator",
 		Input: []tf.Input{
-			handle, flow_in,
+			resource_handle,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Deprecated. Use TensorArrayGradV3
+// FIFOQueueV2Attr is an optional argument to FIFOQueueV2.
+type FIFOQueueV2Attr func(optionalAttr)
+
+// FIFOQueueV2Shapes sets the optional shapes attribute to value.
 //
-// DEPRECATED at GraphDef version 26: Use TensorArrayWriteV3
-func TensorArrayWriteV2(scope *Scope, handle tf.Output, index tf.Output, value tf.Output, flow_in tf.Output) (flow_out tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "TensorArrayWriteV2",
-		Input: []tf.Input{
-			handle, index, value, flow_in,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// SparseReduceMaxAttr is an optional argument to SparseReduceMax.
-type SparseReduceMaxAttr func(optionalAttr)
-
-// SparseReduceMaxKeepDims sets the optional keep_dims attribute to value.
+// value: The shape of each component in a value. The length of this attr must
+// be either 0 or the same as the length of component_types. If the length of
+// this attr is 0, the shapes of queue elements are not constrained, and
+// only one element may be dequeued at a time.
+// If not specified, defaults to <>
 //
-// value: If true, retain reduced dimensions with length 1.
-// If not specified, defaults to false
-func SparseReduceMaxKeepDims(value bool) SparseReduceMaxAttr {
+// REQUIRES: len(value) >= 0
+func FIFOQueueV2Shapes(value []tf.Shape) FIFOQueueV2Attr {
 	return func(m optionalAttr) {
-		m["keep_dims"] = value
-	}
-}
-
-// Computes the max of elements across dimensions of a SparseTensor.
-//
-// This Op takes a SparseTensor and is the sparse counterpart to
-// `tf.reduce_max()`.  In particular, this Op also returns a dense `Tensor`
-// instead of a sparse one.
-//
-// Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
-// with length 1.
-//
-// If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
-// with a single element is returned.  Additionally, the axes can be negative,
-// which are interpreted according to the indexing rules in Python.
-//
-// Arguments:
-//	input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, possibly not in canonical ordering.
-//	input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
-//	input_shape: 1-D.  Shape of the input SparseTensor.
-//	reduction_axes: 1-D.  Length-`K` vector containing the reduction axes.
-//
-// Returns `R-K`-D.  The reduced Tensor.
-func SparseReduceMax(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output, reduction_axes tf.Output, optional ...SparseReduceMaxAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "SparseReduceMax",
-		Input: []tf.Input{
-			input_indices, input_values, input_shape, reduction_axes,
-		},
-		Attrs: attrs,
+		m["shapes"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// AsStringAttr is an optional argument to AsString.
-type AsStringAttr func(optionalAttr)
-
-// AsStringPrecision sets the optional precision attribute to value.
+// FIFOQueueV2Capacity sets the optional capacity attribute to value.
 //
-// value: The post-decimal precision to use for floating point numbers.
-// Only used if precision > -1.
+// value: The upper bound on the number of elements in this queue.
+// Negative numbers mean no limit.
 // If not specified, defaults to -1
-func AsStringPrecision(value int64) AsStringAttr {
-	return func(m optionalAttr) {
-		m["precision"] = value
-	}
-}
-
-// AsStringScientific sets the optional scientific attribute to value.
-//
-// value: Use scientific notation for floating point numbers.
-// If not specified, defaults to false
-func AsStringScientific(value bool) AsStringAttr {
-	return func(m optionalAttr) {
-		m["scientific"] = value
-	}
-}
-
-// AsStringShortest sets the optional shortest attribute to value.
-//
-// value: Use shortest representation (either scientific or standard) for
-// floating point numbers.
-// If not specified, defaults to false
-func AsStringShortest(value bool) AsStringAttr {
+func FIFOQueueV2Capacity(value int64) FIFOQueueV2Attr {
 	return func(m optionalAttr) {
-		m["shortest"] = value
+		m["capacity"] = value
 	}
 }
 
-// AsStringWidth sets the optional width attribute to value.
+// FIFOQueueV2Container sets the optional container attribute to value.
 //
-// value: Pad pre-decimal numbers to this width.
-// Applies to both floating point and integer numbers.
-// Only used if width > -1.
-// If not specified, defaults to -1
-func AsStringWidth(value int64) AsStringAttr {
+// value: If non-empty, this queue is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func FIFOQueueV2Container(value string) FIFOQueueV2Attr {
 	return func(m optionalAttr) {
-		m["width"] = value
+		m["container"] = value
 	}
 }
 
-// AsStringFill sets the optional fill attribute to value.
+// FIFOQueueV2SharedName sets the optional shared_name attribute to value.
 //
-// value: The value to pad if width > -1.  If empty, pads with spaces.
-// Another typical value is '0'.  String cannot be longer than 1 character.
+// value: If non-empty, this queue will be shared under the given name
+// across multiple sessions.
 // If not specified, defaults to ""
-func AsStringFill(value string) AsStringAttr {
+func FIFOQueueV2SharedName(value string) FIFOQueueV2Attr {
 	return func(m optionalAttr) {
-		m["fill"] = value
+		m["shared_name"] = value
 	}
 }
 
-// Converts each entry in the given tensor to strings.  Supports many numeric
+// A queue that produces elements in first-in first-out order.
 //
-// types and boolean.
-func AsString(scope *Scope, input tf.Output, optional ...AsStringAttr) (output tf.Output) {
+// Arguments:
+//	component_types: The type of each component in a value.
+//
+// Returns The handle to the queue.
+func FIFOQueueV2(scope *Scope, component_types []tf.DataType, optional ...FIFOQueueV2Attr) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"component_types": component_types}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "AsString",
-		Input: []tf.Input{
-			input,
-		},
+		Type: "FIFOQueueV2",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Deprecated. Use TensorArrayScatterV3
-//
-// DEPRECATED at GraphDef version 26: Use TensorArrayScatterV3
-func TensorArrayScatterV2(scope *Scope, handle tf.Output, indices tf.Output, value tf.Output, flow_in tf.Output) (flow_out tf.Output) {
+// Produces a summary of any statistics recorded by the given statistics manager.
+func StatsAggregatorSummary(scope *Scope, iterator tf.Output) (summary tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayScatterV2",
+		Type: "StatsAggregatorSummary",
 		Input: []tf.Input{
-			handle, indices, value, flow_in,
+			iterator,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Applies sparse addition to `input` using individual values or slices
-//
-// from `updates` according to indices `indices`.  The updates are non-aliasing:
-// `input` is only modified in-place if no other operations will use it.
-// Otherwise, a copy of `input` is made.  This operation has a gradient with
-// respect to both `input` and `updates`.
-//
-// `input` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.
-//
-// `indices` must be integer tensor, containing indices into `input`.
-// It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.
-//
-// The innermost dimension of `indices` (with length `K`) corresponds to
-// indices into elements (if `K = P`) or `(P-K)`-dimensional slices
-// (if `K < P`) along the `K`th dimension of `input`.
-//
-// `updates` is `Tensor` of rank `Q-1+P-K` with shape:
-//
-// ```
-// [d_0, ..., d_{Q-2}, input.shape[K], ..., input.shape[P-1]].
-// ```
-//
-// For example, say we want to add 4 scattered elements to a rank-1 tensor to 8
-// elements. In Python, that addition would look like this:
+// Compute the pairwise cross product.
 //
-//     input = tf.constant([1, 2, 3, 4, 5, 6, 7, 8])
-//     indices = tf.constant([[4], [3], [1], [7]])
-//     updates = tf.constant([9, 10, 11, 12])
-//     output = tf.scatter_nd_non_aliasing_add(input, indices, updates)
-//     with tf.Session() as sess:
-//       print(sess.run(output))
+// `a` and `b` must be the same shape; they can either be simple 3-element vectors,
+// or any shape where the innermost dimension is 3. In the latter case, each pair
+// of corresponding 3-element vectors is cross-multiplied independently.
 //
-// The resulting value `output` would look like this:
+// Arguments:
+//	a: A tensor containing 3-element vectors.
+//	b: Another tensor, of same type and shape as `a`.
 //
-//     [1, 13, 3, 14, 14, 6, 7, 20]
+// Returns Pairwise cross product of the vectors in `a` and `b`.
+func Cross(scope *Scope, a tf.Output, b tf.Output) (product tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Cross",
+		Input: []tf.Input{
+			a, b,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Performs a padding as a preprocess during a convolution.
 //
-// See @{tf.scatter_nd} for more details about how to make updates to slices.
+// Similar to FusedResizeAndPadConv2d, this op allows for an optimized
+// implementation where the spatial padding transformation stage is fused with the
+// im2col lookup, but in this case without the bilinear filtering required for
+// resizing. Fusing the padding prevents the need to write out the intermediate
+// results as whole tensors, reducing memory pressure, and we can get some latency
+// gains by merging the transformation calculations.
+// The data_format attribute for Conv2D isn't supported by this op, and 'NHWC'
+// order is used instead.
+// Internally this op uses a single per-graph scratch buffer, which means that it
+// will block if multiple versions are being run in parallel. This is because this
+// operator is primarily an optimization to minimize memory usage.
 //
 // Arguments:
-//	input: A Tensor.
-//	indices: A Tensor. Must be one of the following types: `int32`, `int64`.
-// A tensor of indices into `input`.
-//	updates: A Tensor. Must have the same type as ref. A tensor of updated values
-// to add to `input`.
+//	input: 4-D with shape `[batch, in_height, in_width, in_channels]`.
+//	paddings: A two-column matrix specifying the padding sizes. The number of
+// rows must be the same as the rank of `input`.
+//	filter: 4-D with shape
+// `[filter_height, filter_width, in_channels, out_channels]`.
 //
-// Returns A `Tensor` with the same shape as `input`, containing values of `input`
-// updated with `updates`.
-func ScatterNdNonAliasingAdd(scope *Scope, input tf.Output, indices tf.Output, updates tf.Output) (output tf.Output) {
+//	strides: 1-D of length 4.  The stride of the sliding window for each dimension
+// of `input`. Must be in the same order as the dimension specified with format.
+//	padding: The type of padding algorithm to use.
+func FusedPadConv2D(scope *Scope, input tf.Output, paddings tf.Output, filter tf.Output, mode string, strides []int64, padding string) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"mode": mode, "strides": strides, "padding": padding}
 	opspec := tf.OpSpec{
-		Type: "ScatterNdNonAliasingAdd",
+		Type: "FusedPadConv2D",
 		Input: []tf.Input{
-			input, indices, updates,
+			input, paddings, filter,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// FractionalMaxPoolAttr is an optional argument to FractionalMaxPool.
-type FractionalMaxPoolAttr func(optionalAttr)
+// Conv2DBackpropInputAttr is an optional argument to Conv2DBackpropInput.
+type Conv2DBackpropInputAttr func(optionalAttr)
 
-// FractionalMaxPoolPseudoRandom sets the optional pseudo_random attribute to value.
-//
-// value: When set to True, generates the pooling sequence in a
-// pseudorandom fashion, otherwise, in a random fashion. Check paper [Benjamin
-// Graham, Fractional Max-Pooling](http://arxiv.org/abs/1412.6071) for
-// difference between pseudorandom and random.
-// If not specified, defaults to false
-func FractionalMaxPoolPseudoRandom(value bool) FractionalMaxPoolAttr {
+// Conv2DBackpropInputUseCudnnOnGpu sets the optional use_cudnn_on_gpu attribute to value.
+// If not specified, defaults to true
+func Conv2DBackpropInputUseCudnnOnGpu(value bool) Conv2DBackpropInputAttr {
 	return func(m optionalAttr) {
-		m["pseudo_random"] = value
+		m["use_cudnn_on_gpu"] = value
 	}
 }
 
-// FractionalMaxPoolOverlapping sets the optional overlapping attribute to value.
+// Conv2DBackpropInputDataFormat sets the optional data_format attribute to value.
 //
-// value: When set to True, it means when pooling, the values at the boundary
-// of adjacent pooling cells are used by both cells. For example:
-//
-// `index  0  1  2  3  4`
-//
-// `value  20 5  16 3  7`
-//
-// If the pooling sequence is [0, 2, 4], then 16, at index 2 will be used twice.
-// The result would be [20, 16] for fractional max pooling.
-// If not specified, defaults to false
-func FractionalMaxPoolOverlapping(value bool) FractionalMaxPoolAttr {
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// If not specified, defaults to "NHWC"
+func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr {
 	return func(m optionalAttr) {
-		m["overlapping"] = value
+		m["data_format"] = value
 	}
 }
 
-// FractionalMaxPoolDeterministic sets the optional deterministic attribute to value.
+// Conv2DBackpropInputDilations sets the optional dilations attribute to value.
 //
-// value: When set to True, a fixed pooling region will be used when
-// iterating over a FractionalMaxPool node in the computation graph. Mainly used
-// in unit test to make FractionalMaxPool deterministic.
-// If not specified, defaults to false
-func FractionalMaxPoolDeterministic(value bool) FractionalMaxPoolAttr {
+// value: 1-D tensor of length 4.  The dilation factor for each dimension of
+// `input`. If set to k > 1, there will be k-1 skipped cells between each filter
+// element on that dimension. The dimension order is determined by the value of
+// `data_format`, see above for details. Dilations in the batch and depth
+// dimensions must be 1.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 >
+func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr {
 	return func(m optionalAttr) {
-		m["deterministic"] = value
+		m["dilations"] = value
 	}
 }
 
-// FractionalMaxPoolSeed sets the optional seed attribute to value.
+// Computes the gradients of convolution with respect to the input.
 //
-// value: If either seed or seed2 are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func FractionalMaxPoolSeed(value int64) FractionalMaxPoolAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
-
-// FractionalMaxPoolSeed2 sets the optional seed2 attribute to value.
+// Arguments:
+//	input_sizes: An integer vector representing the shape of `input`,
+// where `input` is a 4-D `[batch, height, width, channels]` tensor.
+//	filter: 4-D with shape
+// `[filter_height, filter_width, in_channels, out_channels]`.
+//	out_backprop: 4-D with shape `[batch, out_height, out_width, out_channels]`.
+// Gradients w.r.t. the output of the convolution.
+//	strides: The stride of the sliding window for each dimension of the input
+// of the convolution. Must be in the same order as the dimension specified with
+// format.
+//	padding: The type of padding algorithm to use.
 //
-// value: An second seed to avoid seed collision.
-// If not specified, defaults to 0
-func FractionalMaxPoolSeed2(value int64) FractionalMaxPoolAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
+// Returns 4-D with shape `[batch, in_height, in_width, in_channels]`.  Gradient
+// w.r.t. the input of the convolution.
+func Conv2DBackpropInput(scope *Scope, input_sizes tf.Output, filter tf.Output, out_backprop tf.Output, strides []int64, padding string, optional ...Conv2DBackpropInputAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Conv2DBackpropInput",
+		Input: []tf.Input{
+			input_sizes, filter, out_backprop,
+		},
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Performs fractional max pooling on the input.
+// Interleave the values from the `data` tensors into a single tensor.
 //
-// Fractional max pooling is slightly different than regular max pooling.  In
-// regular max pooling, you downsize an input set by taking the maximum value of
-// smaller N x N subsections of the set (often 2x2), and try to reduce the set by
-// a factor of N, where N is an integer.  Fractional max pooling, as you might
-// expect from the word "fractional", means that the overall reduction ratio N
-// does not have to be an integer.
+// Builds a merged tensor such that
 //
-// The sizes of the pooling regions are generated randomly but are fairly uniform.
-// For example, let's look at the height dimension, and the constraints on the
-// list of rows that will be pool boundaries.
+// ```python
+//     merged[indices[m][i, ..., j], ...] = data[m][i, ..., j, ...]
+// ```
 //
-// First we define the following:
+// For example, if each `indices[m]` is scalar or vector, we have
 //
-// 1.  input_row_length : the number of rows from the input set
-// 2.  output_row_length : which will be smaller than the input
-// 3.  alpha = input_row_length / output_row_length : our reduction ratio
-// 4.  K = floor(alpha)
-// 5.  row_pooling_sequence : this is the result list of pool boundary rows
+// ```python
+//     # Scalar indices:
+//     merged[indices[m], ...] = data[m][...]
 //
-// Then, row_pooling_sequence should satisfy:
+//     # Vector indices:
+//     merged[indices[m][i], ...] = data[m][i, ...]
+// ```
 //
-// 1.  a[0] = 0 : the first value of the sequence is 0
-// 2.  a[end] = input_row_length : the last value of the sequence is the size
-// 3.  K <= (a[i+1] - a[i]) <= K+1 : all intervals are K or K+1 size
-// 4.  length(row_pooling_sequence) = output_row_length+1
+// Each `data[i].shape` must start with the corresponding `indices[i].shape`,
+// and the rest of `data[i].shape` must be constant w.r.t. `i`.  That is, we
+// must have `data[i].shape = indices[i].shape + constant`.  In terms of this
+// `constant`, the output shape is
 //
-// For more details on fractional max pooling, see this paper:
-// [Benjamin Graham, Fractional Max-Pooling](http://arxiv.org/abs/1412.6071)
+//     merged.shape = [max(indices)] + constant
 //
-// Arguments:
-//	value: 4-D with shape `[batch, height, width, channels]`.
-//	pooling_ratio: Pooling ratio for each dimension of `value`, currently only
-// supports row and col dimension and should be >= 1.0. For example, a valid
-// pooling ratio looks like [1.0, 1.44, 1.73, 1.0]. The first and last elements
-// must be 1.0 because we don't allow pooling on batch and channels
-// dimensions. 1.44 and 1.73 are pooling ratio on height and width dimensions
-// respectively.
+// Values are merged in order, so if an index appears in both `indices[m][i]` and
+// `indices[n][j]` for `(m,i) < (n,j)` the slice `data[n][j]` will appear in the
+// merged result. If you do not need this guarantee, ParallelDynamicStitch might
+// perform better on some devices.
 //
-// Returns output tensor after fractional max pooling.row pooling sequence, needed to calculate gradient.column pooling sequence, needed to calculate gradient.
-func FractionalMaxPool(scope *Scope, value tf.Output, pooling_ratio []float32, optional ...FractionalMaxPoolAttr) (output tf.Output, row_pooling_sequence tf.Output, col_pooling_sequence tf.Output) {
+// For example:
+//
+// ```python
+//     indices[0] = 6
+//     indices[1] = [4, 1]
+//     indices[2] = [[5, 2], [0, 3]]
+//     data[0] = [61, 62]
+//     data[1] = [[41, 42], [11, 12]]
+//     data[2] = [[[51, 52], [21, 22]], [[1, 2], [31, 32]]]
+//     merged = [[1, 2], [11, 12], [21, 22], [31, 32], [41, 42],
+//               [51, 52], [61, 62]]
+// ```
+//
+// This method can be used to merge partitions created by `dynamic_partition`
+// as illustrated on the following example:
+//
+// ```python
+//     # Apply function (increments x_i) on elements for which a certain condition
+//     # apply (x_i != -1 in this example).
+//     x=tf.constant([0.1, -1., 5.2, 4.3, -1., 7.4])
+//     condition_mask=tf.not_equal(x,tf.constant(-1.))
+//     partitioned_data = tf.dynamic_partition(
+//         x, tf.cast(condition_mask, tf.int32) , 2)
+//     partitioned_data[1] = partitioned_data[1] + 1.0
+//     condition_indices = tf.dynamic_partition(
+//         tf.range(tf.shape(x)[0]), tf.cast(condition_mask, tf.int32) , 2)
+//     x = tf.dynamic_stitch(condition_indices, partitioned_data)
+//     # Here x=[1.1, -1., 6.2, 5.3, -1, 8.4], the -1. values remain
+//     # unchanged.
+// ```
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/DynamicStitch.png" alt>
+// </div>
+func DynamicStitch(scope *Scope, indices []tf.Output, data []tf.Output) (merged tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"pooling_ratio": pooling_ratio}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "FractionalMaxPool",
+		Type: "DynamicStitch",
 		Input: []tf.Input{
-			value,
+			tf.OutputList(indices), tf.OutputList(data),
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Deprecated. Use TensorArraySizeV3
+// Returns the truth value of (x == y) element-wise.
 //
-// DEPRECATED at GraphDef version 26: Use TensorArraySizeV3
-func TensorArraySizeV2(scope *Scope, handle tf.Output, flow_in tf.Output) (size tf.Output) {
+// *NOTE*: `Equal` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Equal(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "TensorArraySizeV2",
+		Type: "Equal",
 		Input: []tf.Input{
-			handle, flow_in,
+			x, y,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Conv2DAttr is an optional argument to Conv2D.
-type Conv2DAttr func(optionalAttr)
+// TensorArrayGatherV2Attr is an optional argument to TensorArrayGatherV2.
+type TensorArrayGatherV2Attr func(optionalAttr)
 
-// Conv2DUseCudnnOnGpu sets the optional use_cudnn_on_gpu attribute to value.
-// If not specified, defaults to true
-func Conv2DUseCudnnOnGpu(value bool) Conv2DAttr {
+// TensorArrayGatherV2ElementShape sets the optional element_shape attribute to value.
+// If not specified, defaults to <unknown_rank:true >
+func TensorArrayGatherV2ElementShape(value tf.Shape) TensorArrayGatherV2Attr {
 	return func(m optionalAttr) {
-		m["use_cudnn_on_gpu"] = value
+		m["element_shape"] = value
 	}
 }
 
-// Conv2DDataFormat sets the optional data_format attribute to value.
+// Deprecated. Use TensorArrayGatherV3
 //
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, height, width, channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, channels, height, width].
-// If not specified, defaults to "NHWC"
-func Conv2DDataFormat(value string) Conv2DAttr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
+// DEPRECATED at GraphDef version 26: Use TensorArrayGatherV3
+func TensorArrayGatherV2(scope *Scope, handle tf.Output, indices tf.Output, flow_in tf.Output, dtype tf.DataType, optional ...TensorArrayGatherV2Attr) (value tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// Conv2DDilations sets the optional dilations attribute to value.
-//
-// value: 1-D tensor of length 4.  The dilation factor for each dimension of
-// `input`. If set to k > 1, there will be k-1 skipped cells between each
-// filter element on that dimension. The dimension order is determined by the
-// value of `data_format`, see above for details. Dilations in the batch and
-// depth dimensions must be 1.
-// If not specified, defaults to <i:1 i:1 i:1 i:1 >
-func Conv2DDilations(value []int64) Conv2DAttr {
-	return func(m optionalAttr) {
-		m["dilations"] = value
+	attrs := map[string]interface{}{"dtype": dtype}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "TensorArrayGatherV2",
+		Input: []tf.Input{
+			handle, indices, flow_in,
+		},
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Computes a 2-D convolution given 4-D `input` and `filter` tensors.
+// Interleave the values from the `data` tensors into a single tensor.
 //
-// Given an input tensor of shape `[batch, in_height, in_width, in_channels]`
-// and a filter / kernel tensor of shape
-// `[filter_height, filter_width, in_channels, out_channels]`, this op
-// performs the following:
+// Builds a merged tensor such that
 //
-// 1. Flattens the filter to a 2-D matrix with shape
-//    `[filter_height * filter_width * in_channels, output_channels]`.
-// 2. Extracts image patches from the input tensor to form a *virtual*
-//    tensor of shape `[batch, out_height, out_width,
-//    filter_height * filter_width * in_channels]`.
-// 3. For each patch, right-multiplies the filter matrix and the image patch
-//    vector.
+// ```python
+//     merged[indices[m][i, ..., j], ...] = data[m][i, ..., j, ...]
+// ```
 //
-// In detail, with the default NHWC format,
+// For example, if each `indices[m]` is scalar or vector, we have
 //
-//     output[b, i, j, k] =
-//         sum_{di, dj, q} input[b, strides[1] * i + di, strides[2] * j + dj, q] *
-//                         filter[di, dj, q, k]
+// ```python
+//     # Scalar indices:
+//     merged[indices[m], ...] = data[m][...]
 //
-// Must have `strides[0] = strides[3] = 1`.  For the most common case of the same
-// horizontal and vertices strides, `strides = [1, stride, stride, 1]`.
+//     # Vector indices:
+//     merged[indices[m][i], ...] = data[m][i, ...]
+// ```
 //
-// Arguments:
-//	input: A 4-D tensor. The dimension order is interpreted according to the value
-// of `data_format`, see below for details.
-//	filter: A 4-D tensor of shape
-// `[filter_height, filter_width, in_channels, out_channels]`
-//	strides: 1-D tensor of length 4.  The stride of the sliding window for each
-// dimension of `input`. The dimension order is determined by the value of
-// `data_format`, see below for details.
-//	padding: The type of padding algorithm to use.
+// Each `data[i].shape` must start with the corresponding `indices[i].shape`,
+// and the rest of `data[i].shape` must be constant w.r.t. `i`.  That is, we
+// must have `data[i].shape = indices[i].shape + constant`.  In terms of this
+// `constant`, the output shape is
 //
-// Returns A 4-D tensor. The dimension order is determined by the value of
-// `data_format`, see below for details.
-func Conv2D(scope *Scope, input tf.Output, filter tf.Output, strides []int64, padding string, optional ...Conv2DAttr) (output tf.Output) {
+//     merged.shape = [max(indices)] + constant
+//
+// Values may be merged in parallel, so if an index appears in both `indices[m][i]`
+// and `indices[n][j]`, the result may be invalid. This differs from the normal
+// DynamicStitch operator that defines the behavior in that case.
+//
+// For example:
+//
+// ```python
+//     indices[0] = 6
+//     indices[1] = [4, 1]
+//     indices[2] = [[5, 2], [0, 3]]
+//     data[0] = [61, 62]
+//     data[1] = [[41, 42], [11, 12]]
+//     data[2] = [[[51, 52], [21, 22]], [[1, 2], [31, 32]]]
+//     merged = [[1, 2], [11, 12], [21, 22], [31, 32], [41, 42],
+//               [51, 52], [61, 62]]
+// ```
+//
+// This method can be used to merge partitions created by `dynamic_partition`
+// as illustrated on the following example:
+//
+// ```python
+//     # Apply function (increments x_i) on elements for which a certain condition
+//     # apply (x_i != -1 in this example).
+//     x=tf.constant([0.1, -1., 5.2, 4.3, -1., 7.4])
+//     condition_mask=tf.not_equal(x,tf.constant(-1.))
+//     partitioned_data = tf.dynamic_partition(
+//         x, tf.cast(condition_mask, tf.int32) , 2)
+//     partitioned_data[1] = partitioned_data[1] + 1.0
+//     condition_indices = tf.dynamic_partition(
+//         tf.range(tf.shape(x)[0]), tf.cast(condition_mask, tf.int32) , 2)
+//     x = tf.dynamic_stitch(condition_indices, partitioned_data)
+//     # Here x=[1.1, -1., 6.2, 5.3, -1, 8.4], the -1. values remain
+//     # unchanged.
+// ```
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/DynamicStitch.png" alt>
+// </div>
+func ParallelDynamicStitch(scope *Scope, indices []tf.Output, data []tf.Output) (merged tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
+	opspec := tf.OpSpec{
+		Type: "ParallelDynamicStitch",
+		Input: []tf.Input{
+			tf.OutputList(indices), tf.OutputList(data),
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes the gradient for the inverse of `x` wrt its input.
+//
+// Specifically, `grad = -dy * y*y`, where `y = 1/x`, and `dy`
+// is the corresponding input gradient.
+func InvGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Conv2D",
+		Type: "InvGrad",
 		Input: []tf.Input{
-			input, filter,
+			y, dy,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// StageAttr is an optional argument to Stage.
-type StageAttr func(optionalAttr)
+// PriorityQueueV2Attr is an optional argument to PriorityQueueV2.
+type PriorityQueueV2Attr func(optionalAttr)
 
-// StageCapacity sets the optional capacity attribute to value.
+// PriorityQueueV2ComponentTypes sets the optional component_types attribute to value.
 //
-// value: Maximum number of elements in the Staging Area. If > 0, inserts
-// on the container will block when the capacity is reached.
-// If not specified, defaults to 0
+// value: The type of each component in a value.
+// If not specified, defaults to <>
 //
-// REQUIRES: value >= 0
-func StageCapacity(value int64) StageAttr {
+// REQUIRES: len(value) >= 0
+func PriorityQueueV2ComponentTypes(value []tf.DataType) PriorityQueueV2Attr {
 	return func(m optionalAttr) {
-		m["capacity"] = value
+		m["component_types"] = value
 	}
 }
 
-// StageMemoryLimit sets the optional memory_limit attribute to value.
-//
-// value: The maximum number of bytes allowed for Tensors in the Staging Area.
-// If > 0, inserts will block until sufficient space is available.
-// If not specified, defaults to 0
+// PriorityQueueV2Capacity sets the optional capacity attribute to value.
 //
-// REQUIRES: value >= 0
-func StageMemoryLimit(value int64) StageAttr {
+// value: The upper bound on the number of elements in this queue.
+// Negative numbers mean no limit.
+// If not specified, defaults to -1
+func PriorityQueueV2Capacity(value int64) PriorityQueueV2Attr {
 	return func(m optionalAttr) {
-		m["memory_limit"] = value
+		m["capacity"] = value
 	}
 }
 
-// StageContainer sets the optional container attribute to value.
+// PriorityQueueV2Container sets the optional container attribute to value.
 //
-// value: If non-empty, this queue is placed in the given container. Otherwise,
-// a default container is used.
+// value: If non-empty, this queue is placed in the given container.
+// Otherwise, a default container is used.
 // If not specified, defaults to ""
-func StageContainer(value string) StageAttr {
+func PriorityQueueV2Container(value string) PriorityQueueV2Attr {
 	return func(m optionalAttr) {
 		m["container"] = value
 	}
 }
 
-// StageSharedName sets the optional shared_name attribute to value.
+// PriorityQueueV2SharedName sets the optional shared_name attribute to value.
 //
-// value: It is necessary to match this name to the matching Unstage Op.
+// value: If non-empty, this queue will be shared under the given name
+// across multiple sessions.
 // If not specified, defaults to ""
-func StageSharedName(value string) StageAttr {
+func PriorityQueueV2SharedName(value string) PriorityQueueV2Attr {
 	return func(m optionalAttr) {
 		m["shared_name"] = value
 	}
 }
 
-// Stage values similar to a lightweight Enqueue.
+// A queue that produces elements sorted by the first component value.
 //
-// The basic functionality of this Op is similar to a queue with many
-// fewer capabilities and options.  This Op is optimized for performance.
+// Note that the PriorityQueue requires the first component of any element
+// to be a scalar int64, in addition to the other elements declared by
+// component_types.  Therefore calls to Enqueue and EnqueueMany (resp. Dequeue
+// and DequeueMany) on a PriorityQueue will all require (resp. output) one extra
+// entry in their input (resp. output) lists.
 //
 // Arguments:
-//	values: a list of tensors
-// dtypes A list of data types that inserted values should adhere to.
+//	shapes: The shape of each component in a value. The length of this attr must
+// be either 0 or the same as the length of component_types. If the length of
+// this attr is 0, the shapes of queue elements are not constrained, and
+// only one element may be dequeued at a time.
 //
-// Returns the created operation.
-func Stage(scope *Scope, values []tf.Output, optional ...StageAttr) (o *tf.Operation) {
+// Returns The handle to the queue.
+func PriorityQueueV2(scope *Scope, shapes []tf.Shape, optional ...PriorityQueueV2Attr) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"shapes": shapes}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Stage",
-		Input: []tf.Input{
-			tf.OutputList(values),
-		},
+		Type: "PriorityQueueV2",
+
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// StagePeekAttr is an optional argument to StagePeek.
-type StagePeekAttr func(optionalAttr)
+// UnstageAttr is an optional argument to Unstage.
+type UnstageAttr func(optionalAttr)
 
-// StagePeekCapacity sets the optional capacity attribute to value.
+// UnstageCapacity sets the optional capacity attribute to value.
 // If not specified, defaults to 0
 //
 // REQUIRES: value >= 0
-func StagePeekCapacity(value int64) StagePeekAttr {
+func UnstageCapacity(value int64) UnstageAttr {
 	return func(m optionalAttr) {
 		m["capacity"] = value
 	}
 }
 
-// StagePeekMemoryLimit sets the optional memory_limit attribute to value.
+// UnstageMemoryLimit sets the optional memory_limit attribute to value.
 // If not specified, defaults to 0
 //
 // REQUIRES: value >= 0
-func StagePeekMemoryLimit(value int64) StagePeekAttr {
+func UnstageMemoryLimit(value int64) UnstageAttr {
 	return func(m optionalAttr) {
 		m["memory_limit"] = value
 	}
 }
 
-// StagePeekContainer sets the optional container attribute to value.
+// UnstageContainer sets the optional container attribute to value.
 // If not specified, defaults to ""
-func StagePeekContainer(value string) StagePeekAttr {
+func UnstageContainer(value string) UnstageAttr {
 	return func(m optionalAttr) {
 		m["container"] = value
 	}
 }
 
-// StagePeekSharedName sets the optional shared_name attribute to value.
+// UnstageSharedName sets the optional shared_name attribute to value.
 // If not specified, defaults to ""
-func StagePeekSharedName(value string) StagePeekAttr {
+func UnstageSharedName(value string) UnstageAttr {
 	return func(m optionalAttr) {
 		m["shared_name"] = value
 	}
 }
 
-// Op peeks at the values at the specified index.  If the
+// Op is similar to a lightweight Dequeue.
 //
-// underlying container does not contain sufficient elements
-// this op will block until it does.   This Op is optimized for
-// performance.
-func StagePeek(scope *Scope, index tf.Output, dtypes []tf.DataType, optional ...StagePeekAttr) (values []tf.Output) {
+// The basic functionality is similar to dequeue with many fewer
+// capabilities and options.  This Op is optimized for performance.
+func Unstage(scope *Scope, dtypes []tf.DataType, optional ...UnstageAttr) (values []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -25216,10 +23959,8 @@ func StagePeek(scope *Scope, index tf.Output, dtypes []tf.DataType, optional ...
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "StagePeek",
-		Input: []tf.Input{
-			index,
-		},
+		Type: "Unstage",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
@@ -25229,142 +23970,107 @@ func StagePeek(scope *Scope, index tf.Output, dtypes []tf.DataType, optional ...
 	var idx int
 	var err error
 	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
-		scope.UpdateErr("StagePeek", err)
+		scope.UpdateErr("Unstage", err)
 		return
 	}
 	return values
 }
 
-// MapStageAttr is an optional argument to MapStage.
-type MapStageAttr func(optionalAttr)
+// QueueEnqueueV2Attr is an optional argument to QueueEnqueueV2.
+type QueueEnqueueV2Attr func(optionalAttr)
 
-// MapStageCapacity sets the optional capacity attribute to value.
-//
-// value: Maximum number of elements in the Staging Area. If > 0, inserts
-// on the container will block when the capacity is reached.
-// If not specified, defaults to 0
+// QueueEnqueueV2TimeoutMs sets the optional timeout_ms attribute to value.
 //
-// REQUIRES: value >= 0
-func MapStageCapacity(value int64) MapStageAttr {
+// value: If the queue is full, this operation will block for up to
+// timeout_ms milliseconds.
+// Note: This option is not supported yet.
+// If not specified, defaults to -1
+func QueueEnqueueV2TimeoutMs(value int64) QueueEnqueueV2Attr {
 	return func(m optionalAttr) {
-		m["capacity"] = value
+		m["timeout_ms"] = value
 	}
 }
 
-// MapStageMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
+// Enqueues a tuple of one or more tensors in the given queue.
 //
-// REQUIRES: value >= 0
-func MapStageMemoryLimit(value int64) MapStageAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
-	}
-}
-
-// MapStageContainer sets the optional container attribute to value.
+// The components input has k elements, which correspond to the components of
+// tuples stored in the given queue.
 //
-// value: If non-empty, this queue is placed in the given container. Otherwise,
-// a default container is used.
-// If not specified, defaults to ""
-func MapStageContainer(value string) MapStageAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// MapStageSharedName sets the optional shared_name attribute to value.
-//
-// value: It is necessary to match this name to the matching Unstage Op.
-// If not specified, defaults to ""
-func MapStageSharedName(value string) MapStageAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// Stage (key, values) in the underlying container which behaves like a hashtable.
+// N.B. If the queue is full, this operation will block until the given
+// element has been enqueued (or 'timeout_ms' elapses, if specified).
 //
 // Arguments:
-//	key: int64
-//
-//	values: a list of tensors
-// dtypes A list of data types that inserted values should adhere to.
-//
+//	handle: The handle to a queue.
+//	components: One or more tensors from which the enqueued tensors should be taken.
 //
 // Returns the created operation.
-func MapStage(scope *Scope, key tf.Output, indices tf.Output, values []tf.Output, dtypes []tf.DataType, optional ...MapStageAttr) (o *tf.Operation) {
+func QueueEnqueueV2(scope *Scope, handle tf.Output, components []tf.Output, optional ...QueueEnqueueV2Attr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MapStage",
+		Type: "QueueEnqueueV2",
 		Input: []tf.Input{
-			key, indices, tf.OutputList(values),
+			handle, tf.OutputList(components),
 		},
 		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// MapUnstageAttr is an optional argument to MapUnstage.
-type MapUnstageAttr func(optionalAttr)
+// QueueDequeueManyV2Attr is an optional argument to QueueDequeueManyV2.
+type QueueDequeueManyV2Attr func(optionalAttr)
 
-// MapUnstageCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
+// QueueDequeueManyV2TimeoutMs sets the optional timeout_ms attribute to value.
 //
-// REQUIRES: value >= 0
-func MapUnstageCapacity(value int64) MapUnstageAttr {
+// value: If the queue has fewer than n elements, this operation
+// will block for up to timeout_ms milliseconds.
+// Note: This option is not supported yet.
+// If not specified, defaults to -1
+func QueueDequeueManyV2TimeoutMs(value int64) QueueDequeueManyV2Attr {
 	return func(m optionalAttr) {
-		m["capacity"] = value
+		m["timeout_ms"] = value
 	}
 }
 
-// MapUnstageMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
+// Dequeues `n` tuples of one or more tensors from the given queue.
 //
-// REQUIRES: value >= 0
-func MapUnstageMemoryLimit(value int64) MapUnstageAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
-	}
-}
-
-// MapUnstageContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func MapUnstageContainer(value string) MapUnstageAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// MapUnstageSharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func MapUnstageSharedName(value string) MapUnstageAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// Op removes and returns the values associated with the key
+// If the queue is closed and there are fewer than `n` elements, then an
+// OutOfRange error is returned.
 //
-// from the underlying container.   If the underlying container
-// does not contain this key, the op will block until it does.
-func MapUnstage(scope *Scope, key tf.Output, indices tf.Output, dtypes []tf.DataType, optional ...MapUnstageAttr) (values []tf.Output) {
+// This operation concatenates queue-element component tensors along the
+// 0th dimension to make a single component tensor.  All of the components
+// in the dequeued tuple will have size `n` in the 0th dimension.
+//
+// This operation has `k` outputs, where `k` is the number of components in
+// the tuples stored in the given queue, and output `i` is the ith
+// component of the dequeued tuple.
+//
+// N.B. If the queue is empty, this operation will block until `n` elements
+// have been dequeued (or 'timeout_ms' elapses, if specified).
+//
+// Arguments:
+//	handle: The handle to a queue.
+//	n: The number of tuples to dequeue.
+//	component_types: The type of each component in a tuple.
+//
+// Returns One or more tensors that were dequeued as a tuple.
+func QueueDequeueManyV2(scope *Scope, handle tf.Output, n tf.Output, component_types []tf.DataType, optional ...QueueDequeueManyV2Attr) (components []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
+	attrs := map[string]interface{}{"component_types": component_types}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MapUnstage",
+		Type: "QueueDequeueManyV2",
 		Input: []tf.Input{
-			key, indices,
+			handle, n,
 		},
 		Attrs: attrs,
 	}
@@ -25374,239 +24080,352 @@ func MapUnstage(scope *Scope, key tf.Output, indices tf.Output, dtypes []tf.Data
 	}
 	var idx int
 	var err error
-	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
-		scope.UpdateErr("MapUnstage", err)
+	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
+		scope.UpdateErr("QueueDequeueManyV2", err)
 		return
 	}
-	return values
+	return components
 }
 
-// MapSizeAttr is an optional argument to MapSize.
-type MapSizeAttr func(optionalAttr)
+// EncodeBase64Attr is an optional argument to EncodeBase64.
+type EncodeBase64Attr func(optionalAttr)
 
-// MapSizeCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
+// EncodeBase64Pad sets the optional pad attribute to value.
 //
-// REQUIRES: value >= 0
-func MapSizeCapacity(value int64) MapSizeAttr {
+// value: Bool whether padding is applied at the ends.
+// If not specified, defaults to false
+func EncodeBase64Pad(value bool) EncodeBase64Attr {
 	return func(m optionalAttr) {
-		m["capacity"] = value
+		m["pad"] = value
 	}
 }
 
-// MapSizeMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
+// Encode strings into web-safe base64 format.
 //
-// REQUIRES: value >= 0
-func MapSizeMemoryLimit(value int64) MapSizeAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
-	}
-}
-
-// MapSizeContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func MapSizeContainer(value string) MapSizeAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// MapSizeSharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func MapSizeSharedName(value string) MapSizeAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// Op returns the number of elements in the underlying container.
-func MapSize(scope *Scope, dtypes []tf.DataType, optional ...MapSizeAttr) (size tf.Output) {
+// Refer to the following article for more information on base64 format:
+// en.wikipedia.org/wiki/Base64. Base64 strings may have padding with '=' at the
+// end so that the encoded has length multiple of 4. See Padding section of the
+// link above.
+//
+// Web-safe means that the encoder uses - and _ instead of + and /.
+//
+// Arguments:
+//	input: Strings to be encoded.
+//
+// Returns Input strings encoded in base64.
+func EncodeBase64(scope *Scope, input tf.Output, optional ...EncodeBase64Attr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MapSize",
-
+		Type: "EncodeBase64",
+		Input: []tf.Input{
+			input,
+		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// MapIncompleteSizeAttr is an optional argument to MapIncompleteSize.
-type MapIncompleteSizeAttr func(optionalAttr)
-
-// MapIncompleteSizeCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
+// Deprecated. Use TensorArrayCloseV3
 //
-// REQUIRES: value >= 0
-func MapIncompleteSizeCapacity(value int64) MapIncompleteSizeAttr {
-	return func(m optionalAttr) {
-		m["capacity"] = value
-	}
-}
-
-// MapIncompleteSizeMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
+// DEPRECATED at GraphDef version 26: Use TensorArrayCloseV3
 //
-// REQUIRES: value >= 0
-func MapIncompleteSizeMemoryLimit(value int64) MapIncompleteSizeAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
-	}
-}
-
-// MapIncompleteSizeContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func MapIncompleteSizeContainer(value string) MapIncompleteSizeAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
+// Returns the created operation.
+func TensorArrayCloseV2(scope *Scope, handle tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// MapIncompleteSizeSharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func MapIncompleteSizeSharedName(value string) MapIncompleteSizeAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
+	opspec := tf.OpSpec{
+		Type: "TensorArrayCloseV2",
+		Input: []tf.Input{
+			handle,
+		},
 	}
+	return scope.AddOperation(opspec)
 }
 
-// Op returns the number of incomplete elements in the underlying container.
-func MapIncompleteSize(scope *Scope, dtypes []tf.DataType, optional ...MapIncompleteSizeAttr) (size tf.Output) {
+// Forwards the value of an available tensor from `inputs` to `output`.
+//
+// `Merge` waits for at least one of the tensors in `inputs` to become available.
+// It is usually combined with `Switch` to implement branching.
+//
+// `Merge` forwards the first tensor to become available to `output`, and sets
+// `value_index` to its index in `inputs`.
+//
+// Arguments:
+//	inputs: The input tensors, exactly one of which will become available.
+//
+// Returns Will be set to the available input tensor.The index of the chosen input tensor in `inputs`.
+func Merge(scope *Scope, inputs []tf.Output) (output tf.Output, value_index tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "MapIncompleteSize",
-
-		Attrs: attrs,
+		Type: "Merge",
+		Input: []tf.Input{
+			tf.OutputList(inputs),
+		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// OrderedMapUnstageAttr is an optional argument to OrderedMapUnstage.
-type OrderedMapUnstageAttr func(optionalAttr)
+// QueueCloseV2Attr is an optional argument to QueueCloseV2.
+type QueueCloseV2Attr func(optionalAttr)
 
-// OrderedMapUnstageCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
+// QueueCloseV2CancelPendingEnqueues sets the optional cancel_pending_enqueues attribute to value.
 //
-// REQUIRES: value >= 0
-func OrderedMapUnstageCapacity(value int64) OrderedMapUnstageAttr {
+// value: If true, all pending enqueue requests that are
+// blocked on the given queue will be canceled.
+// If not specified, defaults to false
+func QueueCloseV2CancelPendingEnqueues(value bool) QueueCloseV2Attr {
 	return func(m optionalAttr) {
-		m["capacity"] = value
+		m["cancel_pending_enqueues"] = value
 	}
 }
 
-// OrderedMapUnstageMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
+// Closes the given queue.
 //
-// REQUIRES: value >= 0
-func OrderedMapUnstageMemoryLimit(value int64) OrderedMapUnstageAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
+// This operation signals that no more elements will be enqueued in the
+// given queue. Subsequent Enqueue(Many) operations will fail.
+// Subsequent Dequeue(Many) operations will continue to succeed if
+// sufficient elements remain in the queue. Subsequent Dequeue(Many)
+// operations that would block will fail immediately.
+//
+// Arguments:
+//	handle: The handle to a queue.
+//
+// Returns the created operation.
+func QueueCloseV2(scope *Scope, handle tf.Output, optional ...QueueCloseV2Attr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "QueueCloseV2",
+		Input: []tf.Input{
+			handle,
+		},
+		Attrs: attrs,
 	}
+	return scope.AddOperation(opspec)
 }
 
-// OrderedMapUnstageContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func OrderedMapUnstageContainer(value string) OrderedMapUnstageAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
+// Computes inverse hyperbolic tangent of x element-wise.
+func Atanh(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
+	opspec := tf.OpSpec{
+		Type: "Atanh",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// OrderedMapUnstageSharedName sets the optional shared_name attribute to value.
+// Returns true if queue is closed.
+//
+// This operation returns true if the queue is closed and false if the queue
+// is open.
+//
+// Arguments:
+//	handle: The handle to a queue.
+func QueueIsClosedV2(scope *Scope, handle tf.Output) (is_closed tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "QueueIsClosedV2",
+		Input: []tf.Input{
+			handle,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Returns the batched diagonal part of a batched tensor.
+//
+// This operation returns a tensor with the `diagonal` part
+// of the batched `input`. The `diagonal` part is computed as follows:
+//
+// Assume `input` has `k` dimensions `[I, J, K, ..., M, N]`, then the output is a
+// tensor of rank `k - 1` with dimensions `[I, J, K, ..., min(M, N)]` where:
+//
+// `diagonal[i, j, k, ..., n] = input[i, j, k, ..., n, n]`.
+//
+// The input must be at least a matrix.
+//
+// For example:
+//
+// ```
+// # 'input' is [[[1, 0, 0, 0]
+//                [0, 2, 0, 0]
+//                [0, 0, 3, 0]
+//                [0, 0, 0, 4]],
+//               [[5, 0, 0, 0]
+//                [0, 6, 0, 0]
+//                [0, 0, 7, 0]
+//                [0, 0, 0, 8]]]
+//
+// and input.shape = (2, 4, 4)
+//
+// tf.matrix_diag_part(input) ==> [[1, 2, 3, 4], [5, 6, 7, 8]]
+//
+// which has shape (2, 4)
+// ```
+//
+// Arguments:
+//	input: Rank `k` tensor where `k >= 2`.
+//
+// Returns The extracted diagonal(s) having shape
+// `diagonal.shape = input.shape[:-2] + [min(input.shape[-2:])]`.
+func MatrixDiagPart(scope *Scope, input tf.Output) (diagonal tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "MatrixDiagPart",
+		Input: []tf.Input{
+			input,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes the absolute value of a tensor.
+//
+// Given a tensor `x`, this operation returns a tensor containing the absolute
+// value of each element in `x`. For example, if x is an input element and y is
+// an output element, this operation computes \\(y = |x|\\).
+func Abs(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Abs",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// StackV2Attr is an optional argument to StackV2.
+type StackV2Attr func(optionalAttr)
+
+// StackV2StackName sets the optional stack_name attribute to value.
+//
+// value: Overrides the name used for the temporary stack resource. Default
+// value is the name of the 'Stack' op (which is guaranteed unique).
 // If not specified, defaults to ""
-func OrderedMapUnstageSharedName(value string) OrderedMapUnstageAttr {
+func StackV2StackName(value string) StackV2Attr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["stack_name"] = value
 	}
 }
 
-// Op removes and returns the values associated with the key
+// A stack that produces elements in first-in last-out order.
 //
-// from the underlying container.   If the underlying container
-// does not contain this key, the op will block until it does.
-func OrderedMapUnstage(scope *Scope, key tf.Output, indices tf.Output, dtypes []tf.DataType, optional ...OrderedMapUnstageAttr) (values []tf.Output) {
+// Arguments:
+//	max_size: The maximum size of the stack if non-negative. If negative, the stack
+// size is unlimited.
+//	elem_type: The type of the elements on the stack.
+//
+// Returns The handle to the stack.
+func StackV2(scope *Scope, max_size tf.Output, elem_type tf.DataType, optional ...StackV2Attr) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
+	attrs := map[string]interface{}{"elem_type": elem_type}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "OrderedMapUnstage",
+		Type: "StackV2",
 		Input: []tf.Input{
-			key, indices,
+			max_size,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
-		scope.UpdateErr("OrderedMapUnstage", err)
-		return
-	}
-	return values
+	return op.Output(0)
 }
 
-// OrderedMapSizeAttr is an optional argument to OrderedMapSize.
-type OrderedMapSizeAttr func(optionalAttr)
+// OrderedMapStageAttr is an optional argument to OrderedMapStage.
+type OrderedMapStageAttr func(optionalAttr)
 
-// OrderedMapSizeCapacity sets the optional capacity attribute to value.
+// OrderedMapStageCapacity sets the optional capacity attribute to value.
+//
+// value: Maximum number of elements in the Staging Area. If > 0, inserts
+// on the container will block when the capacity is reached.
 // If not specified, defaults to 0
 //
 // REQUIRES: value >= 0
-func OrderedMapSizeCapacity(value int64) OrderedMapSizeAttr {
+func OrderedMapStageCapacity(value int64) OrderedMapStageAttr {
 	return func(m optionalAttr) {
 		m["capacity"] = value
 	}
 }
 
-// OrderedMapSizeMemoryLimit sets the optional memory_limit attribute to value.
+// OrderedMapStageMemoryLimit sets the optional memory_limit attribute to value.
 // If not specified, defaults to 0
 //
 // REQUIRES: value >= 0
-func OrderedMapSizeMemoryLimit(value int64) OrderedMapSizeAttr {
+func OrderedMapStageMemoryLimit(value int64) OrderedMapStageAttr {
 	return func(m optionalAttr) {
 		m["memory_limit"] = value
 	}
 }
 
-// OrderedMapSizeContainer sets the optional container attribute to value.
+// OrderedMapStageContainer sets the optional container attribute to value.
+//
+// value: If non-empty, this queue is placed in the given container. Otherwise,
+// a default container is used.
 // If not specified, defaults to ""
-func OrderedMapSizeContainer(value string) OrderedMapSizeAttr {
+func OrderedMapStageContainer(value string) OrderedMapStageAttr {
 	return func(m optionalAttr) {
 		m["container"] = value
 	}
 }
 
-// OrderedMapSizeSharedName sets the optional shared_name attribute to value.
+// OrderedMapStageSharedName sets the optional shared_name attribute to value.
+//
+// value: It is necessary to match this name to the matching Unstage Op.
 // If not specified, defaults to ""
-func OrderedMapSizeSharedName(value string) OrderedMapSizeAttr {
+func OrderedMapStageSharedName(value string) OrderedMapStageAttr {
 	return func(m optionalAttr) {
 		m["shared_name"] = value
 	}
 }
 
-// Op returns the number of elements in the underlying container.
-func OrderedMapSize(scope *Scope, dtypes []tf.DataType, optional ...OrderedMapSizeAttr) (size tf.Output) {
+// Stage (key, values) in the underlying container which behaves like a ordered
+//
+// associative container.   Elements are ordered by key.
+//
+// Arguments:
+//	key: int64
+//
+//	values: a list of tensors
+// dtypes A list of data types that inserted values should adhere to.
+//
+//
+// Returns the created operation.
+func OrderedMapStage(scope *Scope, key tf.Output, indices tf.Output, values []tf.Output, dtypes []tf.DataType, optional ...OrderedMapStageAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -25615,29 +24434,36 @@ func OrderedMapSize(scope *Scope, dtypes []tf.DataType, optional ...OrderedMapSi
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "OrderedMapSize",
-
+		Type: "OrderedMapStage",
+		Input: []tf.Input{
+			key, indices, tf.OutputList(values),
+		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// ShapeNAttr is an optional argument to ShapeN.
-type ShapeNAttr func(optionalAttr)
+// StackPushV2Attr is an optional argument to StackPushV2.
+type StackPushV2Attr func(optionalAttr)
 
-// ShapeNOutType sets the optional out_type attribute to value.
-// If not specified, defaults to DT_INT32
-func ShapeNOutType(value tf.DataType) ShapeNAttr {
+// StackPushV2SwapMemory sets the optional swap_memory attribute to value.
+//
+// value: Swap `elem` to CPU. Default to false.
+// If not specified, defaults to false
+func StackPushV2SwapMemory(value bool) StackPushV2Attr {
 	return func(m optionalAttr) {
-		m["out_type"] = value
+		m["swap_memory"] = value
 	}
 }
 
-// Returns shape of tensors.
+// Push an element onto the stack.
 //
-// This operation returns N 1-D integer tensors representing shape of `input[i]s`.
-func ShapeN(scope *Scope, input []tf.Output, optional ...ShapeNAttr) (output []tf.Output) {
+// Arguments:
+//	handle: The handle to a stack.
+//	elem: The tensor to be pushed onto the stack.
+//
+// Returns The same tensor as the input 'elem'.
+func StackPushV2(scope *Scope, handle tf.Output, elem tf.Output, optional ...StackPushV2Attr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -25646,152 +24472,120 @@ func ShapeN(scope *Scope, input []tf.Output, optional ...ShapeNAttr) (output []t
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ShapeN",
+		Type: "StackPushV2",
 		Input: []tf.Input{
-			tf.OutputList(input),
+			handle, elem,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
-		scope.UpdateErr("ShapeN", err)
-		return
-	}
-	return output
+	return op.Output(0)
 }
 
-// UniformCandidateSamplerAttr is an optional argument to UniformCandidateSampler.
-type UniformCandidateSamplerAttr func(optionalAttr)
+// FusedBatchNormGradV2Attr is an optional argument to FusedBatchNormGradV2.
+type FusedBatchNormGradV2Attr func(optionalAttr)
 
-// UniformCandidateSamplerSeed sets the optional seed attribute to value.
+// FusedBatchNormGradV2Epsilon sets the optional epsilon attribute to value.
 //
-// value: If either seed or seed2 are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func UniformCandidateSamplerSeed(value int64) UniformCandidateSamplerAttr {
+// value: A small float number added to the variance of x.
+// If not specified, defaults to 0.0001
+func FusedBatchNormGradV2Epsilon(value float32) FusedBatchNormGradV2Attr {
 	return func(m optionalAttr) {
-		m["seed"] = value
+		m["epsilon"] = value
 	}
 }
 
-// UniformCandidateSamplerSeed2 sets the optional seed2 attribute to value.
+// FusedBatchNormGradV2DataFormat sets the optional data_format attribute to value.
 //
-// value: An second seed to avoid seed collision.
-// If not specified, defaults to 0
-func UniformCandidateSamplerSeed2(value int64) UniformCandidateSamplerAttr {
+// value: The data format for y_backprop, x, x_backprop.
+// Either "NHWC" (default) or "NCHW".
+// If not specified, defaults to "NHWC"
+func FusedBatchNormGradV2DataFormat(value string) FusedBatchNormGradV2Attr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["data_format"] = value
 	}
 }
 
-// Generates labels for candidate sampling with a uniform distribution.
-//
-// See explanations of candidate sampling and the data formats at
-// go/candidate-sampling.
+// FusedBatchNormGradV2IsTraining sets the optional is_training attribute to value.
 //
-// For each batch, this op picks a single set of sampled candidate labels.
+// value: A bool value to indicate the operation is for training (default)
+// or inference.
+// If not specified, defaults to true
+func FusedBatchNormGradV2IsTraining(value bool) FusedBatchNormGradV2Attr {
+	return func(m optionalAttr) {
+		m["is_training"] = value
+	}
+}
+
+// Gradient for batch normalization.
 //
-// The advantages of sampling candidates per-batch are simplicity and the
-// possibility of efficient dense matrix multiplication. The disadvantage is that
-// the sampled candidates must be chosen independently of the context and of the
-// true labels.
+// Note that the size of 4D Tensors are defined by either "NHWC" or "NCHW".
+// The size of 1D Tensors matches the dimension C of the 4D Tensors.
 //
 // Arguments:
-//	true_classes: A batch_size * num_true matrix, in which each row contains the
-// IDs of the num_true target_classes in the corresponding original label.
-//	num_true: Number of true labels per context.
-//	num_sampled: Number of candidates to randomly sample.
-//	unique: If unique is true, we sample with rejection, so that all sampled
-// candidates in a batch are unique. This requires some approximation to
-// estimate the post-rejection sampling probabilities.
-//	range_max: The sampler will sample integers from the interval [0, range_max).
+//	y_backprop: A 4D Tensor for the gradient with respect to y.
+//	x: A 4D Tensor for input data.
+//	scale: A 1D Tensor for scaling factor, to scale the normalized x.
+//	reserve_space_1: When is_training is True, a 1D Tensor for the computed batch
+// mean to be reused in gradient computation. When is_training is
+// False, a 1D Tensor for the population mean to be reused in both
+// 1st and 2nd order gradient computation.
+//	reserve_space_2: When is_training is True, a 1D Tensor for the computed batch
+// variance (inverted variance in the cuDNN case) to be reused in
+// gradient computation. When is_training is False, a 1D Tensor
+// for the population variance to be reused in both 1st and 2nd
+// order gradient computation.
 //
-// Returns A vector of length num_sampled, in which each element is
-// the ID of a sampled candidate.A batch_size * num_true matrix, representing
-// the number of times each candidate is expected to occur in a batch
-// of sampled candidates. If unique=true, then this is a probability.A vector of length num_sampled, for each sampled
-// candidate representing the number of times the candidate is expected
-// to occur in a batch of sampled candidates.  If unique=true, then this is a
-// probability.
-func UniformCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, range_max int64, optional ...UniformCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) {
+// Returns A 4D Tensor for the gradient with respect to x.A 1D Tensor for the gradient with respect to scale.A 1D Tensor for the gradient with respect to offset.Unused placeholder to match the mean input in FusedBatchNorm.Unused placeholder to match the variance input
+// in FusedBatchNorm.
+func FusedBatchNormGradV2(scope *Scope, y_backprop tf.Output, x tf.Output, scale tf.Output, reserve_space_1 tf.Output, reserve_space_2 tf.Output, optional ...FusedBatchNormGradV2Attr) (x_backprop tf.Output, scale_backprop tf.Output, offset_backprop tf.Output, reserve_space_3 tf.Output, reserve_space_4 tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique, "range_max": range_max}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "UniformCandidateSampler",
+		Type: "FusedBatchNormGradV2",
 		Input: []tf.Input{
-			true_classes,
+			y_backprop, x, scale, reserve_space_1, reserve_space_2,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
 }
 
-// CTCLossAttr is an optional argument to CTCLoss.
-type CTCLossAttr func(optionalAttr)
-
-// CTCLossPreprocessCollapseRepeated sets the optional preprocess_collapse_repeated attribute to value.
-//
-// value: Scalar, if true then repeated labels are
-// collapsed prior to the CTC calculation.
-// If not specified, defaults to false
-func CTCLossPreprocessCollapseRepeated(value bool) CTCLossAttr {
-	return func(m optionalAttr) {
-		m["preprocess_collapse_repeated"] = value
-	}
-}
+// DecodeCompressedAttr is an optional argument to DecodeCompressed.
+type DecodeCompressedAttr func(optionalAttr)
 
-// CTCLossCtcMergeRepeated sets the optional ctc_merge_repeated attribute to value.
+// DecodeCompressedCompressionType sets the optional compression_type attribute to value.
 //
-// value: Scalar.  If set to false, *during* CTC calculation
-// repeated non-blank labels will not be merged and are interpreted as
-// individual labels.  This is a simplified version of CTC.
-// If not specified, defaults to true
-func CTCLossCtcMergeRepeated(value bool) CTCLossAttr {
+// value: A scalar containing either (i) the empty string (no
+// compression), (ii) "ZLIB", or (iii) "GZIP".
+// If not specified, defaults to ""
+func DecodeCompressedCompressionType(value string) DecodeCompressedAttr {
 	return func(m optionalAttr) {
-		m["ctc_merge_repeated"] = value
+		m["compression_type"] = value
 	}
 }
 
-// CTCLossIgnoreLongerOutputsThanInputs sets the optional ignore_longer_outputs_than_inputs attribute to value.
+// Decompress strings.
 //
-// value: Scalar. If set to true, during CTC
-// calculation, items that have longer output sequences than input sequences
-// are skipped: they don't contribute to the loss term and have zero-gradient.
-// If not specified, defaults to false
-func CTCLossIgnoreLongerOutputsThanInputs(value bool) CTCLossAttr {
-	return func(m optionalAttr) {
-		m["ignore_longer_outputs_than_inputs"] = value
-	}
-}
-
-// Calculates the CTC Loss (log probability) for each batch entry.  Also calculates
+// This op decompresses each element of the `bytes` input `Tensor`, which
+// is assumed to be compressed using the given `compression_type`.
 //
-// the gradient.  This class performs the softmax operation for you, so inputs
-// should be e.g. linear projections of outputs by an LSTM.
+// The `output` is a string `Tensor` of the same shape as `bytes`,
+// each element containing the decompressed data from the corresponding
+// element in `bytes`.
 //
 // Arguments:
-//	inputs: 3-D, shape: `(max_time x batch_size x num_classes)`, the logits.
-//	labels_indices: The indices of a `SparseTensor<int32, 2>`.
-// `labels_indices(i, :) == [b, t]` means `labels_values(i)` stores the id for
-// `(batch b, time t)`.
-//	labels_values: The values (labels) associated with the given batch and time.
-//	sequence_length: A vector containing sequence lengths (batch).
+//	bytes: A Tensor of string which is compressed.
 //
-// Returns A vector (batch) containing log-probabilities.The gradient of `loss`.  3-D, shape:
-// `(max_time x batch_size x num_classes)`.
-func CTCLoss(scope *Scope, inputs tf.Output, labels_indices tf.Output, labels_values tf.Output, sequence_length tf.Output, optional ...CTCLossAttr) (loss tf.Output, gradient tf.Output) {
+// Returns A Tensor with the same shape as input `bytes`, uncompressed
+// from bytes.
+func DecodeCompressed(scope *Scope, bytes tf.Output, optional ...DecodeCompressedAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -25800,256 +24594,325 @@ func CTCLoss(scope *Scope, inputs tf.Output, labels_indices tf.Output, labels_va
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "CTCLoss",
+		Type: "DecodeCompressed",
 		Input: []tf.Input{
-			inputs, labels_indices, labels_values, sequence_length,
+			bytes,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// CTCGreedyDecoderAttr is an optional argument to CTCGreedyDecoder.
-type CTCGreedyDecoderAttr func(optionalAttr)
-
-// CTCGreedyDecoderMergeRepeated sets the optional merge_repeated attribute to value.
+// Compare values of `input` to `threshold` and pack resulting bits into a `uint8`.
 //
-// value: If True, merge repeated classes in output.
-// If not specified, defaults to false
-func CTCGreedyDecoderMergeRepeated(value bool) CTCGreedyDecoderAttr {
-	return func(m optionalAttr) {
-		m["merge_repeated"] = value
-	}
-}
-
-// Performs greedy decoding on the logits given in inputs.
+// Each comparison returns a boolean `true` (if `input_value > threshold`)
+// or and `false` otherwise.
 //
-// A note about the attribute merge_repeated: if enabled, when
-// consecutive logits' maximum indices are the same, only the first of
-// these is emitted.  Labeling the blank '*', the sequence "A B B * B B"
-// becomes "A B B" if merge_repeated = True and "A B B B B" if
-// merge_repeated = False.
+// This operation is useful for Locality-Sensitive-Hashing (LSH) and other
+// algorithms that use hashing approximations of cosine and `L2` distances;
+// codes can be generated from an input via:
 //
-// Regardless of the value of merge_repeated, if the maximum index of a given
-// time and batch corresponds to the blank, index `(num_classes - 1)`, no new
-// element is emitted.
+// ```python
+// codebook_size = 50
+// codebook_bits = codebook_size * 32
+// codebook = tf.get_variable('codebook', [x.shape[-1].value, codebook_bits],
+//                            dtype=x.dtype,
+//                            initializer=tf.orthogonal_initializer())
+// codes = compare_and_threshold(tf.matmul(x, codebook), threshold=0.)
+// codes = tf.bitcast(codes, tf.int32)  # go from uint8 to int32
+// # now codes has shape x.shape[:-1] + [codebook_size]
+// ```
+//
+// **NOTE**: Currently, the innermost dimension of the tensor must be divisible
+// by 8.
+//
+// Given an `input` shaped `[s0, s1, ..., s_n]`, the output is
+// a `uint8` tensor shaped `[s0, s1, ..., s_n / 8]`.
 //
 // Arguments:
-//	inputs: 3-D, shape: `(max_time x batch_size x num_classes)`, the logits.
-//	sequence_length: A vector containing sequence lengths, size `(batch_size)`.
+//	input: Values to compare against `threshold` and bitpack.
+//	threshold: Threshold to compare against.
 //
-// Returns Indices matrix, size `(total_decoded_outputs x 2)`,
-// of a `SparseTensor<int64, 2>`.  The rows store: [batch, time].Values vector, size: `(total_decoded_outputs)`,
-// of a `SparseTensor<int64, 2>`.  The vector stores the decoded classes.Shape vector, size `(2)`, of the decoded SparseTensor.
-// Values are: `[batch_size, max_decoded_length]`.Matrix, size `(batch_size x 1)`, containing sequence
-// log-probabilities.
-func CTCGreedyDecoder(scope *Scope, inputs tf.Output, sequence_length tf.Output, optional ...CTCGreedyDecoderAttr) (decoded_indices tf.Output, decoded_values tf.Output, decoded_shape tf.Output, log_probability tf.Output) {
+// Returns The bitpacked comparisons.
+func CompareAndBitpack(scope *Scope, input tf.Output, threshold tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "CTCGreedyDecoder",
+		Type: "CompareAndBitpack",
 		Input: []tf.Input{
-			inputs, sequence_length,
+			input, threshold,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3)
+	return op.Output(0)
 }
 
-// Forwards `data` to the output port determined by `pred`.
-//
-// If `pred` is true, the `data` input is forwarded to `output_true`. Otherwise,
-// the data goes to `output_false`.
-//
-// See also `RefSwitch` and `Merge`.
+// Push an element onto the tensor_array.
 //
 // Arguments:
-//	data: The tensor to be forwarded to the appropriate output.
-//	pred: A scalar that specifies which output port will receive data.
+//	handle: The handle to a TensorArray.
+//	index: The position to write to inside the TensorArray.
+//	value: The tensor to write to the TensorArray.
+//	flow_in: A float scalar that enforces proper chaining of operations.
 //
-// Returns If `pred` is false, data will be forwarded to this output.If `pred` is true, data will be forwarded to this output.
-func Switch(scope *Scope, data tf.Output, pred tf.Output) (output_false tf.Output, output_true tf.Output) {
+// Returns A float scalar that enforces proper chaining of operations.
+func TensorArrayWriteV3(scope *Scope, handle tf.Output, index tf.Output, value tf.Output, flow_in tf.Output) (flow_out tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Switch",
+		Type: "TensorArrayWriteV3",
 		Input: []tf.Input{
-			data, pred,
+			handle, index, value, flow_in,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// Add all input tensors element wise.
+// Scatter the data from the input value into specific TensorArray elements.
+//
+// `indices` must be a vector, its length must match the first dim of `value`.
 //
 // Arguments:
-//	inputs: Must all be the same size and shape.
-func AddN(scope *Scope, inputs []tf.Output) (sum tf.Output) {
+//	handle: The handle to a TensorArray.
+//	indices: The locations at which to write the tensor elements.
+//	value: The concatenated tensor to write to the TensorArray.
+//	flow_in: A float scalar that enforces proper chaining of operations.
+//
+// Returns A float scalar that enforces proper chaining of operations.
+func TensorArrayScatterV3(scope *Scope, handle tf.Output, indices tf.Output, value tf.Output, flow_in tf.Output) (flow_out tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "AddN",
+		Type: "TensorArrayScatterV3",
 		Input: []tf.Input{
-			tf.OutputList(inputs),
+			handle, indices, value, flow_in,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// EnterAttr is an optional argument to Enter.
-type EnterAttr func(optionalAttr)
+// TensorArrayConcatV3Attr is an optional argument to TensorArrayConcatV3.
+type TensorArrayConcatV3Attr func(optionalAttr)
 
-// EnterIsConstant sets the optional is_constant attribute to value.
+// TensorArrayConcatV3ElementShapeExcept0 sets the optional element_shape_except0 attribute to value.
 //
-// value: If true, the output is constant within the child frame.
-// If not specified, defaults to false
-func EnterIsConstant(value bool) EnterAttr {
+// value: The expected shape of an element, if known,
+// excluding the first dimension. Used to validate the shapes of
+// TensorArray elements. If this shape is not fully specified, concatenating
+// zero-size TensorArrays is an error.
+// If not specified, defaults to <unknown_rank:true >
+func TensorArrayConcatV3ElementShapeExcept0(value tf.Shape) TensorArrayConcatV3Attr {
 	return func(m optionalAttr) {
-		m["is_constant"] = value
+		m["element_shape_except0"] = value
 	}
 }
 
-// EnterParallelIterations sets the optional parallel_iterations attribute to value.
+// Concat the elements from the TensorArray into value `value`.
 //
-// value: The number of iterations allowed to run in parallel.
-// If not specified, defaults to 10
-func EnterParallelIterations(value int64) EnterAttr {
-	return func(m optionalAttr) {
-		m["parallel_iterations"] = value
-	}
-}
-
-// Creates or finds a child frame, and makes `data` available to the child frame.
+// Takes `T` elements of shapes
 //
-// This op is used together with `Exit` to create loops in the graph.
-// The unique `frame_name` is used by the `Executor` to identify frames. If
-// `is_constant` is true, `output` is a constant in the child frame; otherwise
-// it may be changed in the child frame. At most `parallel_iterations` iterations
-// are run in parallel in the child frame.
+//   ```
+//   (n0 x d0 x d1 x ...), (n1 x d0 x d1 x ...), ..., (n(T-1) x d0 x d1 x ...)
+//   ```
+//
+// and concatenates them into a Tensor of shape:
+//
+//   ```(n0 + n1 + ... + n(T-1) x d0 x d1 x ...)```
+//
+// All elements must have the same shape (excepting the first dimension).
 //
 // Arguments:
-//	data: The tensor to be made available to the child frame.
-//	frame_name: The name of the child frame.
+//	handle: The handle to a TensorArray.
+//	flow_in: A float scalar that enforces proper chaining of operations.
+//	dtype: The type of the elem that is returned.
 //
-// Returns The same tensor as `data`.
-func Enter(scope *Scope, data tf.Output, frame_name string, optional ...EnterAttr) (output tf.Output) {
+// Returns All of the elements in the TensorArray, concatenated along the first
+// axis.A vector of the row sizes of the original T elements in the
+// value output.  In the example above, this would be the values:
+// `(n1, n2, ..., n(T-1))`.
+func TensorArrayConcatV3(scope *Scope, handle tf.Output, flow_in tf.Output, dtype tf.DataType, optional ...TensorArrayConcatV3Attr) (value tf.Output, lengths tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"frame_name": frame_name}
+	attrs := map[string]interface{}{"dtype": dtype}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Enter",
+		Type: "TensorArrayConcatV3",
 		Input: []tf.Input{
-			data,
+			handle, flow_in,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// Produce a string tensor that encodes the state of a Reader.
+// ParameterizedTruncatedNormalAttr is an optional argument to ParameterizedTruncatedNormal.
+type ParameterizedTruncatedNormalAttr func(optionalAttr)
+
+// ParameterizedTruncatedNormalSeed sets the optional seed attribute to value.
 //
-// Not all Readers support being serialized, so this can produce an
-// Unimplemented error.
+// value: If either `seed` or `seed2` are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func ParameterizedTruncatedNormalSeed(value int64) ParameterizedTruncatedNormalAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// ParameterizedTruncatedNormalSeed2 sets the optional seed2 attribute to value.
+//
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func ParameterizedTruncatedNormalSeed2(value int64) ParameterizedTruncatedNormalAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Outputs random values from a normal distribution. The parameters may each be a
+//
+// scalar which applies to the entire output, or a vector of length shape[0] which
+// stores the parameters for each batch.
 //
 // Arguments:
-//	reader_handle: Handle to a Reader.
-func ReaderSerializeStateV2(scope *Scope, reader_handle tf.Output) (state tf.Output) {
+//	shape: The shape of the output tensor. Batches are indexed by the 0th dimension.
+//	means: The mean parameter of each batch.
+//	stdevs: The standard deviation parameter of each batch. Must be greater than 0.
+//	minvals: The minimum cutoff. May be -infinity.
+//	maxvals: The maximum cutoff. May be +infinity, and must be more than the minval
+// for each batch.
+//
+// Returns A matrix of shape num_batches x samples_per_batch, filled with random
+// truncated normal values using the parameters for each row.
+func ParameterizedTruncatedNormal(scope *Scope, shape tf.Output, means tf.Output, stdevs tf.Output, minvals tf.Output, maxvals tf.Output, optional ...ParameterizedTruncatedNormalAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "ReaderSerializeStateV2",
+		Type: "ParameterizedTruncatedNormal",
 		Input: []tf.Input{
-			reader_handle,
+			shape, means, stdevs, minvals, maxvals,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Exits the current frame to its parent frame.
+// Returns a diagonal tensor with a given diagonal values.
 //
-// Exit makes its input `data` available to the parent frame.
+// Given a `diagonal`, this operation returns a tensor with the `diagonal` and
+// everything else padded with zeros. The diagonal is computed as follows:
 //
-// Arguments:
-//	data: The tensor to be made available to the parent frame.
+// Assume `diagonal` has dimensions [D1,..., Dk], then the output is a tensor of
+// rank 2k with dimensions [D1,..., Dk, D1,..., Dk] where:
 //
-// Returns The same tensor as `data`.
-func Exit(scope *Scope, data tf.Output) (output tf.Output) {
+// `output[i1,..., ik, i1,..., ik] = diagonal[i1, ..., ik]` and 0 everywhere else.
+//
+// For example:
+//
+// ```
+// # 'diagonal' is [1, 2, 3, 4]
+// tf.diag(diagonal) ==> [[1, 0, 0, 0]
+//                        [0, 2, 0, 0]
+//                        [0, 0, 3, 0]
+//                        [0, 0, 0, 4]]
+// ```
+//
+// Arguments:
+//	diagonal: Rank k tensor where k is at most 1.
+func Diag(scope *Scope, diagonal tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Exit",
+		Type: "Diag",
 		Input: []tf.Input{
-			data,
+			diagonal,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns a copy of the input tensor.
-func Snapshot(scope *Scope, input tf.Output) (output tf.Output) {
+// Split the data from the input value into TensorArray elements.
+//
+// Assuming that `lengths` takes on values
+//
+//   ```(n0, n1, ..., n(T-1))```
+//
+// and that `value` has shape
+//
+//   ```(n0 + n1 + ... + n(T-1) x d0 x d1 x ...)```,
+//
+// this splits values into a TensorArray with T tensors.
+//
+// TensorArray index t will be the subtensor of values with starting position
+//
+//   ```(n0 + n1 + ... + n(t-1), 0, 0, ...)```
+//
+// and having size
+//
+//   ```nt x d0 x d1 x ...```
+//
+// Arguments:
+//	handle: The handle to a TensorArray.
+//	value: The concatenated tensor to write to the TensorArray.
+//	lengths: The vector of lengths, how to split the rows of value into the
+// TensorArray.
+//	flow_in: A float scalar that enforces proper chaining of operations.
+//
+// Returns A float scalar that enforces proper chaining of operations.
+func TensorArraySplitV3(scope *Scope, handle tf.Output, value tf.Output, lengths tf.Output, flow_in tf.Output) (flow_out tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Snapshot",
+		Type: "TensorArraySplitV3",
 		Input: []tf.Input{
-			input,
+			handle, value, lengths, flow_in,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// AbortAttr is an optional argument to Abort.
-type AbortAttr func(optionalAttr)
+// SerializeSparseAttr is an optional argument to SerializeSparse.
+type SerializeSparseAttr func(optionalAttr)
 
-// AbortErrorMsg sets the optional error_msg attribute to value.
+// SerializeSparseOutType sets the optional out_type attribute to value.
 //
-// value: A string which is the message associated with the exception.
-// If not specified, defaults to ""
-func AbortErrorMsg(value string) AbortAttr {
-	return func(m optionalAttr) {
-		m["error_msg"] = value
-	}
-}
-
-// AbortExitWithoutError sets the optional exit_without_error attribute to value.
-// If not specified, defaults to false
-func AbortExitWithoutError(value bool) AbortAttr {
+// value: The `dtype` to use for serialization; the supported types are `string`
+// (default) and `variant`.
+// If not specified, defaults to DT_STRING
+func SerializeSparseOutType(value tf.DataType) SerializeSparseAttr {
 	return func(m optionalAttr) {
-		m["exit_without_error"] = value
+		m["out_type"] = value
 	}
 }
 
-// Raise a exception to abort the process when called.
-//
-// If exit_without_error is true, the process will exit normally,
-// otherwise it will exit with a SIGABORT signal.
-//
-// Returns nothing but an exception.
+// Serialize a `SparseTensor` into a `[3]` `Tensor` object.
 //
-// Returns the created operation.
-func Abort(scope *Scope, optional ...AbortAttr) (o *tf.Operation) {
+// Arguments:
+//	sparse_indices: 2-D.  The `indices` of the `SparseTensor`.
+//	sparse_values: 1-D.  The `values` of the `SparseTensor`.
+//	sparse_shape: 1-D.  The `shape` of the `SparseTensor`.
+func SerializeSparse(scope *Scope, sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output, optional ...SerializeSparseAttr) (serialized_sparse tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -26058,114 +24921,178 @@ func Abort(scope *Scope, optional ...AbortAttr) (o *tf.Operation) {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Abort",
-
+		Type: "SerializeSparse",
+		Input: []tf.Input{
+			sparse_indices, sparse_values, sparse_shape,
+		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// FixedUnigramCandidateSamplerAttr is an optional argument to FixedUnigramCandidateSampler.
-type FixedUnigramCandidateSamplerAttr func(optionalAttr)
+// RandomShuffleQueueV2Attr is an optional argument to RandomShuffleQueueV2.
+type RandomShuffleQueueV2Attr func(optionalAttr)
 
-// FixedUnigramCandidateSamplerVocabFile sets the optional vocab_file attribute to value.
+// RandomShuffleQueueV2Shapes sets the optional shapes attribute to value.
 //
-// value: Each valid line in this file (which should have a CSV-like format)
-// corresponds to a valid word ID. IDs are in sequential order, starting from
-// num_reserved_ids. The last entry in each line is expected to be a value
-// corresponding to the count or relative probability. Exactly one of vocab_file
-// and unigrams needs to be passed to this op.
-// If not specified, defaults to ""
-func FixedUnigramCandidateSamplerVocabFile(value string) FixedUnigramCandidateSamplerAttr {
+// value: The shape of each component in a value. The length of this attr must
+// be either 0 or the same as the length of component_types. If the length of
+// this attr is 0, the shapes of queue elements are not constrained, and
+// only one element may be dequeued at a time.
+// If not specified, defaults to <>
+//
+// REQUIRES: len(value) >= 0
+func RandomShuffleQueueV2Shapes(value []tf.Shape) RandomShuffleQueueV2Attr {
 	return func(m optionalAttr) {
-		m["vocab_file"] = value
+		m["shapes"] = value
 	}
 }
 
-// FixedUnigramCandidateSamplerDistortion sets the optional distortion attribute to value.
+// RandomShuffleQueueV2Capacity sets the optional capacity attribute to value.
 //
-// value: The distortion is used to skew the unigram probability distribution.
-// Each weight is first raised to the distortion's power before adding to the
-// internal unigram distribution. As a result, distortion = 1.0 gives regular
-// unigram sampling (as defined by the vocab file), and distortion = 0.0 gives
-// a uniform distribution.
-// If not specified, defaults to 1
-func FixedUnigramCandidateSamplerDistortion(value float32) FixedUnigramCandidateSamplerAttr {
+// value: The upper bound on the number of elements in this queue.
+// Negative numbers mean no limit.
+// If not specified, defaults to -1
+func RandomShuffleQueueV2Capacity(value int64) RandomShuffleQueueV2Attr {
 	return func(m optionalAttr) {
-		m["distortion"] = value
+		m["capacity"] = value
 	}
 }
 
-// FixedUnigramCandidateSamplerNumReservedIds sets the optional num_reserved_ids attribute to value.
+// RandomShuffleQueueV2MinAfterDequeue sets the optional min_after_dequeue attribute to value.
 //
-// value: Optionally some reserved IDs can be added in the range [0,
-// ..., num_reserved_ids) by the users. One use case is that a special unknown
-// word token is used as ID 0. These IDs will have a sampling probability of 0.
+// value: Dequeue will block unless there would be this
+// many elements after the dequeue or the queue is closed. This
+// ensures a minimum level of mixing of elements.
 // If not specified, defaults to 0
-func FixedUnigramCandidateSamplerNumReservedIds(value int64) FixedUnigramCandidateSamplerAttr {
+func RandomShuffleQueueV2MinAfterDequeue(value int64) RandomShuffleQueueV2Attr {
 	return func(m optionalAttr) {
-		m["num_reserved_ids"] = value
+		m["min_after_dequeue"] = value
 	}
 }
 
-// FixedUnigramCandidateSamplerNumShards sets the optional num_shards attribute to value.
-//
-// value: A sampler can be used to sample from a subset of the original range
-// in order to speed up the whole computation through parallelism. This parameter
-// (together with 'shard') indicates the number of partitions that are being
-// used in the overall computation.
-// If not specified, defaults to 1
+// RandomShuffleQueueV2Seed sets the optional seed attribute to value.
 //
-// REQUIRES: value >= 1
-func FixedUnigramCandidateSamplerNumShards(value int64) FixedUnigramCandidateSamplerAttr {
+// value: If either seed or seed2 is set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, a random seed is used.
+// If not specified, defaults to 0
+func RandomShuffleQueueV2Seed(value int64) RandomShuffleQueueV2Attr {
 	return func(m optionalAttr) {
-		m["num_shards"] = value
+		m["seed"] = value
 	}
 }
 
-// FixedUnigramCandidateSamplerShard sets the optional shard attribute to value.
+// RandomShuffleQueueV2Seed2 sets the optional seed2 attribute to value.
 //
-// value: A sampler can be used to sample from a subset of the original range
-// in order to speed up the whole computation through parallelism. This parameter
-// (together with 'num_shards') indicates the particular partition number of a
-// sampler op, when partitioning is being used.
+// value: A second seed to avoid seed collision.
 // If not specified, defaults to 0
+func RandomShuffleQueueV2Seed2(value int64) RandomShuffleQueueV2Attr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// RandomShuffleQueueV2Container sets the optional container attribute to value.
 //
-// REQUIRES: value >= 0
-func FixedUnigramCandidateSamplerShard(value int64) FixedUnigramCandidateSamplerAttr {
+// value: If non-empty, this queue is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func RandomShuffleQueueV2Container(value string) RandomShuffleQueueV2Attr {
 	return func(m optionalAttr) {
-		m["shard"] = value
+		m["container"] = value
 	}
 }
 
-// FixedUnigramCandidateSamplerUnigrams sets the optional unigrams attribute to value.
+// RandomShuffleQueueV2SharedName sets the optional shared_name attribute to value.
 //
-// value: A list of unigram counts or probabilities, one per ID in sequential
-// order. Exactly one of vocab_file and unigrams should be passed to this op.
-// If not specified, defaults to <>
-func FixedUnigramCandidateSamplerUnigrams(value []float32) FixedUnigramCandidateSamplerAttr {
+// value: If non-empty, this queue will be shared under the given name
+// across multiple sessions.
+// If not specified, defaults to ""
+func RandomShuffleQueueV2SharedName(value string) RandomShuffleQueueV2Attr {
 	return func(m optionalAttr) {
-		m["unigrams"] = value
+		m["shared_name"] = value
 	}
 }
 
-// FixedUnigramCandidateSamplerSeed sets the optional seed attribute to value.
+// A queue that randomizes the order of elements.
+//
+// Arguments:
+//	component_types: The type of each component in a value.
+//
+// Returns The handle to the queue.
+func RandomShuffleQueueV2(scope *Scope, component_types []tf.DataType, optional ...RandomShuffleQueueV2Attr) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"component_types": component_types}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "RandomShuffleQueueV2",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Draw bounding boxes on a batch of images.
+//
+// Outputs a copy of `images` but draws on top of the pixels zero or more bounding
+// boxes specified by the locations in `boxes`. The coordinates of the each
+// bounding box in `boxes` are encoded as `[y_min, x_min, y_max, x_max]`. The
+// bounding box coordinates are floats in `[0.0, 1.0]` relative to the width and
+// height of the underlying image.
+//
+// For example, if an image is 100 x 200 pixels (height x width) and the bounding
+// box is `[0.1, 0.2, 0.5, 0.9]`, the upper-left and bottom-right coordinates of
+// the bounding box will be `(40, 10)` to `(100, 50)` (in (x,y) coordinates).
+//
+// Parts of the bounding box may fall outside the image.
+//
+// Arguments:
+//	images: 4-D with shape `[batch, height, width, depth]`. A batch of images.
+//	boxes: 3-D with shape `[batch, num_bounding_boxes, 4]` containing bounding
+// boxes.
+//
+// Returns 4-D with the same shape as `images`. The batch of input images with
+// bounding boxes drawn on the images.
+func DrawBoundingBoxes(scope *Scope, images tf.Output, boxes tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "DrawBoundingBoxes",
+		Input: []tf.Input{
+			images, boxes,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// LearnedUnigramCandidateSamplerAttr is an optional argument to LearnedUnigramCandidateSampler.
+type LearnedUnigramCandidateSamplerAttr func(optionalAttr)
+
+// LearnedUnigramCandidateSamplerSeed sets the optional seed attribute to value.
 //
 // value: If either seed or seed2 are set to be non-zero, the random number
 // generator is seeded by the given seed.  Otherwise, it is seeded by a
 // random seed.
 // If not specified, defaults to 0
-func FixedUnigramCandidateSamplerSeed(value int64) FixedUnigramCandidateSamplerAttr {
+func LearnedUnigramCandidateSamplerSeed(value int64) LearnedUnigramCandidateSamplerAttr {
 	return func(m optionalAttr) {
 		m["seed"] = value
 	}
 }
 
-// FixedUnigramCandidateSamplerSeed2 sets the optional seed2 attribute to value.
+// LearnedUnigramCandidateSamplerSeed2 sets the optional seed2 attribute to value.
 //
 // value: An second seed to avoid seed collision.
 // If not specified, defaults to 0
-func FixedUnigramCandidateSamplerSeed2(value int64) FixedUnigramCandidateSamplerAttr {
+func LearnedUnigramCandidateSamplerSeed2(value int64) LearnedUnigramCandidateSamplerAttr {
 	return func(m optionalAttr) {
 		m["seed2"] = value
 	}
@@ -26173,13 +25100,8 @@ func FixedUnigramCandidateSamplerSeed2(value int64) FixedUnigramCandidateSampler
 
 // Generates labels for candidate sampling with a learned unigram distribution.
 //
-// A unigram sampler could use a fixed unigram distribution read from a
-// file or passed in as an in-memory array instead of building up the distribution
-// from data on the fly. There is also an option to skew the distribution by
-// applying a distortion power to the weights.
-//
-// The vocabulary file should be in CSV-like format, with the last field
-// being the weight associated with the word.
+// See explanations of candidate sampling and the data formats at
+// go/candidate-sampling.
 //
 // For each batch, this op picks a single set of sampled candidate labels.
 //
@@ -26205,7 +25127,7 @@ func FixedUnigramCandidateSamplerSeed2(value int64) FixedUnigramCandidateSampler
 // candidate representing the number of times the candidate is expected
 // to occur in a batch of sampled candidates.  If unique=true, then this is a
 // probability.
-func FixedUnigramCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, range_max int64, optional ...FixedUnigramCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) {
+func LearnedUnigramCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, range_max int64, optional ...LearnedUnigramCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -26214,7 +25136,7 @@ func FixedUnigramCandidateSampler(scope *Scope, true_classes tf.Output, num_true
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "FixedUnigramCandidateSampler",
+		Type: "LearnedUnigramCandidateSampler",
 		Input: []tf.Input{
 			true_classes,
 		},
@@ -26224,132 +25146,114 @@ func FixedUnigramCandidateSampler(scope *Scope, true_classes tf.Output, num_true
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Elementwise computes the bitwise AND of `x` and `y`.
+// Computes gradients for the scaled exponential linear (Selu) operation.
 //
-// The result will have those bits set, that are set in both `x` and `y`. The
-// computation is performed on the underlying representations of `x` and `y`.
-func BitwiseAnd(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "BitwiseAnd",
-		Input: []tf.Input{
-			x, y,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Elementwise computes the bitwise left-shift of `x` and `y`.
+// Arguments:
+//	gradients: The backpropagated gradients to the corresponding Selu operation.
+//	outputs: The outputs of the corresponding Selu operation.
 //
-// If `y` is negative, or greater than or equal to the width of `x` in bits the
-// result is implementation defined.
-func LeftShift(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// Returns The gradients: `gradients * (outputs + scale * alpha)`
+// if outputs < 0, `scale * gradients` otherwise.
+func SeluGrad(scope *Scope, gradients tf.Output, outputs tf.Output) (backprops tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "LeftShift",
+		Type: "SeluGrad",
 		Input: []tf.Input{
-			x, y,
+			gradients, outputs,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Elementwise computes the bitwise right-shift of `x` and `y`.
+// Get the current size of the TensorArray.
 //
-// Performs a logical shift for unsigned integer types, and an arithmetic shift
-// for signed integer types.
+// Arguments:
+//	handle: The handle to a TensorArray (output of TensorArray or TensorArrayGrad).
+//	flow_in: A float scalar that enforces proper chaining of operations.
 //
-// If `y` is negative, or greater than or equal to than the width of `x` in bits
-// the result is implementation defined.
-func RightShift(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+// Returns The current size of the TensorArray.
+func TensorArraySizeV3(scope *Scope, handle tf.Output, flow_in tf.Output) (size tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "RightShift",
+		Type: "TensorArraySizeV3",
 		Input: []tf.Input{
-			x, y,
+			handle, flow_in,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Adjust the hue of one or more images.
-//
-// `images` is a tensor of at least 3 dimensions.  The last dimension is
-// interpretted as channels, and must be three.
-//
-// The input image is considered in the RGB colorspace. Conceptually, the RGB
-// colors are first mapped into HSV. A delta is then applied all the hue values,
-// and then remapped back to RGB colorspace.
-//
-// Arguments:
-//	images: Images to adjust.  At least 3-D.
-//	delta: A float delta to add to the hue.
+// Deprecated. Use TensorArrayGradV3
 //
-// Returns The hue-adjusted image or images.
-func AdjustHue(scope *Scope, images tf.Output, delta tf.Output) (output tf.Output) {
+// DEPRECATED at GraphDef version 26: Use TensorArrayWriteV3
+func TensorArrayWriteV2(scope *Scope, handle tf.Output, index tf.Output, value tf.Output, flow_in tf.Output) (flow_out tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "AdjustHue",
+		Type: "TensorArrayWriteV2",
 		Input: []tf.Input{
-			images, delta,
+			handle, index, value, flow_in,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// AvgPool3DGradAttr is an optional argument to AvgPool3DGrad.
-type AvgPool3DGradAttr func(optionalAttr)
+// SparseReduceMaxAttr is an optional argument to SparseReduceMax.
+type SparseReduceMaxAttr func(optionalAttr)
 
-// AvgPool3DGradDataFormat sets the optional data_format attribute to value.
+// SparseReduceMaxKeepDims sets the optional keep_dims attribute to value.
 //
-// value: The data format of the input and output data. With the
-// default format "NDHWC", the data is stored in the order of:
-//     [batch, in_depth, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCDHW", the data storage order is:
-//     [batch, in_channels, in_depth, in_height, in_width].
-// If not specified, defaults to "NDHWC"
-func AvgPool3DGradDataFormat(value string) AvgPool3DGradAttr {
+// value: If true, retain reduced dimensions with length 1.
+// If not specified, defaults to false
+func SparseReduceMaxKeepDims(value bool) SparseReduceMaxAttr {
 	return func(m optionalAttr) {
-		m["data_format"] = value
+		m["keep_dims"] = value
 	}
 }
 
-// Computes gradients of average pooling function.
+// Computes the max of elements across dimensions of a SparseTensor.
+//
+// This Op takes a SparseTensor and is the sparse counterpart to
+// `tf.reduce_max()`.  In particular, this Op also returns a dense `Tensor`
+// instead of a sparse one.
+//
+// Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
+// with length 1.
+//
+// If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
+// with a single element is returned.  Additionally, the axes can be negative,
+// which are interpreted according to the indexing rules in Python.
 //
 // Arguments:
-//	orig_input_shape: The original input dimensions.
-//	grad: Output backprop of shape `[batch, depth, rows, cols, channels]`.
-//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
-// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
-//	strides: 1-D tensor of length 5. The stride of the sliding window for each
-// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
-//	padding: The type of padding algorithm to use.
+//	input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, possibly not in canonical ordering.
+//	input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
+//	input_shape: 1-D.  Shape of the input SparseTensor.
+//	reduction_axes: 1-D.  Length-`K` vector containing the reduction axes.
 //
-// Returns The backprop for input.
-func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...AvgPool3DGradAttr) (output tf.Output) {
+// Returns `R-K`-D.  The reduced Tensor.
+func SparseReduceMax(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output, reduction_axes tf.Output, optional ...SparseReduceMaxAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "AvgPool3DGrad",
+		Type: "SparseReduceMax",
 		Input: []tf.Input{
-			orig_input_shape, grad,
+			input_indices, input_values, input_shape, reduction_axes,
 		},
 		Attrs: attrs,
 	}
@@ -26357,115 +25261,68 @@ func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksi
 	return op.Output(0)
 }
 
-// ParseSingleSequenceExampleAttr is an optional argument to ParseSingleSequenceExample.
-type ParseSingleSequenceExampleAttr func(optionalAttr)
+// AsStringAttr is an optional argument to AsString.
+type AsStringAttr func(optionalAttr)
 
-// ParseSingleSequenceExampleContextSparseTypes sets the optional context_sparse_types attribute to value.
-//
-// value: A list of Ncontext_sparse types; the data types of data in
-// each context Feature given in context_sparse_keys.
-// Currently the ParseSingleSequenceExample supports DT_FLOAT (FloatList),
-// DT_INT64 (Int64List), and DT_STRING (BytesList).
-// If not specified, defaults to <>
+// AsStringPrecision sets the optional precision attribute to value.
 //
-// REQUIRES: len(value) >= 0
-func ParseSingleSequenceExampleContextSparseTypes(value []tf.DataType) ParseSingleSequenceExampleAttr {
+// value: The post-decimal precision to use for floating point numbers.
+// Only used if precision > -1.
+// If not specified, defaults to -1
+func AsStringPrecision(value int64) AsStringAttr {
 	return func(m optionalAttr) {
-		m["context_sparse_types"] = value
+		m["precision"] = value
 	}
 }
 
-// ParseSingleSequenceExampleFeatureListDenseTypes sets the optional feature_list_dense_types attribute to value.
-// If not specified, defaults to <>
+// AsStringScientific sets the optional scientific attribute to value.
 //
-// REQUIRES: len(value) >= 0
-func ParseSingleSequenceExampleFeatureListDenseTypes(value []tf.DataType) ParseSingleSequenceExampleAttr {
+// value: Use scientific notation for floating point numbers.
+// If not specified, defaults to false
+func AsStringScientific(value bool) AsStringAttr {
 	return func(m optionalAttr) {
-		m["feature_list_dense_types"] = value
+		m["scientific"] = value
 	}
 }
 
-// ParseSingleSequenceExampleContextDenseShapes sets the optional context_dense_shapes attribute to value.
-//
-// value: A list of Ncontext_dense shapes; the shapes of data in
-// each context Feature given in context_dense_keys.
-// The number of elements in the Feature corresponding to context_dense_key[j]
-// must always equal context_dense_shapes[j].NumEntries().
-// The shape of context_dense_values[j] will match context_dense_shapes[j].
-// If not specified, defaults to <>
+// AsStringShortest sets the optional shortest attribute to value.
 //
-// REQUIRES: len(value) >= 0
-func ParseSingleSequenceExampleContextDenseShapes(value []tf.Shape) ParseSingleSequenceExampleAttr {
+// value: Use shortest representation (either scientific or standard) for
+// floating point numbers.
+// If not specified, defaults to false
+func AsStringShortest(value bool) AsStringAttr {
 	return func(m optionalAttr) {
-		m["context_dense_shapes"] = value
+		m["shortest"] = value
 	}
 }
 
-// ParseSingleSequenceExampleFeatureListSparseTypes sets the optional feature_list_sparse_types attribute to value.
-//
-// value: A list of Nfeature_list_sparse types; the data types
-// of data in each FeatureList given in feature_list_sparse_keys.
-// Currently the ParseSingleSequenceExample supports DT_FLOAT (FloatList),
-// DT_INT64 (Int64List), and DT_STRING (BytesList).
-// If not specified, defaults to <>
+// AsStringWidth sets the optional width attribute to value.
 //
-// REQUIRES: len(value) >= 0
-func ParseSingleSequenceExampleFeatureListSparseTypes(value []tf.DataType) ParseSingleSequenceExampleAttr {
+// value: Pad pre-decimal numbers to this width.
+// Applies to both floating point and integer numbers.
+// Only used if width > -1.
+// If not specified, defaults to -1
+func AsStringWidth(value int64) AsStringAttr {
 	return func(m optionalAttr) {
-		m["feature_list_sparse_types"] = value
+		m["width"] = value
 	}
 }
 
-// ParseSingleSequenceExampleFeatureListDenseShapes sets the optional feature_list_dense_shapes attribute to value.
-//
-// value: A list of Nfeature_list_dense shapes; the shapes of
-// data in each FeatureList given in feature_list_dense_keys.
-// The shape of each Feature in the FeatureList corresponding to
-// feature_list_dense_key[j] must always equal
-// feature_list_dense_shapes[j].NumEntries().
-// If not specified, defaults to <>
+// AsStringFill sets the optional fill attribute to value.
 //
-// REQUIRES: len(value) >= 0
-func ParseSingleSequenceExampleFeatureListDenseShapes(value []tf.Shape) ParseSingleSequenceExampleAttr {
+// value: The value to pad if width > -1.  If empty, pads with spaces.
+// Another typical value is '0'.  String cannot be longer than 1 character.
+// If not specified, defaults to ""
+func AsStringFill(value string) AsStringAttr {
 	return func(m optionalAttr) {
-		m["feature_list_dense_shapes"] = value
+		m["fill"] = value
 	}
 }
 
-// Transforms a scalar brain.SequenceExample proto (as strings) into typed tensors.
+// Converts each entry in the given tensor to strings.  Supports many numeric
 //
-// Arguments:
-//	serialized: A scalar containing a binary serialized SequenceExample proto.
-//	feature_list_dense_missing_assumed_empty: A vector listing the
-// FeatureList keys which may be missing from the SequenceExample.  If the
-// associated FeatureList is missing, it is treated as empty.  By default,
-// any FeatureList not listed in this vector must exist in the SequenceExample.
-//	context_sparse_keys: A list of Ncontext_sparse string Tensors (scalars).
-// The keys expected in the Examples' features associated with context_sparse
-// values.
-//	context_dense_keys: A list of Ncontext_dense string Tensors (scalars).
-// The keys expected in the SequenceExamples' context features associated with
-// dense values.
-//	feature_list_sparse_keys: A list of Nfeature_list_sparse string Tensors
-// (scalars).  The keys expected in the FeatureLists associated with sparse
-// values.
-//	feature_list_dense_keys: A list of Nfeature_list_dense string Tensors (scalars).
-// The keys expected in the SequenceExamples' feature_lists associated
-// with lists of dense values.
-//	context_dense_defaults: A list of Ncontext_dense Tensors (some may be empty).
-// context_dense_defaults[j] provides default values
-// when the SequenceExample's context map lacks context_dense_key[j].
-// If an empty Tensor is provided for context_dense_defaults[j],
-// then the Feature context_dense_keys[j] is required.
-// The input type is inferred from context_dense_defaults[j], even when it's
-// empty.  If context_dense_defaults[j] is not empty, its shape must match
-// context_dense_shapes[j].
-//	debug_name: A scalar containing the name of the serialized proto.
-// May contain, for example, table key (descriptive) name for the
-// corresponding serialized proto.  This is purely useful for debugging
-// purposes, and the presence of values here has no effect on the output.
-// May also be an empty scalar if no name is available.
-func ParseSingleSequenceExample(scope *Scope, serialized tf.Output, feature_list_dense_missing_assumed_empty tf.Output, context_sparse_keys []tf.Output, context_dense_keys []tf.Output, feature_list_sparse_keys []tf.Output, feature_list_dense_keys []tf.Output, context_dense_defaults []tf.Output, debug_name tf.Output, optional ...ParseSingleSequenceExampleAttr) (context_sparse_indices []tf.Output, context_sparse_values []tf.Output, context_sparse_shapes []tf.Output, context_dense_values []tf.Output, feature_list_sparse_indices []tf.Output, feature_list_sparse_values []tf.Output, feature_list_sparse_shapes []tf.Output, feature_list_dense_values []tf.Output) {
+// types and boolean.
+func AsString(scope *Scope, input tf.Output, optional ...AsStringAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -26474,463 +25331,394 @@ func ParseSingleSequenceExample(scope *Scope, serialized tf.Output, feature_list
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ParseSingleSequenceExample",
+		Type: "AsString",
 		Input: []tf.Input{
-			serialized, feature_list_dense_missing_assumed_empty, tf.OutputList(context_sparse_keys), tf.OutputList(context_dense_keys), tf.OutputList(feature_list_sparse_keys), tf.OutputList(feature_list_dense_keys), tf.OutputList(context_dense_defaults), debug_name,
+			input,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if context_sparse_indices, idx, err = makeOutputList(op, idx, "context_sparse_indices"); err != nil {
-		scope.UpdateErr("ParseSingleSequenceExample", err)
-		return
-	}
-	if context_sparse_values, idx, err = makeOutputList(op, idx, "context_sparse_values"); err != nil {
-		scope.UpdateErr("ParseSingleSequenceExample", err)
-		return
-	}
-	if context_sparse_shapes, idx, err = makeOutputList(op, idx, "context_sparse_shapes"); err != nil {
-		scope.UpdateErr("ParseSingleSequenceExample", err)
-		return
-	}
-	if context_dense_values, idx, err = makeOutputList(op, idx, "context_dense_values"); err != nil {
-		scope.UpdateErr("ParseSingleSequenceExample", err)
-		return
-	}
-	if feature_list_sparse_indices, idx, err = makeOutputList(op, idx, "feature_list_sparse_indices"); err != nil {
-		scope.UpdateErr("ParseSingleSequenceExample", err)
-		return
-	}
-	if feature_list_sparse_values, idx, err = makeOutputList(op, idx, "feature_list_sparse_values"); err != nil {
-		scope.UpdateErr("ParseSingleSequenceExample", err)
-		return
-	}
-	if feature_list_sparse_shapes, idx, err = makeOutputList(op, idx, "feature_list_sparse_shapes"); err != nil {
-		scope.UpdateErr("ParseSingleSequenceExample", err)
-		return
-	}
-	if feature_list_dense_values, idx, err = makeOutputList(op, idx, "feature_list_dense_values"); err != nil {
-		scope.UpdateErr("ParseSingleSequenceExample", err)
-		return
-	}
-	return context_sparse_indices, context_sparse_values, context_sparse_shapes, context_dense_values, feature_list_sparse_indices, feature_list_sparse_values, feature_list_sparse_shapes, feature_list_dense_values
+	return op.Output(0)
 }
 
-// DecodeWavAttr is an optional argument to DecodeWav.
-type DecodeWavAttr func(optionalAttr)
-
-// DecodeWavDesiredChannels sets the optional desired_channels attribute to value.
+// Deprecated. Use TensorArrayScatterV3
 //
-// value: Number of sample channels wanted.
-// If not specified, defaults to -1
-func DecodeWavDesiredChannels(value int64) DecodeWavAttr {
-	return func(m optionalAttr) {
-		m["desired_channels"] = value
+// DEPRECATED at GraphDef version 26: Use TensorArrayScatterV3
+func TensorArrayScatterV2(scope *Scope, handle tf.Output, indices tf.Output, value tf.Output, flow_in tf.Output) (flow_out tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// DecodeWavDesiredSamples sets the optional desired_samples attribute to value.
-//
-// value: Length of audio requested.
-// If not specified, defaults to -1
-func DecodeWavDesiredSamples(value int64) DecodeWavAttr {
-	return func(m optionalAttr) {
-		m["desired_samples"] = value
+	opspec := tf.OpSpec{
+		Type: "TensorArrayScatterV2",
+		Input: []tf.Input{
+			handle, indices, value, flow_in,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Decode a 16-bit PCM WAV file to a float tensor.
+// Applies sparse addition to `input` using individual values or slices
 //
-// The -32768 to 32767 signed 16-bit values will be scaled to -1.0 to 1.0 in float.
+// from `updates` according to indices `indices`.  The updates are non-aliasing:
+// `input` is only modified in-place if no other operations will use it.
+// Otherwise, a copy of `input` is made.  This operation has a gradient with
+// respect to both `input` and `updates`.
 //
-// When desired_channels is set, if the input contains fewer channels than this
-// then the last channel will be duplicated to give the requested number, else if
-// the input has more channels than requested then the additional channels will be
-// ignored.
+// `input` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.
 //
-// If desired_samples is set, then the audio will be cropped or padded with zeroes
-// to the requested length.
+// `indices` must be integer tensor, containing indices into `input`.
+// It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.
 //
-// The first output contains a Tensor with the content of the audio samples. The
-// lowest dimension will be the number of channels, and the second will be the
-// number of samples. For example, a ten-sample-long stereo WAV file should give an
-// output shape of [10, 2].
+// The innermost dimension of `indices` (with length `K`) corresponds to
+// indices into elements (if `K = P`) or `(P-K)`-dimensional slices
+// (if `K < P`) along the `K`th dimension of `input`.
 //
-// Arguments:
-//	contents: The WAV-encoded audio, usually from a file.
+// `updates` is `Tensor` of rank `Q-1+P-K` with shape:
 //
-// Returns 2-D with shape `[length, channels]`.Scalar holding the sample rate found in the WAV header.
-func DecodeWav(scope *Scope, contents tf.Output, optional ...DecodeWavAttr) (audio tf.Output, sample_rate tf.Output) {
+// ```
+// [d_0, ..., d_{Q-2}, input.shape[K], ..., input.shape[P-1]].
+// ```
+//
+// For example, say we want to add 4 scattered elements to a rank-1 tensor to 8
+// elements. In Python, that addition would look like this:
+//
+//     input = tf.constant([1, 2, 3, 4, 5, 6, 7, 8])
+//     indices = tf.constant([[4], [3], [1], [7]])
+//     updates = tf.constant([9, 10, 11, 12])
+//     output = tf.scatter_nd_non_aliasing_add(input, indices, updates)
+//     with tf.Session() as sess:
+//       print(sess.run(output))
+//
+// The resulting value `output` would look like this:
+//
+//     [1, 13, 3, 14, 14, 6, 7, 20]
+//
+// See @{tf.scatter_nd} for more details about how to make updates to slices.
+//
+// Arguments:
+//	input: A Tensor.
+//	indices: A Tensor. Must be one of the following types: `int32`, `int64`.
+// A tensor of indices into `input`.
+//	updates: A Tensor. Must have the same type as ref. A tensor of updated values
+// to add to `input`.
+//
+// Returns A `Tensor` with the same shape as `input`, containing values of `input`
+// updated with `updates`.
+func ScatterNdNonAliasingAdd(scope *Scope, input tf.Output, indices tf.Output, updates tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "DecodeWav",
+		Type: "ScatterNdNonAliasingAdd",
 		Input: []tf.Input{
-			contents,
+			input, indices, updates,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
 
-// UniqueAttr is an optional argument to Unique.
-type UniqueAttr func(optionalAttr)
+// FractionalMaxPoolAttr is an optional argument to FractionalMaxPool.
+type FractionalMaxPoolAttr func(optionalAttr)
 
-// UniqueOutIdx sets the optional out_idx attribute to value.
-// If not specified, defaults to DT_INT32
-func UniqueOutIdx(value tf.DataType) UniqueAttr {
+// FractionalMaxPoolPseudoRandom sets the optional pseudo_random attribute to value.
+//
+// value: When set to True, generates the pooling sequence in a
+// pseudorandom fashion, otherwise, in a random fashion. Check paper [Benjamin
+// Graham, Fractional Max-Pooling](http://arxiv.org/abs/1412.6071) for
+// difference between pseudorandom and random.
+// If not specified, defaults to false
+func FractionalMaxPoolPseudoRandom(value bool) FractionalMaxPoolAttr {
 	return func(m optionalAttr) {
-		m["out_idx"] = value
+		m["pseudo_random"] = value
 	}
 }
 
-// Finds unique elements in a 1-D tensor.
-//
-// This operation returns a tensor `y` containing all of the unique elements of `x`
-// sorted in the same order that they occur in `x`. This operation also returns a
-// tensor `idx` the same size as `x` that contains the index of each value of `x`
-// in the unique output `y`. In other words:
+// FractionalMaxPoolOverlapping sets the optional overlapping attribute to value.
 //
-// `y[idx[i]] = x[i] for i in [0, 1,...,rank(x) - 1]`
+// value: When set to True, it means when pooling, the values at the boundary
+// of adjacent pooling cells are used by both cells. For example:
 //
-// For example:
+// `index  0  1  2  3  4`
 //
-// ```
-// # tensor 'x' is [1, 1, 2, 4, 4, 4, 7, 8, 8]
-// y, idx = unique(x)
-// y ==> [1, 2, 4, 7, 8]
-// idx ==> [0, 0, 1, 2, 2, 2, 3, 4, 4]
-// ```
+// `value  20 5  16 3  7`
 //
-// Arguments:
-//	x: 1-D.
+// If the pooling sequence is [0, 2, 4], then 16, at index 2 will be used twice.
+// The result would be [20, 16] for fractional max pooling.
+// If not specified, defaults to false
+func FractionalMaxPoolOverlapping(value bool) FractionalMaxPoolAttr {
+	return func(m optionalAttr) {
+		m["overlapping"] = value
+	}
+}
+
+// FractionalMaxPoolDeterministic sets the optional deterministic attribute to value.
 //
-// Returns 1-D.1-D.
-func Unique(scope *Scope, x tf.Output, optional ...UniqueAttr) (y tf.Output, idx tf.Output) {
-	if scope.Err() != nil {
-		return
+// value: When set to True, a fixed pooling region will be used when
+// iterating over a FractionalMaxPool node in the computation graph. Mainly used
+// in unit test to make FractionalMaxPool deterministic.
+// If not specified, defaults to false
+func FractionalMaxPoolDeterministic(value bool) FractionalMaxPoolAttr {
+	return func(m optionalAttr) {
+		m["deterministic"] = value
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
+}
+
+// FractionalMaxPoolSeed sets the optional seed attribute to value.
+//
+// value: If either seed or seed2 are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func FractionalMaxPoolSeed(value int64) FractionalMaxPoolAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "Unique",
-		Input: []tf.Input{
-			x,
-		},
-		Attrs: attrs,
+}
+
+// FractionalMaxPoolSeed2 sets the optional seed2 attribute to value.
+//
+// value: An second seed to avoid seed collision.
+// If not specified, defaults to 0
+func FractionalMaxPoolSeed2(value int64) FractionalMaxPoolAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
 }
 
-// Concatenates a list of `N` tensors along the first dimension.
+// Performs fractional max pooling on the input.
 //
-// The input tensors are all required to have size 1 in the first dimension.
+// Fractional max pooling is slightly different than regular max pooling.  In
+// regular max pooling, you downsize an input set by taking the maximum value of
+// smaller N x N subsections of the set (often 2x2), and try to reduce the set by
+// a factor of N, where N is an integer.  Fractional max pooling, as you might
+// expect from the word "fractional", means that the overall reduction ratio N
+// does not have to be an integer.
 //
-// For example:
+// The sizes of the pooling regions are generated randomly but are fairly uniform.
+// For example, let's look at the height dimension, and the constraints on the
+// list of rows that will be pool boundaries.
 //
-// ```
-// # 'x' is [[1, 4]]
-// # 'y' is [[2, 5]]
-// # 'z' is [[3, 6]]
-// parallel_concat([x, y, z]) => [[1, 4], [2, 5], [3, 6]]  # Pack along first dim.
-// ```
+// First we define the following:
 //
-// The difference between concat and parallel_concat is that concat requires all
-// of the inputs be computed before the operation will begin but doesn't require
-// that the input shapes be known during graph construction.  Parallel concat
-// will copy pieces of the input into the output as they become available, in
-// some situations this can provide a performance benefit.
+// 1.  input_row_length : the number of rows from the input set
+// 2.  output_row_length : which will be smaller than the input
+// 3.  alpha = input_row_length / output_row_length : our reduction ratio
+// 4.  K = floor(alpha)
+// 5.  row_pooling_sequence : this is the result list of pool boundary rows
+//
+// Then, row_pooling_sequence should satisfy:
+//
+// 1.  a[0] = 0 : the first value of the sequence is 0
+// 2.  a[end] = input_row_length : the last value of the sequence is the size
+// 3.  K <= (a[i+1] - a[i]) <= K+1 : all intervals are K or K+1 size
+// 4.  length(row_pooling_sequence) = output_row_length+1
+//
+// For more details on fractional max pooling, see this paper:
+// [Benjamin Graham, Fractional Max-Pooling](http://arxiv.org/abs/1412.6071)
 //
 // Arguments:
-//	values: Tensors to be concatenated. All must have size 1 in the first dimension
-// and same shape.
-//	shape: the final shape of the result; should be equal to the shapes of any input
-// but with the number of input values in the first dimension.
+//	value: 4-D with shape `[batch, height, width, channels]`.
+//	pooling_ratio: Pooling ratio for each dimension of `value`, currently only
+// supports row and col dimension and should be >= 1.0. For example, a valid
+// pooling ratio looks like [1.0, 1.44, 1.73, 1.0]. The first and last elements
+// must be 1.0 because we don't allow pooling on batch and channels
+// dimensions. 1.44 and 1.73 are pooling ratio on height and width dimensions
+// respectively.
 //
-// Returns The concatenated tensor.
-func ParallelConcat(scope *Scope, values []tf.Output, shape tf.Shape) (output tf.Output) {
+// Returns output tensor after fractional max pooling.row pooling sequence, needed to calculate gradient.column pooling sequence, needed to calculate gradient.
+func FractionalMaxPool(scope *Scope, value tf.Output, pooling_ratio []float32, optional ...FractionalMaxPoolAttr) (output tf.Output, row_pooling_sequence tf.Output, col_pooling_sequence tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"shape": shape}
+	attrs := map[string]interface{}{"pooling_ratio": pooling_ratio}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "ParallelConcat",
+		Type: "FractionalMaxPool",
 		Input: []tf.Input{
-			tf.OutputList(values),
+			value,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Compute the lower regularized incomplete Gamma function `Q(a, x)`.
-//
-// The lower regularized incomplete Gamma function is defined as:
-//
-//
-// \\(P(a, x) = gamma(a, x) / Gamma(a) = 1 - Q(a, x)\\)
-//
-// where
-//
-// \\(gamma(a, x) = int_{0}^{x} t^{a-1} exp(-t) dt\\)
-//
-// is the lower incomplete Gamma function.
+// Deprecated. Use TensorArraySizeV3
 //
-// Note, above `Q(a, x)` (`Igammac`) is the upper regularized complete
-// Gamma function.
-func Igamma(scope *Scope, a tf.Output, x tf.Output) (z tf.Output) {
+// DEPRECATED at GraphDef version 26: Use TensorArraySizeV3
+func TensorArraySizeV2(scope *Scope, handle tf.Output, flow_in tf.Output) (size tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Igamma",
+		Type: "TensorArraySizeV2",
 		Input: []tf.Input{
-			a, x,
+			handle, flow_in,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Computes offsets of concat inputs within its output.
-//
-// For example:
-//
-// ```
-// # 'x' is [2, 2, 7]
-// # 'y' is [2, 3, 7]
-// # 'z' is [2, 5, 7]
-// concat_offset(2, [x, y, z]) => [0, 0, 0], [0, 2, 0], [0, 5, 0]
-// ```
+// Conv2DAttr is an optional argument to Conv2D.
+type Conv2DAttr func(optionalAttr)
+
+// Conv2DUseCudnnOnGpu sets the optional use_cudnn_on_gpu attribute to value.
+// If not specified, defaults to true
+func Conv2DUseCudnnOnGpu(value bool) Conv2DAttr {
+	return func(m optionalAttr) {
+		m["use_cudnn_on_gpu"] = value
+	}
+}
+
+// Conv2DDataFormat sets the optional data_format attribute to value.
 //
-// This is typically used by gradient computations for a concat operation.
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, height, width, channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, channels, height, width].
+// If not specified, defaults to "NHWC"
+func Conv2DDataFormat(value string) Conv2DAttr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// Conv2DDilations sets the optional dilations attribute to value.
 //
-// Arguments:
-//	concat_dim: The dimension along which to concatenate.
-//	shape: The `N` int32 vectors representing shape of tensors being concatenated.
-//
-// Returns The `N` int32 vectors representing the starting offset
-// of input tensors within the concatenated output.
-func ConcatOffset(scope *Scope, concat_dim tf.Output, shape []tf.Output) (offset []tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "ConcatOffset",
-		Input: []tf.Input{
-			concat_dim, tf.OutputList(shape),
-		},
-	}
-	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if offset, idx, err = makeOutputList(op, idx, "offset"); err != nil {
-		scope.UpdateErr("ConcatOffset", err)
-		return
-	}
-	return offset
-}
-
-// Splits a tensor into `num_split` tensors along one dimension.
-//
-// Arguments:
-//	axis: 0-D.  The dimension along which to split.  Must be in the range
-// `[-rank(value), rank(value))`.
-//	value: The tensor to split.
-//	num_split: The number of ways to split.  Must evenly divide
-// `value.shape[split_dim]`.
-//
-// Returns They are identically shaped tensors, whose shape matches that of `value`
-// except along `axis`, where their sizes are
-// `values.shape[split_dim] / num_split`.
-func Split(scope *Scope, axis tf.Output, value tf.Output, num_split int64) (output []tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"num_split": num_split}
-	opspec := tf.OpSpec{
-		Type: "Split",
-		Input: []tf.Input{
-			axis, value,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
-		scope.UpdateErr("Split", err)
-		return
+// value: 1-D tensor of length 4.  The dilation factor for each dimension of
+// `input`. If set to k > 1, there will be k-1 skipped cells between each
+// filter element on that dimension. The dimension order is determined by the
+// value of `data_format`, see above for details. Dilations in the batch and
+// depth dimensions must be 1.
+// If not specified, defaults to <i:1 i:1 i:1 i:1 >
+func Conv2DDilations(value []int64) Conv2DAttr {
+	return func(m optionalAttr) {
+		m["dilations"] = value
 	}
-	return output
 }
 
-// Splits a tensor into `num_split` tensors along one dimension.
-//
-// Arguments:
-//	value: The tensor to split.
-//	size_splits: list containing the sizes of each output tensor along the split
-// dimension. Must sum to the dimension of value along split_dim.
-// Can contain one -1 indicating that dimension is to be inferred.
-//	axis: 0-D.  The dimension along which to split.  Must be in the range
-// `[-rank(value), rank(value))`.
+// Computes a 2-D convolution given 4-D `input` and `filter` tensors.
 //
+// Given an input tensor of shape `[batch, in_height, in_width, in_channels]`
+// and a filter / kernel tensor of shape
+// `[filter_height, filter_width, in_channels, out_channels]`, this op
+// performs the following:
 //
-// Returns Tensors whose shape matches that of `value`
-// except along `axis`, where their sizes are
-// `size_splits[i]`.
-func SplitV(scope *Scope, value tf.Output, size_splits tf.Output, axis tf.Output, num_split int64) (output []tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"num_split": num_split}
-	opspec := tf.OpSpec{
-		Type: "SplitV",
-		Input: []tf.Input{
-			value, size_splits, axis,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
-		scope.UpdateErr("SplitV", err)
-		return
-	}
-	return output
-}
-
-// Gives a guarantee to the TF runtime that the input tensor is a constant.
+// 1. Flattens the filter to a 2-D matrix with shape
+//    `[filter_height * filter_width * in_channels, output_channels]`.
+// 2. Extracts image patches from the input tensor to form a *virtual*
+//    tensor of shape `[batch, out_height, out_width,
+//    filter_height * filter_width * in_channels]`.
+// 3. For each patch, right-multiplies the filter matrix and the image patch
+//    vector.
 //
-// The runtime is then free to make optimizations based on this.
+// In detail, with the default NHWC format,
 //
-// Only accepts value typed tensors as inputs and rejects resource variable handles
-// as input.
+//     output[b, i, j, k] =
+//         sum_{di, dj, q} input[b, strides[1] * i + di, strides[2] * j + dj, q] *
+//                         filter[di, dj, q, k]
 //
-// Returns the input tensor without modification.
-func GuaranteeConst(scope *Scope, input tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "GuaranteeConst",
-		Input: []tf.Input{
-			input,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Returns a tensor of zeros with the same shape and type as x.
+// Must have `strides[0] = strides[3] = 1`.  For the most common case of the same
+// horizontal and vertices strides, `strides = [1, stride, stride, 1]`.
 //
 // Arguments:
-//	x: a tensor of type T.
+//	input: A 4-D tensor. The dimension order is interpreted according to the value
+// of `data_format`, see below for details.
+//	filter: A 4-D tensor of shape
+// `[filter_height, filter_width, in_channels, out_channels]`
+//	strides: 1-D tensor of length 4.  The stride of the sliding window for each
+// dimension of `input`. The dimension order is determined by the value of
+// `data_format`, see below for details.
+//	padding: The type of padding algorithm to use.
 //
-// Returns a tensor of the same shape and type as x but filled with zeros.
-func ZerosLike(scope *Scope, x tf.Output) (y tf.Output) {
+// Returns A 4-D tensor. The dimension order is determined by the value of
+// `data_format`, see below for details.
+func Conv2D(scope *Scope, input tf.Output, filter tf.Output, strides []int64, padding string, optional ...Conv2DAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "ZerosLike",
+		Type: "Conv2D",
 		Input: []tf.Input{
-			x,
+			input, filter,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// QuantizedInstanceNormAttr is an optional argument to QuantizedInstanceNorm.
-type QuantizedInstanceNormAttr func(optionalAttr)
-
-// QuantizedInstanceNormOutputRangeGiven sets the optional output_range_given attribute to value.
-//
-// value: If True, `given_y_min` and `given_y_min`
-// and `given_y_max` are used as the output range. Otherwise,
-// the implementation computes the output range.
-// If not specified, defaults to false
-func QuantizedInstanceNormOutputRangeGiven(value bool) QuantizedInstanceNormAttr {
-	return func(m optionalAttr) {
-		m["output_range_given"] = value
-	}
-}
+// StageAttr is an optional argument to Stage.
+type StageAttr func(optionalAttr)
 
-// QuantizedInstanceNormGivenYMin sets the optional given_y_min attribute to value.
+// StageCapacity sets the optional capacity attribute to value.
 //
-// value: Output in `y_min` if `output_range_given` is True.
+// value: Maximum number of elements in the Staging Area. If > 0, inserts
+// on the container will block when the capacity is reached.
 // If not specified, defaults to 0
-func QuantizedInstanceNormGivenYMin(value float32) QuantizedInstanceNormAttr {
+//
+// REQUIRES: value >= 0
+func StageCapacity(value int64) StageAttr {
 	return func(m optionalAttr) {
-		m["given_y_min"] = value
+		m["capacity"] = value
 	}
 }
 
-// QuantizedInstanceNormGivenYMax sets the optional given_y_max attribute to value.
+// StageMemoryLimit sets the optional memory_limit attribute to value.
 //
-// value: Output in `y_max` if `output_range_given` is True.
+// value: The maximum number of bytes allowed for Tensors in the Staging Area.
+// If > 0, inserts will block until sufficient space is available.
 // If not specified, defaults to 0
-func QuantizedInstanceNormGivenYMax(value float32) QuantizedInstanceNormAttr {
+//
+// REQUIRES: value >= 0
+func StageMemoryLimit(value int64) StageAttr {
 	return func(m optionalAttr) {
-		m["given_y_max"] = value
+		m["memory_limit"] = value
 	}
 }
 
-// QuantizedInstanceNormVarianceEpsilon sets the optional variance_epsilon attribute to value.
+// StageContainer sets the optional container attribute to value.
 //
-// value: A small float number to avoid dividing by 0.
-// If not specified, defaults to 1e-05
-func QuantizedInstanceNormVarianceEpsilon(value float32) QuantizedInstanceNormAttr {
+// value: If non-empty, this queue is placed in the given container. Otherwise,
+// a default container is used.
+// If not specified, defaults to ""
+func StageContainer(value string) StageAttr {
 	return func(m optionalAttr) {
-		m["variance_epsilon"] = value
+		m["container"] = value
 	}
 }
 
-// QuantizedInstanceNormMinSeparation sets the optional min_separation attribute to value.
+// StageSharedName sets the optional shared_name attribute to value.
 //
-// value: Minimum value of `y_max - y_min`
-// If not specified, defaults to 0.001
-func QuantizedInstanceNormMinSeparation(value float32) QuantizedInstanceNormAttr {
+// value: It is necessary to match this name to the matching Unstage Op.
+// If not specified, defaults to ""
+func StageSharedName(value string) StageAttr {
 	return func(m optionalAttr) {
-		m["min_separation"] = value
+		m["shared_name"] = value
 	}
 }
 
-// Quantized Instance normalization.
+// Stage values similar to a lightweight Enqueue.
+//
+// The basic functionality of this Op is similar to a queue with many
+// fewer capabilities and options.  This Op is optimized for performance.
 //
 // Arguments:
-//	x: A 4D input Tensor.
-//	x_min: The value represented by the lowest quantized input.
-//	x_max: The value represented by the highest quantized input.
+//	values: a list of tensors
+// dtypes A list of data types that inserted values should adhere to.
 //
-// Returns A 4D Tensor.The value represented by the lowest quantized output.The value represented by the highest quantized output.
-func QuantizedInstanceNorm(scope *Scope, x tf.Output, x_min tf.Output, x_max tf.Output, optional ...QuantizedInstanceNormAttr) (y tf.Output, y_min tf.Output, y_max tf.Output) {
+// Returns the created operation.
+func Stage(scope *Scope, values []tf.Output, optional ...StageAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -26939,432 +25727,1536 @@ func QuantizedInstanceNorm(scope *Scope, x tf.Output, x_min tf.Output, x_max tf.
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "QuantizedInstanceNorm",
+		Type: "Stage",
 		Input: []tf.Input{
-			x, x_min, x_max,
+			tf.OutputList(values),
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return scope.AddOperation(opspec)
 }
 
-// Returns the diagonal part of the tensor.
+// StagePeekAttr is an optional argument to StagePeek.
+type StagePeekAttr func(optionalAttr)
+
+// StagePeekCapacity sets the optional capacity attribute to value.
+// If not specified, defaults to 0
 //
-// This operation returns a tensor with the `diagonal` part
-// of the `input`. The `diagonal` part is computed as follows:
-//
-// Assume `input` has dimensions `[D1,..., Dk, D1,..., Dk]`, then the output is a
-// tensor of rank `k` with dimensions `[D1,..., Dk]` where:
-//
-// `diagonal[i1,..., ik] = input[i1, ..., ik, i1,..., ik]`.
-//
-// For example:
-//
-// ```
-// # 'input' is [[1, 0, 0, 0]
-//               [0, 2, 0, 0]
-//               [0, 0, 3, 0]
-//               [0, 0, 0, 4]]
-//
-// tf.diag_part(input) ==> [1, 2, 3, 4]
-// ```
-//
-// Arguments:
-//	input: Rank k tensor where k is even and not zero.
+// REQUIRES: value >= 0
+func StagePeekCapacity(value int64) StagePeekAttr {
+	return func(m optionalAttr) {
+		m["capacity"] = value
+	}
+}
+
+// StagePeekMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
 //
-// Returns The extracted diagonal.
-func DiagPart(scope *Scope, input tf.Output) (diagonal tf.Output) {
-	if scope.Err() != nil {
-		return
+// REQUIRES: value >= 0
+func StagePeekMemoryLimit(value int64) StagePeekAttr {
+	return func(m optionalAttr) {
+		m["memory_limit"] = value
 	}
-	opspec := tf.OpSpec{
-		Type: "DiagPart",
-		Input: []tf.Input{
-			input,
-		},
+}
+
+// StagePeekContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func StagePeekContainer(value string) StagePeekAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Returns the element-wise max of two SparseTensors.
-//
-// Assumes the two SparseTensors have the same shape, i.e., no broadcasting.
-//
-// Arguments:
-//	a_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, in the canonical lexicographic ordering.
-//	a_values: 1-D.  `N` non-empty values corresponding to `a_indices`.
-//	a_shape: 1-D.  Shape of the input SparseTensor.
-//	b_indices: counterpart to `a_indices` for the other operand.
-//	b_values: counterpart to `a_values` for the other operand; must be of the same dtype.
-//	b_shape: counterpart to `a_shape` for the other operand; the two shapes must be equal.
+// StagePeekSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func StagePeekSharedName(value string) StagePeekAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Op peeks at the values at the specified index.  If the
 //
-// Returns 2-D.  The indices of the output SparseTensor.1-D.  The values of the output SparseTensor.
-func SparseSparseMaximum(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b_indices tf.Output, b_values tf.Output, b_shape tf.Output) (output_indices tf.Output, output_values tf.Output) {
+// underlying container does not contain sufficient elements
+// this op will block until it does.   This Op is optimized for
+// performance.
+func StagePeek(scope *Scope, index tf.Output, dtypes []tf.DataType, optional ...StagePeekAttr) (values []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"dtypes": dtypes}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "SparseSparseMaximum",
+		Type: "StagePeek",
 		Input: []tf.Input{
-			a_indices, a_values, a_shape, b_indices, b_values, b_shape,
+			index,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
-}
-
-// Returns a batched matrix tensor with new batched diagonal values.
-//
-// Given `input` and `diagonal`, this operation returns a tensor with the
-// same shape and values as `input`, except for the main diagonal of the
-// innermost matrices.  These will be overwritten by the values in `diagonal`.
-//
-// The output is computed as follows:
-//
-// Assume `input` has `k+1` dimensions `[I, J, K, ..., M, N]` and `diagonal` has
-// `k` dimensions `[I, J, K, ..., min(M, N)]`.  Then the output is a
-// tensor of rank `k+1` with dimensions `[I, J, K, ..., M, N]` where:
-//
-//   * `output[i, j, k, ..., m, n] = diagonal[i, j, k, ..., n]` for `m == n`.
-//   * `output[i, j, k, ..., m, n] = input[i, j, k, ..., m, n]` for `m != n`.
-//
-// Arguments:
-//	input: Rank `k+1`, where `k >= 1`.
-//	diagonal: Rank `k`, where `k >= 1`.
-//
-// Returns Rank `k+1`, with `output.shape = input.shape`.
-func MatrixSetDiag(scope *Scope, input tf.Output, diagonal tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	opspec := tf.OpSpec{
-		Type: "MatrixSetDiag",
-		Input: []tf.Input{
-			input, diagonal,
-		},
+	var idx int
+	var err error
+	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
+		scope.UpdateErr("StagePeek", err)
+		return
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return values
 }
 
-// EditDistanceAttr is an optional argument to EditDistance.
-type EditDistanceAttr func(optionalAttr)
+// MapStageAttr is an optional argument to MapStage.
+type MapStageAttr func(optionalAttr)
 
-// EditDistanceNormalize sets the optional normalize attribute to value.
+// MapStageCapacity sets the optional capacity attribute to value.
 //
-// value: boolean (if true, edit distances are normalized by length of truth).
+// value: Maximum number of elements in the Staging Area. If > 0, inserts
+// on the container will block when the capacity is reached.
+// If not specified, defaults to 0
 //
-// The output is:
-// If not specified, defaults to true
-func EditDistanceNormalize(value bool) EditDistanceAttr {
+// REQUIRES: value >= 0
+func MapStageCapacity(value int64) MapStageAttr {
 	return func(m optionalAttr) {
-		m["normalize"] = value
+		m["capacity"] = value
 	}
 }
 
-// Computes the (possibly normalized) Levenshtein Edit Distance.
-//
-// The inputs are variable-length sequences provided by SparseTensors
-//   (hypothesis_indices, hypothesis_values, hypothesis_shape)
-// and
-//   (truth_indices, truth_values, truth_shape).
-//
-// The inputs are:
+// MapStageMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
 //
-// Arguments:
-//	hypothesis_indices: The indices of the hypothesis list SparseTensor.
-// This is an N x R int64 matrix.
-//	hypothesis_values: The values of the hypothesis list SparseTensor.
-// This is an N-length vector.
-//	hypothesis_shape: The shape of the hypothesis list SparseTensor.
-// This is an R-length vector.
-//	truth_indices: The indices of the truth list SparseTensor.
-// This is an M x R int64 matrix.
-//	truth_values: The values of the truth list SparseTensor.
-// This is an M-length vector.
-//	truth_shape: truth indices, vector.
+// REQUIRES: value >= 0
+func MapStageMemoryLimit(value int64) MapStageAttr {
+	return func(m optionalAttr) {
+		m["memory_limit"] = value
+	}
+}
+
+// MapStageContainer sets the optional container attribute to value.
 //
-// Returns A dense float tensor with rank R - 1.
+// value: If non-empty, this queue is placed in the given container. Otherwise,
+// a default container is used.
+// If not specified, defaults to ""
+func MapStageContainer(value string) MapStageAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// MapStageSharedName sets the optional shared_name attribute to value.
 //
-// For the example input:
+// value: It is necessary to match this name to the matching Unstage Op.
+// If not specified, defaults to ""
+func MapStageSharedName(value string) MapStageAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Stage (key, values) in the underlying container which behaves like a hashtable.
 //
-//     // hypothesis represents a 2x1 matrix with variable-length values:
-//     //   (0,0) = ["a"]
-//     //   (1,0) = ["b"]
-//     hypothesis_indices = [[0, 0, 0],
-//                           [1, 0, 0]]
-//     hypothesis_values = ["a", "b"]
-//     hypothesis_shape = [2, 1, 1]
+// Arguments:
+//	key: int64
 //
-//     // truth represents a 2x2 matrix with variable-length values:
-//     //   (0,0) = []
-//     //   (0,1) = ["a"]
-//     //   (1,0) = ["b", "c"]
-//     //   (1,1) = ["a"]
-//     truth_indices = [[0, 1, 0],
-//                      [1, 0, 0],
-//                      [1, 0, 1],
-//                      [1, 1, 0]]
-//     truth_values = ["a", "b", "c", "a"]
-//     truth_shape = [2, 2, 2]
-//     normalize = true
+//	values: a list of tensors
+// dtypes A list of data types that inserted values should adhere to.
 //
-// The output will be:
 //
-//     // output is a 2x2 matrix with edit distances normalized by truth lengths.
-//     output = [[inf, 1.0],  // (0,0): no truth, (0,1): no hypothesis
-//               [0.5, 1.0]]  // (1,0): addition, (1,1): no hypothesis
-func EditDistance(scope *Scope, hypothesis_indices tf.Output, hypothesis_values tf.Output, hypothesis_shape tf.Output, truth_indices tf.Output, truth_values tf.Output, truth_shape tf.Output, optional ...EditDistanceAttr) (output tf.Output) {
+// Returns the created operation.
+func MapStage(scope *Scope, key tf.Output, indices tf.Output, values []tf.Output, dtypes []tf.DataType, optional ...MapStageAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"dtypes": dtypes}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "EditDistance",
+		Type: "MapStage",
 		Input: []tf.Input{
-			hypothesis_indices, hypothesis_values, hypothesis_shape, truth_indices, truth_values, truth_shape,
+			key, indices, tf.OutputList(values),
 		},
 		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// Gather slices from `params` into a Tensor with shape specified by `indices`.
-//
-// `indices` is an K-dimensional integer tensor, best thought of as a
-// (K-1)-dimensional tensor of indices into `params`, where each element defines a
-// slice of `params`:
-//
-//     output[i_0, ..., i_{K-2}] = params[indices[i0, ..., i_{K-2}]]
+// MapUnstageAttr is an optional argument to MapUnstage.
+type MapUnstageAttr func(optionalAttr)
+
+// MapUnstageCapacity sets the optional capacity attribute to value.
+// If not specified, defaults to 0
 //
-// Whereas in @{tf.gather} `indices` defines slices into the first
-// dimension of `params`, in `tf.gather_nd`, `indices` defines slices into the
-// first `N` dimensions of `params`, where `N = indices.shape[-1]`.
+// REQUIRES: value >= 0
+func MapUnstageCapacity(value int64) MapUnstageAttr {
+	return func(m optionalAttr) {
+		m["capacity"] = value
+	}
+}
+
+// MapUnstageMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
 //
-// The last dimension of `indices` can be at most the rank of
-// `params`:
+// REQUIRES: value >= 0
+func MapUnstageMemoryLimit(value int64) MapUnstageAttr {
+	return func(m optionalAttr) {
+		m["memory_limit"] = value
+	}
+}
+
+// MapUnstageContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func MapUnstageContainer(value string) MapUnstageAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// MapUnstageSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func MapUnstageSharedName(value string) MapUnstageAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Op removes and returns the values associated with the key
+//
+// from the underlying container.   If the underlying container
+// does not contain this key, the op will block until it does.
+func MapUnstage(scope *Scope, key tf.Output, indices tf.Output, dtypes []tf.DataType, optional ...MapUnstageAttr) (values []tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dtypes": dtypes}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "MapUnstage",
+		Input: []tf.Input{
+			key, indices,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
+		scope.UpdateErr("MapUnstage", err)
+		return
+	}
+	return values
+}
+
+// MapSizeAttr is an optional argument to MapSize.
+type MapSizeAttr func(optionalAttr)
+
+// MapSizeCapacity sets the optional capacity attribute to value.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func MapSizeCapacity(value int64) MapSizeAttr {
+	return func(m optionalAttr) {
+		m["capacity"] = value
+	}
+}
+
+// MapSizeMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func MapSizeMemoryLimit(value int64) MapSizeAttr {
+	return func(m optionalAttr) {
+		m["memory_limit"] = value
+	}
+}
+
+// MapSizeContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func MapSizeContainer(value string) MapSizeAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// MapSizeSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func MapSizeSharedName(value string) MapSizeAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Op returns the number of elements in the underlying container.
+func MapSize(scope *Scope, dtypes []tf.DataType, optional ...MapSizeAttr) (size tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dtypes": dtypes}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "MapSize",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// MapIncompleteSizeAttr is an optional argument to MapIncompleteSize.
+type MapIncompleteSizeAttr func(optionalAttr)
+
+// MapIncompleteSizeCapacity sets the optional capacity attribute to value.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func MapIncompleteSizeCapacity(value int64) MapIncompleteSizeAttr {
+	return func(m optionalAttr) {
+		m["capacity"] = value
+	}
+}
+
+// MapIncompleteSizeMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func MapIncompleteSizeMemoryLimit(value int64) MapIncompleteSizeAttr {
+	return func(m optionalAttr) {
+		m["memory_limit"] = value
+	}
+}
+
+// MapIncompleteSizeContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func MapIncompleteSizeContainer(value string) MapIncompleteSizeAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// MapIncompleteSizeSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func MapIncompleteSizeSharedName(value string) MapIncompleteSizeAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Op returns the number of incomplete elements in the underlying container.
+func MapIncompleteSize(scope *Scope, dtypes []tf.DataType, optional ...MapIncompleteSizeAttr) (size tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dtypes": dtypes}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "MapIncompleteSize",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// OrderedMapUnstageAttr is an optional argument to OrderedMapUnstage.
+type OrderedMapUnstageAttr func(optionalAttr)
+
+// OrderedMapUnstageCapacity sets the optional capacity attribute to value.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func OrderedMapUnstageCapacity(value int64) OrderedMapUnstageAttr {
+	return func(m optionalAttr) {
+		m["capacity"] = value
+	}
+}
+
+// OrderedMapUnstageMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func OrderedMapUnstageMemoryLimit(value int64) OrderedMapUnstageAttr {
+	return func(m optionalAttr) {
+		m["memory_limit"] = value
+	}
+}
+
+// OrderedMapUnstageContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func OrderedMapUnstageContainer(value string) OrderedMapUnstageAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// OrderedMapUnstageSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func OrderedMapUnstageSharedName(value string) OrderedMapUnstageAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Op removes and returns the values associated with the key
+//
+// from the underlying container.   If the underlying container
+// does not contain this key, the op will block until it does.
+func OrderedMapUnstage(scope *Scope, key tf.Output, indices tf.Output, dtypes []tf.DataType, optional ...OrderedMapUnstageAttr) (values []tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dtypes": dtypes}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "OrderedMapUnstage",
+		Input: []tf.Input{
+			key, indices,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
+		scope.UpdateErr("OrderedMapUnstage", err)
+		return
+	}
+	return values
+}
+
+// OrderedMapSizeAttr is an optional argument to OrderedMapSize.
+type OrderedMapSizeAttr func(optionalAttr)
+
+// OrderedMapSizeCapacity sets the optional capacity attribute to value.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func OrderedMapSizeCapacity(value int64) OrderedMapSizeAttr {
+	return func(m optionalAttr) {
+		m["capacity"] = value
+	}
+}
+
+// OrderedMapSizeMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func OrderedMapSizeMemoryLimit(value int64) OrderedMapSizeAttr {
+	return func(m optionalAttr) {
+		m["memory_limit"] = value
+	}
+}
+
+// OrderedMapSizeContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func OrderedMapSizeContainer(value string) OrderedMapSizeAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// OrderedMapSizeSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func OrderedMapSizeSharedName(value string) OrderedMapSizeAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Op returns the number of elements in the underlying container.
+func OrderedMapSize(scope *Scope, dtypes []tf.DataType, optional ...OrderedMapSizeAttr) (size tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dtypes": dtypes}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "OrderedMapSize",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// ShapeNAttr is an optional argument to ShapeN.
+type ShapeNAttr func(optionalAttr)
+
+// ShapeNOutType sets the optional out_type attribute to value.
+// If not specified, defaults to DT_INT32
+func ShapeNOutType(value tf.DataType) ShapeNAttr {
+	return func(m optionalAttr) {
+		m["out_type"] = value
+	}
+}
+
+// Returns shape of tensors.
+//
+// This operation returns N 1-D integer tensors representing shape of `input[i]s`.
+func ShapeN(scope *Scope, input []tf.Output, optional ...ShapeNAttr) (output []tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ShapeN",
+		Input: []tf.Input{
+			tf.OutputList(input),
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
+		scope.UpdateErr("ShapeN", err)
+		return
+	}
+	return output
+}
+
+// UniformCandidateSamplerAttr is an optional argument to UniformCandidateSampler.
+type UniformCandidateSamplerAttr func(optionalAttr)
+
+// UniformCandidateSamplerSeed sets the optional seed attribute to value.
+//
+// value: If either seed or seed2 are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func UniformCandidateSamplerSeed(value int64) UniformCandidateSamplerAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// UniformCandidateSamplerSeed2 sets the optional seed2 attribute to value.
+//
+// value: An second seed to avoid seed collision.
+// If not specified, defaults to 0
+func UniformCandidateSamplerSeed2(value int64) UniformCandidateSamplerAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Generates labels for candidate sampling with a uniform distribution.
+//
+// See explanations of candidate sampling and the data formats at
+// go/candidate-sampling.
+//
+// For each batch, this op picks a single set of sampled candidate labels.
+//
+// The advantages of sampling candidates per-batch are simplicity and the
+// possibility of efficient dense matrix multiplication. The disadvantage is that
+// the sampled candidates must be chosen independently of the context and of the
+// true labels.
+//
+// Arguments:
+//	true_classes: A batch_size * num_true matrix, in which each row contains the
+// IDs of the num_true target_classes in the corresponding original label.
+//	num_true: Number of true labels per context.
+//	num_sampled: Number of candidates to randomly sample.
+//	unique: If unique is true, we sample with rejection, so that all sampled
+// candidates in a batch are unique. This requires some approximation to
+// estimate the post-rejection sampling probabilities.
+//	range_max: The sampler will sample integers from the interval [0, range_max).
+//
+// Returns A vector of length num_sampled, in which each element is
+// the ID of a sampled candidate.A batch_size * num_true matrix, representing
+// the number of times each candidate is expected to occur in a batch
+// of sampled candidates. If unique=true, then this is a probability.A vector of length num_sampled, for each sampled
+// candidate representing the number of times the candidate is expected
+// to occur in a batch of sampled candidates.  If unique=true, then this is a
+// probability.
+func UniformCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, range_max int64, optional ...UniformCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique, "range_max": range_max}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "UniformCandidateSampler",
+		Input: []tf.Input{
+			true_classes,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// CTCLossAttr is an optional argument to CTCLoss.
+type CTCLossAttr func(optionalAttr)
+
+// CTCLossPreprocessCollapseRepeated sets the optional preprocess_collapse_repeated attribute to value.
+//
+// value: Scalar, if true then repeated labels are
+// collapsed prior to the CTC calculation.
+// If not specified, defaults to false
+func CTCLossPreprocessCollapseRepeated(value bool) CTCLossAttr {
+	return func(m optionalAttr) {
+		m["preprocess_collapse_repeated"] = value
+	}
+}
+
+// CTCLossCtcMergeRepeated sets the optional ctc_merge_repeated attribute to value.
+//
+// value: Scalar.  If set to false, *during* CTC calculation
+// repeated non-blank labels will not be merged and are interpreted as
+// individual labels.  This is a simplified version of CTC.
+// If not specified, defaults to true
+func CTCLossCtcMergeRepeated(value bool) CTCLossAttr {
+	return func(m optionalAttr) {
+		m["ctc_merge_repeated"] = value
+	}
+}
+
+// CTCLossIgnoreLongerOutputsThanInputs sets the optional ignore_longer_outputs_than_inputs attribute to value.
+//
+// value: Scalar. If set to true, during CTC
+// calculation, items that have longer output sequences than input sequences
+// are skipped: they don't contribute to the loss term and have zero-gradient.
+// If not specified, defaults to false
+func CTCLossIgnoreLongerOutputsThanInputs(value bool) CTCLossAttr {
+	return func(m optionalAttr) {
+		m["ignore_longer_outputs_than_inputs"] = value
+	}
+}
+
+// Calculates the CTC Loss (log probability) for each batch entry.  Also calculates
+//
+// the gradient.  This class performs the softmax operation for you, so inputs
+// should be e.g. linear projections of outputs by an LSTM.
+//
+// Arguments:
+//	inputs: 3-D, shape: `(max_time x batch_size x num_classes)`, the logits.
+//	labels_indices: The indices of a `SparseTensor<int32, 2>`.
+// `labels_indices(i, :) == [b, t]` means `labels_values(i)` stores the id for
+// `(batch b, time t)`.
+//	labels_values: The values (labels) associated with the given batch and time.
+//	sequence_length: A vector containing sequence lengths (batch).
+//
+// Returns A vector (batch) containing log-probabilities.The gradient of `loss`.  3-D, shape:
+// `(max_time x batch_size x num_classes)`.
+func CTCLoss(scope *Scope, inputs tf.Output, labels_indices tf.Output, labels_values tf.Output, sequence_length tf.Output, optional ...CTCLossAttr) (loss tf.Output, gradient tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "CTCLoss",
+		Input: []tf.Input{
+			inputs, labels_indices, labels_values, sequence_length,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
+}
+
+// CTCGreedyDecoderAttr is an optional argument to CTCGreedyDecoder.
+type CTCGreedyDecoderAttr func(optionalAttr)
+
+// CTCGreedyDecoderMergeRepeated sets the optional merge_repeated attribute to value.
+//
+// value: If True, merge repeated classes in output.
+// If not specified, defaults to false
+func CTCGreedyDecoderMergeRepeated(value bool) CTCGreedyDecoderAttr {
+	return func(m optionalAttr) {
+		m["merge_repeated"] = value
+	}
+}
+
+// Performs greedy decoding on the logits given in inputs.
+//
+// A note about the attribute merge_repeated: if enabled, when
+// consecutive logits' maximum indices are the same, only the first of
+// these is emitted.  Labeling the blank '*', the sequence "A B B * B B"
+// becomes "A B B" if merge_repeated = True and "A B B B B" if
+// merge_repeated = False.
+//
+// Regardless of the value of merge_repeated, if the maximum index of a given
+// time and batch corresponds to the blank, index `(num_classes - 1)`, no new
+// element is emitted.
+//
+// Arguments:
+//	inputs: 3-D, shape: `(max_time x batch_size x num_classes)`, the logits.
+//	sequence_length: A vector containing sequence lengths, size `(batch_size)`.
+//
+// Returns Indices matrix, size `(total_decoded_outputs x 2)`,
+// of a `SparseTensor<int64, 2>`.  The rows store: [batch, time].Values vector, size: `(total_decoded_outputs)`,
+// of a `SparseTensor<int64, 2>`.  The vector stores the decoded classes.Shape vector, size `(2)`, of the decoded SparseTensor.
+// Values are: `[batch_size, max_decoded_length]`.Matrix, size `(batch_size x 1)`, containing sequence
+// log-probabilities.
+func CTCGreedyDecoder(scope *Scope, inputs tf.Output, sequence_length tf.Output, optional ...CTCGreedyDecoderAttr) (decoded_indices tf.Output, decoded_values tf.Output, decoded_shape tf.Output, log_probability tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "CTCGreedyDecoder",
+		Input: []tf.Input{
+			inputs, sequence_length,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3)
+}
+
+// Forwards `data` to the output port determined by `pred`.
+//
+// If `pred` is true, the `data` input is forwarded to `output_true`. Otherwise,
+// the data goes to `output_false`.
+//
+// See also `RefSwitch` and `Merge`.
+//
+// Arguments:
+//	data: The tensor to be forwarded to the appropriate output.
+//	pred: A scalar that specifies which output port will receive data.
+//
+// Returns If `pred` is false, data will be forwarded to this output.If `pred` is true, data will be forwarded to this output.
+func Switch(scope *Scope, data tf.Output, pred tf.Output) (output_false tf.Output, output_true tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Switch",
+		Input: []tf.Input{
+			data, pred,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
+}
+
+// Add all input tensors element wise.
+//
+// Arguments:
+//	inputs: Must all be the same size and shape.
+func AddN(scope *Scope, inputs []tf.Output) (sum tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "AddN",
+		Input: []tf.Input{
+			tf.OutputList(inputs),
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// EnterAttr is an optional argument to Enter.
+type EnterAttr func(optionalAttr)
+
+// EnterIsConstant sets the optional is_constant attribute to value.
+//
+// value: If true, the output is constant within the child frame.
+// If not specified, defaults to false
+func EnterIsConstant(value bool) EnterAttr {
+	return func(m optionalAttr) {
+		m["is_constant"] = value
+	}
+}
+
+// EnterParallelIterations sets the optional parallel_iterations attribute to value.
+//
+// value: The number of iterations allowed to run in parallel.
+// If not specified, defaults to 10
+func EnterParallelIterations(value int64) EnterAttr {
+	return func(m optionalAttr) {
+		m["parallel_iterations"] = value
+	}
+}
+
+// Creates or finds a child frame, and makes `data` available to the child frame.
 //
-//     indices.shape[-1] <= params.rank
+// This op is used together with `Exit` to create loops in the graph.
+// The unique `frame_name` is used by the `Executor` to identify frames. If
+// `is_constant` is true, `output` is a constant in the child frame; otherwise
+// it may be changed in the child frame. At most `parallel_iterations` iterations
+// are run in parallel in the child frame.
 //
-// The last dimension of `indices` corresponds to elements
-// (if `indices.shape[-1] == params.rank`) or slices
-// (if `indices.shape[-1] < params.rank`) along dimension `indices.shape[-1]`
-// of `params`.  The output tensor has shape
+// Arguments:
+//	data: The tensor to be made available to the child frame.
+//	frame_name: The name of the child frame.
 //
-//     indices.shape[:-1] + params.shape[indices.shape[-1]:]
+// Returns The same tensor as `data`.
+func Enter(scope *Scope, data tf.Output, frame_name string, optional ...EnterAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"frame_name": frame_name}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Enter",
+		Input: []tf.Input{
+			data,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Produce a string tensor that encodes the state of a Reader.
 //
-// Some examples below.
+// Not all Readers support being serialized, so this can produce an
+// Unimplemented error.
 //
-// Simple indexing into a matrix:
+// Arguments:
+//	reader_handle: Handle to a Reader.
+func ReaderSerializeStateV2(scope *Scope, reader_handle tf.Output) (state tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ReaderSerializeStateV2",
+		Input: []tf.Input{
+			reader_handle,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Exits the current frame to its parent frame.
 //
-// ```python
-//     indices = [[0, 0], [1, 1]]
-//     params = [['a', 'b'], ['c', 'd']]
-//     output = ['a', 'd']
-// ```
+// Exit makes its input `data` available to the parent frame.
 //
-// Slice indexing into a matrix:
+// Arguments:
+//	data: The tensor to be made available to the parent frame.
 //
-// ```python
-//     indices = [[1], [0]]
-//     params = [['a', 'b'], ['c', 'd']]
-//     output = [['c', 'd'], ['a', 'b']]
-// ```
+// Returns The same tensor as `data`.
+func Exit(scope *Scope, data tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Exit",
+		Input: []tf.Input{
+			data,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Returns a copy of the input tensor.
+func Snapshot(scope *Scope, input tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Snapshot",
+		Input: []tf.Input{
+			input,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// AbortAttr is an optional argument to Abort.
+type AbortAttr func(optionalAttr)
+
+// AbortErrorMsg sets the optional error_msg attribute to value.
+//
+// value: A string which is the message associated with the exception.
+// If not specified, defaults to ""
+func AbortErrorMsg(value string) AbortAttr {
+	return func(m optionalAttr) {
+		m["error_msg"] = value
+	}
+}
+
+// AbortExitWithoutError sets the optional exit_without_error attribute to value.
+// If not specified, defaults to false
+func AbortExitWithoutError(value bool) AbortAttr {
+	return func(m optionalAttr) {
+		m["exit_without_error"] = value
+	}
+}
+
+// Raise a exception to abort the process when called.
+//
+// If exit_without_error is true, the process will exit normally,
+// otherwise it will exit with a SIGABORT signal.
+//
+// Returns nothing but an exception.
+//
+// Returns the created operation.
+func Abort(scope *Scope, optional ...AbortAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Abort",
+
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
+// FixedUnigramCandidateSamplerAttr is an optional argument to FixedUnigramCandidateSampler.
+type FixedUnigramCandidateSamplerAttr func(optionalAttr)
+
+// FixedUnigramCandidateSamplerVocabFile sets the optional vocab_file attribute to value.
+//
+// value: Each valid line in this file (which should have a CSV-like format)
+// corresponds to a valid word ID. IDs are in sequential order, starting from
+// num_reserved_ids. The last entry in each line is expected to be a value
+// corresponding to the count or relative probability. Exactly one of vocab_file
+// and unigrams needs to be passed to this op.
+// If not specified, defaults to ""
+func FixedUnigramCandidateSamplerVocabFile(value string) FixedUnigramCandidateSamplerAttr {
+	return func(m optionalAttr) {
+		m["vocab_file"] = value
+	}
+}
+
+// FixedUnigramCandidateSamplerDistortion sets the optional distortion attribute to value.
+//
+// value: The distortion is used to skew the unigram probability distribution.
+// Each weight is first raised to the distortion's power before adding to the
+// internal unigram distribution. As a result, distortion = 1.0 gives regular
+// unigram sampling (as defined by the vocab file), and distortion = 0.0 gives
+// a uniform distribution.
+// If not specified, defaults to 1
+func FixedUnigramCandidateSamplerDistortion(value float32) FixedUnigramCandidateSamplerAttr {
+	return func(m optionalAttr) {
+		m["distortion"] = value
+	}
+}
+
+// FixedUnigramCandidateSamplerNumReservedIds sets the optional num_reserved_ids attribute to value.
+//
+// value: Optionally some reserved IDs can be added in the range [0,
+// ..., num_reserved_ids) by the users. One use case is that a special unknown
+// word token is used as ID 0. These IDs will have a sampling probability of 0.
+// If not specified, defaults to 0
+func FixedUnigramCandidateSamplerNumReservedIds(value int64) FixedUnigramCandidateSamplerAttr {
+	return func(m optionalAttr) {
+		m["num_reserved_ids"] = value
+	}
+}
+
+// FixedUnigramCandidateSamplerNumShards sets the optional num_shards attribute to value.
+//
+// value: A sampler can be used to sample from a subset of the original range
+// in order to speed up the whole computation through parallelism. This parameter
+// (together with 'shard') indicates the number of partitions that are being
+// used in the overall computation.
+// If not specified, defaults to 1
+//
+// REQUIRES: value >= 1
+func FixedUnigramCandidateSamplerNumShards(value int64) FixedUnigramCandidateSamplerAttr {
+	return func(m optionalAttr) {
+		m["num_shards"] = value
+	}
+}
+
+// FixedUnigramCandidateSamplerShard sets the optional shard attribute to value.
+//
+// value: A sampler can be used to sample from a subset of the original range
+// in order to speed up the whole computation through parallelism. This parameter
+// (together with 'num_shards') indicates the particular partition number of a
+// sampler op, when partitioning is being used.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func FixedUnigramCandidateSamplerShard(value int64) FixedUnigramCandidateSamplerAttr {
+	return func(m optionalAttr) {
+		m["shard"] = value
+	}
+}
+
+// FixedUnigramCandidateSamplerUnigrams sets the optional unigrams attribute to value.
+//
+// value: A list of unigram counts or probabilities, one per ID in sequential
+// order. Exactly one of vocab_file and unigrams should be passed to this op.
+// If not specified, defaults to <>
+func FixedUnigramCandidateSamplerUnigrams(value []float32) FixedUnigramCandidateSamplerAttr {
+	return func(m optionalAttr) {
+		m["unigrams"] = value
+	}
+}
+
+// FixedUnigramCandidateSamplerSeed sets the optional seed attribute to value.
+//
+// value: If either seed or seed2 are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func FixedUnigramCandidateSamplerSeed(value int64) FixedUnigramCandidateSamplerAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// FixedUnigramCandidateSamplerSeed2 sets the optional seed2 attribute to value.
+//
+// value: An second seed to avoid seed collision.
+// If not specified, defaults to 0
+func FixedUnigramCandidateSamplerSeed2(value int64) FixedUnigramCandidateSamplerAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Generates labels for candidate sampling with a learned unigram distribution.
+//
+// A unigram sampler could use a fixed unigram distribution read from a
+// file or passed in as an in-memory array instead of building up the distribution
+// from data on the fly. There is also an option to skew the distribution by
+// applying a distortion power to the weights.
 //
-// Indexing into a 3-tensor:
+// The vocabulary file should be in CSV-like format, with the last field
+// being the weight associated with the word.
 //
-// ```python
-//     indices = [[1]]
-//     params = [[['a0', 'b0'], ['c0', 'd0']],
-//               [['a1', 'b1'], ['c1', 'd1']]]
-//     output = [[['a1', 'b1'], ['c1', 'd1']]]
+// For each batch, this op picks a single set of sampled candidate labels.
 //
+// The advantages of sampling candidates per-batch are simplicity and the
+// possibility of efficient dense matrix multiplication. The disadvantage is that
+// the sampled candidates must be chosen independently of the context and of the
+// true labels.
 //
-//     indices = [[0, 1], [1, 0]]
-//     params = [[['a0', 'b0'], ['c0', 'd0']],
-//               [['a1', 'b1'], ['c1', 'd1']]]
-//     output = [['c0', 'd0'], ['a1', 'b1']]
+// Arguments:
+//	true_classes: A batch_size * num_true matrix, in which each row contains the
+// IDs of the num_true target_classes in the corresponding original label.
+//	num_true: Number of true labels per context.
+//	num_sampled: Number of candidates to randomly sample.
+//	unique: If unique is true, we sample with rejection, so that all sampled
+// candidates in a batch are unique. This requires some approximation to
+// estimate the post-rejection sampling probabilities.
+//	range_max: The sampler will sample integers from the interval [0, range_max).
 //
+// Returns A vector of length num_sampled, in which each element is
+// the ID of a sampled candidate.A batch_size * num_true matrix, representing
+// the number of times each candidate is expected to occur in a batch
+// of sampled candidates. If unique=true, then this is a probability.A vector of length num_sampled, for each sampled
+// candidate representing the number of times the candidate is expected
+// to occur in a batch of sampled candidates.  If unique=true, then this is a
+// probability.
+func FixedUnigramCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, range_max int64, optional ...FixedUnigramCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique, "range_max": range_max}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "FixedUnigramCandidateSampler",
+		Input: []tf.Input{
+			true_classes,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// WholeFileReaderV2Attr is an optional argument to WholeFileReaderV2.
+type WholeFileReaderV2Attr func(optionalAttr)
+
+// WholeFileReaderV2Container sets the optional container attribute to value.
 //
-//     indices = [[0, 0, 1], [1, 0, 1]]
-//     params = [[['a0', 'b0'], ['c0', 'd0']],
-//               [['a1', 'b1'], ['c1', 'd1']]]
-//     output = ['b0', 'b1']
-// ```
+// value: If non-empty, this reader is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func WholeFileReaderV2Container(value string) WholeFileReaderV2Attr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// WholeFileReaderV2SharedName sets the optional shared_name attribute to value.
 //
-// Batched indexing into a matrix:
+// value: If non-empty, this reader is named in the given bucket
+// with this shared_name. Otherwise, the node name is used instead.
+// If not specified, defaults to ""
+func WholeFileReaderV2SharedName(value string) WholeFileReaderV2Attr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// A Reader that outputs the entire contents of a file as a value.
 //
-// ```python
-//     indices = [[[0, 0]], [[0, 1]]]
-//     params = [['a', 'b'], ['c', 'd']]
-//     output = [['a'], ['b']]
-// ```
+// To use, enqueue filenames in a Queue.  The output of ReaderRead will
+// be a filename (key) and the contents of that file (value).
 //
-// Batched slice indexing into a matrix:
+// Returns The handle to reference the Reader.
+func WholeFileReaderV2(scope *Scope, optional ...WholeFileReaderV2Attr) (reader_handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "WholeFileReaderV2",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Transforms a tf.Example proto (as a string) into typed tensors.
 //
-// ```python
-//     indices = [[[1]], [[0]]]
-//     params = [['a', 'b'], ['c', 'd']]
-//     output = [[['c', 'd']], [['a', 'b']]]
-// ```
+// Arguments:
+//	serialized: A vector containing a batch of binary serialized Example protos.
+//	dense_defaults: A list of Tensors (some may be empty), whose length matches
+// the length of `dense_keys`. dense_defaults[j] provides default values
+// when the example's feature_map lacks dense_key[j].  If an empty Tensor is
+// provided for dense_defaults[j], then the Feature dense_keys[j] is required.
+// The input type is inferred from dense_defaults[j], even when it's empty.
+// If dense_defaults[j] is not empty, and dense_shapes[j] is fully defined,
+// then the shape of dense_defaults[j] must match that of dense_shapes[j].
+// If dense_shapes[j] has an undefined major dimension (variable strides dense
+// feature), dense_defaults[j] must contain a single element:
+// the padding element.
+//	num_sparse: The number of sparse features to be parsed from the example. This
+// must match the lengths of `sparse_keys` and `sparse_types`.
+//	sparse_keys: A list of `num_sparse` strings.
+// The keys expected in the Examples' features associated with sparse values.
+//	dense_keys: The keys expected in the Examples' features associated with dense
+// values.
+//	sparse_types: A list of `num_sparse` types; the data types of data in each
+// Feature given in sparse_keys.
+// Currently the ParseSingleExample op supports DT_FLOAT (FloatList),
+// DT_INT64 (Int64List), and DT_STRING (BytesList).
+//	dense_shapes: The shapes of data in each Feature given in dense_keys.
+// The length of this list must match the length of `dense_keys`.  The
+// number of elements in the Feature corresponding to dense_key[j] must
+// always equal dense_shapes[j].NumEntries().  If dense_shapes[j] ==
+// (D0, D1, ..., DN) then the shape of output Tensor dense_values[j]
+// will be (D0, D1, ..., DN): In the case dense_shapes[j] = (-1, D1,
+// ..., DN), the shape of the output Tensor dense_values[j] will be (M,
+// D1, .., DN), where M is the number of blocks of elements of length
+// D1 * .... * DN, in the input.
+func ParseSingleExample(scope *Scope, serialized tf.Output, dense_defaults []tf.Output, num_sparse int64, sparse_keys []string, dense_keys []string, sparse_types []tf.DataType, dense_shapes []tf.Shape) (sparse_indices []tf.Output, sparse_values []tf.Output, sparse_shapes []tf.Output, dense_values []tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"num_sparse": num_sparse, "sparse_keys": sparse_keys, "dense_keys": dense_keys, "sparse_types": sparse_types, "dense_shapes": dense_shapes}
+	opspec := tf.OpSpec{
+		Type: "ParseSingleExample",
+		Input: []tf.Input{
+			serialized, tf.OutputList(dense_defaults),
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if sparse_indices, idx, err = makeOutputList(op, idx, "sparse_indices"); err != nil {
+		scope.UpdateErr("ParseSingleExample", err)
+		return
+	}
+	if sparse_values, idx, err = makeOutputList(op, idx, "sparse_values"); err != nil {
+		scope.UpdateErr("ParseSingleExample", err)
+		return
+	}
+	if sparse_shapes, idx, err = makeOutputList(op, idx, "sparse_shapes"); err != nil {
+		scope.UpdateErr("ParseSingleExample", err)
+		return
+	}
+	if dense_values, idx, err = makeOutputList(op, idx, "dense_values"); err != nil {
+		scope.UpdateErr("ParseSingleExample", err)
+		return
+	}
+	return sparse_indices, sparse_values, sparse_shapes, dense_values
+}
+
+// Elementwise computes the bitwise AND of `x` and `y`.
 //
-// Batched indexing into a 3-tensor:
+// The result will have those bits set, that are set in both `x` and `y`. The
+// computation is performed on the underlying representations of `x` and `y`.
+func BitwiseAnd(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "BitwiseAnd",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Elementwise computes the bitwise left-shift of `x` and `y`.
 //
-// ```python
-//     indices = [[[1]], [[0]]]
-//     params = [[['a0', 'b0'], ['c0', 'd0']],
-//               [['a1', 'b1'], ['c1', 'd1']]]
-//     output = [[[['a1', 'b1'], ['c1', 'd1']]],
-//               [[['a0', 'b0'], ['c0', 'd0']]]]
+// If `y` is negative, or greater than or equal to the width of `x` in bits the
+// result is implementation defined.
+func LeftShift(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "LeftShift",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Elementwise computes the bitwise right-shift of `x` and `y`.
 //
-//     indices = [[[0, 1], [1, 0]], [[0, 0], [1, 1]]]
-//     params = [[['a0', 'b0'], ['c0', 'd0']],
-//               [['a1', 'b1'], ['c1', 'd1']]]
-//     output = [[['c0', 'd0'], ['a1', 'b1']],
-//               [['a0', 'b0'], ['c1', 'd1']]]
+// Performs a logical shift for unsigned integer types, and an arithmetic shift
+// for signed integer types.
 //
+// If `y` is negative, or greater than or equal to than the width of `x` in bits
+// the result is implementation defined.
+func RightShift(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "RightShift",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Adjust the hue of one or more images.
 //
-//     indices = [[[0, 0, 1], [1, 0, 1]], [[0, 1, 1], [1, 1, 0]]]
-//     params = [[['a0', 'b0'], ['c0', 'd0']],
-//               [['a1', 'b1'], ['c1', 'd1']]]
-//     output = [['b0', 'b1'], ['d0', 'c1']]
-// ```
+// `images` is a tensor of at least 3 dimensions.  The last dimension is
+// interpretted as channels, and must be three.
+//
+// The input image is considered in the RGB colorspace. Conceptually, the RGB
+// colors are first mapped into HSV. A delta is then applied all the hue values,
+// and then remapped back to RGB colorspace.
 //
 // Arguments:
-//	params: The tensor from which to gather values.
-//	indices: Index tensor.
+//	images: Images to adjust.  At least 3-D.
+//	delta: A float delta to add to the hue.
 //
-// Returns Values from `params` gathered from indices given by `indices`, with
-// shape `indices.shape[:-1] + params.shape[indices.shape[-1]:]`.
-func GatherNd(scope *Scope, params tf.Output, indices tf.Output) (output tf.Output) {
+// Returns The hue-adjusted image or images.
+func AdjustHue(scope *Scope, images tf.Output, delta tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "GatherNd",
+		Type: "AdjustHue",
 		Input: []tf.Input{
-			params, indices,
+			images, delta,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Eagerly executes a python function to compute func(input)->output. The
+// AvgPool3DGradAttr is an optional argument to AvgPool3DGrad.
+type AvgPool3DGradAttr func(optionalAttr)
+
+// AvgPool3DGradDataFormat sets the optional data_format attribute to value.
 //
-// semantics of the input, output, and attributes are the same as those for
-// PyFunc.
-func EagerPyFunc(scope *Scope, input []tf.Output, token string, Tout []tf.DataType) (output []tf.Output) {
+// value: The data format of the input and output data. With the
+// default format "NDHWC", the data is stored in the order of:
+//     [batch, in_depth, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCDHW", the data storage order is:
+//     [batch, in_channels, in_depth, in_height, in_width].
+// If not specified, defaults to "NDHWC"
+func AvgPool3DGradDataFormat(value string) AvgPool3DGradAttr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// Computes gradients of average pooling function.
+//
+// Arguments:
+//	orig_input_shape: The original input dimensions.
+//	grad: Output backprop of shape `[batch, depth, rows, cols, channels]`.
+//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
+// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
+//	strides: 1-D tensor of length 5. The stride of the sliding window for each
+// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+//	padding: The type of padding algorithm to use.
+//
+// Returns The backprop for input.
+func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...AvgPool3DGradAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"token": token, "Tout": Tout}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "EagerPyFunc",
+		Type: "AvgPool3DGrad",
 		Input: []tf.Input{
-			tf.OutputList(input),
+			orig_input_shape, grad,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
+	return op.Output(0)
+}
+
+// ParseSingleSequenceExampleAttr is an optional argument to ParseSingleSequenceExample.
+type ParseSingleSequenceExampleAttr func(optionalAttr)
+
+// ParseSingleSequenceExampleContextSparseTypes sets the optional context_sparse_types attribute to value.
+//
+// value: A list of Ncontext_sparse types; the data types of data in
+// each context Feature given in context_sparse_keys.
+// Currently the ParseSingleSequenceExample supports DT_FLOAT (FloatList),
+// DT_INT64 (Int64List), and DT_STRING (BytesList).
+// If not specified, defaults to <>
+//
+// REQUIRES: len(value) >= 0
+func ParseSingleSequenceExampleContextSparseTypes(value []tf.DataType) ParseSingleSequenceExampleAttr {
+	return func(m optionalAttr) {
+		m["context_sparse_types"] = value
 	}
-	var idx int
-	var err error
-	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
-		scope.UpdateErr("EagerPyFunc", err)
-		return
+}
+
+// ParseSingleSequenceExampleFeatureListDenseTypes sets the optional feature_list_dense_types attribute to value.
+// If not specified, defaults to <>
+//
+// REQUIRES: len(value) >= 0
+func ParseSingleSequenceExampleFeatureListDenseTypes(value []tf.DataType) ParseSingleSequenceExampleAttr {
+	return func(m optionalAttr) {
+		m["feature_list_dense_types"] = value
 	}
-	return output
 }
 
-// Stops gradient computation.
+// ParseSingleSequenceExampleContextDenseShapes sets the optional context_dense_shapes attribute to value.
 //
-// When executed in a graph, this op outputs its input tensor as-is.
+// value: A list of Ncontext_dense shapes; the shapes of data in
+// each context Feature given in context_dense_keys.
+// The number of elements in the Feature corresponding to context_dense_key[j]
+// must always equal context_dense_shapes[j].NumEntries().
+// The shape of context_dense_values[j] will match context_dense_shapes[j].
+// If not specified, defaults to <>
 //
-// When building ops to compute gradients, this op prevents the contribution of
-// its inputs to be taken into account.  Normally, the gradient generator adds ops
-// to a graph to compute the derivatives of a specified 'loss' by recursively
-// finding out inputs that contributed to its computation.  If you insert this op
-// in the graph it inputs are masked from the gradient generator.  They are not
-// taken into account for computing gradients.
+// REQUIRES: len(value) >= 0
+func ParseSingleSequenceExampleContextDenseShapes(value []tf.Shape) ParseSingleSequenceExampleAttr {
+	return func(m optionalAttr) {
+		m["context_dense_shapes"] = value
+	}
+}
+
+// ParseSingleSequenceExampleFeatureListSparseTypes sets the optional feature_list_sparse_types attribute to value.
 //
-// This is useful any time you want to compute a value with TensorFlow but need
-// to pretend that the value was a constant. Some examples include:
+// value: A list of Nfeature_list_sparse types; the data types
+// of data in each FeatureList given in feature_list_sparse_keys.
+// Currently the ParseSingleSequenceExample supports DT_FLOAT (FloatList),
+// DT_INT64 (Int64List), and DT_STRING (BytesList).
+// If not specified, defaults to <>
 //
-// *  The *EM* algorithm where the *M-step* should not involve backpropagation
-//    through the output of the *E-step*.
-// *  Contrastive divergence training of Boltzmann machines where, when
-//    differentiating the energy function, the training must not backpropagate
-//    through the graph that generated the samples from the model.
-// *  Adversarial training, where no backprop should happen through the adversarial
-//    example generation process.
-func StopGradient(scope *Scope, input tf.Output) (output tf.Output) {
+// REQUIRES: len(value) >= 0
+func ParseSingleSequenceExampleFeatureListSparseTypes(value []tf.DataType) ParseSingleSequenceExampleAttr {
+	return func(m optionalAttr) {
+		m["feature_list_sparse_types"] = value
+	}
+}
+
+// ParseSingleSequenceExampleFeatureListDenseShapes sets the optional feature_list_dense_shapes attribute to value.
+//
+// value: A list of Nfeature_list_dense shapes; the shapes of
+// data in each FeatureList given in feature_list_dense_keys.
+// The shape of each Feature in the FeatureList corresponding to
+// feature_list_dense_key[j] must always equal
+// feature_list_dense_shapes[j].NumEntries().
+// If not specified, defaults to <>
+//
+// REQUIRES: len(value) >= 0
+func ParseSingleSequenceExampleFeatureListDenseShapes(value []tf.Shape) ParseSingleSequenceExampleAttr {
+	return func(m optionalAttr) {
+		m["feature_list_dense_shapes"] = value
+	}
+}
+
+// Transforms a scalar brain.SequenceExample proto (as strings) into typed tensors.
+//
+// Arguments:
+//	serialized: A scalar containing a binary serialized SequenceExample proto.
+//	feature_list_dense_missing_assumed_empty: A vector listing the
+// FeatureList keys which may be missing from the SequenceExample.  If the
+// associated FeatureList is missing, it is treated as empty.  By default,
+// any FeatureList not listed in this vector must exist in the SequenceExample.
+//	context_sparse_keys: A list of Ncontext_sparse string Tensors (scalars).
+// The keys expected in the Examples' features associated with context_sparse
+// values.
+//	context_dense_keys: A list of Ncontext_dense string Tensors (scalars).
+// The keys expected in the SequenceExamples' context features associated with
+// dense values.
+//	feature_list_sparse_keys: A list of Nfeature_list_sparse string Tensors
+// (scalars).  The keys expected in the FeatureLists associated with sparse
+// values.
+//	feature_list_dense_keys: A list of Nfeature_list_dense string Tensors (scalars).
+// The keys expected in the SequenceExamples' feature_lists associated
+// with lists of dense values.
+//	context_dense_defaults: A list of Ncontext_dense Tensors (some may be empty).
+// context_dense_defaults[j] provides default values
+// when the SequenceExample's context map lacks context_dense_key[j].
+// If an empty Tensor is provided for context_dense_defaults[j],
+// then the Feature context_dense_keys[j] is required.
+// The input type is inferred from context_dense_defaults[j], even when it's
+// empty.  If context_dense_defaults[j] is not empty, its shape must match
+// context_dense_shapes[j].
+//	debug_name: A scalar containing the name of the serialized proto.
+// May contain, for example, table key (descriptive) name for the
+// corresponding serialized proto.  This is purely useful for debugging
+// purposes, and the presence of values here has no effect on the output.
+// May also be an empty scalar if no name is available.
+func ParseSingleSequenceExample(scope *Scope, serialized tf.Output, feature_list_dense_missing_assumed_empty tf.Output, context_sparse_keys []tf.Output, context_dense_keys []tf.Output, feature_list_sparse_keys []tf.Output, feature_list_dense_keys []tf.Output, context_dense_defaults []tf.Output, debug_name tf.Output, optional ...ParseSingleSequenceExampleAttr) (context_sparse_indices []tf.Output, context_sparse_values []tf.Output, context_sparse_shapes []tf.Output, context_dense_values []tf.Output, feature_list_sparse_indices []tf.Output, feature_list_sparse_values []tf.Output, feature_list_sparse_shapes []tf.Output, feature_list_dense_values []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "StopGradient",
+		Type: "ParseSingleSequenceExample",
 		Input: []tf.Input{
-			input,
+			serialized, feature_list_dense_missing_assumed_empty, tf.OutputList(context_sparse_keys), tf.OutputList(context_dense_keys), tf.OutputList(feature_list_sparse_keys), tf.OutputList(feature_list_dense_keys), tf.OutputList(context_dense_defaults), debug_name,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes asin of x element-wise.
-func Asin(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	opspec := tf.OpSpec{
-		Type: "Asin",
-		Input: []tf.Input{
-			x,
-		},
+	var idx int
+	var err error
+	if context_sparse_indices, idx, err = makeOutputList(op, idx, "context_sparse_indices"); err != nil {
+		scope.UpdateErr("ParseSingleSequenceExample", err)
+		return
+	}
+	if context_sparse_values, idx, err = makeOutputList(op, idx, "context_sparse_values"); err != nil {
+		scope.UpdateErr("ParseSingleSequenceExample", err)
+		return
+	}
+	if context_sparse_shapes, idx, err = makeOutputList(op, idx, "context_sparse_shapes"); err != nil {
+		scope.UpdateErr("ParseSingleSequenceExample", err)
+		return
+	}
+	if context_dense_values, idx, err = makeOutputList(op, idx, "context_dense_values"); err != nil {
+		scope.UpdateErr("ParseSingleSequenceExample", err)
+		return
+	}
+	if feature_list_sparse_indices, idx, err = makeOutputList(op, idx, "feature_list_sparse_indices"); err != nil {
+		scope.UpdateErr("ParseSingleSequenceExample", err)
+		return
+	}
+	if feature_list_sparse_values, idx, err = makeOutputList(op, idx, "feature_list_sparse_values"); err != nil {
+		scope.UpdateErr("ParseSingleSequenceExample", err)
+		return
+	}
+	if feature_list_sparse_shapes, idx, err = makeOutputList(op, idx, "feature_list_sparse_shapes"); err != nil {
+		scope.UpdateErr("ParseSingleSequenceExample", err)
+		return
+	}
+	if feature_list_dense_values, idx, err = makeOutputList(op, idx, "feature_list_dense_values"); err != nil {
+		scope.UpdateErr("ParseSingleSequenceExample", err)
+		return
+	}
+	return context_sparse_indices, context_sparse_values, context_sparse_shapes, context_dense_values, feature_list_sparse_indices, feature_list_sparse_values, feature_list_sparse_shapes, feature_list_dense_values
+}
+
+// DecodeWavAttr is an optional argument to DecodeWav.
+type DecodeWavAttr func(optionalAttr)
+
+// DecodeWavDesiredChannels sets the optional desired_channels attribute to value.
+//
+// value: Number of sample channels wanted.
+// If not specified, defaults to -1
+func DecodeWavDesiredChannels(value int64) DecodeWavAttr {
+	return func(m optionalAttr) {
+		m["desired_channels"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// PreventGradientAttr is an optional argument to PreventGradient.
-type PreventGradientAttr func(optionalAttr)
-
-// PreventGradientMessage sets the optional message attribute to value.
+// DecodeWavDesiredSamples sets the optional desired_samples attribute to value.
 //
-// value: Will be printed in the error when anyone tries to differentiate
-// this operation.
-// If not specified, defaults to ""
-func PreventGradientMessage(value string) PreventGradientAttr {
+// value: Length of audio requested.
+// If not specified, defaults to -1
+func DecodeWavDesiredSamples(value int64) DecodeWavAttr {
 	return func(m optionalAttr) {
-		m["message"] = value
+		m["desired_samples"] = value
 	}
 }
 
-// An identity op that triggers an error if a gradient is requested.
+// Decode a 16-bit PCM WAV file to a float tensor.
 //
-// When executed in a graph, this op outputs its input tensor as-is.
+// The -32768 to 32767 signed 16-bit values will be scaled to -1.0 to 1.0 in float.
 //
-// When building ops to compute gradients, the TensorFlow gradient system
-// will return an error when trying to lookup the gradient of this op,
-// because no gradient must ever be registered for this function.  This
-// op exists to prevent subtle bugs from silently returning unimplemented
-// gradients in some corner cases.
+// When desired_channels is set, if the input contains fewer channels than this
+// then the last channel will be duplicated to give the requested number, else if
+// the input has more channels than requested then the additional channels will be
+// ignored.
+//
+// If desired_samples is set, then the audio will be cropped or padded with zeroes
+// to the requested length.
+//
+// The first output contains a Tensor with the content of the audio samples. The
+// lowest dimension will be the number of channels, and the second will be the
+// number of samples. For example, a ten-sample-long stereo WAV file should give an
+// output shape of [10, 2].
 //
 // Arguments:
-//	input: any tensor.
+//	contents: The WAV-encoded audio, usually from a file.
 //
-// Returns the same input tensor.
-func PreventGradient(scope *Scope, input tf.Output, optional ...PreventGradientAttr) (output tf.Output) {
+// Returns 2-D with shape `[length, channels]`.Scalar holding the sample rate found in the WAV header.
+func DecodeWav(scope *Scope, contents tf.Output, optional ...DecodeWavAttr) (audio tf.Output, sample_rate tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -27373,33 +27265,51 @@ func PreventGradient(scope *Scope, input tf.Output, optional ...PreventGradientA
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "PreventGradient",
+		Type: "DecodeWav",
 		Input: []tf.Input{
-			input,
+			contents,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// Checks a tensor for NaN and Inf values.
+// Concatenates a list of `N` tensors along the first dimension.
 //
-// When run, reports an `InvalidArgument` error if `tensor` has any values
-// that are not a number (NaN) or infinity (Inf). Otherwise, passes `tensor` as-is.
+// The input tensors are all required to have size 1 in the first dimension.
+//
+// For example:
+//
+// ```
+// # 'x' is [[1, 4]]
+// # 'y' is [[2, 5]]
+// # 'z' is [[3, 6]]
+// parallel_concat([x, y, z]) => [[1, 4], [2, 5], [3, 6]]  # Pack along first dim.
+// ```
+//
+// The difference between concat and parallel_concat is that concat requires all
+// of the inputs be computed before the operation will begin but doesn't require
+// that the input shapes be known during graph construction.  Parallel concat
+// will copy pieces of the input into the output as they become available, in
+// some situations this can provide a performance benefit.
 //
 // Arguments:
+//	values: Tensors to be concatenated. All must have size 1 in the first dimension
+// and same shape.
+//	shape: the final shape of the result; should be equal to the shapes of any input
+// but with the number of input values in the first dimension.
 //
-//	message: Prefix of the error message.
-func CheckNumerics(scope *Scope, tensor tf.Output, message string) (output tf.Output) {
+// Returns The concatenated tensor.
+func ParallelConcat(scope *Scope, values []tf.Output, shape tf.Shape) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"message": message}
+	attrs := map[string]interface{}{"shape": shape}
 	opspec := tf.OpSpec{
-		Type: "CheckNumerics",
+		Type: "ParallelConcat",
 		Input: []tf.Input{
-			tensor,
+			tf.OutputList(values),
 		},
 		Attrs: attrs,
 	}
@@ -27407,251 +27317,259 @@ func CheckNumerics(scope *Scope, tensor tf.Output, message string) (output tf.Ou
 	return op.Output(0)
 }
 
-// Shuffle dimensions of x according to a permutation and conjugate the result.
+// Compute the lower regularized incomplete Gamma function `Q(a, x)`.
 //
-// The output `y` has the same rank as `x`. The shapes of `x` and `y` satisfy:
-//   `y.shape[i] == x.shape[perm[i]] for i in [0, 1, ..., rank(x) - 1]`
-//   `y[i,j,k,...,s,t,u] == conj(x[perm[i], perm[j], perm[k],...,perm[s], perm[t], perm[u]])`
-func ConjugateTranspose(scope *Scope, x tf.Output, perm tf.Output) (y tf.Output) {
+// The lower regularized incomplete Gamma function is defined as:
+//
+//
+// \\(P(a, x) = gamma(a, x) / Gamma(a) = 1 - Q(a, x)\\)
+//
+// where
+//
+// \\(gamma(a, x) = int_{0}^{x} t^{a-1} exp(-t) dt\\)
+//
+// is the lower incomplete Gamma function.
+//
+// Note, above `Q(a, x)` (`Igammac`) is the upper regularized complete
+// Gamma function.
+func Igamma(scope *Scope, a tf.Output, x tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ConjugateTranspose",
+		Type: "Igamma",
 		Input: []tf.Input{
-			x, perm,
+			a, x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// UniqueV2Attr is an optional argument to UniqueV2.
-type UniqueV2Attr func(optionalAttr)
-
-// UniqueV2OutIdx sets the optional out_idx attribute to value.
-// If not specified, defaults to DT_INT32
-func UniqueV2OutIdx(value tf.DataType) UniqueV2Attr {
-	return func(m optionalAttr) {
-		m["out_idx"] = value
-	}
-}
-
-// Finds unique elements in a 1-D tensor.
-//
-// This operation returns a tensor `y` containing all of the unique elements of `x`
-// sorted in the same order that they occur in `x`. This operation also returns a
-// tensor `idx` the same size as `x` that contains the index of each value of `x`
-// in the unique output `y`. In other words:
-//
-// `y[idx[i]] = x[i] for i in [0, 1,...,rank(x) - 1]`
+// Computes offsets of concat inputs within its output.
 //
 // For example:
 //
 // ```
-// # tensor 'x' is [1, 1, 2, 4, 4, 4, 7, 8, 8]
-// y, idx = unique(x)
-// y ==> [1, 2, 4, 7, 8]
-// idx ==> [0, 0, 1, 2, 2, 2, 3, 4, 4]
+// # 'x' is [2, 2, 7]
+// # 'y' is [2, 3, 7]
+// # 'z' is [2, 5, 7]
+// concat_offset(2, [x, y, z]) => [0, 0, 0], [0, 2, 0], [0, 5, 0]
 // ```
 //
+// This is typically used by gradient computations for a concat operation.
+//
 // Arguments:
-//	x: A `Tensor`.
-//	axis: A `Tensor` of type `int64` (default: 0). The axis of the Tensor to
-// find the unique elements.
+//	concat_dim: The dimension along which to concatenate.
+//	shape: The `N` int32 vectors representing shape of tensors being concatenated.
 //
-// Returns A `Tensor`. Unique elements along the `axis` of `Tensor` x.A 1-D Tensor. Has the same type as x that contains the index of each
-// value of x in the output y.
-func UniqueV2(scope *Scope, x tf.Output, axis tf.Output, optional ...UniqueV2Attr) (y tf.Output, idx tf.Output) {
+// Returns The `N` int32 vectors representing the starting offset
+// of input tensors within the concatenated output.
+func ConcatOffset(scope *Scope, concat_dim tf.Output, shape []tf.Output) (offset []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "UniqueV2",
+		Type: "ConcatOffset",
 		Input: []tf.Input{
-			x, axis,
+			concat_dim, tf.OutputList(shape),
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if offset, idx, err = makeOutputList(op, idx, "offset"); err != nil {
+		scope.UpdateErr("ConcatOffset", err)
+		return
+	}
+	return offset
 }
 
-// Return a slice from 'input'.
-//
-// The output tensor is a tensor with dimensions described by 'size'
-// whose values are extracted from 'input' starting at the offsets in
-// 'begin'.
-//
-// *Requirements*:
-//   0 <= begin[i] <= begin[i] + size[i] <= Di  for i in [0, n)
+// Splits a tensor into `num_split` tensors along one dimension.
 //
 // Arguments:
+//	axis: 0-D.  The dimension along which to split.  Must be in the range
+// `[-rank(value), rank(value))`.
+//	value: The tensor to split.
+//	num_split: The number of ways to split.  Must evenly divide
+// `value.shape[split_dim]`.
 //
-//	begin: begin[i] specifies the offset into the 'i'th dimension of
-// 'input' to slice from.
-//	size: size[i] specifies the number of elements of the 'i'th dimension
-// of 'input' to slice. If size[i] is -1, all remaining elements in dimension
-// i are included in the slice (i.e. this is equivalent to setting
-// size[i] = input.dim_size(i) - begin[i]).
-func Slice(scope *Scope, input tf.Output, begin tf.Output, size tf.Output) (output tf.Output) {
+// Returns They are identically shaped tensors, whose shape matches that of `value`
+// except along `axis`, where their sizes are
+// `values.shape[split_dim] / num_split`.
+func Split(scope *Scope, axis tf.Output, value tf.Output, num_split int64) (output []tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"num_split": num_split}
 	opspec := tf.OpSpec{
-		Type: "Slice",
+		Type: "Split",
 		Input: []tf.Input{
-			input, begin, size,
+			axis, value,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// StridedSliceGradAttr is an optional argument to StridedSliceGrad.
-type StridedSliceGradAttr func(optionalAttr)
-
-// StridedSliceGradBeginMask sets the optional begin_mask attribute to value.
-// If not specified, defaults to 0
-func StridedSliceGradBeginMask(value int64) StridedSliceGradAttr {
-	return func(m optionalAttr) {
-		m["begin_mask"] = value
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// StridedSliceGradEndMask sets the optional end_mask attribute to value.
-// If not specified, defaults to 0
-func StridedSliceGradEndMask(value int64) StridedSliceGradAttr {
-	return func(m optionalAttr) {
-		m["end_mask"] = value
+	var idx int
+	var err error
+	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
+		scope.UpdateErr("Split", err)
+		return
 	}
+	return output
 }
 
-// StridedSliceGradEllipsisMask sets the optional ellipsis_mask attribute to value.
-// If not specified, defaults to 0
-func StridedSliceGradEllipsisMask(value int64) StridedSliceGradAttr {
-	return func(m optionalAttr) {
-		m["ellipsis_mask"] = value
+// Splits a tensor into `num_split` tensors along one dimension.
+//
+// Arguments:
+//	value: The tensor to split.
+//	size_splits: list containing the sizes of each output tensor along the split
+// dimension. Must sum to the dimension of value along split_dim.
+// Can contain one -1 indicating that dimension is to be inferred.
+//	axis: 0-D.  The dimension along which to split.  Must be in the range
+// `[-rank(value), rank(value))`.
+//
+//
+// Returns Tensors whose shape matches that of `value`
+// except along `axis`, where their sizes are
+// `size_splits[i]`.
+func SplitV(scope *Scope, value tf.Output, size_splits tf.Output, axis tf.Output, num_split int64) (output []tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"num_split": num_split}
+	opspec := tf.OpSpec{
+		Type: "SplitV",
+		Input: []tf.Input{
+			value, size_splits, axis,
+		},
+		Attrs: attrs,
 	}
-}
-
-// StridedSliceGradNewAxisMask sets the optional new_axis_mask attribute to value.
-// If not specified, defaults to 0
-func StridedSliceGradNewAxisMask(value int64) StridedSliceGradAttr {
-	return func(m optionalAttr) {
-		m["new_axis_mask"] = value
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// StridedSliceGradShrinkAxisMask sets the optional shrink_axis_mask attribute to value.
-// If not specified, defaults to 0
-func StridedSliceGradShrinkAxisMask(value int64) StridedSliceGradAttr {
-	return func(m optionalAttr) {
-		m["shrink_axis_mask"] = value
+	var idx int
+	var err error
+	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
+		scope.UpdateErr("SplitV", err)
+		return
 	}
+	return output
 }
 
-// Returns the gradient of `StridedSlice`.
+// Gives a guarantee to the TF runtime that the input tensor is a constant.
 //
-// Since `StridedSlice` cuts out pieces of its `input` which is size
-// `shape`, its gradient will have the same shape (which is passed here
-// as `shape`). The gradient will be zero in any element that the slice
-// does not select.
+// The runtime is then free to make optimizations based on this.
 //
-// Arguments are the same as StridedSliceGrad with the exception that
-// `dy` is the input gradient to be propagated and `shape` is the
-// shape of `StridedSlice`'s `input`.
-func StridedSliceGrad(scope *Scope, shape tf.Output, begin tf.Output, end tf.Output, strides tf.Output, dy tf.Output, optional ...StridedSliceGradAttr) (output tf.Output) {
+// Only accepts value typed tensors as inputs and rejects resource variable handles
+// as input.
+//
+// Returns the input tensor without modification.
+func GuaranteeConst(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "StridedSliceGrad",
+		Type: "GuaranteeConst",
 		Input: []tf.Input{
-			shape, begin, end, strides, dy,
+			input,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns the gradient of `Tile`.
+// Returns a tensor of zeros with the same shape and type as x.
 //
-// DEPRECATED at GraphDef version 3: TileGrad has been replaced with reduce_sum
+// Arguments:
+//	x: a tensor of type T.
 //
-// Since `Tile` takes an input and repeats the input `multiples` times
-// along each dimension, `TileGrad` takes in `multiples` and aggregates
-// each repeated tile of `input` into `output`.
-func TileGrad(scope *Scope, input tf.Output, multiples tf.Output) (output tf.Output) {
+// Returns a tensor of the same shape and type as x but filled with zeros.
+func ZerosLike(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "TileGrad",
+		Type: "ZerosLike",
 		Input: []tf.Input{
-			input, multiples,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// QuantizeAndDequantizeAttr is an optional argument to QuantizeAndDequantize.
-type QuantizeAndDequantizeAttr func(optionalAttr)
+// QuantizedInstanceNormAttr is an optional argument to QuantizedInstanceNorm.
+type QuantizedInstanceNormAttr func(optionalAttr)
 
-// QuantizeAndDequantizeSignedInput sets the optional signed_input attribute to value.
-// If not specified, defaults to true
-func QuantizeAndDequantizeSignedInput(value bool) QuantizeAndDequantizeAttr {
+// QuantizedInstanceNormOutputRangeGiven sets the optional output_range_given attribute to value.
+//
+// value: If True, `given_y_min` and `given_y_min`
+// and `given_y_max` are used as the output range. Otherwise,
+// the implementation computes the output range.
+// If not specified, defaults to false
+func QuantizedInstanceNormOutputRangeGiven(value bool) QuantizedInstanceNormAttr {
 	return func(m optionalAttr) {
-		m["signed_input"] = value
+		m["output_range_given"] = value
 	}
 }
 
-// QuantizeAndDequantizeNumBits sets the optional num_bits attribute to value.
-// If not specified, defaults to 8
-func QuantizeAndDequantizeNumBits(value int64) QuantizeAndDequantizeAttr {
+// QuantizedInstanceNormGivenYMin sets the optional given_y_min attribute to value.
+//
+// value: Output in `y_min` if `output_range_given` is True.
+// If not specified, defaults to 0
+func QuantizedInstanceNormGivenYMin(value float32) QuantizedInstanceNormAttr {
 	return func(m optionalAttr) {
-		m["num_bits"] = value
+		m["given_y_min"] = value
 	}
 }
 
-// QuantizeAndDequantizeRangeGiven sets the optional range_given attribute to value.
-// If not specified, defaults to false
-func QuantizeAndDequantizeRangeGiven(value bool) QuantizeAndDequantizeAttr {
+// QuantizedInstanceNormGivenYMax sets the optional given_y_max attribute to value.
+//
+// value: Output in `y_max` if `output_range_given` is True.
+// If not specified, defaults to 0
+func QuantizedInstanceNormGivenYMax(value float32) QuantizedInstanceNormAttr {
 	return func(m optionalAttr) {
-		m["range_given"] = value
+		m["given_y_max"] = value
 	}
 }
 
-// QuantizeAndDequantizeInputMin sets the optional input_min attribute to value.
-// If not specified, defaults to 0
-func QuantizeAndDequantizeInputMin(value float32) QuantizeAndDequantizeAttr {
+// QuantizedInstanceNormVarianceEpsilon sets the optional variance_epsilon attribute to value.
+//
+// value: A small float number to avoid dividing by 0.
+// If not specified, defaults to 1e-05
+func QuantizedInstanceNormVarianceEpsilon(value float32) QuantizedInstanceNormAttr {
 	return func(m optionalAttr) {
-		m["input_min"] = value
+		m["variance_epsilon"] = value
 	}
 }
 
-// QuantizeAndDequantizeInputMax sets the optional input_max attribute to value.
-// If not specified, defaults to 0
-func QuantizeAndDequantizeInputMax(value float32) QuantizeAndDequantizeAttr {
+// QuantizedInstanceNormMinSeparation sets the optional min_separation attribute to value.
+//
+// value: Minimum value of `y_max - y_min`
+// If not specified, defaults to 0.001
+func QuantizedInstanceNormMinSeparation(value float32) QuantizedInstanceNormAttr {
 	return func(m optionalAttr) {
-		m["input_max"] = value
+		m["min_separation"] = value
 	}
 }
 
-// Use QuantizeAndDequantizeV2 instead.
+// Quantized Instance normalization.
 //
-// DEPRECATED at GraphDef version 22: Replaced by QuantizeAndDequantizeV2
-func QuantizeAndDequantize(scope *Scope, input tf.Output, optional ...QuantizeAndDequantizeAttr) (output tf.Output) {
+// Arguments:
+//	x: A 4D input Tensor.
+//	x_min: The value represented by the lowest quantized input.
+//	x_max: The value represented by the highest quantized input.
+//
+// Returns A 4D Tensor.The value represented by the lowest quantized output.The value represented by the highest quantized output.
+func QuantizedInstanceNorm(scope *Scope, x tf.Output, x_min tf.Output, x_max tf.Output, optional ...QuantizedInstanceNormAttr) (y tf.Output, y_min tf.Output, y_max tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -27660,181 +27578,185 @@ func QuantizeAndDequantize(scope *Scope, input tf.Output, optional ...QuantizeAn
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "QuantizeAndDequantize",
+		Type: "QuantizedInstanceNorm",
 		Input: []tf.Input{
-			input,
+			x, x_min, x_max,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// QueueDequeueV2Attr is an optional argument to QueueDequeueV2.
-type QueueDequeueV2Attr func(optionalAttr)
-
-// QueueDequeueV2TimeoutMs sets the optional timeout_ms attribute to value.
+// Returns the diagonal part of the tensor.
 //
-// value: If the queue is empty, this operation will block for up to
-// timeout_ms milliseconds.
-// Note: This option is not supported yet.
-// If not specified, defaults to -1
-func QueueDequeueV2TimeoutMs(value int64) QueueDequeueV2Attr {
-	return func(m optionalAttr) {
-		m["timeout_ms"] = value
-	}
-}
-
-// Dequeues a tuple of one or more tensors from the given queue.
+// This operation returns a tensor with the `diagonal` part
+// of the `input`. The `diagonal` part is computed as follows:
 //
-// This operation has k outputs, where k is the number of components
-// in the tuples stored in the given queue, and output i is the ith
-// component of the dequeued tuple.
+// Assume `input` has dimensions `[D1,..., Dk, D1,..., Dk]`, then the output is a
+// tensor of rank `k` with dimensions `[D1,..., Dk]` where:
 //
-// N.B. If the queue is empty, this operation will block until an element
-// has been dequeued (or 'timeout_ms' elapses, if specified).
+// `diagonal[i1,..., ik] = input[i1, ..., ik, i1,..., ik]`.
+//
+// For example:
+//
+// ```
+// # 'input' is [[1, 0, 0, 0]
+//               [0, 2, 0, 0]
+//               [0, 0, 3, 0]
+//               [0, 0, 0, 4]]
+//
+// tf.diag_part(input) ==> [1, 2, 3, 4]
+// ```
 //
 // Arguments:
-//	handle: The handle to a queue.
-//	component_types: The type of each component in a tuple.
+//	input: Rank k tensor where k is even and not zero.
 //
-// Returns One or more tensors that were dequeued as a tuple.
-func QueueDequeueV2(scope *Scope, handle tf.Output, component_types []tf.DataType, optional ...QueueDequeueV2Attr) (components []tf.Output) {
+// Returns The extracted diagonal.
+func DiagPart(scope *Scope, input tf.Output) (diagonal tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"component_types": component_types}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "QueueDequeueV2",
+		Type: "DiagPart",
 		Input: []tf.Input{
-			handle,
+			input,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Returns the element-wise max of two SparseTensors.
+//
+// Assumes the two SparseTensors have the same shape, i.e., no broadcasting.
+//
+// Arguments:
+//	a_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, in the canonical lexicographic ordering.
+//	a_values: 1-D.  `N` non-empty values corresponding to `a_indices`.
+//	a_shape: 1-D.  Shape of the input SparseTensor.
+//	b_indices: counterpart to `a_indices` for the other operand.
+//	b_values: counterpart to `a_values` for the other operand; must be of the same dtype.
+//	b_shape: counterpart to `a_shape` for the other operand; the two shapes must be equal.
+//
+// Returns 2-D.  The indices of the output SparseTensor.1-D.  The values of the output SparseTensor.
+func SparseSparseMaximum(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b_indices tf.Output, b_values tf.Output, b_shape tf.Output) (output_indices tf.Output, output_values tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	var idx int
-	var err error
-	if components, idx, err = makeOutputList(op, idx, "components"); err != nil {
-		scope.UpdateErr("QueueDequeueV2", err)
-		return
+	opspec := tf.OpSpec{
+		Type: "SparseSparseMaximum",
+		Input: []tf.Input{
+			a_indices, a_values, a_shape, b_indices, b_values, b_shape,
+		},
 	}
-	return components
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
 }
 
-// Returns locations of nonzero / true values in a tensor.
-//
-// This operation returns the coordinates of true elements in `condition`. The
-// coordinates are returned in a 2-D tensor where the first dimension (rows)
-// represents the number of true elements, and the second dimension (columns)
-// represents the coordinates of the true elements. Keep in mind, the shape of
-// the output tensor can vary depending on how many true values there are in
-// `condition`. Indices are output in row-major order.
+// Returns a batched matrix tensor with new batched diagonal values.
 //
-// For example:
+// Given `input` and `diagonal`, this operation returns a tensor with the
+// same shape and values as `input`, except for the main diagonal of the
+// innermost matrices.  These will be overwritten by the values in `diagonal`.
 //
-// ```
-// # 'input' tensor is [[True, False]
-// #                    [True, False]]
-// # 'input' has two true values, so output has two coordinates.
-// # 'input' has rank of 2, so coordinates have two indices.
-// where(input) ==> [[0, 0],
-//                   [1, 0]]
+// The output is computed as follows:
 //
-// # `condition` tensor is [[[True, False]
-// #                     [True, False]]
-// #                    [[False, True]
-// #                     [False, True]]
-// #                    [[False, False]
-// #                     [False, True]]]
-// # 'input' has 5 true values, so output has 5 coordinates.
-// # 'input' has rank of 3, so coordinates have three indices.
-// where(input) ==> [[0, 0, 0],
-//                   [0, 1, 0],
-//                   [1, 0, 1],
-//                   [1, 1, 1],
-//                   [2, 1, 1]]
+// Assume `input` has `k+1` dimensions `[I, J, K, ..., M, N]` and `diagonal` has
+// `k` dimensions `[I, J, K, ..., min(M, N)]`.  Then the output is a
+// tensor of rank `k+1` with dimensions `[I, J, K, ..., M, N]` where:
 //
-// # `condition` tensor is [[[1.5,  0.0]
-// #                     [-0.5, 0.0]]
-// #                    [[0.0,  0.25]
-// #                     [0.0,  0.75]]
-// #                    [[0.0,  0.0]
-// #                     [0.0,  0.01]]]
-// # 'input' has 5 nonzero values, so output has 5 coordinates.
-// # 'input' has rank of 3, so coordinates have three indices.
-// where(input) ==> [[0, 0, 0],
-//                   [0, 1, 0],
-//                   [1, 0, 1],
-//                   [1, 1, 1],
-//                   [2, 1, 1]]
+//   * `output[i, j, k, ..., m, n] = diagonal[i, j, k, ..., n]` for `m == n`.
+//   * `output[i, j, k, ..., m, n] = input[i, j, k, ..., m, n]` for `m != n`.
 //
-// # `condition` tensor is [[[1.5 + 0.0j, 0.0  + 0.0j]
-// #                     [0.0 + 0.5j, 0.0  + 0.0j]]
-// #                    [[0.0 + 0.0j, 0.25 + 1.5j]
-// #                     [0.0 + 0.0j, 0.75 + 0.0j]]
-// #                    [[0.0 + 0.0j, 0.0  + 0.0j]
-// #                     [0.0 + 0.0j, 0.01 + 0.0j]]]
-// # 'input' has 5 nonzero magnitude values, so output has 5 coordinates.
-// # 'input' has rank of 3, so coordinates have three indices.
-// where(input) ==> [[0, 0, 0],
-//                   [0, 1, 0],
-//                   [1, 0, 1],
-//                   [1, 1, 1],
-//                   [2, 1, 1]]
-// ```
-func Where(scope *Scope, condition tf.Output) (index tf.Output) {
+// Arguments:
+//	input: Rank `k+1`, where `k >= 1`.
+//	diagonal: Rank `k`, where `k >= 1`.
+//
+// Returns Rank `k+1`, with `output.shape = input.shape`.
+func MatrixSetDiag(scope *Scope, input tf.Output, diagonal tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "Where",
+		Type: "MatrixSetDiag",
 		Input: []tf.Input{
-			condition,
+			input, diagonal,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// DataFormatDimMapAttr is an optional argument to DataFormatDimMap.
-type DataFormatDimMapAttr func(optionalAttr)
+// EditDistanceAttr is an optional argument to EditDistance.
+type EditDistanceAttr func(optionalAttr)
 
-// DataFormatDimMapSrcFormat sets the optional src_format attribute to value.
+// EditDistanceNormalize sets the optional normalize attribute to value.
 //
-// value: source data format.
-// If not specified, defaults to "NHWC"
-func DataFormatDimMapSrcFormat(value string) DataFormatDimMapAttr {
-	return func(m optionalAttr) {
-		m["src_format"] = value
-	}
-}
-
-// DataFormatDimMapDstFormat sets the optional dst_format attribute to value.
+// value: boolean (if true, edit distances are normalized by length of truth).
 //
-// value: destination data format.
-// If not specified, defaults to "NCHW"
-func DataFormatDimMapDstFormat(value string) DataFormatDimMapAttr {
+// The output is:
+// If not specified, defaults to true
+func EditDistanceNormalize(value bool) EditDistanceAttr {
 	return func(m optionalAttr) {
-		m["dst_format"] = value
+		m["normalize"] = value
 	}
 }
 
-// Returns the dimension index in the destination data format given the one in
+// Computes the (possibly normalized) Levenshtein Edit Distance.
 //
-// the source data format.
+// The inputs are variable-length sequences provided by SparseTensors
+//   (hypothesis_indices, hypothesis_values, hypothesis_shape)
+// and
+//   (truth_indices, truth_values, truth_shape).
+//
+// The inputs are:
 //
 // Arguments:
-//	x: A Tensor with each element as a dimension index in source data format.
-// Must be in the range [-4, 4).
+//	hypothesis_indices: The indices of the hypothesis list SparseTensor.
+// This is an N x R int64 matrix.
+//	hypothesis_values: The values of the hypothesis list SparseTensor.
+// This is an N-length vector.
+//	hypothesis_shape: The shape of the hypothesis list SparseTensor.
+// This is an R-length vector.
+//	truth_indices: The indices of the truth list SparseTensor.
+// This is an M x R int64 matrix.
+//	truth_values: The values of the truth list SparseTensor.
+// This is an M-length vector.
+//	truth_shape: truth indices, vector.
 //
-// Returns A Tensor with each element as a dimension index in destination data format.
-func DataFormatDimMap(scope *Scope, x tf.Output, optional ...DataFormatDimMapAttr) (y tf.Output) {
+// Returns A dense float tensor with rank R - 1.
+//
+// For the example input:
+//
+//     // hypothesis represents a 2x1 matrix with variable-length values:
+//     //   (0,0) = ["a"]
+//     //   (1,0) = ["b"]
+//     hypothesis_indices = [[0, 0, 0],
+//                           [1, 0, 0]]
+//     hypothesis_values = ["a", "b"]
+//     hypothesis_shape = [2, 1, 1]
+//
+//     // truth represents a 2x2 matrix with variable-length values:
+//     //   (0,0) = []
+//     //   (0,1) = ["a"]
+//     //   (1,0) = ["b", "c"]
+//     //   (1,1) = ["a"]
+//     truth_indices = [[0, 1, 0],
+//                      [1, 0, 0],
+//                      [1, 0, 1],
+//                      [1, 1, 0]]
+//     truth_values = ["a", "b", "c", "a"]
+//     truth_shape = [2, 2, 2]
+//     normalize = true
+//
+// The output will be:
+//
+//     // output is a 2x2 matrix with edit distances normalized by truth lengths.
+//     output = [[inf, 1.0],  // (0,0): no truth, (0,1): no hypothesis
+//               [0.5, 1.0]]  // (1,0): addition, (1,1): no hypothesis
+func EditDistance(scope *Scope, hypothesis_indices tf.Output, hypothesis_values tf.Output, hypothesis_shape tf.Output, truth_indices tf.Output, truth_values tf.Output, truth_shape tf.Output, optional ...EditDistanceAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -27843,9 +27765,9 @@ func DataFormatDimMap(scope *Scope, x tf.Output, optional ...DataFormatDimMapAtt
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "DataFormatDimMap",
+		Type: "EditDistance",
 		Input: []tf.Input{
-			x,
+			hypothesis_indices, hypothesis_values, hypothesis_shape, truth_indices, truth_values, truth_shape,
 		},
 		Attrs: attrs,
 	}
@@ -27853,37 +27775,123 @@ func DataFormatDimMap(scope *Scope, x tf.Output, optional ...DataFormatDimMapAtt
 	return op.Output(0)
 }
 
-// Return the shape of s0 op s1 with broadcast.
+// Gather slices from `params` into a Tensor with shape specified by `indices`.
 //
-// Given `s0` and `s1`, tensors that represent shapes, compute `r0`, the
-// broadcasted shape. `s0`, `s1` and `r0` are all integer vectors.
-func BroadcastArgs(scope *Scope, s0 tf.Output, s1 tf.Output) (r0 tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "BroadcastArgs",
-		Input: []tf.Input{
-			s0, s1,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Return the reduction indices for computing gradients of s0 op s1 with broadcast.
+// `indices` is an K-dimensional integer tensor, best thought of as a
+// (K-1)-dimensional tensor of indices into `params`, where each element defines a
+// slice of `params`:
 //
-// This is typically used by gradient computations for a broadcasting operation.
-func BroadcastGradientArgs(scope *Scope, s0 tf.Output, s1 tf.Output) (r0 tf.Output, r1 tf.Output) {
+//     output[i_0, ..., i_{K-2}] = params[indices[i0, ..., i_{K-2}]]
+//
+// Whereas in @{tf.gather} `indices` defines slices into the first
+// dimension of `params`, in `tf.gather_nd`, `indices` defines slices into the
+// first `N` dimensions of `params`, where `N = indices.shape[-1]`.
+//
+// The last dimension of `indices` can be at most the rank of
+// `params`:
+//
+//     indices.shape[-1] <= params.rank
+//
+// The last dimension of `indices` corresponds to elements
+// (if `indices.shape[-1] == params.rank`) or slices
+// (if `indices.shape[-1] < params.rank`) along dimension `indices.shape[-1]`
+// of `params`.  The output tensor has shape
+//
+//     indices.shape[:-1] + params.shape[indices.shape[-1]:]
+//
+// Some examples below.
+//
+// Simple indexing into a matrix:
+//
+// ```python
+//     indices = [[0, 0], [1, 1]]
+//     params = [['a', 'b'], ['c', 'd']]
+//     output = ['a', 'd']
+// ```
+//
+// Slice indexing into a matrix:
+//
+// ```python
+//     indices = [[1], [0]]
+//     params = [['a', 'b'], ['c', 'd']]
+//     output = [['c', 'd'], ['a', 'b']]
+// ```
+//
+// Indexing into a 3-tensor:
+//
+// ```python
+//     indices = [[1]]
+//     params = [[['a0', 'b0'], ['c0', 'd0']],
+//               [['a1', 'b1'], ['c1', 'd1']]]
+//     output = [[['a1', 'b1'], ['c1', 'd1']]]
+//
+//
+//     indices = [[0, 1], [1, 0]]
+//     params = [[['a0', 'b0'], ['c0', 'd0']],
+//               [['a1', 'b1'], ['c1', 'd1']]]
+//     output = [['c0', 'd0'], ['a1', 'b1']]
+//
+//
+//     indices = [[0, 0, 1], [1, 0, 1]]
+//     params = [[['a0', 'b0'], ['c0', 'd0']],
+//               [['a1', 'b1'], ['c1', 'd1']]]
+//     output = ['b0', 'b1']
+// ```
+//
+// Batched indexing into a matrix:
+//
+// ```python
+//     indices = [[[0, 0]], [[0, 1]]]
+//     params = [['a', 'b'], ['c', 'd']]
+//     output = [['a'], ['b']]
+// ```
+//
+// Batched slice indexing into a matrix:
+//
+// ```python
+//     indices = [[[1]], [[0]]]
+//     params = [['a', 'b'], ['c', 'd']]
+//     output = [[['c', 'd']], [['a', 'b']]]
+// ```
+//
+// Batched indexing into a 3-tensor:
+//
+// ```python
+//     indices = [[[1]], [[0]]]
+//     params = [[['a0', 'b0'], ['c0', 'd0']],
+//               [['a1', 'b1'], ['c1', 'd1']]]
+//     output = [[[['a1', 'b1'], ['c1', 'd1']]],
+//               [[['a0', 'b0'], ['c0', 'd0']]]]
+//
+//     indices = [[[0, 1], [1, 0]], [[0, 0], [1, 1]]]
+//     params = [[['a0', 'b0'], ['c0', 'd0']],
+//               [['a1', 'b1'], ['c1', 'd1']]]
+//     output = [[['c0', 'd0'], ['a1', 'b1']],
+//               [['a0', 'b0'], ['c1', 'd1']]]
+//
+//
+//     indices = [[[0, 0, 1], [1, 0, 1]], [[0, 1, 1], [1, 1, 0]]]
+//     params = [[['a0', 'b0'], ['c0', 'd0']],
+//               [['a1', 'b1'], ['c1', 'd1']]]
+//     output = [['b0', 'b1'], ['d0', 'c1']]
+// ```
+//
+// Arguments:
+//	params: The tensor from which to gather values.
+//	indices: Index tensor.
+//
+// Returns Values from `params` gathered from indices given by `indices`, with
+// shape `indices.shape[:-1] + params.shape[indices.shape[-1]:]`.
+func GatherNd(scope *Scope, params tf.Output, indices tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "BroadcastGradientArgs",
+		Type: "GatherNd",
 		Input: []tf.Input{
-			s0, s1,
+			params, indices,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
+	return op.Output(0)
 }
diff --git a/tensorflow/java/BUILD b/tensorflow/java/BUILD
index 7296205e2403f68587991e1d4c9ce57899eece92..ab7d698a45b7fc0cd498f8367fc1cecf07e4ba3c 100644
--- a/tensorflow/java/BUILD
+++ b/tensorflow/java/BUILD
@@ -312,7 +312,7 @@ tf_cc_test(
         "src/gen/cc/source_writer_test.cc",
     ],
     data = [
-        "src/gen/resources/test.snippet.java",
+        "src/gen/resources/test.java.snippet",
     ],
     deps = [
         ":java_op_gen_lib",
@@ -387,15 +387,3 @@ genrule(
     cmd = "cp $< $@",
     output_to_bindir = 1,
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/java/maven/libtensorflow/pom.xml b/tensorflow/java/maven/libtensorflow/pom.xml
index 0b69a8cbe530a13dc35aad3a5c859f77f0deca2a..c99d04869a714c95e78db6f14caab515a175cb38 100644
--- a/tensorflow/java/maven/libtensorflow/pom.xml
+++ b/tensorflow/java/maven/libtensorflow/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.tensorflow</groupId>
     <artifactId>parentpom</artifactId>
-    <version>1.7.0-rc1</version>
+    <version>1.7.0</version>
     <relativePath>../</relativePath>
   </parent>
   <artifactId>libtensorflow</artifactId>
diff --git a/tensorflow/java/maven/libtensorflow_jni/pom.xml b/tensorflow/java/maven/libtensorflow_jni/pom.xml
index 541876f7f5e4fadcbc9336f15b319389dcddbf51..4561c2c8ade2826f779ff20c2ae1702fc97fa797 100644
--- a/tensorflow/java/maven/libtensorflow_jni/pom.xml
+++ b/tensorflow/java/maven/libtensorflow_jni/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.tensorflow</groupId>
     <artifactId>parentpom</artifactId>
-    <version>1.7.0-rc1</version>
+    <version>1.7.0</version>
     <relativePath>../</relativePath>
   </parent>
   <artifactId>libtensorflow_jni</artifactId>
diff --git a/tensorflow/java/maven/libtensorflow_jni_gpu/pom.xml b/tensorflow/java/maven/libtensorflow_jni_gpu/pom.xml
index d8933e5238149337b08e70b3f407385887aef0a0..82a2b8e7694d15b7de921c1005ece30236d755ee 100644
--- a/tensorflow/java/maven/libtensorflow_jni_gpu/pom.xml
+++ b/tensorflow/java/maven/libtensorflow_jni_gpu/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.tensorflow</groupId>
     <artifactId>parentpom</artifactId>
-    <version>1.7.0-rc1</version>
+    <version>1.7.0</version>
     <relativePath>../</relativePath>
   </parent>
   <artifactId>libtensorflow_jni_gpu</artifactId>
diff --git a/tensorflow/java/maven/pom.xml b/tensorflow/java/maven/pom.xml
index 6286fd73df6dec5643fceda8f6f652220d75e1a7..4c1ec0cc8032009e7b206537dd15f1fedece4855 100644
--- a/tensorflow/java/maven/pom.xml
+++ b/tensorflow/java/maven/pom.xml
@@ -6,7 +6,7 @@
   <modelVersion>4.0.0</modelVersion>
   <groupId>org.tensorflow</groupId>
   <artifactId>parentpom</artifactId>
-  <version>1.7.0-rc1</version>
+  <version>1.7.0</version>
   <packaging>pom</packaging>
 
   <url>https://www.tensorflow.org</url>
diff --git a/tensorflow/java/maven/proto/pom.xml b/tensorflow/java/maven/proto/pom.xml
index 4e881f5a631f0b2e389b31a9b24028902eac6301..fcd8236bad315319cb1b8b57cf6ec5eb5f277705 100644
--- a/tensorflow/java/maven/proto/pom.xml
+++ b/tensorflow/java/maven/proto/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.tensorflow</groupId>
     <artifactId>parentpom</artifactId>
-    <version>1.7.0-rc1</version>
+    <version>1.7.0</version>
     <relativePath>../</relativePath>
   </parent>
   <artifactId>proto</artifactId>
diff --git a/tensorflow/java/maven/tensorflow/pom.xml b/tensorflow/java/maven/tensorflow/pom.xml
index d512a7eda9638d428e02beda442ba4d4db9adf62..241581713ad9b129d8df4a297e9e4a3e712117fc 100644
--- a/tensorflow/java/maven/tensorflow/pom.xml
+++ b/tensorflow/java/maven/tensorflow/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.tensorflow</groupId>
     <artifactId>parentpom</artifactId>
-    <version>1.7.0-rc1</version>
+    <version>1.7.0</version>
     <relativePath>../</relativePath>
   </parent>
   <artifactId>tensorflow</artifactId>
diff --git a/tensorflow/java/src/gen/cc/source_writer.cc b/tensorflow/java/src/gen/cc/source_writer.cc
index 214999af9a6f9ee244d336a64830238e6b7ea872..a02f75ad6e7f5f1a9f22ad976e488ae5bf02a731 100644
--- a/tensorflow/java/src/gen/cc/source_writer.cc
+++ b/tensorflow/java/src/gen/cc/source_writer.cc
@@ -23,10 +23,20 @@ namespace tensorflow {
 namespace java {
 
 SourceWriter::SourceWriter() {
-  // push an empty generic namespace at start, for simplification
+  // Push an empty generic namespace at start, for simplification.
   generic_namespaces_.push(new GenericNamespace());
 }
 
+SourceWriter::~SourceWriter() {
+  // Remove empty generic namespace added at start as well as any other
+  // namespace objects that haven't been removed.
+  while (!generic_namespaces_.empty()) {
+    GenericNamespace* generic_namespace = generic_namespaces_.top();
+    generic_namespaces_.pop();
+    delete generic_namespace;
+  }
+}
+
 SourceWriter& SourceWriter::Indent(int tab) {
   left_margin_.resize(
       std::max(static_cast<int>(left_margin_.size() + tab), 0), ' ');
diff --git a/tensorflow/java/src/gen/cc/source_writer.h b/tensorflow/java/src/gen/cc/source_writer.h
index 6abe13b5d217b30d826d013e14a590eeb91719fb..637072c0df1c8dd0c21888f3ec95a259074c3182 100644
--- a/tensorflow/java/src/gen/cc/source_writer.h
+++ b/tensorflow/java/src/gen/cc/source_writer.h
@@ -32,7 +32,8 @@ namespace java {
 class SourceWriter {
  public:
   SourceWriter();
-  virtual ~SourceWriter() = default;
+
+  virtual ~SourceWriter();
 
   // Indents following lines with white spaces.
   //
diff --git a/tensorflow/java/src/gen/cc/source_writer_test.cc b/tensorflow/java/src/gen/cc/source_writer_test.cc
index 6926a5a411d070e25f2382c72589d879d3ca2180..4bce2fea7040a0e5cb9256dc2672399c3af8a03d 100644
--- a/tensorflow/java/src/gen/cc/source_writer_test.cc
+++ b/tensorflow/java/src/gen/cc/source_writer_test.cc
@@ -259,7 +259,9 @@ TEST(StreamTest, Types) {
 
 TEST(StreamTest, FileSnippet) {
   SourceBufferWriter writer;
-  const string& fname = "tensorflow/java/src/gen/resources/test.snippet.java";
+  const string fname = tensorflow::io::JoinPath(
+      tensorflow::testing::TensorFlowSrcRoot(),
+      "java/src/gen/resources/test.java.snippet");
 
   writer.WriteFromFile(fname)
         .BeginBlock()
diff --git a/tensorflow/java/src/gen/resources/test.snippet.java b/tensorflow/java/src/gen/resources/test.java.snippet
similarity index 100%
rename from tensorflow/java/src/gen/resources/test.snippet.java
rename to tensorflow/java/src/gen/resources/test.java.snippet
diff --git a/tensorflow/java/src/main/java/org/tensorflow/examples/LabelImage.java b/tensorflow/java/src/main/java/org/tensorflow/examples/LabelImage.java
index 489e95c3102557d7a75d83789c46106aa5aa3ed4..3948991c84d35009217f7c05844551fdcc49fb22 100644
--- a/tensorflow/java/src/main/java/org/tensorflow/examples/LabelImage.java
+++ b/tensorflow/java/src/main/java/org/tensorflow/examples/LabelImage.java
@@ -101,6 +101,7 @@ public class LabelImage {
                   b.constant("mean", mean)),
               b.constant("scale", scale));
       try (Session s = new Session(g)) {
+        // Generally, there may be multiple output tensors, all of them must be closed to prevent resource leaks.
         return s.runner().fetch(output.op().name()).run().get(0).expect(Float.class);
       }
     }
@@ -110,6 +111,7 @@ public class LabelImage {
     try (Graph g = new Graph()) {
       g.importGraphDef(graphDef);
       try (Session s = new Session(g);
+          // Generally, there may be multiple output tensors, all of them must be closed to prevent resource leaks.
           Tensor<Float> result =
               s.runner().feed("input", image).fetch("output").run().get(0).expect(Float.class)) {
         final long[] rshape = result.shape();
diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD
index ae7e3e73aed1e43bd78e9f1d4b02bb02c854580d..2e174255247f374abd0ca85a225282687f060a34 100644
--- a/tensorflow/python/BUILD
+++ b/tensorflow/python/BUILD
@@ -28,6 +28,8 @@ load("//tensorflow:tensorflow.bzl", "py_tests")
 load("//tensorflow:tensorflow.bzl", "tf_py_build_info_genrule")
 load("//tensorflow:tensorflow.bzl", "tf_py_wrap_cc")
 load("//tensorflow:tensorflow.bzl", "tf_cc_shared_object")
+load("//tensorflow:tensorflow.bzl", "tf_native_cc_binary")
+load("//tensorflow:tensorflow.bzl", "tf_custom_op_library_additional_deps_impl")
 load("//tensorflow:tensorflow.bzl", "cuda_py_test")
 load("//tensorflow:tensorflow.bzl", "cuda_py_tests")
 load("//tensorflow/core:platform/default/build_config.bzl", "pyx_library")
@@ -58,9 +60,10 @@ py_library(
         "//tensorflow/tools/api/generator:__pkg__",
         "//tensorflow/tools/quantization:__pkg__",  # TODO(b/34059704): remove when fixed
     ],
-    deps = [":no_contrib"] + if_not_windows([
+    deps = [
+        ":no_contrib",
         "//tensorflow/contrib:contrib_py",
-    ]),
+    ],
 )
 
 py_library(
@@ -73,6 +76,7 @@ py_library(
     deps = [
         ":array_ops",
         ":bitwise_ops",
+        ":boosted_trees_ops",
         ":check_ops",
         ":client",
         ":client_testlib",
@@ -282,6 +286,17 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "py_exception_registry",
+    srcs = ["lib/core/py_exception_registry.cc"],
+    hdrs = ["lib/core/py_exception_registry.h"],
+    deps = [
+        "//tensorflow/c:c_api",
+        "//tensorflow/core:lib",
+        "//util/python:python_headers",
+    ],
+)
+
 cc_library(
     name = "kernel_registry",
     srcs = ["util/kernel_registry.cc"],
@@ -298,6 +313,7 @@ cc_library(
     srcs = ["util/util.cc"],
     hdrs = ["util/util.h"],
     deps = [
+        ":safe_ptr",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//util/python:python_headers",
@@ -411,6 +427,7 @@ tf_cc_shared_object(
             "-lm",
         ],
         "//tensorflow:darwin": [],
+        "//tensorflow:windows": [],
     }),
     deps = [
         "//tensorflow/core:framework_headers_lib",
@@ -818,6 +835,7 @@ py_library(
     srcs = ["framework/tensor_shape.py"],
     srcs_version = "PY2AND3",
     deps = [
+        ":dtypes",
         ":util",
         "//tensorflow/core:protos_all_py",
     ],
@@ -958,7 +976,6 @@ py_test(
     srcs = ["framework/contrib_test.py"],
     main = "framework/contrib_test.py",
     srcs_version = "PY2AND3",
-    tags = ["no_windows"],
     deps = [
         "//tensorflow:tensorflow_py",
         "//tensorflow/python:client_testlib",
@@ -1328,7 +1345,6 @@ py_test(
     srcs = ["framework/dtypes_test.py"],
     main = "framework/dtypes_test.py",
     srcs_version = "PY2AND3",
-    tags = ["no_windows"],
     deps = [
         ":framework_for_generated_wrappers",
         ":framework_test_lib",
@@ -1373,6 +1389,14 @@ tf_gen_op_wrapper_private_py(
     ],
 )
 
+tf_gen_op_wrapper_private_py(
+    name = "boosted_trees_ops_gen",
+    visibility = ["//tensorflow:internal"],
+    deps = [
+        "//tensorflow/core:boosted_trees_ops_op_lib",
+    ],
+)
+
 tf_gen_op_wrapper_private_py(
     name = "summary_ops_gen",
     visibility = ["//tensorflow:__subpackages__"],
@@ -1593,7 +1617,10 @@ py_library(
 
 py_library(
     name = "array_ops",
-    srcs = ["ops/array_ops.py"],
+    srcs = [
+        "ops/array_ops.py",
+        "ops/inplace_ops.py",
+    ],
     srcs_version = "PY2AND3",
     deps = [
         ":array_ops_gen",
@@ -1622,6 +1649,19 @@ py_library(
     ],
 )
 
+py_library(
+    name = "boosted_trees_ops",
+    srcs = ["ops/boosted_trees_ops.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":boosted_trees_ops_gen",
+        ":framework",
+        ":ops",
+        ":training",
+        "//tensorflow/core/kernels/boosted_trees:boosted_trees_proto_py",
+    ],
+)
+
 py_library(
     name = "sets",
     srcs = [
@@ -1683,7 +1723,6 @@ py_test(
     size = "small",
     srcs = ["ops/clip_ops_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_windows"],
     deps = [
         ":client_testlib",
         ":clip_ops",
@@ -2235,7 +2274,6 @@ py_library(
         ":clip_ops",
         ":framework_for_generated_wrappers",
         ":init_ops",
-        ":layers_base",
         ":math_ops",
         ":nn_ops",
         ":partitioned_variables",
@@ -2512,6 +2550,30 @@ py_library(
     ],
 )
 
+py_library(
+    name = "summary_ops_v2",
+    srcs = ["ops/summary_ops_v2.py"],
+    srcs_version = "PY2AND3",
+    visibility = ["//tensorflow:internal"],
+    deps = [
+        ":array_ops",
+        ":constant_op",
+        ":control_flow_ops",
+        ":dtypes",
+        ":framework_ops",
+        ":math_ops",
+        ":resource_variable_ops",
+        ":smart_cond",
+        ":summary_op_util",
+        ":summary_ops_gen",
+        ":training_util",
+        ":util",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python/eager:context",
+        "@six_archive//:six",
+    ],
+)
+
 py_library(
     name = "template",
     srcs = ["ops/template.py"],
@@ -2752,7 +2814,6 @@ cuda_py_test(
     ],
     data = ["//tensorflow/core:image_testdata"],
     shard_count = 5,
-    tags = ["no_windows"],
 )
 
 cuda_py_test(
@@ -2875,7 +2936,10 @@ py_library(
     name = "training",
     srcs = glob(
         ["training/**/*.py"],
-        exclude = ["**/*test*"],
+        exclude = [
+            "**/*test*",
+            "training/training_util.py",  # See :training_util
+        ],
     ),
     srcs_version = "PY2AND3",
     deps = [
@@ -2909,14 +2973,17 @@ py_library(
         ":string_ops",
         ":summary",
         ":training_ops_gen",
+        ":training_util",
         ":util",
         ":variable_scope",
         ":variables",
+        "//third_party/py/numpy",
+        "@six_archive//:six",
         "//tensorflow/python/eager:backprop",
         "//tensorflow/python/eager:context",
         "//tensorflow/python/ops/losses",
-        "//third_party/py/numpy",
-        "@six_archive//:six",
+        # `layers` dependency only exists due to the use of a small utility.
+        "//tensorflow/python/keras:layers",
     ],
 )
 
@@ -3144,6 +3211,8 @@ tf_proto_library(
     srcs = ["framework/cpp_shape_inference.proto"],
     cc_api_version = 2,
     protodeps = tf_additional_all_protos(),
+    # TODO(b/74620627): remove when _USE_C_SHAPES is removed
+    visibility = ["//tensorflow:internal"],
 )
 
 py_test(
@@ -3228,6 +3297,7 @@ cuda_py_tests(
         ":client_testlib",
         ":framework_test_lib",
         ":platform_test",
+        "//tensorflow/core:protos_all_py",
     ],
 )
 
@@ -3287,6 +3357,7 @@ tf_py_wrap_cc(
         "grappler/model_analyzer.i",
         "grappler/tf_optimizer.i",
         "lib/core/bfloat16.i",
+        "lib/core/py_exception_registry.i",
         "lib/core/py_func.i",
         "lib/core/strings.i",
         "lib/io/file_io.i",
@@ -3305,6 +3376,10 @@ tf_py_wrap_cc(
         "util/transform_graph.i",
         "util/util.i",
     ],
+    win_def_file = select({
+        "//tensorflow:windows": ":pywrap_tensorflow_filtered_def_file",
+        "//conditions:default": None,
+    }),
     deps = [
         ":bfloat16_lib",
         ":cost_analyzer_lib",
@@ -3314,6 +3389,7 @@ tf_py_wrap_cc(
         ":kernel_registry",
         ":numpy_lib",
         ":safe_ptr",
+        ":py_exception_registry",
         ":py_func_lib",
         ":py_record_reader_lib",
         ":py_record_writer_lib",
@@ -3324,6 +3400,7 @@ tf_py_wrap_cc(
         "//tensorflow/c:python_api",
         "//tensorflow/c:tf_status_helper",
         "//tensorflow/c/eager:c_api",
+        "//tensorflow/core/distributed_runtime/rpc:grpc_rpc_factory_registration",
         "//tensorflow/core/distributed_runtime/rpc:grpc_server_lib",
         "//tensorflow/core/distributed_runtime/rpc:grpc_session",
         "//tensorflow/core/grappler:grappler_item",
@@ -3348,6 +3425,65 @@ tf_py_wrap_cc(
          tf_additional_gdr_deps()),
 )
 
+# ** Targets for Windows build (start) **
+# We need the following targets to expose symbols from _pywrap_tensorflow.dll
+
+# Build a cc_binary from tf_custom_op_library_additional_deps_impl,
+# it contains all object code from its dependencies.
+tf_native_cc_binary(
+    name = "tf_custom_op_library_additional_deps.so",
+    linkshared = 1,
+    linkstatic = 1,
+    deps = tf_custom_op_library_additional_deps_impl(),
+)
+
+# Get a DEF file generated by parsing all object files
+# of tf_custom_op_library_additional_deps.so
+filegroup(
+    name = "pywrap_tensorflow_def_file",
+    srcs = [":tf_custom_op_library_additional_deps.so"],
+    output_group = "def_file",
+)
+
+# Filter the DEF file to reduce the number of symbols to 64K or less.
+# Note that we also write the name of the pyd file into DEF file so that
+# the dynamic libraries of custom ops can find it at runtime.
+genrule(
+    name = "pywrap_tensorflow_filtered_def_file",
+    srcs = [":pywrap_tensorflow_def_file"],
+    outs = ["pywrap_tensorflow_filtered_def_file.def"],
+    cmd = select({
+        "//tensorflow:windows": """
+              $(location @local_config_def_file_filter//:def_file_filter) \\
+              --input $(location :pywrap_tensorflow_def_file) \\
+              --output $@ \\
+              --target _pywrap_tensorflow_internal.pyd
+          """,
+        "//conditions:default": "touch $@",  # Just a placeholder for Unix platforms
+    }),
+    tools = ["@local_config_def_file_filter//:def_file_filter"],
+)
+
+# Get the import library of  _pywrap_tensorflow_internal.dll
+filegroup(
+    name = "pywrap_tensorflow_import_lib_file",
+    srcs = [":_pywrap_tensorflow_internal.so"],
+    output_group = "interface_library",
+)
+
+# Create a cc_import rule for the import library of _pywrap_tensorflow_internal.dll
+# so that custom ops' dynamic libraries can link against it.
+cc_import(
+    name = "pywrap_tensorflow_import_lib",
+    interface_library = select({
+        "//tensorflow:windows": ":pywrap_tensorflow_import_lib_file",
+        "//conditions:default": "not_exsiting_on_unix.lib",  # Just a placeholder for Unix platforms
+    }),
+    system_provided = 1,
+)
+
+# ** Targets for Windows build (end) **
+
 py_library(
     name = "lib",
     srcs = [
@@ -3681,6 +3817,7 @@ cuda_py_test(
         ":math_ops",
         "//tensorflow/core:protos_all_py",
     ],
+    tags = ["no_windows"],
 )
 
 cuda_py_test(
@@ -3720,7 +3857,6 @@ py_test(
     size = "small",
     srcs = ["lib/core/bfloat16_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_windows"],
     deps = [
         ":client_testlib",
         ":lib",
@@ -3913,6 +4049,7 @@ py_test(
     srcs = ["training/saver_large_partitioned_variable_test.py"],
     srcs_version = "PY2AND3",
     tags = [
+        "no_windows",
         "noasan",  # http://b/30782289
         "notsan",  # http://b/30782289
     ],
@@ -4028,7 +4165,6 @@ py_test(
     size = "small",
     srcs = ["training/checkpoint_ops_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_windows"],
     deps = [
         ":checkpoint_ops_gen",
         ":client",
@@ -4069,10 +4205,7 @@ py_test(
     size = "medium",
     srcs = ["training/monitored_session_test.py"],
     srcs_version = "PY2AND3",
-    tags = [
-        "no_windows",
-        "notsan",  # b/67945581
-    ],
+    tags = ["notsan"],  # b/67945581
     deps = [
         ":array_ops",
         ":client_testlib",
@@ -4090,6 +4223,25 @@ py_test(
     ],
 )
 
+py_library(
+    name = "training_util",
+    srcs = ["training/training_util.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":dtypes",
+        ":framework",
+        ":framework_ops",
+        ":init_ops",
+        ":platform",
+        ":resource_variable_ops",
+        ":state_ops",
+        ":util",
+        ":variable_scope",
+        ":variables",
+        "//tensorflow/python/eager:context",
+    ],
+)
+
 py_test(
     name = "training_util_test",
     size = "small",
@@ -4100,6 +4252,7 @@ py_test(
         ":framework",
         ":platform",
         ":training",
+        ":training_util",
         ":variables",
     ],
 )
@@ -4144,6 +4297,8 @@ py_library(
     srcs_version = "PY2AND3",
     visibility = ["//visibility:public"],
     deps = [
+        ":client",
+        ":constant_op",
         ":errors",
         ":framework",
         ":framework_for_generated_wrappers",
@@ -4155,6 +4310,7 @@ py_library(
         ":summary_op_util",
         ":summary_ops",
         ":summary_ops_gen",
+        ":summary_ops_v2",
         ":util",
         "//tensorflow/python/eager:context",
         "//third_party/py/numpy",
@@ -4181,7 +4337,7 @@ py_tests(
         ":platform",
         ":platform_test",
         ":summary",
-        ":training",
+        ":summary_ops_v2",
         "//tensorflow/core:protos_all_py",
     ],
 )
@@ -4206,6 +4362,7 @@ py_library(
         ":variables",
         "//tensorflow/python/eager:context",
         "//tensorflow/python/estimator:util",
+        "//tensorflow/python/keras:engine",
         "//third_party/py/numpy",
     ],
 )
@@ -4242,6 +4399,7 @@ py_library(
         ":variables",
         "//tensorflow/python/eager:context",
         "//tensorflow/python/estimator:util",
+        "//tensorflow/python/keras:layers",
         "//third_party/py/numpy",
         "@six_archive//:six",
     ],
@@ -4382,18 +4540,6 @@ py_test(
     ],
 )
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 cuda_py_test(
     name = "accumulate_n_benchmark",
     size = "large",
@@ -4695,6 +4841,7 @@ py_test(
         ":client_testlib",
         ":framework_for_generated_wrappers",
         ":math_ops",
+        ":tf_item",
         ":tf_optimizer",
         "//tensorflow/core:protos_all_py",
         "//third_party/py/numpy",
@@ -4755,6 +4902,29 @@ py_test(
     ],
 )
 
+cuda_py_test(
+    name = "constant_folding_test",
+    size = "medium",
+    srcs = [
+        "grappler/constant_folding_test.py",
+    ],
+    additional_deps = [
+        ":client_testlib",
+        ":framework_for_generated_wrappers",
+        ":array_ops",
+        ":control_flow_ops",
+        ":dtypes",
+        ":functional_ops",
+        ":math_ops",
+        ":ops",
+        "//third_party/py/numpy",
+        "//tensorflow/core:protos_all_py",
+    ],
+    tags = [
+        "grappler",
+    ],
+)
+
 cuda_py_test(
     name = "layout_optimizer_test",
     size = "medium",
diff --git a/tensorflow/python/__init__.py b/tensorflow/python/__init__.py
index 3346937904885c216d7a8de86fc6036604376173..13f8420a670fe64615037975139f3ee1f16820b6 100644
--- a/tensorflow/python/__init__.py
+++ b/tensorflow/python/__init__.py
@@ -98,6 +98,8 @@ from tensorflow.python.summary import summary
 from tensorflow.python.user_ops import user_ops
 from tensorflow.python.util import compat
 
+# Import boosted trees ops to make sure the ops are registered (but unused).
+from tensorflow.python.ops import gen_boosted_trees_ops as _gen_boosted_trees_ops
 
 # Import cudnn rnn ops to make sure their ops are registered.
 from tensorflow.python.ops import gen_cudnn_rnn_ops as _
@@ -147,6 +149,19 @@ from tensorflow.python.ops import tensor_array_ops
 from tensorflow.python.eager.context import executing_eagerly
 from tensorflow.python.framework.ops import enable_eager_execution
 
+# Necessary for the symbols in this module to be taken into account by
+# the namespace management system (API decorators).
+from tensorflow.python.ops import rnn
+from tensorflow.python.ops import rnn_cell
+
+# Required due to `rnn` and `rnn_cell` not being imported in `nn` directly
+# (due to a circular dependency issue: rnn depends on layers).
+nn.dynamic_rnn = rnn.dynamic_rnn
+nn.static_rnn = rnn.static_rnn
+nn.raw_rnn = rnn.raw_rnn
+nn.bidirectional_dynamic_rnn = rnn.bidirectional_dynamic_rnn
+nn.rnn_cell = rnn_cell
+
 # Symbols whitelisted for export without documentation.
 # TODO(cwhipkey): review these and move to contrib, expose through
 # documentation, or remove.
diff --git a/tensorflow/python/client/device_lib.i b/tensorflow/python/client/device_lib.i
index 51c04584a5492e13f5fead627685954d4f810dfa..944e855cee2ab9da7a4a801d1b993bec4d8ebc55 100644
--- a/tensorflow/python/client/device_lib.i
+++ b/tensorflow/python/client/device_lib.i
@@ -15,19 +15,39 @@ limitations under the License.
 
 %include "tensorflow/python/platform/base.i"
 
+%typemap(in) const tensorflow::ConfigProto& (tensorflow::ConfigProto temp) {
+  char* c_string;
+  Py_ssize_t py_size;
+  if (PyBytes_AsStringAndSize($input, &c_string, &py_size) == -1) {
+    // Python has raised an error (likely TypeError or UnicodeEncodeError).
+    SWIG_fail;
+  }
+
+  if (!temp.ParseFromString(string(c_string, py_size))) {
+    PyErr_SetString(
+        PyExc_TypeError,
+        "The ConfigProto could not be parsed as a valid protocol buffer");
+    SWIG_fail;
+  }
+  $1 = &temp;
+}
+
 %{
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/common_runtime/device_factory.h"
 #include "tensorflow/core/framework/device_attributes.pb.h"
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/protobuf/config.pb.h"
 #include "tensorflow/core/public/session_options.h"
 
 namespace tensorflow {
 namespace swig {
 
-static std::vector<string> ListDevices(TF_Status* out_status) {
+static std::vector<string> ListDevicesWithSessionConfig(
+    const tensorflow::ConfigProto& config, TF_Status* out_status) {
   std::vector<string> output;
   SessionOptions options;
+  options.config = config;
   std::vector<Device*> devices;
   Status status = DeviceFactory::AddDevices(
       options, "" /* name_prefix */, &devices);
@@ -35,7 +55,8 @@ static std::vector<string> ListDevices(TF_Status* out_status) {
     Set_TF_Status_from_Status(out_status, status);
   }
 
-  std::vector<std::unique_ptr<Device>> device_holder(devices.begin(), devices.end());
+  std::vector<std::unique_ptr<Device>> device_holder(devices.begin(),
+                                                     devices.end());
 
   for (const Device* device : devices) {
     const DeviceAttributes& attr = device->attributes();
@@ -53,6 +74,11 @@ static std::vector<string> ListDevices(TF_Status* out_status) {
   return output;
 }
 
+std::vector<string> ListDevices(TF_Status* out_status) {
+  tensorflow::ConfigProto session_config;
+  return ListDevicesWithSessionConfig(session_config, out_status);
+}
+
 }  // namespace swig
 }  // namespace tensorflow
 
@@ -62,21 +88,28 @@ static std::vector<string> ListDevices(TF_Status* out_status) {
 
 %unignore tensorflow;
 %unignore tensorflow::swig;
+%unignore tensorflow::swig::ListDevicesWithSessionConfig;
 %unignore tensorflow::swig::ListDevices;
 
 // Wrap this function
 namespace tensorflow {
 namespace swig {
 std::vector<string> ListDevices(TF_Status* out_status);
+static std::vector<string> ListDevicesWithSessionConfig(
+    const tensorflow::ConfigProto& config, TF_Status* out_status);
 }  // namespace swig
 }  // namespace tensorflow
 
 %insert("python") %{
-def list_devices():
+def list_devices(session_config=None):
   from tensorflow.python.framework import errors
 
   with errors.raise_exception_on_not_ok_status() as status:
-    return ListDevices(status)
+    if session_config:
+      return ListDevicesWithSessionConfig(session_config.SerializeToString(),
+                                          status)
+    else:
+      return ListDevices(status)
 %}
 
 %unignoreall
diff --git a/tensorflow/python/client/device_lib.py b/tensorflow/python/client/device_lib.py
index ad430cbae5a42a388cc8c41bf8be9db253aa92f2..9d90d5395e288e5988c60df64b9d962f5cccc22a 100644
--- a/tensorflow/python/client/device_lib.py
+++ b/tensorflow/python/client/device_lib.py
@@ -22,9 +22,12 @@ from tensorflow.core.framework import device_attributes_pb2
 from tensorflow.python import pywrap_tensorflow
 
 
-def list_local_devices():
+def list_local_devices(session_config=None):
   """List the available devices available in the local process.
 
+  Args:
+    session_config: a session config proto or None to use the default config.
+
   Returns:
     A list of `DeviceAttribute` protocol buffers.
   """
@@ -33,4 +36,7 @@ def list_local_devices():
     m.ParseFromString(pb_str)
     return m
 
-  return [_convert(s) for s in pywrap_tensorflow.list_devices()]
+  return [
+      _convert(s)
+      for s in pywrap_tensorflow.list_devices(session_config=session_config)
+  ]
diff --git a/tensorflow/python/client/device_lib_test.py b/tensorflow/python/client/device_lib_test.py
index aaf41626ab0078489026036d2b838f33a893a540..fec41f50b6c130704d587d6c7b80297c95183005 100644
--- a/tensorflow/python/client/device_lib_test.py
+++ b/tensorflow/python/client/device_lib_test.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.client import device_lib
 from tensorflow.python.framework import test_util
 from tensorflow.python.platform import googletest
@@ -31,6 +32,10 @@ class DeviceLibTest(test_util.TensorFlowTestCase):
     self.assertGreater(len(devices), 0)
     self.assertEqual(devices[0].device_type, "CPU")
 
+    devices = device_lib.list_local_devices(config_pb2.ConfigProto())
+    self.assertGreater(len(devices), 0)
+    self.assertEqual(devices[0].device_type, "CPU")
+
     # GPU test
     if test.is_gpu_available():
       self.assertGreater(len(devices), 1)
diff --git a/tensorflow/python/client/session.py b/tensorflow/python/client/session.py
index da5dc6f5998bd6f63445dc3694e53d1032e3d1ab..4c84d78f2e11922e4819e45aaee79374c8c5ec34 100644
--- a/tensorflow/python/client/session.py
+++ b/tensorflow/python/client/session.py
@@ -27,7 +27,6 @@ import numpy as np
 
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python import pywrap_tensorflow as tf_session
-from tensorflow.python.framework import c_api_util
 from tensorflow.python.framework import device
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
@@ -629,14 +628,12 @@ class BaseSession(SessionInterface):
     self._session = None
     opts = tf_session.TF_NewSessionOptions(target=self._target, config=config)
     try:
-      with errors.raise_exception_on_not_ok_status() as status:
-        if self._created_with_new_api:
-          # pylint: disable=protected-access
-          self._session = tf_session.TF_NewSession(self._graph._c_graph, opts,
-                                                   status)
-          # pylint: enable=protected-access
-        else:
-          self._session = tf_session.TF_NewDeprecatedSession(opts, status)
+      if self._created_with_new_api:
+        # pylint: disable=protected-access
+        self._session = tf_session.TF_NewSession(self._graph._c_graph, opts)
+        # pylint: enable=protected-access
+      else:
+        self._session = tf_session.TF_NewDeprecatedSession(opts)
     finally:
       tf_session.TF_DeleteSessionOptions(opts)
 
@@ -663,22 +660,20 @@ class BaseSession(SessionInterface):
     Returns:
       A list of devices in the session.
     """
-    with errors.raise_exception_on_not_ok_status() as status:
-      if self._created_with_new_api:
-        raw_device_list = tf_session.TF_SessionListDevices(
-            self._session, status)
-      else:
-        raw_device_list = tf_session.TF_DeprecatedSessionListDevices(
-            self._session, status)
-      device_list = []
-      size = tf_session.TF_DeviceListCount(raw_device_list)
-      for i in range(size):
-        name = tf_session.TF_DeviceListName(raw_device_list, i, status)
-        device_type = tf_session.TF_DeviceListType(raw_device_list, i, status)
-        memory = tf_session.TF_DeviceListMemoryBytes(raw_device_list, i, status)
-        device_list.append(_DeviceAttributes(name, device_type, memory))
-      tf_session.TF_DeleteDeviceList(raw_device_list)
-      return device_list
+    if self._created_with_new_api:
+      raw_device_list = tf_session.TF_SessionListDevices(self._session)
+    else:
+      raw_device_list = tf_session.TF_DeprecatedSessionListDevices(
+          self._session)
+    device_list = []
+    size = tf_session.TF_DeviceListCount(raw_device_list)
+    for i in range(size):
+      name = tf_session.TF_DeviceListName(raw_device_list, i)
+      device_type = tf_session.TF_DeviceListType(raw_device_list, i)
+      memory = tf_session.TF_DeviceListMemoryBytes(raw_device_list, i)
+      device_list.append(_DeviceAttributes(name, device_type, memory))
+    tf_session.TF_DeleteDeviceList(raw_device_list)
+    return device_list
 
   def close(self):
     """Closes this session.
@@ -692,15 +687,13 @@ class BaseSession(SessionInterface):
     if self._created_with_new_api:
       if self._session and not self._closed:
         self._closed = True
-        with errors.raise_exception_on_not_ok_status() as status:
-          tf_session.TF_CloseSession(self._session, status)
+        tf_session.TF_CloseSession(self._session)
 
     else:
       with self._extend_lock:
         if self._opened and not self._closed:
           self._closed = True
-          with errors.raise_exception_on_not_ok_status() as status:
-            tf_session.TF_CloseDeprecatedSession(self._session, status)
+          tf_session.TF_CloseDeprecatedSession(self._session)
 
   def __del__(self):
     # cleanly ignore all exceptions
@@ -710,11 +703,10 @@ class BaseSession(SessionInterface):
       pass
     if self._session is not None:
       try:
-        status = c_api_util.ScopedTFStatus()
         if self._created_with_new_api:
-          tf_session.TF_DeleteSession(self._session, status)
+          tf_session.TF_DeleteSession(self._session)
         else:
-          tf_session.TF_DeleteDeprecatedSession(self._session, status)
+          tf_session.TF_DeleteDeprecatedSession(self._session)
       except AttributeError:
         # At shutdown, `c_api_util` or `tf_session` may have been garbage
         # collected, causing the above method calls to fail. In this case,
@@ -1031,11 +1023,11 @@ class BaseSession(SessionInterface):
     # Set up a graph with feeds and fetches for partial run.
     def _setup_fn(session, feed_list, fetch_list, target_list):
       self._extend_graph()
-      with errors.raise_exception_on_not_ok_status() as status:
-        if self._created_with_new_api:
-          return tf_session.TF_SessionPRunSetup_wrapper(
-              session, feed_list, fetch_list, target_list, status)
-        else:
+      if self._created_with_new_api:
+        return tf_session.TF_SessionPRunSetup_wrapper(
+            session, feed_list, fetch_list, target_list)
+      else:
+        with errors.raise_exception_on_not_ok_status() as status:
           return tf_session.TF_PRunSetup(session, feed_list, fetch_list,
                                          target_list, status)
 
@@ -1345,8 +1337,7 @@ class BaseSession(SessionInterface):
   def _extend_graph(self):
     if self._created_with_new_api:
       with self._graph._lock:  # pylint: disable=protected-access
-        with errors.raise_exception_on_not_ok_status() as status:
-          tf_session.ExtendSession(self._session, status)
+        tf_session.ExtendSession(self._session)
     else:
       # Ensure any changes to the graph are reflected in the runtime.
       with self._extend_lock:
@@ -1412,25 +1403,82 @@ class BaseSession(SessionInterface):
 
   def _call_tf_sessionrun(self, options, feed_dict, fetch_list, target_list,
                           run_metadata):
-    with errors.raise_exception_on_not_ok_status() as status:
-      if self._created_with_new_api:
-        return tf_session.TF_SessionRun_wrapper(
-            self._session, options, feed_dict, fetch_list, target_list,
-            run_metadata, status)
-      else:
+    if self._created_with_new_api:
+      return tf_session.TF_SessionRun_wrapper(
+          self._session, options, feed_dict, fetch_list, target_list,
+          run_metadata)
+    else:
+      with errors.raise_exception_on_not_ok_status() as status:
         return tf_session.TF_Run(
             self._session, options, feed_dict, fetch_list, target_list,
             status, run_metadata)
 
   def _call_tf_sessionprun(self, handle, feed_dict, fetch_list):
-    with errors.raise_exception_on_not_ok_status() as status:
-      if self._created_with_new_api:
-        return tf_session.TF_SessionPRun_wrapper(
-            self._session, handle, feed_dict, fetch_list, status)
-      else:
+    if self._created_with_new_api:
+      return tf_session.TF_SessionPRun_wrapper(
+          self._session, handle, feed_dict, fetch_list)
+    else:
+      with errors.raise_exception_on_not_ok_status() as status:
         return tf_session.TF_PRun(
             self._session, handle, feed_dict, fetch_list, status)
 
+  # pylint: disable=protected-access
+  class _Callable(object):
+    """Experimental wrapper for the C++ `Session::MakeCallable()` API."""
+
+    def __init__(self, session, callable_options):
+      self._session = session
+      self._handle = None
+      options_ptr = tf_session.TF_NewBufferFromString(
+          compat.as_bytes(callable_options.SerializeToString()))
+      try:
+        with errors.raise_exception_on_not_ok_status() as status:
+          if session._created_with_new_api:
+            self._handle = tf_session.TF_SessionMakeCallable(
+                session._session, options_ptr, status)
+          else:
+            self._handle = tf_session.TF_DeprecatedSessionMakeCallable(
+                session._session, options_ptr, status)
+      finally:
+        tf_session.TF_DeleteBuffer(options_ptr)
+
+    def __call__(self, *args):
+      # TODO(b/74355905): Support argument and return value nested structures,
+      # and tensor-like objects such as SparseTensors.
+      with errors.raise_exception_on_not_ok_status() as status:
+        if self._session._created_with_new_api:
+          return tf_session.TF_SessionRunCallable(
+              self._session._session, self._handle, args, status, None)
+        else:
+          return tf_session.TF_DeprecatedSessionRunCallable(
+              self._session._session, self._handle, args, status, None)
+
+    def __del__(self):
+      if self._handle is not None:
+        with errors.raise_exception_on_not_ok_status() as status:
+          if self._session._created_with_new_api:
+            tf_session.TF_SessionReleaseCallable(
+                self._session._session, self._handle, status)
+          else:
+            tf_session.TF_DeprecatedSessionReleaseCallable(
+                self._session._session, self._handle, status)
+  # pylint: enable=protected-access
+
+  # TODO(b/74355905): Reimplement `Session.make_callable()` using this method
+  # where possible.
+  def _make_callable_from_options(self, callable_options):
+    """Returns a handle to a "callable" with the given options.
+
+    Args:
+      callable_options: A `CallableOptions` protocol buffer message describing
+        the computation that will be performed by the callable.
+
+    Returns:
+      A handle to the new callable.
+    """
+    self._extend_graph()
+    return BaseSession._Callable(self, callable_options)
+
 
 @tf_export('Session')
 class Session(BaseSession):
diff --git a/tensorflow/python/client/session_list_devices_test.py b/tensorflow/python/client/session_list_devices_test.py
index 5a7413c12e9db92cb85d54a69602753ff6476425..38a3acb2dc304968915e84c8054621e441294e61 100644
--- a/tensorflow/python/client/session_list_devices_test.py
+++ b/tensorflow/python/client/session_list_devices_test.py
@@ -23,7 +23,6 @@ from tensorflow.core.protobuf import cluster_pb2
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python import pywrap_tensorflow as tf_session
 from tensorflow.python.client import session
-from tensorflow.python.framework import c_api_util
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
@@ -42,21 +41,13 @@ class SessionListDevicesTestMethods(object):
 
   def testInvalidDeviceNumber(self):
     opts = tf_session.TF_NewSessionOptions()
-    with errors.raise_exception_on_not_ok_status() as status:
-      c_session = tf_session.TF_NewSession(
-          ops.get_default_graph()._c_graph, opts, status)
-      raw_device_list = tf_session.TF_SessionListDevices(
-          c_session, status)
+    c_session = tf_session.TF_NewSession(ops.get_default_graph()._c_graph, opts)
+    raw_device_list = tf_session.TF_SessionListDevices(c_session)
     size = tf_session.TF_DeviceListCount(raw_device_list)
-    # Test that invalid device numbers return -1 rather than a Swig-wrapped
-    # pointer.
-    status_no_exception = c_api_util.ScopedTFStatus()
-    memory = tf_session.TF_DeviceListMemoryBytes(
-        raw_device_list, size, status_no_exception)
-    self.assertEqual(memory, -1)
+    with self.assertRaises(errors.InvalidArgumentError):
+      tf_session.TF_DeviceListMemoryBytes(raw_device_list, size)
     tf_session.TF_DeleteDeviceList(raw_device_list)
-    with errors.raise_exception_on_not_ok_status() as status:
-      tf_session.TF_CloseSession(c_session, status)
+    tf_session.TF_CloseSession(c_session)
 
   def testListDevicesGrpcSession(self):
     server = server_lib.Server.create_local_server()
diff --git a/tensorflow/python/client/session_test.py b/tensorflow/python/client/session_test.py
index 6e2640efd1d58ab524e42b62f62ad3d38f360c0e..92497272c66b5c3be36aba75b9e3b7f3d99b062d 100644
--- a/tensorflow/python/client/session_test.py
+++ b/tensorflow/python/client/session_test.py
@@ -1371,6 +1371,18 @@ class SessionTest(test_util.TensorFlowTestCase):
                               run_metadata=run_metadata))
       self.assertGreater(len(run_metadata.step_stats.dev_stats), 0)
 
+  def testOptimizedMakeCallable(self):
+    with session.Session() as sess:
+      ph = array_ops.placeholder(dtypes.float32)
+      a = math_ops.add(ph, 1.0)
+      callable_opts = config_pb2.CallableOptions()
+      callable_opts.feed.append(ph.name)
+      callable_opts.fetch.append(a.name)
+      for _ in range(3):
+        callable_fn = sess._make_callable_from_options(callable_opts)
+        for _ in range(5):
+          self.assertEqual([2.0], callable_fn(np.array(1.0, dtype=np.float32)))
+
   def testFeedError(self):
     with session.Session() as sess:
       feed_t = array_ops.placeholder(dtype=dtypes.float32)
diff --git a/tensorflow/python/client/tf_session.i b/tensorflow/python/client/tf_session.i
index e88fc0c01a8bb7534f47e2a0389965c102bbad7b..b82182d5d3690e4601b4fe8423cef972139f2283 100644
--- a/tensorflow/python/client/tf_session.i
+++ b/tensorflow/python/client/tf_session.i
@@ -18,11 +18,11 @@ limitations under the License.
 %{
 
 #include "tensorflow/c/python_api.h"
-#include "tensorflow/python/client/tf_session_helper.h"
 #include "tensorflow/core/framework/session_state.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/public/version.h"
+#include "tensorflow/python/client/tf_session_helper.h"
 
 // Helper function to convert a Python list of Tensors to a C++ vector of
 // TF_Outputs.
@@ -72,7 +72,7 @@ void PyInt64ListToVector(PyObject* py_int_seq, std::vector<int64_t>* vec) {
   int size = PySequence_Fast_GET_SIZE(py_int_seq);
   for (int i = 0; i < size; ++i) {
     PyObject* item = PySequence_Fast_GET_ITEM(py_int_seq, i);
-    vec->push_back(PyInt_AsLong(item));
+    vec->push_back(PyLong_AsLongLong(item));
   }
 }
 
@@ -157,6 +157,25 @@ tensorflow::ImportNumpy();
   }
 }
 
+// We use TF_OperationGetControlOutputs_wrapper instead of
+// TF_OperationGetControlOutputs
+%ignore TF_OperationGetControlOutputs;
+%unignore TF_OperationGetControlOutputs_wrapper;
+// See comment for "%noexception TF_SessionRun_wrapper;"
+%noexception TF_OperationGetControlOutputs_wrapper;
+
+// Build a Python list of TF_Operation* and return it.
+%typemap(out) std::vector<TF_Operation*> tensorflow::TF_OperationGetControlOutputs_wrapper {
+  $result = PyList_New($1.size());
+  if (!$result) {
+    SWIG_exception_fail(SWIG_MemoryError, "$symname: couldn't create list");
+  }
+
+  for (size_t i = 0; i < $1.size(); ++i) {
+    PyList_SET_ITEM($result, i, CreateWrappedTFOperation($1[i]));
+  }
+}
+
 %ignore TF_OperationOutputConsumers;
 %unignore TF_OperationOutputConsumers_wrapper;
 // See comment for "%noexception TF_SessionRun_wrapper;"
@@ -419,6 +438,30 @@ TF_ImportGraphDefResultsMissingUnusedInputMappings_wrapper{
   $result = new_result;
 }
 
+%typemap(in, numinputs=0) int64_t* out_handle (int64_t out_handle) {
+  $1 = &out_handle;
+}
+
+%typemap(argout) int64_t* out_handle {
+  $result = PyLong_FromLongLong(*$1);
+}
+
+%typemap(in) int64_t handle {
+  if (!PyLong_Check($input)) {
+    SWIG_exception_fail(
+        SWIG_TypeError,
+        tensorflow::strings::Printf(
+            "Expected a python long for conversion to callable handle but got %s",
+            Py_TYPE($input)->tp_name).c_str());
+  }
+  $1 = PyLong_AsLongLong($input);
+}
+
+// Override default py3 behavior of attempting to encode into Unicode.
+%typemap(out) std::string tensorflow::ResourceHandleShapeAndType {
+  $result = PyBytes_FromStringAndSize($1.data(), $1.size());
+}
+
 // TODO(skyewm): SWIG emits a warning for the const char* in TF_WhileParams,
 // skip for now
 %ignore TF_WhileParams;
@@ -452,6 +495,17 @@ TF_ImportGraphDefResultsMissingUnusedInputMappings_wrapper{
 // See comment for "%noexception TF_SessionRun_wrapper;"
 %noexception TF_SessionPRun_wrapper;
 
+%unignore TF_DeprecatedSessionMakeCallable;
+%unignore TF_SessionMakeCallable;
+%unignore TF_DeprecatedSessionRunCallable;
+%unignore TF_SessionRunCallable;
+%unignore TF_DeprecatedSessionReleaseCallable;
+%unignore TF_SessionReleaseCallable;
+
+// See comment for "%noexception TF_SessionRun_wrapper;"
+%noexception TF_DeprecatedSessionRunCallable;
+%noexception TF_SessionRunCallable;
+
 %rename("_TF_SetTarget") TF_SetTarget;
 %rename("_TF_SetConfig") TF_SetConfig;
 %rename("_TF_NewSessionOptions") TF_NewSessionOptions;
@@ -469,9 +523,8 @@ TF_ImportGraphDefResultsMissingUnusedInputMappings_wrapper{
       _TF_SetTarget(opts, target)
     if config is not None:
       from tensorflow.python.framework import errors
-      with errors.raise_exception_on_not_ok_status() as status:
-        config_str = config.SerializeToString()
-        _TF_SetConfig(opts, config_str, status)
+      config_str = config.SerializeToString()
+      _TF_SetConfig(opts, config_str)
     return opts
 %}
 
@@ -723,6 +776,7 @@ def TF_Reset(target, containers=None, config=None):
 %unignore TF_TryEvaluateConstant_wrapper;
 %noexception TF_TryEvaluateConstant_wrapper;
 %unignore ExtendSession;
+%unignore ResourceHandleShapeAndType;
 
 %include "tensorflow/python/client/tf_session_helper.h"
 
diff --git a/tensorflow/python/client/tf_session_helper.cc b/tensorflow/python/client/tf_session_helper.cc
index a8ab91749a86749a1eef25e2674634334682d0f3..b6481e7e29e4057f08e1c78b310bf5581afc5411 100644
--- a/tensorflow/python/client/tf_session_helper.cc
+++ b/tensorflow/python/client/tf_session_helper.cc
@@ -155,6 +155,156 @@ void TF_Run_wrapper(TF_DeprecatedSession* session, const TF_Buffer* run_options,
   ClearDecrefCache();
 }
 
+namespace {
+void MakeCallableHelper(tensorflow::Session* session,
+                        const TF_Buffer* callable_options, int64_t* out_handle,
+                        TF_Status* out_status) {
+  tensorflow::CallableOptions callable_options_proto;
+  if (callable_options != nullptr &&
+      !callable_options_proto.ParseFromArray(callable_options->data,
+                                             callable_options->length)) {
+    Set_TF_Status_from_Status(
+        out_status,
+        errors::InvalidArgument("Unparseable CallableOptions proto"));
+    return;
+  }
+  tensorflow::Session::CallableHandle handle;
+  Status s = session->MakeCallable(callable_options_proto, &handle);
+  if (!s.ok()) {
+    Set_TF_Status_from_Status(out_status, s);
+    return;
+  }
+  *out_handle = handle;
+}
+}  // namespace
+
+void TF_DeprecatedSessionMakeCallable(TF_DeprecatedSession* session,
+                                      const TF_Buffer* callable_options,
+                                      int64_t* out_handle,
+                                      TF_Status* out_status) {
+  MakeCallableHelper(session->session, callable_options, out_handle,
+                     out_status);
+}
+void TF_SessionMakeCallable(TF_Session* session,
+                            const TF_Buffer* callable_options,
+                            int64_t* out_handle, TF_Status* out_status) {
+  MakeCallableHelper(session->session, callable_options, out_handle,
+                     out_status);
+}
+
+namespace {
+void RunCallableHelper(tensorflow::Session* session, int64_t handle,
+                       PyObject* feed_values, TF_Status* out_status,
+                       PyObjectVector* out_values, TF_Buffer* run_metadata) {
+  // Convert feed values to a vector of tensorflow::Tensor objects.
+  std::vector<Tensor> input_tensors;
+  Status s;
+  {
+    feed_values =
+        PySequence_Fast(feed_values, "feed_values must be a sequence");
+    if (feed_values == nullptr) return;
+    Safe_PyObjectPtr feed_values_holder(make_safe(feed_values));
+    Py_ssize_t len = PySequence_Fast_GET_SIZE(feed_values);
+    input_tensors.reserve(len);
+    for (Py_ssize_t i = 0; i < len; ++i) {
+      PyObject* elem = PySequence_Fast_GET_ITEM(feed_values, i);
+      if (!elem) {
+        Set_TF_Status_from_Status(
+            out_status, errors::Internal("Could not get feed value ", i));
+        return;
+      }
+      Tensor t;
+      s = NdarrayToTensor(elem, &t);
+      if (!s.ok()) {
+        Set_TF_Status_from_Status(out_status, s);
+        return;
+      }
+      input_tensors.push_back(std::move(t));
+    }
+  }
+
+  // Allocate a RunMetadata protobuf object to receive the metadata,
+  // if the caller is expecting any.
+  std::unique_ptr<RunMetadata> run_metadata_proto;
+  if (run_metadata != nullptr) {
+    run_metadata_proto.reset(new RunMetadata);
+  }
+
+  // Run the callable.
+  std::vector<Tensor> output_tensors;
+  Py_BEGIN_ALLOW_THREADS;
+  s = session->RunCallable(handle, input_tensors, &output_tensors,
+                           run_metadata_proto.get());
+  Py_END_ALLOW_THREADS;
+
+  if (!s.ok()) {
+    Set_TF_Status_from_Status(out_status, s);
+    return;
+  }
+
+  // If requested, serialize the RunMetadata to pass it back to the caller.
+  if (run_metadata != nullptr) {
+    s = MessageToBuffer(*run_metadata_proto, run_metadata);
+    if (!s.ok()) {
+      Set_TF_Status_from_Status(out_status, s);
+      return;
+    }
+  }
+
+  // Convert results to NumPy arrays. Since this can fail, stage the
+  // results via a safe container that takes care of decreasing the
+  // reference count on failure.
+  std::vector<Safe_PyObjectPtr> py_outputs_safe;
+  py_outputs_safe.reserve(output_tensors.size());
+  for (const Tensor& output : output_tensors) {
+    PyObject* py_array;
+    s = TensorToNdarray(output, &py_array);
+    if (!s.ok()) {
+      Set_TF_Status_from_Status(out_status, s);
+      return;
+    }
+    py_outputs_safe.push_back(make_safe(py_array));
+  }
+
+  // If we reach this point, we have successfully built a list of objects
+  // so we can release them from the safe container.
+  out_values->reserve(py_outputs_safe.size());
+  for (auto& output : py_outputs_safe) {
+    out_values->push_back(output.release());
+  }
+}
+}  // namespace
+
+void TF_DeprecatedSessionRunCallable(TF_DeprecatedSession* session,
+                                     int64_t handle, PyObject* feed_values,
+                                     TF_Status* out_status,
+                                     PyObjectVector* out_values,
+                                     TF_Buffer* run_metadata) {
+  RunCallableHelper(session->session, handle, feed_values, out_status,
+                    out_values, run_metadata);
+  ClearDecrefCache();
+}
+void TF_SessionRunCallable(TF_Session* session, int64_t handle,
+                           PyObject* feed_values, TF_Status* out_status,
+                           PyObjectVector* out_values,
+                           TF_Buffer* run_metadata) {
+  RunCallableHelper(session->session, handle, feed_values, out_status,
+                    out_values, run_metadata);
+  ClearDecrefCache();
+}
+
+void TF_DeprecatedSessionReleaseCallable(TF_DeprecatedSession* session,
+                                         int64_t handle,
+                                         TF_Status* out_status) {
+  Set_TF_Status_from_Status(out_status,
+                            session->session->ReleaseCallable(handle));
+}
+void TF_SessionReleaseCallable(TF_Session* session, int64_t handle,
+                               TF_Status* out_status) {
+  Set_TF_Status_from_Status(out_status,
+                            session->session->ReleaseCallable(handle));
+}
+
 // Wrapper for TF_PRunSetup that converts the arguments to appropriate types.
 // If *out_status is OK, the caller becomes the owner of *out_handle.
 void TF_PRunSetup_wrapper(TF_DeprecatedSession* session,
@@ -400,6 +550,15 @@ std::vector<TF_Operation*> TF_OperationGetControlInputs_wrapper(
   return control_inputs;
 }
 
+std::vector<TF_Operation*> TF_OperationGetControlOutputs_wrapper(
+    TF_Operation* oper) {
+  std::vector<TF_Operation*> control_outputs(
+      TF_OperationNumControlOutputs(oper));
+  TF_OperationGetControlOutputs(oper, control_outputs.data(),
+                                control_outputs.size());
+  return control_outputs;
+}
+
 std::vector<const char*> TF_OperationOutputConsumers_wrapper(
     TF_Output oper_out) {
   int num_consumers = TF_OperationOutputNumConsumers(oper_out);
@@ -470,15 +629,6 @@ void TF_GraphSetTensorShape_wrapper(TF_Graph* graph, TF_Output output,
   TF_GraphSetTensorShape(graph, output, dims.data(), dims.size(), status);
 }
 
-std::vector<int64_t> TF_GraphGetTensorShape_wrapper(TF_Graph* graph,
-                                                    TF_Output output,
-                                                    int num_dims,
-                                                    TF_Status* status) {
-  std::vector<int64_t> dims(num_dims);
-  TF_GraphGetTensorShape(graph, output, dims.data(), num_dims, status);
-  return dims;
-}
-
 std::vector<string> TF_ImportGraphDefResultsMissingUnusedInputMappings_wrapper(
     TF_ImportGraphDefResults* results) {
   int num_missing_unused_input_mappings;
diff --git a/tensorflow/python/client/tf_session_helper.h b/tensorflow/python/client/tf_session_helper.h
index 83318dc178f6da3828a8dc41e81b7fc3e2e19e22..cfd27c2bee990ab4e2829652a532761e674ed8e0 100644
--- a/tensorflow/python/client/tf_session_helper.h
+++ b/tensorflow/python/client/tf_session_helper.h
@@ -59,6 +59,31 @@ void TF_Run_wrapper(TF_DeprecatedSession* session, const TF_Buffer* run_options,
                     const NameVector& target_nodes, TF_Status* out_status,
                     PyObjectVector* out_values, TF_Buffer* run_outputs);
 
+// Python wrappers for the `Session::MakeCallable()` API.
+void TF_DeprecatedSessionMakeCallable(TF_DeprecatedSession* session,
+                                      const TF_Buffer* callable_options,
+                                      int64_t* out_handle,
+                                      TF_Status* out_status);
+void TF_SessionMakeCallable(TF_Session* session,
+                            const TF_Buffer* callable_options,
+                            int64_t* out_handle, TF_Status* out_status);
+
+// Python wrappers for the `Session::RunCallable()` API.
+void TF_DeprecatedSessionRunCallable(TF_DeprecatedSession* session,
+                                     int64_t handle, PyObject* feed_values,
+                                     TF_Status* out_status,
+                                     PyObjectVector* out_values,
+                                     TF_Buffer* run_metadata);
+void TF_SessionRunCallable(TF_Session* session, int64_t handle,
+                           PyObject* feed_values, TF_Status* out_status,
+                           PyObjectVector* out_values, TF_Buffer* run_metadata);
+
+// Python wrappers for the `Session::ReleaseCallable()` API.
+void TF_DeprecatedSessionReleaseCallable(TF_DeprecatedSession* session,
+                                         int64_t handle, TF_Status* out_status);
+void TF_SessionReleaseCallable(TF_Session* session, int64_t handle,
+                               TF_Status* out_status);
+
 // Set up the graph with the intended feeds and fetches for partial run.
 // *out_handle is owned by the caller.
 //
@@ -111,8 +136,7 @@ string EqualAttrValueWrapper(const string& actual, const string& expected);
 //
 // If shape is unknown, sets unknown_shape to true.
 tensorflow::gtl::InlinedVector<int64_t, 6> TF_GraphGetTensorShapeHelper(
-    TF_Graph* graph, TF_Output output, TF_Status* out_status,
-    bool* unknown_shape);
+    TF_Graph* graph, TF_Output output, TF_Status* status, bool* unknown_shape);
 
 // Runs the graph associated with the session starting with the supplied inputs.
 // On success, `py_outputs` is populated with a numpy ndarray for each output
@@ -124,7 +148,7 @@ void TF_SessionRun_wrapper(TF_Session* session, const TF_Buffer* run_options,
                            const std::vector<PyObject*>& input_ndarrays,
                            const std::vector<TF_Output>& outputs,
                            const std::vector<TF_Operation*>& targets,
-                           TF_Buffer* run_metadata, TF_Status* out_status,
+                           TF_Buffer* run_metadata, TF_Status* status,
                            std::vector<PyObject*>* py_outputs);
 
 // Set up the graph with the intended feeds (inputs) and fetches (output) for
@@ -140,8 +164,7 @@ void TF_SessionPRunSetup_wrapper(TF_Session* session,
                                  const std::vector<TF_Output>& inputs,
                                  const std::vector<TF_Output>& outputs,
                                  const std::vector<TF_Operation*>& targets,
-                                 const char** out_handle,
-                                 TF_Status* out_status);
+                                 const char** out_handle, TF_Status* status);
 
 // Continue to run the graph with additional feeds and fetches. The
 // execution state is uniquely identified by the handle.
@@ -157,7 +180,7 @@ void TF_SessionPRun_wrapper(TF_Session* session, const char* handle,
                             const std::vector<TF_Output>& inputs,
                             const std::vector<PyObject*>& input_ndarrays,
                             const std::vector<TF_Output>& outputs,
-                            TF_Status* out_status,
+                            TF_Status* status,
                             std::vector<PyObject*>* py_outputs);
 
 // Retrieves the inputs of this operation.
@@ -167,6 +190,10 @@ std::vector<TF_Output> GetOperationInputs(TF_Operation* oper);
 std::vector<TF_Operation*> TF_OperationGetControlInputs_wrapper(
     TF_Operation* oper);
 
+// Retrieves the control outputs of this operation.
+std::vector<TF_Operation*> TF_OperationGetControlOutputs_wrapper(
+    TF_Operation* oper);
+
 // Retrieves the op names of the consumers of `oper_out`. The returned strings
 // have the lifetime of the underlying TF_Graph.
 std::vector<const char*> TF_OperationOutputConsumers_wrapper(
@@ -179,7 +206,7 @@ TF_Function* TF_GraphToFunction_wrapper(
     const std::vector<TF_Operation*>* opers,
     const std::vector<TF_Output>& inputs, const std::vector<TF_Output>& outputs,
     const NameVector& output_names, const TF_FunctionOptions* opts,
-    const char* description, TF_Status* out_status);
+    const char* description, TF_Status* status);
 
 // Set the shapes and types for the output's handle.
 //
@@ -202,13 +229,6 @@ void TF_GraphSetTensorShape_wrapper(TF_Graph* graph, TF_Output output,
                                     const std::vector<int64_t>& dims,
                                     bool unknown_shape, TF_Status* status);
 
-// Return the shape of output. `num_dims` should be the output of
-// TF_GraphGetTensorNumDims. If `num_dims = -1`, this should not be called.
-std::vector<int64_t> TF_GraphGetTensorShape_wrapper(TF_Graph* graph,
-                                                    TF_Output output,
-                                                    int num_dims,
-                                                    TF_Status* status);
-
 // Returns the string representations of the missing unused input mappings.
 std::vector<string> TF_ImportGraphDefResultsMissingUnusedInputMappings_wrapper(
     TF_ImportGraphDefResults* results);
diff --git a/tensorflow/python/client/timeline_test.py b/tensorflow/python/client/timeline_test.py
index 5e6b5acdb02e4c8c167485520a8d84ac43db7511..c046e9cfd45d7d7677a1dbab0a7168e526c89bca 100644
--- a/tensorflow/python/client/timeline_test.py
+++ b/tensorflow/python/client/timeline_test.py
@@ -24,6 +24,7 @@ from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.client import session
 from tensorflow.python.client import timeline
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import test_util
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import variables
@@ -155,9 +156,7 @@ class TimelineTest(test.TestCase):
     ctf = step_analysis.chrome_trace.format_to_string()
     self._validateTrace(ctf)
     maximums = step_analysis.allocator_maximums
-    cpuname = 'cpu'
-    if 'mklcpu' in maximums:
-      cpuname = 'mkl' + cpuname
+    cpuname = 'mklcpu' if test_util.IsMklEnabled() else 'cpu'
     self.assertTrue(cpuname in maximums)
     cpu_max = maximums[
         'cuda_host_bfc'] if 'cuda_host_bfc' in maximums else maximums[cpuname]
diff --git a/tensorflow/python/data/BUILD b/tensorflow/python/data/BUILD
index b5bee36dcdfd463056d0e883acb3c701509b1eee..3e08c1587e3e0df70e3cd5be58d24103c4a78339 100644
--- a/tensorflow/python/data/BUILD
+++ b/tensorflow/python/data/BUILD
@@ -15,15 +15,3 @@ py_library(
         "//tensorflow/python/data/ops:readers",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/python/data/kernel_tests/BUILD b/tensorflow/python/data/kernel_tests/BUILD
index 8b8adefa65a5c54d40bc28d8f50953513cfd3605..ed0c11e6c117dcbb810fd3acfc484128ed3519fa 100644
--- a/tensorflow/python/data/kernel_tests/BUILD
+++ b/tensorflow/python/data/kernel_tests/BUILD
@@ -367,15 +367,3 @@ tf_py_test(
         "no_windows",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/python/data/kernel_tests/iterator_ops_test.py b/tensorflow/python/data/kernel_tests/iterator_ops_test.py
index 4a14a915bdb33f1ac6e8fc1839b32bc81fa8de05..0af282a02475384cb2d0f8e273324d6211e1b50d 100644
--- a/tensorflow/python/data/kernel_tests/iterator_ops_test.py
+++ b/tensorflow/python/data/kernel_tests/iterator_ops_test.py
@@ -28,6 +28,7 @@ from tensorflow.python.client import session
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import iterator_ops
 from tensorflow.python.data.ops import readers
+from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
@@ -717,6 +718,14 @@ class IteratorTest(test.TestCase):
       self.assertTrue(
           iterator_ops.GET_NEXT_CALL_WARNING_MESSAGE in str(warning.message))
 
+  def testEagerIteratorAsync(self):
+    with context.eager_mode(), context.execution_mode(context.ASYNC):
+      val = 0
+      dataset = dataset_ops.Dataset.range(10)
+      for foo in dataset:
+        self.assertEqual(val, foo.numpy())
+        val += 1
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/data/ops/BUILD b/tensorflow/python/data/ops/BUILD
index 3119ab003794cb9bc0c748dfeb47597e0877f5fd..fa2e86eab18b0b97ea01a96e309b0ea82d91b267 100644
--- a/tensorflow/python/data/ops/BUILD
+++ b/tensorflow/python/data/ops/BUILD
@@ -59,15 +59,3 @@ py_library(
         "//tensorflow/python/eager:context",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/python/data/ops/dataset_ops.py b/tensorflow/python/data/ops/dataset_ops.py
index c0a6283be433aba80eab2375cbaed6f187e3c4c3..c28de3d05443f40526eb589e155cb58e98acb14a 100644
--- a/tensorflow/python/data/ops/dataset_ops.py
+++ b/tensorflow/python/data/ops/dataset_ops.py
@@ -121,7 +121,7 @@ class Dataset(object):
       An `Iterator` over the elements of this dataset.
 
     Raises:
-      RuntimeError: If eager execution is enabled.
+      RuntimeError: If eager execution is not enabled.
     """
     if context.executing_eagerly():
       return iterator_ops.EagerIterator(self)
@@ -2043,6 +2043,8 @@ class PrefetchDataset(Dataset):
     """See `Dataset.prefetch()` for details."""
     super(PrefetchDataset, self).__init__()
     self._input_dataset = input_dataset
+    if buffer_size is None:
+      buffer_size = -1  # This is the sentinel for auto-tuning.
     self._buffer_size = ops.convert_to_tensor(
         buffer_size, dtype=dtypes.int64, name="buffer_size")
 
diff --git a/tensorflow/python/data/ops/iterator_ops.py b/tensorflow/python/data/ops/iterator_ops.py
index d79b9d6011b6ebd00a47d572165cdbba8a31bd32..0c76afd29d4626be9120c059d60218daab5cc0ac 100644
--- a/tensorflow/python/data/ops/iterator_ops.py
+++ b/tensorflow/python/data/ops/iterator_ops.py
@@ -488,23 +488,27 @@ class EagerIterator(object):
   def _next_internal(self):
     """Returns a nested structure of `tf.Tensor`s containing the next element.
     """
-    with ops.device(self._device):
-      # TODO(ashankar): Consider removing this ops.device() contextmanager
-      # and instead mimic ops placement in graphs: Operations on resource
-      # handles execute on the same device as where the resource is placed.
-      # NOTE(mrry): Here we use the "_sync" variant of `iterator_get_next`
-      # because in eager mode this code will run synchronously on the calling
-      # thread. Therefore we do not need to make a defensive context switch
-      # to a background thread, and can achieve a small constant performance
-      # boost by invoking the iterator synchronously.
-      ret = gen_dataset_ops.iterator_get_next_sync(
-          self._resource,
-          output_types=self._flat_output_types,
-          output_shapes=self._flat_output_shapes)
-
-    return sparse.deserialize_sparse_tensors(
-        nest.pack_sequence_as(self._output_types, ret), self._output_types,
-        self._output_shapes, self._output_classes)
+    # This runs in sync mode as iterators use an error status to communicate
+    # that there is no more data to iterate over.
+    # TODO(b/77291417): Fix
+    with context.execution_mode(context.SYNC):
+      with ops.device(self._device):
+        # TODO(ashankar): Consider removing this ops.device() contextmanager
+        # and instead mimic ops placement in graphs: Operations on resource
+        # handles execute on the same device as where the resource is placed.
+        # NOTE(mrry): Here we use the "_sync" variant of `iterator_get_next`
+        # because in eager mode this code will run synchronously on the calling
+        # thread. Therefore we do not need to make a defensive context switch
+        # to a background thread, and can achieve a small constant performance
+        # boost by invoking the iterator synchronously.
+        ret = gen_dataset_ops.iterator_get_next_sync(
+            self._resource,
+            output_types=self._flat_output_types,
+            output_shapes=self._flat_output_shapes)
+
+      return sparse.deserialize_sparse_tensors(
+          nest.pack_sequence_as(self._output_types, ret), self._output_types,
+          self._output_shapes, self._output_classes)
 
   def next(self):
     """Returns a nested structure of `tf.Tensor`s containing the next element.
diff --git a/tensorflow/python/data/util/BUILD b/tensorflow/python/data/util/BUILD
index b1bdbdab37b63667b475c732df7a47d9e57f2b19..0fc32d51b9fe581a54519139f3bf12118f8f4028 100644
--- a/tensorflow/python/data/util/BUILD
+++ b/tensorflow/python/data/util/BUILD
@@ -109,15 +109,3 @@ py_test(
         "//tensorflow/python:util",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/python/debug/BUILD b/tensorflow/python/debug/BUILD
index 512d292ee2ffa3e61cca0952c0d530c5ec9b3d2a..250b4b1b6ab983c8073b5de3d2d29d02a50c71a8 100644
--- a/tensorflow/python/debug/BUILD
+++ b/tensorflow/python/debug/BUILD
@@ -913,6 +913,7 @@ cuda_py_test(
         "//tensorflow/python:util",
         "//tensorflow/python:variables",
     ],
+    tags = ["no_windows"],  # TODO: needs investigation on Windows
 )
 
 py_test(
@@ -920,6 +921,7 @@ py_test(
     size = "small",
     srcs = ["cli/profile_analyzer_cli_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["no_windows"],
     deps = [
         ":debugger_cli_common",
         ":profile_analyzer_cli",
@@ -1095,15 +1097,3 @@ sh_test(
         ":offline_analyzer",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/python/eager/BUILD b/tensorflow/python/eager/BUILD
index 0e089a26eb88061ece54008a68c51de41b7b362b..8c0d3feeceab1bf29c1dabc668176a6ef7806421 100644
--- a/tensorflow/python/eager/BUILD
+++ b/tensorflow/python/eager/BUILD
@@ -398,21 +398,6 @@ py_test(
     ],
 )
 
-# -----------------------------------------------------------------------------
-# Google-internal targets.
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 py_library(
     name = "imperative_grad",
     srcs = ["imperative_grad.py"],
diff --git a/tensorflow/python/eager/backprop.py b/tensorflow/python/eager/backprop.py
index c54a5a1445df73e16688e776eddd4edf9d026535..92774d4d50e00c85599ceaef1cc99bb062bd3ce3 100644
--- a/tensorflow/python/eager/backprop.py
+++ b/tensorflow/python/eager/backprop.py
@@ -31,7 +31,6 @@ from tensorflow.python.eager import imperative_grad
 from tensorflow.python.eager import tape
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
@@ -50,12 +49,10 @@ def op_attr_type(op_type, attr_name):
   try:
     return _op_attr_type_cache[(op_type, attr_name)]
   except KeyError:
-    with errors.raise_exception_on_not_ok_status() as status:
-      h = context.context()._handle  # pylint: disable=protected-access
-      attr_type = pywrap_tensorflow.TFE_OpNameGetAttrType(
-          h, op_type, attr_name, status)
-    _op_attr_type_cache[(op_type, attr_name)] = attr_type
-    return attr_type
+    h = context.context()._handle  # pylint: disable=protected-access
+    attr_type = pywrap_tensorflow.TFE_OpNameGetAttrType(h, op_type, attr_name)
+  _op_attr_type_cache[(op_type, attr_name)] = attr_type
+  return attr_type
 
 
 def make_attr(attr_type, value):
@@ -646,6 +643,13 @@ _default_vspace = imperative_grad.VSpace(
     ones=_ones)
 
 
+def _handle_or_self(x):
+  """If x is ResourceVariable, return its handle, else x."""
+  if isinstance(x, resource_variable_ops.ResourceVariable):
+    x = x.handle
+  return x
+
+
 @tf_export("GradientTape")
 class GradientTape(object):
   """Record operations for automatic differentiation.
@@ -723,9 +727,7 @@ class GradientTape(object):
       tensor: a Tensor or list of Tensors.
     """
     for t in nest.flatten(tensor):
-      if isinstance(t, resource_variable_ops.ResourceVariable):
-        t = t.handle
-      tape.watch(t)
+      tape.watch(_handle_or_self(t))
 
   def watched_variables(self):
     # Sorting variables by id, which is monotonically increasing in construction
@@ -739,14 +741,15 @@ class GradientTape(object):
 
     Args:
       target: Tensor to be differentiated.
-      sources: a list of Tensors or Variables. `target` will be differentiated
-        against elements in `sources`.
+      sources: a list or nested structure of Tensors or Variables. `target`
+        will be differentiated against elements in `sources`.
       output_gradients: a list of gradients, one for each element of
         target. Defaults to None.
 
     Returns:
-      a list of Tensors (or IndexedSlices, or None), one for each element in
-      `sources`.
+      a list or nested structure of Tensors (or IndexedSlices, or None),
+      one for each element in `sources`. Returned structure is the same as
+      the structure of `sources`.
 
     Raises:
       RuntimeError: if called inside the context of the tape, or if called more
@@ -756,12 +759,15 @@ class GradientTape(object):
       raise RuntimeError("GradientTape.gradient can only be called once "
                          "on non-persistent tapes, and "
                          "only when the context manager has exited.")
-    sources = [x.handle if isinstance(x, resource_variable_ops.ResourceVariable)
-               else x
-               for x in sources]
-    grad = imperative_grad.imperative_grad(
-        _default_vspace, self._tape, [target], sources,
+    flat_sources = nest.flatten(sources)
+    flat_sources = [_handle_or_self(x) for x in flat_sources]
+
+    flat_grad = imperative_grad.imperative_grad(
+        _default_vspace, self._tape, [target], flat_sources,
         output_gradients=output_gradients)
+
     if not self._persistent:
       self._tape = None
+
+    grad = nest.pack_sequence_as(sources, flat_grad)
     return grad
diff --git a/tensorflow/python/eager/backprop_test.py b/tensorflow/python/eager/backprop_test.py
index f04d89a6d976d1c1f71b385322032e74d42949b5..991b4dbe7a688c8f6dc6420b6d6b7f7158d6bf86 100644
--- a/tensorflow/python/eager/backprop_test.py
+++ b/tensorflow/python/eager/backprop_test.py
@@ -369,6 +369,53 @@ class BackpropTest(test.TestCase):
 
     self.assertEqual(backprop.implicit_grad(f)()[0][0], None)
 
+  @test_util.assert_no_new_tensors
+  @test_util.run_in_graph_and_eager_modes()
+  def testGradientTapeRepeatedSource(self):
+    with backprop.GradientTape(persistent=False) as g:
+      x = constant_op.constant(3.0)
+      g.watch(x)
+      y = 2 * x
+    grad = g.gradient(target=y, sources=[x, x])
+    self.assertEqual(self.evaluate(grad), [2.0, 2.0])
+
+  @test_util.assert_no_new_tensors
+  @test_util.run_in_graph_and_eager_modes()
+  def testPersistentGradientTapeRepeatedSource(self):
+    with backprop.GradientTape(persistent=True) as g:
+      x = constant_op.constant(3.0)
+      y = constant_op.constant(5.0)
+      g.watch(x)
+      g.watch(y)
+      z = x * x + x * y
+    grad = g.gradient(target=z, sources=[x, x])
+    self.assertEqual(self.evaluate(grad), [11.0, 11.0])
+    grad = g.gradient(target=z, sources=[y, x])
+    self.assertEqual(self.evaluate(grad), [3.0, 11.0])
+
+  @test_util.assert_no_new_tensors
+  @test_util.run_in_graph_and_eager_modes()
+  def testGradientTapeStructure(self):
+    with backprop.GradientTape(persistent=True) as g:
+      # Using different constant values because constant tensors are
+      # cached, leading to a different gradient then what one might expect.
+      x1 = constant_op.constant(3.0)
+      x2 = constant_op.constant(3.1)
+      x3 = constant_op.constant(3.2)
+      g.watch(x1)
+      g.watch(x2)
+      g.watch(x3)
+      y = x1  + 2 * x2  + 3 * x3
+    self.assertEqual(self.evaluate(g.gradient(y, x1)), [1.0])
+    self.assertEqual(self.evaluate(g.gradient(y, (x1,))), (1.0,))
+    self.assertEqual(self.evaluate(g.gradient(y, (x1, x2))), (1.0, 2.0))
+    self.assertEqual(self.evaluate(g.gradient(y, [(x1, x2), (x2, x3)])),
+                     [(1.0, 2.0), (2.0, 3.0)])
+    self.assertEqual(self.evaluate(g.gradient(y, (x1, x2, [x1, x3]))),
+                     (1.0, 2.0, [1.0, 3.0]))
+    self.assertEqual(self.evaluate(g.gradient(y, [x1, {'x2': x2, 'x3': x3}])),
+                     [1.0, {'x2': 2.0, 'x3': 3.0}])
+
   @test_util.assert_no_new_tensors
   @test_util.run_in_graph_and_eager_modes()
   def testGradientTape(self):
diff --git a/tensorflow/python/eager/benchmarks_test.py b/tensorflow/python/eager/benchmarks_test.py
index 9ca5041c38ed07b39fd73b9f110ab06e8e903251..3aad4a114a710280b5046666256b6b43dc0d5523 100644
--- a/tensorflow/python/eager/benchmarks_test.py
+++ b/tensorflow/python/eager/benchmarks_test.py
@@ -201,6 +201,9 @@ class MicroBenchmarks(test.Benchmark):
     m = self._m_2
     self._run(lambda: gen_array_ops.identity(m), 30000)
 
+  def benchmark_slowpath_tf_identity(self):
+    self._run(lambda: gen_array_ops.identity(1), 30000)
+
   def benchmark_tfe_py_execute_identity(self):
     m = self._m_2
     ctx_handle = context.context()._handle
@@ -214,10 +217,11 @@ class MicroBenchmarks(test.Benchmark):
     self._run(f, 30000)
 
   def benchmark_tf_gradient_function_identity(self):
-    m = self._m_2
-    self._run(
-        lambda: backprop.gradients_function(gen_array_ops.identity, [0])(m),
-        30000)
+    with context.device(CPU):
+      m = gen_array_ops.identity(self._m_2)
+      self._run(
+          lambda: backprop.gradients_function(gen_array_ops.identity, [0])(m),
+          30000)
 
   def benchmark_tf_gradient_forward_identity(self):
     with backprop.GradientTape() as tape:
@@ -233,10 +237,11 @@ class MicroBenchmarks(test.Benchmark):
     self._run(f, 30000)
 
   def benchmark_tf_gradient_function_no_op(self):
-    m = self._m_2
-    self._run(
-        lambda: backprop.gradients_function(lambda x: x, [0])(m),
-        30000)
+    with context.device(CPU):
+      m = gen_array_ops.identity(self._m_2)
+      self._run(
+          lambda: backprop.gradients_function(lambda x: x, [0])(m),
+          30000)
 
   def _benchmark_np_matmul(self, m, transpose_b, num_iters):
     a = m.cpu().numpy()
@@ -268,11 +273,12 @@ class MicroBenchmarks(test.Benchmark):
     # pylint: disable=protected-access
     ctx_handle = context.context()._handle
     # pylint: enable=protected-access
+    device = context.context().device_name
     attrs = ("transpose_a", False, "transpose_b", transpose_b, "T",
              m.dtype.as_datatype_enum)
     def func():
-      pywrap_tensorflow.TFE_Py_Execute(ctx_handle, None, "MatMul", inputs,
-                                       attrs, 1)
+      pywrap_tensorflow.TFE_Py_Execute(ctx_handle, device, "MatMul",
+                                       inputs, attrs, 1)
 
     self._run(func, num_iters)
 
diff --git a/tensorflow/python/eager/context.py b/tensorflow/python/eager/context.py
index 6c9a14730c0db4bdf23fc10b23d63b758349bdc1..9e146f021e813886b42ca72b07122b485901a24b 100644
--- a/tensorflow/python/eager/context.py
+++ b/tensorflow/python/eager/context.py
@@ -28,7 +28,6 @@ from tensorflow.core.protobuf import config_pb2
 from tensorflow.python import pywrap_tensorflow
 from tensorflow.python.framework import c_api_util
 from tensorflow.python.framework import device as pydev
-from tensorflow.python.framework import errors
 from tensorflow.python.util import compat
 from tensorflow.python.util import is_in_graph_mode
 from tensorflow.python.util import tf_contextlib
@@ -86,6 +85,7 @@ class _EagerContext(threading.local):
     self.device_spec = pydev.DeviceSpec.from_string("")
     self.device_name = self.device_spec.to_string()
     self.mode = _default_mode
+    self.is_eager = _default_mode == EAGER_MODE
     self.scope_name = ""
     self.recording_summaries = False
     self.summary_writer_resource = None
@@ -223,34 +223,27 @@ class Context(object):
       assert self._context_devices is None
       opts = pywrap_tensorflow.TFE_NewContextOptions()
       try:
-        with errors.raise_exception_on_not_ok_status() as status:
-          if self._config is not None:
-            config_str = self._config.SerializeToString()
-            pywrap_tensorflow.TFE_ContextOptionsSetConfig(
-                opts, config_str, len(config_str), status)
-          if self._device_policy is not None:
-            pywrap_tensorflow.TFE_ContextOptionsSetDevicePlacementPolicy(
-                opts, self._device_policy)
-          if self._execution_mode == ASYNC:
-            pywrap_tensorflow.TFE_ContextOptionsSetAsync(True)
-          self._context_handle = pywrap_tensorflow.TFE_NewContext(opts, status)
+        if self._config is not None:
+          config_str = self._config.SerializeToString()
+          pywrap_tensorflow.TFE_ContextOptionsSetConfig(opts, config_str)
+        if self._device_policy is not None:
+          pywrap_tensorflow.TFE_ContextOptionsSetDevicePlacementPolicy(
+              opts, self._device_policy)
+        if self._execution_mode == ASYNC:
+          pywrap_tensorflow.TFE_ContextOptionsSetAsync(opts, True)
+        self._context_handle = pywrap_tensorflow.TFE_NewContext(opts)
       finally:
         pywrap_tensorflow.TFE_DeleteContextOptions(opts)
       # Store list of devices
       self._context_devices = []
-      with errors.raise_exception_on_not_ok_status() as status:
-        device_list = pywrap_tensorflow.TFE_ContextListDevices(
-            self._context_handle, status)
+      device_list = pywrap_tensorflow.TFE_ContextListDevices(
+          self._context_handle)
       try:
         self._num_gpus = 0
         for i in range(pywrap_tensorflow.TF_DeviceListCount(device_list)):
-          with errors.raise_exception_on_not_ok_status() as status:
-            dev_name = pywrap_tensorflow.TF_DeviceListName(
-                device_list, i, status)
+          dev_name = pywrap_tensorflow.TF_DeviceListName(device_list, i)
           self._context_devices.append(pydev.canonical_name(dev_name))
-          with errors.raise_exception_on_not_ok_status() as status:
-            dev_type = pywrap_tensorflow.TF_DeviceListType(
-                device_list, i, status)
+          dev_type = pywrap_tensorflow.TF_DeviceListType(device_list, i)
           if dev_type == "GPU":
             self._num_gpus += 1
 
@@ -287,9 +280,12 @@ class Context(object):
 
   @tf_contextlib.contextmanager
   def _mode(self, mode):
+    """A context manager to allow setting the mode to EAGER/GRAPH."""
     ctx = self._eager_context
     old_mode = ctx.mode
+    old_is_eager = ctx.is_eager
     ctx.mode = mode
+    ctx.is_eager = mode == EAGER_MODE
     if mode == EAGER_MODE:
       # Entering graph mode does not provide us with sufficient information to
       # record a context switch; graph-based context switches are only logged
@@ -298,13 +294,14 @@ class Context(object):
     try:
       yield
     finally:
+      ctx.is_eager = old_is_eager
       ctx.mode = old_mode
       if mode == EAGER_MODE:
         self.context_switches.pop()
 
   def executing_eagerly(self):
     """Returns True if current thread has eager executing enabled."""
-    return self._eager_context.mode == EAGER_MODE
+    return self._eager_context.is_eager
 
   def scalar_cache(self):
     """Per-device cache for scalars."""
@@ -411,9 +408,7 @@ class Context(object):
     if mode is None:
       mode = SYNC
     self._eager_context.execution_mode = mode
-    with errors.raise_exception_on_not_ok_status() as status:
-      pywrap_tensorflow.TFE_ContextSetAsyncForThread(self._handle,
-                                                     mode == ASYNC, status)
+    pywrap_tensorflow.TFE_ContextSetAsyncForThread(self._handle, mode == ASYNC)
 
   @tf_contextlib.contextmanager
   def execution_mode(self, mode):
@@ -427,8 +422,7 @@ class Context(object):
 
   def async_wait(self):
     """Waits for ops dispatched in ASYNC mode to finish."""
-    with errors.raise_exception_on_not_ok_status() as status:
-      pywrap_tensorflow.TFE_ContextAsyncWait(self._handle, status)
+    pywrap_tensorflow.TFE_ContextAsyncWait(self._handle)
 
   def async_clear_error(self):
     """Clears errors raised during ASYNC execution."""
@@ -448,11 +442,9 @@ class Context(object):
     Args:
       fn: A wrapped TF_Function (returned from TF_GraphToFunction_wrapper).
     """
-    with errors.raise_exception_on_not_ok_status() as status:
-      pywrap_tensorflow.TFE_ContextAddFunction(
-          self._handle,  # pylint: disable=protected-access
-          fn,
-          status)
+    pywrap_tensorflow.TFE_ContextAddFunction(
+        self._handle,  # pylint: disable=protected-access
+        fn)
 
   def add_function_def(self, fdef):
     """Add a function definition to the context.
@@ -464,12 +456,10 @@ class Context(object):
       fdef: A FunctionDef protocol buffer message.
     """
     fdef_string = fdef.SerializeToString()
-    with errors.raise_exception_on_not_ok_status() as status:
-      pywrap_tensorflow.TFE_ContextAddFunctionDef(
-          self._handle,  # pylint: disable=protected-access
-          fdef_string,
-          len(fdef_string),
-          status)
+    pywrap_tensorflow.TFE_ContextAddFunctionDef(
+        self._handle,  # pylint: disable=protected-access
+        fdef_string,
+        len(fdef_string))
 
   def add_post_execution_callback(self, callback):
     """Add a post-execution callback to the context.
@@ -512,23 +502,19 @@ class Context(object):
     To retrieve the accumulated metadata call context.export_run_metadata()
     and to stop tracing call context.disable_run_metadata().
     """
-    if not self._context_handle:
-      self._initialize_handle_and_devices()
-    pywrap_tensorflow.TFE_ContextEnableRunMetadata(self._context_handle)
+    pywrap_tensorflow.TFE_ContextEnableRunMetadata(self._handle)
 
   @tf_contextlib.contextmanager
   def device_policy(self, policy):
-    if not self._context_handle:
-      self._initialize_handle_and_devices()
-    old = pywrap_tensorflow.TFE_ContextGetDevicePlacementPolicy(
-        self._context_handle)
+    handle = self._handle
+    old = pywrap_tensorflow.TFE_ContextGetDevicePlacementPolicy(handle)
     pywrap_tensorflow.TFE_ContextSetThreadLocalDevicePlacementPolicy(
-        self._handle, policy)
+        handle, policy)
     try:
       yield
     finally:
       pywrap_tensorflow.TFE_ContextSetThreadLocalDevicePlacementPolicy(
-          self._handle, old)
+          handle, old)
 
   def disable_run_metadata(self):
     """Disables tracing of op execution via RunMetadata."""
@@ -548,9 +534,8 @@ class Context(object):
     if not self._context_handle:
       return None
     with c_api_util.tf_buffer() as buffer_:
-      with errors.raise_exception_on_not_ok_status() as status:
-        pywrap_tensorflow.TFE_ContextExportRunMetadata(
-            self._context_handle, buffer_, status)
+      pywrap_tensorflow.TFE_ContextExportRunMetadata(
+          self._context_handle, buffer_)
       proto_data = pywrap_tensorflow.TF_GetBuffer(buffer_)
     run_metadata = config_pb2.RunMetadata()
     run_metadata.ParseFromString(compat.as_bytes(proto_data))
@@ -579,6 +564,10 @@ def context():
   return _context
 
 
+def context_safe():
+  return _context
+
+
 # TODO(agarwal): remove this.
 def get_default_context():
   """Same as context."""
diff --git a/tensorflow/python/eager/core_test.py b/tensorflow/python/eager/core_test.py
index 6ebf5b24819d48ba4a17d6059510eee7affe40ea..3fabe7060e980423268eb6f52ab4043cc4a4847c 100644
--- a/tensorflow/python/eager/core_test.py
+++ b/tensorflow/python/eager/core_test.py
@@ -97,6 +97,14 @@ class TFETest(test_util.TensorFlowTestCase):
     self.assertTrue(has_cpu_device)
     del ctx
 
+  def testAsyncBasic(self):
+    ctx = context.Context(execution_mode=context.ASYNC)
+    has_cpu_device = False
+    for x in ctx.devices():
+      has_cpu_device = has_cpu_device or 'CPU' in x
+    self.assertTrue(has_cpu_device)
+    del ctx
+
   def testRunMetadata(self):
     context.enable_run_metadata()
     t = constant_op.constant(1.0)
@@ -108,8 +116,7 @@ class TFETest(test_util.TensorFlowTestCase):
     cpu_stats = step_stats.dev_stats[0]
     self.assertEqual('/job:localhost/replica:0/task:0/device:CPU:0',
                      cpu_stats.device)
-    self.assertEqual(len(cpu_stats.node_stats), 1)
-    self.assertEqual(cpu_stats.node_stats[0].node_name, 'Add')
+    self.assertGreaterEqual(len(cpu_stats.node_stats), 1)
 
   def testShouldCopy(self):
     if not context.context().num_gpus():
@@ -650,10 +657,11 @@ class SendRecvTest(test_util.TensorFlowTestCase):
     with ops.device('GPU:0'):
       t0 = constant_op.constant(1.0)
       self._send(t0, 't0', self.cpu_device)
-    self.assertAllEqual(
-        self._recv(dtypes.float32, 't0', gpu_device_name),
-        1.0)
-    self._send(constant_op.constant(2.0), 't1', gpu_device_name)
+    with ops.device('cpu:0'):
+      self.assertAllEqual(
+          self._recv(dtypes.float32, 't0', gpu_device_name),
+          1.0)
+      self._send(constant_op.constant(2.0), 't1', gpu_device_name)
     with ops.device('GPU:0'):
       self.assertAllEqual(
           self._recv(dtypes.float32, 't1', self.cpu_device),
diff --git a/tensorflow/python/eager/execution_callbacks.py b/tensorflow/python/eager/execution_callbacks.py
index 535361498a9dd33003d0479051e97d7ff2553067..9a082596535f51e5a4fb6cc2a11a4dd8a422ed44 100644
--- a/tensorflow/python/eager/execution_callbacks.py
+++ b/tensorflow/python/eager/execution_callbacks.py
@@ -253,7 +253,7 @@ def add_execution_callback(callback):
       `f(op_type, op_name, attrs, inputs, outputs)`.
       `op_type` is the type of the operation that was just executed (e.g.,
         `MatMul`).
-      `op_name` is the name of the operation that has was just executed. This
+      `op_name` is the name of the operation that was just executed. This
         name is set by the client who created the operation and can be `None` if
         it is unset.
       `attrs` contains the attributes of the operation as a `tuple` of
diff --git a/tensorflow/python/eager/function.py b/tensorflow/python/eager/function.py
index 343012e552592a6f8bb1255118add3e938aa443c..5168ad3b18f623588b7804f597fa3b816de147f3 100644
--- a/tensorflow/python/eager/function.py
+++ b/tensorflow/python/eager/function.py
@@ -34,7 +34,6 @@ from tensorflow.python.eager.graph_only_ops import graph_placeholder
 from tensorflow.python.framework import c_api_util
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes as dtypes_module
-from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
@@ -79,14 +78,10 @@ def capture_value(tensor_map, value, dtype, name):
         ranks = [len(s.dim) if not s.unknown_rank else -1 for s in shapes]
         shapes = [[d.size for d in s.dim]
                   if not s.unknown_rank else None for s in shapes]
-        with errors.raise_exception_on_not_ok_status() as status:
-          pywrap_tensorflow.TF_GraphSetOutputHandleShapesAndTypes_wrapper(
-              captured_value._op._graph._c_graph,  # pylint: disable=protected-access
-              captured_value._as_tf_output(),  # pylint: disable=protected-access
-              shapes,
-              ranks,
-              types,
-              status)
+        pywrap_tensorflow.TF_GraphSetOutputHandleShapesAndTypes_wrapper(
+            captured_value._op._graph._c_graph,  # pylint: disable=protected-access
+            captured_value._as_tf_output(),  # pylint: disable=protected-access
+            shapes, ranks, types)
 
     tensor_map[ops.tensor_id(value)] = (value, captured_value)
   else:
@@ -228,6 +223,16 @@ class HelperContext(object):
     else:
       return val
 
+  def EnterGradientColocation(self, op, gradient_uid):
+    """Start building a gradient colocated with an op."""
+    if self._outer_context:
+      self._outer_context.EnterGradientColocation(op, gradient_uid)
+
+  def ExitGradientColocation(self, op, gradient_uid):
+    """Start building a gradient colocated with an op."""
+    if self._outer_context:
+      self._outer_context.ExitGradientColocation(op, gradient_uid)
+
   def __enter__(self):
     # pylint: disable=protected-access
     self._g = ops.get_default_graph()
@@ -275,23 +280,20 @@ class _EagerDefinedFunction(object):
       inputs: the tensors in the graph to be used as inputs to the function
       outputs: the tensors in the graph which will be outputs to the function
     """
-    with errors.raise_exception_on_not_ok_status() as status:
-      fn = pywrap_tensorflow.TF_GraphToFunction_wrapper(
-          graph._c_graph,  # pylint: disable=protected-access
-          compat.as_str(name),
-          False,
-          [o._c_op for o in operations],  # pylint: disable=protected-access
-          [t._as_tf_output() for t in inputs],  # pylint: disable=protected-access
-          [t._as_tf_output() for t in outputs],  # pylint: disable=protected-access
-          [],
-          None,
-          compat.as_str(""),
-          status)
+    fn = pywrap_tensorflow.TF_GraphToFunction_wrapper(
+        graph._c_graph,  # pylint: disable=protected-access
+        compat.as_str(name),
+        False,
+        [o._c_op for o in operations],  # pylint: disable=protected-access
+        [t._as_tf_output() for t in inputs],  # pylint: disable=protected-access
+        [t._as_tf_output() for t in outputs],  # pylint: disable=protected-access
+        [],
+        None,
+        compat.as_str(""))
     # TODO(apassos) avoid creating a FunctionDef (specially to grab the
     # signature, but also in general it's nice not to depend on it.
     with c_api_util.tf_buffer() as buffer_:
-      with errors.raise_exception_on_not_ok_status() as status:
-        pywrap_tensorflow.TF_FunctionToFunctionDef(fn, buffer_, status)
+      pywrap_tensorflow.TF_FunctionToFunctionDef(fn, buffer_)
       proto_data = pywrap_tensorflow.TF_GetBuffer(buffer_)
     function_def = function_pb2.FunctionDef()
     function_def.ParseFromString(compat.as_bytes(proto_data))
@@ -302,7 +304,7 @@ class _EagerDefinedFunction(object):
     self.signature = function_def.signature
     self.grad_func_name = None
     self.python_grad_func = None
-    self._c_func = fn
+    self._c_func = c_api_util.ScopedTFFunction(fn)
     self._grad_func = None
 
 
@@ -669,7 +671,7 @@ def _defun_internal(name, func, args, kwds):
   if context.executing_eagerly():
     for f in tmp_graph._functions.values():  # pylint: disable=protected-access
       # TODO(ashankar): What about the gradient registry?
-      _register(f._c_func)  # pylint: disable=protected-access
+      _register(f._c_func.func)  # pylint: disable=protected-access
   return GraphModeFunction(
       fname, all_inputs, extra_inputs, tmp_graph, operations, func_def_outputs,
       func_outputs, output_shapes, variables)
diff --git a/tensorflow/python/eager/function_test.py b/tensorflow/python/eager/function_test.py
index fd1d2c25ffe50cb7afcae29b3d0b15635b6a57dd..9af197981bde309160781fa5821152962e5383bb 100644
--- a/tensorflow/python/eager/function_test.py
+++ b/tensorflow/python/eager/function_test.py
@@ -26,7 +26,6 @@ from tensorflow.python.eager import tape
 from tensorflow.python.eager import test
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
 from tensorflow.python.framework import function as tf_function
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
@@ -377,23 +376,23 @@ class FunctionTest(test.TestCase):
     self.assertAllEqual(f(constant_op.constant(1.0)), 2.0)
 
   def testGradientOfGatherWithDefun(self):
+    with ops.device('cpu:0'):
+      v = resource_variable_ops.ResourceVariable([0.0, 1.0, 2.0])
 
-    v = resource_variable_ops.ResourceVariable([0.0, 1.0, 2.0])
+      def sum_gather():
+        return math_ops.reduce_sum(array_ops.gather(v, [1, 2]))
 
-    def sum_gather():
-      return math_ops.reduce_sum(array_ops.gather(v, [1, 2]))
+      grad_fn = backprop.implicit_grad(sum_gather)
+      gradient = grad_fn()
+      defun_grad_fn = backprop.implicit_grad(function.defun(sum_gather))
+      defun_gradient = defun_grad_fn()
+      self.assertEqual(len(gradient), len(defun_gradient))
 
-    grad_fn = backprop.implicit_grad(sum_gather)
-    gradient = grad_fn()
-    defun_grad_fn = backprop.implicit_grad(function.defun(sum_gather))
-    defun_gradient = defun_grad_fn()
-    self.assertEqual(len(gradient), len(defun_gradient))
-
-    gradient = gradient[0][0]
-    defun_gradient = defun_gradient[0][0]
-    self.assertAllEqual(gradient.values, defun_gradient.values)
-    self.assertAllEqual(gradient.indices, defun_gradient.indices)
-    self.assertAllEqual(gradient.dense_shape, defun_gradient.dense_shape)
+      gradient = gradient[0][0]
+      defun_gradient = defun_gradient[0][0]
+      self.assertAllEqual(gradient.values, defun_gradient.values)
+      self.assertAllEqual(gradient.indices, defun_gradient.indices)
+      self.assertAllEqual(gradient.dense_shape, defun_gradient.dense_shape)
 
   def testReturningIndexedSlicesWithDefun(self):
 
@@ -476,9 +475,7 @@ class FunctionTest(test.TestCase):
     reshape = function.defun(array_ops.reshape)
     value = constant_op.constant([1., 2.])
     shape = constant_op.constant([2, 1]).gpu()
-    with self.assertRaises(errors.InvalidArgumentError):
-      with ops.device('gpu:0'):
-        reshape(value, shape)
+    reshape(value, shape)  # No error is raised
 
   def testDifferentiableFunctionNoneOutputs(self):
 
diff --git a/tensorflow/python/eager/graph_callable.py b/tensorflow/python/eager/graph_callable.py
index ee5d87f0835a8e70e0ce14537a51ea5418db41b9..d40ea982c746593d0eb91cd58f5db3f10b7af687 100644
--- a/tensorflow/python/eager/graph_callable.py
+++ b/tensorflow/python/eager/graph_callable.py
@@ -325,7 +325,7 @@ def _graph_callable_internal(func, shape_and_dtypes):
   # Also, what about the gradient registry of these functions? Those need to be
   # addressed as well.
   for f in tmp_graph._functions.values():  # pylint: disable=protected-access
-    function._register(f._c_func)  # pylint: disable=protected-access
+    function._register(f._c_func.func)  # pylint: disable=protected-access
   initializer_function = function.GraphModeFunction(
       initialization_name,
       placeholder_inputs,
diff --git a/tensorflow/python/eager/imperative_grad.py b/tensorflow/python/eager/imperative_grad.py
index 837cad974ac6555ef2b13d1a1a5e0e5f5166b01d..000152855d1a90f32936cca40c10f00c2df863a5 100644
--- a/tensorflow/python/eager/imperative_grad.py
+++ b/tensorflow/python/eager/imperative_grad.py
@@ -21,7 +21,6 @@ from __future__ import print_function
 import collections
 
 from tensorflow.python import pywrap_tensorflow
-from tensorflow.python.framework import errors
 
 
 VSpace = collections.namedtuple(
@@ -60,6 +59,5 @@ def imperative_grad(
      or if only non-differentiable functions of the source were used in the
      computation of target.
   """
-  with errors.raise_exception_on_not_ok_status() as status:
-    return pywrap_tensorflow.TFE_Py_TapeGradient(
-        tape._tape, vspace, target, sources, output_gradients, status)  # pylint: disable=protected-access
+  return pywrap_tensorflow.TFE_Py_TapeGradient(
+      tape._tape, vspace, target, sources, output_gradients)  # pylint: disable=protected-access
diff --git a/tensorflow/python/eager/python_eager_op_gen.cc b/tensorflow/python/eager/python_eager_op_gen.cc
index c2ce8efd7f70c6ba93b6d444f88ddbb9aa51ccdb..9afab0077b666b36d77abea5a7d8c444b6400812 100644
--- a/tensorflow/python/eager/python_eager_op_gen.cc
+++ b/tensorflow/python/eager/python_eager_op_gen.cc
@@ -117,7 +117,7 @@ class GenEagerPythonOp : public python_op_gen_internal::GenPythonOp {
                    const string& function_name)
       : python_op_gen_internal::GenPythonOp(op_def, api_def, function_name) {
     op_name_ = function_name_;
-    op_name_.Consume("_");
+    str_util::ConsumePrefix(&op_name_, "_");
   }
   ~GenEagerPythonOp() override {}
 
@@ -366,8 +366,8 @@ string GenEagerPythonOp::Code() {
 void GenEagerPythonOp::HandleGraphMode(const string& function_setup) {
   // Handle graph-mode case
   strings::StrAppend(&result_,
-                     "  _ctx = _context.context()\n"
-                     "  if not _ctx.executing_eagerly():\n",
+                     "  _ctx = _context._context\n"
+                     "  if _ctx is None or not _ctx._eager_context.is_eager:\n",
                      function_setup,
                      "    _, _, _op = _op_def_lib._apply_op_helper(\n");
   AddBodyNoReturn("        ");
@@ -492,7 +492,7 @@ bool GenEagerPythonOp::GetEagerFunctionSetup(const string& indentation,
       strings::StrAppend(function_setup, indentation, "  ", attr_api_name,
                          " = ", default_value, "\n");
     }
-    if (attr_type.starts_with("list(")) {
+    if (str_util::StartsWith(attr_type, "list(")) {
       ExpectListArg(indentation, attr_api_name, function_setup);
     }
 
@@ -683,13 +683,14 @@ bool GenEagerPythonOp::AddEagerFallbackCode(
     return true;
   }
 
-  AddDefLine(strings::StrCat(function_name_, kEagerFallbackSuffix), parameters);
+  AddDefLine(strings::StrCat(function_name_, kEagerFallbackSuffix),
+             strings::StrCat(parameters, ", ctx=None"));
   strings::StrAppend(
       &result_, "  r\"\"\"This is the slowpath function for Eager mode.\n");
   strings::StrAppend(&result_, "  This is for function ", function_name_,
                      "\n  \"\"\"\n");
 
-  strings::StrAppend(&result_, "  _ctx = _context.context()\n");
+  strings::StrAppend(&result_, "  _ctx = ctx if ctx else _context.context()\n");
 
   string function_setup;
   if (!GetEagerFunctionSetup("  ", &function_setup)) {
@@ -712,9 +713,9 @@ bool GenEagerPythonOp::AddEagerFallbackCode(
 }
 
 void GenEagerPythonOp::AddEagerFastPathExecute() {
-  string fastpath_execute_params =
-      strings::StrCat("_ctx._handle, _ctx.device_name, \"", op_def_.name(),
-                      "\", ", "name, _ctx._post_execution_callbacks");
+  string fastpath_execute_params = strings::StrCat(
+      "_ctx._context_handle, _ctx._eager_context.device_name, \"",
+      op_def_.name(), "\", ", "name, _ctx._post_execution_callbacks");
   string fallback_params;
 
   for (int i = 0; i < api_def_.in_arg_size(); i++) {
@@ -755,6 +756,8 @@ void GenEagerPythonOp::AddEagerFastPathExecute() {
   strings::StrAppend(&result_, "      ", "return _result\n");
 
   // Handle fallback.
+  if (!fallback_params.empty()) strings::StrAppend(&fallback_params, ", ");
+  strings::StrAppend(&fallback_params, "ctx=_ctx");
   strings::StrAppend(&result_, "    ", "except _core._FallbackException:\n");
   strings::StrAppend(
       &result_, "      ", "return ", function_name_, kEagerFallbackSuffix,
diff --git a/tensorflow/python/eager/pywrap_tfe_src.cc b/tensorflow/python/eager/pywrap_tfe_src.cc
index 55ba509065ba44ccafbd209201a250205553e261..d99bd0b0ffe5ef8042b3b869f82a9952f1985abc 100644
--- a/tensorflow/python/eager/pywrap_tfe_src.cc
+++ b/tensorflow/python/eager/pywrap_tfe_src.cc
@@ -1372,11 +1372,15 @@ PyObject* TFE_Py_TapeGradient(PyObject* tape, PyObject* vspace,
   }
   if (!result.empty()) {
     PyObject* py_result = PyList_New(result.size());
+    tensorflow::gtl::FlatSet<PyObject*> seen_results(result.size());
     for (int i = 0; i < result.size(); ++i) {
       if (result[i] == nullptr) {
         Py_INCREF(Py_None);
         result[i] = Py_None;
+      } else if (seen_results.find(result[i]) != seen_results.end()) {
+        Py_INCREF(result[i]);
       }
+      seen_results.insert(result[i]);
       PyList_SET_ITEM(py_result, i, reinterpret_cast<PyObject*>(result[i]));
     }
     return py_result;
@@ -1405,16 +1409,33 @@ bool CheckInputsOk(PyObject* seq, int start_index,
     PyObject* item = PyTuple_GET_ITEM(seq, i + start_index);
     if (!op_def.input_arg(i).number_attr().empty() ||
         !op_def.input_arg(i).type_list_attr().empty()) {
-      // This item should be a list input.
-      if (!PyList_Check(item)) return false;
-      for (Py_ssize_t j = 0; j < PyList_Size(item); j++) {
-        PyObject* inner_item = PyList_GET_ITEM(item, j);
+      // This item should be a seq input.
+      if (!PySequence_Check(item)) {
+        VLOG(1) << "Falling back to slow path for Op \"" << op_def.name()
+                << "\", Input \"" << op_def.input_arg(i).name()
+                << "\" since we expected a sequence, but got "
+                << item->ob_type->tp_name;
+        return false;
+      }
+      for (Py_ssize_t j = 0; j < PySequence_Fast_GET_SIZE(item); j++) {
+        PyObject* inner_item = PySequence_Fast_GET_ITEM(item, j);
         if (!EagerTensor_CheckExact(inner_item) &&
             !CheckResourceVariable(inner_item)) {
+          VLOG(1)
+              << "Falling back to slow path for Op \"" << op_def.name()
+              << "\", Input \"" << op_def.input_arg(i).name() << "\", Index "
+              << j
+              << " since we expected an EagerTensor/ResourceVariable, but got "
+              << inner_item->ob_type->tp_name;
           return false;
         }
       }
     } else if (!EagerTensor_CheckExact(item) && !CheckResourceVariable(item)) {
+      VLOG(1)
+          << "Falling back to slow path for Op \"" << op_def.name()
+          << "\", Input \"" << op_def.input_arg(i).name()
+          << "\" since we expected an EagerTensor/ResourceVariable, but got "
+          << item->ob_type->tp_name;
       return false;
     }
   }
@@ -1726,11 +1747,11 @@ const char* GetDeviceName(PyObject* py_device_name) {
   return nullptr;
 }
 
-bool RaiseIfNotPyList(PyObject* list, const string& attr_name) {
-  if (!PyList_Check(list)) {
+bool RaiseIfNotPySequence(PyObject* seq, const string& attr_name) {
+  if (!PySequence_Check(seq)) {
     PyErr_SetString(PyExc_TypeError,
-                    Printf("expected a list for attr %s, got %s instead",
-                           attr_name.data(), list->ob_type->tp_name)
+                    Printf("expected a sequence for attr %s, got %s instead",
+                           attr_name.data(), seq->ob_type->tp_name)
                         .data());
 
     return false;
@@ -1823,6 +1844,15 @@ PyObject* TFE_Py_FastPathExecute_C(PyObject*, PyObject* args) {
 
   op_exec_info.ctx = reinterpret_cast<TFE_Context*>(
       PyCapsule_GetPointer(PyTuple_GET_ITEM(args, 0), nullptr));
+
+  if (op_exec_info.ctx == nullptr) {
+    // The context hasn't been initialized. It will be in the slow path.
+    RaiseFallbackException(
+        "This function does not handle the case of the path where "
+        "all inputs are not already EagerTensors.");
+    return nullptr;
+  }
+
   op_exec_info.device_name = GetDeviceName(PyTuple_GET_ITEM(args, 1));
   op_exec_info.op_name = PyTuple_GET_ITEM(args, 2);
   op_exec_info.op_def = GetOpDef(op_exec_info.op_name);
@@ -1894,6 +1924,9 @@ PyObject* TFE_Py_FastPathExecute_C(PyObject*, PyObject* args) {
                               py_attr_value, &attr_list_sizes, status);
 
         if (TF_GetCode(status) != TF_OK) {
+          VLOG(1) << "Falling back to slow path for Op \"" << op_def->name()
+                  << "\" since we are unable to set the value for attr \""
+                  << attr.name() << "\" due to: " << TF_Message(status);
           RaiseFallbackException(TF_Message(status));
           return nullptr;
         }
@@ -1940,8 +1973,8 @@ PyObject* TFE_Py_FastPathExecute_C(PyObject*, PyObject* args) {
         PyTuple_GET_ITEM(args, kFastPathExecuteInputStartIndex + i);
     if (!input_arg.number_attr().empty()) {
       // The item is a homogeneous list.
-      if (!RaiseIfNotPyList(input, input_arg.number_attr())) return nullptr;
-      Py_ssize_t len = PyList_Size(input);
+      if (!RaiseIfNotPySequence(input, input_arg.number_attr())) return nullptr;
+      Py_ssize_t len = PySequence_Fast_GET_SIZE(input);
 
       TFE_OpSetAttrInt(op, input_arg.number_attr().data(), len);
       if (op_exec_info.run_callbacks) {
@@ -1953,15 +1986,15 @@ PyObject* TFE_Py_FastPathExecute_C(PyObject*, PyObject* args) {
 
       if (len > 0) {
         // First item adds the type attr.
-        if (!AddInputToOp(op_exec_info, PyList_GET_ITEM(input, 0), &input_arg,
-                          flattened_attrs.get(), flattened_inputs.get(), op,
-                          status)) {
+        if (!AddInputToOp(op_exec_info, PySequence_Fast_GET_ITEM(input, 0),
+                          &input_arg, flattened_attrs.get(),
+                          flattened_inputs.get(), op, status)) {
           return nullptr;
         }
 
         for (Py_ssize_t j = 1; j < len; j++) {
           // Since the list is homogeneous, we don't need to re-add the attr.
-          if (!AddInputToOp(op_exec_info, PyList_GET_ITEM(input, j),
+          if (!AddInputToOp(op_exec_info, PySequence_Fast_GET_ITEM(input, j),
                             nullptr /* input_arg */,
                             nullptr /* flattened_attrs */,
                             flattened_inputs.get(), op, status)) {
@@ -1971,16 +2004,18 @@ PyObject* TFE_Py_FastPathExecute_C(PyObject*, PyObject* args) {
       }
     } else if (!input_arg.type_list_attr().empty()) {
       // The item is a heterogeneous list.
-      if (!RaiseIfNotPyList(input, input_arg.type_list_attr())) return nullptr;
+      if (!RaiseIfNotPySequence(input, input_arg.type_list_attr())) {
+        return nullptr;
+      }
       const string& attr_name = input_arg.type_list_attr();
-      Py_ssize_t len = PyList_Size(input);
+      Py_ssize_t len = PySequence_Fast_GET_SIZE(input);
       tensorflow::gtl::InlinedVector<TF_DataType, 4> attr_value(len);
       PyObject* py_attr_value = nullptr;
       if (op_exec_info.run_callbacks) {
         py_attr_value = PyTuple_New(len);
       }
       for (Py_ssize_t j = 0; j < len; j++) {
-        PyObject* py_input = PyList_GET_ITEM(input, j);
+        PyObject* py_input = PySequence_Fast_GET_ITEM(input, j);
         tensorflow::Safe_PyObjectPtr py_eager_tensor;
         if (!ConvertToTensor(op_exec_info, py_input, &py_eager_tensor,
                              status)) {
diff --git a/tensorflow/python/estimator/BUILD b/tensorflow/python/estimator/BUILD
index 5afb5a7dd5d88715768fda985fcea34bc798e37f..5d8b19223f000862aa46ad3a60796ae68bdec2f9 100644
--- a/tensorflow/python/estimator/BUILD
+++ b/tensorflow/python/estimator/BUILD
@@ -9,24 +9,13 @@ licenses(["notice"])  # Apache 2.0
 load("//tensorflow:tensorflow.bzl", "py_test")
 load("//tensorflow:tensorflow.bzl", "cuda_py_test")
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 py_library(
     name = "estimator_py",
     srcs = ["estimator_lib.py"],
     srcs_version = "PY2AND3",
     deps = [
         ":baseline",
+        ":boosted_trees",
         ":dnn",
         ":dnn_linear_combined",
         ":estimator",
@@ -251,6 +240,53 @@ py_test(
     ],
 )
 
+py_library(
+    name = "boosted_trees",
+    srcs = ["canned/boosted_trees.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":estimator",
+        ":head",
+        ":model_fn",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:boosted_trees_ops",
+        "//tensorflow/python:data_flow_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:lookup_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:state_ops",
+        "//tensorflow/python:summary",
+        "//tensorflow/python:training",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python/feature_column",
+        "//tensorflow/python/ops/losses",
+    ],
+)
+
+py_test(
+    name = "boosted_trees_test",
+    size = "medium",
+    srcs = ["canned/boosted_trees_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":boosted_trees",
+        "//tensorflow/core/kernels/boosted_trees:boosted_trees_proto_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:resources",
+        "//tensorflow/python:training",
+        "//tensorflow/python/estimator:numpy_io",
+        "//tensorflow/python/feature_column",
+    ],
+)
+
 py_library(
     name = "dnn",
     srcs = ["canned/dnn.py"],
@@ -930,5 +966,6 @@ cuda_py_test(
     tags = [
         "multi_gpu",
         "noasan",  # flaky time outs
+        "notsan",  # flaky
     ],
 )
diff --git a/tensorflow/python/estimator/canned/baseline_test.py b/tensorflow/python/estimator/canned/baseline_test.py
index 7833df2052657114c9799417e1b9d96035b4c5ef..7bf2e62da9c4598c28ad38825aac2031c9d51905 100644
--- a/tensorflow/python/estimator/canned/baseline_test.py
+++ b/tensorflow/python/estimator/canned/baseline_test.py
@@ -42,13 +42,13 @@ from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import data_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import parsing_ops
-from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import gfile
 from tensorflow.python.platform import test
 from tensorflow.python.summary.writer import writer_cache
 from tensorflow.python.training import checkpoint_utils
+from tensorflow.python.training import distribute as distribute_lib
 from tensorflow.python.training import input as input_lib
 from tensorflow.python.training import optimizer
 from tensorflow.python.training import queue_runner
@@ -482,7 +482,7 @@ class BaselineRegressorTrainingTest(test.TestCase):
       self.assertEquals(0, loss.shape.ndims)
       if expected_loss is None:
         if global_step is not None:
-          return state_ops.assign_add(global_step, 1).op
+          return distribute_lib.increment_var(global_step)
         return control_flow_ops.no_op()
       assert_loss = assert_close(
           math_ops.to_float(expected_loss, name='expected'),
@@ -490,7 +490,7 @@ class BaselineRegressorTrainingTest(test.TestCase):
           name='assert_loss')
       with ops.control_dependencies((assert_loss,)):
         if global_step is not None:
-          return state_ops.assign_add(global_step, 1).op
+          return distribute_lib.increment_var(global_step)
         return control_flow_ops.no_op()
 
     mock_optimizer = test.mock.NonCallableMock(
@@ -685,13 +685,13 @@ class BaselineClassifierTrainingTest(test.TestCase):
       # Verify loss. We can't check the value directly, so we add an assert op.
       self.assertEquals(0, loss.shape.ndims)
       if expected_loss is None:
-        return state_ops.assign_add(global_step, 1).op
+        return distribute_lib.increment_var(global_step)
       assert_loss = assert_close(
           math_ops.to_float(expected_loss, name='expected'),
           loss,
           name='assert_loss')
       with ops.control_dependencies((assert_loss,)):
-        return state_ops.assign_add(global_step, 1).op
+        return distribute_lib.increment_var(global_step)
 
     mock_optimizer = test.mock.NonCallableMock(
         spec=optimizer.Optimizer,
diff --git a/tensorflow/python/estimator/canned/boosted_trees.py b/tensorflow/python/estimator/canned/boosted_trees.py
new file mode 100644
index 0000000000000000000000000000000000000000..c5d5455b1a327d7e08e6c6b59377a51931a31827
--- /dev/null
+++ b/tensorflow/python/estimator/canned/boosted_trees.py
@@ -0,0 +1,729 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Estimator classes for BoostedTrees."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+
+from tensorflow.python.estimator import estimator
+from tensorflow.python.estimator import model_fn
+from tensorflow.python.estimator.canned import head as head_lib
+from tensorflow.python.feature_column import feature_column as feature_column_lib
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import boosted_trees_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import data_flow_ops
+from tensorflow.python.ops import gradients_impl
+from tensorflow.python.ops import lookup_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops.losses import losses
+from tensorflow.python.summary import summary
+from tensorflow.python.training import distribute as distribute_lib
+from tensorflow.python.training import session_run_hook
+from tensorflow.python.training import training_util
+from tensorflow.python.util.tf_export import tf_export
+
+_TreeHParams = collections.namedtuple(
+    'TreeHParams',
+    ['n_trees', 'max_depth', 'learning_rate', 'l1', 'l2', 'tree_complexity'])
+
+_HOLD_FOR_MULTI_CLASS_SUPPORT = object()
+_HOLD_FOR_MULTI_DIM_SUPPORT = object()
+
+
+def _get_transformed_features(features, feature_columns):
+  """Gets the transformed features from features/feature_columns pair.
+
+  Args:
+    features: a dicionary of name to Tensor.
+    feature_columns: a list/set of tf.feature_column.
+
+  Returns:
+    result_features: a list of the transformed features, sorted by the name.
+    num_buckets: the maximum number of buckets across bucketized_columns.
+
+  Raises:
+    ValueError: when unsupported features/columns are tried.
+  """
+  num_buckets = 1
+  # pylint:disable=protected-access
+  for fc in feature_columns:
+    if isinstance(fc, feature_column_lib._BucketizedColumn):
+      # N boundaries creates (N+1) buckets.
+      num_buckets = max(num_buckets, len(fc.boundaries) + 1)
+    else:
+      raise ValueError('For now, only bucketized_column is supported but '
+                       'got: {}'.format(fc))
+  transformed = feature_column_lib._transform_features(features,
+                                                       feature_columns)
+  # pylint:enable=protected-access
+  result_features = []
+  for column in sorted(transformed, key=lambda tc: tc.name):
+    source_name = column.source_column.name
+    squeezed_tensor = array_ops.squeeze(transformed[column], axis=1)
+    if len(squeezed_tensor.shape) > 1:
+      raise ValueError('For now, only supports features equivalent to rank 1 '
+                       'but column `{}` got: {}'.format(
+                           source_name, features[source_name].shape))
+    result_features.append(squeezed_tensor)
+  return result_features, num_buckets
+
+
+def _keep_as_local_variable(tensor, name=None):
+  """Stores a tensor as a local Variable for faster read."""
+  return variable_scope.variable(
+      initial_value=tensor,
+      trainable=False,
+      collections=[ops.GraphKeys.LOCAL_VARIABLES],
+      validate_shape=False,
+      name=name)
+
+
+class _CacheTrainingStatesUsingHashTable(object):
+  """Caching logits, etc. using MutableHashTable."""
+
+  def __init__(self, example_ids, logits_dimension):
+    """Creates a cache with the given configuration.
+
+    It maintains a MutableDenseHashTable for all values.
+    The API lookup() and insert() would have those specs,
+      tree_ids: shape=[batch_size], dtype=int32
+      node_ids: shape=[batch_size], dtype=int32
+      logits: shape=[batch_size, logits_dimension], dtype=float32
+    However in the MutableDenseHashTable, ids are bitcasted into float32 and
+    all values are concatenated as a single tensor (of float32).
+
+    Hence conversion happens internally before inserting to the HashTable and
+    after lookup from it.
+
+    Args:
+      example_ids: a Rank 1 tensor to be used as a key of the cache.
+      logits_dimension: a constant (int) for the dimension of logits.
+
+    Raises:
+      ValueError: if example_ids is other than int64 or string.
+    """
+    if dtypes.as_dtype(dtypes.int64).is_compatible_with(example_ids.dtype):
+      empty_key = -1 << 62
+    elif dtypes.as_dtype(dtypes.string).is_compatible_with(example_ids.dtype):
+      empty_key = ''
+    else:
+      raise ValueError('Unsupported example_id_feature dtype %s.',
+                       example_ids.dtype)
+    # Cache holds latest <tree_id, node_id, logits> for each example.
+    # tree_id and node_id are both int32 but logits is a float32.
+    # To reduce the overhead, we store all of them together as float32 and
+    # bitcast the ids to int32.
+    self._table_ref = lookup_ops.mutable_dense_hash_table_v2(
+        empty_key=empty_key, value_dtype=dtypes.float32, value_shape=[3])
+    self._example_ids = example_ids
+    self._logits_dimension = logits_dimension
+
+  def lookup(self):
+    """Returns cached_tree_ids, cached_node_ids, cached_logits."""
+    cached_tree_ids, cached_node_ids, cached_logits = array_ops.split(
+        lookup_ops.lookup_table_find_v2(
+            self._table_ref, self._example_ids, default_value=[0.0, 0.0, 0.0]),
+        [1, 1, self._logits_dimension],
+        axis=1)
+    cached_tree_ids = array_ops.squeeze(
+        array_ops.bitcast(cached_tree_ids, dtypes.int32))
+    cached_node_ids = array_ops.squeeze(
+        array_ops.bitcast(cached_node_ids, dtypes.int32))
+    return (cached_tree_ids, cached_node_ids, cached_logits)
+
+  def insert(self, tree_ids, node_ids, logits):
+    """Inserts values and returns the op."""
+    insert_op = lookup_ops.lookup_table_insert_v2(
+        self._table_ref, self._example_ids,
+        array_ops.concat(
+            [
+                array_ops.expand_dims(
+                    array_ops.bitcast(tree_ids, dtypes.float32), 1),
+                array_ops.expand_dims(
+                    array_ops.bitcast(node_ids, dtypes.float32), 1),
+                logits,
+            ],
+            axis=1,
+            name='value_concat_for_cache_insert'))
+    return insert_op
+
+
+class _CacheTrainingStatesUsingVariables(object):
+  """Caching logits, etc. using Variables."""
+
+  def __init__(self, batch_size, logits_dimension):
+    """Creates a cache with the given configuration.
+
+    It maintains three variables, tree_ids, node_ids, logits, for caching.
+      tree_ids: shape=[batch_size], dtype=int32
+      node_ids: shape=[batch_size], dtype=int32
+      logits: shape=[batch_size, logits_dimension], dtype=float32
+
+    Note, this can be used only with in-memory data setting.
+
+    Args:
+      batch_size: `int`, the size of the cache.
+      logits_dimension: a constant (int) for the dimension of logits.
+    """
+    self._logits_dimension = logits_dimension
+    self._tree_ids = _keep_as_local_variable(
+        array_ops.zeros([batch_size], dtype=dtypes.int32),
+        name='tree_ids_cache')
+    self._node_ids = _keep_as_local_variable(
+        array_ops.zeros([batch_size], dtype=dtypes.int32),
+        name='node_ids_cache')
+    self._logits = _keep_as_local_variable(
+        array_ops.zeros([batch_size, logits_dimension], dtype=dtypes.float32),
+        name='logits_cache')
+
+  def lookup(self):
+    """Returns cached_tree_ids, cached_node_ids, cached_logits."""
+    return (self._tree_ids, self._node_ids, self._logits)
+
+  def insert(self, tree_ids, node_ids, logits):
+    """Inserts values and returns the op."""
+    return control_flow_ops.group(
+        [
+            self._tree_ids.assign(tree_ids),
+            self._node_ids.assign(node_ids),
+            self._logits.assign(logits)
+        ],
+        name='cache_insert')
+
+
+class _StopAtAttemptsHook(session_run_hook.SessionRunHook):
+  """Hook that requests stop at the number of attempts."""
+
+  def __init__(self, num_finalized_trees_tensor, num_attempted_layers_tensor,
+               max_trees, max_depth):
+    self._num_finalized_trees_tensor = num_finalized_trees_tensor
+    self._num_attempted_layers_tensor = num_attempted_layers_tensor
+    self._max_trees = max_trees
+    self._max_depth = max_depth
+
+  def before_run(self, run_context):
+    return session_run_hook.SessionRunArgs(
+        [self._num_finalized_trees_tensor, self._num_attempted_layers_tensor])
+
+  def after_run(self, run_context, run_values):
+    # num_* tensors should be retrieved by a separate session than the training
+    # one, in order to read the values after growing.
+    # So, if it's approaching to the limit, get the actual value by additional
+    # session.
+    num_finalized_trees, num_attempted_layers = run_values.results
+    if (num_finalized_trees >= self._max_trees - 1 or
+        num_attempted_layers > 2 * self._max_trees * self._max_depth - 1):
+      num_finalized_trees, num_attempted_layers = run_context.session.run(
+          [self._num_finalized_trees_tensor, self._num_attempted_layers_tensor])
+    if (num_finalized_trees >= self._max_trees or
+        num_attempted_layers > 2 * self._max_trees * self._max_depth):
+      run_context.request_stop()
+
+
+def _bt_model_fn(
+    features,
+    labels,
+    mode,
+    head,
+    feature_columns,
+    tree_hparams,
+    n_batches_per_layer,
+    config,
+    closed_form_grad_and_hess_fn=None,
+    example_id_column_name=None,
+    # TODO(youngheek): replace this later using other options.
+    train_in_memory=False,
+    name='boosted_trees'):
+  """Gradient Boosted Trees model_fn.
+
+  Args:
+    features: dict of `Tensor`.
+    labels: `Tensor` of shape [batch_size, 1] or [batch_size] labels of
+      dtype `int32` or `int64` in the range `[0, n_classes)`.
+    mode: Defines whether this is training, evaluation or prediction.
+      See `ModeKeys`.
+    head: A `head_lib._Head` instance.
+    feature_columns: Iterable of `feature_column._FeatureColumn` model inputs.
+    tree_hparams: TODO. collections.namedtuple for hyper parameters.
+    n_batches_per_layer: A `Tensor` of `int64`. Each layer is built after at
+      least n_batches_per_layer accumulations.
+    config: `RunConfig` object to configure the runtime settings.
+    closed_form_grad_and_hess_fn: a function that accepts logits and labels
+      and returns gradients and hessians. By default, they are created by
+      tf.gradients() from the loss.
+    example_id_column_name: Name of the feature for a unique ID per example.
+      Currently experimental -- not exposed to public API.
+    train_in_memory: `bool`, when true, it assumes the dataset is in memory,
+      i.e., input_fn should return the entire dataset as a single batch, and
+      also n_batches_per_layer should be set as 1.
+    name: Name to use for the model.
+
+  Returns:
+      An `EstimatorSpec` instance.
+
+  Raises:
+    ValueError: mode or params are invalid, or features has the wrong type.
+  """
+  is_single_machine = (config.num_worker_replicas <= 1)
+  if train_in_memory:
+    assert n_batches_per_layer == 1, (
+        'When train_in_memory is enabled, input_fn should return the entire '
+        'dataset as a single batch, and n_batches_per_layer should be set as '
+        '1.')
+  worker_device = control_flow_ops.no_op().device
+  # maximum number of splits possible in the whole tree =2^(D-1)-1
+  # TODO(youngheek): perhaps storage could be optimized by storing stats with
+  # the dimension max_splits_per_layer, instead of max_splits (for the entire
+  # tree).
+  max_splits = (1 << tree_hparams.max_depth) - 1
+  with ops.name_scope(name) as name:
+    # Prepare.
+    global_step = training_util.get_or_create_global_step()
+    input_feature_list, num_buckets = _get_transformed_features(
+        features, feature_columns)
+    if train_in_memory and mode == model_fn.ModeKeys.TRAIN:
+      input_feature_list = [
+          _keep_as_local_variable(feature) for feature in input_feature_list
+      ]
+    num_features = len(input_feature_list)
+
+    cache = None
+    if mode == model_fn.ModeKeys.TRAIN:
+      if train_in_memory and is_single_machine:  # maybe just train_in_memory?
+        batch_size = array_ops.shape(input_feature_list[0])[0]
+        cache = _CacheTrainingStatesUsingVariables(batch_size,
+                                                   head.logits_dimension)
+      elif example_id_column_name:
+        example_ids = features[example_id_column_name]
+        cache = _CacheTrainingStatesUsingHashTable(example_ids,
+                                                   head.logits_dimension)
+
+    # Create Ensemble resources.
+    if is_single_machine:
+      tree_ensemble = boosted_trees_ops.TreeEnsemble(name=name)
+      local_tree_ensemble = tree_ensemble
+      ensemble_reload = control_flow_ops.no_op()
+    else:
+      tree_ensemble = boosted_trees_ops.TreeEnsemble(name=name)
+      with ops.device(worker_device):
+        local_tree_ensemble = boosted_trees_ops.TreeEnsemble(
+            name=name + '_local', is_local=True)
+      # TODO(soroush): Do partial updates if this becomes a bottleneck.
+      ensemble_reload = local_tree_ensemble.deserialize(
+          *tree_ensemble.serialize())
+
+    # Create logits.
+    if mode != model_fn.ModeKeys.TRAIN:
+      logits = boosted_trees_ops.predict(
+          tree_ensemble_handle=local_tree_ensemble.resource_handle,
+          bucketized_features=input_feature_list,
+          logits_dimension=head.logits_dimension,
+          max_depth=tree_hparams.max_depth)
+    else:
+      if cache:
+        cached_tree_ids, cached_node_ids, cached_logits = cache.lookup()
+      else:
+        # Always start from the beginning when no cache is set up.
+        batch_size = array_ops.shape(input_feature_list[0])[0]
+        cached_tree_ids, cached_node_ids, cached_logits = (
+            array_ops.zeros([batch_size], dtype=dtypes.int32),
+            array_ops.zeros([batch_size], dtype=dtypes.int32),
+            array_ops.zeros(
+                [batch_size, head.logits_dimension], dtype=dtypes.float32))
+      with ops.control_dependencies([ensemble_reload]):
+        (stamp_token, num_trees, num_finalized_trees,
+         num_attempted_layers) = local_tree_ensemble.get_states()
+        summary.scalar('ensemble/num_trees', num_trees)
+        summary.scalar('ensemble/num_finalized_trees', num_finalized_trees)
+        summary.scalar('ensemble/num_attempted_layers', num_attempted_layers)
+
+        partial_logits, tree_ids, node_ids = boosted_trees_ops.training_predict(
+            tree_ensemble_handle=local_tree_ensemble.resource_handle,
+            cached_tree_ids=cached_tree_ids,
+            cached_node_ids=cached_node_ids,
+            bucketized_features=input_feature_list,
+            logits_dimension=head.logits_dimension,
+            max_depth=tree_hparams.max_depth)
+      logits = cached_logits + partial_logits
+
+    # Create training graph.
+    def _train_op_fn(loss):
+      """Run one training iteration."""
+      train_op = []
+      if cache:
+        train_op.append(cache.insert(tree_ids, node_ids, logits))
+      if closed_form_grad_and_hess_fn:
+        gradients, hessians = closed_form_grad_and_hess_fn(logits, labels)
+      else:
+        gradients = gradients_impl.gradients(loss, logits, name='Gradients')[0]
+        hessians = gradients_impl.gradients(
+            gradients, logits, name='Hessians')[0]
+      stats_summary_list = [
+          array_ops.squeeze(
+              boosted_trees_ops.make_stats_summary(
+                  node_ids=node_ids,
+                  gradients=gradients,
+                  hessians=hessians,
+                  bucketized_features_list=[input_feature_list[f]],
+                  max_splits=max_splits,
+                  num_buckets=num_buckets),
+              axis=0) for f in range(num_features)
+      ]
+
+      def grow_tree_from_stats_summaries(stats_summary_list):
+        """Updates ensemble based on the best gains from stats summaries."""
+        (node_ids_per_feature, gains_list, thresholds_list,
+         left_node_contribs_list, right_node_contribs_list) = (
+             boosted_trees_ops.calculate_best_gains_per_feature(
+                 node_id_range=array_ops.stack([
+                     math_ops.reduce_min(node_ids),
+                     math_ops.reduce_max(node_ids)
+                 ]),
+                 stats_summary_list=stats_summary_list,
+                 l1=tree_hparams.l1,
+                 l2=tree_hparams.l2,
+                 tree_complexity=tree_hparams.tree_complexity,
+                 max_splits=max_splits))
+        grow_op = boosted_trees_ops.update_ensemble(
+            # Confirm if local_tree_ensemble or tree_ensemble should be used.
+            tree_ensemble.resource_handle,
+            feature_ids=math_ops.range(0, num_features, dtype=dtypes.int32),
+            node_ids=node_ids_per_feature,
+            gains=gains_list,
+            thresholds=thresholds_list,
+            left_node_contribs=left_node_contribs_list,
+            right_node_contribs=right_node_contribs_list,
+            learning_rate=tree_hparams.learning_rate,
+            max_depth=tree_hparams.max_depth,
+            pruning_mode=boosted_trees_ops.PruningMode.NO_PRUNING)
+        return grow_op
+
+      if train_in_memory and is_single_machine:
+        train_op.append(distribute_lib.increment_var(global_step))
+        train_op.append(grow_tree_from_stats_summaries(stats_summary_list))
+      else:
+        summary_accumulator = data_flow_ops.ConditionalAccumulator(
+            dtype=dtypes.float32,
+            # The stats consist of gradients and hessians (the last dimension).
+            shape=[num_features, max_splits, num_buckets, 2],
+            shared_name='stats_summary_accumulator')
+        apply_grad = summary_accumulator.apply_grad(
+            array_ops.stack(stats_summary_list, axis=0), stamp_token)
+
+        def grow_tree_from_accumulated_summaries_fn():
+          """Updates the tree with the best layer from accumulated summaries."""
+          # Take out the accumulated summaries from the accumulator and grow.
+          stats_summary_list = array_ops.unstack(
+              summary_accumulator.take_grad(1), axis=0)
+          grow_op = grow_tree_from_stats_summaries(stats_summary_list)
+          return grow_op
+
+        with ops.control_dependencies([apply_grad]):
+          train_op.append(distribute_lib.increment_var(global_step))
+          if config.is_chief:
+            train_op.append(
+                control_flow_ops.cond(
+                    math_ops.greater_equal(
+                        summary_accumulator.num_accumulated(),
+                        n_batches_per_layer),
+                    grow_tree_from_accumulated_summaries_fn,
+                    control_flow_ops.no_op,
+                    name='wait_until_n_batches_accumulated'))
+
+      return control_flow_ops.group(train_op, name='train_op')
+
+  estimator_spec = head.create_estimator_spec(
+      features=features,
+      mode=mode,
+      labels=labels,
+      train_op_fn=_train_op_fn,
+      logits=logits)
+  if mode == model_fn.ModeKeys.TRAIN:
+    # Add an early stop hook.
+    estimator_spec = estimator_spec._replace(
+        training_hooks=estimator_spec.training_hooks +
+        (_StopAtAttemptsHook(num_finalized_trees, num_attempted_layers,
+                             tree_hparams.n_trees, tree_hparams.max_depth),))
+  return estimator_spec
+
+
+def _create_classification_head(n_classes,
+                                weight_column=None,
+                                label_vocabulary=None):
+  """Creates a classification head. Refer to canned.head for details on args."""
+  # TODO(nponomareva): Support multi-class cases.
+  if n_classes == 2:
+    # pylint: disable=protected-access
+    return head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(
+        weight_column=weight_column,
+        label_vocabulary=label_vocabulary,
+        loss_reduction=losses.Reduction.SUM_OVER_BATCH_SIZE)
+    # pylint: enable=protected-access
+  else:
+    raise ValueError('For now only binary classification is supported.'
+                     'n_classes given as {}'.format(n_classes))
+
+
+def _create_classification_head_and_closed_form(n_classes, weight_column,
+                                                label_vocabulary):
+  """Creates a head for classifier and the closed form gradients/hessians."""
+  head = _create_classification_head(n_classes, weight_column, label_vocabulary)
+  if n_classes == 2 and weight_column is None and label_vocabulary is None:
+    # Use the closed-form gradients/hessians for 2 class.
+    def _grad_and_hess_for_logloss(logits, labels):
+      # TODO(youngheek): add weights handling.
+      predictions = math_ops.reciprocal(math_ops.exp(-logits) + 1.0)
+      normalizer = math_ops.reciprocal(
+          math_ops.cast(array_ops.size(predictions), dtypes.float32))
+      gradients = (predictions - labels) * normalizer
+      hessians = predictions * (1.0 - predictions) * normalizer
+      return gradients, hessians
+
+    closed_form = _grad_and_hess_for_logloss
+  else:
+    closed_form = None
+  return (head, closed_form)
+
+
+def _create_regression_head(label_dimension, weight_column=None):
+  if label_dimension != 1:
+    raise ValueError('For now only 1 dimension regression is supported.'
+                     'label_dimension given as {}'.format(label_dimension))
+  # pylint: disable=protected-access
+  return head_lib._regression_head_with_mean_squared_error_loss(
+      label_dimension=label_dimension,
+      weight_column=weight_column,
+      loss_reduction=losses.Reduction.SUM_OVER_BATCH_SIZE)
+  # pylint: enable=protected-access
+
+
+@tf_export('estimator.BoostedTreesClassifier')
+class BoostedTreesClassifier(estimator.Estimator):
+  """A Classifier for Tensorflow Boosted Trees models."""
+
+  def __init__(
+      self,
+      feature_columns,
+      n_batches_per_layer,
+      model_dir=None,
+      n_classes=_HOLD_FOR_MULTI_CLASS_SUPPORT,
+      weight_column=None,
+      label_vocabulary=None,
+      n_trees=100,
+      max_depth=6,
+      learning_rate=0.1,
+      l1_regularization=0.,
+      l2_regularization=0.,
+      tree_complexity=0.,
+      config=None):
+    """Initializes a `BoostedTreesClassifier` instance.
+
+    Example:
+
+    ```python
+    bucketized_feature_1 = bucketized_column(
+      numeric_column('feature_1'), BUCKET_BOUNDARIES_1)
+    bucketized_feature_2 = bucketized_column(
+      numeric_column('feature_2'), BUCKET_BOUNDARIES_2)
+
+    classifier = estimator.BoostedTreesClassifier(
+        feature_columns=[bucketized_feature_1, bucketized_feature_2],
+        n_trees=100,
+        ... <some other params>
+    )
+
+    def input_fn_train():
+      ...
+      return dataset
+
+    classifier.train(input_fn=input_fn_train)
+
+    def input_fn_eval():
+      ...
+      return dataset
+
+    metrics = classifier.evaluate(input_fn=input_fn_eval)
+    ```
+
+    Args:
+      feature_columns: An iterable containing all the feature columns used by
+        the model. All items in the set should be instances of classes derived
+        from `FeatureColumn`.
+      n_batches_per_layer: the number of batches to collect statistics per
+        layer.
+      model_dir: Directory to save model parameters, graph and etc. This can
+        also be used to load checkpoints from the directory into a estimator
+        to continue training a previously saved model.
+      n_classes: number of label classes. Default is binary classification.
+        Multiclass support is not yet implemented.
+      weight_column: A string or a `_NumericColumn` created by
+        `tf.feature_column.numeric_column` defining feature column representing
+        weights. It is used to downweight or boost examples during training. It
+        will be multiplied by the loss of the example. If it is a string, it is
+        used as a key to fetch weight tensor from the `features`. If it is a
+        `_NumericColumn`, raw tensor is fetched by key `weight_column.key`,
+        then weight_column.normalizer_fn is applied on it to get weight tensor.
+      label_vocabulary: A list of strings represents possible label values. If
+        given, labels must be string type and have any value in
+        `label_vocabulary`. If it is not given, that means labels are
+        already encoded as integer or float within [0, 1] for `n_classes=2` and
+        encoded as integer values in {0, 1,..., n_classes-1} for `n_classes`>2 .
+        Also there will be errors if vocabulary is not provided and labels are
+        string.
+      n_trees: number trees to be created.
+      max_depth: maximum depth of the tree to grow.
+      learning_rate: shrinkage parameter to be used when a tree added to the
+        model.
+      l1_regularization: regularization multiplier applied to the absolute
+        weights of the tree leafs.
+      l2_regularization: regularization multiplier applied to the square weights
+        of the tree leafs.
+      tree_complexity: regularization factor to penalize trees with more leaves.
+      config: `RunConfig` object to configure the runtime settings.
+
+    Raises:
+      ValueError: when wrong arguments are given or unsupported functionalities
+         are requested.
+    """
+    # TODO(nponomareva): Support multi-class cases.
+    if n_classes == _HOLD_FOR_MULTI_CLASS_SUPPORT:
+      n_classes = 2
+    head, closed_form = _create_classification_head_and_closed_form(
+        n_classes, weight_column, label_vocabulary=label_vocabulary)
+
+    # HParams for the model.
+    tree_hparams = _TreeHParams(
+        n_trees, max_depth, learning_rate, l1_regularization, l2_regularization,
+        tree_complexity)
+
+    def _model_fn(features, labels, mode, config):
+      return _bt_model_fn(  # pylint: disable=protected-access
+          features,
+          labels,
+          mode,
+          head,
+          feature_columns,
+          tree_hparams,
+          n_batches_per_layer,
+          config,
+          closed_form_grad_and_hess_fn=closed_form)
+
+    super(BoostedTreesClassifier, self).__init__(
+        model_fn=_model_fn, model_dir=model_dir, config=config)
+
+
+@tf_export('estimator.BoostedTreesRegressor')
+class BoostedTreesRegressor(estimator.Estimator):
+  """A Regressor for Tensorflow Boosted Trees models."""
+
+  def __init__(
+      self,
+      feature_columns,
+      n_batches_per_layer,
+      model_dir=None,
+      label_dimension=_HOLD_FOR_MULTI_DIM_SUPPORT,
+      weight_column=None,
+      n_trees=100,
+      max_depth=6,
+      learning_rate=0.1,
+      l1_regularization=0.,
+      l2_regularization=0.,
+      tree_complexity=0.,
+      config=None):
+    """Initializes a `BoostedTreesRegressor` instance.
+
+    Example:
+
+    ```python
+    bucketized_feature_1 = bucketized_column(
+      numeric_column('feature_1'), BUCKET_BOUNDARIES_1)
+    bucketized_feature_2 = bucketized_column(
+      numeric_column('feature_2'), BUCKET_BOUNDARIES_2)
+
+    regressor = estimator.BoostedTreesRegressor(
+        feature_columns=[bucketized_feature_1, bucketized_feature_2],
+        n_trees=100,
+        ... <some other params>
+    )
+
+    def input_fn_train():
+      ...
+      return dataset
+
+    regressor.train(input_fn=input_fn_train)
+
+    def input_fn_eval():
+      ...
+      return dataset
+
+    metrics = regressor.evaluate(input_fn=input_fn_eval)
+    ```
+
+    Args:
+      feature_columns: An iterable containing all the feature columns used by
+        the model. All items in the set should be instances of classes derived
+        from `FeatureColumn`.
+      n_batches_per_layer: the number of batches to collect statistics per
+        layer.
+      model_dir: Directory to save model parameters, graph and etc. This can
+        also be used to load checkpoints from the directory into a estimator
+        to continue training a previously saved model.
+      label_dimension: Number of regression targets per example.
+        Multi-dimensional support is not yet implemented.
+      weight_column: A string or a `_NumericColumn` created by
+        `tf.feature_column.numeric_column` defining feature column representing
+        weights. It is used to downweight or boost examples during training. It
+        will be multiplied by the loss of the example. If it is a string, it is
+        used as a key to fetch weight tensor from the `features`. If it is a
+        `_NumericColumn`, raw tensor is fetched by key `weight_column.key`,
+        then weight_column.normalizer_fn is applied on it to get weight tensor.
+      n_trees: number trees to be created.
+      max_depth: maximum depth of the tree to grow.
+      learning_rate: shrinkage parameter to be used when a tree added to the
+        model.
+      l1_regularization: regularization multiplier applied to the absolute
+        weights of the tree leafs.
+      l2_regularization: regularization multiplier applied to the square weights
+        of the tree leafs.
+      tree_complexity: regularization factor to penalize trees with more leaves.
+      config: `RunConfig` object to configure the runtime settings.
+
+    Raises:
+      ValueError: when wrong arguments are given or unsupported functionalities
+         are requested.
+    """
+    # TODO(nponomareva): Extend it to multi-dimension cases.
+    if label_dimension == _HOLD_FOR_MULTI_DIM_SUPPORT:
+      label_dimension = 1
+    head = _create_regression_head(label_dimension, weight_column)
+
+    # HParams for the model.
+    tree_hparams = _TreeHParams(
+        n_trees, max_depth, learning_rate, l1_regularization, l2_regularization,
+        tree_complexity)
+
+    def _model_fn(features, labels, mode, config):
+      return _bt_model_fn(  # pylint: disable=protected-access
+          features, labels, mode, head, feature_columns, tree_hparams,
+          n_batches_per_layer, config)
+
+    super(BoostedTreesRegressor, self).__init__(
+        model_fn=_model_fn, model_dir=model_dir, config=config)
diff --git a/tensorflow/python/estimator/canned/boosted_trees_test.py b/tensorflow/python/estimator/canned/boosted_trees_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..625745a3f97465c51484212572e5bae4dd101c23
--- /dev/null
+++ b/tensorflow/python/estimator/canned/boosted_trees_test.py
@@ -0,0 +1,786 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests boosted_trees estimators and model_fn."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.core.kernels.boosted_trees import boosted_trees_pb2
+from tensorflow.python.estimator import model_fn
+from tensorflow.python.estimator import run_config
+from tensorflow.python.estimator.canned import boosted_trees
+from tensorflow.python.estimator.inputs import numpy_io
+from tensorflow.python.feature_column import feature_column
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import gen_boosted_trees_ops
+from tensorflow.python.ops import resources
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import googletest
+from tensorflow.python.training import checkpoint_utils
+
+NUM_FEATURES = 3
+
+BUCKET_BOUNDARIES = [-2., .5, 12.]  # Boundaries for all the features.
+INPUT_FEATURES = np.array(
+    [
+        [12.5, 1.0, -2.001, -2.0001, -1.999],  # feature_0 quantized:[3,2,0,0,1]
+        [2.0, -3.0, 0.5, 0.0, 0.4995],         # feature_1 quantized:[2,0,2,1,1]
+        [3.0, 20.0, 50.0, -100.0, 102.75],     # feature_2 quantized:[2,3,3,0,3]
+    ],
+    dtype=np.float32)
+CLASSIFICATION_LABELS = [[0.], [1.], [1.], [0.], [0.]]
+REGRESSION_LABELS = [[1.5], [0.3], [0.2], [2.], [5.]]
+FEATURES_DICT = {'f_%d' % i: INPUT_FEATURES[i] for i in range(NUM_FEATURES)}
+
+# EXAMPLE_ID is not exposed to Estimator yet, but supported at model_fn level.
+EXAMPLE_IDS = np.array([0, 1, 2, 3, 4], dtype=np.int64)
+EXAMPLE_ID_COLUMN = '__example_id__'
+
+
+def _make_train_input_fn(is_classification):
+  """Makes train input_fn for classification/regression."""
+
+  def _input_fn():
+    features = dict(FEATURES_DICT)
+    features[EXAMPLE_ID_COLUMN] = constant_op.constant(EXAMPLE_IDS)
+    if is_classification:
+      labels = CLASSIFICATION_LABELS
+    else:
+      labels = REGRESSION_LABELS
+    return features, labels
+
+  return _input_fn
+
+
+class BoostedTreesEstimatorTest(test_util.TensorFlowTestCase):
+
+  def setUp(self):
+    self._feature_columns = {
+        feature_column.bucketized_column(
+            feature_column.numeric_column('f_%d' % i, dtype=dtypes.float32),
+            BUCKET_BOUNDARIES)
+        for i in range(NUM_FEATURES)
+    }
+
+  def _assert_checkpoint(self, model_dir, global_step, finalized_trees,
+                         attempted_layers):
+    reader = checkpoint_utils.load_checkpoint(model_dir)
+    self.assertEqual(global_step, reader.get_tensor(ops.GraphKeys.GLOBAL_STEP))
+    serialized = reader.get_tensor('boosted_trees:0_serialized')
+    ensemble_proto = boosted_trees_pb2.TreeEnsemble()
+    ensemble_proto.ParseFromString(serialized)
+    self.assertEqual(
+        finalized_trees,
+        sum([1 for t in ensemble_proto.tree_metadata if t.is_finalized]))
+    self.assertEqual(attempted_layers,
+                     ensemble_proto.growing_metadata.num_layers_attempted)
+
+  def testTrainAndEvaluateBinaryClassifier(self):
+    input_fn = _make_train_input_fn(is_classification=True)
+
+    est = boosted_trees.BoostedTreesClassifier(
+        feature_columns=self._feature_columns,
+        n_batches_per_layer=1,
+        n_trees=1,
+        max_depth=5)
+
+    # It will stop after 5 steps because of the max depth and num trees.
+    num_steps = 100
+    # Train for a few steps, and validate final checkpoint.
+    est.train(input_fn, steps=num_steps)
+    self._assert_checkpoint(
+        est.model_dir, global_step=5, finalized_trees=1, attempted_layers=5)
+    eval_res = est.evaluate(input_fn=input_fn, steps=1)
+    self.assertAllClose(eval_res['accuracy'], 1.0)
+
+  def testInferBinaryClassifier(self):
+    train_input_fn = _make_train_input_fn(is_classification=True)
+    predict_input_fn = numpy_io.numpy_input_fn(
+        x=FEATURES_DICT, y=None, batch_size=1, num_epochs=1, shuffle=False)
+
+    est = boosted_trees.BoostedTreesClassifier(
+        feature_columns=self._feature_columns,
+        n_batches_per_layer=1,
+        n_trees=1,
+        max_depth=5)
+
+    # It will stop after 5 steps because of the max depth and num trees.
+    num_steps = 100
+    # Train for a few steps, and validate final checkpoint.
+    est.train(train_input_fn, steps=num_steps)
+
+    predictions = list(est.predict(input_fn=predict_input_fn))
+    # All labels are correct.
+    self.assertAllClose([[0], [1], [1], [0], [0]],
+                        [pred['class_ids'] for pred in predictions])
+
+  def testTrainAndEvaluateRegressor(self):
+    input_fn = _make_train_input_fn(is_classification=False)
+
+    est = boosted_trees.BoostedTreesRegressor(
+        feature_columns=self._feature_columns,
+        n_batches_per_layer=1,
+        n_trees=2,
+        max_depth=5)
+
+    # It will stop after 10 steps because of the max depth and num trees.
+    num_steps = 100
+    # Train for a few steps, and validate final checkpoint.
+    est.train(input_fn, steps=num_steps)
+    self._assert_checkpoint(
+        est.model_dir, global_step=10, finalized_trees=2, attempted_layers=10)
+    eval_res = est.evaluate(input_fn=input_fn, steps=1)
+    self.assertAllClose(eval_res['average_loss'], 1.008551)
+
+  def testInferRegressor(self):
+    train_input_fn = _make_train_input_fn(is_classification=False)
+    predict_input_fn = numpy_io.numpy_input_fn(
+        x=FEATURES_DICT, y=None, batch_size=1, num_epochs=1, shuffle=False)
+
+    est = boosted_trees.BoostedTreesRegressor(
+        feature_columns=self._feature_columns,
+        n_batches_per_layer=1,
+        n_trees=1,
+        max_depth=5)
+
+    # It will stop after 5 steps because of the max depth and num trees.
+    num_steps = 100
+    # Train for a few steps, and validate final checkpoint.
+    est.train(train_input_fn, steps=num_steps)
+    self._assert_checkpoint(
+        est.model_dir, global_step=5, finalized_trees=1, attempted_layers=5)
+
+    predictions = list(est.predict(input_fn=predict_input_fn))
+    self.assertAllClose(
+        [[0.571619], [0.262821], [0.124549], [0.956801], [1.769801]],
+        [pred['predictions'] for pred in predictions])
+
+
+class ModelFnTests(test_util.TensorFlowTestCase):
+  """Tests bt_model_fn including unexposed internal functionalities."""
+
+  def setUp(self):
+    self._feature_columns = {
+        feature_column.bucketized_column(
+            feature_column.numeric_column('f_%d' % i, dtype=dtypes.float32),
+            BUCKET_BOUNDARIES) for i in range(NUM_FEATURES)
+    }
+    self._tree_hparams = boosted_trees._TreeHParams(  # pylint:disable=protected-access
+        n_trees=2,
+        max_depth=2,
+        learning_rate=0.1,
+        l1=0.,
+        l2=0.01,
+        tree_complexity=0.)
+
+  def _get_expected_ensembles_for_classification(self):
+    first_round = """
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id: 2
+              threshold: 2
+              left_id: 1
+              right_id: 2
+            }
+            metadata {
+              gain: 0.387675
+            }
+          }
+          nodes {
+            leaf {
+              scalar: -0.181818
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.0625
+            }
+          }
+        }
+        tree_weights: 1.0
+        tree_metadata {
+          num_layers_grown: 1
+          is_finalized: false
+        }
+        growing_metadata {
+          num_trees_attempted: 1
+          num_layers_attempted: 1
+        }
+        """
+    second_round = """
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id: 2
+              threshold: 2
+              left_id: 1
+              right_id: 2
+            }
+            metadata {
+              gain: 0.387675
+            }
+          }
+          nodes {
+            bucketized_split {
+              feature_id: 0
+              threshold: 3
+              left_id: 3
+              right_id: 4
+            }
+            metadata {
+              gain: 0.0
+              original_leaf {
+                scalar: -0.181818
+              }
+            }
+          }
+          nodes {
+            bucketized_split {
+              feature_id: 0
+              threshold: 0
+              left_id: 5
+              right_id: 6
+            }
+            metadata {
+              gain: 0.105518
+              original_leaf {
+                scalar: 0.0625
+              }
+            }
+          }
+          nodes {
+            leaf {
+              scalar: -0.348397
+            }
+          }
+          nodes {
+            leaf {
+              scalar: -0.181818
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.224091
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.056815
+            }
+          }
+        }
+        trees {
+          nodes {
+            leaf {
+              scalar: 0.0
+            }
+          }
+        }
+        tree_weights: 1.0
+        tree_weights: 1.0
+        tree_metadata {
+          num_layers_grown: 2
+          is_finalized: true
+        }
+        tree_metadata {
+          num_layers_grown: 0
+          is_finalized: false
+        }
+        growing_metadata {
+          num_trees_attempted: 1
+          num_layers_attempted: 2
+        }
+        """
+    third_round = """
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id: 2
+              threshold: 2
+              left_id: 1
+              right_id: 2
+            }
+            metadata {
+              gain: 0.387675
+            }
+          }
+          nodes {
+            bucketized_split {
+              feature_id: 0
+              threshold: 3
+              left_id: 3
+              right_id: 4
+            }
+            metadata {
+              gain: 0.0
+              original_leaf {
+                scalar: -0.181818
+              }
+            }
+          }
+          nodes {
+            bucketized_split {
+              feature_id: 0
+              threshold: 0
+              left_id: 5
+              right_id: 6
+            }
+            metadata {
+              gain: 0.105518
+              original_leaf {
+                scalar: 0.0625
+              }
+            }
+          }
+          nodes {
+            leaf {
+              scalar: -0.348397
+            }
+          }
+          nodes {
+            leaf {
+              scalar: -0.181818
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.224091
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.056815
+            }
+          }
+        }
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id: 1
+              threshold: 0
+              left_id: 1
+              right_id: 2
+            }
+            metadata {
+              gain: 0.287131
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.162042
+            }
+          }
+          nodes {
+            leaf {
+              scalar: -0.086986
+            }
+          }
+        }
+        tree_weights: 1.0
+        tree_weights: 1.0
+        tree_metadata {
+          num_layers_grown: 2
+          is_finalized: true
+        }
+        tree_metadata {
+          num_layers_grown: 1
+          is_finalized: false
+        }
+        growing_metadata {
+          num_trees_attempted: 2
+          num_layers_attempted: 3
+        }
+        """
+    return (first_round, second_round, third_round)
+
+  def _get_expected_ensembles_for_regression(self):
+    first_round = """
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id: 1
+              threshold: 1
+              left_id: 1
+              right_id: 2
+            }
+            metadata {
+              gain: 1.169714
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.241322
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.083951
+            }
+          }
+        }
+        tree_weights: 1.0
+        tree_metadata {
+          num_layers_grown: 1
+          is_finalized: false
+        }
+        growing_metadata {
+          num_trees_attempted: 1
+          num_layers_attempted: 1
+        }
+        """
+    second_round = """
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id: 1
+              threshold: 1
+              left_id: 1
+              right_id: 2
+            }
+            metadata {
+              gain: 1.169714
+            }
+          }
+          nodes {
+            bucketized_split {
+              feature_id: 0
+              threshold: 1
+              left_id: 3
+              right_id: 4
+            }
+            metadata {
+              gain: 2.673407
+              original_leaf {
+                scalar: 0.241322
+              }
+            }
+          }
+          nodes {
+            bucketized_split {
+              feature_id: 0
+              threshold: 0
+              left_id: 5
+              right_id: 6
+            }
+            metadata {
+              gain: 0.324102
+              original_leaf {
+                scalar: 0.083951
+              }
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.563167
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.247047
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.095273
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.222102
+            }
+          }
+        }
+        trees {
+          nodes {
+            leaf {
+              scalar: 0.0
+            }
+          }
+        }
+        tree_weights: 1.0
+        tree_weights: 1.0
+        tree_metadata {
+          num_layers_grown: 2
+          is_finalized: true
+        }
+        tree_metadata {
+          num_layers_grown: 0
+          is_finalized: false
+        }
+        growing_metadata {
+          num_trees_attempted: 1
+          num_layers_attempted: 2
+        }
+        """
+    third_round = """
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id: 1
+              threshold: 1
+              left_id: 1
+              right_id: 2
+            }
+            metadata {
+              gain: 1.169714
+            }
+          }
+          nodes {
+            bucketized_split {
+              feature_id: 0
+              threshold: 1
+              left_id: 3
+              right_id: 4
+            }
+            metadata {
+              gain: 2.673407
+              original_leaf {
+                scalar: 0.241322
+              }
+            }
+          }
+          nodes {
+            bucketized_split {
+              feature_id: 0
+              threshold: 0
+              left_id: 5
+              right_id: 6
+            }
+            metadata {
+              gain: 0.324102
+              original_leaf {
+                scalar: 0.083951
+              }
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.563167
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.247047
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.095273
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.222102
+            }
+          }
+        }
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id: 1
+              threshold: 0
+              left_id: 1
+              right_id: 2
+            }
+            metadata {
+              gain: 0.981026
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.005166
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.180281
+            }
+          }
+        }
+        tree_weights: 1.0
+        tree_weights: 1.0
+        tree_metadata {
+          num_layers_grown: 2
+          is_finalized: true
+        }
+        tree_metadata {
+          num_layers_grown: 1
+          is_finalized: false
+        }
+        growing_metadata {
+          num_trees_attempted: 2
+          num_layers_attempted: 3
+        }
+        """
+    return (first_round, second_round, third_round)
+
+  def _get_train_op_and_ensemble(self, head, config, is_classification,
+                                 train_in_memory):
+    """Calls bt_model_fn() and returns the train_op and ensemble_serialzed."""
+    features, labels = _make_train_input_fn(is_classification)()
+    estimator_spec = boosted_trees._bt_model_fn(  # pylint:disable=protected-access
+        features=features,
+        labels=labels,
+        mode=model_fn.ModeKeys.TRAIN,
+        head=head,
+        feature_columns=self._feature_columns,
+        tree_hparams=self._tree_hparams,
+        example_id_column_name=EXAMPLE_ID_COLUMN,
+        n_batches_per_layer=1,
+        config=config,
+        train_in_memory=train_in_memory)
+    resources.initialize_resources(resources.shared_resources()).run()
+    variables.global_variables_initializer().run()
+    variables.local_variables_initializer().run()
+
+    # Gets the train_op and serialized proto of the ensemble.
+    shared_resources = resources.shared_resources()
+    self.assertEqual(1, len(shared_resources))
+    train_op = estimator_spec.train_op
+    with ops.control_dependencies([train_op]):
+      _, ensemble_serialized = (
+          gen_boosted_trees_ops.boosted_trees_serialize_ensemble(
+              shared_resources[0].handle))
+    return train_op, ensemble_serialized
+
+  def testTrainClassifierInMemory(self):
+    ops.reset_default_graph()
+    expected_first, expected_second, expected_third = (
+        self._get_expected_ensembles_for_classification())
+    with self.test_session() as sess:
+      # Train with train_in_memory mode.
+      with sess.graph.as_default():
+        train_op, ensemble_serialized = self._get_train_op_and_ensemble(
+            boosted_trees._create_classification_head(n_classes=2),
+            run_config.RunConfig(),
+            is_classification=True,
+            train_in_memory=True)
+      _, serialized = sess.run([train_op, ensemble_serialized])
+      # Validate the trained ensemble.
+      ensemble_proto = boosted_trees_pb2.TreeEnsemble()
+      ensemble_proto.ParseFromString(serialized)
+      self.assertProtoEquals(expected_first, ensemble_proto)
+
+      # Run one more time and validate the trained ensemble.
+      _, serialized = sess.run([train_op, ensemble_serialized])
+      ensemble_proto = boosted_trees_pb2.TreeEnsemble()
+      ensemble_proto.ParseFromString(serialized)
+      self.assertProtoEquals(expected_second, ensemble_proto)
+
+      # Third round training and validation.
+      _, serialized = sess.run([train_op, ensemble_serialized])
+      ensemble_proto = boosted_trees_pb2.TreeEnsemble()
+      ensemble_proto.ParseFromString(serialized)
+      self.assertProtoEquals(expected_third, ensemble_proto)
+
+  def testTrainClassifierNonInMemory(self):
+    ops.reset_default_graph()
+    expected_first, expected_second, expected_third = (
+        self._get_expected_ensembles_for_classification())
+    with self.test_session() as sess:
+      # Train without train_in_memory mode.
+      with sess.graph.as_default():
+        train_op, ensemble_serialized = self._get_train_op_and_ensemble(
+            boosted_trees._create_classification_head(n_classes=2),
+            run_config.RunConfig(),
+            is_classification=True,
+            train_in_memory=False)
+      _, serialized = sess.run([train_op, ensemble_serialized])
+      # Validate the trained ensemble.
+      ensemble_proto = boosted_trees_pb2.TreeEnsemble()
+      ensemble_proto.ParseFromString(serialized)
+      self.assertProtoEquals(expected_first, ensemble_proto)
+
+      # Run one more time and validate the trained ensemble.
+      _, serialized = sess.run([train_op, ensemble_serialized])
+      ensemble_proto = boosted_trees_pb2.TreeEnsemble()
+      ensemble_proto.ParseFromString(serialized)
+      self.assertProtoEquals(expected_second, ensemble_proto)
+
+      # Third round training and validation.
+      _, serialized = sess.run([train_op, ensemble_serialized])
+      ensemble_proto = boosted_trees_pb2.TreeEnsemble()
+      ensemble_proto.ParseFromString(serialized)
+      self.assertProtoEquals(expected_third, ensemble_proto)
+
+  def testTrainRegressorInMemory(self):
+    ops.reset_default_graph()
+    expected_first, expected_second, expected_third = (
+        self._get_expected_ensembles_for_regression())
+    with self.test_session() as sess:
+      # Train with train_in_memory mode.
+      with sess.graph.as_default():
+        train_op, ensemble_serialized = self._get_train_op_and_ensemble(
+            boosted_trees._create_regression_head(label_dimension=1),
+            run_config.RunConfig(),
+            is_classification=False,
+            train_in_memory=True)
+      _, serialized = sess.run([train_op, ensemble_serialized])
+      # Validate the trained ensemble.
+      ensemble_proto = boosted_trees_pb2.TreeEnsemble()
+      ensemble_proto.ParseFromString(serialized)
+      self.assertProtoEquals(expected_first, ensemble_proto)
+
+      # Run one more time and validate the trained ensemble.
+      _, serialized = sess.run([train_op, ensemble_serialized])
+      ensemble_proto = boosted_trees_pb2.TreeEnsemble()
+      ensemble_proto.ParseFromString(serialized)
+      self.assertProtoEquals(expected_second, ensemble_proto)
+
+      # Third round training and validation.
+      _, serialized = sess.run([train_op, ensemble_serialized])
+      ensemble_proto = boosted_trees_pb2.TreeEnsemble()
+      ensemble_proto.ParseFromString(serialized)
+      self.assertProtoEquals(expected_third, ensemble_proto)
+
+  def testTrainRegressorNonInMemory(self):
+    ops.reset_default_graph()
+    expected_first, expected_second, expected_third = (
+        self._get_expected_ensembles_for_regression())
+    with self.test_session() as sess:
+      # Train without train_in_memory mode.
+      with sess.graph.as_default():
+        train_op, ensemble_serialized = self._get_train_op_and_ensemble(
+            boosted_trees._create_regression_head(label_dimension=1),
+            run_config.RunConfig(),
+            is_classification=False,
+            train_in_memory=False)
+      _, serialized = sess.run([train_op, ensemble_serialized])
+      # Validate the trained ensemble.
+      ensemble_proto = boosted_trees_pb2.TreeEnsemble()
+      ensemble_proto.ParseFromString(serialized)
+      self.assertProtoEquals(expected_first, ensemble_proto)
+
+      # Run one more time and validate the trained ensemble.
+      _, serialized = sess.run([train_op, ensemble_serialized])
+      ensemble_proto = boosted_trees_pb2.TreeEnsemble()
+      ensemble_proto.ParseFromString(serialized)
+      self.assertProtoEquals(expected_second, ensemble_proto)
+
+      # Third round training and validation.
+      _, serialized = sess.run([train_op, ensemble_serialized])
+      ensemble_proto = boosted_trees_pb2.TreeEnsemble()
+      ensemble_proto.ParseFromString(serialized)
+      self.assertProtoEquals(expected_third, ensemble_proto)
+
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/python/estimator/canned/dnn_linear_combined.py b/tensorflow/python/estimator/canned/dnn_linear_combined.py
index 6d0fb96057ee93964ee3571bae3b878faad88882..f47706db2fc5f9baa38a36790832a958d5098587 100644
--- a/tensorflow/python/estimator/canned/dnn_linear_combined.py
+++ b/tensorflow/python/estimator/canned/dnn_linear_combined.py
@@ -31,10 +31,10 @@ from tensorflow.python.framework import ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import nn
 from tensorflow.python.ops import partitioned_variables
-from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops.losses import losses
 from tensorflow.python.summary import summary
+from tensorflow.python.training import distribute as distribute_lib
 from tensorflow.python.training import sync_replicas_optimizer
 from tensorflow.python.training import training_util
 from tensorflow.python.util.tf_export import tf_export
@@ -215,8 +215,7 @@ def _dnn_linear_combined_model_fn(features,
 
     train_op = control_flow_ops.group(*train_ops)
     with ops.control_dependencies([train_op]):
-      with ops.colocate_with(global_step):
-        return state_ops.assign_add(global_step, 1)
+      return distribute_lib.increment_var(global_step)
 
   return head.create_estimator_spec(
       features=features,
diff --git a/tensorflow/python/estimator/canned/dnn_testing_utils.py b/tensorflow/python/estimator/canned/dnn_testing_utils.py
index 44545c058c673d00f16c4276dc42cdbf4ca188e4..62b13c3200dd782c14dc427b62f9b03086c7174f 100644
--- a/tensorflow/python/estimator/canned/dnn_testing_utils.py
+++ b/tensorflow/python/estimator/canned/dnn_testing_utils.py
@@ -44,13 +44,13 @@ from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn
 from tensorflow.python.ops import partitioned_variables
-from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables as variables_lib
 from tensorflow.python.platform import test
 from tensorflow.python.summary import summary as summary_lib
 from tensorflow.python.summary.writer import writer_cache
 from tensorflow.python.training import checkpoint_utils
+from tensorflow.python.training import distribute as distribute_lib
 from tensorflow.python.training import gradient_descent
 from tensorflow.python.training import monitored_session
 from tensorflow.python.training import optimizer as optimizer_lib
@@ -196,7 +196,7 @@ def mock_optimizer(testcase, hidden_units, expected_loss=None):
     testcase.assertEquals(0, loss.shape.ndims)
     if expected_loss is None:
       if global_step is not None:
-        return state_ops.assign_add(global_step, 1).op
+        return distribute_lib.increment_var(global_step)
       return control_flow_ops.no_op()
     assert_loss = assert_close(
         math_ops.to_float(expected_loss, name='expected'),
@@ -204,7 +204,7 @@ def mock_optimizer(testcase, hidden_units, expected_loss=None):
         name='assert_loss')
     with ops.control_dependencies((assert_loss,)):
       if global_step is not None:
-        return state_ops.assign_add(global_step, 1).op
+        return distribute_lib.increment_var(global_step)
       return control_flow_ops.no_op()
 
   optimizer_mock = test.mock.NonCallableMagicMock(
diff --git a/tensorflow/python/estimator/canned/head.py b/tensorflow/python/estimator/canned/head.py
index bb033d349534e044b2b92d064051ee5fa07f4d62..189b81aeea80a987b5d9d70c0c5360703dd5a3f5 100644
--- a/tensorflow/python/estimator/canned/head.py
+++ b/tensorflow/python/estimator/canned/head.py
@@ -57,8 +57,8 @@ _PREDICT_SERVING_KEY = 'predict'
 
 # A LossSpec contains
 # * a scalar `Tensor` representing reduced weighted training loss
-# * a scalar `Tensor` representing the unreduced unweighted loss
-# * a scalar `Tensor` representing the example weights
+# * a `Tensor` representing the unreduced unweighted loss
+# * a `Tensor` representing the example weights
 # * possibly processed labels (e.g. vocabulary lookup, shape manipulation, etc)
 LossSpec = collections.namedtuple(
     'LossSpec', ['training_loss', 'unreduced_loss', 'weights',
@@ -163,8 +163,8 @@ class _Head(object):
     Returns:
       A LossSpec that contains
       * the scalar `Tensor` representing reduced weighted training loss
-      * the scalar `Tensor` representing the unreduced unweighted loss
-      * the scalar `Tensor` representing the example weights
+      * the `Tensor` representing the unreduced unweighted loss
+      * the `Tensor` representing the example weights
       * possibly processed labels (e.g. vocabulary lookup, shape manipulation,
         etc.)
 
diff --git a/tensorflow/python/estimator/canned/linear_testing_utils.py b/tensorflow/python/estimator/canned/linear_testing_utils.py
index da3ce86999b32e081eb8f12bbd9f7a4599ed4eaa..0e6436b42143f4b136165d47c41e143dacb4d476 100644
--- a/tensorflow/python/estimator/canned/linear_testing_utils.py
+++ b/tensorflow/python/estimator/canned/linear_testing_utils.py
@@ -47,13 +47,13 @@ from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import parsing_ops
 from tensorflow.python.ops import partitioned_variables
-from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables as variables_lib
 from tensorflow.python.platform import gfile
 from tensorflow.python.platform import test
 from tensorflow.python.summary.writer import writer_cache
 from tensorflow.python.training import checkpoint_utils
+from tensorflow.python.training import distribute as distribute_lib
 from tensorflow.python.training import gradient_descent
 from tensorflow.python.training import input as input_lib
 from tensorflow.python.training import optimizer as optimizer_lib
@@ -682,7 +682,7 @@ class BaseLinearRegressorTrainingTest(object):
       self.assertEquals(0, loss.shape.ndims)
       if expected_loss is None:
         if global_step is not None:
-          return state_ops.assign_add(global_step, 1).op
+          return distribute_lib.increment_var(global_step)
         return control_flow_ops.no_op()
       assert_loss = assert_close(
           math_ops.to_float(expected_loss, name='expected'),
@@ -690,7 +690,7 @@ class BaseLinearRegressorTrainingTest(object):
           name='assert_loss')
       with ops.control_dependencies((assert_loss,)):
         if global_step is not None:
-          return state_ops.assign_add(global_step, 1).op
+          return distribute_lib.increment_var(global_step)
         return control_flow_ops.no_op()
 
     mock_optimizer = test.mock.NonCallableMock(
@@ -905,13 +905,13 @@ class BaseLinearClassifierTrainingTest(object):
       # Verify loss. We can't check the value directly, so we add an assert op.
       self.assertEquals(0, loss.shape.ndims)
       if expected_loss is None:
-        return state_ops.assign_add(global_step, 1).op
+        return distribute_lib.increment_var(global_step)
       assert_loss = assert_close(
           math_ops.to_float(expected_loss, name='expected'),
           loss,
           name='assert_loss')
       with ops.control_dependencies((assert_loss,)):
-        return state_ops.assign_add(global_step, 1).op
+        return distribute_lib.increment_var(global_step)
 
     mock_optimizer = test.mock.NonCallableMock(
         spec=optimizer_lib.Optimizer,
diff --git a/tensorflow/python/estimator/estimator.py b/tensorflow/python/estimator/estimator.py
index 6a4132bca2cb9f14984b39462d00cf68e4e4ae62..301a3606360bd7a628427d1475a6b45ab1df1553 100644
--- a/tensorflow/python/estimator/estimator.py
+++ b/tensorflow/python/estimator/estimator.py
@@ -41,8 +41,11 @@ from tensorflow.python.estimator.export.export import get_temp_export_dir
 from tensorflow.python.estimator.export.export import get_timestamped_export_dir
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
+from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import metrics as metrics_lib
+from tensorflow.python.ops import resources
+from tensorflow.python.ops import variables
 from tensorflow.python.platform import gfile
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.saved_model import builder as saved_model_builder
@@ -50,6 +53,7 @@ from tensorflow.python.saved_model import tag_constants
 from tensorflow.python.summary import summary
 from tensorflow.python.summary.writer import writer_cache
 from tensorflow.python.training import device_setter
+from tensorflow.python.training import distribute as distribute_lib
 from tensorflow.python.training import evaluation
 from tensorflow.python.training import monitored_session
 from tensorflow.python.training import saver
@@ -183,6 +187,9 @@ class Estimator(object):
             config)
       self._config = config
 
+    # The distribute field contains an instance of DistributionStrategy.
+    self._distribution = self._config.train_distribute
+
     # Model directory.
     model_dir = compat_internal.path_to_str(model_dir)
     if (model_dir is not None) and (self._config.model_dir is not None):
@@ -682,11 +689,26 @@ class Estimator(object):
   def _get_features_and_labels_from_input_fn(self, input_fn, mode):
     """Extracts the `features` and labels from return values of `input_fn`."""
     result = self._call_input_fn(input_fn, mode)
+    # TODO(anjalisridhar): What about the default DistributionStrategy? Perhaps
+    # using any input is alright in that case. There is also a
+    # has_dataset_or_queue_runner function that we may want to extend and use.
+    if (self._distribution is not None and
+        not isinstance(result, dataset_ops.Dataset) and
+        mode == model_fn_lib.ModeKeys.TRAIN):
+      raise ValueError('input_fn() must return a tf.data.Dataset when using a '
+                       'DistributionStrategy.')
     input_hooks = []
     if isinstance(result, dataset_ops.Dataset):
-      iterator = result.make_initializable_iterator()
-      input_hooks.append(_DatasetInitializerHook(iterator))
-      result = iterator.get_next()
+      if self._distribution is not None and mode == model_fn_lib.ModeKeys.TRAIN:
+        # TODO(josh11b): This is currently using a one-shot iterator, we
+        # will update this to an initializeable iterator once the
+        # necessory support for creating an initializable iterator is
+        # available.
+        result = self._distribution.distribute_dataset(result).get_next()
+      else:
+        iterator = result.make_initializable_iterator()
+        input_hooks.append(_DatasetInitializerHook(iterator))
+        result = iterator.get_next()
     if isinstance(result, (list, tuple)):
       if len(result) != 2:
         raise ValueError(
@@ -701,7 +723,7 @@ class Estimator(object):
       batch_length = batch_length or value.shape[0]
       if value.shape[0] != batch_length:
         raise ValueError('Batch length of predictions should be same. %s has '
-                         'different batch length then others.' % key)
+                         'different batch length than others.' % key)
     return batch_length
 
   def _extract_keys(self, predictions, predict_keys):
@@ -815,6 +837,12 @@ class Estimator(object):
     return model_fn_results
 
   def _train_model(self, input_fn, hooks, saving_listeners):
+    if self._distribution:
+      return self._train_model_distributed(input_fn, hooks, saving_listeners)
+    else:
+      return self._train_model_default(input_fn, hooks, saving_listeners)
+
+  def _train_model_default(self, input_fn, hooks, saving_listeners):
     worker_hooks = []
     with ops.Graph().as_default() as g, g.device(self._device_fn):
       random_seed.set_random_seed(self._config.tf_random_seed)
@@ -826,86 +854,210 @@ class Estimator(object):
       worker_hooks.extend(input_hooks)
       estimator_spec = self._call_model_fn(
           features, labels, model_fn_lib.ModeKeys.TRAIN, self.config)
+      return self._train_with_estimator_spec(estimator_spec, worker_hooks,
+                                             hooks, global_step_tensor,
+                                             saving_listeners)
 
-      if self._warm_start_settings:
-        logging.info('Warm-starting with WarmStartSettings: %s' %
-                     (self._warm_start_settings,))
-        # pylint: disable=protected-access
-        warm_starting_util.warm_start(*self._warm_start_settings)
-        # pylint: enable=protected-access
-      # Check if the user created a loss summary, and add one if they didn't.
-      # We assume here that the summary is called 'loss'. If it is not, we will
-      # make another one with the name 'loss' to ensure it shows up in the right
-      # graph in TensorBoard.
-      if not any([x.op.name == 'loss'
-                  for x in ops.get_collection(ops.GraphKeys.SUMMARIES)]):
-        summary.scalar('loss', estimator_spec.loss)
-      ops.add_to_collection(ops.GraphKeys.LOSSES, estimator_spec.loss)
-      worker_hooks.extend(hooks)
-      worker_hooks.extend([
-          training.NanTensorHook(estimator_spec.loss),
-          training.LoggingTensorHook(
-              {
-                  'loss': estimator_spec.loss,
-                  'step': global_step_tensor
-              },
-              every_n_iter=self._config.log_step_count_steps)
-      ])
-      worker_hooks.extend(estimator_spec.training_hooks)
-
-      if not (estimator_spec.scaffold.saver or
-              ops.get_collection(ops.GraphKeys.SAVERS)):
-        ops.add_to_collection(
-            ops.GraphKeys.SAVERS,
-            training.Saver(
-                sharded=True,
-                max_to_keep=self._config.keep_checkpoint_max,
-                keep_checkpoint_every_n_hours=(
-                    self._config.keep_checkpoint_every_n_hours),
-                defer_build=True,
-                save_relative_paths=True))
-
-      chief_hooks = []
-      all_hooks = worker_hooks + list(estimator_spec.training_chief_hooks)
-      saver_hooks = [
-          h for h in all_hooks if isinstance(h, training.CheckpointSaverHook)]
-      if (self._config.save_checkpoints_secs or
-          self._config.save_checkpoints_steps):
-        if not saver_hooks:
-          chief_hooks = [
-              training.CheckpointSaverHook(
-                  self._model_dir,
-                  save_secs=self._config.save_checkpoints_secs,
-                  save_steps=self._config.save_checkpoints_steps,
-                  scaffold=estimator_spec.scaffold)
-          ]
-          saver_hooks = [chief_hooks[0]]
-      if saving_listeners:
-        if not saver_hooks:
-          raise ValueError(
-              'There should be a CheckpointSaverHook to use saving_listeners. '
-              'Please set one of the RunConfig.save_checkpoints_steps or '
-              'RunConfig.save_checkpoints_secs.')
+  def _train_model_distributed(self, input_fn, hooks, saving_listeners):
+    self._distribution.configure(self._session_config)
+    worker_hooks = []
+    with ops.Graph().as_default() as g:
+      with self._distribution.scope():
+        random_seed.set_random_seed(self._config.tf_random_seed)
+        features, labels, input_hooks = (
+            self._get_features_and_labels_from_input_fn(
+                input_fn, model_fn_lib.ModeKeys.TRAIN))
+        worker_hooks.extend(input_hooks)
+        global_step_tensor = self._create_and_assert_global_step(g)
+        # The default destination for the global_step_tensor fetch call is the
+        # CPU.
+        global_step_read_tensor = self._distribution.fetch(global_step_tensor)
+        # we want to add to the global collection in the main thread not the
+        # tower threads.
+        ops.add_to_collection(training_util.GLOBAL_STEP_READ_KEY,
+                              global_step_read_tensor)
+        grouped_estimator_spec = self._distribution.call_for_each_tower(
+            self._call_model_fn,
+            features,
+            labels,  # although this will be None it seems
+            model_fn_lib.ModeKeys.TRAIN,
+            self.config)
+
+        # TODO(anjalisridhar): Figure out how to resolve the folowing scaffold
+        # parameters: init_feed_dict, init_fn.
+        scaffold_list = self._distribution.unwrap(
+            grouped_estimator_spec.scaffold)
+        init_feed_dict = [
+            s.init_feed_dict
+            for s in scaffold_list
+            if s.init_feed_dict is not None
+        ]
+        if init_feed_dict:
+          init_feed_dict = self._distribution.group(init_feed_dict)
         else:
-          # It is expected to have one CheckpointSaverHook. If multiple, we pick
-          # up the first one to add listener.
-          saver_hooks[0]._listeners.extend(saving_listeners)  # pylint: disable=protected-access
-      with training.MonitoredTrainingSession(
-          master=self._config.master,
-          is_chief=self._config.is_chief,
-          checkpoint_dir=self._model_dir,
-          scaffold=estimator_spec.scaffold,
-          hooks=worker_hooks,
-          chief_only_hooks=(
-              tuple(chief_hooks) + tuple(estimator_spec.training_chief_hooks)),
-          save_checkpoint_secs=0,  # Saving is handled by a hook.
-          save_summaries_steps=self._config.save_summary_steps,
-          config=self._session_config,
-          log_step_count_steps=self._config.log_step_count_steps) as mon_sess:
-        loss = None
-        while not mon_sess.should_stop():
-          _, loss = mon_sess.run([estimator_spec.train_op, estimator_spec.loss])
-      return loss
+          init_feed_dict = None
+
+        init_fn = [s.init_fn for s in scaffold_list if s.init_fn is not None]
+        if init_fn:
+          init_fn = self._distribution.group(init_fn)
+        else:
+          init_fn = None
+
+        init_op = [s.init_op for s in scaffold_list if s.init_op is not None]
+        if init_op:
+          init_op = self._distribution.group(init_op)
+        else:
+          init_op = None
+
+        ready_op = self._distribution.call_for_each_tower(
+            create_per_tower_ready_op, grouped_estimator_spec.scaffold)
+        if ready_op is not None:
+          ready_op = self._distribution.group(ready_op)
+        else:
+          ready_op = None
+
+        ready_for_local_init_op = self._distribution.call_for_each_tower(
+            create_per_tower_ready_for_local_init_op,
+            grouped_estimator_spec.scaffold)
+        if ready_for_local_init_op is not None:
+          ready_for_local_init_op = self._distribution.group(
+              ready_for_local_init_op)
+        else:
+          ready_for_local_init_op = None
+
+        local_init_op = [
+            s.local_init_op
+            for s in scaffold_list
+            if s.local_init_op is not None
+        ]
+        if local_init_op:
+          local_init_op = self._distribution.group(local_init_op)
+        else:
+          local_init_op = None
+
+        summary_op = [
+            s.summary_op for s in scaffold_list if s.summary_op is not None
+        ]
+        if summary_op:
+          summary_op = self._distribution.group(summary_op)
+        else:
+          summary_op = None
+
+        scaffold = monitored_session.Scaffold(
+            init_op=init_op,
+            ready_op=ready_op,
+            ready_for_local_init_op=ready_for_local_init_op,
+            local_init_op=local_init_op,
+            summary_op=summary_op,
+            init_feed_dict=init_feed_dict,
+            init_fn=init_fn)
+
+        def get_hooks_from_the_first_device(per_device_hooks):
+          hooks_list = self._distribution.unwrap(per_device_hooks)
+          assert hooks_list
+          return hooks_list[0]
+
+        training_hooks = get_hooks_from_the_first_device(
+            grouped_estimator_spec.training_hooks)
+        training_chief_hooks = get_hooks_from_the_first_device(
+            grouped_estimator_spec.training_chief_hooks)
+
+        estimator_spec = model_fn_lib.EstimatorSpec(
+            mode=grouped_estimator_spec.mode,
+            loss=self._distribution.unwrap(
+                self._distribution.reduce(distribute_lib.get_loss_reduction(),
+                                          grouped_estimator_spec.loss,
+                                          destinations='/device:CPU:0'))[0],
+            train_op=self._distribution.group(grouped_estimator_spec.train_op),
+            training_hooks=training_hooks,
+            training_chief_hooks=training_chief_hooks,
+            scaffold=scaffold)
+        return self._train_with_estimator_spec(estimator_spec, worker_hooks,
+                                               hooks, global_step_read_tensor,
+                                               saving_listeners)
+
+  def _train_with_estimator_spec(self, estimator_spec, worker_hooks, hooks,
+                                 global_step_tensor, saving_listeners):
+    """Train a model with the given Estimator Spec."""
+    if self._warm_start_settings:
+      logging.info('Warm-starting with WarmStartSettings: %s' %
+                   (self._warm_start_settings,))
+      # pylint: disable=protected-access
+      warm_starting_util.warm_start(*self._warm_start_settings)
+      # pylint: enable=protected-access
+    # Check if the user created a loss summary, and add one if they didn't.
+    # We assume here that the summary is called 'loss'. If it is not, we will
+    # make another one with the name 'loss' to ensure it shows up in the right
+    # graph in TensorBoard.
+    if not any([x.op.name == 'loss'
+                for x in ops.get_collection(ops.GraphKeys.SUMMARIES)]):
+      summary.scalar('loss', estimator_spec.loss)
+    ops.add_to_collection(ops.GraphKeys.LOSSES, estimator_spec.loss)
+    worker_hooks.extend(hooks)
+    worker_hooks.extend([
+        training.NanTensorHook(estimator_spec.loss),
+        training.LoggingTensorHook(
+            {
+                'loss': estimator_spec.loss,
+                'step': global_step_tensor
+            },
+            every_n_iter=self._config.log_step_count_steps)
+    ])
+    worker_hooks.extend(estimator_spec.training_hooks)
+
+    if not (estimator_spec.scaffold.saver or
+            ops.get_collection(ops.GraphKeys.SAVERS)):
+      ops.add_to_collection(
+          ops.GraphKeys.SAVERS,
+          training.Saver(
+              sharded=True,
+              max_to_keep=self._config.keep_checkpoint_max,
+              keep_checkpoint_every_n_hours=(
+                  self._config.keep_checkpoint_every_n_hours),
+              defer_build=True,
+              save_relative_paths=True))
+
+    chief_hooks = []
+    all_hooks = worker_hooks + list(estimator_spec.training_chief_hooks)
+    saver_hooks = [
+        h for h in all_hooks if isinstance(h, training.CheckpointSaverHook)]
+    if (self._config.save_checkpoints_secs or
+        self._config.save_checkpoints_steps):
+      if not saver_hooks:
+        chief_hooks = [
+            training.CheckpointSaverHook(
+                self._model_dir,
+                save_secs=self._config.save_checkpoints_secs,
+                save_steps=self._config.save_checkpoints_steps,
+                scaffold=estimator_spec.scaffold)
+        ]
+        saver_hooks = [chief_hooks[0]]
+    if saving_listeners:
+      if not saver_hooks:
+        raise ValueError(
+            'There should be a CheckpointSaverHook to use saving_listeners. '
+            'Please set one of the RunConfig.save_checkpoints_steps or '
+            'RunConfig.save_checkpoints_secs.')
+      else:
+        # It is expected to have one CheckpointSaverHook. If multiple, we pick
+        # up the first one to add listener.
+        saver_hooks[0]._listeners.extend(saving_listeners)  # pylint: disable=protected-access
+    with training.MonitoredTrainingSession(
+        master=self._config.master,
+        is_chief=self._config.is_chief,
+        checkpoint_dir=self._model_dir,
+        scaffold=estimator_spec.scaffold,
+        hooks=worker_hooks,
+        chief_only_hooks=(
+            tuple(chief_hooks) + tuple(estimator_spec.training_chief_hooks)),
+        save_checkpoint_secs=0,  # Saving is handled by a hook.
+        save_summaries_steps=self._config.save_summary_steps,
+        config=self._session_config,
+        log_step_count_steps=self._config.log_step_count_steps) as mon_sess:
+      loss = None
+      while not mon_sess.should_stop():
+        _, loss = mon_sess.run([estimator_spec.train_op, estimator_spec.loss])
+    return loss
 
   def _evaluate_model(self,
                       input_fn,
@@ -972,6 +1124,35 @@ class Estimator(object):
     return eval_results
 
 
+def create_per_tower_ready_op(scaffold):
+  """Create a Scaffold.ready_op inside a tower."""
+  if scaffold.ready_op:
+    return scaffold.ready_op
+
+  def default_ready_op():
+    return array_ops.concat([
+        variables.report_uninitialized_variables(),
+        resources.report_uninitialized_resources()
+    ], 0)
+
+  return monitored_session.Scaffold.get_or_default(
+      'ready_op', ops.GraphKeys.READY_OP, default_ready_op)
+
+
+def create_per_tower_ready_for_local_init_op(scaffold):
+  """Create a Scaffold.ready_for_local_init_op inside a tower."""
+  if scaffold.ready_for_local_init_op:
+    return scaffold.ready_for_local_init_op
+
+  def default_ready_for_local_init_op():
+    return variables.report_uninitialized_variables(
+        variables.global_variables())
+
+  return monitored_session.Scaffold.get_or_default(
+      'ready_for_local_init_op', ops.GraphKeys.READY_FOR_LOCAL_INIT_OP,
+      default_ready_for_local_init_op)
+
+
 def _check_checkpoint_available(model_dir):
   latest_path = saver.latest_checkpoint(model_dir)
   if not latest_path:
diff --git a/tensorflow/python/estimator/estimator_lib.py b/tensorflow/python/estimator/estimator_lib.py
index be8930b3cbcd89dbb31dffde0a7a5ecfb64fcd8b..60c59cbc183ccde936384e25da3d8bf44316f712 100644
--- a/tensorflow/python/estimator/estimator_lib.py
+++ b/tensorflow/python/estimator/estimator_lib.py
@@ -21,6 +21,8 @@ from __future__ import print_function
 # pylint: disable=unused-import,line-too-long,wildcard-import
 from tensorflow.python.estimator.canned.baseline import BaselineClassifier
 from tensorflow.python.estimator.canned.baseline import BaselineRegressor
+from tensorflow.python.estimator.canned.boosted_trees import BoostedTreesClassifier
+from tensorflow.python.estimator.canned.boosted_trees import BoostedTreesRegressor
 from tensorflow.python.estimator.canned.dnn import DNNClassifier
 from tensorflow.python.estimator.canned.dnn import DNNRegressor
 from tensorflow.python.estimator.canned.dnn_linear_combined import DNNLinearCombinedClassifier
@@ -52,6 +54,8 @@ _allowed_symbols = [
     # Canned Estimators
     'BaselineClassifier',
     'BaselineRegressor',
+    'BoostedTreesClassifier',
+    'BoostedTreesRegressor',
     'DNNClassifier',
     'DNNRegressor',
     'DNNLinearCombinedClassifier',
diff --git a/tensorflow/python/estimator/run_config.py b/tensorflow/python/estimator/run_config.py
index 141eaeff649414412a4277f8945dcb4445985170..dab442aeda61e10d5020a2d9f4766d6629e674ed 100644
--- a/tensorflow/python/estimator/run_config.py
+++ b/tensorflow/python/estimator/run_config.py
@@ -44,7 +44,7 @@ _DEFAULT_REPLACEABLE_LIST = [
     'keep_checkpoint_max',
     'keep_checkpoint_every_n_hours',
     'log_step_count_steps',
-    'distribute'
+    'train_distribute'
 ]
 
 _SAVE_CKPT_ERR = (
@@ -302,7 +302,7 @@ class RunConfig(object):
                keep_checkpoint_max=5,
                keep_checkpoint_every_n_hours=10000,
                log_step_count_steps=100,
-               distribute=None):
+               train_distribute=None):
     """Constructs a RunConfig.
 
     All distributed training related properties `cluster_spec`, `is_chief`,
@@ -316,7 +316,7 @@ class RunConfig(object):
     a list of task addresses.
 
     `task` has two attributes: `type` and `index`, where `type` can be any of
-    the task types in `cluster`. ` When `TF_CONFIG` contains said information,
+    the task types in `cluster`. When `TF_CONFIG` contains said information,
     the following properties are set on this class:
 
     * `cluster_spec` is parsed from `TF_CONFIG['cluster']`. Defaults to {}. If
@@ -426,10 +426,10 @@ class RunConfig(object):
         the feature.
       log_step_count_steps: The frequency, in number of global steps, that the
         global step/sec and the loss will be logged during training.
-      distribute: an optional instance of
+      train_distribute: an optional instance of
         `tf.contrib.distribute.DistributionStrategy`. If specified,
-        then Estimator will distribute the user's model according to the policy
-        specified by that strategy.
+        then Estimator will distribute the user's model during training,
+        according to the policy specified by that strategy.
 
     Raises:
       ValueError: If both `save_checkpoints_steps` and `save_checkpoints_secs`
@@ -466,7 +466,7 @@ class RunConfig(object):
         keep_checkpoint_max=keep_checkpoint_max,
         keep_checkpoint_every_n_hours=keep_checkpoint_every_n_hours,
         log_step_count_steps=log_step_count_steps,
-        distribute=distribute)
+        train_distribute=train_distribute)
 
     self._init_distributed_setting_from_environment_var(tf_config)
 
@@ -678,17 +678,17 @@ class RunConfig(object):
     return self._service
 
   @property
-  def distribute(self):
+  def train_distribute(self):
     """Returns the optional `tf.contrib.distribute.DistributionStrategy` object.
     """
-    return self._distribute
+    return self._train_distribute
 
   def replace(self, **kwargs):
     """Returns a new instance of `RunConfig` replacing specified properties.
 
     Only the properties in the following list are allowed to be replaced:
 
-      - `model_dir`.
+      - `model_dir`,
       - `tf_random_seed`,
       - `save_summary_steps`,
       - `save_checkpoints_steps`,
@@ -697,6 +697,7 @@ class RunConfig(object):
       - `keep_checkpoint_max`,
       - `keep_checkpoint_every_n_hours`,
       - `log_step_count_steps`,
+      - `train_distribute`.
 
     In addition, either `save_checkpoints_steps` or `save_checkpoints_secs`
     can be set (should not be both).
diff --git a/tensorflow/python/estimator/training.py b/tensorflow/python/estimator/training.py
index e38b765da52a7b6957a4fb8a02087c5d1fd5a781..9d271758f635869730c03bc2ac853b3493ec0cae 100644
--- a/tensorflow/python/estimator/training.py
+++ b/tensorflow/python/estimator/training.py
@@ -137,7 +137,7 @@ class TrainSpec(
           * A tuple (features, labels): Where features is a `Tensor` or a
             dictionary of string feature name to `Tensor` and labels is a
             `Tensor` or a dictionary of string label name to `Tensor`.
-            
+
       max_steps: Int. Positive number of total steps for which to train model.
         If `None`, train forever. The training `input_fn` is not expected to
         generate `OutOfRangeError` or `StopIteration` exceptions. See the
@@ -334,7 +334,8 @@ def train_and_evaluate(estimator, train_spec, eval_spec):
   can read and write). The only extra work to do is setting the environment
   variable `TF_CONFIG` properly for each worker correspondingly.
 
-  Also see: https://www.tensorflow.org/deploy/distributed
+  Also see
+  [Distributed TensorFlow](https://www.tensorflow.org/deploy/distributed).
 
   Setting environment variable depends on the platform. For example, on Linux,
   it can be done as follows (`$` is the shell prompt):
diff --git a/tensorflow/python/feature_column/BUILD b/tensorflow/python/feature_column/BUILD
index 238a90b67d9d0039c25a6f3800aad25a2db9e36f..295d4ca094cc8cb85c0f1f7fd47c20b910c270df 100644
--- a/tensorflow/python/feature_column/BUILD
+++ b/tensorflow/python/feature_column/BUILD
@@ -6,18 +6,6 @@ licenses(["notice"])  # Apache 2.0
 
 load("//tensorflow:tensorflow.bzl", "py_test")
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 py_library(
     name = "feature_column_py",
     srcs = ["feature_column_lib.py"],
@@ -45,6 +33,7 @@ py_library(
         "//tensorflow/python:nn_ops",
         "//tensorflow/python:parsing_ops",
         "//tensorflow/python:platform",
+        "//tensorflow/python:resource_variable_ops",
         "//tensorflow/python:sparse_ops",
         "//tensorflow/python:sparse_tensor",
         "//tensorflow/python:string_ops",
@@ -54,6 +43,7 @@ py_library(
         "//tensorflow/python:util",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python:variables",
+        "//tensorflow/python/keras",
         "@six_archive//:six",
     ],
 )
diff --git a/tensorflow/python/feature_column/feature_column.py b/tensorflow/python/feature_column/feature_column.py
index 7d99fcb3e79318c2fecabaa9bdd0347aa67cf309..f9201a4794f78ec94e3901b14c25aca61f932d86 100644
--- a/tensorflow/python/feature_column/feature_column.py
+++ b/tensorflow/python/feature_column/feature_column.py
@@ -139,6 +139,8 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor as sparse_tensor_lib
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.keras._impl.keras.engine import training
+from tensorflow.python.layers import base
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_ops
@@ -148,6 +150,7 @@ from tensorflow.python.ops import lookup_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import parsing_ops
+from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import sparse_ops
 from tensorflow.python.ops import string_ops
 from tensorflow.python.ops import template
@@ -451,13 +454,163 @@ def linear_model(features,
     if cols_to_vars is not None:
       # Add the bias to cols_to_vars as well, converting the Variable or
       # PartitionedVariable to a list of Variable's.
-      if isinstance(bias, variables.Variable):
+      if (isinstance(bias, variables.Variable) or
+          resource_variable_ops.is_resource_variable(bias)):
         cols_to_vars['bias'] = [bias]
       else:  # Must be a PartitionedVariable.
         cols_to_vars['bias'] = list(bias)
     return predictions
 
 
+class _FCLinearWrapper(base.Layer):
+  """Wraps a _FeatureColumn in a layer for use in a linear model.
+
+  See `linear_model` above.
+  """
+
+  def __init__(self,
+               feature_column,
+               units=1,
+               sparse_combiner='sum',
+               weight_collections=None,
+               trainable=True,
+               name=None,
+               **kwargs):
+    super(_FCLinearWrapper, self).__init__(
+        trainable=trainable, name=name, **kwargs)
+    self._feature_column = feature_column
+    self._units = units
+    self._sparse_combiner = sparse_combiner
+    self._weight_collections = weight_collections
+    self._state = {}
+
+  def build(self, _):
+    self._state = self._feature_column._create_state(  # pylint: disable=protected-access
+        self._weight_collections, self.add_variable)
+
+    if isinstance(self._feature_column, _CategoricalColumn):
+      weight = self.add_variable(
+          name='weights',
+          shape=(self._feature_column._num_buckets, self._units),  # pylint: disable=protected-access
+          initializer=init_ops.zeros_initializer(),
+          trainable=self.trainable)
+    else:
+      num_elements = self._feature_column._variable_shape.num_elements()  # pylint: disable=protected-access
+      weight = self.add_variable(
+          name='weights',
+          shape=[num_elements, self._units],
+          initializer=init_ops.zeros_initializer(),
+          trainable=self.trainable)
+    ops.add_to_collections(self._weight_collections, weight)
+    self._weight_var = weight
+    self.built = True
+
+  def call(self, builder):
+    weighted_sum = _create_weighted_sum(
+        column=self._feature_column,
+        builder=builder,
+        units=self._units,
+        sparse_combiner=self._sparse_combiner,
+        weight_collections=self._weight_collections,
+        trainable=self.trainable,
+        weight_var=self._weight_var,
+        state=self._state)
+    return weighted_sum
+
+
+class _BiasLayer(base.Layer):
+  """A layer for the bias term.
+  """
+
+  def __init__(self,
+               units=1,
+               trainable=True,
+               weight_collections=None,
+               name=None,
+               **kwargs):
+    super(_BiasLayer, self).__init__(trainable=trainable, name=name, **kwargs)
+    self._units = units
+    self._weight_collections = weight_collections
+
+  def build(self, _):
+    self._bias_variable = self.add_variable(
+        'bias_weights',
+        shape=[self._units],
+        initializer=init_ops.zeros_initializer(),
+        trainable=self.trainable)
+    ops.add_to_collections(self._weight_collections, self._bias_variable)
+    self.built = True
+
+  def call(self, _):
+    return self._bias_variable
+
+
+class _LinearModel(training.Model):
+  """Creates a linear model using feature columns.
+  """
+
+  def __init__(self,
+               feature_columns,
+               units=1,
+               sparse_combiner='sum',
+               weight_collections=None,
+               trainable=True,
+               name=None,
+               **kwargs):
+    super(_LinearModel, self).__init__(name=name, **kwargs)
+    self._feature_columns = _clean_feature_columns(feature_columns)
+    self._weight_collections = list(weight_collections or [])
+    if ops.GraphKeys.MODEL_VARIABLES not in self._weight_collections:
+      self._weight_collections.append(ops.GraphKeys.MODEL_VARIABLES)
+
+    column_layers = {}
+    for column in sorted(self._feature_columns, key=lambda x: x.name):
+      with variable_scope.variable_scope(
+          None, default_name=column._var_scope_name) as vs:  # pylint: disable=protected-access
+        column_name = vs.name
+      column_layer = _FCLinearWrapper(column, units, sparse_combiner,
+                                      self._weight_collections, trainable,
+                                      column_name, **kwargs)
+      column_layers[column_name] = column_layer
+    self._column_layers = self._add_layers(column_layers)
+    self._bias_layer = _BiasLayer(
+        units=units,
+        trainable=trainable,
+        weight_collections=self._weight_collections,
+        name='bias_layer',
+        **kwargs)
+
+  def call(self, features):
+    with variable_scope.variable_scope(self.name):
+      for column in self._feature_columns:
+        if not isinstance(column, (_DenseColumn, _CategoricalColumn)):
+          raise ValueError(
+              'Items of feature_columns must be either a '
+              '_DenseColumn or _CategoricalColumn. Given: {}'.format(column))
+      weighted_sums = []
+      ordered_columns = []
+      builder = _LazyBuilder(features)
+      for layer in sorted(self._column_layers.values(), key=lambda x: x.name):
+        ordered_columns.append(layer._feature_column)  # pylint: disable=protected-access
+        weighted_sum = layer(builder)
+        weighted_sums.append(weighted_sum)
+
+      _verify_static_batch_size_equality(weighted_sums, ordered_columns)
+      predictions_no_bias = math_ops.add_n(
+          weighted_sums, name='weighted_sum_no_bias')
+      predictions = nn_ops.bias_add(
+          predictions_no_bias, self._bias_layer(builder), name='weighted_sum')  # pylint: disable=not-callable
+    return predictions
+
+  def _add_layers(self, layers):
+    # "Magic" required for keras.Model classes to track all the variables in
+    # a list of layers.Layer objects.
+    # TODO(ashankar): Figure out API so user code doesn't have to do this.
+    for name, layer in layers.items():
+      setattr(self, 'layer-%s' % name, layer)
+    return layers
+
+
 def _transform_features(features, feature_columns):
   """Returns transformed features based on features columns passed in.
 
@@ -1641,6 +1794,19 @@ class _FeatureColumn(object):
     """
     pass
 
+  def _create_state(self, weight_collections=None, creator=None):
+    """Returns an object that captures the state of the column.
+
+    Args:
+      weight_collections: Collections to add the variable to
+      creator: Variable creator method called, if provided.
+
+    Returns:
+      An object that encapsulates the state of the column. Can return None.
+    """
+    del weight_collections, creator  # Unused
+    return None
+
 
 class _DenseColumn(_FeatureColumn):
   """Represents a column which can be represented as `Tensor`.
@@ -1660,7 +1826,11 @@ class _DenseColumn(_FeatureColumn):
     pass
 
   @abc.abstractmethod
-  def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
+  def _get_dense_tensor(self,
+                        inputs,
+                        weight_collections=None,
+                        trainable=None,
+                        state=None):
     """Returns a `Tensor`.
 
     The output of this function will be used by model-builder-functions. For
@@ -1678,6 +1848,9 @@ class _DenseColumn(_FeatureColumn):
         will be created) are added.
       trainable: If `True` also add variables to the graph collection
         `GraphKeys.TRAINABLE_VARIABLES` (see @{tf.Variable}).
+      state: An object encapsulating the state of the column. Columns that
+        create state using the _create_state method would have that state
+        passed in to this method.
 
     Returns:
       `Tensor` of shape [batch_size] + `_variable_shape`.
@@ -1685,13 +1858,14 @@ class _DenseColumn(_FeatureColumn):
     pass
 
 
-def _create_weighted_sum(
-    column,
-    builder,
-    units,
-    sparse_combiner,
-    weight_collections,
-    trainable):
+def _create_weighted_sum(column,
+                         builder,
+                         units,
+                         sparse_combiner,
+                         weight_collections,
+                         trainable,
+                         weight_var=None,
+                         state=None):
   """Creates a weighted sum for a dense or sparse column for linear_model."""
   if isinstance(column, _CategoricalColumn):
     return _create_categorical_column_weighted_sum(
@@ -1700,32 +1874,50 @@ def _create_weighted_sum(
         units=units,
         sparse_combiner=sparse_combiner,
         weight_collections=weight_collections,
-        trainable=trainable)
+        trainable=trainable,
+        weight_var=weight_var)
   else:
     return _create_dense_column_weighted_sum(
         column=column,
         builder=builder,
         units=units,
         weight_collections=weight_collections,
-        trainable=trainable)
+        trainable=trainable,
+        weight_var=weight_var,
+        state=state)
 
 
-def _create_dense_column_weighted_sum(
-    column, builder, units, weight_collections, trainable):
+def _create_dense_column_weighted_sum(column,
+                                      builder,
+                                      units,
+                                      weight_collections,
+                                      trainable,
+                                      weight_var=None,
+                                      state=None):
   """Create a weighted sum of a dense column for linear_model."""
-  tensor = column._get_dense_tensor(  # pylint: disable=protected-access
-      builder,
-      weight_collections=weight_collections,
-      trainable=trainable)
+  if state is not None:
+    tensor = column._get_dense_tensor(  # pylint: disable=protected-access
+        builder,
+        weight_collections=weight_collections,
+        trainable=trainable,
+        state=state)
+  else:
+    tensor = column._get_dense_tensor(  # pylint: disable=protected-access
+        builder,
+        weight_collections=weight_collections,
+        trainable=trainable)
   num_elements = column._variable_shape.num_elements()  # pylint: disable=protected-access
   batch_size = array_ops.shape(tensor)[0]
   tensor = array_ops.reshape(tensor, shape=(batch_size, num_elements))
-  weight = variable_scope.get_variable(
-      name='weights',
-      shape=[num_elements, units],
-      initializer=init_ops.zeros_initializer(),
-      trainable=trainable,
-      collections=weight_collections)
+  if weight_var is not None:
+    weight = weight_var
+  else:
+    weight = variable_scope.get_variable(
+        name='weights',
+        shape=[num_elements, units],
+        initializer=init_ops.zeros_initializer(),
+        trainable=trainable,
+        collections=weight_collections)
   return math_ops.matmul(tensor, weight, name='weighted_sum')
 
 
@@ -1775,8 +1967,13 @@ class _CategoricalColumn(_FeatureColumn):
     pass
 
 
-def _create_categorical_column_weighted_sum(
-    column, builder, units, sparse_combiner, weight_collections, trainable):
+def _create_categorical_column_weighted_sum(column,
+                                            builder,
+                                            units,
+                                            sparse_combiner,
+                                            weight_collections,
+                                            trainable,
+                                            weight_var=None):
   """Create a weighted sum of a categorical column for linear_model."""
   sparse_tensors = column._get_sparse_tensors(  # pylint: disable=protected-access
       builder,
@@ -1790,12 +1987,15 @@ def _create_categorical_column_weighted_sum(
     weight_tensor = sparse_ops.sparse_reshape(
         weight_tensor, [array_ops.shape(weight_tensor)[0], -1])
 
-  weight = variable_scope.get_variable(
-      name='weights',
-      shape=(column._num_buckets, units),  # pylint: disable=protected-access
-      initializer=init_ops.zeros_initializer(),
-      trainable=trainable,
-      collections=weight_collections)
+  if weight_var is not None:
+    weight = weight_var
+  else:
+    weight = variable_scope.get_variable(
+        name='weights',
+        shape=(column._num_buckets, units),  # pylint: disable=protected-access
+        initializer=init_ops.zeros_initializer(),
+        trainable=trainable,
+        collections=weight_collections)
   return _safe_embedding_lookup_sparse(
       weight,
       id_tensor,
@@ -2193,8 +2393,33 @@ class _EmbeddingColumn(
       self._shape = tensor_shape.vector(self.dimension)
     return self._shape
 
-  def _get_dense_tensor_internal(
-      self, inputs, weight_collections=None, trainable=None):
+  def _create_state(self, weight_collections=None, creator=None):
+    variables_map = {}
+    embedding_shape = (self.categorical_column._num_buckets, self.dimension)  # pylint: disable=protected-access
+    if creator is not None:
+      embedding_weights = creator(
+          name='embedding_weights',
+          shape=embedding_shape,
+          dtype=dtypes.float32,
+          initializer=self.initializer,
+          trainable=self.trainable)
+      ops.add_to_collections(weight_collections, embedding_weights)
+    else:
+      embedding_weights = variable_scope.get_variable(
+          name='embedding_weights',
+          shape=embedding_shape,
+          dtype=dtypes.float32,
+          initializer=self.initializer,
+          trainable=self.trainable,
+          collections=weight_collections)
+    variables_map['embedding_weights'] = embedding_weights
+    return variables_map
+
+  def _get_dense_tensor_internal(self,
+                                 inputs,
+                                 weight_collections=None,
+                                 trainable=None,
+                                 state=None):
     """Private method that follows the signature of _get_dense_tensor."""
     # Get sparse IDs and weights.
     sparse_tensors = self.categorical_column._get_sparse_tensors(  # pylint: disable=protected-access
@@ -2202,14 +2427,10 @@ class _EmbeddingColumn(
     sparse_ids = sparse_tensors.id_tensor
     sparse_weights = sparse_tensors.weight_tensor
 
-    embedding_shape = (self.categorical_column._num_buckets, self.dimension)  # pylint: disable=protected-access
-    embedding_weights = variable_scope.get_variable(
-        name='embedding_weights',
-        shape=embedding_shape,
-        dtype=dtypes.float32,
-        initializer=self.initializer,
-        trainable=self.trainable and trainable,
-        collections=weight_collections)
+    if state is None:
+      state = self._create_state(weight_collections)
+    embedding_weights = state['embedding_weights']
+
     if self.ckpt_to_load_from is not None:
       to_restore = embedding_weights
       if isinstance(to_restore, variables.PartitionedVariable):
@@ -2227,7 +2448,11 @@ class _EmbeddingColumn(
         name='%s_weights' % self.name,
         max_norm=self.max_norm)
 
-  def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
+  def _get_dense_tensor(self,
+                        inputs,
+                        weight_collections=None,
+                        trainable=None,
+                        state=None):
     if isinstance(self.categorical_column, _SequenceCategoricalColumn):
       raise ValueError(
           'In embedding_column: {}. '
@@ -2240,8 +2465,10 @@ class _EmbeddingColumn(
               self.name, type(self.categorical_column),
               self.categorical_column))
     return self._get_dense_tensor_internal(
-        inputs=inputs, weight_collections=weight_collections,
-        trainable=trainable)
+        inputs=inputs,
+        weight_collections=weight_collections,
+        trainable=trainable,
+        state=state)
 
   def _get_sequence_dense_tensor(
       self, inputs, weight_collections=None, trainable=None):
@@ -2297,7 +2524,39 @@ class _SharedEmbeddingColumn(
       self._shape = tensor_shape.vector(self.dimension)
     return self._shape
 
-  def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
+  def _create_state(self, weight_collections=None, creator=None):
+    variables_map = {}
+    shared_embedding_collection = ops.get_collection(
+        self.shared_embedding_collection_name)
+    if not shared_embedding_collection:
+      embedding_shape = (self.categorical_column._num_buckets, self.dimension)  # pylint: disable=protected-access
+      if creator is not None:
+        embedding_weights = creator(
+            name='embedding_weights',
+            shape=embedding_shape,
+            dtype=dtypes.float32,
+            initializer=self.initializer,
+            trainable=self.trainable)
+        ops.add_to_collections(weight_collections, embedding_weights)
+      else:
+        embedding_weights = variable_scope.get_variable(
+            name='embedding_weights',
+            shape=embedding_shape,
+            dtype=dtypes.float32,
+            initializer=self.initializer,
+            trainable=self.trainable,
+            collections=weight_collections)
+      ops.add_to_collection(self.shared_embedding_collection_name,
+                            embedding_weights)
+      variables_map['embedding_weights'] = embedding_weights
+
+    return variables_map
+
+  def _get_dense_tensor(self,
+                        inputs,
+                        weight_collections=None,
+                        trainable=None,
+                        state=None):
     # This method is called from a variable_scope with name _var_scope_name,
     # which is shared among all shared embeddings. Open a name_scope here, so
     # that the ops for different columns have distinct names.
@@ -2889,6 +3148,9 @@ def _safe_embedding_lookup_sparse(embedding_weights,
 
     # Prune invalid ids and weights.
     sparse_ids, sparse_weights = _prune_invalid_ids(sparse_ids, sparse_weights)
+    if combiner != 'sum':
+      sparse_ids, sparse_weights = _prune_invalid_weights(
+          sparse_ids, sparse_weights)
 
     # Fill in dummy values for empty features, if necessary.
     sparse_ids, is_row_empty = sparse_ops.sparse_fill_empty_rows(sparse_ids,
@@ -2937,13 +3199,23 @@ def _prune_invalid_ids(sparse_ids, sparse_weights):
   is_id_valid = math_ops.greater_equal(sparse_ids.values, 0)
   if sparse_weights is not None:
     is_id_valid = math_ops.logical_and(
-        is_id_valid, math_ops.greater(sparse_weights.values, 0))
+        is_id_valid,
+        array_ops.ones_like(sparse_weights.values, dtype=dtypes.bool))
   sparse_ids = sparse_ops.sparse_retain(sparse_ids, is_id_valid)
   if sparse_weights is not None:
     sparse_weights = sparse_ops.sparse_retain(sparse_weights, is_id_valid)
   return sparse_ids, sparse_weights
 
 
+def _prune_invalid_weights(sparse_ids, sparse_weights):
+  """Prune invalid weights (< 0) from the input ids and weights."""
+  if sparse_weights is not None:
+    is_weights_valid = math_ops.greater(sparse_weights.values, 0)
+    sparse_ids = sparse_ops.sparse_retain(sparse_ids, is_weights_valid)
+    sparse_weights = sparse_ops.sparse_retain(sparse_weights, is_weights_valid)
+  return sparse_ids, sparse_weights
+
+
 class _IndicatorColumn(_DenseColumn, _SequenceDenseColumn,
                        collections.namedtuple('_IndicatorColumn',
                                               ['categorical_column'])):
diff --git a/tensorflow/python/feature_column/feature_column_test.py b/tensorflow/python/feature_column/feature_column_test.py
index 6f366e77229577b1a6a5363f882daa07203f525c..62718db0e5a71e5be8361cd297eb61a78b07a06f 100644
--- a/tensorflow/python/feature_column/feature_column_test.py
+++ b/tensorflow/python/feature_column/feature_column_test.py
@@ -34,6 +34,7 @@ from tensorflow.python.feature_column.feature_column import _CategoricalColumn
 from tensorflow.python.feature_column.feature_column import _DenseColumn
 from tensorflow.python.feature_column.feature_column import _FeatureColumn
 from tensorflow.python.feature_column.feature_column import _LazyBuilder
+from tensorflow.python.feature_column.feature_column import _LinearModel
 from tensorflow.python.feature_column.feature_column import _transform_features
 from tensorflow.python.feature_column.feature_column import InputLayer
 from tensorflow.python.framework import constant_op
@@ -339,6 +340,20 @@ class NumericColumnTest(test.TestCase):
         sess.run(price_var.assign([[10.]]))
         self.assertAllClose([[10.], [50.]], predictions.eval())
 
+  def test_keras_linear_model(self):
+    price = fc.numeric_column('price')
+    with ops.Graph().as_default():
+      features = {'price': [[1.], [5.]]}
+      predictions = get_keras_linear_model_predictions(features, [price])
+      bias = get_keras_linear_model_bias()
+      price_var = get_linear_model_column_var(price)
+      with _initialized_session() as sess:
+        self.assertAllClose([0.], bias.eval())
+        self.assertAllClose([[0.]], price_var.eval())
+        self.assertAllClose([[0.], [0.]], predictions.eval())
+        sess.run(price_var.assign([[10.]]))
+        self.assertAllClose([[10.], [50.]], predictions.eval())
+
 
 class BucketizedColumnTest(test.TestCase):
 
@@ -561,6 +576,62 @@ class BucketizedColumnTest(test.TestCase):
         sess.run(bias.assign([1.]))
         self.assertAllClose([[81.], [141.]], predictions.eval())
 
+  def test_keras_linear_model_one_input_value(self):
+    """Tests _LinearModel for input with shape=[1]."""
+    price = fc.numeric_column('price', shape=[1])
+    bucketized_price = fc.bucketized_column(price, boundaries=[0, 2, 4, 6])
+    with ops.Graph().as_default():
+      features = {'price': [[-1.], [1.], [5.], [6.]]}
+      predictions = get_keras_linear_model_predictions(features,
+                                                       [bucketized_price])
+      bias = get_keras_linear_model_bias()
+      bucketized_price_var = get_linear_model_column_var(bucketized_price)
+      with _initialized_session() as sess:
+        self.assertAllClose([0.], bias.eval())
+        # One weight variable per bucket, all initialized to zero.
+        self.assertAllClose([[0.], [0.], [0.], [0.], [0.]],
+                            bucketized_price_var.eval())
+        self.assertAllClose([[0.], [0.], [0.], [0.]], predictions.eval())
+        sess.run(
+            bucketized_price_var.assign([[10.], [20.], [30.], [40.], [50.]]))
+        # price -1. is in the 0th bucket, whose weight is 10.
+        # price 1. is in the 1st bucket, whose weight is 20.
+        # price 5. is in the 3rd bucket, whose weight is 40.
+        # price 6. is in the 4th bucket, whose weight is 50.
+        self.assertAllClose([[10.], [20.], [40.], [50.]], predictions.eval())
+        sess.run(bias.assign([1.]))
+        self.assertAllClose([[11.], [21.], [41.], [51.]], predictions.eval())
+
+  def test_keras_linear_model_two_input_values(self):
+    """Tests _LinearModel for input with shape=[2]."""
+    price = fc.numeric_column('price', shape=[2])
+    bucketized_price = fc.bucketized_column(price, boundaries=[0, 2, 4, 6])
+    with ops.Graph().as_default():
+      features = {'price': [[-1., 1.], [5., 6.]]}
+      predictions = get_keras_linear_model_predictions(features,
+                                                       [bucketized_price])
+      bias = get_keras_linear_model_bias()
+      bucketized_price_var = get_linear_model_column_var(bucketized_price)
+      with _initialized_session() as sess:
+        self.assertAllClose([0.], bias.eval())
+        # One weight per bucket per input column, all initialized to zero.
+        self.assertAllClose(
+            [[0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.]],
+            bucketized_price_var.eval())
+        self.assertAllClose([[0.], [0.]], predictions.eval())
+        sess.run(
+            bucketized_price_var.assign([[10.], [20.], [30.], [40.], [50.],
+                                         [60.], [70.], [80.], [90.], [100.]]))
+        # 1st example:
+        #   price -1. is in the 0th bucket, whose weight is 10.
+        #   price 1. is in the 6th bucket, whose weight is 70.
+        # 2nd example:
+        #   price 5. is in the 3rd bucket, whose weight is 40.
+        #   price 6. is in the 9th bucket, whose weight is 100.
+        self.assertAllClose([[80.], [140.]], predictions.eval())
+        sess.run(bias.assign([1.]))
+        self.assertAllClose([[81.], [141.]], predictions.eval())
+
 
 class HashedCategoricalColumnTest(test.TestCase):
 
@@ -767,6 +838,28 @@ class HashedCategoricalColumnTest(test.TestCase):
         # 'skywalker' -> 2, 'omar' -> 2: wire_var[2] + wire_var[2] = 3+3 = 6
         self.assertAllClose(((4.,), (6.,)), predictions.eval())
 
+  def test_keras_linear_model(self):
+    wire_column = fc.categorical_column_with_hash_bucket('wire', 4)
+    self.assertEqual(4, wire_column._num_buckets)
+    with ops.Graph().as_default():
+      predictions = get_keras_linear_model_predictions({
+          wire_column.name:
+              sparse_tensor.SparseTensorValue(
+                  indices=((0, 0), (1, 0), (1, 1)),
+                  values=('marlo', 'skywalker', 'omar'),
+                  dense_shape=(2, 2))
+      }, (wire_column,))
+      bias = get_keras_linear_model_bias()
+      wire_var = get_linear_model_column_var(wire_column)
+      with _initialized_session():
+        self.assertAllClose((0.,), bias.eval())
+        self.assertAllClose(((0.,), (0.,), (0.,), (0.,)), wire_var.eval())
+        self.assertAllClose(((0.,), (0.,)), predictions.eval())
+        wire_var.assign(((1.,), (2.,), (3.,), (4.,))).eval()
+        # 'marlo' -> 3: wire_var[3] = 4
+        # 'skywalker' -> 2, 'omar' -> 2: wire_var[2] + wire_var[2] = 3+3 = 6
+        self.assertAllClose(((4.,), (6.,)), predictions.eval())
+
 
 class CrossedColumnTest(test.TestCase):
 
@@ -1060,6 +1153,96 @@ class CrossedColumnTest(test.TestCase):
                 dense_shape=(2, 2)),
         }, (crossed,))
 
+  def test_keras_linear_model(self):
+    """Tests _LinearModel.
+
+    Uses data from test_get_sparse_tesnsors_simple.
+    """
+    a = fc.numeric_column('a', dtype=dtypes.int32, shape=(2,))
+    b = fc.bucketized_column(a, boundaries=(0, 1))
+    crossed = fc.crossed_column([b, 'c'], hash_bucket_size=5, hash_key=5)
+    with ops.Graph().as_default():
+      predictions = get_keras_linear_model_predictions({
+          'a':
+              constant_op.constant(((-1., .5), (.5, 1.))),
+          'c':
+              sparse_tensor.SparseTensor(
+                  indices=((0, 0), (1, 0), (1, 1)),
+                  values=['cA', 'cB', 'cC'],
+                  dense_shape=(2, 2)),
+      }, (crossed,))
+      bias = get_keras_linear_model_bias()
+      crossed_var = get_linear_model_column_var(crossed)
+      with _initialized_session() as sess:
+        self.assertAllClose((0.,), bias.eval())
+        self.assertAllClose(((0.,), (0.,), (0.,), (0.,), (0.,)),
+                            crossed_var.eval())
+        self.assertAllClose(((0.,), (0.,)), predictions.eval())
+        sess.run(crossed_var.assign(((1.,), (2.,), (3.,), (4.,), (5.,))))
+        # Expected ids after cross = (1, 0, 1, 3, 4, 2)
+        self.assertAllClose(((3.,), (14.,)), predictions.eval())
+        sess.run(bias.assign((.1,)))
+        self.assertAllClose(((3.1,), (14.1,)), predictions.eval())
+
+  def test_keras_linear_model_with_weights(self):
+
+    class _TestColumnWithWeights(_CategoricalColumn):
+      """Produces sparse IDs and sparse weights."""
+
+      @property
+      def name(self):
+        return 'test_column'
+
+      @property
+      def _parse_example_spec(self):
+        return {
+            self.name:
+                parsing_ops.VarLenFeature(dtypes.int32),
+            '{}_weights'.format(self.name):
+                parsing_ops.VarLenFeature(dtypes.float32),
+        }
+
+      @property
+      def _num_buckets(self):
+        return 5
+
+      def _transform_feature(self, inputs):
+        return (inputs.get(self.name),
+                inputs.get('{}_weights'.format(self.name)))
+
+      def _get_sparse_tensors(self,
+                              inputs,
+                              weight_collections=None,
+                              trainable=None):
+        """Populates both id_tensor and weight_tensor."""
+        ids_and_weights = inputs.get(self)
+        return _CategoricalColumn.IdWeightPair(
+            id_tensor=ids_and_weights[0], weight_tensor=ids_and_weights[1])
+
+    t = _TestColumnWithWeights()
+    crossed = fc.crossed_column([t, 'c'], hash_bucket_size=5, hash_key=5)
+    with ops.Graph().as_default():
+      with self.assertRaisesRegexp(
+          ValueError,
+          'crossed_column does not support weight_tensor.*{}'.format(t.name)):
+        get_keras_linear_model_predictions({
+            t.name:
+                sparse_tensor.SparseTensor(
+                    indices=((0, 0), (1, 0), (1, 1)),
+                    values=[0, 1, 2],
+                    dense_shape=(2, 2)),
+            '{}_weights'.format(t.name):
+                sparse_tensor.SparseTensor(
+                    indices=((0, 0), (1, 0), (1, 1)),
+                    values=[1., 10., 2.],
+                    dense_shape=(2, 2)),
+            'c':
+                sparse_tensor.SparseTensor(
+                    indices=((0, 0), (1, 0), (1, 1)),
+                    values=['cA', 'cB', 'cC'],
+                    dense_shape=(2, 2)),
+        }, (crossed,))
+
 
 def get_linear_model_bias():
   with variable_scope.variable_scope('linear_model', reuse=True):
@@ -1071,6 +1254,28 @@ def get_linear_model_column_var(column):
                             'linear_model/' + column.name)[0]
 
 
+def get_keras_linear_model_bias():
+  with variable_scope.variable_scope('linear_model', reuse=True):
+    with variable_scope.variable_scope('bias_layer', reuse=True):
+      return variable_scope.get_variable('bias_weights')
+
+
+def get_keras_linear_model_predictions(features,
+                                       feature_columns,
+                                       units=1,
+                                       sparse_combiner='sum',
+                                       weight_collections=None,
+                                       trainable=True):
+  keras_linear_model = _LinearModel(
+      feature_columns,
+      units,
+      sparse_combiner,
+      weight_collections,
+      trainable,
+      name='linear_model')
+  return keras_linear_model(features)  # pylint: disable=not-callable
+
+
 @test_util.with_c_api
 class LinearModelTest(test.TestCase):
 
@@ -1306,6 +1511,28 @@ class LinearModelTest(test.TestCase):
         sess.run(bias.assign([5.]))
         self.assertAllClose([[1005.], [5010.]], predictions.eval())
 
+  def test_sparse_combiner_with_negative_weights(self):
+    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    wire_cast_weights = fc.weighted_categorical_column(wire_cast, 'weights')
+
+    with ops.Graph().as_default():
+      wire_tensor = sparse_tensor.SparseTensor(
+          values=['omar', 'stringer', 'marlo'],  # hashed to = [2, 0, 3]
+          indices=[[0, 0], [1, 0], [1, 1]],
+          dense_shape=[2, 2])
+      features = {
+          'wire_cast': wire_tensor,
+          'weights': constant_op.constant([[1., 1., -1.0]])
+      }
+      predictions = fc.linear_model(
+          features, [wire_cast_weights], sparse_combiner='sum')
+      bias = get_linear_model_bias()
+      wire_cast_var = get_linear_model_column_var(wire_cast)
+      with _initialized_session() as sess:
+        sess.run(wire_cast_var.assign([[10.], [100.], [1000.], [10000.]]))
+        sess.run(bias.assign([5.]))
+        self.assertAllClose([[1005.], [-9985.]], predictions.eval())
+
   def test_dense_multi_dimension_multi_output(self):
     price = fc.numeric_column('price', shape=2)
     with ops.Graph().as_default():
@@ -1698,89 +1925,712 @@ class LinearModelTest(test.TestCase):
         sess.run(net, feed_dict={features['price']: np.array(1)})
 
 
-class InputLayerTest(test.TestCase):
-
-  @test_util.run_in_graph_and_eager_modes()
-  def test_retrieving_input(self):
-    features = {'a': [0.]}
-    input_layer = InputLayer(fc.numeric_column('a'))
-    inputs = self.evaluate(input_layer(features))
-    self.assertAllClose([[0.]], inputs)
-
-  def test_reuses_variables(self):
-    with context.eager_mode():
-      sparse_input = sparse_tensor.SparseTensor(
-          indices=((0, 0), (1, 0), (2, 0)),
-          values=(0, 1, 2),
-          dense_shape=(3, 3))
+@test_util.with_c_api
+class _LinearModelTest(test.TestCase):
 
-      # Create feature columns (categorical and embedding).
-      categorical_column = fc.categorical_column_with_identity(key='a',
-                                                               num_buckets=3)
-      embedding_dimension = 2
-      def _embedding_column_initializer(shape, dtype, partition_info):
-        del shape  # unused
-        del dtype  # unused
-        del partition_info  # unused
-        embedding_values = (
-            (1, 0),  # id 0
-            (0, 1),  # id 1
-            (1, 1))  # id 2
-        return embedding_values
-      embedding_column = fc.embedding_column(
-          categorical_column,
-          dimension=embedding_dimension,
-          initializer=_embedding_column_initializer)
+  def test_raises_if_empty_feature_columns(self):
+    with self.assertRaisesRegexp(ValueError,
+                                 'feature_columns must not be empty'):
+      get_keras_linear_model_predictions(features={}, feature_columns=[])
 
-      input_layer = InputLayer([embedding_column])
-      features = {'a': sparse_input}
+  def test_should_be_feature_column(self):
+    with self.assertRaisesRegexp(ValueError, 'must be a _FeatureColumn'):
+      get_keras_linear_model_predictions(
+          features={'a': [[0]]}, feature_columns='NotSupported')
 
-      inputs = input_layer(features)
-      variables = input_layer.variables
+  def test_should_be_dense_or_categorical_column(self):
 
-      # Sanity check: test that the inputs are correct.
-      self.assertAllEqual([[1, 0], [0, 1], [1, 1]], inputs)
+    class NotSupportedColumn(_FeatureColumn):
 
-      # Check that only one variable was created.
-      self.assertEqual(1, len(variables))
+      @property
+      def name(self):
+        return 'NotSupportedColumn'
 
-      # Check that invoking input_layer on the same features does not create
-      # additional variables
-      _ = input_layer(features)
-      self.assertEqual(1, len(variables))
-      self.assertEqual(variables[0], input_layer.variables[0])
+      def _transform_feature(self, cache):
+        pass
 
-  def test_feature_column_input_layer_gradient(self):
-    with context.eager_mode():
-      sparse_input = sparse_tensor.SparseTensor(
-          indices=((0, 0), (1, 0), (2, 0)),
-          values=(0, 1, 2),
-          dense_shape=(3, 3))
+      @property
+      def _parse_example_spec(self):
+        pass
 
-      # Create feature columns (categorical and embedding).
-      categorical_column = fc.categorical_column_with_identity(key='a',
-                                                               num_buckets=3)
-      embedding_dimension = 2
+    with self.assertRaisesRegexp(
+        ValueError, 'must be either a _DenseColumn or _CategoricalColumn'):
+      get_keras_linear_model_predictions(
+          features={'a': [[0]]}, feature_columns=[NotSupportedColumn()])
 
-      def _embedding_column_initializer(shape, dtype, partition_info):
-        del shape  # unused
-        del dtype  # unused
-        del partition_info  # unused
-        embedding_values = (
-            (1, 0),  # id 0
-            (0, 1),  # id 1
-            (1, 1))  # id 2
-        return embedding_values
+  def test_does_not_support_dict_columns(self):
+    with self.assertRaisesRegexp(
+        ValueError, 'Expected feature_columns to be iterable, found dict.'):
+      fc.linear_model(
+          features={'a': [[0]]}, feature_columns={'a': fc.numeric_column('a')})
 
-      embedding_column = fc.embedding_column(
-          categorical_column,
-          dimension=embedding_dimension,
-          initializer=_embedding_column_initializer)
+  def test_raises_if_duplicate_name(self):
+    with self.assertRaisesRegexp(
+        ValueError, 'Duplicate feature column name found for columns'):
+      get_keras_linear_model_predictions(
+          features={'a': [[0]]},
+          feature_columns=[fc.numeric_column('a'),
+                           fc.numeric_column('a')])
 
-      input_layer = InputLayer([embedding_column])
-      features = {'a': sparse_input}
+  def test_dense_bias(self):
+    price = fc.numeric_column('price')
+    with ops.Graph().as_default():
+      features = {'price': [[1.], [5.]]}
+      predictions = get_keras_linear_model_predictions(features, [price])
+      bias = get_keras_linear_model_bias()
+      price_var = get_linear_model_column_var(price)
+      with _initialized_session() as sess:
+        self.assertAllClose([0.], bias.eval())
+        sess.run(price_var.assign([[10.]]))
+        sess.run(bias.assign([5.]))
+        self.assertAllClose([[15.], [55.]], predictions.eval())
 
-      def scale_matrix():
+  def test_sparse_bias(self):
+    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    with ops.Graph().as_default():
+      wire_tensor = sparse_tensor.SparseTensor(
+          values=['omar', 'stringer', 'marlo'],  # hashed to = [2, 0, 3]
+          indices=[[0, 0], [1, 0], [1, 1]],
+          dense_shape=[2, 2])
+      features = {'wire_cast': wire_tensor}
+      predictions = get_keras_linear_model_predictions(features, [wire_cast])
+      bias = get_keras_linear_model_bias()
+      wire_cast_var = get_linear_model_column_var(wire_cast)
+      with _initialized_session() as sess:
+        self.assertAllClose([0.], bias.eval())
+        self.assertAllClose([[0.], [0.], [0.], [0.]], wire_cast_var.eval())
+        sess.run(wire_cast_var.assign([[10.], [100.], [1000.], [10000.]]))
+        sess.run(bias.assign([5.]))
+        self.assertAllClose([[1005.], [10015.]], predictions.eval())
+
+  def test_dense_and_sparse_bias(self):
+    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    price = fc.numeric_column('price')
+    with ops.Graph().as_default():
+      wire_tensor = sparse_tensor.SparseTensor(
+          values=['omar', 'stringer', 'marlo'],  # hashed to = [2, 0, 3]
+          indices=[[0, 0], [1, 0], [1, 1]],
+          dense_shape=[2, 2])
+      features = {'wire_cast': wire_tensor, 'price': [[1.], [5.]]}
+      predictions = get_keras_linear_model_predictions(features,
+                                                       [wire_cast, price])
+      bias = get_keras_linear_model_bias()
+      wire_cast_var = get_linear_model_column_var(wire_cast)
+      price_var = get_linear_model_column_var(price)
+      with _initialized_session() as sess:
+        sess.run(wire_cast_var.assign([[10.], [100.], [1000.], [10000.]]))
+        sess.run(bias.assign([5.]))
+        sess.run(price_var.assign([[10.]]))
+        self.assertAllClose([[1015.], [10065.]], predictions.eval())
+
+  def test_dense_and_sparse_column(self):
+    """When the column is both dense and sparse, uses sparse tensors."""
+
+    class _DenseAndSparseColumn(_DenseColumn, _CategoricalColumn):
+
+      @property
+      def name(self):
+        return 'dense_and_sparse_column'
+
+      @property
+      def _parse_example_spec(self):
+        return {self.name: parsing_ops.VarLenFeature(self.dtype)}
+
+      def _transform_feature(self, inputs):
+        return inputs.get(self.name)
+
+      @property
+      def _variable_shape(self):
+        raise ValueError('Should not use this method.')
+
+      def _get_dense_tensor(self,
+                            inputs,
+                            weight_collections=None,
+                            trainable=None):
+        raise ValueError('Should not use this method.')
+
+      @property
+      def _num_buckets(self):
+        return 4
+
+      def _get_sparse_tensors(self,
+                              inputs,
+                              weight_collections=None,
+                              trainable=None):
+        sp_tensor = sparse_tensor.SparseTensor(
+            indices=[[0, 0], [1, 0], [1, 1]],
+            values=[2, 0, 3],
+            dense_shape=[2, 2])
+        return _CategoricalColumn.IdWeightPair(sp_tensor, None)
+
+    dense_and_sparse_column = _DenseAndSparseColumn()
+    with ops.Graph().as_default():
+      sp_tensor = sparse_tensor.SparseTensor(
+          values=['omar', 'stringer', 'marlo'],
+          indices=[[0, 0], [1, 0], [1, 1]],
+          dense_shape=[2, 2])
+      features = {dense_and_sparse_column.name: sp_tensor}
+      predictions = get_keras_linear_model_predictions(
+          features, [dense_and_sparse_column])
+      bias = get_keras_linear_model_bias()
+      dense_and_sparse_column_var = get_linear_model_column_var(
+          dense_and_sparse_column)
+      with _initialized_session() as sess:
+        sess.run(
+            dense_and_sparse_column_var.assign([[10.], [100.], [1000.],
+                                                [10000.]]))
+        sess.run(bias.assign([5.]))
+        self.assertAllClose([[1005.], [10015.]], predictions.eval())
+
+  def test_dense_multi_output(self):
+    price = fc.numeric_column('price')
+    with ops.Graph().as_default():
+      features = {'price': [[1.], [5.]]}
+      predictions = get_keras_linear_model_predictions(
+          features, [price], units=3)
+      bias = get_keras_linear_model_bias()
+      price_var = get_linear_model_column_var(price)
+      with _initialized_session() as sess:
+        self.assertAllClose(np.zeros((3,)), bias.eval())
+        self.assertAllClose(np.zeros((1, 3)), price_var.eval())
+        sess.run(price_var.assign([[10., 100., 1000.]]))
+        sess.run(bias.assign([5., 6., 7.]))
+        self.assertAllClose([[15., 106., 1007.], [55., 506., 5007.]],
+                            predictions.eval())
+
+  def test_sparse_multi_output(self):
+    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    with ops.Graph().as_default():
+      wire_tensor = sparse_tensor.SparseTensor(
+          values=['omar', 'stringer', 'marlo'],  # hashed to = [2, 0, 3]
+          indices=[[0, 0], [1, 0], [1, 1]],
+          dense_shape=[2, 2])
+      features = {'wire_cast': wire_tensor}
+      predictions = get_keras_linear_model_predictions(
+          features, [wire_cast], units=3)
+      bias = get_keras_linear_model_bias()
+      wire_cast_var = get_linear_model_column_var(wire_cast)
+      with _initialized_session() as sess:
+        self.assertAllClose(np.zeros((3,)), bias.eval())
+        self.assertAllClose(np.zeros((4, 3)), wire_cast_var.eval())
+        sess.run(
+            wire_cast_var.assign([[10., 11., 12.], [100., 110., 120.],
+                                  [1000., 1100.,
+                                   1200.], [10000., 11000., 12000.]]))
+        sess.run(bias.assign([5., 6., 7.]))
+        self.assertAllClose([[1005., 1106., 1207.], [10015., 11017., 12019.]],
+                            predictions.eval())
+
+  def test_dense_multi_dimension(self):
+    price = fc.numeric_column('price', shape=2)
+    with ops.Graph().as_default():
+      features = {'price': [[1., 2.], [5., 6.]]}
+      predictions = get_keras_linear_model_predictions(features, [price])
+      price_var = get_linear_model_column_var(price)
+      with _initialized_session() as sess:
+        self.assertAllClose([[0.], [0.]], price_var.eval())
+        sess.run(price_var.assign([[10.], [100.]]))
+        self.assertAllClose([[210.], [650.]], predictions.eval())
+
+  def test_sparse_multi_rank(self):
+    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    with ops.Graph().as_default():
+      wire_tensor = array_ops.sparse_placeholder(dtypes.string)
+      wire_value = sparse_tensor.SparseTensorValue(
+          values=['omar', 'stringer', 'marlo', 'omar'],  # hashed = [2, 0, 3, 2]
+          indices=[[0, 0, 0], [0, 1, 0], [1, 0, 0], [1, 0, 1]],
+          dense_shape=[2, 2, 2])
+      features = {'wire_cast': wire_tensor}
+      predictions = get_keras_linear_model_predictions(features, [wire_cast])
+      wire_cast_var = get_linear_model_column_var(wire_cast)
+      with _initialized_session() as sess:
+        self.assertAllClose(np.zeros((4, 1)), wire_cast_var.eval())
+        self.assertAllClose(
+            np.zeros((2, 1)),
+            predictions.eval(feed_dict={wire_tensor: wire_value}))
+        sess.run(wire_cast_var.assign([[10.], [100.], [1000.], [10000.]]))
+        self.assertAllClose(
+            [[1010.], [11000.]],
+            predictions.eval(feed_dict={wire_tensor: wire_value}))
+
+  def test_sparse_combiner(self):
+    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    with ops.Graph().as_default():
+      wire_tensor = sparse_tensor.SparseTensor(
+          values=['omar', 'stringer', 'marlo'],  # hashed to = [2, 0, 3]
+          indices=[[0, 0], [1, 0], [1, 1]],
+          dense_shape=[2, 2])
+      features = {'wire_cast': wire_tensor}
+      predictions = get_keras_linear_model_predictions(
+          features, [wire_cast], sparse_combiner='mean')
+      bias = get_keras_linear_model_bias()
+      wire_cast_var = get_linear_model_column_var(wire_cast)
+      with _initialized_session() as sess:
+        sess.run(wire_cast_var.assign([[10.], [100.], [1000.], [10000.]]))
+        sess.run(bias.assign([5.]))
+        self.assertAllClose([[1005.], [5010.]], predictions.eval())
+
+  def test_dense_multi_dimension_multi_output(self):
+    price = fc.numeric_column('price', shape=2)
+    with ops.Graph().as_default():
+      features = {'price': [[1., 2.], [5., 6.]]}
+      predictions = get_keras_linear_model_predictions(
+          features, [price], units=3)
+      bias = get_keras_linear_model_bias()
+      price_var = get_linear_model_column_var(price)
+      with _initialized_session() as sess:
+        self.assertAllClose(np.zeros((3,)), bias.eval())
+        self.assertAllClose(np.zeros((2, 3)), price_var.eval())
+        sess.run(price_var.assign([[1., 2., 3.], [10., 100., 1000.]]))
+        sess.run(bias.assign([2., 3., 4.]))
+        self.assertAllClose([[23., 205., 2007.], [67., 613., 6019.]],
+                            predictions.eval())
+
+  def test_raises_if_shape_mismatch(self):
+    price = fc.numeric_column('price', shape=2)
+    with ops.Graph().as_default():
+      features = {'price': [[1.], [5.]]}
+      if ops._USE_C_API:
+        with self.assertRaisesRegexp(
+            Exception,
+            r'Cannot reshape a tensor with 2 elements to shape \[2,2\]'):
+          predictions = get_keras_linear_model_predictions(features, [price])
+      else:
+        predictions = get_keras_linear_model_predictions(features, [price])
+        with _initialized_session():
+          with self.assertRaisesRegexp(Exception, 'requested shape has 4'):
+            predictions.eval()
+
+  def test_dense_reshaping(self):
+    price = fc.numeric_column('price', shape=[1, 2])
+    with ops.Graph().as_default():
+      features = {'price': [[[1., 2.]], [[5., 6.]]]}
+      predictions = get_keras_linear_model_predictions(features, [price])
+      bias = get_keras_linear_model_bias()
+      price_var = get_linear_model_column_var(price)
+      with _initialized_session() as sess:
+        self.assertAllClose([0.], bias.eval())
+        self.assertAllClose([[0.], [0.]], price_var.eval())
+        self.assertAllClose([[0.], [0.]], predictions.eval())
+        sess.run(price_var.assign([[10.], [100.]]))
+        self.assertAllClose([[210.], [650.]], predictions.eval())
+
+  def test_dense_multi_column(self):
+    price1 = fc.numeric_column('price1', shape=2)
+    price2 = fc.numeric_column('price2')
+    with ops.Graph().as_default():
+      features = {'price1': [[1., 2.], [5., 6.]], 'price2': [[3.], [4.]]}
+      predictions = get_keras_linear_model_predictions(features,
+                                                       [price1, price2])
+      bias = get_keras_linear_model_bias()
+      price1_var = get_linear_model_column_var(price1)
+      price2_var = get_linear_model_column_var(price2)
+      with _initialized_session() as sess:
+        self.assertAllClose([0.], bias.eval())
+        self.assertAllClose([[0.], [0.]], price1_var.eval())
+        self.assertAllClose([[0.]], price2_var.eval())
+        self.assertAllClose([[0.], [0.]], predictions.eval())
+        sess.run(price1_var.assign([[10.], [100.]]))
+        sess.run(price2_var.assign([[1000.]]))
+        sess.run(bias.assign([7.]))
+        self.assertAllClose([[3217.], [4657.]], predictions.eval())
+
+  def test_dense_collection(self):
+    price = fc.numeric_column('price')
+    with ops.Graph().as_default() as g:
+      features = {'price': [[1.], [5.]]}
+      get_keras_linear_model_predictions(
+          features, [price], weight_collections=['my-vars'])
+      my_vars = g.get_collection('my-vars')
+      bias = get_keras_linear_model_bias()
+      price_var = get_linear_model_column_var(price)
+      self.assertIn(bias, my_vars)
+      self.assertIn(price_var, my_vars)
+
+  def test_sparse_collection(self):
+    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    with ops.Graph().as_default() as g:
+      wire_tensor = sparse_tensor.SparseTensor(
+          values=['omar'], indices=[[0, 0]], dense_shape=[1, 1])
+      features = {'wire_cast': wire_tensor}
+      get_keras_linear_model_predictions(
+          features, [wire_cast], weight_collections=['my-vars'])
+      my_vars = g.get_collection('my-vars')
+      bias = get_keras_linear_model_bias()
+      wire_cast_var = get_linear_model_column_var(wire_cast)
+      self.assertIn(bias, my_vars)
+      self.assertIn(wire_cast_var, my_vars)
+
+  def test_dense_trainable_default(self):
+    price = fc.numeric_column('price')
+    with ops.Graph().as_default() as g:
+      features = {'price': [[1.], [5.]]}
+      get_keras_linear_model_predictions(features, [price])
+      bias = get_keras_linear_model_bias()
+      price_var = get_linear_model_column_var(price)
+      trainable_vars = g.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
+      self.assertIn(bias, trainable_vars)
+      self.assertIn(price_var, trainable_vars)
+
+  def test_sparse_trainable_default(self):
+    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    with ops.Graph().as_default() as g:
+      wire_tensor = sparse_tensor.SparseTensor(
+          values=['omar'], indices=[[0, 0]], dense_shape=[1, 1])
+      features = {'wire_cast': wire_tensor}
+      get_keras_linear_model_predictions(features, [wire_cast])
+      trainable_vars = g.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
+      bias = get_keras_linear_model_bias()
+      wire_cast_var = get_linear_model_column_var(wire_cast)
+      self.assertIn(bias, trainable_vars)
+      self.assertIn(wire_cast_var, trainable_vars)
+
+  def test_dense_trainable_false(self):
+    price = fc.numeric_column('price')
+    with ops.Graph().as_default() as g:
+      features = {'price': [[1.], [5.]]}
+      get_keras_linear_model_predictions(features, [price], trainable=False)
+      trainable_vars = g.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
+      self.assertEqual([], trainable_vars)
+
+  def test_sparse_trainable_false(self):
+    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    with ops.Graph().as_default() as g:
+      wire_tensor = sparse_tensor.SparseTensor(
+          values=['omar'], indices=[[0, 0]], dense_shape=[1, 1])
+      features = {'wire_cast': wire_tensor}
+      get_keras_linear_model_predictions(features, [wire_cast], trainable=False)
+      trainable_vars = g.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
+      self.assertEqual([], trainable_vars)
+
+  def test_column_order(self):
+    price_a = fc.numeric_column('price_a')
+    price_b = fc.numeric_column('price_b')
+    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    with ops.Graph().as_default() as g:
+      features = {
+          'price_a': [[1.]],
+          'price_b': [[3.]],
+          'wire_cast':
+              sparse_tensor.SparseTensor(
+                  values=['omar'], indices=[[0, 0]], dense_shape=[1, 1])
+      }
+      get_keras_linear_model_predictions(
+          features, [price_a, wire_cast, price_b],
+          weight_collections=['my-vars'])
+      my_vars = g.get_collection('my-vars')
+      self.assertIn('price_a', my_vars[0].name)
+      self.assertIn('price_b', my_vars[1].name)
+      self.assertIn('wire_cast', my_vars[2].name)
+
+    with ops.Graph().as_default() as g:
+      features = {
+          'price_a': [[1.]],
+          'price_b': [[3.]],
+          'wire_cast':
+              sparse_tensor.SparseTensor(
+                  values=['omar'], indices=[[0, 0]], dense_shape=[1, 1])
+      }
+      get_keras_linear_model_predictions(
+          features, [wire_cast, price_b, price_a],
+          weight_collections=['my-vars'])
+      my_vars = g.get_collection('my-vars')
+      self.assertIn('price_a', my_vars[0].name)
+      self.assertIn('price_b', my_vars[1].name)
+      self.assertIn('wire_cast', my_vars[2].name)
+
+  def test_static_batch_size_mismatch(self):
+    price1 = fc.numeric_column('price1')
+    price2 = fc.numeric_column('price2')
+    with ops.Graph().as_default():
+      features = {
+          'price1': [[1.], [5.], [7.]],  # batchsize = 3
+          'price2': [[3.], [4.]]  # batchsize = 2
+      }
+    with self.assertRaisesRegexp(
+        ValueError,
+        'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
+      get_keras_linear_model_predictions(features, [price1, price2])
+
+  def test_subset_of_static_batch_size_mismatch(self):
+    price1 = fc.numeric_column('price1')
+    price2 = fc.numeric_column('price2')
+    price3 = fc.numeric_column('price3')
+    with ops.Graph().as_default():
+      features = {
+          'price1': array_ops.placeholder(dtype=dtypes.int64),  # batchsize = 3
+          'price2': [[3.], [4.]],  # batchsize = 2
+          'price3': [[3.], [4.], [5.]]  # batchsize = 3
+      }
+      with self.assertRaisesRegexp(
+          ValueError,
+          'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
+        get_keras_linear_model_predictions(features, [price1, price2, price3])
+
+  def test_runtime_batch_size_mismatch(self):
+    price1 = fc.numeric_column('price1')
+    price2 = fc.numeric_column('price2')
+    with ops.Graph().as_default():
+      features = {
+          'price1': array_ops.placeholder(dtype=dtypes.int64),  # batchsize = 3
+          'price2': [[3.], [4.]]  # batchsize = 2
+      }
+      predictions = get_keras_linear_model_predictions(features,
+                                                       [price1, price2])
+      with _initialized_session() as sess:
+        with self.assertRaisesRegexp(errors.OpError,
+                                     'must have the same size and shape'):
+          sess.run(
+              predictions, feed_dict={features['price1']: [[1.], [5.], [7.]]})
+
+  def test_runtime_batch_size_matches(self):
+    price1 = fc.numeric_column('price1')
+    price2 = fc.numeric_column('price2')
+    with ops.Graph().as_default():
+      features = {
+          'price1': array_ops.placeholder(dtype=dtypes.int64),  # batchsize = 2
+          'price2': array_ops.placeholder(dtype=dtypes.int64),  # batchsize = 2
+      }
+      predictions = get_keras_linear_model_predictions(features,
+                                                       [price1, price2])
+      with _initialized_session() as sess:
+        sess.run(
+            predictions,
+            feed_dict={
+                features['price1']: [[1.], [5.]],
+                features['price2']: [[1.], [5.]],
+            })
+
+  def test_with_numpy_input_fn(self):
+    price = fc.numeric_column('price')
+    price_buckets = fc.bucketized_column(
+        price, boundaries=[
+            0.,
+            10.,
+            100.,
+        ])
+    body_style = fc.categorical_column_with_vocabulary_list(
+        'body-style', vocabulary_list=['hardtop', 'wagon', 'sedan'])
+
+    input_fn = numpy_io.numpy_input_fn(
+        x={
+            'price': np.array([-1., 2., 13., 104.]),
+            'body-style': np.array(['sedan', 'hardtop', 'wagon', 'sedan']),
+        },
+        batch_size=2,
+        shuffle=False)
+    features = input_fn()
+    net = get_keras_linear_model_predictions(features,
+                                             [price_buckets, body_style])
+    # self.assertEqual(1 + 3 + 5, net.shape[1])
+    with _initialized_session() as sess:
+      coord = coordinator.Coordinator()
+      threads = queue_runner_impl.start_queue_runners(sess, coord=coord)
+
+      bias = get_keras_linear_model_bias()
+      price_buckets_var = get_linear_model_column_var(price_buckets)
+      body_style_var = get_linear_model_column_var(body_style)
+
+      sess.run(price_buckets_var.assign([[10.], [100.], [1000.], [10000.]]))
+      sess.run(body_style_var.assign([[-10.], [-100.], [-1000.]]))
+      sess.run(bias.assign([5.]))
+
+      self.assertAllClose([[10 - 1000 + 5.], [100 - 10 + 5.]], sess.run(net))
+
+      coord.request_stop()
+      coord.join(threads)
+
+  def test_with_1d_sparse_tensor(self):
+    price = fc.numeric_column('price')
+    price_buckets = fc.bucketized_column(
+        price, boundaries=[
+            0.,
+            10.,
+            100.,
+        ])
+    body_style = fc.categorical_column_with_vocabulary_list(
+        'body-style', vocabulary_list=['hardtop', 'wagon', 'sedan'])
+
+    # Provides 1-dim tensor and dense tensor.
+    features = {
+        'price':
+            constant_op.constant([
+                -1.,
+                12.,
+            ]),
+        'body-style':
+            sparse_tensor.SparseTensor(
+                indices=((0,), (1,)),
+                values=('sedan', 'hardtop'),
+                dense_shape=(2,)),
+    }
+    self.assertEqual(1, features['price'].shape.ndims)
+    self.assertEqual(1, features['body-style'].dense_shape.get_shape()[0])
+
+    net = get_keras_linear_model_predictions(features,
+                                             [price_buckets, body_style])
+    with _initialized_session() as sess:
+      bias = get_keras_linear_model_bias()
+      price_buckets_var = get_linear_model_column_var(price_buckets)
+      body_style_var = get_linear_model_column_var(body_style)
+
+      sess.run(price_buckets_var.assign([[10.], [100.], [1000.], [10000.]]))
+      sess.run(body_style_var.assign([[-10.], [-100.], [-1000.]]))
+      sess.run(bias.assign([5.]))
+
+      self.assertAllClose([[10 - 1000 + 5.], [1000 - 10 + 5.]], sess.run(net))
+
+  def test_with_1d_unknown_shape_sparse_tensor(self):
+    price = fc.numeric_column('price')
+    price_buckets = fc.bucketized_column(
+        price, boundaries=[
+            0.,
+            10.,
+            100.,
+        ])
+    body_style = fc.categorical_column_with_vocabulary_list(
+        'body-style', vocabulary_list=['hardtop', 'wagon', 'sedan'])
+    country = fc.categorical_column_with_vocabulary_list(
+        'country', vocabulary_list=['US', 'JP', 'CA'])
+
+    # Provides 1-dim tensor and dense tensor.
+    features = {
+        'price': array_ops.placeholder(dtypes.float32),
+        'body-style': array_ops.sparse_placeholder(dtypes.string),
+        'country': array_ops.placeholder(dtypes.string),
+    }
+    self.assertIsNone(features['price'].shape.ndims)
+    self.assertIsNone(features['body-style'].get_shape().ndims)
+
+    price_data = np.array([-1., 12.])
+    body_style_data = sparse_tensor.SparseTensorValue(
+        indices=((0,), (1,)), values=('sedan', 'hardtop'), dense_shape=(2,))
+    country_data = np.array(['US', 'CA'])
+
+    net = get_keras_linear_model_predictions(
+        features, [price_buckets, body_style, country])
+    bias = get_keras_linear_model_bias()
+    price_buckets_var = get_linear_model_column_var(price_buckets)
+    body_style_var = get_linear_model_column_var(body_style)
+    with _initialized_session() as sess:
+      sess.run(price_buckets_var.assign([[10.], [100.], [1000.], [10000.]]))
+      sess.run(body_style_var.assign([[-10.], [-100.], [-1000.]]))
+      sess.run(bias.assign([5.]))
+
+      self.assertAllClose([[10 - 1000 + 5.], [1000 - 10 + 5.]],
+                          sess.run(
+                              net,
+                              feed_dict={
+                                  features['price']: price_data,
+                                  features['body-style']: body_style_data,
+                                  features['country']: country_data
+                              }))
+
+  def test_with_rank_0_feature(self):
+    price = fc.numeric_column('price')
+    features = {
+        'price': constant_op.constant(0),
+    }
+    self.assertEqual(0, features['price'].shape.ndims)
+
+    # Static rank 0 should fail
+    with self.assertRaisesRegexp(ValueError, 'Feature .* cannot have rank 0'):
+      get_keras_linear_model_predictions(features, [price])
+
+    # Dynamic rank 0 should fail
+    features = {
+        'price': array_ops.placeholder(dtypes.float32),
+    }
+    net = get_keras_linear_model_predictions(features, [price])
+    self.assertEqual(1, net.shape[1])
+    with _initialized_session() as sess:
+      with self.assertRaisesOpError('Feature .* cannot have rank 0'):
+        sess.run(net, feed_dict={features['price']: np.array(1)})
+
+
+class InputLayerTest(test.TestCase):
+
+  @test_util.run_in_graph_and_eager_modes()
+  def test_retrieving_input(self):
+    features = {'a': [0.]}
+    input_layer = InputLayer(fc.numeric_column('a'))
+    inputs = self.evaluate(input_layer(features))
+    self.assertAllClose([[0.]], inputs)
+
+  def test_reuses_variables(self):
+    with context.eager_mode():
+      sparse_input = sparse_tensor.SparseTensor(
+          indices=((0, 0), (1, 0), (2, 0)),
+          values=(0, 1, 2),
+          dense_shape=(3, 3))
+
+      # Create feature columns (categorical and embedding).
+      categorical_column = fc.categorical_column_with_identity(key='a',
+                                                               num_buckets=3)
+      embedding_dimension = 2
+      def _embedding_column_initializer(shape, dtype, partition_info):
+        del shape  # unused
+        del dtype  # unused
+        del partition_info  # unused
+        embedding_values = (
+            (1, 0),  # id 0
+            (0, 1),  # id 1
+            (1, 1))  # id 2
+        return embedding_values
+      embedding_column = fc.embedding_column(
+          categorical_column,
+          dimension=embedding_dimension,
+          initializer=_embedding_column_initializer)
+
+      input_layer = InputLayer([embedding_column])
+      features = {'a': sparse_input}
+
+      inputs = input_layer(features)
+      variables = input_layer.variables
+
+      # Sanity check: test that the inputs are correct.
+      self.assertAllEqual([[1, 0], [0, 1], [1, 1]], inputs)
+
+      # Check that only one variable was created.
+      self.assertEqual(1, len(variables))
+
+      # Check that invoking input_layer on the same features does not create
+      # additional variables
+      _ = input_layer(features)
+      self.assertEqual(1, len(variables))
+      self.assertEqual(variables[0], input_layer.variables[0])
+
+  def test_feature_column_input_layer_gradient(self):
+    with context.eager_mode():
+      sparse_input = sparse_tensor.SparseTensor(
+          indices=((0, 0), (1, 0), (2, 0)),
+          values=(0, 1, 2),
+          dense_shape=(3, 3))
+
+      # Create feature columns (categorical and embedding).
+      categorical_column = fc.categorical_column_with_identity(key='a',
+                                                               num_buckets=3)
+      embedding_dimension = 2
+
+      def _embedding_column_initializer(shape, dtype, partition_info):
+        del shape  # unused
+        del dtype  # unused
+        del partition_info  # unused
+        embedding_values = (
+            (1, 0),  # id 0
+            (0, 1),  # id 1
+            (1, 1))  # id 2
+        return embedding_values
+
+      embedding_column = fc.embedding_column(
+          categorical_column,
+          dimension=embedding_dimension,
+          initializer=_embedding_column_initializer)
+
+      input_layer = InputLayer([embedding_column])
+      features = {'a': sparse_input}
+
+      def scale_matrix():
         matrix = input_layer(features)
         return 2 * matrix
 
@@ -2715,6 +3565,32 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
         # 'skywalker' -> 3, 'omar' -> 0: wire_var[3] + wire_var[0] = 4+1 = 5
         self.assertAllClose(((3.,), (5.,)), predictions.eval())
 
+  def test_keras_linear_model(self):
+    wire_column = fc.categorical_column_with_vocabulary_file(
+        key='wire',
+        vocabulary_file=self._wire_vocabulary_file_name,
+        vocabulary_size=self._wire_vocabulary_size,
+        num_oov_buckets=1)
+    self.assertEqual(4, wire_column._num_buckets)
+    with ops.Graph().as_default():
+      predictions = get_keras_linear_model_predictions({
+          wire_column.name:
+              sparse_tensor.SparseTensorValue(
+                  indices=((0, 0), (1, 0), (1, 1)),
+                  values=('marlo', 'skywalker', 'omar'),
+                  dense_shape=(2, 2))
+      }, (wire_column,))
+      bias = get_keras_linear_model_bias()
+      wire_var = get_linear_model_column_var(wire_column)
+      with _initialized_session():
+        self.assertAllClose((0.,), bias.eval())
+        self.assertAllClose(((0.,), (0.,), (0.,), (0.,)), wire_var.eval())
+        self.assertAllClose(((0.,), (0.,)), predictions.eval())
+        wire_var.assign(((1.,), (2.,), (3.,), (4.,))).eval()
+        # 'marlo' -> 2: wire_var[2] = 3
+        # 'skywalker' -> 3, 'omar' -> 0: wire_var[3] + wire_var[0] = 4+1 = 5
+        self.assertAllClose(((3.,), (5.,)), predictions.eval())
+
 
 class VocabularyListCategoricalColumnTest(test.TestCase):
 
@@ -3082,6 +3958,31 @@ class VocabularyListCategoricalColumnTest(test.TestCase):
         # 'skywalker' -> 3, 'omar' -> 0: wire_var[3] + wire_var[0] = 4+1 = 5
         self.assertAllClose(((3.,), (5.,)), predictions.eval())
 
+  def test_keras_linear_model(self):
+    wire_column = fc.categorical_column_with_vocabulary_list(
+        key='aaa',
+        vocabulary_list=('omar', 'stringer', 'marlo'),
+        num_oov_buckets=1)
+    self.assertEqual(4, wire_column._num_buckets)
+    with ops.Graph().as_default():
+      predictions = get_keras_linear_model_predictions({
+          wire_column.name:
+              sparse_tensor.SparseTensorValue(
+                  indices=((0, 0), (1, 0), (1, 1)),
+                  values=('marlo', 'skywalker', 'omar'),
+                  dense_shape=(2, 2))
+      }, (wire_column,))
+      bias = get_keras_linear_model_bias()
+      wire_var = get_linear_model_column_var(wire_column)
+      with _initialized_session():
+        self.assertAllClose((0.,), bias.eval())
+        self.assertAllClose(((0.,), (0.,), (0.,), (0.,)), wire_var.eval())
+        self.assertAllClose(((0.,), (0.,)), predictions.eval())
+        wire_var.assign(((1.,), (2.,), (3.,), (4.,))).eval()
+        # 'marlo' -> 2: wire_var[2] = 3
+        # 'skywalker' -> 3, 'omar' -> 0: wire_var[3] + wire_var[0] = 4+1 = 5
+        self.assertAllClose(((3.,), (5.,)), predictions.eval())
+
 
 class IdentityCategoricalColumnTest(test.TestCase):
 
@@ -3306,6 +4207,28 @@ class IdentityCategoricalColumnTest(test.TestCase):
         # weight_var[2] + weight_var[1] = 3+2 = 5
         self.assertAllClose(((1.,), (5.,)), predictions.eval())
 
+  def test_keras_linear_model(self):
+    column = fc.categorical_column_with_identity(key='aaa', num_buckets=3)
+    self.assertEqual(3, column._num_buckets)
+    with ops.Graph().as_default():
+      predictions = get_keras_linear_model_predictions({
+          column.name:
+              sparse_tensor.SparseTensorValue(
+                  indices=((0, 0), (1, 0), (1, 1)),
+                  values=(0, 2, 1),
+                  dense_shape=(2, 2))
+      }, (column,))
+      bias = get_keras_linear_model_bias()
+      weight_var = get_linear_model_column_var(column)
+      with _initialized_session():
+        self.assertAllClose((0.,), bias.eval())
+        self.assertAllClose(((0.,), (0.,), (0.,)), weight_var.eval())
+        self.assertAllClose(((0.,), (0.,)), predictions.eval())
+        weight_var.assign(((1.,), (2.,), (3.,))).eval()
+        # weight_var[0] = 1
+        # weight_var[2] + weight_var[1] = 3+2 = 5
+        self.assertAllClose(((1.,), (5.,)), predictions.eval())
+
 
 class TransformFeaturesTest(test.TestCase):
 
@@ -3537,6 +4460,25 @@ class IndicatorColumnTest(test.TestCase):
         weight_var.assign([[1.], [2.], [3.], [4.]]).eval()
         self.assertAllClose([[2. + 3.]], predictions.eval())
 
+  def test_keras_linear_model(self):
+    animal = fc.indicator_column(
+        fc.categorical_column_with_identity('animal', num_buckets=4))
+    with ops.Graph().as_default():
+      features = {
+          'animal':
+              sparse_tensor.SparseTensor(
+                  indices=[[0, 0], [0, 1]], values=[1, 2], dense_shape=[1, 2])
+      }
+
+      predictions = get_keras_linear_model_predictions(features, [animal])
+      weight_var = get_linear_model_column_var(animal)
+      with _initialized_session():
+        # All should be zero-initialized.
+        self.assertAllClose([[0.], [0.], [0.], [0.]], weight_var.eval())
+        self.assertAllClose([[0.]], predictions.eval())
+        weight_var.assign([[1.], [2.], [3.], [4.]]).eval()
+        self.assertAllClose([[2. + 3.]], predictions.eval())
+
   def test_input_layer(self):
     animal = fc.indicator_column(
         fc.categorical_column_with_identity('animal', num_buckets=4))
@@ -3725,6 +4667,72 @@ class EmbeddingColumnTest(test.TestCase):
             'aaa': sparse_input
         }))
 
+    # Assert expected embedding variable and lookups.
+    global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
+    self.assertItemsEqual(('embedding_weights:0',),
+                          tuple([v.name for v in global_vars]))
+    with _initialized_session():
+      self.assertAllEqual(embedding_values, global_vars[0].eval())
+      self.assertAllEqual(expected_lookups, embedding_lookup.eval())
+
+  def test_get_dense_tensor_with_state(self):
+    # Inputs.
+    vocabulary_size = 3
+    sparse_input = sparse_tensor.SparseTensorValue(
+        # example 0, ids [2]
+        # example 1, ids [0, 1]
+        # example 2, ids []
+        # example 3, ids [1]
+        indices=((0, 0), (1, 0), (1, 4), (3, 0)),
+        values=(2, 0, 1, 1),
+        dense_shape=(4, 5))
+
+    # Embedding variable.
+    embedding_dimension = 2
+    embedding_values = (
+        (1., 2.),  # id 0
+        (3., 5.),  # id 1
+        (7., 11.)  # id 2
+    )
+
+    def _initializer(shape, dtype, partition_info):
+      self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
+      self.assertEqual(dtypes.float32, dtype)
+      self.assertIsNone(partition_info)
+      return embedding_values
+
+    # Expected lookup result, using combiner='mean'.
+    expected_lookups = (
+        # example 0, ids [2], embedding = [7, 11]
+        (7., 11.),
+        # example 1, ids [0, 1], embedding = mean([1, 2] + [3, 5]) = [2, 3.5]
+        (2., 3.5),
+        # example 2, ids [], embedding = [0, 0]
+        (0., 0.),
+        # example 3, ids [1], embedding = [3, 5]
+        (3., 5.),
+    )
+
+    # Build columns.
+    categorical_column = fc.categorical_column_with_identity(
+        key='aaa', num_buckets=vocabulary_size)
+    embedding_column = fc.embedding_column(
+        categorical_column,
+        dimension=embedding_dimension,
+        initializer=_initializer)
+
+    # Create embedding_weights variable.
+    weight_collections = [
+        ops.GraphKeys.GLOBAL_VARIABLES, ops.GraphKeys.MODEL_VARIABLES
+    ]
+    state = embedding_column._create_state(weight_collections)
+
+    # Provide sparse input and get dense result.
+    embedding_lookup = embedding_column._get_dense_tensor(
+        _LazyBuilder({
+            'aaa': sparse_input
+        }), state=state)
+
     # Assert expected embedding variable and lookups.
     global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
     self.assertItemsEqual(
@@ -3940,15 +4948,90 @@ class EmbeddingColumnTest(test.TestCase):
             'aaa': sparse_input
         }))
 
-    # Assert expected embedding variable and lookups.
-    global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
-    self.assertItemsEqual(
-        ('embedding_weights:0',), tuple([v.name for v in global_vars]))
-    with _initialized_session():
-      self.assertAllEqual(embedding_values, global_vars[0].eval())
-      self.assertAllEqual(expected_lookups, embedding_lookup.eval())
+    # Assert expected embedding variable and lookups.
+    global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
+    self.assertItemsEqual(
+        ('embedding_weights:0',), tuple([v.name for v in global_vars]))
+    with _initialized_session():
+      self.assertAllEqual(embedding_values, global_vars[0].eval())
+      self.assertAllEqual(expected_lookups, embedding_lookup.eval())
+
+  def test_linear_model(self):
+    # Inputs.
+    batch_size = 4
+    vocabulary_size = 3
+    sparse_input = sparse_tensor.SparseTensorValue(
+        # example 0, ids [2]
+        # example 1, ids [0, 1]
+        # example 2, ids []
+        # example 3, ids [1]
+        indices=((0, 0), (1, 0), (1, 4), (3, 0)),
+        values=(2, 0, 1, 1),
+        dense_shape=(batch_size, 5))
+
+    # Embedding variable.
+    embedding_dimension = 2
+    embedding_shape = (vocabulary_size, embedding_dimension)
+    zeros_embedding_values = np.zeros(embedding_shape)
+    def _initializer(shape, dtype, partition_info):
+      self.assertAllEqual(embedding_shape, shape)
+      self.assertEqual(dtypes.float32, dtype)
+      self.assertIsNone(partition_info)
+      return zeros_embedding_values
+
+    # Build columns.
+    categorical_column = fc.categorical_column_with_identity(
+        key='aaa', num_buckets=vocabulary_size)
+    embedding_column = fc.embedding_column(
+        categorical_column, dimension=embedding_dimension,
+        initializer=_initializer)
+
+    with ops.Graph().as_default():
+      predictions = fc.linear_model({
+          categorical_column.name: sparse_input
+      }, (embedding_column,))
+      expected_var_names = (
+          'linear_model/bias_weights:0',
+          'linear_model/aaa_embedding/weights:0',
+          'linear_model/aaa_embedding/embedding_weights:0',
+      )
+      self.assertItemsEqual(
+          expected_var_names,
+          [v.name for v in ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)])
+      trainable_vars = {
+          v.name: v for v in ops.get_collection(
+              ops.GraphKeys.TRAINABLE_VARIABLES)
+      }
+      self.assertItemsEqual(expected_var_names, trainable_vars.keys())
+      bias = trainable_vars['linear_model/bias_weights:0']
+      embedding_weights = trainable_vars[
+          'linear_model/aaa_embedding/embedding_weights:0']
+      linear_weights = trainable_vars[
+          'linear_model/aaa_embedding/weights:0']
+      with _initialized_session():
+        # Predictions with all zero weights.
+        self.assertAllClose(np.zeros((1,)), bias.eval())
+        self.assertAllClose(zeros_embedding_values, embedding_weights.eval())
+        self.assertAllClose(
+            np.zeros((embedding_dimension, 1)), linear_weights.eval())
+        self.assertAllClose(np.zeros((batch_size, 1)), predictions.eval())
+
+        # Predictions with all non-zero weights.
+        embedding_weights.assign((
+            (1., 2.),  # id 0
+            (3., 5.),  # id 1
+            (7., 11.)  # id 2
+        )).eval()
+        linear_weights.assign(((4.,), (6.,))).eval()
+        # example 0, ids [2], embedding[0] = [7, 11]
+        # example 1, ids [0, 1], embedding[1] = mean([1, 2] + [3, 5]) = [2, 3.5]
+        # example 2, ids [], embedding[2] = [0, 0]
+        # example 3, ids [1], embedding[3] = [3, 5]
+        # sum(embeddings * linear_weights)
+        # = [4*7 + 6*11, 4*2 + 6*3.5, 4*0 + 6*0, 4*3 + 6*5] = [94, 29, 0, 42]
+        self.assertAllClose(((94.,), (29.,), (0.,), (42.,)), predictions.eval())
 
-  def test_linear_model(self):
+  def test_keras_linear_model(self):
     # Inputs.
     batch_size = 4
     vocabulary_size = 3
@@ -3965,6 +5048,7 @@ class EmbeddingColumnTest(test.TestCase):
     embedding_dimension = 2
     embedding_shape = (vocabulary_size, embedding_dimension)
     zeros_embedding_values = np.zeros(embedding_shape)
+
     def _initializer(shape, dtype, partition_info):
       self.assertAllEqual(embedding_shape, shape)
       self.assertEqual(dtypes.float32, dtype)
@@ -3975,15 +5059,16 @@ class EmbeddingColumnTest(test.TestCase):
     categorical_column = fc.categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
     embedding_column = fc.embedding_column(
-        categorical_column, dimension=embedding_dimension,
+        categorical_column,
+        dimension=embedding_dimension,
         initializer=_initializer)
 
     with ops.Graph().as_default():
-      predictions = fc.linear_model({
+      predictions = get_keras_linear_model_predictions({
           categorical_column.name: sparse_input
       }, (embedding_column,))
       expected_var_names = (
-          'linear_model/bias_weights:0',
+          'linear_model/bias_layer/bias_weights:0',
           'linear_model/aaa_embedding/weights:0',
           'linear_model/aaa_embedding/embedding_weights:0',
       )
@@ -3991,15 +5076,14 @@ class EmbeddingColumnTest(test.TestCase):
           expected_var_names,
           [v.name for v in ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)])
       trainable_vars = {
-          v.name: v for v in ops.get_collection(
-              ops.GraphKeys.TRAINABLE_VARIABLES)
+          v.name: v
+          for v in ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
       }
       self.assertItemsEqual(expected_var_names, trainable_vars.keys())
-      bias = trainable_vars['linear_model/bias_weights:0']
+      bias = trainable_vars['linear_model/bias_layer/bias_weights:0']
       embedding_weights = trainable_vars[
           'linear_model/aaa_embedding/embedding_weights:0']
-      linear_weights = trainable_vars[
-          'linear_model/aaa_embedding/weights:0']
+      linear_weights = trainable_vars['linear_model/aaa_embedding/weights:0']
       with _initialized_session():
         # Predictions with all zero weights.
         self.assertAllClose(np.zeros((1,)), bias.eval())
@@ -4443,6 +5527,80 @@ class SharedEmbeddingColumnTest(test.TestCase):
     embedding_lookup_b = embedding_column_b._get_dense_tensor(
         _LazyBuilder(input_features))
 
+    # Assert expected embedding variable and lookups.
+    global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
+    self.assertItemsEqual(('embedding_weights:0',),
+                          tuple([v.name for v in global_vars]))
+    embedding_var = global_vars[0]
+    with _initialized_session():
+      self.assertAllEqual(embedding_values, embedding_var.eval())
+      self.assertAllEqual(expected_lookups_a, embedding_lookup_a.eval())
+      self.assertAllEqual(expected_lookups_b, embedding_lookup_b.eval())
+
+  def test_get_dense_tensor_with_state(self):
+    # Inputs.
+    vocabulary_size = 3
+    # -1 values are ignored.
+    input_a = np.array([
+        [2, -1, -1],  # example 0, ids [2]
+        [0, 1, -1]
+    ])  # example 1, ids [0, 1]
+    input_b = np.array([
+        [0, -1, -1],  # example 0, ids [0]
+        [-1, -1, -1]
+    ])  # example 1, ids []
+    input_features = {'aaa': input_a, 'bbb': input_b}
+
+    # Embedding variable.
+    embedding_dimension = 2
+    embedding_values = (
+        (1., 2.),  # id 0
+        (3., 5.),  # id 1
+        (7., 11.)  # id 2
+    )
+
+    def _initializer(shape, dtype, partition_info):
+      self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
+      self.assertEqual(dtypes.float32, dtype)
+      self.assertIsNone(partition_info)
+      return embedding_values
+
+    # Expected lookup result, using combiner='mean'.
+    expected_lookups_a = (
+        # example 0:
+        (7., 11.),  # ids [2], embedding = [7, 11]
+        # example 1:
+        (2., 3.5),  # ids [0, 1], embedding = mean([1, 2] + [3, 5]) = [2, 3.5]
+    )
+    expected_lookups_b = (
+        # example 0:
+        (1., 2.),  # ids [0], embedding = [1, 2]
+        # example 1:
+        (0., 0.),  # ids [], embedding = [0, 0]
+    )
+
+    # Build columns.
+    categorical_column_a = fc.categorical_column_with_identity(
+        key='aaa', num_buckets=vocabulary_size)
+    categorical_column_b = fc.categorical_column_with_identity(
+        key='bbb', num_buckets=vocabulary_size)
+    embedding_column_a, embedding_column_b = fc.shared_embedding_columns(
+        [categorical_column_a, categorical_column_b],
+        dimension=embedding_dimension,
+        initializer=_initializer)
+
+    # Create state.
+    weight_collections = [
+        ops.GraphKeys.GLOBAL_VARIABLES, ops.GraphKeys.MODEL_VARIABLES
+    ]
+    state = embedding_column_a._create_state(weight_collections)
+
+    # Provide sparse input and get dense result.
+    embedding_lookup_a = embedding_column_a._get_dense_tensor(
+        _LazyBuilder(input_features), state=state)
+    embedding_lookup_b = embedding_column_b._get_dense_tensor(
+        _LazyBuilder(input_features), state=state)
+
     # Assert expected embedding variable and lookups.
     global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
     self.assertItemsEqual(
@@ -4595,6 +5753,97 @@ class SharedEmbeddingColumnTest(test.TestCase):
         # = [3*1 + 5*2, 3*0 +5*0] = [13, 0]
         self.assertAllClose([[94. + 13.], [29.]], predictions.eval())
 
+  def test_keras_linear_model(self):
+    # Inputs.
+    batch_size = 2
+    vocabulary_size = 3
+    # -1 values are ignored.
+    input_a = np.array([
+        [2, -1, -1],  # example 0, ids [2]
+        [0, 1, -1]
+    ])  # example 1, ids [0, 1]
+    input_b = np.array([
+        [0, -1, -1],  # example 0, ids [0]
+        [-1, -1, -1]
+    ])  # example 1, ids []
+
+    # Embedding variable.
+    embedding_dimension = 2
+    embedding_shape = (vocabulary_size, embedding_dimension)
+    zeros_embedding_values = np.zeros(embedding_shape)
+
+    def _initializer(shape, dtype, partition_info):
+      self.assertAllEqual(embedding_shape, shape)
+      self.assertEqual(dtypes.float32, dtype)
+      self.assertIsNone(partition_info)
+      return zeros_embedding_values
+
+    # Build columns.
+    categorical_column_a = fc.categorical_column_with_identity(
+        key='aaa', num_buckets=vocabulary_size)
+    categorical_column_b = fc.categorical_column_with_identity(
+        key='bbb', num_buckets=vocabulary_size)
+    embedding_column_a, embedding_column_b = fc.shared_embedding_columns(
+        [categorical_column_a, categorical_column_b],
+        dimension=embedding_dimension,
+        initializer=_initializer)
+
+    with ops.Graph().as_default():
+      predictions = get_keras_linear_model_predictions({
+          categorical_column_a.name: input_a,
+          categorical_column_b.name: input_b,
+      }, (embedding_column_a, embedding_column_b))
+      # Linear weights do not follow the column name. But this is a rare use
+      # case, and fixing it would add too much complexity to the code.
+      expected_var_names = (
+          'linear_model/bias_layer/bias_weights:0',
+          'linear_model/aaa_bbb_shared_embedding/weights:0',
+          'linear_model/aaa_bbb_shared_embedding/embedding_weights:0',
+          'linear_model/aaa_bbb_shared_embedding_1/weights:0',
+      )
+      self.assertItemsEqual(
+          expected_var_names,
+          [v.name for v in ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)])
+      trainable_vars = {
+          v.name: v
+          for v in ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
+      }
+      self.assertItemsEqual(expected_var_names, trainable_vars.keys())
+      bias = trainable_vars['linear_model/bias_layer/bias_weights:0']
+      embedding_weights = trainable_vars[
+          'linear_model/aaa_bbb_shared_embedding/embedding_weights:0']
+      linear_weights_a = trainable_vars[
+          'linear_model/aaa_bbb_shared_embedding/weights:0']
+      linear_weights_b = trainable_vars[
+          'linear_model/aaa_bbb_shared_embedding_1/weights:0']
+      with _initialized_session():
+        # Predictions with all zero weights.
+        self.assertAllClose(np.zeros((1,)), bias.eval())
+        self.assertAllClose(zeros_embedding_values, embedding_weights.eval())
+        self.assertAllClose(
+            np.zeros((embedding_dimension, 1)), linear_weights_a.eval())
+        self.assertAllClose(
+            np.zeros((embedding_dimension, 1)), linear_weights_b.eval())
+        self.assertAllClose(np.zeros((batch_size, 1)), predictions.eval())
+
+        # Predictions with all non-zero weights.
+        embedding_weights.assign((
+            (1., 2.),  # id 0
+            (3., 5.),  # id 1
+            (7., 11.)  # id 2
+        )).eval()
+        linear_weights_a.assign(((4.,), (6.,))).eval()
+        # example 0, ids [2], embedding[0] = [7, 11]
+        # example 1, ids [0, 1], embedding[1] = mean([1, 2] + [3, 5]) = [2, 3.5]
+        # sum(embeddings * linear_weights)
+        # = [4*7 + 6*11, 4*2 + 6*3.5] = [94, 29]
+        linear_weights_b.assign(((3.,), (5.,))).eval()
+        # example 0, ids [0], embedding[0] = [1, 2]
+        # example 1, ids [], embedding[1] = 0, 0]
+        # sum(embeddings * linear_weights)
+        # = [3*1 + 5*2, 3*0 +5*0] = [13, 0]
+        self.assertAllClose([[94. + 13.], [29.]], predictions.eval())
+
   def _test_input_layer(self, trainable=True):
     # Inputs.
     vocabulary_size = 3
@@ -4880,6 +6129,103 @@ class WeightedCategoricalColumnTest(test.TestCase):
               dense_shape=(2, 2)),
           weight_tensor.eval())
 
+  def test_keras_linear_model(self):
+    column = fc.weighted_categorical_column(
+        categorical_column=fc.categorical_column_with_identity(
+            key='ids', num_buckets=3),
+        weight_feature_key='values')
+    with ops.Graph().as_default():
+      predictions = get_keras_linear_model_predictions({
+          'ids':
+              sparse_tensor.SparseTensorValue(
+                  indices=((0, 0), (1, 0), (1, 1)),
+                  values=(0, 2, 1),
+                  dense_shape=(2, 2)),
+          'values':
+              sparse_tensor.SparseTensorValue(
+                  indices=((0, 0), (1, 0), (1, 1)),
+                  values=(.5, 1., .1),
+                  dense_shape=(2, 2))
+      }, (column,))
+      bias = get_keras_linear_model_bias()
+      weight_var = get_linear_model_column_var(column)
+      with _initialized_session():
+        self.assertAllClose((0.,), bias.eval())
+        self.assertAllClose(((0.,), (0.,), (0.,)), weight_var.eval())
+        self.assertAllClose(((0.,), (0.,)), predictions.eval())
+        weight_var.assign(((1.,), (2.,), (3.,))).eval()
+        # weight_var[0] * weights[0, 0] = 1 * .5 = .5
+        # weight_var[2] * weights[1, 0] + weight_var[1] * weights[1, 1]
+        # = 3*1 + 2*.1 = 3+.2 = 3.2
+        self.assertAllClose(((.5,), (3.2,)), predictions.eval())
+
+  def test_keras_linear_model_mismatched_shape(self):
+    column = fc.weighted_categorical_column(
+        categorical_column=fc.categorical_column_with_identity(
+            key='ids', num_buckets=3),
+        weight_feature_key='values')
+    with ops.Graph().as_default():
+      with self.assertRaisesRegexp(ValueError,
+                                   r'Dimensions.*are not compatible'):
+        get_keras_linear_model_predictions({
+            'ids':
+                sparse_tensor.SparseTensorValue(
+                    indices=((0, 0), (1, 0), (1, 1)),
+                    values=(0, 2, 1),
+                    dense_shape=(2, 2)),
+            'values':
+                sparse_tensor.SparseTensorValue(
+                    indices=((0, 0), (0, 1), (1, 0), (1, 1)),
+                    values=(.5, 11., 1., .1),
+                    dense_shape=(2, 2))
+        }, (column,))
+
+  def test_keras_linear_model_mismatched_dense_values(self):
+    column = fc.weighted_categorical_column(
+        categorical_column=fc.categorical_column_with_identity(
+            key='ids', num_buckets=3),
+        weight_feature_key='values')
+    with ops.Graph().as_default():
+      predictions = get_keras_linear_model_predictions(
+          {
+              'ids':
+                  sparse_tensor.SparseTensorValue(
+                      indices=((0, 0), (1, 0), (1, 1)),
+                      values=(0, 2, 1),
+                      dense_shape=(2, 2)),
+              'values': ((.5,), (1.,))
+          }, (column,),
+          sparse_combiner='mean')
+      with _initialized_session():
+        with self.assertRaisesRegexp(errors.OpError, 'Incompatible shapes'):
+          predictions.eval()
+
+  def test_keras_linear_model_mismatched_dense_shape(self):
+    column = fc.weighted_categorical_column(
+        categorical_column=fc.categorical_column_with_identity(
+            key='ids', num_buckets=3),
+        weight_feature_key='values')
+    with ops.Graph().as_default():
+      predictions = get_keras_linear_model_predictions({
+          'ids':
+              sparse_tensor.SparseTensorValue(
+                  indices=((0, 0), (1, 0), (1, 1)),
+                  values=(0, 2, 1),
+                  dense_shape=(2, 2)),
+          'values': ((.5,), (1.,), (.1,))
+      }, (column,))
+      bias = get_keras_linear_model_bias()
+      weight_var = get_linear_model_column_var(column)
+      with _initialized_session():
+        self.assertAllClose((0.,), bias.eval())
+        self.assertAllClose(((0.,), (0.,), (0.,)), weight_var.eval())
+        self.assertAllClose(((0.,), (0.,)), predictions.eval())
+        weight_var.assign(((1.,), (2.,), (3.,))).eval()
+        # weight_var[0] * weights[0, 0] = 1 * .5 = .5
+        # weight_var[2] * weights[1, 0] + weight_var[1] * weights[1, 1]
+        # = 3*1 + 2*.1 = 3+.2 = 3.2
+        self.assertAllClose(((.5,), (3.2,)), predictions.eval())
+
   def test_linear_model(self):
     column = fc.weighted_categorical_column(
         categorical_column=fc.categorical_column_with_identity(
@@ -4933,13 +6279,16 @@ class WeightedCategoricalColumnTest(test.TestCase):
             key='ids', num_buckets=3),
         weight_feature_key='values')
     with ops.Graph().as_default():
-      predictions = fc.linear_model({
-          'ids': sparse_tensor.SparseTensorValue(
-              indices=((0, 0), (1, 0), (1, 1)),
-              values=(0, 2, 1),
-              dense_shape=(2, 2)),
-          'values': ((.5,), (1.,))
-      }, (column,))
+      predictions = fc.linear_model(
+          {
+              'ids':
+                  sparse_tensor.SparseTensorValue(
+                      indices=((0, 0), (1, 0), (1, 1)),
+                      values=(0, 2, 1),
+                      dense_shape=(2, 2)),
+              'values': ((.5,), (1.,))
+          }, (column,),
+          sparse_combiner='mean')
       with _initialized_session():
         with self.assertRaisesRegexp(errors.OpError, 'Incompatible shapes'):
           predictions.eval()
diff --git a/tensorflow/python/framework/c_api_util.py b/tensorflow/python/framework/c_api_util.py
index 6c522de452b59ea9a200ccf89cfb428a26970db1..7bbe3183dfa376776dde8412a3270b1684a02391 100644
--- a/tensorflow/python/framework/c_api_util.py
+++ b/tensorflow/python/framework/c_api_util.py
@@ -33,7 +33,7 @@ class ScopedTFStatus(object):
   def __del__(self):
     # Note: when we're destructing the global context (i.e when the process is
     # terminating) we can have already deleted other modules.
-    if c_api.TF_DeleteStatus is not None:
+    if c_api is not None and c_api.TF_DeleteStatus is not None:
       c_api.TF_DeleteStatus(self.status)
 
 
@@ -46,7 +46,7 @@ class ScopedTFGraph(object):
   def __del__(self):
     # Note: when we're destructing the global context (i.e when the process is
     # terminating) we can have already deleted other modules.
-    if c_api.TF_DeleteGraph is not None:
+    if c_api is not None and c_api.TF_DeleteGraph is not None:
       c_api.TF_DeleteGraph(self.graph)
 
 
@@ -59,10 +59,36 @@ class ScopedTFImportGraphDefOptions(object):
   def __del__(self):
     # Note: when we're destructing the global context (i.e when the process is
     # terminating) we can have already deleted other modules.
-    if c_api.TF_DeleteImportGraphDefOptions is not None:
+    if c_api is not None and c_api.TF_DeleteImportGraphDefOptions is not None:
       c_api.TF_DeleteImportGraphDefOptions(self.options)
 
 
+class ScopedTFImportGraphDefResults(object):
+  """Wrapper around TF_ImportGraphDefOptions that handles deletion."""
+
+  def __init__(self, results):
+    self.results = results
+
+  def __del__(self):
+    # Note: when we're destructing the global context (i.e when the process is
+    # terminating) we can have already deleted other modules.
+    if c_api is not None and c_api.TF_DeleteImportGraphDefResults is not None:
+      c_api.TF_DeleteImportGraphDefResults(self.results)
+
+
+class ScopedTFFunction(object):
+  """Wrapper around TF_Function that handles deletion."""
+
+  def __init__(self, func):
+    self.func = func
+
+  def __del__(self):
+    # Note: when we're destructing the global context (i.e when the process is
+    # terminating) we can have already deleted other modules.
+    if c_api is not None and c_api.TF_DeleteFunction is not None:
+      c_api.TF_DeleteFunction(self.func)
+
+
 @tf_contextlib.contextmanager
 def tf_buffer(data=None):
   """Context manager that creates and deletes TF_Buffer.
diff --git a/tensorflow/python/framework/dtypes.py b/tensorflow/python/framework/dtypes.py
index 0edae92fd4a86e7d10a180ce64364d3ea552bf60..6d918f8b891375bd9aaf7a8743952b4c4a1ebdd0 100644
--- a/tensorflow/python/framework/dtypes.py
+++ b/tensorflow/python/framework/dtypes.py
@@ -345,7 +345,7 @@ tf_export("uint16").export_constant(__name__, "uint16")
 uint32 = DType(types_pb2.DT_UINT32)
 tf_export("uint32").export_constant(__name__, "uint32")
 uint64 = DType(types_pb2.DT_UINT64)
-tf_export("uint64").export_constant(__name__, "uint32")
+tf_export("uint64").export_constant(__name__, "uint64")
 int16 = DType(types_pb2.DT_INT16)
 tf_export("int16").export_constant(__name__, "int16")
 int8 = DType(types_pb2.DT_INT8)
@@ -648,6 +648,10 @@ QUANTIZED_DTYPES = frozenset([
 ])
 tf_export("QUANTIZED_DTYPES").export_constant(__name__, "QUANTIZED_DTYPES")
 
+_PYTHON_TO_TF = {
+    float: float32,
+    bool: bool,
+}
 
 @tf_export("as_dtype")
 def as_dtype(type_value):
@@ -679,6 +683,11 @@ def as_dtype(type_value):
   except KeyError:
     pass
 
+  try:
+    return _PYTHON_TO_TF[type_value]
+  except KeyError:
+    pass
+
   if isinstance(type_value, np.dtype):
     # The numpy dtype for strings is variable length. We can not compare
     # dtype with a single constant (np.string does not exist) to decide
diff --git a/tensorflow/python/framework/dtypes_test.py b/tensorflow/python/framework/dtypes_test.py
index e49e2fda5d84da4f8f87fae73874351afe0a20f2..478733e38921a5dbbe6ab7ffb363a814156839a4 100644
--- a/tensorflow/python/framework/dtypes_test.py
+++ b/tensorflow/python/framework/dtypes_test.py
@@ -295,6 +295,9 @@ class TypesTest(test_util.TensorFlowTestCase):
     self.assertNotEqual(dtypes.int32, int)
     self.assertNotEqual(dtypes.float64, 2.1)
 
+  def testPythonTypesConversion(self):
+    self.assertIs(dtypes.float32, dtypes.as_dtype(float))
+    self.assertIs(dtypes.bool, dtypes.as_dtype(bool))
 
 if __name__ == "__main__":
   googletest.main()
diff --git a/tensorflow/python/framework/errors_impl.py b/tensorflow/python/framework/errors_impl.py
index 2a40316d51c023df9c664d0dd79a0df3b2ac5041..84106c32c673e15832ff747a7fededdfbfb94ed8 100644
--- a/tensorflow/python/framework/errors_impl.py
+++ b/tensorflow/python/framework/errors_impl.py
@@ -473,6 +473,8 @@ _CODE_TO_EXCEPTION_CLASS = {
     DATA_LOSS: DataLossError,
 }
 
+c_api.PyExceptionRegistry_Init(_CODE_TO_EXCEPTION_CLASS)
+
 _EXCEPTION_CLASS_TO_CODE = dict((
     (class_, code) for (code, class_) in _CODE_TO_EXCEPTION_CLASS.items()))
 
@@ -499,6 +501,7 @@ def _make_specific_exception(node_def, op, message, error_code):
 # Named like a function for backwards compatibility with the
 # @tf_contextlib.contextmanager version, which was switched to a class to avoid
 # some object creation overhead.
+# TODO(b/77295559): expand use of TF_Status* SWIG typemap and deprecate this.
 @tf_export("errors.raise_exception_on_not_ok_status")  # pylint: disable=invalid-name
 class raise_exception_on_not_ok_status(object):
   """Context manager to check for C API status."""
diff --git a/tensorflow/python/framework/function.py b/tensorflow/python/framework/function.py
index 14d72d8a3de7e22bee4f9961c2f66044c217f641..9570f009a5c458cb904968cc7990270b30da91a1 100644
--- a/tensorflow/python/framework/function.py
+++ b/tensorflow/python/framework/function.py
@@ -30,7 +30,6 @@ from tensorflow.python import pywrap_tensorflow as c_api
 from tensorflow.python.eager import context
 from tensorflow.python.framework import c_api_util
 from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
 from tensorflow.python.framework import graph_to_function_def
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
@@ -275,8 +274,7 @@ class _DefinedFunction(object):
     self._create_definition_if_needed()
     if self._c_func:
       with c_api_util.tf_buffer() as buf:
-        with errors.raise_exception_on_not_ok_status() as status:
-          c_api.TF_FunctionToFunctionDef(self._c_func, buf, status)
+        c_api.TF_FunctionToFunctionDef(self._c_func.func, buf)
         fdef = function_pb2.FunctionDef()
         proto_data = c_api.TF_GetBuffer(buf)
         fdef.ParseFromString(compat.as_bytes(proto_data))
@@ -399,18 +397,17 @@ class _DefinedFunction(object):
                       if self._out_names else [])
       description = self._func.__doc__ or None
       # pylint: disable=protected-access
-      with errors.raise_exception_on_not_ok_status() as status:
-        self._c_func = c_api.TF_GraphToFunction_wrapper(
-            temp_graph._c_graph,
-            base_func_name,
-            self._func_name is None,  # append_hash_to_fn_name
-            None,  # opers
-            [t._as_tf_output() for t in inputs],
-            [t._as_tf_output() for t in outputs],
-            output_names,
-            None,  # opts
-            description,
-            status)
+      c_func = c_api.TF_GraphToFunction_wrapper(
+          temp_graph._c_graph,
+          base_func_name,
+          self._func_name is None,  # append_hash_to_fn_name
+          None,  # opers
+          [t._as_tf_output() for t in inputs],
+          [t._as_tf_output() for t in outputs],
+          output_names,
+          None,  # opts
+          description)
+      self._c_func = c_api_util.ScopedTFFunction(c_func)
       # pylint: enable=protected-access
       self._set_c_attrs(kwargs_attr)
 
@@ -433,9 +430,8 @@ class _DefinedFunction(object):
       serialized = attr_value.SerializeToString()
       # TODO(skyewm): this creates and deletes a new TF_Status for every attr.
       # It might be worth creating a convenient way to re-use the same status.
-      with errors.raise_exception_on_not_ok_status() as status:
-        c_api.TF_FunctionSetAttrValueProto(self._c_func, compat.as_str(name),
-                                           serialized, status)
+      c_api.TF_FunctionSetAttrValueProto(self._c_func.func, compat.as_str(name),
+                                         serialized)
 
   def _create_hash_str(self, input_arg, output_arg, node_def):
     """Creates an 8-character string unique to this input.
@@ -830,8 +826,8 @@ def _from_definition(fdef, grad_func=None):
   # pylint: disable=protected-access
   if ops._USE_C_API:
     serialized = fdef.SerializeToString()
-    with errors.raise_exception_on_not_ok_status() as status:
-      result._c_func = c_api.TF_FunctionImportFunctionDef(serialized, status)
+    c_func = c_api.TF_FunctionImportFunctionDef(serialized)
+    result._c_func = c_api_util.ScopedTFFunction(c_func)
     result._extra_inputs = []
   else:
     result._definition = fdef
@@ -934,6 +930,12 @@ def _parse_kwargs_as_attrs(func_name, **kwargs):
           s=("function_%s" % func_name).encode())
     # pylint: enable=protected-access
 
+  kwargs_keys = list(kwargs.keys())
+  for key in kwargs_keys:
+    if key.startswith("experimental_"):
+      attrs[key] = attr_value_pb2.AttrValue(s=compat.as_bytes(kwargs[key]))
+      del kwargs[key]
+
   if kwargs:
     raise ValueError("Unknown keyword arguments: %s" % kwargs.keys())
   return attrs
diff --git a/tensorflow/python/framework/function_test.py b/tensorflow/python/framework/function_test.py
index 65ca801cbe922b36e3bc72bc2fbcd88f66aa5290..d6bc14fbc75199a97f50c4dc120b2704970d1879 100644
--- a/tensorflow/python/framework/function_test.py
+++ b/tensorflow/python/framework/function_test.py
@@ -37,7 +37,6 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import clip_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import functional_ops
 from tensorflow.python.ops import gen_logging_ops
@@ -58,12 +57,32 @@ def _OptimizerOptions():
   for cse in [False, True]:
     for inline in [False, True]:
       for cfold in [False, True]:
-        yield config_pb2.ConfigProto(graph_options=config_pb2.GraphOptions(
-            optimizer_options=config_pb2.OptimizerOptions(
-                opt_level=config_pb2.OptimizerOptions.L0,
-                do_common_subexpression_elimination=cse,
-                do_function_inlining=inline,
-                do_constant_folding=cfold)))
+        cfg = config_pb2.ConfigProto(
+            graph_options=config_pb2.GraphOptions(
+                optimizer_options=config_pb2.OptimizerOptions(
+                    opt_level=config_pb2.OptimizerOptions.L0,
+                    do_common_subexpression_elimination=cse,
+                    do_function_inlining=inline,
+                    do_constant_folding=cfold)))
+        if cse:
+          cfg.graph_options.rewrite_options.arithmetic_optimization = (
+              rewriter_config_pb2.RewriterConfig.ON)
+        else:
+          cfg.graph_options.rewrite_options.arithmetic_optimization = (
+              rewriter_config_pb2.RewriterConfig.OFF)
+        if inline:
+          cfg.graph_options.rewrite_options.function_optimization = (
+              rewriter_config_pb2.RewriterConfig.ON)
+        else:
+          cfg.graph_options.rewrite_options.function_optimization = (
+              rewriter_config_pb2.RewriterConfig.OFF)
+        if cfold:
+          cfg.graph_options.rewrite_options.constant_folding = (
+              rewriter_config_pb2.RewriterConfig.ON)
+        else:
+          cfg.graph_options.rewrite_options.constant_folding = (
+              rewriter_config_pb2.RewriterConfig.OFF)
+        yield cfg
 
 
 @test_util.with_c_api
@@ -1227,6 +1246,15 @@ class FunctionsFromProtos(test.TestCase):
         ValueError, "FunctionDefLibrary contains cyclic gradient functions!"):
       function._from_library(library)
 
+  def testExperimentalAttrs(self):
+
+    @function.Defun(dtypes.int32, experimental_tag="tag_value")
+    def FunctionWithAttr(i):
+      return array_ops.identity(i)
+    self.assertTrue("experimental_tag" in FunctionWithAttr.definition.attr)
+    self.assertEqual(
+        FunctionWithAttr.definition.attr["experimental_tag"].s, b"tag_value")
+
 
 @test_util.with_c_api
 class FunctionOverloadTest(test.TestCase):
@@ -1333,7 +1361,7 @@ class UnrollLSTMTest(test.TestCase):
         value=math_ops.matmul(xm, weights), num_or_size_splits=4, axis=1)
     new_c = math_ops.sigmoid(f_g) * cprev + math_ops.sigmoid(
         i_g) * math_ops.tanh(i_i)
-    new_c = clip_ops.clip_by_value(new_c, -50.0, 50.0)
+    new_c = math_ops.maximum(math_ops.minimum(new_c, 50.0), -50.0)
     new_m = math_ops.sigmoid(o_g) * math_ops.tanh(new_c)
     return new_m, new_c
 
diff --git a/tensorflow/python/framework/graph_util_impl.py b/tensorflow/python/framework/graph_util_impl.py
index 910364364c8be84b1a629dbdaae5e69443d07e75..394fac6c856197030f85aab5b11fa881eddf670d 100644
--- a/tensorflow/python/framework/graph_util_impl.py
+++ b/tensorflow/python/framework/graph_util_impl.py
@@ -285,7 +285,7 @@ def convert_variables_to_constants(sess,
     output_graph_def.node.extend([output_node])
 
   output_graph_def.library.CopyFrom(inference_graph.library)
-  print("Converted %d variables to const ops." % how_many_converted)
+  logging.info("Converted %d variables to const ops.", how_many_converted)
   return output_graph_def
 
 
diff --git a/tensorflow/python/framework/importer.py b/tensorflow/python/framework/importer.py
index 4ea34d7bb2831845aec1f40fcdb7f64a8f8c438a..3f8a8c4befb63abf14a70b48833d7e6e400e5c51 100644
--- a/tensorflow/python/framework/importer.py
+++ b/tensorflow/python/framework/importer.py
@@ -485,9 +485,9 @@ def import_graph_def(graph_def,
     with graph._lock:  # pylint: disable=protected-access
       with c_api_util.tf_buffer(graph_def.SerializeToString()) as serialized:
         try:
-          with errors.raise_exception_on_not_ok_status() as status:
-            results = c_api.TF_GraphImportGraphDefWithResults(
-                graph._c_graph, serialized, options, status)  # pylint: disable=protected-access
+          results = c_api.TF_GraphImportGraphDefWithResults(
+              graph._c_graph, serialized, options)  # pylint: disable=protected-access
+          results = c_api_util.ScopedTFImportGraphDefResults(results)
         except errors.InvalidArgumentError as e:
           # Convert to ValueError for backwards compatibility.
           raise ValueError(str(e))
@@ -516,7 +516,7 @@ def import_graph_def(graph_def,
     # they are likely to be due to a typo.
     missing_unused_input_keys = (
         c_api.TF_ImportGraphDefResultsMissingUnusedInputMappings_wrapper(
-            results))
+            results.results))
     if missing_unused_input_keys:
       missing_unused_input_keys = [
           compat.as_str(s) for s in missing_unused_input_keys
@@ -528,7 +528,7 @@ def import_graph_def(graph_def,
     if return_elements is None:
       return None
     else:
-      return _GatherReturnElements(return_elements, graph, results)
+      return _GatherReturnElements(return_elements, graph, results.results)
 
   else:
     g = graph
@@ -685,11 +685,10 @@ def import_graph_def(graph_def,
                      ', '.join(x.name for x in op._input_types))))
         # pylint: enable=protected-access
 
-        if not g._is_function(op.type):  # pylint: disable=protected-access
-          # Execute shape inference for this op.
-          # NOTE(mrry): If the graph contains a cycle, the full shape
-          # information may not be available for this op's inputs.
-          ops.set_shapes_for_outputs(op)
+        # Execute shape inference for this op.
+        # NOTE(mrry): If the graph contains a cycle, the full shape
+        # information may not be available for this op's inputs.
+        ops.set_shape_and_handle_data_for_outputs(op)
         # For nodes with _output_shapes set, set the output shapes.
         if '_output_shapes' in op.node_def.attr:
           for i, output in enumerate(op.outputs):
diff --git a/tensorflow/python/framework/importer_test.py b/tensorflow/python/framework/importer_test.py
index 6593b1718434fd2035133f65aa08b17774e9e806..2c913d1e028e15e293158fe180e263a78c514ee4 100644
--- a/tensorflow/python/framework/importer_test.py
+++ b/tensorflow/python/framework/importer_test.py
@@ -39,6 +39,7 @@ from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import variables
 import tensorflow.python.ops.nn_grad  # pylint: disable=unused-import
 from tensorflow.python.platform import test
@@ -218,6 +219,23 @@ class ImportGraphDefTest(test.TestCase):
       self.assertEqual(outer_inner.name, "outer/inner_1")
       self.assertEqual(outer_inner_c.name, "outer/inner/c_1")
 
+  def testEmptyNameScope(self):
+    with ops.Graph().as_default():
+      # Create name scope but don't create any ops with it
+      with ops.name_scope("foo"):
+        pass
+
+      # Import graph def that uses name scope name
+      op, = importer.import_graph_def(
+          self._MakeGraphDef("node { name: 'foo' op: 'IntOutput' }"),
+          return_elements=["foo"],
+          name="")
+
+      if ops._USE_C_API:
+        self.assertEqual(op.name, "foo")
+      else:
+        self.assertEqual(op.name, "foo_1")
+
   def testInputMap(self):
     with ops.Graph().as_default():
       feed_a_0 = constant_op.constant(0, dtype=dtypes.int32)
@@ -356,6 +374,39 @@ class ImportGraphDefTest(test.TestCase):
       self.assertEqual(d._input_types, [dtypes.int32_ref, dtypes.int32])
       self.assertEqual(d.outputs, [])
 
+  def testResources(self):
+    # Produce GraphDef containing a ops producing and consuming resources.
+    graph = ops.Graph()
+    with graph.as_default():
+      var = resource_variable_ops.ResourceVariable(1.0)
+      var_assign = var.assign(2.0)
+      # Use an op that requires handle shape to be set.
+      var_shape = resource_variable_ops.variable_shape(var.handle)
+      init = variables.global_variables_initializer()
+    graph_def = graph.as_graph_def()
+
+    # Import the GraphDef.
+    with ops.Graph().as_default():
+      # pylint: disable=unused-variable
+      imported_var, imported_assign, imported_shape, imported_init = (
+          importer.import_graph_def(
+              graph_def,
+              return_elements=[var.name, var_assign.name, var_shape.name,
+                               init.name]))
+
+      # Make sure the handle shape is set on the imported variable.
+      new_var_shape = resource_variable_ops.variable_shape(imported_var)
+      # pylint: enable=unused-variable
+
+      # Run the imported graph.
+      # TODO(b/76173421): make this work (currently DCHECKS)
+      # with self.test_session() as sess:
+      #   sess.run(imported_init)
+      #   self.assertEqual(sess.run(imported_var), 1.0)
+      #   self.assertEqual(sess.run(imported_assign), 2.0)
+      #   self.assertEqual(list(sess.run(imported_shape)), [])
+      #   self.assertEqual(list(sess.run(new_var_shape)), [])
+
   def testWhileLoop(self):
     # Produce GraphDef containing while loop.
     graph = ops.Graph()
diff --git a/tensorflow/python/framework/load_library.py b/tensorflow/python/framework/load_library.py
index 1f2aa264c110930b318f30e3a24010a96ebce47e..535c6017f5fd0f8adf9ed091bd4477762e52b0e3 100644
--- a/tensorflow/python/framework/load_library.py
+++ b/tensorflow/python/framework/load_library.py
@@ -26,7 +26,6 @@ import threading  # pylint: disable=unused-import
 from tensorflow.core.framework import op_def_pb2
 from tensorflow.core.lib.core import error_codes_pb2  # pylint: disable=unused-import
 from tensorflow.python import pywrap_tensorflow as py_tf
-from tensorflow.python.framework import errors_impl
 from tensorflow.python.util import compat
 from tensorflow.python.util.tf_export import tf_export
 
@@ -54,8 +53,7 @@ def load_op_library(library_filename):
   Raises:
     RuntimeError: when unable to load the library or get the python wrappers.
   """
-  with errors_impl.raise_exception_on_not_ok_status() as status:
-    lib_handle = py_tf.TF_LoadLibrary(library_filename, status)
+  lib_handle = py_tf.TF_LoadLibrary(library_filename)
 
   op_list_str = py_tf.TF_GetOpList(lib_handle)
   op_list = op_def_pb2.OpList()
@@ -99,5 +97,4 @@ def load_file_system_library(library_filename):
   Raises:
     RuntimeError: when unable to load the library.
   """
-  with errors_impl.raise_exception_on_not_ok_status() as status:
-    lib_handle = py_tf.TF_LoadLibrary(library_filename, status)
+  py_tf.TF_LoadLibrary(library_filename)
diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py
index 25a951a2de10c0c549b02c686a02415c7ce5b2ec..662cda2a7d4822d92a7d10ac42012bc2675c5eac 100644
--- a/tensorflow/python/framework/ops.py
+++ b/tensorflow/python/framework/ops.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 
 import collections
 import copy
+import functools
 import linecache
 import os
 import re
@@ -42,6 +43,7 @@ from tensorflow.python.eager import context
 from tensorflow.python.eager import core
 from tensorflow.python.eager import tape
 from tensorflow.python.framework import c_api_util
+from tensorflow.python.framework import cpp_shape_inference_pb2
 from tensorflow.python.framework import device as pydev
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
@@ -62,7 +64,7 @@ from tensorflow.python.util.tf_export import tf_export
 # calls to the C API. Currently disabled by default but can be manually enabled
 # in code or via the environment variable. This will be removed once all
 # functionality is supported and there's no performance penalty with it enabled.
-_USE_C_API = os.getenv("TF_C_API_GRAPH_CONSTRUCTION", "0") is not "0"
+_USE_C_API = os.getenv("TF_C_API_GRAPH_CONSTRUCTION", "1") is not "0"
 _USE_C_SHAPES = os.getenv("TF_C_API_GRAPH_CONSTRUCTION_SHAPES", "0") is not "0"
 
 
@@ -288,14 +290,26 @@ class Tensor(_TensorLike):
     self._op = op
     self._value_index = value_index
     self._dtype = dtypes.as_dtype(dtype)
-    self._shape_val = tensor_shape.unknown_shape()
+
+    if _USE_C_API:
+      # This will be set by set_shape_and_handle_data_for_outputs.
+      self._shape_val = None
+    else:
+      # The Python code requires all tensors start with a shape to support shape
+      # inference on imported while loops. This isn't necessary with the C API
+      # enabled because the C API provides the shapes for imported nodes.
+      # TODO(skyewm): remove when _USE_C_API is removed.
+      self._shape_val = tensor_shape.unknown_shape()
+
     # List of operations that use this Tensor as input.  We maintain this list
     # to easily navigate a computation graph.
     self._consumers = []
 
-    # Attributes used for C++ shape inference. Not inspected, only forwarded.
-    # If set, will be a HandleData object from cpp_shape_inference.proto.
-    self._handle_data = None
+    if not _USE_C_SHAPES:
+      # Attributes used for C++ shape inference. Not inspected, only forwarded.
+      # If set, will be a HandleData object from cpp_shape_inference.proto.
+      self._handle_data = None
+
     self._id = uid()
 
   @property
@@ -369,21 +383,45 @@ class Tensor(_TensorLike):
       A `TensorShape` representing the shape of this tensor.
 
     """
-    graph = self._op._graph._c_graph # pylint: disable=protected-access
-    if graph and _USE_C_SHAPES:
-      with errors.raise_exception_on_not_ok_status() as status:
-        num_dims = c_api.TF_GraphGetTensorNumDims(graph, self._as_tf_output(),
-                                                  status)
-      if num_dims == -1:
-        dim_list = None
+    if self._shape_val is None:
+      if _USE_C_SHAPES:
+        self._shape_val = self._c_api_shape()
       else:
-        with errors.raise_exception_on_not_ok_status() as status:
-          dim_list = c_api.TF_GraphGetTensorShape_wrapper(
-              graph, self._as_tf_output(), num_dims, status)
-        dim_list = [None if i == -1 else i for i in dim_list]
-      return tensor_shape.TensorShape(dim_list)
+        assert _USE_C_API
+        # Call set_shape_and_handle_data_for_outputs in topological order on all
+        # ops that are needed to compute self.op's shape. We do this instead of
+        # having set_shape_and_handle_data_for_outputs recursively call
+        # Operation.shape on self.op.inputs to overflowing the call stack.
+        need_shapes = self._get_input_ops_without_shapes(self.op)
+        need_shapes.sort(key=lambda op: op._id)
+        for op in need_shapes:
+          set_shape_and_handle_data_for_outputs(op)
     return self._shape_val
 
+  def _get_input_ops_without_shapes(self, target_op):
+    """Returns ops needing shape inference to compute target_op's shape."""
+    result = []
+    stack = [self._op]
+    visited = set()
+    while stack:
+      op = stack.pop()
+      if op in visited: continue
+      result.append(op)
+      stack.extend(t.op for t in op.inputs if t._shape_val is None)
+      visited.add(op)
+    return result
+
+  def _c_api_shape(self):
+    """Returns the TensorShape of this tensor according to the C API."""
+    c_graph = self._op._graph._c_graph  # pylint: disable=protected-access
+    shape_vector, unknown_shape = c_api.TF_GraphGetTensorShapeHelper(
+        c_graph, self._as_tf_output())
+    if unknown_shape:
+      return tensor_shape.unknown_shape()
+    else:
+      shape_vector = [None if d == -1 else d for d in shape_vector]
+      return tensor_shape.TensorShape(shape_vector)
+
   @property
   def _shape(self):
     logging.warning("Tensor._shape is private, use Tensor.shape "
@@ -467,8 +505,11 @@ class Tensor(_TensorLike):
       ValueError: If `shape` is not compatible with the current shape of
         this tensor.
     """
-    if not _USE_C_SHAPES:  # pylint: disable=protected-access
-      self._shape_val = self._shape_val.merge_with(shape)
+    if _USE_C_SHAPES:  # pylint: disable=protected-access
+      # Reset cached shape.
+      self._shape_val = None
+    else:
+      self._shape_val = self.shape.merge_with(shape)
 
     if not self._op._graph._c_graph: return
 
@@ -487,13 +528,11 @@ class Tensor(_TensorLike):
         else:
           dim_list.append(dim.value)
     try:
-      with errors.raise_exception_on_not_ok_status() as status:
-        c_api.TF_GraphSetTensorShape_wrapper(
-            self._op._graph._c_graph,  # pylint: disable=protected-access
-            self._as_tf_output(),
-            dim_list,
-            unknown_shape,
-            status)
+      c_api.TF_GraphSetTensorShape_wrapper(
+          self._op._graph._c_graph,  # pylint: disable=protected-access
+          self._as_tf_output(),
+          dim_list,
+          unknown_shape)
     except errors.InvalidArgumentError as e:
       # Convert to ValueError for backwards compatibility.
       raise ValueError(str(e))
@@ -582,6 +621,16 @@ class Tensor(_TensorLike):
     # Necessary to support Python's collection membership operators
     return id(self) == id(other)
 
+  def __copy__(self):
+    # Make sure _shape_val is computed before we copy.
+    # TODO(b/77597810): get rid of Tensor copies.
+    if self._shape_val is None:
+      set_shape_and_handle_data_for_outputs(self.op)
+    cls = self.__class__
+    result = cls.__new__(cls)
+    result.__dict__.update(self.__dict__)
+    return result
+
   # NOTE(mrry): This enables the Tensor's overloaded "right" binary
   # operators to run when the left operand is an ndarray, because it
   # accords the Tensor class higher priority than an ndarray, or a
@@ -1512,13 +1561,10 @@ def _create_c_op(graph, node_def, inputs, control_inputs):
     serialized = attr_value.SerializeToString()
     # TODO(skyewm): this creates and deletes a new TF_Status for every attr.
     # It might be worth creating a convenient way to re-use the same status.
-    with errors.raise_exception_on_not_ok_status() as status:
-      c_api.TF_SetAttrValueProto(op_desc,
-                                 compat.as_str(name), serialized, status)
+    c_api.TF_SetAttrValueProto(op_desc, compat.as_str(name), serialized)
 
   try:
-    with errors.raise_exception_on_not_ok_status() as status:
-      c_op = c_api.TF_FinishOperation(op_desc, status)
+    c_op = c_api.TF_FinishOperation(op_desc)
   except errors.InvalidArgumentError as e:
     # Convert to ValueError for backwards compatibility.
     raise ValueError(str(e))
@@ -1663,6 +1709,9 @@ class Operation(object):
       self._control_inputs_val = control_input_ops
       self._node_def_val = copy.deepcopy(node_def)
       self._op_def_val = op_def
+    else:
+      # This will be set by self.inputs.
+      self._inputs_val = None
 
     self._id_value = self._graph._next_id()  # pylint: disable=protected-access
     self._original_op = original_op
@@ -1935,13 +1984,20 @@ class Operation(object):
     if not isinstance(tensor, Tensor):
       raise TypeError("tensor must be a Tensor: %s" % tensor)
     _assert_same_graph(self, tensor)
+
+    # Make sure output shapes are already computed for this op in case we create
+    # a cycle (we cannot compute shapes for cycles). Usually shapes are computed
+    # lazily upon request.
+    if not _USE_C_SHAPES:
+      set_shape_and_handle_data_for_outputs(self)
+
     if self._c_op:
-      with errors.raise_exception_on_not_ok_status() as status:
-        c_api.UpdateEdge(
-            self._graph._c_graph,  # pylint: disable=protected-access
-            tensor._as_tf_output(),  # pylint: disable=protected-access
-            self._tf_input(index),
-            status)
+      # Reset cached inputs.
+      self._inputs_val = None
+      c_api.UpdateEdge(
+          self._graph._c_graph,  # pylint: disable=protected-access
+          tensor._as_tf_output(),  # pylint: disable=protected-access
+          self._tf_input(index))
     else:
       self._inputs_val[index].consumers().remove(self)
       self._inputs_val[index] = tensor
@@ -2052,15 +2108,18 @@ class Operation(object):
   def inputs(self):
     """The list of `Tensor` objects representing the data inputs of this op."""
     if self._c_op:
-      tf_outputs = c_api.GetOperationInputs(self._c_op)
-      # pylint: disable=protected-access
-      retval = [
-          self.graph._get_tensor_by_tf_output(tf_output)
-          for tf_output in tf_outputs
-      ]
-      # pylint: enable=protected-access
-      return Operation._InputList(retval)
-    return Operation._InputList(self._inputs_val)
+      if self._inputs_val is None:
+        tf_outputs = c_api.GetOperationInputs(self._c_op)
+        # pylint: disable=protected-access
+        retval = [
+            self.graph._get_tensor_by_tf_output(tf_output)
+            for tf_output in tf_outputs
+        ]
+        # pylint: enable=protected-access
+        self._inputs_val = Operation._InputList(retval)
+      return self._inputs_val
+    else:
+      return Operation._InputList(self._inputs_val)
 
   @property
   def _inputs(self):
@@ -2113,6 +2172,30 @@ class Operation(object):
     else:
       return self._control_inputs_val
 
+  @property
+  def _control_outputs(self):
+    """The `Operation` objects which have a control dependency on this op.
+
+    Before any of the ops in self._control_outputs can execute tensorflow will
+    ensure self has finished executing.
+
+    Returns:
+      A list of `Operation` objects.
+
+    """
+    if self._c_op:
+      control_c_ops = c_api.TF_OperationGetControlOutputs_wrapper(self._c_op)
+      # pylint: disable=protected-access
+      return [
+          self.graph._get_operation_by_name_unsafe(
+              c_api.TF_OperationName(c_op)) for c_op in control_c_ops
+      ]
+      # pylint: enable=protected-access
+    else:
+      # TODO(apassos) this should be less inefficient.
+      return [o for o in self._graph.get_operations()
+              if self in o.control_inputs]
+
   @property
   def _control_inputs(self):
     logging.warning("Operation._control_inputs is private, use "
@@ -2159,8 +2242,7 @@ class Operation(object):
     # pylint: enable=line-too-long
     if self._c_op:
       with c_api_util.tf_buffer() as buf:
-        with errors.raise_exception_on_not_ok_status() as status:
-          c_api.TF_OperationToNodeDef(self._c_op, buf, status)
+        c_api.TF_OperationToNodeDef(self._c_op, buf)
         data = c_api.TF_GetBuffer(buf)
       node_def = node_def_pb2.NodeDef()
       node_def.ParseFromString(compat.as_bytes(data))
@@ -2218,11 +2300,9 @@ class Operation(object):
       buf = c_api.TF_NewBufferFromString(
           compat.as_bytes(attr_value.SerializeToString()))
       try:
-        with errors.raise_exception_on_not_ok_status() as status:
-          # pylint: disable=protected-access
-          c_api.SetAttr(self._graph._c_graph, self._c_op, attr_name, buf,
-                        status)
-          # pylint: enable=protected-access
+        # pylint: disable=protected-access
+        c_api.SetAttr(self._graph._c_graph, self._c_op, attr_name, buf)
+        # pylint: enable=protected-access
       finally:
         c_api.TF_DeleteBuffer(buf)
     else:
@@ -2244,8 +2324,7 @@ class Operation(object):
     if self._c_op:
       try:
         with c_api_util.tf_buffer() as buf:
-          with errors.raise_exception_on_not_ok_status() as status:
-            c_api.TF_OperationGetAttrValueProto(self._c_op, name, buf, status)
+          c_api.TF_OperationGetAttrValueProto(self._c_op, name, buf)
           data = c_api.TF_GetBuffer(buf)
       except errors.InvalidArgumentError as e:
         # Convert to ValueError for backwards compatibility.
@@ -2454,28 +2533,41 @@ class RegisterShape(object):
     return f
 
 
-def _set_shapes_for_outputs_c_api(op):
-  """set_shapes_for_outputs implementation when C API is enabled."""
-  # The C API computes the shapes when the TF_Operation is created. Fetch the
-  # output shapes from the C object.
+# TODO(b/74620627): remove when _USE_C_SHAPES is removed
+def _set_shape_and_handle_data_for_outputs_c_api(op):
+  """Set shapes and resource handle data using info from the C API."""
+  assert not _USE_C_SHAPES
   for output in op.outputs:
-    with errors.raise_exception_on_not_ok_status() as status:
-      # pylint: disable=protected-access
-      shape_vector, unknown_shape = c_api.TF_GraphGetTensorShapeHelper(
-          op._graph._c_graph, output._as_tf_output(), status)
-      # pylint: enable=protected-access
-    if unknown_shape:
-      output.set_shape(tensor_shape.unknown_shape())
-    elif not shape_vector:
-      output.set_shape(tensor_shape.scalar())
+    output._shape_val = output._c_api_shape()
+    # Set the resource handle data for compatibility with the Python shape
+    # inference code.
+    serialized = c_api.ResourceHandleShapeAndType(
+        op._graph._c_graph, output._as_tf_output())
+    if serialized:
+      output._handle_data = (
+          cpp_shape_inference_pb2.CppShapeInferenceResult.HandleData
+          .FromString(compat.as_bytes(serialized)))
     else:
-      shape_vector = [None if d == -1 else d for d in shape_vector]
-      output.set_shape(tensor_shape.TensorShape(shape_vector))
+      output._handle_data = None
 
 
-# TODO(skyewm): remove this when _USE_C_API flag is removed.
-def _set_shapes_for_outputs(op):
-  """set_shapes_for_outputs implementation when C API is disabled."""
+# TODO(b/74620627): remove when _USE_C_SHAPES is removed
+def set_shape_and_handle_data_for_outputs(op):
+  """Set the shapes and resource handle data for op's outputs.
+
+  When _USE_C_API = True, this is lazily called when a tensor's shape is first
+  requested. Usually this should work automatically, but some edge cases may
+  require manaully calling this first to make sure Tensor._shape_val and
+  Tensor._handle_data are set (e.g. manually overriding _handle_data, copying a
+  Tensor).
+  """
+  if _USE_C_SHAPES: return
+
+  if op.graph._is_function(op.type):
+    for output in op.outputs:
+      output._shape_val = tensor_shape.unknown_shape()
+    return
+
   try:
     shape_func = _shape_registry.lookup(op.type)
   except LookupError:
@@ -2494,8 +2586,10 @@ def _set_shapes_for_outputs(op):
     shapes = shapes_dict["shapes"]
     handle_datas = shapes_dict["handle_data"]
     for output, handle_data in zip(op.outputs, handle_datas):
+      # Don't override any existing handle data that may have been manually set.
       # pylint: disable=protected-access
-      output._handle_data = handle_data
+      if output._handle_data is None:
+        output._handle_data = handle_data
       # pylint: enable=protected-access
 
   if len(op.outputs) != len(shapes):
@@ -2503,15 +2597,8 @@ def _set_shapes_for_outputs(op):
         "Shape function for op %s returned %d shapes but expected %d %s %s" %
         (op, len(shapes), len(op.outputs), shape_func.__name__, str(shapes)))
   for output, s in zip(op.outputs, shapes):
-    output.set_shape(s)
-
-
-def set_shapes_for_outputs(op):
-  """Set the shapes for op's outputs."""
-  if op._c_op and _USE_C_SHAPES:  # pylint: disable=protected-access
-    return _set_shapes_for_outputs_c_api(op)
-  else:
-    return _set_shapes_for_outputs(op)
+    output._shape_val = tensor_shape.unknown_shape()
+    output._shape_val = output._shape_val.merge_with(s)
 
 
 class OpStats(object):
@@ -2976,8 +3063,7 @@ class Graph(object):
     # pylint: enable=line-too-long
     if self._c_graph:
       with c_api_util.tf_buffer() as buf:
-        with errors.raise_exception_on_not_ok_status() as status:
-          c_api.TF_GraphVersions(self._c_graph, buf, status)
+        c_api.TF_GraphVersions(self._c_graph, buf)
         data = c_api.TF_GetBuffer(buf)
       version_def = versions_pb2.VersionDef()
       version_def.ParseFromString(compat.as_bytes(data))
@@ -3080,8 +3166,7 @@ class Graph(object):
     if self._c_graph:
       with self._lock:
         with c_api_util.tf_buffer() as buf:
-          with errors.raise_exception_on_not_ok_status() as status:
-            c_api.TF_GraphToGraphDef(self._c_graph, buf, status)
+          c_api.TF_GraphToGraphDef(self._c_graph, buf)
           data = c_api.TF_GetBuffer(buf)
         graph = graph_pb2.GraphDef()
         graph.ParseFromString(compat.as_bytes(data))
@@ -3190,14 +3275,12 @@ class Graph(object):
       # remove this when all functions are generated using the C API by default
       # as this will be unnecessary.
       if not function._c_func:
-        with errors.raise_exception_on_not_ok_status() as status:
-          serialized = function.definition.SerializeToString()
-          function._c_func = c_api.TF_FunctionImportFunctionDef(
-              serialized, status)
-      with errors.raise_exception_on_not_ok_status() as status:
-        gradient = function._grad_func._c_func if function._grad_func else None
-        c_api.TF_GraphCopyFunction(self._c_graph, function._c_func, gradient,
-                                   status)
+        serialized = function.definition.SerializeToString()
+        c_func = c_api.TF_FunctionImportFunctionDef(serialized)
+        function._c_func = c_api_util.ScopedTFFunction(c_func)
+      gradient = (function._grad_func._c_func.func if function._grad_func
+                  else None)
+      c_api.TF_GraphCopyFunction(self._c_graph, function._c_func.func, gradient)
     else:
       # If there is already a function with the same name, raise an error
       # if bodies are different. Else, do nothing. The C API version above
@@ -3308,18 +3391,14 @@ class Graph(object):
           original_op=self._default_original_op,
           op_def=op_def)
 
-      # TODO(vrv): Instead of eagerly filling in shape property for every op,
-      # only populate the shape when requested.
+      # Note: shapes are lazily computed with the C API enabled.
       #
       # TODO(skyewm): unlike in the original Python implementation, the C API
       # always computes shape information (even for function calls, which the
       # original Python shape inference code doesn't handle). Deprecate the
       # compute_shapes argument.
-      #
-      # TODO(b/74620627): move this back to _create_op_helper once _USE_C_SHAPES
-      # is removed
-      if (ret._c_op and _USE_C_SHAPES) or compute_shapes:  # pylint: disable=protected-access
-        set_shapes_for_outputs(ret)
+      if not _USE_C_API and compute_shapes:
+        set_shape_and_handle_data_for_outputs(ret)
 
       self._create_op_helper(ret, compute_shapes=compute_shapes,
                              compute_device=compute_device)
@@ -3347,8 +3426,12 @@ class Graph(object):
     """
     self._check_not_finalized()
     ret = Operation(c_op, self)
-    assert ret.name not in self._names_in_use
-    self._names_in_use[ret.name] = 1
+    # If a name_scope was created with ret.name but no nodes were created in it,
+    # the name will still appear in _names_in_use even though the name hasn't
+    # been used. This is ok, just leave _names_in_use as-is in this case.
+    # TODO(skyewm): make the C API guarantee no name conflicts.
+    if ret.name not in self._names_in_use:
+      self._names_in_use[ret.name] = 1
     self._create_op_helper(ret, compute_device=compute_device)
     return ret
 
@@ -3457,18 +3540,17 @@ class Graph(object):
         for c_op in c_api_util.new_tf_operations(self)
     ]
 
+    # pylint: disable=protected-access
     for op in new_ops:
       # Operations created by the C API always retrieve shapes from the C API so
       # we preserve the shapes of ops created in import_graph_def (from the
       # "_output_shapes" attr of the imported NodeDef).
-      # TODO(b/74620627): move this back to _create_op_helper once _USE_C_SHAPES
-      # is removed.
-      _set_shapes_for_outputs_c_api(op)
+      if not _USE_C_SHAPES:
+        _set_shape_and_handle_data_for_outputs_c_api(op)
       new_control_inputs = self._control_dependencies_for_inputs(op.inputs)
-      # pylint: disable=protected-access
       op._add_control_inputs(new_control_inputs)
       op._control_flow_post_processing()
-      # pylint: enable=protected-access
+    # pylint: enable=protected-access
 
     return new_ops
 
@@ -3714,11 +3796,9 @@ class Graph(object):
     """Returns the `OpDef` proto for `type`. `type` is a string."""
     if self._c_graph:
       with c_api_util.tf_buffer() as buf:
-        with errors.raise_exception_on_not_ok_status() as status:
-          # pylint: disable=protected-access
-          c_api.TF_GraphGetOpDef(self._c_graph,
-                                 compat.as_bytes(type), buf, status)
-          # pylint: enable=protected-access
+        # pylint: disable=protected-access
+        c_api.TF_GraphGetOpDef(self._c_graph, compat.as_bytes(type), buf)
+        # pylint: enable=protected-access
         data = c_api.TF_GetBuffer(buf)
       op_def = op_def_pb2.OpDef()
       op_def.ParseFromString(compat.as_bytes(data))
@@ -4100,6 +4180,19 @@ class Graph(object):
     """
     return self._name_stack
 
+  @tf_contextlib.contextmanager
+  def _colocate_with_for_gradient(self, op, gradient_uid,
+                                  ignore_existing=False):
+    with self.colocate_with(op, ignore_existing):
+      if gradient_uid is not None and self._control_flow_context is not None:
+        try:
+          self._control_flow_context.EnterGradientColocation(op, gradient_uid)
+          yield
+        finally:
+          self._control_flow_context.ExitGradientColocation(op, gradient_uid)
+      else:
+        yield
+
   @tf_contextlib.contextmanager
   def colocate_with(self, op, ignore_existing=False):
     """Returns a context manager that specifies an op to colocate with.
@@ -4494,6 +4587,22 @@ class Graph(object):
         return tf.matmul(tensor, tensor)
     ```
 
+    Also note that though execution of ops created under this scope will trigger
+    execution of the dependencies, the ops created under this scope might still
+    be pruned from a normal tensorflow graph. For example, in the following
+    snippet of code the dependencies are never executed:
+
+    ```python
+      loss = model.loss()
+      with tf.control_dependencies(dependencies):
+        loss = loss + tf.constant(1)  # note: dependencies ignored in the
+                                      # backward pass
+      return tf.gradients(loss, model.variables)
+    ```
+
+    This is because evaluating the gradient graph does not require evaluating
+    the constant(1) op created in the forward pass.
+
     Args:
       control_inputs: A list of `Operation` or `Tensor` objects which
         must be executed or computed before running the operations
@@ -4863,8 +4972,7 @@ def container(container_name):
   return get_default_graph().container(container_name)
 
 
-@tf_export("colocate_with")
-def colocate_with(op, ignore_existing=False):
+def _colocate_with_for_gradient(op, gradient_uid, ignore_existing=False):
   if context.executing_eagerly():
     if op is not None:
       return device(op.device)
@@ -4878,7 +4986,13 @@ def colocate_with(op, ignore_existing=False):
       else:
         raise ValueError("Encountered an Eager-defined Tensor during graph "
                          "construction, but a function was not being built.")
-    return default_graph.colocate_with(op, ignore_existing)
+    return default_graph._colocate_with_for_gradient(
+        op, gradient_uid=gradient_uid, ignore_existing=ignore_existing)
+
+
+@tf_export("colocate_with")
+def colocate_with(op, ignore_existing=False):
+  return _colocate_with_for_gradient(op, None, ignore_existing=ignore_existing)
 
 
 @tf_export("control_dependencies")
@@ -5131,14 +5245,35 @@ class _DefaultGraphStack(_DefaultStack):  # pylint: disable=protected-access
   @tf_contextlib.contextmanager
   def get_controller(self, default):
     try:
-      context.context().context_switches.push(default.building_function,
-                                              default.as_default)
+      if context.executing_eagerly():
+        # A Graph alone on the context stack would keep init_scope-wrapped
+        # operations graph building when entered (assuming init_scope is called
+        # in a graph building context). Instead, we push a context which first
+        # enables eager execution and then re-enters the Graph.
+        context.context().context_switches.push(
+            default.building_function,
+            functools.partial(
+                _enter_context_and_graph,
+                context.eager_mode,
+                default.as_default))
+      else:
+        # This Graph is being used from a graph building context. A lack of
+        # context switch implies that the context is graph building.
+        context.context().context_switches.push(default.building_function,
+                                                default.as_default)
       with super(_DefaultGraphStack, self).get_controller(default) as g:
         yield g
     finally:
       context.context().context_switches.pop()
 
 
+@tf_contextlib.contextmanager
+def _enter_context_and_graph(context_fn, graph_fn):
+  """Combines two context managers."""
+  with context_fn(), graph_fn():
+    yield
+
+
 _default_graph_stack = _DefaultGraphStack()
 
 
@@ -5332,6 +5467,10 @@ def enable_eager_execution(config=None, device_policy=None,
     raise ValueError(
         "tf.enable_eager_execution must be called at program startup.")
 
+  # Monkey patch to get rid of an unnecessary conditional since the context is
+  # now initialized.
+  context.context = context.context_safe
+
 
 def eager_run(main=None, argv=None):
   """Runs the program with an optional main function and argv list.
diff --git a/tensorflow/python/framework/ops_test.py b/tensorflow/python/framework/ops_test.py
index aa51391871f4c12d34b86311cc5b8ea9aabd5434..c9c1a3d66be1051859b3dc4eef67803881efcd55 100644
--- a/tensorflow/python/framework/ops_test.py
+++ b/tensorflow/python/framework/ops_test.py
@@ -473,6 +473,7 @@ class OperationTest(test_util.TensorFlowTestCase):
     self.assertEqual(z.control_inputs, [x, x])
     z._add_control_inputs([x, y, y])  # pylint: disable=protected-access
     self.assertEqual(z.control_inputs, [x, x, x, y, y])
+    self.assertEqual(x._control_outputs, [z])
 
   def testAddControlInputC(self):
     # The C API dedups redundant control edges, pure Python does not
@@ -487,6 +488,7 @@ class OperationTest(test_util.TensorFlowTestCase):
     self.assertEqual(z.control_inputs, [x])
     z._add_control_inputs([x, y, y])  # pylint: disable=protected-access
     self.assertEqual(z.control_inputs, [x, y])
+    self.assertEqual(x._control_outputs, [z])
 
   def testRemoveAllControlInputs(self):
     a = constant_op.constant(1)
@@ -2303,6 +2305,13 @@ class InitScopeTest(test_util.TensorFlowTestCase):
           self.assertEqual(ops.get_name_scope(), "inner")
       self.assertEqual(ops.get_name_scope(), "")
 
+  def testEagerGraphContextsExecuteEagerly(self):
+    with context.eager_mode():
+      with ops.Graph().as_default():
+        with context.graph_mode():
+          with ops.init_scope():
+            self.assertTrue(context.executing_eagerly())
+
   def testPreservesNameScopeInEagerExecution(self):
     with context.eager_mode():
       def foo():
diff --git a/tensorflow/python/framework/python_op_gen.cc b/tensorflow/python/framework/python_op_gen.cc
index 9850f0becc69ff1f53b70f0ad2296aead8b5152c..e5e3b821998718e7b87a95439a442df98ad7c997 100644
--- a/tensorflow/python/framework/python_op_gen.cc
+++ b/tensorflow/python/framework/python_op_gen.cc
@@ -448,7 +448,7 @@ string AttrValueToPython(const string& type, const AttrValue& value,
     return TensorToPython(value.tensor());
   } else if (type == "func") {
     return StringToPython(value.func().name());
-  } else if (StringPiece(type).starts_with("list(")) {
+  } else if (str_util::StartsWith(type, "list(")) {
     return strings::StrCat("[", AttrListToPython(value, dtype_module), "]");
   } else {
     return "?";
diff --git a/tensorflow/python/framework/python_op_gen_main.cc b/tensorflow/python/framework/python_op_gen_main.cc
index bc5ca195da50499c6fbab822a9a093be3f0277e0..ca6ed42beec4a3d5ff70d5a605ab006265d1cce9 100644
--- a/tensorflow/python/framework/python_op_gen_main.cc
+++ b/tensorflow/python/framework/python_op_gen_main.cc
@@ -27,6 +27,7 @@ limitations under the License.
 #include "tensorflow/core/lib/io/inputbuffer.h"
 #include "tensorflow/core/lib/io/path.h"
 #include "tensorflow/core/lib/strings/scanner.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/platform/logging.h"
@@ -95,7 +96,8 @@ string InferSourceFileName(const char* argv_zero) {
   // operators defined in <op type>_ops.cc
   const char* kExecPrefix = "gen_";
   const char* kExecSuffix = "_py_wrappers_cc";
-  if (command_str.Consume(kExecPrefix) && command_str.ends_with(kExecSuffix)) {
+  if (str_util::ConsumePrefix(&command_str, kExecPrefix) &&
+      str_util::EndsWith(command_str, kExecSuffix)) {
     command_str.remove_suffix(strlen(kExecSuffix));
     return strings::StrCat(command_str, ".cc");
   } else {
diff --git a/tensorflow/python/framework/smart_cond.py b/tensorflow/python/framework/smart_cond.py
index c7ff23e4ff809ed7bc57259fa3ec9feb921b5a71..48a834392b47b4cdcc82381153852584052a5aad 100644
--- a/tensorflow/python/framework/smart_cond.py
+++ b/tensorflow/python/framework/smart_cond.py
@@ -19,7 +19,6 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python import pywrap_tensorflow as c_api
-from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import control_flow_ops
@@ -83,9 +82,8 @@ def smart_constant_value(pred):
     # wanted to limit the change hidden behind _USE_C_API).
     # pylint: disable=protected-access
     if pred_value is None and ops._USE_C_API:
-      with errors.raise_exception_on_not_ok_status() as status:
-        pred_value = c_api.TF_TryEvaluateConstant_wrapper(
-            pred.graph._c_graph, pred._as_tf_output(), status)
+      pred_value = c_api.TF_TryEvaluateConstant_wrapper(pred.graph._c_graph,
+                                                        pred._as_tf_output())
     # pylint: enable=protected-access
 
   else:
diff --git a/tensorflow/python/framework/tensor_shape.py b/tensorflow/python/framework/tensor_shape.py
index af2a5b1a7ef9a70c0baf5d02257951803a7a76fa..26069d9d90e4a75cfe3988628f1407d6f327385b 100644
--- a/tensorflow/python/framework/tensor_shape.py
+++ b/tensorflow/python/framework/tensor_shape.py
@@ -18,6 +18,7 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.core.framework import tensor_shape_pb2
+from tensorflow.python.framework import dtypes
 from tensorflow.python.util import compat
 from tensorflow.python.util.tf_export import tf_export
 
@@ -30,6 +31,8 @@ class Dimension(object):
     """Creates a new Dimension with the given value."""
     if value is None:
       self._value = None
+    elif isinstance(value, dtypes.DType):
+      raise TypeError("Cannot convert %s to Dimension" % value)
     else:
       self._value = int(value)
       if (not isinstance(value, compat.bytes_or_text_types) and
diff --git a/tensorflow/python/framework/tensor_shape_test.py b/tensorflow/python/framework/tensor_shape_test.py
index 4e8ce4d889c4ef0c6e56806587a64e8f9be7e10a..4f239228332946d9a863be408f5967c282019852 100644
--- a/tensorflow/python/framework/tensor_shape_test.py
+++ b/tensorflow/python/framework/tensor_shape_test.py
@@ -19,6 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.core.framework import tensor_shape_pb2
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import test_util
 from tensorflow.python.platform import googletest
@@ -184,6 +185,10 @@ class DimensionTest(test_util.TensorFlowTestCase):
     self.assertEqual(str(tensor_shape.Dimension(7)), "7")
     self.assertEqual(str(tensor_shape.Dimension(None)), "?")
 
+  def testUnsupportedType(self):
+    with self.assertRaises(TypeError):
+      tensor_shape.Dimension(dtypes.string)
+      
   def testMod(self):
     four = tensor_shape.Dimension(4)
     nine = tensor_shape.Dimension(9)
diff --git a/tensorflow/python/framework/tensor_util.py b/tensorflow/python/framework/tensor_util.py
index 984bcecdfe05efd79bdf218197c410b14abe3516..8cf24206edab8be807eca1d067662a57585e2bda 100644
--- a/tensorflow/python/framework/tensor_util.py
+++ b/tensorflow/python/framework/tensor_util.py
@@ -22,7 +22,6 @@ import six
 
 from tensorflow.core.framework import tensor_pb2
 from tensorflow.core.framework import tensor_shape_pb2
-from tensorflow.python.eager import context
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.util import compat
@@ -823,17 +822,32 @@ def constant_value_as_shape(tensor):  # pylint: disable=invalid-name
   all-or-nothing.
 
   Args:
-    tensor: The rank-1 Tensor to be evaluated.
+    tensor: The rank-0 or rank-1 Tensor to be evaluated.
 
   Returns:
     A `TensorShape` based on the constant value of the given `tensor`.
+
+  Raises:
+    ValueError: If the shape is rank-0 and is not statically known to be -1.
   """
-  if context.executing_eagerly():
+  if isinstance(tensor, ops.EagerTensor):
     return tensor_shape.as_shape(
         [dim if dim != -1 else None for dim in tensor.numpy()])
 
+  if tensor.get_shape().ndims == 0:
+    value = constant_value(tensor)
+    if value is None:
+      raise ValueError(
+          "Received a scalar with unknown value as shape; require a statically "
+          "known scalar with value '-1' to describe an unknown shape.")
+    if value != -1:
+      raise ValueError(
+          "Received a scalar value '%s' as shape; require a statically known "
+          "scalar with value '-1' to describe an unknown shape." % value)
+    return tensor_shape.unknown_shape()
+
   shape = tensor.get_shape().with_rank(1)
-  if tensor.get_shape() == [0]:
+  if shape == [0]:
     return tensor_shape.scalar()
   elif tensor.op.type == "Shape":
     return tensor.op.inputs[0].get_shape()
diff --git a/tensorflow/python/framework/test_util.py b/tensorflow/python/framework/test_util.py
index 43106b6e598d464b15d0fe00265ccec906fff9a7..eea27d76c609d7dc8addd761dc54c86f6b169b7b 100644
--- a/tensorflow/python/framework/test_util.py
+++ b/tensorflow/python/framework/test_util.py
@@ -487,7 +487,13 @@ def assert_no_new_pyobjects_executing_eagerly(f):
       gc.collect()
       # There should be no new Python objects hanging around.
       new_count = len(gc.get_objects())
-      self.assertEqual(previous_count, new_count)
+      # In some cases (specifacally on MacOS), new_count is somehow
+      # smaller than previous_count.
+      # Using plain assert because not all classes using this decorator
+      # have assertLessEqual
+      assert new_count <= previous_count, (
+          "new_count(%d) is not less than or equal to previous_count(%d)" % (
+              new_count, previous_count))
       gc.enable()
 
   return decorator
@@ -609,45 +615,68 @@ def assert_no_garbage_created(f):
 
 
 def run_in_graph_and_eager_modes(__unused__=None,
-                                 graph=None,
                                  config=None,
-                                 use_gpu=False,
-                                 force_gpu=False,
+                                 use_gpu=True,
                                  reset_test=True,
                                  assert_no_eager_garbage=False):
-  """Runs the test in both graph and eager modes.
+  """Execute the decorated test with and without enabling eager execution.
+
+  This function returns a decorator intended to be applied to test methods in
+  a @{tf.test.TestCase} class. Doing so will cause the contents of the test
+  method to be executed twice - once normally, and once with eager execution
+  enabled. This allows unittests to confirm the equivalence between eager
+  and graph execution (see @{tf.enable_eager_execution}).
+
+  For example, consider the following unittest:
+
+  ```python
+  class MyTests(tf.test.TestCase):
+
+    @run_in_graph_and_eager_modes()
+    def test_foo(self):
+      x = tf.constant([1, 2])
+      y = tf.constant([3, 4])
+      z = tf.add(x, y)
+      self.assertAllEqual([4, 6], self.evaluate(z))
+
+  if __name__ == "__main__":
+    tf.test.main()
+  ```
+
+  This test validates that `tf.add()` has the same behavior when computed with
+  eager execution enabled as it does when constructing a TensorFlow graph and
+  executing the `z` tensor in a session.
+
 
   Args:
     __unused__: Prevents sliently skipping tests.
-    graph: Optional graph to use during the returned session.
     config: An optional config_pb2.ConfigProto to use to configure the
-      session.
-    use_gpu: If True, attempt to run as many ops as possible on GPU.
-    force_gpu: If True, pin all ops to `/device:GPU:0`.
-    reset_test: If True, tearDown and SetUp the test case again.
+      session when executing graphs.
+    use_gpu: If True, attempt to run as many operations as possible on GPU.
+    reset_test: If True, tearDown and SetUp the test case between the two
+      executions of the test (once with and once without eager execution).
     assert_no_eager_garbage: If True, sets DEBUG_SAVEALL on the garbage
       collector and asserts that no extra garbage has been created when running
-      the test in eager mode. This will fail if there are reference cycles
-      (e.g. a = []; a.append(a)). Off by default because some tests may create
-      garbage for legitimate reasons (e.g. they define a class which inherits
-      from `object`), and because DEBUG_SAVEALL is sticky in some Python
-      interpreters (meaning that tests which rely on objects being collected
-      elsewhere in the unit test file will not work). Additionally, checks that
-      nothing still has a reference to Tensors that the test allocated.
+      the test with eager execution enabled. This will fail if there are
+      reference cycles (e.g. a = []; a.append(a)). Off by default because some
+      tests may create garbage for legitimate reasons (e.g. they define a class
+      which inherits from `object`), and because DEBUG_SAVEALL is sticky in some
+      Python interpreters (meaning that tests which rely on objects being
+      collected elsewhere in the unit test file will not work). Additionally,
+      checks that nothing still has a reference to Tensors that the test
+      allocated.
   Returns:
-    Returns a decorator that will run the decorated test function
-        using both a graph and using eager execution.
+    Returns a decorator that will run the decorated test method twice:
+    once by constructing and executing a graph in a session and once with
+    eager execution enabled.
   """
 
   assert not __unused__, "Add () after run_in_graph_and_eager_modes."
 
   def decorator(f):
-    """Test method decorator."""
-
     def decorated(self, **kwargs):
-      """Decorated the test method."""
       with context.graph_mode():
-        with self.test_session(graph, config, use_gpu, force_gpu):
+        with self.test_session(use_gpu=use_gpu):
           f(self, **kwargs)
 
       if reset_test:
@@ -657,27 +686,20 @@ def run_in_graph_and_eager_modes(__unused__=None,
         self._tempdir = None
         self.setUp()
 
-      def run_eager_mode(self, **kwargs):
-        if force_gpu:
-          gpu_name = gpu_device_name()
-          if not gpu_name:
-            gpu_name = "/device:GPU:0"
-          with context.device(gpu_name):
-            f(self)
-        elif use_gpu:
-          # TODO(xpan): Support softplacement and gpu by default when available.
-          f(self, **kwargs)
-        else:
-          with context.device("/device:CPU:0"):
+      def run_eagerly(self, **kwargs):
+        if not use_gpu:
+          with ops.device("/cpu:0"):
             f(self, **kwargs)
+        else:
+          f(self, **kwargs)
 
       if assert_no_eager_garbage:
-        run_eager_mode = assert_no_new_tensors(
-            assert_no_garbage_created(run_eager_mode))
+        run_eagerly = assert_no_new_tensors(
+            assert_no_garbage_created(run_eagerly))
 
       with context.eager_mode():
         with ops.Graph().as_default():
-          run_eager_mode(self, **kwargs)
+          run_eagerly(self, **kwargs)
 
     return decorated
 
@@ -968,8 +990,6 @@ class TensorFlowTestCase(googletest.TestCase):
       config.graph_options.optimizer_options.opt_level = -1
       config.graph_options.rewrite_options.constant_folding = (
           rewriter_config_pb2.RewriterConfig.OFF)
-      config.graph_options.rewrite_options.arithmetic_optimization = (
-          rewriter_config_pb2.RewriterConfig.OFF)
       return config
 
     if graph is None:
diff --git a/tensorflow/python/framework/versions.py b/tensorflow/python/framework/versions.py
index 06955b885852a641bc814f88c99838effe03bfd4..472ccbcac7a447926989cfbef27ec1ea9d71e91c 100644
--- a/tensorflow/python/framework/versions.py
+++ b/tensorflow/python/framework/versions.py
@@ -29,15 +29,19 @@ __cxx11_abi_flag__ = pywrap_tensorflow.__cxx11_abi_flag__
 __monolithic_build__ = pywrap_tensorflow.__monolithic_build__
 
 VERSION = __version__
-tf_export("VERSION").export_constant(__name__, "VERSION")
+tf_export("VERSION", "__version__").export_constant(__name__, "VERSION")
 GIT_VERSION = __git_version__
-tf_export("GIT_VERSION").export_constant(__name__, "GIT_VERSION")
+tf_export("GIT_VERSION", "__git_version__").export_constant(
+    __name__, "GIT_VERSION")
 COMPILER_VERSION = __compiler_version__
-tf_export("COMPILER_VERSION").export_constant(__name__, "COMPILER_VERSION")
+tf_export("COMPILER_VERSION", "__compiler_version__").export_constant(
+    __name__, "COMPILER_VERSION")
 CXX11_ABI_FLAG = __cxx11_abi_flag__
-tf_export("CXX11_ABI_FLAG").export_constant(__name__, "CXX11_ABI_FLAG")
+tf_export("CXX11_ABI_FLAG", "__cxx11_abi_flag__").export_constant(
+    __name__, "CXX11_ABI_FLAG")
 MONOLITHIC_BUILD = __monolithic_build__
-tf_export("MONOLITHIC_BUILD").export_constant(__name__, "MONOLITHIC_BUILD")
+tf_export("MONOLITHIC_BUILD", "__monolithic_build__").export_constant(
+    __name__, "MONOLITHIC_BUILD")
 
 GRAPH_DEF_VERSION = pywrap_tensorflow.GRAPH_DEF_VERSION
 tf_export("GRAPH_DEF_VERSION").export_constant(__name__, "GRAPH_DEF_VERSION")
diff --git a/tensorflow/python/grappler/cluster.i b/tensorflow/python/grappler/cluster.i
index 067c8213d4741936e4c28aaedf4f30639b8cdc41..6816e204075bc37c6958efa5b028417078c36b2b 100644
--- a/tensorflow/python/grappler/cluster.i
+++ b/tensorflow/python/grappler/cluster.i
@@ -320,7 +320,8 @@ static PyObject* TF_MeasureCosts(
   tensorflow::OpPerformanceList op_performance_data;
   tensorflow::StepStats step_stats;
 
-  tensorflow::grappler::MeasuringCostEstimator cost_measure(cluster.get(), 10, 0);
+  const int num_measurements = cluster->type() == "virtual" ? 1 : 10;
+  tensorflow::grappler::MeasuringCostEstimator cost_measure(cluster.get(), num_measurements, 0);
 
   tensorflow::grappler::Costs costs;
   tensorflow::Status status = _GetOpPerformanceDataAndRunTime(
diff --git a/tensorflow/python/grappler/cluster_test.py b/tensorflow/python/grappler/cluster_test.py
index a3c4c2bbeba7c4ee5d00268c0e475e11a31fa7eb..26c6f22d34b27c8b866c0b23a36fdef5164348a4 100644
--- a/tensorflow/python/grappler/cluster_test.py
+++ b/tensorflow/python/grappler/cluster_test.py
@@ -87,9 +87,10 @@ class ClusterTest(test.TestCase):
 
   def testVirtualCluster(self):
     with ops.Graph().as_default() as g:
-      a = random_ops.random_uniform(shape=())
-      b = random_ops.random_uniform(shape=())
-      c = a + b
+      with ops.device('/device:GPU:0'):
+        a = random_ops.random_uniform(shape=[1024, 1024])
+        b = random_ops.random_uniform(shape=[1024, 1024])
+        c = a + b
       train_op = ops.get_collection_ref(ops.GraphKeys.TRAIN_OP)
       train_op.append(c)
       mg = meta_graph.create_meta_graph_def(graph=g)
@@ -102,10 +103,13 @@ class ClusterTest(test.TestCase):
               'architecture': '7'
           })
       named_device = device_properties_pb2.NamedDevice(
-          properties=device_properties, name='/GPU:0')
-      grappler_cluster = cluster.Cluster(devices=[named_device])
+          properties=device_properties, name='/device:GPU:0')
+      grappler_cluster = cluster.Cluster(
+          disable_detailed_stats=False,
+          disable_timeline=False,
+          devices=[named_device])
       op_perfs, run_time, _ = grappler_cluster.MeasureCosts(grappler_item)
-      self.assertGreater(run_time, 0)
+      self.assertEqual(run_time, 0.000545)
       self.assertEqual(len(op_perfs), 15)
 
       estimated_perf = grappler_cluster.EstimatePerformance(named_device)
diff --git a/tensorflow/python/grappler/constant_folding_test.py b/tensorflow/python/grappler/constant_folding_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..ab1d0ed25b9130fabcffbb8da2265c046206da46
--- /dev/null
+++ b/tensorflow/python/grappler/constant_folding_test.py
@@ -0,0 +1,69 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for Grappler Constant Folding."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.client import session
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import functional_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import test
+
+
+class ConstantFoldingTest(test.TestCase):
+
+  # See b/76008022.
+  def testScanInsideWhile(self):
+
+    def loop_cond(idx_step, *unused_args):
+      return idx_step < 1
+
+    def loop_body(idx_step, y):
+      x = array_ops.zeros([10, 20, 30], dtype=dtypes.float32)
+      x = functional_ops.scan(
+          math_ops.add,
+          x,
+          initializer=array_ops.zeros([20, 30], dtype=dtypes.float32),
+          back_prop=False,
+          parallel_iterations=1)
+
+      with ops.device('/cpu:0'):
+        y = array_ops.identity(x)
+
+        return idx_step + 1, y
+
+    if test.is_gpu_available(cuda_only=True):
+      init_y = array_ops.zeros([10, 20, 30], dtype=dtypes.float32)
+      _, y = control_flow_ops.while_loop(
+          loop_cond,
+          loop_body,
+          loop_vars=[0, init_y],
+          back_prop=False,
+          parallel_iterations=1)
+      with session.Session() as sess:
+        y_v = sess.run(y)
+        self.assertAllEqual(np.zeros([10, 20, 30]), y_v)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/grappler/item.py b/tensorflow/python/grappler/item.py
index 4a083849bd39f606877069419396d8c42ef077eb..1748efdd130268f2668cd8cb1b5c2da18bafd549 100644
--- a/tensorflow/python/grappler/item.py
+++ b/tensorflow/python/grappler/item.py
@@ -51,9 +51,7 @@ class Item(object):
     self._BuildTFItem()
 
   def IdentifyImportantOps(self, sort_topologically=False):
-    with errors.raise_exception_on_not_ok_status() as status:
-      return tf_item.TF_IdentifyImportantOps(self.tf_item, sort_topologically,
-                                             status)
+    return tf_item.TF_IdentifyImportantOps(self.tf_item, sort_topologically)
 
   def GetOpProperties(self):
     ret_from_swig = tf_item.TF_GetOpProperties(self.tf_item)
diff --git a/tensorflow/python/grappler/tf_optimizer_test.py b/tensorflow/python/grappler/tf_optimizer_test.py
index 3ee4d7807ea5677a742514eb56267b94c6b92bba..1c0f072dd32d38f048cfa48d38b45264951d095e 100644
--- a/tensorflow/python/grappler/tf_optimizer_test.py
+++ b/tensorflow/python/grappler/tf_optimizer_test.py
@@ -17,12 +17,16 @@
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
-
 from tensorflow.core.protobuf import rewriter_config_pb2
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import meta_graph
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.grappler import item as gitem
 from tensorflow.python.grappler import tf_optimizer
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
@@ -74,6 +78,47 @@ class PyWrapOptimizeGraphTest(test.TestCase):
     self.assertEqual(a2.op.name, optimized_graph.node[3].name)
     self.assertEqual('Variable/Assign', optimized_graph.node[4].name)
 
+  def testLoops(self):
+    g = ops.Graph()
+    with g.as_default():
+
+      def _Cond(_, counter):
+        return counter < end
+
+      def _Body(buf, counter):
+        buf = array_ops.concat([buf, [counter]], 0)
+        counter += 1
+        return [buf, counter]
+
+      start = array_ops.placeholder(shape=[], dtype=dtypes.int32)
+      end = array_ops.placeholder(shape=[], dtype=dtypes.int32)
+      init_buf = array_ops.zeros(shape=[0], dtype=dtypes.int32)
+      loop_vars = [init_buf, start]
+      shape_inv = [
+          tensor_shape.TensorShape([None]),
+          tensor_shape.TensorShape([])
+      ]
+      buf, _ = control_flow_ops.while_loop(_Cond, _Body, loop_vars, shape_inv)
+
+      f = -array_ops.ones_like(buf, optimize=False)
+      buf_shape = array_ops.shape(buf)
+      f_shape = array_ops.shape(f)
+      ops.add_to_collection('train_op', buf_shape)
+      ops.add_to_collection('train_op', f_shape)
+
+    # Optimize the graph.
+    mg = meta_graph.create_meta_graph_def(graph=g)
+    rewriter_config = rewriter_config_pb2.RewriterConfig()
+    optimized_graph = tf_optimizer.OptimizeGraph(rewriter_config, mg)
+    mg.graph_def.CopyFrom(optimized_graph)
+
+    # Check that the nodes referenced in various collections have been preserved
+    item = gitem.Item(mg)
+    props = item.GetOpProperties()
+    buf_prop = props[buf.op.name]
+    f_prop = props[f.op.name]
+    self.assertEqual(buf_prop, f_prop)
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/BUILD b/tensorflow/python/keras/BUILD
index 16033e9b8f3b6970f92c40a5b61db815a97cf6aa..da5bc3e6f11dbdce073abba13820c460c68e6bd6 100755
--- a/tensorflow/python/keras/BUILD
+++ b/tensorflow/python/keras/BUILD
@@ -20,7 +20,6 @@ py_library(
     srcs = [
         "__init__.py",
         "_impl/keras/__init__.py",
-        "_impl/keras/activations.py",
         "_impl/keras/applications/__init__.py",
         "_impl/keras/applications/densenet.py",
         "_impl/keras/applications/imagenet_utils.py",
@@ -32,9 +31,6 @@ py_library(
         "_impl/keras/applications/vgg16.py",
         "_impl/keras/applications/vgg19.py",
         "_impl/keras/applications/xception.py",
-        "_impl/keras/backend.py",
-        "_impl/keras/callbacks.py",
-        "_impl/keras/constraints.py",
         "_impl/keras/datasets/__init__.py",
         "_impl/keras/datasets/boston_housing.py",
         "_impl/keras/datasets/cifar.py",
@@ -44,49 +40,13 @@ py_library(
         "_impl/keras/datasets/imdb.py",
         "_impl/keras/datasets/mnist.py",
         "_impl/keras/datasets/reuters.py",
-        "_impl/keras/engine/__init__.py",
-        "_impl/keras/engine/base_layer.py",
-        "_impl/keras/engine/input_layer.py",
-        "_impl/keras/engine/network.py",
-        "_impl/keras/engine/saving.py",
-        "_impl/keras/engine/sequential.py",
-        "_impl/keras/engine/training.py",
-        "_impl/keras/engine/training_arrays.py",
-        "_impl/keras/engine/training_eager.py",
-        "_impl/keras/engine/training_generator.py",
-        "_impl/keras/engine/training_utils.py",
         "_impl/keras/estimator.py",
-        "_impl/keras/initializers.py",
-        "_impl/keras/layers/__init__.py",
-        "_impl/keras/layers/advanced_activations.py",
-        "_impl/keras/layers/convolutional.py",
-        "_impl/keras/layers/convolutional_recurrent.py",
-        "_impl/keras/layers/core.py",
-        "_impl/keras/layers/embeddings.py",
-        "_impl/keras/layers/local.py",
-        "_impl/keras/layers/merge.py",
-        "_impl/keras/layers/noise.py",
-        "_impl/keras/layers/normalization.py",
-        "_impl/keras/layers/pooling.py",
-        "_impl/keras/layers/recurrent.py",
-        "_impl/keras/layers/serialization.py",
-        "_impl/keras/layers/wrappers.py",
-        "_impl/keras/losses.py",
-        "_impl/keras/metrics.py",
-        "_impl/keras/models.py",
-        "_impl/keras/optimizers.py",
         "_impl/keras/preprocessing/__init__.py",
         "_impl/keras/preprocessing/image.py",
         "_impl/keras/preprocessing/sequence.py",
         "_impl/keras/preprocessing/text.py",
-        "_impl/keras/regularizers.py",
         "_impl/keras/testing_utils.py",
         "_impl/keras/utils/__init__.py",
-        "_impl/keras/utils/conv_utils.py",
-        "_impl/keras/utils/data_utils.py",
-        "_impl/keras/utils/generic_utils.py",
-        "_impl/keras/utils/io_utils.py",
-        "_impl/keras/utils/layer_utils.py",
         "_impl/keras/utils/multi_gpu_utils.py",
         "_impl/keras/utils/np_utils.py",
         "_impl/keras/utils/vis_utils.py",
@@ -136,7 +96,21 @@ py_library(
         ":empty_condition": [],
         "//conditions:default": [],
     }) + [
-        "@six_archive//:six",
+        ":backend",
+        ":engine",
+        ":layers",
+        "//tensorflow/python/estimator",
+        "//tensorflow/python/estimator:model_fn",
+        "//tensorflow/python/saved_model",
+        "//tensorflow/python:training",
+    ],
+)
+
+py_library(
+    name = "backend",
+    srcs = ["_impl/keras/backend.py"],
+    srcs_version = "PY2AND3",
+    deps = [
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:check_ops",
@@ -152,8 +126,6 @@ py_library(
         "//tensorflow/python:gradients",
         "//tensorflow/python:image_ops",
         "//tensorflow/python:init_ops",
-        "//tensorflow/python:layers",
-        "//tensorflow/python:layers_base",
         "//tensorflow/python:logging_ops",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:metrics",
@@ -168,13 +140,83 @@ py_library(
         "//tensorflow/python:tensor_array_grad",
         "//tensorflow/python:tensor_array_ops",
         "//tensorflow/python:tensor_shape",
-        "//tensorflow/python:training",
         "//tensorflow/python:util",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python:variables",
-        "//tensorflow/python/estimator",
-        "//tensorflow/python/estimator:model_fn",
-        "//tensorflow/python/saved_model",
+    ],
+)
+
+py_library(
+    name = "engine",
+    srcs = [
+        "_impl/keras/activations.py",
+        "_impl/keras/callbacks.py",
+        "_impl/keras/constraints.py",
+        "_impl/keras/engine/__init__.py",
+        "_impl/keras/engine/base_layer.py",
+        "_impl/keras/engine/input_layer.py",
+        "_impl/keras/engine/network.py",
+        "_impl/keras/engine/saving.py",
+        "_impl/keras/engine/sequential.py",
+        "_impl/keras/engine/training.py",
+        "_impl/keras/engine/training_arrays.py",
+        "_impl/keras/engine/training_eager.py",
+        "_impl/keras/engine/training_generator.py",
+        "_impl/keras/engine/training_utils.py",
+        "_impl/keras/initializers.py",
+        "_impl/keras/losses.py",
+        "_impl/keras/metrics.py",
+        "_impl/keras/models.py",
+        "_impl/keras/optimizers.py",
+        "_impl/keras/regularizers.py",
+        "_impl/keras/utils/data_utils.py",
+        "_impl/keras/utils/io_utils.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":backend",
+        "@six_archive//:six",
+    ],
+)
+
+py_library(
+    name = "layers",
+    srcs = [
+        "_impl/keras/layers/__init__.py",
+        "_impl/keras/layers/advanced_activations.py",
+        "_impl/keras/layers/convolutional.py",
+        "_impl/keras/layers/convolutional_recurrent.py",
+        "_impl/keras/layers/core.py",
+        "_impl/keras/layers/embeddings.py",
+        "_impl/keras/layers/local.py",
+        "_impl/keras/layers/merge.py",
+        "_impl/keras/layers/noise.py",
+        "_impl/keras/layers/normalization.py",
+        "_impl/keras/layers/pooling.py",
+        "_impl/keras/layers/recurrent.py",
+        "_impl/keras/layers/serialization.py",
+        "_impl/keras/layers/wrappers.py",
+        "_impl/keras/utils/conv_utils.py",
+        "_impl/keras/utils/generic_utils.py",
+        "_impl/keras/utils/layer_utils.py",
+        "_impl/keras/utils/tf_utils.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":engine",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:logging_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:nn",
+        "//tensorflow/python:random_ops",
+        "//tensorflow/python:sparse_ops",
+        "//tensorflow/python:sparse_tensor",
+        "//tensorflow/python:standard_ops",
+        "//tensorflow/python:tensor_array_ops",
+        "//tensorflow/python:tensor_shape",
+        "//tensorflow/python:util",
     ],
 )
 
@@ -605,10 +647,11 @@ py_test(
 
 py_test(
     name = "data_utils_test",
-    size = "medium",
+    size = "large",
     srcs = ["_impl/keras/utils/data_utils_test.py"],
     srcs_version = "PY2AND3",
     tags = [
+        "no_oss",
         "no_windows",
         "noasan",  # times out
         "notsan",
@@ -637,7 +680,10 @@ py_test(
     size = "small",
     srcs = ["_impl/keras/utils/io_utils_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["notsan"],
+    tags = [
+        "no_windows",  # TODO: needs investigation on Windows
+        "notsan",
+    ],
     deps = [
         ":keras",
         "//tensorflow/python:client_testlib",
@@ -868,15 +914,3 @@ py_library(
         "//third_party/py/numpy",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/python/keras/_impl/keras/activations.py b/tensorflow/python/keras/_impl/keras/activations.py
index 236e17653e1b762e1e6962f453b714d1bf7bcbf7..8def7ec49375c7ce23e8f2a24a4c3615d05ca9bb 100644
--- a/tensorflow/python/keras/_impl/keras/activations.py
+++ b/tensorflow/python/keras/_impl/keras/activations.py
@@ -22,8 +22,8 @@ import six
 
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras.utils.generic_utils import deserialize_keras_object
-from tensorflow.python.layers.base import Layer
-from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -43,10 +43,10 @@ def softmax(x, axis=-1):
   """
   ndim = K.ndim(x)
   if ndim == 2:
-    return K.softmax(x)
+    return nn.softmax(x)
   elif ndim > 2:
-    e = K.exp(x - K.max(x, axis=axis, keepdims=True))
-    s = K.sum(e, axis=axis, keepdims=True)
+    e = math_ops.exp(x - math_ops.reduce_max(x, axis=axis, keepdims=True))
+    s = math_ops.reduce_sum(e, axis=axis, keepdims=True)
     return e / s
   else:
     raise ValueError('Cannot apply softmax to a tensor that is 1D')
@@ -79,12 +79,12 @@ def selu(x):
 
 @tf_export('keras.activations.softplus')
 def softplus(x):
-  return K.softplus(x)
+  return nn.softplus(x)
 
 
 @tf_export('keras.activations.softsign')
 def softsign(x):
-  return K.softsign(x)
+  return nn.softsign(x)
 
 
 @tf_export('keras.activations.relu')
@@ -94,12 +94,12 @@ def relu(x, alpha=0., max_value=None):
 
 @tf_export('keras.activations.tanh')
 def tanh(x):
-  return K.tanh(x)
+  return nn.tanh(x)
 
 
 @tf_export('keras.activations.sigmoid')
 def sigmoid(x):
-  return K.sigmoid(x)
+  return nn.sigmoid(x)
 
 
 @tf_export('keras.activations.hard_sigmoid')
@@ -134,12 +134,6 @@ def get(identifier):
     identifier = str(identifier)
     return deserialize(identifier)
   elif callable(identifier):
-    if isinstance(identifier, Layer):
-      logging.warning(
-          'Do not pass a layer instance (such as {identifier}) as the '
-          'activation argument of another layer. Instead, advanced '
-          'activation layers should be used just like any other '
-          'layer in a model.'.format(identifier=identifier.__class__.__name__))
     return identifier
   else:
     raise ValueError('Could not interpret '
diff --git a/tensorflow/python/keras/_impl/keras/applications/imagenet_utils.py b/tensorflow/python/keras/_impl/keras/applications/imagenet_utils.py
index c26a28ed4087e30968585ec8ac0b64b51513bcae..d928a7afdc639485d443be382420cac09ba9abd6 100644
--- a/tensorflow/python/keras/_impl/keras/applications/imagenet_utils.py
+++ b/tensorflow/python/keras/_impl/keras/applications/imagenet_utils.py
@@ -22,8 +22,10 @@ import json
 
 import numpy as np
 
+from tensorflow.python.framework import constant_op
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras.utils.data_utils import get_file
+from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util.tf_export import tf_export
 
@@ -151,11 +153,11 @@ def _preprocess_symbolic_input(x, data_format, mode):
     std = None
 
   if _IMAGENET_MEAN is None:
-    _IMAGENET_MEAN = K.constant(-np.array(mean))
+    _IMAGENET_MEAN = constant_op.constant(-np.array(mean), dtype=K.floatx())
 
   # Zero-center by mean pixel
   if K.dtype(x) != K.dtype(_IMAGENET_MEAN):
-    x = K.bias_add(x, K.cast(_IMAGENET_MEAN, K.dtype(x)), data_format)
+    x = K.bias_add(x, math_ops.cast(_IMAGENET_MEAN, K.dtype(x)), data_format)
   else:
     x = K.bias_add(x, _IMAGENET_MEAN, data_format)
   if std is not None:
diff --git a/tensorflow/python/keras/_impl/keras/applications/mobilenet.py b/tensorflow/python/keras/_impl/keras/applications/mobilenet.py
index ad96b53a4528d99a014a0214b52a78d6a60076f8..12775fccecddd96028a9f2b793be155da5b8d838 100644
--- a/tensorflow/python/keras/_impl/keras/applications/mobilenet.py
+++ b/tensorflow/python/keras/_impl/keras/applications/mobilenet.py
@@ -84,11 +84,13 @@ from tensorflow.python.keras._impl.keras.engine.network import get_source_inputs
 from tensorflow.python.keras._impl.keras.layers import Activation
 from tensorflow.python.keras._impl.keras.layers import BatchNormalization
 from tensorflow.python.keras._impl.keras.layers import Conv2D
+from tensorflow.python.keras._impl.keras.layers import DepthwiseConv2D
 from tensorflow.python.keras._impl.keras.layers import Dropout
 from tensorflow.python.keras._impl.keras.layers import GlobalAveragePooling2D
 from tensorflow.python.keras._impl.keras.layers import GlobalMaxPooling2D
 from tensorflow.python.keras._impl.keras.layers import Input
 from tensorflow.python.keras._impl.keras.layers import Reshape
+from tensorflow.python.keras._impl.keras.layers import ZeroPadding2D
 from tensorflow.python.keras._impl.keras.models import Model
 from tensorflow.python.keras._impl.keras.utils import conv_utils
 from tensorflow.python.keras._impl.keras.utils.data_utils import get_file
@@ -116,195 +118,6 @@ def preprocess_input(x):
   return imagenet_utils.preprocess_input(x, mode='tf')
 
 
-class DepthwiseConv2D(Conv2D):
-  """Depthwise separable 2D convolution.
-
-  Depthwise Separable convolutions consists in performing
-  just the first step in a depthwise spatial convolution
-  (which acts on each input channel separately).
-  The `depth_multiplier` argument controls how many
-  output channels are generated per input channel in the depthwise step.
-
-  Arguments:
-      kernel_size: An integer or tuple/list of 2 integers, specifying the
-          width and height of the 2D convolution window.
-          Can be a single integer to specify the same value for
-          all spatial dimensions.
-      strides: An integer or tuple/list of 2 integers,
-          specifying the strides of the convolution along the width and height.
-          Can be a single integer to specify the same value for
-          all spatial dimensions.
-          Specifying any stride value != 1 is incompatible with specifying
-          any `dilation_rate` value != 1.
-      padding: one of `'valid'` or `'same'` (case-insensitive).
-      depth_multiplier: The number of depthwise convolution output channels
-          for each input channel.
-          The total number of depthwise convolution output
-          channels will be equal to `filters_in * depth_multiplier`.
-      data_format: A string,
-          one of `channels_last` (default) or `channels_first`.
-          The ordering of the dimensions in the inputs.
-          `channels_last` corresponds to inputs with shape
-          `(batch, height, width, channels)` while `channels_first`
-          corresponds to inputs with shape
-          `(batch, channels, height, width)`.
-          It defaults to the `image_data_format` value found in your
-          Keras config file at `~/.keras/keras.json`.
-          If you never set it, then it will be 'channels_last'.
-      activation: Activation function to use.
-          If you don't specify anything, no activation is applied
-          (ie. 'linear' activation: `a(x) = x`).
-      use_bias: Boolean, whether the layer uses a bias vector.
-      depthwise_initializer: Initializer for the depthwise kernel matrix.
-      bias_initializer: Initializer for the bias vector.
-      depthwise_regularizer: Regularizer function applied to
-          the depthwise kernel matrix.
-      bias_regularizer: Regularizer function applied to the bias vector.
-      activity_regularizer: Regularizer function applied to
-          the output of the layer (its 'activation')..
-      depthwise_constraint: Constraint function applied to
-          the depthwise kernel matrix.
-      bias_constraint: Constraint function applied to the bias vector.
-
-  Input shape:
-      4D tensor with shape:
-      `[batch, channels, rows, cols]` if data_format='channels_first'
-      or 4D tensor with shape:
-      `[batch, rows, cols, channels]` if data_format='channels_last'.
-
-  Output shape:
-      4D tensor with shape:
-      `[batch, filters, new_rows, new_cols]` if data_format='channels_first'
-      or 4D tensor with shape:
-      `[batch, new_rows, new_cols, filters]` if data_format='channels_last'.
-      `rows` and `cols` values might have changed due to padding.
-  """
-
-  def __init__(self,
-               kernel_size,
-               strides=(1, 1),
-               padding='valid',
-               depth_multiplier=1,
-               data_format=None,
-               activation=None,
-               use_bias=True,
-               depthwise_initializer='glorot_uniform',
-               bias_initializer='zeros',
-               depthwise_regularizer=None,
-               bias_regularizer=None,
-               activity_regularizer=None,
-               depthwise_constraint=None,
-               bias_constraint=None,
-               **kwargs):
-    super(DepthwiseConv2D, self).__init__(
-        filters=None,
-        kernel_size=kernel_size,
-        strides=strides,
-        padding=padding,
-        data_format=data_format,
-        activation=activation,
-        use_bias=use_bias,
-        bias_regularizer=bias_regularizer,
-        activity_regularizer=activity_regularizer,
-        bias_constraint=bias_constraint,
-        **kwargs)
-    self.depth_multiplier = depth_multiplier
-    self.depthwise_initializer = initializers.get(depthwise_initializer)
-    self.depthwise_regularizer = regularizers.get(depthwise_regularizer)
-    self.depthwise_constraint = constraints.get(depthwise_constraint)
-    self.bias_initializer = initializers.get(bias_initializer)
-
-  @shape_type_conversion
-  def build(self, input_shape):
-    if len(input_shape) < 4:
-      raise ValueError('Inputs to `DepthwiseConv2D` should have rank 4. '
-                       'Received input shape:', str(input_shape))
-    if self.data_format == 'channels_first':
-      channel_axis = 1
-    else:
-      channel_axis = 3
-    if input_shape[channel_axis] is None:
-      raise ValueError('The channel dimension of the inputs to '
-                       '`DepthwiseConv2D` '
-                       'should be defined. Found `None`.')
-    input_dim = int(input_shape[channel_axis])
-    depthwise_kernel_shape = (self.kernel_size[0], self.kernel_size[1],
-                              input_dim, self.depth_multiplier)
-
-    self.depthwise_kernel = self.add_weight(
-        shape=depthwise_kernel_shape,
-        initializer=self.depthwise_initializer,
-        name='depthwise_kernel',
-        regularizer=self.depthwise_regularizer,
-        constraint=self.depthwise_constraint)
-
-    if self.use_bias:
-      self.bias = self.add_weight(
-          shape=(input_dim * self.depth_multiplier,),
-          initializer=self.bias_initializer,
-          name='bias',
-          regularizer=self.bias_regularizer,
-          constraint=self.bias_constraint)
-    else:
-      self.bias = None
-    # Set input spec.
-    self.input_spec = InputSpec(ndim=4, axes={channel_axis: input_dim})
-    self.built = True
-
-  def call(self, inputs, training=None):
-    outputs = K.depthwise_conv2d(
-        inputs,
-        self.depthwise_kernel,
-        strides=self.strides,
-        padding=self.padding,
-        dilation_rate=self.dilation_rate,
-        data_format=self.data_format)
-
-    if self.bias:
-      outputs = K.bias_add(outputs, self.bias, data_format=self.data_format)
-
-    if self.activation is not None:
-      return self.activation(outputs)
-
-    return outputs
-
-  @shape_type_conversion
-  def compute_output_shape(self, input_shape):
-    if self.data_format == 'channels_first':
-      rows = input_shape[2]
-      cols = input_shape[3]
-      out_filters = input_shape[1] * self.depth_multiplier
-    elif self.data_format == 'channels_last':
-      rows = input_shape[1]
-      cols = input_shape[2]
-      out_filters = input_shape[3] * self.depth_multiplier
-
-    rows = conv_utils.conv_output_length(rows, self.kernel_size[0],
-                                         self.padding, self.strides[0])
-    cols = conv_utils.conv_output_length(cols, self.kernel_size[1],
-                                         self.padding, self.strides[1])
-
-    if self.data_format == 'channels_first':
-      return (input_shape[0], out_filters, rows, cols)
-    elif self.data_format == 'channels_last':
-      return (input_shape[0], rows, cols, out_filters)
-
-  def get_config(self):
-    config = super(DepthwiseConv2D, self).get_config()
-    config.pop('filters')
-    config.pop('kernel_initializer')
-    config.pop('kernel_regularizer')
-    config.pop('kernel_constraint')
-    config['depth_multiplier'] = self.depth_multiplier
-    config['depthwise_initializer'] = initializers.serialize(
-        self.depthwise_initializer)
-    config['depthwise_regularizer'] = regularizers.serialize(
-        self.depthwise_regularizer)
-    config['depthwise_constraint'] = constraints.serialize(
-        self.depthwise_constraint)
-    return config
-
-
 @tf_export('keras.applications.MobileNet',
            'keras.applications.mobilenet.MobileNet')
 def MobileNet(input_shape=None,
@@ -318,18 +131,11 @@ def MobileNet(input_shape=None,
               classes=1000):
   """Instantiates the MobileNet architecture.
 
-  Note that only TensorFlow is supported for now,
-  therefore it only works with the data format
-  `image_data_format='channels_last'` in your Keras config
-  at `~/.keras/keras.json`.
-
   To load a MobileNet model via `load_model`, import the custom
-  objects `relu6` and `DepthwiseConv2D` and pass them to the
-  `custom_objects` parameter.
+  objects `relu6` and pass them to the `custom_objects` parameter.
   E.g.
   model = load_model('mobilenet.h5', custom_objects={
-                     'relu6': mobilenet.relu6,
-                     'DepthwiseConv2D': mobilenet.DepthwiseConv2D})
+                     'relu6': mobilenet.relu6})
 
   Arguments:
       input_shape: optional shape tuple, only to be specified
@@ -383,11 +189,6 @@ def MobileNet(input_shape=None,
           backend that does not support separable convolutions.
   """
 
-  if K.backend() != 'tensorflow':
-    raise RuntimeError('Only TensorFlow backend is currently supported, '
-                       'as other backends do not support '
-                       'depthwise convolution.')
-
   if not (weights in {'imagenet', None} or os.path.exists(weights)):
     raise ValueError('The `weights` argument should be either '
                      '`None` (random initialization), `imagenet` '
@@ -522,7 +323,7 @@ def MobileNet(input_shape=None,
   # load weights
   if weights == 'imagenet':
     if K.image_data_format() == 'channels_first':
-      raise ValueError('Weights for "channels_last" format '
+      raise ValueError('Weights for "channels_first" format '
                        'are not available.')
     if alpha == 1.0:
       alpha_text = '1_0'
@@ -598,14 +399,14 @@ def _conv_block(inputs, filters, alpha, kernel=(3, 3), strides=(1, 1)):
   """
   channel_axis = 1 if K.image_data_format() == 'channels_first' else -1
   filters = int(filters * alpha)
+  x = ZeroPadding2D(padding=(1, 1), name='conv1_pad')(inputs)
   x = Conv2D(
       filters,
       kernel,
-      padding='same',
+      padding='valid',
       use_bias=False,
       strides=strides,
-      name='conv1')(
-          inputs)
+      name='conv1')(x)
   x = BatchNormalization(axis=channel_axis, name='conv1_bn')(x)
   return Activation(relu6, name='conv1_relu')(x)
 
@@ -665,15 +466,14 @@ def _depthwise_conv_block(inputs,
   """
   channel_axis = 1 if K.image_data_format() == 'channels_first' else -1
   pointwise_conv_filters = int(pointwise_conv_filters * alpha)
-
+  x = ZeroPadding2D(padding=(1, 1), name='conv_pad_%d' % block_id)(inputs)
   x = DepthwiseConv2D(  # pylint: disable=not-callable
       (3, 3),
-      padding='same',
+      padding='valid',
       depth_multiplier=depth_multiplier,
       strides=strides,
       use_bias=False,
-      name='conv_dw_%d' % block_id)(
-          inputs)
+      name='conv_dw_%d' % block_id)(x)
   x = BatchNormalization(axis=channel_axis, name='conv_dw_%d_bn' % block_id)(x)
   x = Activation(relu6, name='conv_dw_%d_relu' % block_id)(x)
 
diff --git a/tensorflow/python/keras/_impl/keras/applications/resnet50.py b/tensorflow/python/keras/_impl/keras/applications/resnet50.py
index 46c0e635578c7f4707b027247943d75b16d703ad..c3a92bea8920cad3297fee3efc50158813e72361 100644
--- a/tensorflow/python/keras/_impl/keras/applications/resnet50.py
+++ b/tensorflow/python/keras/_impl/keras/applications/resnet50.py
@@ -45,6 +45,7 @@ from tensorflow.python.keras._impl.keras.layers import GlobalAveragePooling2D
 from tensorflow.python.keras._impl.keras.layers import GlobalMaxPooling2D
 from tensorflow.python.keras._impl.keras.layers import Input
 from tensorflow.python.keras._impl.keras.layers import MaxPooling2D
+from tensorflow.python.keras._impl.keras.layers import ZeroPadding2D
 from tensorflow.python.keras._impl.keras.models import Model
 from tensorflow.python.keras._impl.keras.utils import layer_utils
 from tensorflow.python.keras._impl.keras.utils.data_utils import get_file
@@ -237,8 +238,7 @@ def ResNet50(include_top=True,
     bn_axis = 1
 
   x = Conv2D(
-      64, (7, 7), strides=(2, 2), padding='same', name='conv1')(
-          img_input)
+      64, (7, 7), strides=(2, 2), padding='same', name='conv1')(img_input)
   x = BatchNormalization(axis=bn_axis, name='bn_conv1')(x)
   x = Activation('relu')(x)
   x = MaxPooling2D((3, 3), strides=(2, 2))(x)
diff --git a/tensorflow/python/keras/_impl/keras/backend.py b/tensorflow/python/keras/_impl/keras/backend.py
index 7baf27642a475eb3a09687a1d19a6ed05de046e9..096db8db32db924718f5bac03746baa547b922e0 100644
--- a/tensorflow/python/keras/_impl/keras/backend.py
+++ b/tensorflow/python/keras/_impl/keras/backend.py
@@ -24,6 +24,7 @@ from __future__ import print_function
 import collections
 import json
 import os
+import weakref
 
 import numpy as np
 
@@ -34,7 +35,7 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes as dtypes_module
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
-from tensorflow.python.layers import base as tf_base_layers
+from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import clip_ops
 from tensorflow.python.ops import control_flow_ops
@@ -54,7 +55,7 @@ from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import tensor_array_grad  # pylint: disable=unused-import
 from tensorflow.python.ops import tensor_array_ops
 from tensorflow.python.ops import variables as variables_module
-from tensorflow.python.training import moving_averages
+
 from tensorflow.python.util import tf_contextlib
 from tensorflow.python.util import tf_inspect
 from tensorflow.python.util.tf_export import tf_export
@@ -262,6 +263,12 @@ def set_image_data_format(data_format):
   _IMAGE_DATA_FORMAT = str(data_format)
 
 
+# A global dictionary mapping graph objects to an index of counters used
+# for various layer names in each graph.
+# Allows to give unique autogenerated names to layers, in a graph-specific way.
+PER_GRAPH_LAYER_NAME_UIDS = weakref.WeakKeyDictionary()
+
+
 @tf_export('keras.backend.get_uid')
 def get_uid(prefix=''):
   """Associates a string prefix with an integer counter in a TensorFlow graph.
@@ -282,17 +289,16 @@ def get_uid(prefix=''):
   ```
   """
   graph = ops.get_default_graph()
-  if graph not in tf_base_layers.PER_GRAPH_LAYER_NAME_UIDS:
-    tf_base_layers.PER_GRAPH_LAYER_NAME_UIDS[graph] = collections.defaultdict(
-        int)
-  layer_name_uids = tf_base_layers.PER_GRAPH_LAYER_NAME_UIDS[graph]
+  if graph not in PER_GRAPH_LAYER_NAME_UIDS:
+    PER_GRAPH_LAYER_NAME_UIDS[graph] = collections.defaultdict(int)
+  layer_name_uids = PER_GRAPH_LAYER_NAME_UIDS[graph]
   layer_name_uids[prefix] += 1
   return layer_name_uids[prefix]
 
 
 @tf_export('keras.backend.reset_uids')
 def reset_uids():
-  per_graph_layer_name_uids = tf_base_layers.PER_GRAPH_LAYER_NAME_UIDS
+  per_graph_layer_name_uids = PER_GRAPH_LAYER_NAME_UIDS
   keys = list(per_graph_layer_name_uids.keys())
   for key in keys:
     del per_graph_layer_name_uids[key]
@@ -1275,6 +1281,11 @@ def moving_average_update(x, value, momentum):
   Returns:
       An Operation to update the variable.
   """
+  # `training` is higher-up than the Keras backend in the abstraction hierarchy.
+  # In particular, `training` depends on layers, and thus on Keras.
+  # moving_averages, being low-level ops, should not be part of the training
+  # module.
+  from tensorflow.python.training import moving_averages  # pylint: disable=g-import-not-at-top
   return moving_averages.assign_moving_average(
       x, value, momentum, zero_debias=True)
 
@@ -2795,6 +2806,8 @@ class Function(object):
     else:
       feed_dict = {}
 
+    session = get_session()
+    data_tensors_to_feed = []
     for tensor, value in zip(self.inputs, inputs):
       if value is None:
         continue
@@ -2803,9 +2816,20 @@ class Function(object):
         indices = np.concatenate((np.expand_dims(sparse_coo.row, 1),
                                   np.expand_dims(sparse_coo.col, 1)), 1)
         value = (indices, sparse_coo.data, sparse_coo.shape)
-      feed_dict[tensor] = value
+      elif tensor_util.is_tensor(value):
+        data_tensors_to_feed.append((tensor, value))
+      else:
+        feed_dict[tensor] = value
+
+    if data_tensors_to_feed:
+      # This is a *temporary* workaround (i.e. hack) to feed a symbolic tensor
+      # to `feed_dict`. It is very inefficient. It will be removed as soon
+      # as it becomes possible to pass symbolic tensors to `feed_dict`.
+      data_tensor_values = session.run([x[1] for x in data_tensors_to_feed])
+      for i, v in enumerate(data_tensor_values):
+        feed_dict[data_tensors_to_feed[i][0]] = v
+
     fetches = self.outputs + [self.updates_op] + self.fetches
-    session = get_session()
     updated = session.run(
         fetches=fetches, feed_dict=feed_dict, **self.session_kwargs)
     return updated[:len(self.outputs)]
diff --git a/tensorflow/python/keras/_impl/keras/constraints.py b/tensorflow/python/keras/_impl/keras/constraints.py
index 271fbbb63d3dfd50507837e190860d48315a14f2..abe95d8e0ca68b2e62f9574fba9ae912a9179fff 100644
--- a/tensorflow/python/keras/_impl/keras/constraints.py
+++ b/tensorflow/python/keras/_impl/keras/constraints.py
@@ -24,6 +24,7 @@ import six
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras.utils.generic_utils import deserialize_keras_object
 from tensorflow.python.keras._impl.keras.utils.generic_utils import serialize_keras_object
+from tensorflow.python.ops import math_ops
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -65,7 +66,8 @@ class MaxNorm(Constraint):
     self.axis = axis
 
   def __call__(self, w):
-    norms = K.sqrt(K.sum(K.square(w), axis=self.axis, keepdims=True))
+    norms = K.sqrt(
+        math_ops.reduce_sum(math_ops.square(w), axis=self.axis, keepdims=True))
     desired = K.clip(norms, 0, self.max_value)
     return w * (desired / (K.epsilon() + norms))
 
@@ -79,7 +81,7 @@ class NonNeg(Constraint):
   """
 
   def __call__(self, w):
-    return w * K.cast(K.greater_equal(w, 0.), K.floatx())
+    return w * math_ops.cast(math_ops.greater_equal(w, 0.), K.floatx())
 
 
 @tf_export('keras.constraints.UnitNorm', 'keras.constraints.unit_norm')
@@ -105,7 +107,9 @@ class UnitNorm(Constraint):
 
   def __call__(self, w):
     return w / (
-        K.epsilon() + K.sqrt(K.sum(K.square(w), axis=self.axis, keepdims=True)))
+        K.epsilon() + K.sqrt(
+            math_ops.reduce_sum(
+                math_ops.square(w), axis=self.axis, keepdims=True)))
 
   def get_config(self):
     return {'axis': self.axis}
@@ -148,7 +152,8 @@ class MinMaxNorm(Constraint):
     self.axis = axis
 
   def __call__(self, w):
-    norms = K.sqrt(K.sum(K.square(w), axis=self.axis, keepdims=True))
+    norms = K.sqrt(
+        math_ops.reduce_sum(math_ops.square(w), axis=self.axis, keepdims=True))
     desired = (
         self.rate * K.clip(norms, self.min_value, self.max_value) +
         (1 - self.rate) * norms)
diff --git a/tensorflow/python/keras/_impl/keras/engine/base_layer.py b/tensorflow/python/keras/_impl/keras/engine/base_layer.py
index 5615241ae3077102ef40f9c0619161964a62a335..3b3af7d092534e36dd597970bb64204373ad2bf5 100644
--- a/tensorflow/python/keras/_impl/keras/engine/base_layer.py
+++ b/tensorflow/python/keras/_impl/keras/engine/base_layer.py
@@ -13,138 +13,145 @@
 # limitations under the License.
 # ==============================================================================
 # pylint: disable=protected-access
-"""Base layer code (`Layer`).
-"""
+"""Contains the base Layer class, from which all layers inherit."""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import collections
+import inspect  # Necessary supplement to tf_inspect to deal with variadic args.
+import re
+
+import numpy as np
 from six.moves import zip  # pylint: disable=redefined-builtin
 
 from tensorflow.python.eager import context
+from tensorflow.python.estimator import util as estimator_util
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
-from tensorflow.python.keras._impl.keras import backend as K
+from tensorflow.python.keras._impl.keras import backend
 from tensorflow.python.keras._impl.keras import constraints
 from tensorflow.python.keras._impl.keras import initializers
 from tensorflow.python.keras._impl.keras import regularizers
 from tensorflow.python.keras._impl.keras.utils import generic_utils
-from tensorflow.python.layers import base as tf_base_layers
-from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import variable_scope as vs
+from tensorflow.python.ops import variables as tf_variables
+from tensorflow.python.training import checkpointable
+from tensorflow.python.util import nest
+from tensorflow.python.util import tf_decorator
+from tensorflow.python.util import tf_inspect
 from tensorflow.python.util.tf_export import tf_export
 
 
-# pylint: disable=invalid-name
-InputSpec = tf_base_layers.InputSpec
-Node = tf_base_layers.Node
-TFBaseLayer = tf_base_layers.Layer
-# pylint: enable=invalid-name
+@tf_export('keras.layers.Layer')
+class Layer(checkpointable.CheckpointableBase):
+  """Base layer class.
 
+  This is the class from which all layers inherit.
 
-@tf_export('keras.layers.Layer')
-class Layer(tf_base_layers.Layer):
-  """Abstract base layer class.
-
-  # Properties
-      name: String, must be unique within a model.
-      input_spec: List of InputSpec class instances
-          each entry describes one required input:
-              - ndim
-              - dtype
-          A layer with `n` input tensors must have
-          an `input_spec` of length `n`.
-      trainable: Boolean, whether the layer weights
-          will be updated during training.
-      uses_learning_phase: Whether any operation
-          of the layer uses `K.in_training_phase()`
-          or `K.in_test_phase()`.
-      input_shape: Shape tuple. Provided for convenience,
-          but note that there may be cases in which this
-          attribute is ill-defined (e.g. a shared layer
-          with multiple input shapes), in which case
-          requesting `input_shape` will raise an Exception.
-          Prefer using `layer.get_input_shape_for(input_shape)`,
-          or `layer.get_input_shape_at(node_index)`.
-      output_shape: Shape tuple. See above.
-      inbound_nodes: List of nodes.
-      outbound_nodes: List of nodes.
-      input, output: Input/output tensor(s). Note that if the layer is used
-          more than once (shared layer), this is ill-defined
-          and will raise an exception. In such cases, use
-          `layer.get_input_at(node_index)`.
-      input_mask, output_mask: Same as above, for masks.
-      trainable_weights: List of variables.
-      non_trainable_weights: List of variables.
-      weights: The concatenation of the lists trainable_weights and
-          non_trainable_weights (in this order).
-
-  # Methods
-      call(x, mask=None): Where the layer's logic lives.
-      __call__(x, mask=None): Wrapper around the layer logic (`call`).
-          If x is a Keras tensor:
-              - Connect current layer with last layer from tensor:
-                  `self._add_inbound_node(last_layer)`
-              - Add layer to tensor history
-          If layer is not built:
-              - Build from inputs shape
-      get_weights()
-      set_weights(weights)
-      get_config()
-      count_params()
-      compute_output_shape(input_shape)
-      compute_mask(x, mask)
-      get_input_at(node_index)
-      get_output_at(node_index)
-      get_input_shape_at(node_index)
-      get_output_shape_at(node_index)
-      get_input_mask_at(node_index)
-      get_output_mask_at(node_index)
-
-  # Class Methods
-      from_config(config)
-
-  # Internal methods:
-      build(input_shape)
-      _add_inbound_node(layer, index=0)
+  A layer is a class implementing common neural networks operations, such
+  as convolution, batch norm, etc. These operations require managing weights,
+  losses, updates, and inter-layer connectivity.
+
+  Users will just instantiate a layer and then treat it as a callable.
+
+  We recommend that descendants of `Layer` implement the following methods:
+  * `__init__()`: Save configuration in member variables
+  * `build()`: Called once from `__call__`, when we know the shapes of inputs
+    and `dtype`. Should have the calls to `add_weight()`, and then
+    call the super's `build()` (which sets `self.built = True`, which is
+    nice in case the user wants to call `build()` manually before the
+    first `__call__`).
+  * `call()`: Called in `__call__` after making sure `build()` has been called
+    once. Should actually perform the logic of applying the layer to the
+    input tensors (which should be passed in as the first argument).
+
+  Arguments:
+    trainable: Boolean, whether the layer's variables should be trainable.
+    name: String name of the layer.
+    dtype: Default dtype of the layer's weights (default of `None` means use the
+      type of the first input).
+
+  Read-only properties:
+    name: The name of the layer (string).
+    dtype: Default dtype of the layer's weights (default of `None` means use the
+      type of the first input).
+    trainable_variables: List of trainable variables.
+    non_trainable_variables: List of non-trainable variables.
+    variables: List of all variables of this layer, trainable and
+      non-trainable.
+    updates: List of update ops of this layer.
+    losses: List of losses added by this layer.
+    trainable_weights: List of variables to be included in backprop.
+    non_trainable_weights: List of variables that should not be
+      included in backprop.
+    weights: The concatenation of the lists trainable_weights and
+      non_trainable_weights (in this order).
+
+  Mutable properties:
+    trainable: Whether the layer should be trained (boolean).
+    input_spec: Optional (list of) `InputSpec` object(s) specifying the
+      constraints on inputs that can be accepted by the layer.
   """
 
-  def __init__(self, **kwargs):
+  def __init__(self, trainable=True, name=None, dtype=None, **kwargs):
     # These properties should be set by the user via keyword arguments.
     # note that 'dtype', 'input_shape' and 'batch_input_shape'
     # are only applicable to input layers: do not pass these keywords
     # to non-input layers.
     allowed_kwargs = {
-        'activity_regularizer',
         'input_shape',
         'batch_input_shape',
         'batch_size',
-        'dtype',
-        'name',
-        'trainable',
         'weights',
+        'activity_regularizer',
     }
     # Validate optional keyword arguments.
     for kwarg in kwargs:
       if kwarg not in allowed_kwargs:
         raise TypeError('Keyword argument not understood:', kwarg)
 
-    # Get layer name.
-    name = kwargs.get('name')
-
-    # Get `trainable` status.
-    trainable = kwargs.get('trainable', True)
-
-    # Get `dtype`.
-    dtype = kwargs.get('dtype')
-    if dtype is None:
-      dtype = K.floatx()
-
-    # Call super, which will set all properties common to Keras layers
-    # and core TF layers.
-    super(Layer, self).__init__(
-        name=name, dtype=dtype, trainable=trainable,
-        activity_regularizer=kwargs.get('activity_regularizer'))
+    # Mutable properties
+    # Indicates whether the layer's weights are updated during training
+    # and whether the layer's updates are run during training
+    self.trainable = trainable
+    # A stateful layer is a layer whose updates are run during inference too,
+    # for instance stateful RNNs.
+    self.stateful = False
+    # Indicates whether `build` needs to be called upon layer call, to create
+    # the layer's weights.
+    self.built = False
+    # Provides information about which inputs are compatible with the layer.
+    self.input_spec = None
+
+    self._init_set_name(name)
+
+    activity_regularizer = kwargs.pop('activity_regularizer', None)
+    if activity_regularizer and context.executing_eagerly():
+      raise ValueError(
+          ('Activity regularization is not supported when executing eagerly. '
+           'Got activity_regularizer=%s') % (activity_regularizer,))
+    self._activity_regularizer = activity_regularizer
+    self._trainable_weights = []
+    self._non_trainable_weights = []
+    self._updates = []
+    # When executing eagerly, _losses is a list of zero-argument lambdas which
+    # return tensors. When using graph execution, _losses is a list of ops.
+    self._losses = []
+    self._dtype = None if dtype is None else dtypes.as_dtype(dtype).name
+    self._call_fn_args = estimator_util.fn_args(self.call)
+    self._compute_previous_mask = ('mask' in self._call_fn_args or
+                                   hasattr(self, 'compute_mask'))
+    self._uses_inputs_arg = True
+
+    # These lists will be filled via successive calls
+    # to self._add_inbound_node().
+    self._inbound_nodes = []
+    self._outbound_nodes = []
 
-    # Add properties that are Keras-only for now.
     self.supports_masking = False
 
     # Manage input shape information if passed.
@@ -167,39 +174,404 @@ class Layer(tf_base_layers.Layer):
     else:
       self._initial_weights = None
 
-  def add_weight(self,
-                 name,
-                 shape,
+  def _init_set_name(self, name, zero_based=True):
+    if not name:
+      self._name = unique_layer_name(
+          to_snake_case(self.__class__.__name__), zero_based=zero_based)
+    else:
+      self._name = name
+
+  @property
+  def dtype(self):
+    return self._dtype
+
+  @property
+  def name(self):
+    return self._name
+
+  @property
+  def activity_regularizer(self):
+    """Optional regularizer function for the output of this layer."""
+    return self._activity_regularizer
+
+  @activity_regularizer.setter
+  def activity_regularizer(self, regularizer):
+    """Optional regularizer function for the output of this layer."""
+    self._activity_regularizer = regularizer
+
+  @property
+  def trainable_weights(self):
+    return self._trainable_weights if self.trainable else []
+
+  @property
+  def non_trainable_weights(self):
+    if self.trainable:
+      return self._non_trainable_weights
+    else:
+      return self._trainable_weights + self._non_trainable_weights
+
+  @property
+  def trainable_variables(self):
+    return self.trainable_weights
+
+  @property
+  def non_trainable_variables(self):
+    return self.non_trainable_weights
+
+  @property
+  def weights(self):
+    """Returns the list of all layer variables/weights.
+
+    Returns:
+      A list of variables.
+    """
+    return self.trainable_weights + self.non_trainable_weights
+
+  @property
+  def variables(self):
+    """Returns the list of all layer variables/weights.
+
+    Returns:
+      A list of variables.
+    """
+    return self.weights
+
+  @property
+  def updates(self):
+    if context.executing_eagerly():
+      raise RuntimeError('Layer.updates not supported in Eager mode.')
+    if not self.trainable and not self.stateful:
+      return []
+    return self._updates
+
+  def add_update(self, updates, inputs=None):
+    """Add update op(s), potentially dependent on layer inputs.
+
+    Weight updates (for instance, the updates of the moving mean and variance
+    in a BatchNormalization layer) may be dependent on the inputs passed
+    when calling a layer. Hence, when reusing the same layer on
+    different inputs `a` and `b`, some entries in `layer.updates` may be
+    dependent on `a` and some on `b`. This method automatically keeps track
+    of dependencies.
+
+    The `get_updates_for` method allows to retrieve the updates relevant to a
+    specific set of inputs.
+
+    This call is ignored when eager execution is enabled (in that case, variable
+    updates are run on the fly and thus do not need to be tracked for later
+    execution).
+
+    Arguments:
+      updates: Update op, or list/tuple of update ops.
+      inputs: If anything other than None is passed, it signals the updates
+        are conditional on some of the layer's inputs,
+        and thus they should only be run where these inputs are available.
+        This is the case for BatchNormalization updates, for instance.
+        If None, the updates will be taken into account unconditionally,
+        and you are responsible for making sure that any dependency they might
+        have is available at runtime.
+        A step counter might fall into this category.
+    """
+    if context.executing_eagerly():
+      return  # Updates already applied when in eager mode.
+
+    def process_update(x):
+      if isinstance(x, ops.Operation):
+        return x
+      elif hasattr(x, 'op'):
+        return x.op
+      else:
+        return ops.convert_to_tensor(x)
+
+    updates = generic_utils.to_list(updates)
+    updates = [process_update(x) for x in updates]
+    self._updates += updates
+    if inputs is None:
+      for u in updates:
+        u._unconditional_update = True  # pylint: disable=protected-access
+    else:
+      for u in updates:
+        u._unconditional_update = False  # pylint: disable=protected-access
+
+  def get_updates_for(self, inputs):
+    """Retrieves updates relevant to a specific set of inputs.
+
+    Arguments:
+      inputs: Input tensor or list/tuple of input tensors.
+
+    Returns:
+      List of update ops of the layer that depend on `inputs`.
+
+    Raises:
+      RuntimeError: If called in Eager mode.
+    """
+    if context.executing_eagerly():
+      raise RuntimeError('`get_updates_for()` not supported in Eager mode.')
+
+    # Updates disabled if layer is not trainable and not explicitly stateful.
+    if not self.trainable and not self.stateful:
+      return []
+
+    if inputs is None:
+      # Requesting unconditional updates.
+      return [x for x in self.updates if x._unconditional_update]  # pylint: disable=protected-access
+
+    # Requesting input-conditional updates.
+    inputs = nest.flatten(inputs)
+    reachable = get_reachable_from_inputs(inputs, self.updates)
+    updates = []
+    for update in self.updates:
+      if update in reachable:
+        updates.append(update)
+    return updates
+
+  @property
+  def losses(self):
+    """Losses which are associated with this `Layer`.
+
+    Note that when executing eagerly, getting this property evaluates
+    regularizers. When using graph execution, variable regularization ops have
+    already been created and are simply returned here.
+
+    Returns:
+      A list of tensors.
+    """
+    if context.executing_eagerly():
+      # _losses may only contain variable regularization losses when executing
+      # eagerly, and they have been saved as lambdas to be executed when
+      # requested.
+      return [regularizer() for regularizer in self._losses]
+    else:
+      return self._losses
+
+  def add_loss(self, losses, inputs=None):
+    """Add loss tensor(s), potentially dependent on layer inputs.
+
+    Some losses (for instance, activity regularization losses) may be dependent
+    on the inputs passed when calling a layer. Hence, when reusing the same
+    layer on different inputs `a` and `b`, some entries in `layer.losses` may
+    be dependent on `a` and some on `b`. This method automatically keeps track
+    of dependencies.
+
+    The `get_losses_for` method allows to retrieve the losses relevant to a
+    specific set of inputs.
+
+    Note that `add_loss` is not supported when executing eagerly. Instead,
+    variable regularizers may be added through `add_variable`. Activity
+    regularization is not supported directly (but such losses may be returned
+    from `Layer.call()`).
+
+    Arguments:
+      losses: Loss tensor, or list/tuple of tensors.
+      inputs: If anything other than None is passed, it signals the losses
+        are conditional on some of the layer's inputs,
+        and thus they should only be run where these inputs are available.
+        This is the case for activity regularization losses, for instance.
+        If `None` is passed, the losses are assumed
+        to be unconditional, and will apply across all dataflows of the layer
+        (e.g. weight regularization losses).
+
+    Raises:
+      RuntimeError: If called in Eager mode.
+    """
+    if context.executing_eagerly():
+      # TODO(fchollet): it should be possible (and highly desirable) to support
+      # `add_loss` in eager mode. This allows great convenience and flexibility
+      # in defining custom losses on the fly (e.g. in VAEs).
+      # Simply appending the loss value to `self._losses`
+      # is the correct behavior.
+      # The only caveat is that we need to force the user to only call
+      # `add_loss` from inside a model or Layer's `call` method
+      # (otherwise the loss computation cannot be backproped through).
+      raise RuntimeError('Layer.add_loss not supported in Eager mode.')
+
+    losses = generic_utils.to_list(losses)
+    self._losses += losses
+    if inputs is None:
+      for loss in losses:
+        loss._unconditional_loss = True  # pylint: disable=protected-access
+    else:
+      for loss in losses:
+        loss._unconditional_loss = False  # pylint: disable=protected-access
+
+  def get_losses_for(self, inputs):
+    """Retrieves losses relevant to a specific set of inputs.
+
+    Arguments:
+      inputs: Input tensor or list/tuple of input tensors.
+
+    Returns:
+      List of loss tensors of the layer that depend on `inputs`.
+
+    Raises:
+      RuntimeError: If called in Eager mode.
+    """
+    if context.executing_eagerly():
+      raise RuntimeError('Layer.get_losses_for not supported in Eager mode.')
+
+    if inputs is None:
+      # Requesting unconditional losses.
+      return [x for x in self.losses if x._unconditional_loss]  # pylint: disable=protected-access
+
+    # Requesting input-conditional losses.
+    inputs = nest.flatten(inputs)
+    # Retrieve the set of tensors in the TF graph that depend on `inputs`.
+    # The losses we want to return will be part of this set.
+    # To avoid unnecessary work, we stop the search in case all of
+    # `self.losses` have been retrieved.
+    reachable = get_reachable_from_inputs(inputs, self.losses)
+    losses = []
+    for loss in self.losses:
+      if loss in reachable:
+        losses.append(loss)
+    return losses
+
+  def _name_scope(self):
+    return self.name
+
+  def build(self, _):
+    """Creates the variables of the layer."""
+    self.built = True
+
+  def add_variable(self, *args, **kwargs):
+    """Alias for `add_weight`."""
+    return self.add_weight(*args, **kwargs)
+
+  def add_weight(self, name, shape,
                  dtype=None,
                  initializer=None,
                  regularizer=None,
                  trainable=True,
-                 constraint=None):
-    """Adds a weight variable to the layer.
+                 constraint=None,
+                 partitioner=None,
+                 use_resource=None,
+                 getter=None):
+    """Adds a new variable to the layer, or gets an existing one; returns it.
 
     Arguments:
-        name: String, the name for the weight variable.
-        shape: The shape tuple of the weight.
-        dtype: The dtype of the weight.
-        initializer: An Initializer instance (callable).
-        regularizer: An optional Regularizer instance.
-        trainable: A boolean, whether the weight should
-            be trained via backprop or not (assuming
-            that the layer itself is also trainable).
-        constraint: An optional Constraint instance.
+      name: variable name.
+      shape: variable shape.
+      dtype: The type of the variable. Defaults to `self.dtype` or `float32`.
+      initializer: initializer instance (callable).
+      regularizer: regularizer instance (callable).
+      trainable: whether the variable should be part of the layer's
+        "trainable_variables" (e.g. variables, biases)
+        or "non_trainable_variables" (e.g. BatchNorm mean, stddev).
+        Note, if the current variable scope is marked as non-trainable
+        then this parameter is ignored and any added variables are also
+        marked as non-trainable.
+      constraint: constraint instance (callable).
+      partitioner: Partitioner to be passed to the `Checkpointable` API.
+      use_resource: Whether to use `ResourceVariable`.
+      getter: Variable getter argument to be passed to the `Checkpointable` API.
 
     Returns:
-        The created weight variable.
+      The created variable.  Usually either a `Variable` or `ResourceVariable`
+      instance.  If `partitioner` is not `None`, a `PartitionedVariable`
+      instance is returned.
+
+    Raises:
+      RuntimeError: If called with partioned variable regularization and
+        eager execution is enabled.
     """
     if dtype is None:
-      dtype = K.floatx()
-    weight = self.add_variable(name, shape,
-                               dtype=dtype,
-                               initializer=initializers.get(initializer),
-                               regularizer=regularizers.get(regularizer),
-                               constraint=constraints.get(constraint),
-                               trainable=trainable)
-    return weight
+      dtype = self.dtype or backend.floatx()
+    initializer = initializers.get(initializer)
+    if initializer is None:
+      # Default TensorFlow initializer.
+      initializer = initializers.glorot_uniform()
+    regularizer = regularizers.get(regularizer)
+    constraint = constraints.get(constraint)
+
+    variable = self._add_variable_with_custom_getter(
+        name=name,
+        shape=shape,
+        # TODO(allenl): a `make_variable` equivalent should be added as a
+        # `Checkpointable` method.
+        getter=getter or make_variable,
+        # Manage errors in Layer rather than Checkpointable.
+        overwrite=True,
+        initializer=initializer,
+        dtype=dtypes.as_dtype(dtype),
+        constraint=constraint,
+        trainable=trainable and self.trainable,
+        partitioner=partitioner,
+        use_resource=use_resource)
+
+    if regularizer is not None:
+      # TODO(fchollet): in the future, this should be handled at the
+      # level of variable creation, and weight regularization losses
+      # should be variable attributes.
+      self._handle_weight_regularization(name, variable, regularizer)
+
+    if trainable:
+      self._trainable_weights.append(variable)
+    else:
+      self._non_trainable_weights.append(variable)
+    return variable
+
+  def _handle_weight_regularization(self, name, variable, regularizer):
+    # `init_graph` should point to the graph in which variable initialization
+    # will occur; it should be None if and only if initialization will take
+    # place in the eager context.
+    init_graph = None
+    if not context.executing_eagerly():
+      default_graph = ops.get_default_graph()
+      if default_graph.building_function:
+        with ops.init_scope():
+          # Retrieve the variables from the graph into which variables
+          # will be lifted; if initialization ops will be lifted into
+          # the eager context, then there is nothing to retrieve, since variable
+          # collections are not supported when eager execution is enabled.
+          if not context.executing_eagerly():
+            init_graph = ops.get_default_graph()
+      else:
+        # Initialization ops will not be lifted out of the default graph.
+        init_graph = default_graph
+
+    if init_graph is not None:  # pylint: disable=protected-access
+      # The variable was created and initialized in a graph.
+      if regularizer:
+        if isinstance(variable, tf_variables.PartitionedVariable):
+          for v in variable:
+            with ops.colocate_with(v.op):
+              with ops.name_scope(name + '/Regularizer'):
+                regularization = regularizer(v)
+            if regularization is not None:
+              self.add_loss(regularization)
+        else:
+          with ops.colocate_with(variable.op):
+            with ops.name_scope(name + '/Regularizer'):
+              regularization = regularizer(variable)
+          if regularization is not None:
+            self.add_loss(regularization)
+    elif regularizer:  # initialization took place in an eager context
+      if isinstance(variable, tf_variables.PartitionedVariable):
+        raise RuntimeError(
+            'Partitioned variable regularization is not yet '
+            'supported when executing eagerly. File a feature request'
+            'if this is important to you.')
+      # Save a zero-argument lambda which runs the regularizer on the
+      # variable, to be executed when `Layer.losses` is requested.
+      # This makes losses responsive to variable updates when executing
+      # eagerly.
+      #
+      # TODO(akshayka): Do the same for graphs as well, so that losses
+      # collected in a while_loop can be run outside its control flow
+      # context and so that losses won't be swallowed up by graph functions
+      # (i.e., `.losses()` should always create regularizers).
+      self._losses.append(lambda: regularizer(variable))
+
+  def _handle_activity_regularization(self, inputs, outputs):
+    # Apply activity regularization.
+    # Note that it should be applied every time the layer creates a new
+    # output, since it is output-specific.
+    if self._activity_regularizer:
+      output_list = nest.flatten(outputs)
+      for output in output_list:
+        with ops.name_scope('ActivityRegularizer'):
+          activity_regularization = self._activity_regularizer(output)
+        self.add_loss(activity_regularization, inputs=inputs)
 
   def call(self, inputs, **kwargs):  # pylint: disable=unused-argument
     """This is where the layer's logic lives.
@@ -213,53 +585,278 @@ class Layer(tf_base_layers.Layer):
     """
     return inputs
 
-  def __call__(self, inputs, **kwargs):
-    """Wrapper around self.call(), for handling internal references.
-
-    If a Keras tensor is passed:
-        - We call self._add_inbound_node().
-        - If necessary, we `build` the layer to match
-            the shape of the input(s).
-        - We update the _keras_history of the output tensor(s)
-            with the current layer.
-            This is done as part of _add_inbound_node().
+  def __call__(self, inputs, *args, **kwargs):
+    """Wraps `call`, applying pre- and post-processing steps.
 
     Arguments:
-        inputs: Can be a tensor or list/tuple of tensors.
-        **kwargs: Additional keyword arguments to be passed to `call()`.
+      inputs: input tensor(s).
+      *args: additional positional arguments to be passed to `self.call`.
+      **kwargs: additional keyword arguments to be passed to `self.call`.
 
     Returns:
-        Output of the layer's `call` method.
+      Output tensor(s).
+
+    Note:
+      - The following optional keyword arguments are reserved for specific uses:
+        * `training`: Boolean scalar tensor of Python boolean indicating
+          whether the `call` is meant for training or inference.
+        * `mask`: Boolean input mask.
+      - If the layer's `call` method takes a `mask` argument (as some Keras
+        layers do), its default value will be set to the mask generated
+        for `inputs` by the previous layer (if `input` did come from
+        a layer that generated a corresponding mask, i.e. if it came from
+        a Keras layer with masking support.
 
     Raises:
-        ValueError: in case the layer is missing shape information
-            for its `build` call.
+      ValueError: if the layer's `call` method returns None (an invalid value).
     """
-    # Actually call the layer (optionally building it).
-    output = super(Layer, self).__call__(inputs, **kwargs)
-    if context.executing_eagerly():
-      return output
+    input_list = nest.flatten(inputs)
+
+    build_graph = not context.executing_eagerly()
+    # TODO(fchollet, allenl): Make deferred mode work with subclassed Models
+    # which don't use an "inputs" argument.
+    in_deferred_mode = isinstance(input_list[0], DeferredTensor)
+
+    # Handle Keras mask propagation from previous layer to current layer.
+    previous_mask = None
+    if (not hasattr(self, '_compute_previous_mask') or
+        self._compute_previous_mask):
+      previous_mask = collect_previous_mask(inputs)
+      if not hasattr(self, '_call_fn_args'):
+        self._call_fn_args = estimator_util.fn_args(self.call)
+      if ('mask' in self._call_fn_args and 'mask' not in kwargs and
+          not is_all_none(previous_mask)):
+        # The previous layer generated a mask, and mask was not explicitly pass
+        # to __call__, hence we set previous_mask as the default value.
+        kwargs['mask'] = previous_mask
+
+    input_shapes = None
+
+    with ops.name_scope(self._name_scope()):
+      if not self.built:
+        if not build_graph:
+          # Activity regularization is currently unsupported in Eager mode.
+          if self._activity_regularizer:
+            raise ValueError(
+                'activity_regularizer currently unsupported with '
+                'eager execution enabled. Found an activity_regularizer in '
+                '%s(%s).' % (self.__class__.__name__, self))
+        if not build_graph and not in_deferred_mode:
+          for x in input_list:
+            if hasattr(x, '_keras_history'):
+              raise ValueError('_keras_history currently unsupported in '
+                               'Eager mode. Found _keras_history in %s while '
+                               'executing __call__ for %s(%s)' %
+                               (x, self.__class_.__name__, self))
+
+        # Check input assumptions set before layer building, e.g. input rank.
+        self._assert_input_compatibility(inputs)
+        if input_list and self._dtype is None:
+          try:
+            self._dtype = input_list[0].dtype.base_dtype.name
+          except AttributeError:
+            pass
+        if all(hasattr(x, 'get_shape') for x in input_list):
+          input_shapes = nest.map_structure(lambda x: x.get_shape(), inputs)
+        self.build(input_shapes)
+
+      # Check input assumptions set after layer building, e.g. input shape.
+      if build_graph or in_deferred_mode:
+        self._assert_input_compatibility(inputs)
+
+      if not in_deferred_mode:
+        outputs = self.call(inputs, *args, **kwargs)
+        if outputs is None:
+          raise ValueError('A layer\'s `call` method should return a Tensor '
+                           'or a list of Tensors, not None (layer: ' +
+                           self.name + ').')
+      else:
+        # Deferred mode behavior: use `compute_output_shape` to
+        # infer the number of outputs of the layer and their shapes.
+        if input_shapes is None:
+          input_shapes = nest.map_structure(lambda x: x.get_shape(), inputs)
+
+        output_shapes = self.compute_output_shape(input_shapes)
+        output_shapes = nest.flatten(output_shapes)
+        outputs = [
+            # TODO(fchollet): name the deferred tensors?
+            DeferredTensor(shape=shape, dtype=self._dtype)
+            for shape in output_shapes
+        ]
+        if len(outputs) == 1:
+          outputs = outputs[0]
+
+      if build_graph:
+        self._handle_activity_regularization(inputs, outputs)
+        # TODO(fchollet): consider enabling masking for Eager mode.
+        self._set_mask_metadata(inputs, outputs, previous_mask)
+
+      if in_deferred_mode or build_graph and have_all_keras_metadata(inputs):
+        inputs, outputs = self._set_connectivity_metadata_(
+            inputs, outputs, args, kwargs)
+
+      self.built = True
+      if context.executing_eagerly():
+        return outputs
+
+      if hasattr(self, '_symbolic_set_inputs') and not self.inputs:
+        # Subclassed network: explicitly set metadata normally set by a call to
+        # self._set_inputs(). This is not relevant in eager execution.
+        self._symbolic_set_inputs(inputs, outputs)
+
+      if in_deferred_mode or build_graph:
+        self._set_learning_phase_metadata(inputs, outputs)
 
-    if hasattr(self, '_symbolic_set_inputs') and not self.inputs:
-      # Subclassed network: explicitly set metadata normally set by a call to
-      # self._set_inputs().
-      self._symbolic_set_inputs(inputs, output)
+    # Optionally load weight values that were specified at layer instantiation.
+    # TODO(fchollet): consider enabling this with eager execution too.
+    if hasattr(self, '_initial_weights') and self._initial_weights is not None:
+      self.set_weights(self._initial_weights)
+      del self._initial_weights
+    return outputs
+
+  def apply(self, inputs, *args, **kwargs):
+    """Apply the layer on a input.
+
+    This simply wraps `self.__call__`.
+
+    Arguments:
+      inputs: Input tensor(s).
+      *args: additional positional arguments to be passed to `self.call`.
+      **kwargs: additional keyword arguments to be passed to `self.call`.
+
+    Returns:
+      Output tensor(s).
+    """
+    return self.__call__(inputs, *args, **kwargs)
 
-    # Update learning phase info.
-    output_tensors = generic_utils.to_list(output)
+  def _set_learning_phase_metadata(self, inputs, outputs):
+    # Update learning phase info. To work with subclassed models,
+    # this should be done even if Keras metadata is absent.
+    output_tensors = generic_utils.to_list(outputs)
     uses_lp = any(
         [getattr(x, '_uses_learning_phase', False)
          for x in generic_utils.to_list(inputs)])
     uses_lp = getattr(self, 'uses_learning_phase', False) or uses_lp
     for i in range(len(output_tensors)):
-      output_tensors[i]._uses_learning_phase = getattr(
-          output_tensors[i], '_uses_learning_phase', False) or uses_lp
+      try:
+        output_tensors[i]._uses_learning_phase = getattr(
+            output_tensors[i], '_uses_learning_phase', False) or uses_lp
+      except AttributeError:
+        # An output element happens to be a C type (such as tuple or dict).
+        # We don't track learning phase info in such edge cases.
+        pass
+
+  def _set_mask_metadata(self, inputs, outputs, previous_mask):
+    if hasattr(self, 'compute_mask'):
+      output_mask = self.compute_mask(inputs, previous_mask)
+      if isinstance(outputs, (list, tuple)):
+        if output_mask is None:
+          output_mask = [None for _ in range(len(outputs))]
+        for x, m in zip(outputs, output_mask):
+          try:
+            x._keras_mask = m  # pylint: disable=protected-access
+          except AttributeError:
+            pass  # C type such as dict. Masking not supported in this case.
+      else:
+        try:
+          outputs._keras_mask = output_mask  # pylint: disable=protected-access
+        except AttributeError:
+          pass  # C type such as dict. Masking not supported in this case.
+
+  def _set_connectivity_metadata_(self, inputs, outputs, args, kwargs):
+    if args and getattr(self, '_uses_inputs_arg', True):
+      raise TypeError(
+          'This Layer takes an `inputs` argument to call(), and only the '
+          '`inputs` argument may be specified as a positional argument. '
+          'Pass everything else as a keyword argument (those arguments will'
+          ' not be tracked as inputs to the Layer).')
+
+    # If the layer returns tensors from its inputs, unmodified,
+    # we copy them to avoid loss of tensor metadata.
+    output_ls = nest.flatten(outputs)
+    output_ls_copy = []
+    for x in output_ls:
+      if x in nest.flatten(inputs):
+        with ops.name_scope(self.name):
+          x = array_ops.identity(x)
+      output_ls_copy.append(x)
+    if len(output_ls_copy) == 1:
+      outputs = output_ls_copy[0]
+    else:
+      outputs = output_ls_copy
 
-    # Optionally load weight values that were specified at layer instantiation.
-    if hasattr(self, '_initial_weights') and self._initial_weights is not None:
-      self.set_weights(self._initial_weights)
-      del self._initial_weights
-    return output
+    inputs, kwargs = self._inputs_from_call_args(
+        call_args=(inputs,) + args, call_kwargs=kwargs)
+    # Add an inbound node to the layer, so it can keep track of this call.
+    # This updates the layer history of the output tensor(s).
+    kwargs.pop('mask', None)  # `mask` should not be serialized.
+    self._add_inbound_node(
+        input_tensors=inputs, output_tensors=outputs, arguments=kwargs)
+    return inputs, outputs
+
+  def _inputs_from_call_args(self, call_args, call_kwargs):
+    """Get Layer inputs from __call__ *args and **kwargs.
+
+    Args:
+      call_args: The positional arguments passed to __call__.
+      call_kwargs: The keyword argument dict passed to __call__.
+
+    Returns:
+      A tuple of (inputs, non_input_kwargs). These may be the same objects as
+      were passed in (call_args and call_kwargs).
+    """
+    if getattr(self, '_uses_inputs_arg', True):
+      assert len(call_args) == 1  # TypeError raised earlier in __call__.
+      return call_args[0], call_kwargs
+    else:
+      call_arg_spec = tf_inspect.getargspec(self.call)
+      # There is no explicit "inputs" argument expected or provided to
+      # call(). Arguments which have default values are considered non-inputs,
+      # and arguments without are considered inputs.
+      if call_arg_spec.defaults:
+        if call_arg_spec.varargs is not None:
+          raise TypeError(
+              'Layer.call() may not accept both *args and arguments with '
+              'default values (unable to determine which are inputs to the '
+              'Layer).')
+        keyword_arg_names = set(
+            call_arg_spec.args[-len(call_arg_spec.defaults):])
+      else:
+        keyword_arg_names = set()
+        # Training is never an input argument name, to allow signatures like
+        # call(x, training).
+      keyword_arg_names.add('training')
+      _, unwrapped_call = tf_decorator.unwrap(self.call)
+      bound_args = inspect.getcallargs(
+          unwrapped_call, *call_args, **call_kwargs)
+      if call_arg_spec.keywords is not None:
+        var_kwargs = bound_args.pop(call_arg_spec.keywords)
+        bound_args.update(var_kwargs)
+        keyword_arg_names = keyword_arg_names.union(var_kwargs.keys())
+      all_args = call_arg_spec.args
+      if all_args and bound_args[all_args[0]] is self:
+        # Ignore the 'self' argument of methods
+        bound_args.pop(call_arg_spec.args[0])
+        all_args = all_args[1:]
+      non_input_arg_values = {}
+      input_arg_values = []
+      remaining_args_are_keyword = False
+      for argument_name in all_args:
+        if argument_name in keyword_arg_names:
+          remaining_args_are_keyword = True
+        else:
+          if remaining_args_are_keyword:
+            raise TypeError(
+                'Found a positional argument to call() after a non-input '
+                'argument. All arguments after "training" must be keyword '
+                'arguments, and are not tracked as inputs to the Layer.')
+        if remaining_args_are_keyword:
+          non_input_arg_values[argument_name] = bound_args[argument_name]
+        else:
+          input_arg_values.append(bound_args[argument_name])
+      if call_arg_spec.varargs is not None:
+        input_arg_values.extend(bound_args[call_arg_spec.varargs])
+      return input_arg_values, non_input_arg_values
 
   def compute_output_shape(self, input_shape):
     """Computes the output shape of the layer.
@@ -276,13 +873,7 @@ class Layer(tf_base_layers.Layer):
     Returns:
         An input shape tuple.
     """
-    logging.warning(
-        'All custom layers should implement the '
-        '`compute_output_shape` method. This layer (' + self.name + ') '
-        'is relying on the base `Layer.compute_output_shape` implementation, '
-        'which will start raising a `NotImplementedError` '
-        'as of July 1st, 2018.')
-    return input_shape
+    raise NotImplementedError
 
   def compute_mask(self, inputs, mask=None):  # pylint: disable=unused-argument
     """Computes an output mask tensor.
@@ -310,6 +901,87 @@ class Layer(tf_base_layers.Layer):
     # carry over the input mask
     return mask
 
+  def _add_inbound_node(self,
+                        input_tensors,
+                        output_tensors,
+                        arguments=None):
+    """Internal method to create an inbound node for the layer.
+
+    Arguments:
+        input_tensors: list of input tensors.
+        output_tensors: list of output tensors.
+        arguments: dictionary of keyword arguments that were passed to the
+            `call` method of the layer at the call that created the node.
+    """
+    input_tensors = nest.flatten(input_tensors)
+    output_tensors = nest.flatten(output_tensors)
+
+    # Collect input tensor(s) coordinates.
+    inbound_layers = []
+    node_indices = []
+    tensor_indices = []
+    for x in input_tensors:
+      assert hasattr(x, '_keras_history')
+      inbound_layer, node_index, tensor_index = x._keras_history  # pylint: disable=protected-access
+      inbound_layers.append(inbound_layer)
+      node_indices.append(node_index)
+      tensor_indices.append(tensor_index)
+
+    # Create node, add it to inbound nodes.
+    Node(
+        self,
+        inbound_layers=inbound_layers,
+        node_indices=node_indices,
+        tensor_indices=tensor_indices,
+        input_tensors=input_tensors,
+        output_tensors=output_tensors,
+        arguments=arguments)
+
+    # Update tensor history metadata.
+    for i in range(len(output_tensors)):
+      # The metadata attribute consists of 1) a layer instance
+      # 2) a node index for the layer, 3) a tensor index for the node.
+      # The allows layer reuse (multiple nodes per layer) and multi-output
+      # or multi-input layers (e.g. a layer can return multiple tensors,
+      # and each can be sent to a different layer).
+      output_tensors[i]._keras_history = (self, len(self._inbound_nodes) - 1, i)  # pylint: disable=protected-access
+
+  def _get_node_attribute_at_index(self, node_index, attr, attr_name):
+    """Private utility to retrieves an attribute (e.g. inputs) from a node.
+
+    This is used to implement the methods:
+        - get_input_shape_at
+        - get_output_shape_at
+        - get_input_at
+        etc...
+
+    Arguments:
+        node_index: Integer index of the node from which
+            to retrieve the attribute.
+        attr: Exact node attribute name.
+        attr_name: Human-readable attribute name, for error messages.
+
+    Returns:
+        The layer's attribute `attr` at the node of index `node_index`.
+
+    Raises:
+        RuntimeError: If the layer has no inbound nodes, or if called in Eager
+        mode.
+        ValueError: If the index provided does not match any node.
+    """
+    if not self._inbound_nodes:
+      raise RuntimeError('The layer has never been called '
+                         'and thus has no defined ' + attr_name + '.')
+    if not len(self._inbound_nodes) > node_index:
+      raise ValueError('Asked to get ' + attr_name + ' at node ' +
+                       str(node_index) + ', but the layer has only ' +
+                       str(len(self._inbound_nodes)) + ' inbound nodes.')
+    values = getattr(self._inbound_nodes[node_index], attr)
+    if len(values) == 1:
+      return values[0]
+    else:
+      return values
+
   def get_input_mask_at(self, node_index):
     """Retrieves the input mask tensor(s) of a layer at a given node.
 
@@ -390,6 +1062,325 @@ class Layer(tf_base_layers.Layer):
     else:
       return getattr(output, '_keras_mask', None)
 
+  def get_input_shape_at(self, node_index):
+    """Retrieves the input shape(s) of a layer at a given node.
+
+    Arguments:
+        node_index: Integer, index of the node
+            from which to retrieve the attribute.
+            E.g. `node_index=0` will correspond to the
+            first time the layer was called.
+
+    Returns:
+        A shape tuple
+        (or list of shape tuples if the layer has multiple inputs).
+
+    Raises:
+      RuntimeError: If called in Eager mode.
+    """
+    return self._get_node_attribute_at_index(node_index, 'input_shapes',
+                                             'input shape')
+
+  def get_output_shape_at(self, node_index):
+    """Retrieves the output shape(s) of a layer at a given node.
+
+    Arguments:
+        node_index: Integer, index of the node
+            from which to retrieve the attribute.
+            E.g. `node_index=0` will correspond to the
+            first time the layer was called.
+
+    Returns:
+        A shape tuple
+        (or list of shape tuples if the layer has multiple outputs).
+
+    Raises:
+      RuntimeError: If called in Eager mode.
+    """
+    return self._get_node_attribute_at_index(node_index, 'output_shapes',
+                                             'output shape')
+
+  def get_input_at(self, node_index):
+    """Retrieves the input tensor(s) of a layer at a given node.
+
+    Arguments:
+        node_index: Integer, index of the node
+            from which to retrieve the attribute.
+            E.g. `node_index=0` will correspond to the
+            first time the layer was called.
+
+    Returns:
+        A tensor (or list of tensors if the layer has multiple inputs).
+
+    Raises:
+      RuntimeError: If called in Eager mode.
+    """
+    return self._get_node_attribute_at_index(node_index, 'input_tensors',
+                                             'input')
+
+  def get_output_at(self, node_index):
+    """Retrieves the output tensor(s) of a layer at a given node.
+
+    Arguments:
+        node_index: Integer, index of the node
+            from which to retrieve the attribute.
+            E.g. `node_index=0` will correspond to the
+            first time the layer was called.
+
+    Returns:
+        A tensor (or list of tensors if the layer has multiple outputs).
+
+    Raises:
+      RuntimeError: If called in Eager mode.
+    """
+    return self._get_node_attribute_at_index(node_index, 'output_tensors',
+                                             'output')
+
+  @property
+  def input(self):
+    """Retrieves the input tensor(s) of a layer.
+
+    Only applicable if the layer has exactly one input,
+    i.e. if it is connected to one incoming layer.
+
+    Returns:
+        Input tensor or list of input tensors.
+
+    Raises:
+        AttributeError: if the layer is connected to
+        more than one incoming layers.
+
+    Raises:
+      RuntimeError: If called in Eager mode.
+      AttributeError: If no inbound nodes are found.
+    """
+    if not self._inbound_nodes:
+      raise AttributeError('Layer ' + self.name +
+                           ' is not connected, no input to return.')
+    return self._get_node_attribute_at_index(0, 'input_tensors', 'input')
+
+  @property
+  def output(self):
+    """Retrieves the output tensor(s) of a layer.
+
+    Only applicable if the layer has exactly one output,
+    i.e. if it is connected to one incoming layer.
+
+    Returns:
+      Output tensor or list of output tensors.
+
+    Raises:
+      AttributeError: if the layer is connected to more than one incoming
+        layers.
+      RuntimeError: if called in Eager mode.
+    """
+    if not self._inbound_nodes:
+      raise AttributeError('Layer ' + self.name + ' has no inbound nodes.')
+    return self._get_node_attribute_at_index(0, 'output_tensors', 'output')
+
+  @property
+  def input_shape(self):
+    """Retrieves the input shape(s) of a layer.
+
+    Only applicable if the layer has exactly one input,
+    i.e. if it is connected to one incoming layer, or if all inputs
+    have the same shape.
+
+    Returns:
+        Input shape, as an integer shape tuple
+        (or list of shape tuples, one tuple per input tensor).
+
+    Raises:
+        AttributeError: if the layer has no defined input_shape.
+        RuntimeError: if called in Eager mode.
+    """
+    if not self._inbound_nodes:
+      raise AttributeError('The layer has never been called '
+                           'and thus has no defined input shape.')
+    all_input_shapes = set(
+        [str(node.input_shapes) for node in self._inbound_nodes])
+    if len(all_input_shapes) == 1:
+      input_shapes = self._inbound_nodes[0].input_shapes
+      if len(input_shapes) == 1:
+        return tuple(tensor_shape.TensorShape(input_shapes[0]).as_list())
+      else:
+        return [
+            tuple(tensor_shape.TensorShape(shape).as_list())
+            for shape in input_shapes
+        ]
+    else:
+      raise AttributeError('The layer "' + str(self.name) +
+                           ' has multiple inbound nodes, '
+                           'with different input shapes. Hence '
+                           'the notion of "input shape" is '
+                           'ill-defined for the layer. '
+                           'Use `get_input_shape_at(node_index)` '
+                           'instead.')
+
+  def count_params(self):
+    """Count the total number of scalars composing the weights.
+
+    Returns:
+        An integer count.
+
+    Raises:
+        ValueError: if the layer isn't yet built
+          (in which case its weights aren't yet defined).
+    """
+    if not self.built:
+      if self.__class__.__name__ == 'Sequential':
+        self.build()  # pylint: disable=no-value-for-parameter
+      else:
+        raise ValueError('You tried to call `count_params` on ' + self.name +
+                         ', but the layer isn\'t built. '
+                         'You can build it manually via: `' + self.name +
+                         '.build(batch_input_shape)`.')
+    weight_shapes = [w.get_shape().as_list() for w in self.weights]
+    return int(sum([np.prod(w) for w in weight_shapes]))
+
+  @property
+  def output_shape(self):
+    """Retrieves the output shape(s) of a layer.
+
+    Only applicable if the layer has one output,
+    or if all outputs have the same shape.
+
+    Returns:
+        Output shape, as an integer shape tuple
+        (or list of shape tuples, one tuple per output tensor).
+
+    Raises:
+        AttributeError: if the layer has no defined output shape.
+        RuntimeError: if called in Eager mode.
+    """
+    if not self._inbound_nodes:
+      raise AttributeError('The layer has never been called '
+                           'and thus has no defined output shape.')
+    all_output_shapes = set(
+        [str(node.output_shapes) for node in self._inbound_nodes])
+    if len(all_output_shapes) == 1:
+      output_shapes = self._inbound_nodes[0].output_shapes
+      if len(output_shapes) == 1:
+        return tuple(tensor_shape.TensorShape(output_shapes[0]).as_list())
+      else:
+        return [
+            tuple(tensor_shape.TensorShape(shape).as_list())
+            for shape in output_shapes
+        ]
+    else:
+      raise AttributeError('The layer "%s"'
+                           ' has multiple inbound nodes, '
+                           'with different output shapes. Hence '
+                           'the notion of "output shape" is '
+                           'ill-defined for the layer. '
+                           'Use `get_output_shape_at(node_index)` '
+                           'instead.' % self.name)
+
+  @property
+  def inbound_nodes(self):
+    """Deprecated, do NOT use! Only for compatibility with external Keras."""
+    return self._inbound_nodes
+
+  @property
+  def outbound_nodes(self):
+    """Deprecated, do NOT use! Only for compatibility with external Keras."""
+    return self._outbound_nodes
+
+  def _assert_input_compatibility(self, inputs):
+    """Checks compatibility between the layer and provided inputs.
+
+    This checks that the tensor(s) `inputs` verify the input assumptions
+    of the layer (if any). If not, a clear and actional exception gets raised.
+
+    Arguments:
+        inputs: input tensor or list of input tensors.
+
+    Raises:
+        ValueError: in case of mismatch between
+            the provided inputs and the expectations of the layer.
+    """
+    if not self.input_spec:
+      return
+    if not isinstance(self.input_spec, (list, tuple)):
+      input_spec = nest.flatten(self.input_spec)
+    else:
+      input_spec = self.input_spec
+    inputs = nest.flatten(inputs)
+    if len(inputs) != len(input_spec):
+      raise ValueError('Layer ' + self.name + ' expects ' +
+                       str(len(input_spec)) + ' inputs, '
+                       'but it received ' + str(len(inputs)) +
+                       ' input tensors. Inputs received: ' + str(inputs))
+    for input_index, (x, spec) in enumerate(zip(inputs, input_spec)):
+      if spec is None:
+        continue
+
+      if (spec.ndim is not None or
+          spec.min_ndim is not None or
+          spec.max_ndim is not None):
+        if x.get_shape().ndims is None:
+          raise ValueError('Input ' + str(input_index) + ' of layer ' +
+                           self.name + ' is incompatible with the layer: '
+                           'its rank is undefined, but the layer requires a '
+                           'defined rank.')
+
+      # Check ndim.
+      if spec.ndim is not None:
+        ndim = x.get_shape().ndims
+        if ndim != spec.ndim:
+          raise ValueError('Input ' + str(input_index) + ' of layer ' +
+                           self.name + ' is incompatible with the layer: '
+                           'expected ndim=' + str(spec.ndim) + ', found ndim=' +
+                           str(ndim) + '. Full shape received: ' +
+                           str(x.get_shape().as_list()))
+      if spec.max_ndim is not None:
+        ndim = x.get_shape().ndims
+        if ndim is not None and ndim > spec.max_ndim:
+          raise ValueError('Input ' + str(input_index) + ' of layer ' +
+                           self.name + ' is incompatible with the layer: '
+                           'expected max_ndim=' + str(spec.max_ndim) +
+                           ', found ndim=' + str(ndim))
+      if spec.min_ndim is not None:
+        ndim = x.get_shape().ndims
+        if ndim is not None and ndim < spec.min_ndim:
+          raise ValueError('Input ' + str(input_index) + ' of layer ' +
+                           self.name + ' is incompatible with the layer: '
+                           ': expected min_ndim=' + str(spec.min_ndim) +
+                           ', found ndim=' + str(ndim) +
+                           '. Full shape received: ' +
+                           str(x.get_shape().as_list()))
+      # Check dtype.
+      if spec.dtype is not None:
+        if x.dtype != spec.dtype:
+          raise ValueError('Input ' + str(input_index) + ' of layer ' +
+                           self.name + ' is incompatible with the layer: '
+                           'expected dtype=' + str(spec.dtype) +
+                           ', found dtype=' + str(x.dtype))
+      # Check specific shape axes.
+      if spec.axes:
+        shape = x.get_shape().as_list()
+        if shape is not None:
+          for axis, value in spec.axes.items():
+            if hasattr(value, 'value'):
+              value = value.value
+            if value is not None and shape[int(axis)] not in {value, None}:
+              raise ValueError(
+                  'Input ' + str(input_index) + ' of layer ' + self.name + ' is'
+                  ' incompatible with the layer: expected axis ' + str(axis) +
+                  ' of input shape to have value ' + str(value) +
+                  ' but received input with shape ' + str(shape))
+      # Check shape.
+      if spec.shape is not None:
+        shape = x.get_shape().as_list()
+        if shape is not None:
+          for spec_dim, dim in zip(spec.shape, shape):
+            if spec_dim is not None and dim is not None:
+              if spec_dim != dim:
+                raise ValueError('Input ' + str(input_index) +
+                                 ' is incompatible with layer ' + self.name +
+                                 ': expected shape=' + str(spec.shape) +
+                                 ', found shape=' + str(shape))
+
   def set_weights(self, weights):
     """Sets the weights of the layer, from Numpy arrays.
 
@@ -414,14 +1405,14 @@ class Layer(tf_base_layers.Layer):
     if not params:
       return
     weight_value_tuples = []
-    param_values = K.batch_get_value(params)
+    param_values = backend.batch_get_value(params)
     for pv, p, w in zip(param_values, params, weights):
       if pv.shape != w.shape:
         raise ValueError('Layer weight shape ' + str(pv.shape) +
                          ' not compatible with '
                          'provided weight shape ' + str(w.shape))
       weight_value_tuples.append((p, w))
-    K.batch_set_value(weight_value_tuples)
+    backend.batch_set_value(weight_value_tuples)
 
   def get_weights(self):
     """Returns the current weights of the layer.
@@ -430,7 +1421,7 @@ class Layer(tf_base_layers.Layer):
         Weights values as a list of numpy arrays.
     """
     params = self.weights
-    return K.batch_get_value(params)
+    return backend.batch_get_value(params)
 
   def get_config(self):
     """Returns the config of the layer.
@@ -472,9 +1463,196 @@ class Layer(tf_base_layers.Layer):
     """
     return cls(**config)
 
-  @tf_base_layers.Layer.activity_regularizer.setter
-  def activity_regularizer(self, activity_regularizer):
-    self._activity_regularizer = activity_regularizer
+
+@tf_export('keras.layers.InputSpec', 'layers.InputSpec')
+class InputSpec(object):
+  """Specifies the ndim, dtype and shape of every input to a layer.
+
+  Every layer should expose (if appropriate) an `input_spec` attribute:
+  a list of instances of InputSpec (one per input tensor).
+
+  A None entry in a shape is compatible with any dimension,
+  a None shape is compatible with any shape.
+
+  Arguments:
+      dtype: Expected DataType of the input.
+      shape: Shape tuple, expected shape of the input
+          (may include None for unchecked axes).
+      ndim: Integer, expected rank of the input.
+      max_ndim: Integer, maximum rank of the input.
+      min_ndim: Integer, minimum rank of the input.
+      axes: Dictionary mapping integer axes to
+          a specific dimension value.
+  """
+
+  def __init__(self,
+               dtype=None,
+               shape=None,
+               ndim=None,
+               max_ndim=None,
+               min_ndim=None,
+               axes=None):
+    self.dtype = dtype
+    self.shape = shape
+    if shape is not None:
+      self.ndim = len(shape)
+    else:
+      self.ndim = ndim
+    self.max_ndim = max_ndim
+    self.min_ndim = min_ndim
+    self.axes = axes or {}
+
+  def __repr__(self):
+    spec = [('dtype=' + str(self.dtype)) if self.dtype else '',
+            ('shape=' + str(self.shape)) if self.shape else '',
+            ('ndim=' + str(self.ndim)) if self.ndim else '',
+            ('max_ndim=' + str(self.max_ndim)) if self.max_ndim else '',
+            ('min_ndim=' + str(self.min_ndim)) if self.min_ndim else '',
+            ('axes=' + str(self.axes)) if self.axes else '']
+    return 'InputSpec(%s)' % ', '.join(x for x in spec if x)
+
+
+class Node(object):
+  """A `Node` describes the connectivity between two layers.
+
+  Each time a layer is connected to some new input,
+  a node is added to `layer._inbound_nodes`.
+  Each time the output of a layer is used by another layer,
+  a node is added to `layer._outbound_nodes`.
+
+  Arguments:
+      outbound_layer: the layer that takes
+          `input_tensors` and turns them into `output_tensors`
+          (the node gets created when the `call`
+          method of the layer was called).
+      inbound_layers: a list of layers, the same length as `input_tensors`,
+          the layers from where `input_tensors` originate.
+      node_indices: a list of integers, the same length as `inbound_layers`.
+          `node_indices[i]` is the origin node of `input_tensors[i]`
+          (necessary since each inbound layer might have several nodes,
+          e.g. if the layer is being shared with a different data stream).
+      tensor_indices: a list of integers,
+          the same length as `inbound_layers`.
+          `tensor_indices[i]` is the index of `input_tensors[i]` within the
+          output of the inbound layer
+          (necessary since each inbound layer might
+          have multiple tensor outputs, with each one being
+          independently manipulable).
+      input_tensors: list of input tensors.
+      output_tensors: list of output tensors.
+      arguments: dictionary of keyword arguments that were passed to the
+          `call` method of the layer at the call that created the node.
+
+  `node_indices` and `tensor_indices` are basically fine-grained coordinates
+  describing the origin of the `input_tensors`.
+
+  A node from layer A to layer B is added to:
+    - A._outbound_nodes
+    - B._inbound_nodes
+  """
+
+  def __init__(self,
+               outbound_layer,
+               inbound_layers,
+               node_indices,
+               tensor_indices,
+               input_tensors,
+               output_tensors,
+               arguments=None):
+    # Layer instance (NOT a list).
+    if isinstance(outbound_layer, list):
+      raise ValueError(
+          '`outbound_layer` should be a layer instance, not a list.')
+    # this is the layer that takes a list of input tensors
+    # and turns them into a list of output tensors.
+    # the current node will be added to
+    # the inbound_nodes of outbound_layer.
+    self.outbound_layer = outbound_layer
+
+    # The following 3 properties describe where
+    # the input tensors come from: which layers,
+    # and for each layer, which node and which
+    # tensor output of each node.
+
+    # List of layer instances.
+    self.inbound_layers = inbound_layers
+    # List of integers, 1:1 mapping with inbound_layers.
+    self.node_indices = node_indices
+    # List of integers, 1:1 mapping with inbound_layers.
+    self.tensor_indices = tensor_indices
+
+    # Following 2 properties:
+    # tensor inputs and outputs of outbound_layer.
+
+    # List of tensors. 1:1 mapping with inbound_layers.
+    self.input_tensors = input_tensors
+    # List of tensors, created by outbound_layer.call().
+    self.output_tensors = output_tensors
+
+    # Following 2 properties: input and output shapes.
+
+    # List of shape tuples, shapes of input_tensors.
+    self.input_shapes = [static_shape(x) for x in input_tensors]
+    # List of shape tuples, shapes of output_tensors.
+    self.output_shapes = [static_shape(x) for x in output_tensors]
+
+    # Optional keyword arguments to layer's `call`.
+    self.arguments = arguments
+
+    # Add nodes to all layers involved.
+    for layer in inbound_layers:
+      if layer is not None:
+        # For compatibility with external Keras, we use the deprecated
+        # accessor here.
+        layer.outbound_nodes.append(self)
+    # For compatibility with external Keras, we use the deprecated
+    # accessor here.
+    outbound_layer.inbound_nodes.append(self)
+
+  def get_config(self):
+    inbound_names = []
+    for layer in self.inbound_layers:
+      if layer:
+        inbound_names.append(layer.name)
+      else:
+        inbound_names.append(None)
+    return {
+        'outbound_layer': self.outbound_layer.name,
+        'inbound_layers': inbound_names,
+        'node_indices': self.node_indices,
+        'tensor_indices': self.tensor_indices
+    }
+
+
+class DeferredTensor(object):
+  """Tensor-like object used to build graphs of layers in Eager mode.
+
+  When calling a layer on a DeferredTensor, the layer will not perform any
+  computation and will simply perfom shape inference to return new
+  DeferredTensors with appropriate shape information. Thus DeferredTensor
+  behaves like a graph-mode Tensor when manipulated by layers.
+  """
+
+  def __init__(self, shape, dtype, name=None):
+    self.shape = tensor_shape.TensorShape(shape)
+    if dtype is None:
+      self.dtype = dtypes.as_dtype(np.float32)
+    else:
+      self.dtype = dtypes.as_dtype(dtype)
+    self.name = name
+
+  def get_shape(self):
+    return self.shape
+
+  def __str__(self):
+    return "DeferredTensor('%s', shape=%s, dtype=%s)" % (self.name,
+                                                         self.get_shape(),
+                                                         self.dtype.name)
+
+  def __repr__(self):
+    return "<DeferredTensor '%s' shape=%s dtype=%s>" % (self.name,
+                                                        self.get_shape(),
+                                                        self.dtype.name)
 
 
 def shape_type_conversion(fn):
@@ -503,3 +1681,251 @@ def shape_type_conversion(fn):
       return tensor_shape.TensorShape(output_shape)
 
   return wrapper
+
+
+def object_list_uid(object_list):
+  """Creates a single string from object ids."""
+  object_list = nest.flatten(object_list)
+  return ', '.join([str(abs(id(x))) for x in object_list])
+
+
+def static_shape(x):
+  """Get the static shape of a Tensor, or None if it is unavailable."""
+  if x is None:
+    return None
+  try:
+    return tuple(x.get_shape().as_list())
+  except ValueError:
+    return None
+
+
+def get_reachable_from_inputs(inputs, targets=None):
+  """Returns the set of tensors/ops reachable from `inputs`.
+
+  Stops if all targets have been found (target is optional).
+
+  Only valid in Symbolic mode, not Eager mode.
+
+  Args:
+    inputs: List of tensors.
+    targets: List of tensors.
+
+  Returns:
+    A set of tensors reachable from the inputs (includes the inputs themselves).
+  """
+  reachable = set(inputs)
+  if targets:
+    targets = set(targets)
+  queue = inputs[:]
+
+  while queue:
+    x = queue.pop()
+    if isinstance(x, ops.Operation):
+      outputs = x.outputs[:] or []
+      outputs += x._control_outputs
+    elif isinstance(x, ops.Tensor):
+      outputs = x.consumers()
+    elif isinstance(x, tf_variables.Variable):
+      outputs = [x.op]
+    else:
+      raise TypeError('Expected Operation, Variable, or Tensor, got ' + str(x))
+
+    for y in outputs:
+      if y not in reachable:
+        reachable.add(y)
+        queue.insert(0, y)
+
+    if targets and targets.issubset(reachable):
+      return reachable
+  return reachable
+
+
+def unique_layer_name(name, name_uid_map=None, avoid_names=None, namespace='',
+                      zero_based=False):
+  """Makes a layer name (or arbitrary string) unique within a TensorFlow graph.
+
+  Arguments:
+    name: String name to make unique.
+    name_uid_map: An optional defaultdict(int) to use when creating unique
+      names. If None (default), uses a per-Graph dictionary.
+    avoid_names: An optional set or dict with names which should not be used. If
+      None (default) does not avoid any names.
+    namespace: Gets a name which is unique within the (graph, namespace). Layers
+      which are not Networks use a blank namespace and so get graph-global
+      names.
+    zero_based: If True, name sequences start with no suffix (e.g. "dense",
+      "dense_1"). If False, naming is one-based ("dense_1", "dense_2").
+
+  Returns:
+    Unique string name.
+
+  Example:
+
+  ```python
+  _unique_layer_name('dense')  # dense_1
+  _unique_layer_name('dense')  # dense_2
+  ```
+  """
+  if name_uid_map is None:
+    name_uid_map = get_default_graph_uid_map()
+  if avoid_names is None:
+    avoid_names = set()
+  proposed_name = None
+  while proposed_name is None or proposed_name in avoid_names:
+    name_key = (namespace, name)
+    if zero_based:
+      number = name_uid_map[name_key]
+      if number:
+        proposed_name = name + '_' + str(number)
+      else:
+        proposed_name = name
+      name_uid_map[name_key] += 1
+    else:
+      name_uid_map[name_key] += 1
+      proposed_name = name + '_' + str(name_uid_map[name_key])
+  return proposed_name
+
+
+def to_snake_case(name):
+  intermediate = re.sub('(.)([A-Z][a-z0-9]+)', r'\1_\2', name)
+  insecure = re.sub('([a-z])([A-Z])', r'\1_\2', intermediate).lower()
+  # If the class is private the name starts with "_" which is not secure
+  # for creating scopes. We prefix the name with "private" in this case.
+  if insecure[0] != '_':
+    return insecure
+  return 'private' + insecure
+
+
+def is_all_none(iterable_or_element):
+  if not isinstance(iterable_or_element, (list, tuple)):
+    iterable = [iterable_or_element]
+  else:
+    iterable = iterable_or_element
+  # We cannot use Python's `any` because the iterable may return Tensors.
+  for element in iterable:
+    if element is not None:
+      return False
+  return True
+
+
+def have_all_keras_metadata(iterable_or_element):
+  if not isinstance(iterable_or_element, (list, tuple)):
+    iterable = [iterable_or_element]
+  else:
+    iterable = iterable_or_element
+  return all([hasattr(x, '_keras_history') for x in iterable])
+
+
+def collect_previous_mask(input_tensors):
+  """Retrieves the output mask(s) of the previous node.
+
+  Arguments:
+      input_tensors: A tensor or list of tensors.
+
+  Returns:
+      A mask tensor or list of mask tensors.
+  """
+  input_tensors = nest.flatten(input_tensors)
+  masks = []
+  for x in input_tensors:
+    if hasattr(x, '_keras_mask'):
+      mask = x._keras_mask  # pylint: disable=protected-access
+      masks.append(mask)
+    else:
+      masks.append(None)
+  if len(masks) == 1:
+    return masks[0]
+  return masks
+
+
+def is_tensor_or_tensor_list(v):
+  v = nest.flatten(v)
+  if v and isinstance(v[0], ops.Tensor):
+    return True
+  else:
+    return False
+
+
+def get_default_graph_uid_map():
+  # TODO(fchollet): refactor this into backend.
+  graph = ops.get_default_graph()
+  name_uid_map = backend.PER_GRAPH_LAYER_NAME_UIDS.get(graph, None)
+  if name_uid_map is None:
+    name_uid_map = collections.defaultdict(int)
+    backend.PER_GRAPH_LAYER_NAME_UIDS[graph] = name_uid_map
+  return name_uid_map
+
+
+def make_variable(name,
+                  shape=None,
+                  dtype=dtypes.float32,
+                  initializer=None,
+                  partition_info=None,
+                  trainable=True,
+                  caching_device=None,
+                  validate_shape=True,
+                  constraint=None,
+                  use_resource=None,
+                  partitioner=None):  # pylint: disable=unused-argument
+  """Temporary util to create a variable (relies on `variable_scope.variable`).
+
+  Some reuse-related technicalities prevent us from using
+  `variable_scope.get_variable()` directly, so we use a subcomponent
+  that has fewer constraints (`variable_scope.variable()`).
+
+  In the longer term, it seems like a similar "default variable creator" method
+  should exist in `CheckpointableBase` instead. When this happens, we can get
+  rid of this temporary solution.
+
+  TODO(fchollet): remove this method when no longer needed.
+  TODO(fchollet): handle `partitioner` argument.
+
+  Arguments:
+    name: Variable name.
+    shape: Variable shape.
+    dtype: The type of the variable. Defaults to `self.dtype` or `float32`.
+    initializer: Initializer instance (callable).
+    partition_info: Not handled at this time.
+    trainable: Whether the variable should be part of the layer's
+      "trainable_variables" (e.g. variables, biases)
+      or "non_trainable_variables" (e.g. BatchNorm mean, stddev).
+      Note, if the current variable scope is marked as non-trainable
+      then this parameter is ignored and any added variables are also
+      marked as non-trainable.
+    caching_device: Passed to `vs.variable`.
+    validate_shape: Passed to `vs.variable`.
+    constraint: Constraint instance (callable).
+    use_resource: Whether to use a `ResourceVariable`.
+    partitioner: Not handled at this time.
+
+  Returns:
+    Variable instance.
+  """
+  initializing_from_value = False
+  if initializer is not None and not callable(initializer):
+    initializing_from_value = True
+
+  with ops.init_scope():
+    if initializing_from_value:
+      init_val = initializer
+      variable_dtype = None
+    else:
+      # Instantiate initializer if provided initializer is a type object.
+      if isinstance(initializer, type(init_ops.Initializer)):
+        initializer = initializer(dtype=dtype)
+      init_val = lambda: initializer(  # pylint: disable=g-long-lambda
+          shape, dtype=dtype, partition_info=partition_info)
+      variable_dtype = dtype.base_dtype
+  if use_resource is None:
+    use_resource = True
+
+  v = vs.variable(
+      initial_value=init_val,
+      name=name,
+      trainable=trainable,
+      caching_device=caching_device,
+      dtype=variable_dtype,
+      validate_shape=validate_shape,
+      constraint=constraint,
+      use_resource=use_resource)
+  return v
diff --git a/tensorflow/python/keras/_impl/keras/engine/input_layer.py b/tensorflow/python/keras/_impl/keras/engine/input_layer.py
index b51dd8a2189d0c8542c84dfeac9be0d72b96ff1b..bd9dcbe3c576851123dfcabe3e36379019627ac5 100644
--- a/tensorflow/python/keras/_impl/keras/engine/input_layer.py
+++ b/tensorflow/python/keras/_impl/keras/engine/input_layer.py
@@ -23,7 +23,6 @@ from tensorflow.python.eager import context
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras.engine import base_layer
-from tensorflow.python.layers import base as tf_base_layers
 from tensorflow.python.ops import array_ops
 from tensorflow.python.util.tf_export import tf_export
 
@@ -95,7 +94,7 @@ class InputLayer(base_layer.Layer):
 
       if context.executing_eagerly():
         # In eager mode, create a temporary placeholder to call the layer on.
-        input_tensor = tf_base_layers._DeferredTensor(  # pylint: disable=protected-access
+        input_tensor = base_layer.DeferredTensor(  # pylint: disable=protected-access
             shape=batch_input_shape,
             dtype=dtype,
             name=self.name)
@@ -123,7 +122,7 @@ class InputLayer(base_layer.Layer):
     # Create an input node to add to self.outbound_node
     # and set output_tensors' _keras_history.
     input_tensor._keras_history = (self, 0, 0)  # pylint: disable=protected-access
-    tf_base_layers.Node(
+    base_layer.Node(
         self,
         inbound_layers=[],
         node_indices=[],
diff --git a/tensorflow/python/keras/_impl/keras/engine/network.py b/tensorflow/python/keras/_impl/keras/engine/network.py
index ea4be0d293b7c4f50cec47eb067f7a928375be0b..cc177c14a894040df37f75bbdc6b2651336fe869 100644
--- a/tensorflow/python/keras/_impl/keras/engine/network.py
+++ b/tensorflow/python/keras/_impl/keras/engine/network.py
@@ -35,8 +35,6 @@ from tensorflow.python.keras._impl.keras.engine import saving
 from tensorflow.python.keras._impl.keras.utils import generic_utils
 from tensorflow.python.keras._impl.keras.utils.io_utils import ask_to_proceed_with_overwrite
 from tensorflow.python.keras._impl.keras.utils.layer_utils import print_summary as print_layer_summary
-from tensorflow.python.layers import base as tf_base_layers
-from tensorflow.python.layers import utils as tf_layers_util
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import checkpointable
 from tensorflow.python.util import nest
@@ -82,7 +80,7 @@ class Network(base_layer.Layer):
     # self.losses
     # self.updates
 
-    self._init_set_name(name)
+    self._init_set_name(name, zero_based=True)
     self._activity_regularizer = None
     # This acts just like the `trainable` attribute of any layer instance.
     # It does not affect users of the underlying layers, only users of the
@@ -117,6 +115,7 @@ class Network(base_layer.Layer):
     self._inbound_nodes = []
 
   def _init_graph_network(self, inputs, outputs, name=None):
+    self._uses_inputs_arg = True
     # Normalize and set self.inputs, self.outputs.
     if isinstance(inputs, (list, tuple)):
       self.inputs = list(inputs)  # Tensor or list of tensors.
@@ -131,14 +130,14 @@ class Network(base_layer.Layer):
     if context.executing_eagerly():
       # Check that all inputs/outputs are DeferredTensors.
       for tensor in self.inputs:
-        if not isinstance(tensor, tf_base_layers._DeferredTensor):  # pylint: disable=protected-access
+        if not isinstance(tensor, base_layer.DeferredTensor):  # pylint: disable=protected-access
           raise TypeError('When eager execution is enabled, '
                           'inputs must come from a call to '
                           '`tf.keras.Input` (called after '
                           'tfe.enable_eager_execution()). '
                           'Received invalid input: ' + str(tensor))
       for tensor in self.outputs:
-        if not isinstance(tensor, tf_base_layers._DeferredTensor):  # pylint: disable=protected-access
+        if not isinstance(tensor, base_layer.DeferredTensor):  # pylint: disable=protected-access
           raise TypeError('When eager execution is enabled, '
                           'outputs must come from a call to '
                           'a layer (called after '
@@ -229,7 +228,7 @@ class Network(base_layer.Layer):
     self._layers_by_depth = layers_by_depth
 
     # Create the node linking internal inputs to internal outputs.
-    tf_base_layers.Node(
+    base_layer.Node(
         outbound_layer=self,
         inbound_layers=[],
         node_indices=[],
@@ -242,8 +241,8 @@ class Network(base_layer.Layer):
     for x in self.inputs:
       mask = x._keras_mask if hasattr(x, '_keras_mask') else None  # pylint: disable=protected-access
       masks.append(mask)
-    mask_cache_key = (tf_layers_util.object_list_uid(self.inputs) + '_' +
-                      tf_layers_util.object_list_uid(masks))
+    mask_cache_key = (base_layer.object_list_uid(self.inputs) + '_' +
+                      base_layer.object_list_uid(masks))
     masks = []
     for x in self.outputs:
       mask = x._keras_mask if hasattr(x, '_keras_mask') else None  # pylint: disable=protected-access
@@ -274,17 +273,21 @@ class Network(base_layer.Layer):
   def _init_subclassed_network(self, name=None):
     self._base_init(name=name)
     self._is_graph_network = False
-    if 'training' in tf_inspect.getargspec(self.call).args:
+    call_args = tf_inspect.getargspec(self.call).args
+    if 'training' in call_args:
       self._expects_training_arg = True
     else:
       self._expects_training_arg = False
-
+    if 'inputs' in call_args:
+      self._uses_inputs_arg = True
+    else:
+      self._uses_inputs_arg = False
     self.outputs = None
     self.inputs = None
     self.built = False
 
   def __setattr__(self, name, value):
-    if isinstance(value, (tf_base_layers.Layer, Network)):
+    if isinstance(value, (base_layer.Layer, Network)):
       try:
         is_graph_network = self._is_graph_network
       except AttributeError:
@@ -294,6 +297,10 @@ class Network(base_layer.Layer):
       if not is_graph_network:
         if value not in self._layers:
           self._layers.append(value)
+          if hasattr(value, '_use_resource_variables'):
+            # In subclassed models, legacy layers (tf.layers) must always use
+            # resource variables.
+            value._use_resource_variables = True
     if isinstance(value, checkpointable.CheckpointableBase):
       # Layer (and therefore Network/Model) inherit from CheckpointableBase
       # rather than Checkpointable, which means there is no Checkpointable
@@ -382,8 +389,8 @@ class Network(base_layer.Layer):
       masks = [None for _ in range(len(inputs))]
     else:
       masks = generic_utils.to_list(mask)
-    cache_key = (tf_layers_util.object_list_uid(inputs)
-                 + '_' + tf_layers_util.object_list_uid(masks))
+    cache_key = (base_layer.object_list_uid(inputs)
+                 + '_' + base_layer.object_list_uid(masks))
     if cache_key in self._output_mask_cache:
       return self._output_mask_cache[cache_key]
     else:
@@ -497,8 +504,7 @@ class Network(base_layer.Layer):
         relevant_inputs += inputs
       else:
         relevant_inputs.append(inputs)
-    reachable = tf_layers_util.get_reachable_from_inputs(relevant_inputs,
-                                                         updates)
+    reachable = base_layer.get_reachable_from_inputs(relevant_inputs, updates)
     relevant_conditional_updates = [x for x in updates if x in reachable]
     unconditional_updates = [
         x for x in updates if x._unconditional_update]  # pylint: disable=protected-access
@@ -535,8 +541,7 @@ class Network(base_layer.Layer):
         relevant_inputs += inputs
       else:
         relevant_inputs.append(inputs)
-    reachable = tf_layers_util.get_reachable_from_inputs(relevant_inputs,
-                                                         losses)
+    reachable = base_layer.get_reachable_from_inputs(relevant_inputs, losses)
     relevant_conditional_losses = [x for x in losses if x in reachable]
     unconditional_losses = [
         x for x in losses if x._unconditional_loss]  # pylint: disable=protected-access
@@ -618,8 +623,8 @@ class Network(base_layer.Layer):
     if not context.executing_eagerly():
       # Try to retrieve cached outputs if the layer has already been called
       # on these exact inputs.
-      cache_key = (tf_layers_util.object_list_uid(inputs)
-                   + '_' + tf_layers_util.object_list_uid(masks))
+      cache_key = (base_layer.object_list_uid(inputs)
+                   + '_' + base_layer.object_list_uid(masks))
       if cache_key in self._output_tensor_cache:
         # Cache hit.
         return self._output_tensor_cache[cache_key]
@@ -651,7 +656,7 @@ class Network(base_layer.Layer):
                        ': model has ' + str(len(self._input_layers)) +
                        ' tensor inputs.')
 
-    cache_key = tf_layers_util.object_list_uid(input_shapes)
+    cache_key = base_layer.object_list_uid(input_shapes)
     if cache_key not in self._output_shape_cache:
       # Cache miss. We have to run the network graph manually (recursive calls
       # to `compute_output_shape`).
@@ -840,7 +845,7 @@ class Network(base_layer.Layer):
     for x in self.outputs:
       assert str(id(x)) in tensor_map, 'Could not compute output ' + str(x)
       tensor, mask = tensor_map[str(id(x))]
-      output_shapes.append(tf_layers_util.static_shape(x))
+      output_shapes.append(base_layer.static_shape(x))
       output_tensors.append(tensor)
       output_masks.append(mask)
 
@@ -854,14 +859,14 @@ class Network(base_layer.Layer):
     if not context.executing_eagerly():
       # Update cache;
       # keys are based on ids on input tensors and inputs masks.
-      cache_key = (tf_layers_util.object_list_uid(inputs)
-                   + '_' + tf_layers_util.object_list_uid(masks))
+      cache_key = (base_layer.object_list_uid(inputs)
+                   + '_' + base_layer.object_list_uid(masks))
       self._output_tensor_cache[cache_key] = output_tensors
       self._output_mask_cache[cache_key] = output_masks
 
       if output_shapes is not None:
-        input_shapes = [tf_layers_util.static_shape(x) for x in inputs]
-        cache_key = tf_layers_util.object_list_uid(input_shapes)
+        input_shapes = [base_layer.static_shape(x) for x in inputs]
+        cache_key = base_layer.object_list_uid(input_shapes)
         self._output_shape_cache[cache_key] = output_shapes
 
     return output_tensors, output_masks
diff --git a/tensorflow/python/keras/_impl/keras/engine/saving_test.py b/tensorflow/python/keras/_impl/keras/engine/saving_test.py
index dde090120456f968267e1c572f22eda1bd6ed7c4..3b1578cddfd97b31cae8619cdd2d8e1997585f51 100644
--- a/tensorflow/python/keras/_impl/keras/engine/saving_test.py
+++ b/tensorflow/python/keras/_impl/keras/engine/saving_test.py
@@ -422,7 +422,7 @@ class TestWholeModelSaving(test.TestCase):
         f = keras.layers.Dense(2, name='nested_model_dense_%d' % (i,))(f)
       # This layer name will make the `weights_name`
       # HDF5 attribute blow out of proportion.
-      f = keras.layers.Dense(2, name='nested_model_output' + ('x' * (2**15)))(f)
+      f = keras.layers.Dense(2, name='nested_model_output' + ('x' * (2**14)))(f)
       nested_model = keras.Model(inputs=[x], outputs=[f], name='nested_model')
 
       x = keras.Input(shape=(2,), name='outer_model_input')
diff --git a/tensorflow/python/keras/_impl/keras/engine/sequential.py b/tensorflow/python/keras/_impl/keras/engine/sequential.py
index 66cef1f5b9cef302117fe1fa67a0cfdf694403f1..bd13ca671340551c3e96895951be360b15e55cfe 100644
--- a/tensorflow/python/keras/_impl/keras/engine/sequential.py
+++ b/tensorflow/python/keras/_impl/keras/engine/sequential.py
@@ -29,6 +29,7 @@ from tensorflow.python.keras._impl.keras.engine.input_layer import Input
 from tensorflow.python.keras._impl.keras.engine.input_layer import InputLayer
 from tensorflow.python.keras._impl.keras.engine.training import Model
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.training import checkpointable
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -122,7 +123,7 @@ class Sequential(Model):
             multiple output tensors, or is already connected
             somewhere else (forbidden in `Sequential` models).
     """
-    if not isinstance(layer, (base_layer.Layer, base_layer.TFBaseLayer)):
+    if not isinstance(layer, base_layer.Layer):
       raise TypeError('The added layer must be '
                       'an instance of class Layer. '
                       'Found: ' + str(layer))
@@ -192,6 +193,36 @@ class Sequential(Model):
       self.build()
     else:
       self._layers.append(layer)
+    # In implementing Checkpointable, Sequential does not track its Layers
+    # normally, since they may be added and removed (in pop()). Instead, it
+    # names everything on demand (gathering dependencies in
+    # _checkpoint_dependencies, and looking them up in
+    # _lookup_dependency). _handle_deferred_dependencies just checks whether an
+    # existing checkpoint load targets this Layer, it does not create a
+    # dependency on the Layer.
+    self._handle_deferred_dependencies(
+        name='layer-%d' % (len(self._layers) - 1), checkpointable=layer)
+
+  @property
+  def _checkpoint_dependencies(self):
+    """For implementing Checkpointable. Layers which should be saved."""
+    return super(Sequential, self)._checkpoint_dependencies + [
+        checkpointable.CheckpointableReference(
+            name='layer-%d' % layer_index, ref=layer)
+        for layer_index, layer in enumerate(self._layers)]
+
+  def _lookup_dependency(self, name):
+    """For implementing Checkpointable. Looks up a Layer."""
+    super_lookup = super(Sequential, self)._lookup_dependency(name=name)
+    if super_lookup is not None:
+      return super_lookup
+    if name.startswith('layer-'):
+      try:
+        return self._layers[int(name[6:])]
+      except IndexError:
+        return None
+    else:
+      return None
 
   def pop(self):
     """Removes the last layer in the model.
diff --git a/tensorflow/python/keras/_impl/keras/engine/sequential_test.py b/tensorflow/python/keras/_impl/keras/engine/sequential_test.py
index c9a47581df03e0fc1ad38552ba8634862435cd80..8aba16aef3e187e9e33bdb65c7d44b0e622730ef 100644
--- a/tensorflow/python/keras/_impl/keras/engine/sequential_test.py
+++ b/tensorflow/python/keras/_impl/keras/engine/sequential_test.py
@@ -151,6 +151,7 @@ class TestSequential(test.TestCase):
     with self.test_session():
       model = keras.models.Sequential()
       model.add(keras.layers.BatchNormalization(input_shape=(4,)))
+      assert model.updates
 
       model.trainable = False
       assert not model.updates
diff --git a/tensorflow/python/keras/_impl/keras/engine/topology_test.py b/tensorflow/python/keras/_impl/keras/engine/topology_test.py
index b50277c8fff917d77694903c989fd02ea98b1711..49cc1cd3b38325b4f42d5b26bac9442d7cc09b05 100644
--- a/tensorflow/python/keras/_impl/keras/engine/topology_test.py
+++ b/tensorflow/python/keras/_impl/keras/engine/topology_test.py
@@ -25,7 +25,7 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import test_util
 from tensorflow.python.keras._impl import keras
-from tensorflow.python.layers import base as tf_base_layers
+from tensorflow.python.keras._impl.keras.engine import base_layer
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import state_ops
@@ -52,11 +52,13 @@ class TopologyConstructionTest(test.TestCase):
                                    (1, 1),
                                    'float32',
                                    trainable=False)
-        self.add_update(state_ops.assign_add(self.a, [[1.]]))
+        self.add_update(state_ops.assign_add(self.a, [[1.]],
+                                             name='unconditional_update'))
         self.built = True
 
       def call(self, inputs):
-        self.add_update(state_ops.assign_add(self.a, inputs),
+        self.add_update(state_ops.assign_add(self.b, inputs,
+                                             name='conditional_update'),
                         inputs=True)
         return inputs + 1
 
@@ -97,10 +99,20 @@ class TopologyConstructionTest(test.TestCase):
     self.assertEqual(len(network.updates), 4)
     self.assertEqual(len(network.get_updates_for(None)), 2)
 
-    network.add_update(state_ops.assign_add(layer.a, x4), inputs=True)
+    network.add_update(state_ops.assign_add(layer.b, x4), inputs=True)
     self.assertEqual(len(network.updates), 5)
     self.assertEqual(len(network.get_updates_for(x4)), 2)
 
+  def test_get_updates_bn(self):
+    x1 = keras.Input(shape=(1,))
+    layer = keras.layers.BatchNormalization()
+    _ = layer.apply(x1)
+
+    print('BN updates', layer._updates)
+    self.assertEqual(len(layer.updates), 2)
+    self.assertEqual(len(layer.get_updates_for(x1)), 2)
+    self.assertEqual(len(layer.get_updates_for(None)), 0)
+
   def test_get_losses(self):
 
     class MyLayer(keras.layers.Layer):
@@ -783,7 +795,7 @@ class TopologyConstructionTest(test.TestCase):
   def test_activity_regularization_with_model_composition(self):
 
     def reg(x):
-      return keras.backend.sum(x)
+      return math_ops.reduce_sum(x)
 
     net_a_input = keras.Input((2,))
     net_a = net_a_input
@@ -875,25 +887,25 @@ class TopologyConstructionTest(test.TestCase):
 class DeferredModeTest(test.TestCase):
 
   def testDeferredTensorAttributes(self):
-    x = tf_base_layers._DeferredTensor(shape=(None, 2),
-                                       dtype='float32',
-                                       name='x')
+    x = base_layer.DeferredTensor(shape=(None, 2),
+                                  dtype='float32',
+                                  name='x')
     self.assertEqual(str(x),
                      'DeferredTensor(\'x\', shape=(?, 2), dtype=float32)')
     self.assertEqual(repr(x),
-                     '<_DeferredTensor \'x\' shape=(?, 2) dtype=float32>')
+                     '<DeferredTensor \'x\' shape=(?, 2) dtype=float32>')
 
   @test_util.run_in_graph_and_eager_modes()
   def testSimpleNetworkBuilding(self):
     inputs = keras.engine.Input(shape=(32,))
     if context.executing_eagerly():
-      self.assertIsInstance(inputs, tf_base_layers._DeferredTensor)
+      self.assertIsInstance(inputs, base_layer.DeferredTensor)
       self.assertEqual(inputs.dtype.name, 'float32')
       self.assertEqual(inputs.shape.as_list(), [None, 32])
 
     x = keras.layers.Dense(2)(inputs)
     if context.executing_eagerly():
-      self.assertIsInstance(x, tf_base_layers._DeferredTensor)
+      self.assertIsInstance(x, base_layer.DeferredTensor)
       self.assertEqual(x.dtype.name, 'float32')
       self.assertEqual(x.shape.as_list(), [None, 2])
 
@@ -936,5 +948,34 @@ class DeferredModeTest(test.TestCase):
       self.assertEqual(outputs[0].shape.as_list(), [10, 16])
       self.assertEqual(outputs[1].shape.as_list(), [10, 2])
 
+
+class GraphUtilsTest(test.TestCase):
+
+  def testGetReachableFromInputs(self):
+
+    with self.test_session():
+      pl_1 = array_ops.placeholder(shape=None, dtype='float32')
+      pl_2 = array_ops.placeholder(shape=None, dtype='float32')
+      pl_3 = array_ops.placeholder(shape=None, dtype='float32')
+      x_1 = pl_1 + pl_2
+      x_2 = pl_2 * 2
+      x_3 = pl_3 + 1
+      x_4 = x_1 + x_2
+      x_5 = x_3 * pl_1
+
+      self.assertEqual(
+          keras.engine.base_layer.get_reachable_from_inputs([pl_1]),
+          {pl_1, x_1, x_4, x_5, x_1.op, x_4.op, x_5.op})
+      self.assertEqual(
+          keras.engine.base_layer.get_reachable_from_inputs([pl_1, pl_2]),
+          {pl_1, pl_2, x_1, x_2, x_4, x_5, x_1.op, x_2.op, x_4.op, x_5.op})
+      self.assertEqual(
+          keras.engine.base_layer.get_reachable_from_inputs([pl_3]),
+          {pl_3, x_3, x_5, x_3.op, x_5.op})
+      self.assertEqual(
+          keras.engine.base_layer.get_reachable_from_inputs([x_3]),
+          {x_3, x_5, x_5.op})
+
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/_impl/keras/engine/training.py b/tensorflow/python/keras/_impl/keras/engine/training.py
index 08288d353efdb233f87c1e3c7c09cd405c1e1688..7c4674381458d758939cc34344d7e11bf5941c3c 100644
--- a/tensorflow/python/keras/_impl/keras/engine/training.py
+++ b/tensorflow/python/keras/_impl/keras/engine/training.py
@@ -31,10 +31,10 @@ from tensorflow.python.keras._impl.keras.engine import training_arrays
 from tensorflow.python.keras._impl.keras.engine import training_eager
 from tensorflow.python.keras._impl.keras.engine import training_generator
 from tensorflow.python.keras._impl.keras.engine import training_utils
+from tensorflow.python.keras._impl.keras.engine.base_layer import DeferredTensor
 from tensorflow.python.keras._impl.keras.engine.base_layer import Layer
 from tensorflow.python.keras._impl.keras.engine.network import Network
 from tensorflow.python.keras._impl.keras.utils.generic_utils import slice_arrays
-from tensorflow.python.layers.base import _DeferredTensor
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import optimizer as tf_optimizer_module
@@ -874,6 +874,11 @@ class Model(Network):
         whether to build the model's graph in inference mode (False), training
         mode (True), or using the Keras learning phase (None).
     """
+    if not getattr(self, '_uses_inputs_arg', True):
+      raise NotImplementedError(
+          'Subclassed Models without "inputs" in their call() signatures do '
+          'not yet support shape inference. File a feature request if this '
+          'limitation bothers you.')
     if self.__class__.__name__ == 'Sequential':
       # Note: we can't test whether the model is `Sequential` via `isinstance`
       # since `Sequential` depends on `Model`.
@@ -886,15 +891,6 @@ class Model(Network):
     else:
       self._symbolic_set_inputs(inputs, training=training)
 
-  def _set_scope(self, scope=None):
-    """Modify the Layer scope creation logic to create ResourceVariables."""
-    super(Model, self)._set_scope(scope=scope)
-    # Subclassed Models create ResourceVariables by default. This makes it
-    # easier to use Models in an eager/graph agnostic way (since eager execution
-    # always uses ResourceVariables).
-    if not self._is_graph_network:
-      self._scope.set_use_resource(True)
-
   def _eager_set_inputs(self, inputs):
     """Set model's input and output specs based on the input data received.
 
@@ -928,11 +924,11 @@ class Model(Network):
     else:
       dummy_output_values = [dummy_output_values]
     self.outputs = [
-        _DeferredTensor(shape=(None for _ in v.shape),
-                        dtype=v.dtype) for v in dummy_output_values]
+        DeferredTensor(shape=(None for _ in v.shape),
+                       dtype=v.dtype) for v in dummy_output_values]
     self.inputs = [
-        _DeferredTensor(shape=(None for _ in v.shape),
-                        dtype=v.dtype) for v in dummy_input_values]
+        DeferredTensor(shape=(None for _ in v.shape),
+                       dtype=v.dtype) for v in dummy_input_values]
     self.input_names = [
         'input_%d' % (i + 1) for i in range(len(dummy_input_values))]
     self.output_names = [
@@ -1176,6 +1172,9 @@ class Model(Network):
           batch_size=batch_size)
 
     elif validation_split and 0. < validation_split < 1.:
+      if training_utils.has_symbolic_tensors(x):
+        raise ValueError('If your data is in the form of symbolic tensors, '
+                         'you cannot use `validation_split`.')
       if hasattr(x[0], 'shape'):
         split_at = int(x[0].shape[0] * (1. - validation_split))
       else:
diff --git a/tensorflow/python/keras/_impl/keras/engine/training_eager.py b/tensorflow/python/keras/_impl/keras/engine/training_eager.py
index 67858a578c5c95b3099e1e6713f3287748fc861f..4cdb5f108a05bb88ed328ca20351914160906e86 100644
--- a/tensorflow/python/keras/_impl/keras/engine/training_eager.py
+++ b/tensorflow/python/keras/_impl/keras/engine/training_eager.py
@@ -31,9 +31,8 @@ from tensorflow.python.keras._impl.keras import callbacks as cbks
 from tensorflow.python.keras._impl.keras import losses
 from tensorflow.python.keras._impl.keras import metrics as metrics_module
 from tensorflow.python.keras._impl.keras.engine import training_utils
-from tensorflow.python.keras._impl.keras.utils.generic_utils import make_batches
-from tensorflow.python.keras._impl.keras.utils.generic_utils import Progbar
-from tensorflow.python.keras._impl.keras.utils.generic_utils import slice_arrays
+from tensorflow.python.keras._impl.keras.utils import generic_utils
+from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import tf_logging as logging
 
 
@@ -173,6 +172,41 @@ def _model_loss(model, inputs, targets, sample_weights=None, training=False):
   return outs, total_loss, loss_metrics
 
 
+def slice_arrays(arrays, indices, contiguous=True):
+  """Slices batches out of provided arrays (workaround for eager tensors).
+
+  Unfortunately eager tensors don't have the same slicing behavior as
+  Numpy arrays (they folow  the same slicing behavior as symbolic TF tensors),
+  hence we cannot use `generic_utils.slice_arrays` directly
+  and we have to implement this workaround based on `concat`. This has a
+  performance cost.
+
+  Arguments:
+    arrays: Single array or list of arrays.
+    indices: List of indices in the array that should be included in the output
+      batch.
+    contiguous: Boolean flag indicating whether the indices are contiguous.
+
+  Returns:
+    Slice of data (either single array or list of arrays).
+  """
+  if any(tensor_util.is_tensor(x) for x in arrays):
+    converted_to_list = False
+    if not isinstance(arrays, list):
+      converted_to_list = True
+      arrays = [arrays]
+    if not contiguous:
+      entries = [[x[i:i + 1] for i in indices] for x in arrays]
+      slices = [array_ops.concat(x, axis=0) for x in entries]
+    else:
+      slices = [x[indices[0]:indices[-1] + 1] for x in arrays]
+    if converted_to_list:
+      slices = slices[0]
+    return slices
+  else:
+    return generic_utils.slice_arrays(arrays, indices)
+
+
 def _process_single_batch(model,
                           inputs,
                           targets,
@@ -270,9 +304,8 @@ def test_on_batch(model, inputs, targets, sample_weights=None):
       model, inputs, targets, sample_weights=sample_weights, training=False)
   if not isinstance(outs, list):
     outs = [outs]
-  metric_names, metrics_results = _eager_metrics_fn(
+  _, metrics_results = _eager_metrics_fn(
       model, outs, targets)
-  model.metrics_names.append(metric_names)
   if not isinstance(loss, list):
     loss = [loss]
   return loss + loss_metrics + metrics_results
@@ -328,6 +361,12 @@ def fit_loop(
   Raises:
     ValueError: In case of invalid argument values.
   """
+  if not batch_size:
+    raise ValueError('With eager execution, `batch_size` should be specified.')
+  if steps_per_epoch or validation_steps:
+    raise ValueError('With eager execution, `steps_per_epoch` and '
+                     '`validation_steps` are not valid arguments '
+                     '(set `batch_size` instead).')
   # Required for Eager mode
   with backend.learning_phase_scope(1):
     do_validation = False
@@ -410,15 +449,18 @@ def fit_loop(
       elif shuffle:
         np.random.shuffle(index_array)
 
-      batches = make_batches(num_train_samples, batch_size)
+      batches = generic_utils.make_batches(num_train_samples, batch_size)
 
       for batch_index, (batch_start, batch_end) in enumerate(batches):
         batch_ids = index_array[batch_start:batch_end]
         try:
-          inputs_batch = slice_arrays(inputs, batch_ids)
-          targets_batch = slice_arrays(targets, batch_ids)
+          inputs_batch = slice_arrays(inputs, batch_ids,
+                                      contiguous=not shuffle)
+          targets_batch = slice_arrays(targets, batch_ids,
+                                       contiguous=not shuffle)
           if sample_weights:
-            sample_weights_batch = slice_arrays(sample_weights, batch_ids)
+            sample_weights_batch = slice_arrays(sample_weights, batch_ids,
+                                                contiguous=not shuffle)
           else:
             sample_weights_batch = None
         except TypeError:
@@ -539,8 +581,8 @@ def test_loop(model, inputs, targets,
         feed_data, batch_size=batch_size, steps=steps, steps_name='steps')
     outs = []
     if verbose == 1:
-      progbar = Progbar(target=num_samples)
-    batches = make_batches(num_samples, batch_size)
+      progbar = generic_utils.Progbar(target=num_samples)
+    batches = generic_utils.make_batches(num_samples, batch_size)
     index_array = np.arange(num_samples)
     for batch_index, (batch_start, batch_end) in enumerate(batches):
       batch_ids = index_array[batch_start:batch_end]
@@ -620,12 +662,12 @@ def predict_loop(model, inputs,
         inputs, batch_size, steps, 'steps')
     if verbose == 1:
       if steps is not None:
-        progbar = Progbar(target=steps)
+        progbar = generic_utils.Progbar(target=steps)
       else:
-        progbar = Progbar(target=num_samples)
+        progbar = generic_utils.Progbar(target=num_samples)
 
     outs = []
-    batches = make_batches(num_samples, batch_size)
+    batches = generic_utils.make_batches(num_samples, batch_size)
     index_array = np.arange(num_samples)
     for batch_index, (batch_start, batch_end) in enumerate(batches):
       batch_ids = index_array[batch_start:batch_end]
diff --git a/tensorflow/python/keras/_impl/keras/engine/training_eager_test.py b/tensorflow/python/keras/_impl/keras/engine/training_eager_test.py
index 8848b393d5e602e564cb357c32a937eaabd68203..6cdb6b0753fce1bebec0060524e76d32929d0228 100644
--- a/tensorflow/python/keras/_impl/keras/engine/training_eager_test.py
+++ b/tensorflow/python/keras/_impl/keras/engine/training_eager_test.py
@@ -18,7 +18,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import os
 import numpy as np
 
 from tensorflow.python.framework import ops
@@ -308,6 +307,100 @@ class TrainingTest(test.TestCase):
       model.compile(loss=None,
                     optimizer='rms')
 
+  def test_model_methods_with_eager_tensors_multi_io(self):
+    a = keras.layers.Input(shape=(3,), name='input_a')
+    b = keras.layers.Input(shape=(3,), name='input_b')
+
+    dense = keras.layers.Dense(4, name='dense')
+    c = dense(a)
+    d = dense(b)
+    e = keras.layers.Dropout(0.5, name='dropout')(c)
+
+    model = keras.models.Model([a, b], [d, e])
+
+    optimizer = RMSPropOptimizer(learning_rate=0.001)
+    loss = 'mse'
+    loss_weights = [1., 0.5]
+    metrics = ['mae']
+    model.compile(
+        optimizer,
+        loss,
+        metrics=metrics,
+        loss_weights=loss_weights,
+        sample_weight_mode=None)
+
+    input_a = keras.backend.zeros(shape=(10, 3))
+    input_b = keras.backend.zeros(shape=(10, 3))
+    target_d = keras.backend.zeros(shape=(10, 4))
+    target_e = keras.backend.zeros(shape=(10, 4))
+
+    model.fit(
+        [input_a, input_b], [target_d, target_e],
+        epochs=1,
+        batch_size=5,
+        verbose=0)
+    # Test: no shuffle.
+    model.fit(
+        [input_a, input_b], [target_d, target_e],
+        epochs=1,
+        batch_size=5,
+        verbose=0,
+        shuffle=False)
+    # Test: validation data.
+    model.fit([input_a, input_b], [target_d, target_e],
+              epochs=1, batch_size=2, verbose=0,
+              validation_data=([input_a, input_b], [target_d, target_e]))
+    model.train_on_batch([input_a, input_b], [target_d, target_e])
+    model.predict([input_a, input_b], batch_size=5)
+    model.evaluate([input_a, input_b], [target_d, target_e],
+                   batch_size=2, verbose=0)
+    model.test_on_batch([input_a, input_b], [target_d, target_e])
+
+    # Test: mix np and tensors.
+    input_b = np.zeros(shape=(10, 3)).astype('float32')
+    target_e = np.zeros(shape=(10, 4)).astype('float32')
+    model.fit(
+        [input_a, input_b], [target_d, target_e],
+        epochs=1,
+        batch_size=5,
+        verbose=0)
+    model.fit([input_a, input_b], [target_d, target_e],
+              epochs=1, batch_size=2, verbose=0,
+              validation_data=([input_a, input_b], [target_d, target_e]))
+    model.fit(
+        [input_a, input_b], [target_d, target_e],
+        epochs=1,
+        batch_size=5,
+        verbose=0,
+        shuffle=False)
+    model.train_on_batch([input_a, input_b], [target_d, target_e])
+    model.predict([input_a, input_b], batch_size=5)
+    model.evaluate([input_a, input_b], [target_d, target_e],
+                   batch_size=2, verbose=0)
+    model.test_on_batch([input_a, input_b], [target_d, target_e])
+
+  def test_model_methods_with_eager_tensors_single_io(self):
+    x = keras.layers.Input(shape=(3,), name='input')
+    y = keras.layers.Dense(4, name='dense')(x)
+    model = keras.Model(x, y)
+
+    optimizer = RMSPropOptimizer(learning_rate=0.001)
+    loss = 'mse'
+    metrics = ['mae']
+    model.compile(optimizer, loss, metrics=metrics)
+
+    inputs = keras.backend.zeros(shape=(10, 3))
+    targets = keras.backend.zeros(shape=(10, 4))
+
+    model.fit(inputs, targets, epochs=1, batch_size=2, verbose=0)
+    model.fit(inputs, targets, epochs=1, batch_size=3, verbose=0, shuffle=False)
+    model.fit(inputs, targets, epochs=1, batch_size=4, verbose=0,
+              validation_data=(inputs, targets))
+    model.evaluate(inputs, targets, batch_size=2, verbose=0)
+    model.predict(inputs, batch_size=2)
+    model.train_on_batch(inputs, targets)
+    model.test_on_batch(inputs, targets)
+
 
 class LossWeightingTest(test.TestCase):
 
@@ -533,14 +626,5 @@ class LossWeightingTest(test.TestCase):
 
 
 if __name__ == '__main__':
-  # Bazel sets these environment variables to very long paths.
-  # Tempfile uses them to create long paths, and in turn multiprocessing
-  # library tries to create sockets named after paths. Delete whatever bazel
-  # writes to these to avoid tests failing due to socket addresses being too
-  # long.
-  for var in ('TMPDIR', 'TMP', 'TEMP'):
-    if var in os.environ:
-      del os.environ[var]
-
   ops.enable_eager_execution()
   test.main()
diff --git a/tensorflow/python/keras/_impl/keras/engine/training_test.py b/tensorflow/python/keras/_impl/keras/engine/training_test.py
index fd91dbba52ff7d152335514085ef3b057ae5eec4..08fd26dd18d5bc1b171d780be133f02f51b9c248 100644
--- a/tensorflow/python/keras/_impl/keras/engine/training_test.py
+++ b/tensorflow/python/keras/_impl/keras/engine/training_test.py
@@ -1117,6 +1117,121 @@ class TestTrainingUtils(test.TestCase):
 
 class TestTrainingWithDataTensors(test.TestCase):
 
+  def test_training_and_eval_methods_on_symbolic_tensors_single_io(self):
+    with self.test_session():
+      x = keras.layers.Input(shape=(3,), name='input')
+      y = keras.layers.Dense(4, name='dense')(x)
+      model = keras.Model(x, y)
+
+      optimizer = 'rmsprop'
+      loss = 'mse'
+      metrics = ['mae']
+      model.compile(optimizer, loss, metrics=metrics)
+
+      inputs = keras.backend.zeros(shape=(10, 3))
+      targets = keras.backend.zeros(shape=(10, 4))
+
+      model.fit(inputs, targets, epochs=1, steps_per_epoch=2, verbose=0)
+      model.evaluate(inputs, targets, steps=2, verbose=0)
+      model.predict(inputs, steps=2)
+      model.train_on_batch(inputs, targets)
+      model.test_on_batch(inputs, targets)
+      model.fit(inputs, targets,
+                epochs=1, steps_per_epoch=2, verbose=0,
+                validation_data=(inputs, targets), validation_steps=2)
+
+  def test_training_and_eval_methods_on_symbolic_tensors_multi_io(self):
+    with self.test_session():
+      a = keras.layers.Input(shape=(3,), name='input_a')
+      b = keras.layers.Input(shape=(3,), name='input_b')
+
+      dense = keras.layers.Dense(4, name='dense')
+      c = dense(a)
+      d = dense(b)
+      e = keras.layers.Dropout(0.5, name='dropout')(c)
+
+      model = keras.models.Model([a, b], [d, e])
+
+      optimizer = 'rmsprop'
+      loss = 'mse'
+      loss_weights = [1., 0.5]
+      metrics = ['mae']
+      model.compile(optimizer, loss, metrics=metrics, loss_weights=loss_weights)
+
+      input_a_tf = keras.backend.zeros(shape=(10, 3))
+      input_b_tf = keras.backend.zeros(shape=(10, 3))
+
+      output_d_tf = keras.backend.zeros(shape=(10, 4))
+      output_e_tf = keras.backend.zeros(shape=(10, 4))
+
+      model.fit(
+          [input_a_tf, input_b_tf], [output_d_tf, output_e_tf],
+          epochs=1,
+          steps_per_epoch=2,
+          verbose=0)
+      with self.assertRaisesRegexp(ValueError,
+                                   'should specify the `steps_per_epoch`'):
+        model.fit(
+            [input_a_tf, input_b_tf], [output_d_tf, output_e_tf],
+            epochs=1,
+            batch_size=5,
+            verbose=0)
+      model.train_on_batch([input_a_tf, input_b_tf], [output_d_tf, output_e_tf])
+
+      # Test with dictionary inputs
+      model.fit(
+          {'input_a': input_a_tf,
+           'input_b': input_b_tf},
+          {'dense': output_d_tf,
+           'dropout': output_e_tf},
+          epochs=1,
+          steps_per_epoch=2,
+          verbose=0)
+      model.fit(
+          {'input_a': input_a_tf,
+           'input_b': input_b_tf},
+          {'dense': output_d_tf,
+           'dropout': output_e_tf},
+          validation_data=({'input_a': input_a_tf,
+                            'input_b': input_b_tf},
+                           {'dense': output_d_tf,
+                            'dropout': output_e_tf}),
+          epochs=1,
+          steps_per_epoch=2,
+          validation_steps=2,
+          verbose=0)
+      model.train_on_batch(
+          {'input_a': input_a_tf,
+           'input_b': input_b_tf},
+          {'dense': output_d_tf,
+           'dropout': output_e_tf})
+
+      # Test with validation data
+      model.fit(
+          [input_a_tf, input_b_tf], [output_d_tf, output_e_tf],
+          validation_data=([input_a_tf, input_b_tf],
+                           [output_d_tf, output_e_tf]),
+          epochs=1,
+          steps_per_epoch=2,
+          validation_steps=2,
+          verbose=0)
+      # Test with validation split
+      with self.assertRaisesRegexp(ValueError,
+                                   'you cannot use `validation_split`'):
+        model.fit(
+            [input_a_tf, input_b_tf], [output_d_tf, output_e_tf],
+            epochs=2,
+            steps_per_epoch=2,
+            verbose=0,
+            validation_split=0.2,
+            validation_steps=2)
+
+      # Test evaluation / prediction methods
+      model.evaluate([input_a_tf, input_b_tf], [output_d_tf, output_e_tf],
+                     steps=2, verbose=0)
+      model.predict([input_a_tf, input_b_tf], steps=2)
+      model.test_on_batch([input_a_tf, input_b_tf], [output_d_tf, output_e_tf])
+
   def test_model_with_input_feed_tensor(self):
     """We test building a model with a TF variable as input.
 
diff --git a/tensorflow/python/keras/_impl/keras/engine/training_utils.py b/tensorflow/python/keras/_impl/keras/engine/training_utils.py
index 105638ce1087e8668b49b6653a847667e8f9157d..a3fc8ef2a0359c527a2757c1888d61822e35d7a9 100644
--- a/tensorflow/python/keras/_impl/keras/engine/training_utils.py
+++ b/tensorflow/python/keras/_impl/keras/engine/training_utils.py
@@ -22,9 +22,11 @@ import copy
 
 import numpy as np
 
+from tensorflow.python.eager import context
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras import losses
+from tensorflow.python.ops import math_ops
 
 
 def check_num_samples(ins,
@@ -64,15 +66,29 @@ def check_num_samples(ins,
     if batch_size is not None:
       raise ValueError(
           'If ' + steps_name + ' is set, the `batch_size` must be None.')
-  elif ins and hasattr(ins[0], 'shape'):
-    num_samples = ins[0].shape[0]
-  else:
+  if has_symbolic_tensors(ins) and steps is None:
+    raise ValueError('If your data is in the form of symbolic tensors, '
+                     'you should specify the `' + steps_name + '` argument '
+                     '(instead of the `batch_size` argument).')
+  if ins and hasattr(ins[0], 'shape'):
+    num_samples = int(ins[0].shape[0])
+  elif steps is None:
     raise ValueError(
         'Either the input data should have '
         'a defined shape, or ' + steps_name + ' should be specified.')
   return num_samples
 
 
+def standardize_single_array(x):
+  if x is None:
+    return None
+  elif tensor_util.is_tensor(x):
+    return x
+  elif x.ndim == 1:
+    x = np.expand_dims(x, 1)
+  return x
+
+
 def standardize_input_data(data,
                            names,
                            shapes=None,
@@ -130,9 +146,7 @@ def standardize_input_data(data,
   else:
     data = data.values if data.__class__.__name__ == 'DataFrame' else data
     data = [data]
-  data = [
-      np.expand_dims(x, 1) if x is not None and x.ndim == 1 else x for x in data
-  ]
+  data = [standardize_single_array(x) for x in data]
 
   if len(data) != len(names):
     if data and hasattr(data[0], 'shape'):
@@ -158,7 +172,7 @@ def standardize_input_data(data,
   # Check shapes compatibility.
   if shapes:
     for i in range(len(names)):
-      if shapes[i] is not None:
+      if shapes[i] is not None and not tensor_util.is_tensor(data[i]):
         data_shape = data[i].shape
         shape = shapes[i]
         if data[i].ndim != len(shape):
@@ -245,12 +259,13 @@ def check_array_lengths(inputs, targets, weights=None):
   """
 
   def set_of_lengths(x):
-    # return a set with the variation between
+    # Returns a set with the variation between
     # different shapes, with None => 0
     if x is None:
       return {}
     else:
-      return set([y.shape[0] for y in x if y is not None])
+      return set([y.shape[0] for y in x
+                  if y is not None and not tensor_util.is_tensor(y)])
 
   set_x = set_of_lengths(inputs)
   set_y = set_of_lengths(targets)
@@ -422,7 +437,7 @@ def weighted_masked_objective(fn):
     score_array = fn(y_true, y_pred)
     if mask is not None:
       # Cast the mask to floatX to avoid float64 upcasting in theano
-      mask = K.cast(mask, K.floatx())
+      mask = math_ops.cast(mask, K.floatx())
       # mask should have the same shape as score_array
       score_array *= mask
       #  the loss per batch should be proportional
@@ -436,7 +451,8 @@ def weighted_masked_objective(fn):
       weight_ndim = K.ndim(weights)
       score_array = K.mean(score_array, axis=list(range(weight_ndim, ndim)))
       score_array *= weights
-      score_array /= K.mean(K.cast(K.not_equal(weights, 0), K.floatx()))
+      score_array /= K.mean(
+          math_ops.cast(math_ops.not_equal(weights, 0), K.floatx()))
     return K.mean(score_array)
 
   return weighted
@@ -532,3 +548,8 @@ def standardize_weights(y,
     return weights
   else:
     return None
+
+
+def has_symbolic_tensors(ls):
+  return (any(tensor_util.is_tensor(v) for v in ls)
+          and not context.executing_eagerly())
diff --git a/tensorflow/python/keras/_impl/keras/estimator.py b/tensorflow/python/keras/_impl/keras/estimator.py
index 8426d84df964092435b10c9e28e1843df7e423f4..8043242b709e9df961b0241437070a8e1dc0c8ec 100644
--- a/tensorflow/python/keras/_impl/keras/estimator.py
+++ b/tensorflow/python/keras/_impl/keras/estimator.py
@@ -26,6 +26,7 @@ from tensorflow.python.estimator import estimator as estimator_lib
 from tensorflow.python.estimator import export as export_lib
 from tensorflow.python.estimator import model_fn as model_fn_lib
 from tensorflow.python.estimator import run_config as run_config_lib
+from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
 from tensorflow.python.framework import sparse_tensor as sparse_tensor_lib
@@ -465,11 +466,21 @@ def model_to_estimator(keras_model=None,
   estimator = estimator_lib.Estimator(
       keras_model_fn, model_dir=model_dir, config=config)
 
+  old_session = K._SESSION
   # Pass the config into keras backend's default session.
-  with session.Session(config=estimator._session_config) as sess:
-    K.set_session(sess)
+  sess = session.Session(config=estimator._session_config)
+  K.set_session(sess)
+  try:
+    keras_weights = keras_model.get_weights()
+  except errors.FailedPreconditionError as e:
+    if old_session is None:
+      raise e
+    logging.warning(
+        'The Keras backend session has already been '
+        'set. The _session_config passed to model_to_estimator is not used.')
+    K.set_session(old_session)
+    keras_weights = keras_model.get_weights()
 
-  keras_weights = keras_model.get_weights()
   if keras_model._is_graph_network:
     # TODO(yifeif): move checkpoint initialization to scaffold.init_fn
     _save_first_checkpoint(keras_model,
diff --git a/tensorflow/python/keras/_impl/keras/estimator_test.py b/tensorflow/python/keras/_impl/keras/estimator_test.py
index e076dc25b16900636313f0ddd85a61b8d917fc91..27b7ec7dd40cb6730b1a47542f0facc2500370b0 100644
--- a/tensorflow/python/keras/_impl/keras/estimator_test.py
+++ b/tensorflow/python/keras/_impl/keras/estimator_test.py
@@ -512,6 +512,26 @@ class TestKerasEstimator(test_util.TensorFlowTestCase):
                      ._config.gpu_options.per_process_gpu_memory_fraction,
                      gpu_options.per_process_gpu_memory_fraction)
 
+  def test_pretrained_weights(self):
+    keras_model, (_, _), (_, _), _, _ = get_resource_for_simple_model()
+    keras_model.compile(
+        loss='categorical_crossentropy',
+        optimizer=rmsprop.RMSPropOptimizer(1e-3),
+        metrics=['mse', keras.metrics.categorical_accuracy])
+
+    keras_model.train_on_batch(
+        np.random.random((10,) + _INPUT_SIZE), np.random.random((10,
+                                                                 _NUM_CLASS)))
+    weights = keras_model.get_weights()
+    keras_model, (_, _), (_, _), _, _ = get_resource_for_simple_model()
+    keras_model.set_weights(weights)
+    keras_model.compile(
+        loss='categorical_crossentropy',
+        optimizer=rmsprop.RMSPropOptimizer(1e-3),
+        metrics=['mse', keras.metrics.categorical_accuracy])
+    keras.estimator.model_to_estimator(
+        keras_model=keras_model, config=self._config)
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/_impl/keras/initializers.py b/tensorflow/python/keras/_impl/keras/initializers.py
index 300bed5e1437074d010760c427c14f68e58ac363..ecb71d00e2c78ced6095aaa3a0180b454b04917a 100644
--- a/tensorflow/python/keras/_impl/keras/initializers.py
+++ b/tensorflow/python/keras/_impl/keras/initializers.py
@@ -201,6 +201,8 @@ def deserialize(config, custom_objects=None):
 
 @tf_export('keras.initializers.get')
 def get(identifier):
+  if identifier is None:
+    return None
   if isinstance(identifier, dict):
     return deserialize(identifier)
   elif isinstance(identifier, six.string_types):
diff --git a/tensorflow/python/keras/_impl/keras/integration_test.py b/tensorflow/python/keras/_impl/keras/integration_test.py
index 280f7ed1b11e2026ac196eb319f7d5da8301f060..c44808421f4da08b59904b4940ee8a485de18d4a 100644
--- a/tensorflow/python/keras/_impl/keras/integration_test.py
+++ b/tensorflow/python/keras/_impl/keras/integration_test.py
@@ -29,16 +29,15 @@ from tensorflow.python.platform import test
 
 class KerasIntegrationTest(test.TestCase):
 
-  def test_vector_classification_declarative(self):
+  def test_vector_classification_sequential(self):
     with self.test_session():
       np.random.seed(1337)
-      (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
-          train_samples=200,
-          test_samples=100,
+      (x_train, y_train), _ = testing_utils.get_test_data(
+          train_samples=100,
+          test_samples=0,
           input_shape=(10,),
           num_classes=2)
       y_train = keras.utils.to_categorical(y_train)
-      y_test = keras.utils.to_categorical(y_test)
 
       model = keras.models.Sequential([
           keras.layers.Dense(16,
@@ -48,23 +47,22 @@ class KerasIntegrationTest(test.TestCase):
           keras.layers.Dense(y_train.shape[-1], activation='softmax')
       ])
       model.compile(loss='categorical_crossentropy',
-                    optimizer='rmsprop',
+                    optimizer=keras.optimizers.Adam(lr=0.1),
                     metrics=['accuracy'])
       history = model.fit(x_train, y_train, epochs=10, batch_size=16,
-                          validation_data=(x_test, y_test),
+                          validation_data=(x_train, y_train),
                           verbose=2)
-      self.assertGreater(history.history['val_acc'][-1], 0.85)
+      self.assertGreater(history.history['val_acc'][-1], 0.7)
 
   def test_vector_classification_functional(self):
     with self.test_session():
       np.random.seed(1337)
-      (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
-          train_samples=200,
-          test_samples=100,
-          input_shape=(10,),
+      (x_train, y_train), _ = testing_utils.get_test_data(
+          train_samples=100,
+          test_samples=0,
+          input_shape=(20,),
           num_classes=2)
       y_train = keras.utils.to_categorical(y_train)
-      y_test = keras.utils.to_categorical(y_test)
 
       inputs = keras.layers.Input(shape=x_train.shape[1:])
       x = keras.layers.Dense(16, activation='relu')(inputs)
@@ -73,77 +71,78 @@ class KerasIntegrationTest(test.TestCase):
 
       model = keras.models.Model(inputs, outputs)
       model.compile(loss='categorical_crossentropy',
-                    optimizer='rmsprop',
+                    optimizer=keras.optimizers.Adam(lr=0.1),
                     metrics=['accuracy'])
       history = model.fit(x_train, y_train, epochs=10, batch_size=16,
-                          validation_data=(x_test, y_test),
+                          validation_data=(x_train, y_train),
                           verbose=2)
-      self.assertGreater(history.history['val_acc'][-1], 0.85)
+      self.assertGreater(history.history['val_acc'][-1], 0.7)
 
-  def test_temporal_classification_declarative(self):
+  def test_temporal_classification_sequential(self):
     with self.test_session():
-      np.random.seed(1336)
-      (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
-          train_samples=200,
-          test_samples=100,
-          input_shape=(4, 8),
+      np.random.seed(1337)
+      (x_train, y_train), _ = testing_utils.get_test_data(
+          train_samples=100,
+          test_samples=0,
+          input_shape=(4, 10),
           num_classes=2)
       y_train = keras.utils.to_categorical(y_train)
-      y_test = keras.utils.to_categorical(y_test)
 
       model = keras.models.Sequential()
       model.add(keras.layers.LSTM(5, return_sequences=True,
                                   input_shape=x_train.shape[1:]))
       model.add(keras.layers.GRU(y_train.shape[-1], activation='softmax'))
       model.compile(loss='categorical_crossentropy',
-                    optimizer='adam',
+                    optimizer=keras.optimizers.Adam(lr=0.1),
                     metrics=['accuracy'])
       history = model.fit(x_train, y_train, epochs=10, batch_size=16,
-                          validation_data=(x_test, y_test),
+                          validation_data=(x_train, y_train),
                           verbose=2)
-      self.assertGreater(history.history['val_acc'][-1], 0.85)
+      self.assertGreater(history.history['val_acc'][-1], 0.7)
 
-  def test_image_classification_declarative(self):
+  def test_image_classification_sequential(self):
     with self.test_session():
       np.random.seed(1337)
-      (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
-          train_samples=200,
-          test_samples=100,
-          input_shape=(8, 8, 3),
+      (x_train, y_train), _ = testing_utils.get_test_data(
+          train_samples=100,
+          test_samples=0,
+          input_shape=(12, 12, 3),
           num_classes=2)
       y_train = keras.utils.to_categorical(y_train)
-      y_test = keras.utils.to_categorical(y_test)
 
       model = keras.models.Sequential()
       model.add(keras.layers.Conv2D(
-          8, 3,
+          4, 3,
+          padding='same',
           activation='relu',
           input_shape=x_train.shape[1:]))
-      model.add(keras.layers.BatchNormalization())
       model.add(keras.layers.Conv2D(
           8, 3,
           padding='same',
           activation='relu'))
-      model.add(keras.layers.GlobalMaxPooling2D())
+      model.add(keras.layers.Conv2D(
+          16, 3,
+          padding='same',
+          activation='relu'))
+      model.add(keras.layers.Flatten())
       model.add(keras.layers.Dense(y_train.shape[-1], activation='softmax'))
       model.compile(loss='categorical_crossentropy',
-                    optimizer='adam',
+                    optimizer=keras.optimizers.SGD(lr=0.01, momentum=0.8),
                     metrics=['accuracy'])
       history = model.fit(x_train, y_train, epochs=10, batch_size=16,
-                          validation_data=(x_test, y_test),
+                          validation_data=(x_train, y_train),
                           verbose=2)
-      self.assertGreater(history.history['val_acc'][-1], 0.85)
+      self.assertGreater(history.history['val_acc'][-1], 0.7)
 
   def test_video_classification_functional(self):
     with self.test_session():
       np.random.seed(1337)
-      (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
-          train_samples=200,
-          test_samples=100,
+      (x_train, y_train), _ = testing_utils.get_test_data(
+          train_samples=100,
+          test_samples=0,
           input_shape=(4, 8, 8, 3),
           num_classes=3)
       y_train = keras.utils.to_categorical(y_train)
-      y_test = keras.utils.to_categorical(y_test)
 
       inputs = keras.layers.Input(shape=x_train.shape[1:])
       x = keras.layers.TimeDistributed(
@@ -159,22 +158,21 @@ class KerasIntegrationTest(test.TestCase):
                     optimizer=keras.optimizers.SGD(lr=0.01, momentum=0.8),
                     metrics=['accuracy'])
       history = model.fit(x_train, y_train, epochs=10, batch_size=16,
-                          validation_data=(x_test, y_test),
+                          validation_data=(x_train, y_train),
                           verbose=2)
-      self.assertGreater(history.history['val_acc'][-1], 0.70)
+      self.assertGreater(history.history['val_acc'][-1], 0.7)
 
   def test_vector_classification_shared_sequential(self):
     # Test that Sequential models that feature internal updates
     # and internal losses can be shared.
     with self.test_session():
       np.random.seed(1337)
-      (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
-          train_samples=200,
-          test_samples=100,
+      (x_train, y_train), _ = testing_utils.get_test_data(
+          train_samples=100,
+          test_samples=0,
           input_shape=(10,),
           num_classes=2)
       y_train = keras.utils.to_categorical(y_train)
-      y_test = keras.utils.to_categorical(y_test)
 
       base_model = keras.models.Sequential([
           keras.layers.Dense(16,
@@ -189,27 +187,26 @@ class KerasIntegrationTest(test.TestCase):
       y = keras.layers.Dense(y_train.shape[-1], activation='softmax')(y)
       model = keras.models.Model(x, y)
       model.compile(loss='categorical_crossentropy',
-                    optimizer='rmsprop',
+                    optimizer=keras.optimizers.Adam(lr=0.1),
                     metrics=['accuracy'])
       self.assertEqual(len(model.losses), 2)
       self.assertEqual(len(model.updates), 2)
       history = model.fit(x_train, y_train, epochs=10, batch_size=16,
-                          validation_data=(x_test, y_test),
+                          validation_data=(x_train, y_train),
                           verbose=2)
-      self.assertGreater(history.history['val_acc'][-1], 0.84)
+      self.assertGreater(history.history['val_acc'][-1], 0.7)
 
   def test_vector_classification_shared_model(self):
     # Test that functional models that feature internal updates
     # and internal losses can be shared.
     with self.test_session():
       np.random.seed(1337)
-      (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
-          train_samples=200,
-          test_samples=100,
+      (x_train, y_train), _ = testing_utils.get_test_data(
+          train_samples=100,
+          test_samples=0,
           input_shape=(10,),
           num_classes=2)
       y_train = keras.utils.to_categorical(y_train)
-      y_test = keras.utils.to_categorical(y_test)
 
       inputs = keras.layers.Input(x_train.shape[1:])
       x = keras.layers.Dense(16,
@@ -225,12 +222,12 @@ class KerasIntegrationTest(test.TestCase):
       y = keras.layers.Dense(y_train.shape[-1], activation='softmax')(y)
       model = keras.models.Model(x, y)
       model.compile(loss='categorical_crossentropy',
-                    optimizer='rmsprop',
+                    optimizer=keras.optimizers.Adam(lr=0.1),
                     metrics=['accuracy'])
       history = model.fit(x_train, y_train, epochs=10, batch_size=16,
-                          validation_data=(x_test, y_test),
+                          validation_data=(x_train, y_train),
                           verbose=2)
-      self.assertGreater(history.history['val_acc'][-1], 0.85)
+      self.assertGreater(history.history['val_acc'][-1], 0.7)
 
   def test_embedding_with_clipnorm(self):
     with self.test_session():
@@ -242,9 +239,9 @@ class KerasIntegrationTest(test.TestCase):
   def test_using_tf_layers_in_keras_sequential_model(self):
     with self.test_session():
       np.random.seed(1337)
-      (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
-          train_samples=200,
-          test_samples=100,
+      (x_train, y_train), _ = testing_utils.get_test_data(
+          train_samples=100,
+          test_samples=0,
           input_shape=(10,),
           num_classes=2)
 
@@ -254,25 +251,23 @@ class KerasIntegrationTest(test.TestCase):
       model.summary()
 
       y_train = keras.utils.to_categorical(y_train)
-      y_test = keras.utils.to_categorical(y_test)
       model.compile(loss='categorical_crossentropy',
-                    optimizer='adam',
+                    optimizer=keras.optimizers.Adam(lr=0.1),
                     metrics=['accuracy'])
       history = model.fit(x_train, y_train, epochs=10, batch_size=16,
-                          validation_data=(x_test, y_test),
+                          validation_data=(x_train, y_train),
                           verbose=0)
-      self.assertGreater(history.history['val_acc'][-1], 0.85)
+      self.assertGreater(history.history['val_acc'][-1], 0.7)
 
   def test_using_tf_layers_in_keras_functional_model(self):
     with self.test_session():
       np.random.seed(1337)
-      (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
-          train_samples=200,
-          test_samples=100,
+      (x_train, y_train), _ = testing_utils.get_test_data(
+          train_samples=100,
+          test_samples=0,
           input_shape=(10,),
           num_classes=2)
       y_train = keras.utils.to_categorical(y_train)
-      y_test = keras.utils.to_categorical(y_test)
 
       inputs = keras.Input(shape=(10,))
       x = tf_core_layers.Dense(32, activation=nn.relu)(inputs)
@@ -281,12 +276,12 @@ class KerasIntegrationTest(test.TestCase):
       model.summary()
 
       model.compile(loss='categorical_crossentropy',
-                    optimizer='adam',
+                    optimizer=keras.optimizers.Adam(lr=0.1),
                     metrics=['accuracy'])
       history = model.fit(x_train, y_train, epochs=10, batch_size=16,
-                          validation_data=(x_test, y_test),
+                          validation_data=(x_train, y_train),
                           verbose=0)
-      self.assertGreater(history.history['val_acc'][-1], 0.85)
+      self.assertGreater(history.history['val_acc'][-1], 0.7)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/keras/_impl/keras/layers/advanced_activations.py b/tensorflow/python/keras/_impl/keras/layers/advanced_activations.py
index c40ee109aaea7dacea72e095b1d8cea3ed2e9bf8..11ca89d625bebb607b2bddbe65b8251f52aa6e4c 100644
--- a/tensorflow/python/keras/_impl/keras/layers/advanced_activations.py
+++ b/tensorflow/python/keras/_impl/keras/layers/advanced_activations.py
@@ -26,6 +26,7 @@ from tensorflow.python.keras._impl.keras import regularizers
 from tensorflow.python.keras._impl.keras.engine import InputSpec
 from tensorflow.python.keras._impl.keras.engine import Layer
 from tensorflow.python.keras._impl.keras.engine.base_layer import shape_type_conversion
+from tensorflow.python.ops import math_ops
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -146,7 +147,7 @@ class PReLU(Layer):
     if K.backend() == 'theano':
       neg = (
           K.pattern_broadcast(self.alpha, self.param_broadcast) *
-          (inputs - K.abs(inputs)) * 0.5)
+          (inputs - math_ops.abs(inputs)) * 0.5)
     else:
       neg = -self.alpha * K.relu(-inputs)
     return pos + neg
@@ -232,7 +233,8 @@ class ThresholdedReLU(Layer):
     self.theta = K.cast_to_floatx(theta)
 
   def call(self, inputs, mask=None):
-    return inputs * K.cast(K.greater(inputs, self.theta), K.floatx())
+    return inputs * math_ops.cast(
+        math_ops.greater(inputs, self.theta), K.floatx())
 
   def get_config(self):
     config = {'theta': float(self.theta)}
diff --git a/tensorflow/python/keras/_impl/keras/layers/convolutional.py b/tensorflow/python/keras/_impl/keras/layers/convolutional.py
index 162ae6c28f1afae1dd8aaf70213b808d9ad9598f..d202b6551daf5e9a1233fe90ad8470f3cd06f5a4 100644
--- a/tensorflow/python/keras/_impl/keras/layers/convolutional.py
+++ b/tensorflow/python/keras/_impl/keras/layers/convolutional.py
@@ -19,14 +19,16 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.eager import context
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.keras._impl.keras import activations
-from tensorflow.python.keras._impl.keras import backend as K
+from tensorflow.python.keras._impl.keras import backend
 from tensorflow.python.keras._impl.keras import constraints
 from tensorflow.python.keras._impl.keras import initializers
 from tensorflow.python.keras._impl.keras import regularizers
 from tensorflow.python.keras._impl.keras.engine import InputSpec
 from tensorflow.python.keras._impl.keras.engine import Layer
+from tensorflow.python.keras._impl.keras.engine.base_layer import shape_type_conversion
 # imports for backwards namespace compatibility
 # pylint: disable=unused-import
 from tensorflow.python.keras._impl.keras.layers.pooling import AveragePooling1D
@@ -37,12 +39,232 @@ from tensorflow.python.keras._impl.keras.layers.pooling import MaxPooling2D
 from tensorflow.python.keras._impl.keras.layers.pooling import MaxPooling3D
 # pylint: enable=unused-import
 from tensorflow.python.keras._impl.keras.utils import conv_utils
-from tensorflow.python.layers import convolutional as tf_convolutional_layers
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import nn
+from tensorflow.python.ops import nn_ops
 from tensorflow.python.util.tf_export import tf_export
 
 
+class Conv(Layer):
+  """Abstract nD convolution layer (private, used as implementation base).
+
+  This layer creates a convolution kernel that is convolved
+  (actually cross-correlated) with the layer input to produce a tensor of
+  outputs. If `use_bias` is True (and a `bias_initializer` is provided),
+  a bias vector is created and added to the outputs. Finally, if
+  `activation` is not `None`, it is applied to the outputs as well.
+
+  Arguments:
+    rank: An integer, the rank of the convolution, e.g. "2" for 2D convolution.
+    filters: Integer, the dimensionality of the output space (i.e. the number
+      of filters in the convolution).
+    kernel_size: An integer or tuple/list of n integers, specifying the
+      length of the convolution window.
+    strides: An integer or tuple/list of n integers,
+      specifying the stride length of the convolution.
+      Specifying any stride value != 1 is incompatible with specifying
+      any `dilation_rate` value != 1.
+    padding: One of `"valid"` or `"same"` (case-insensitive).
+    data_format: A string, one of `channels_last` (default) or `channels_first`.
+      The ordering of the dimensions in the inputs.
+      `channels_last` corresponds to inputs with shape
+      `(batch, ..., channels)` while `channels_first` corresponds to
+      inputs with shape `(batch, channels, ...)`.
+    dilation_rate: An integer or tuple/list of n integers, specifying
+      the dilation rate to use for dilated convolution.
+      Currently, specifying any `dilation_rate` value != 1 is
+      incompatible with specifying any `strides` value != 1.
+    activation: Activation function. Set it to None to maintain a
+      linear activation.
+    use_bias: Boolean, whether the layer uses a bias.
+    kernel_initializer: An initializer for the convolution kernel.
+    bias_initializer: An initializer for the bias vector. If None, the default
+      initializer will be used.
+    kernel_regularizer: Optional regularizer for the convolution kernel.
+    bias_regularizer: Optional regularizer for the bias vector.
+    activity_regularizer: Optional regularizer function for the output.
+    kernel_constraint: Optional projection function to be applied to the
+        kernel after being updated by an `Optimizer` (e.g. used to implement
+        norm constraints or value constraints for layer weights). The function
+        must take as input the unprojected variable and must return the
+        projected variable (which must have the same shape). Constraints are
+        not safe to use when doing asynchronous distributed training.
+    bias_constraint: Optional projection function to be applied to the
+        bias after being updated by an `Optimizer`.
+    trainable: Boolean, if `True` also add variables to the graph collection
+      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
+    name: A string, the name of the layer.
+  """
+
+  def __init__(self, rank,
+               filters,
+               kernel_size,
+               strides=1,
+               padding='valid',
+               data_format=None,
+               dilation_rate=1,
+               activation=None,
+               use_bias=True,
+               kernel_initializer='glorot_uniform',
+               bias_initializer='zeros',
+               kernel_regularizer=None,
+               bias_regularizer=None,
+               activity_regularizer=None,
+               kernel_constraint=None,
+               bias_constraint=None,
+               trainable=True,
+               name=None,
+               **kwargs):
+    super(Conv, self).__init__(
+        trainable=trainable,
+        name=name,
+        activity_regularizer=regularizers.get(activity_regularizer),
+        **kwargs)
+    self.rank = rank
+    self.filters = filters
+    self.kernel_size = conv_utils.normalize_tuple(
+        kernel_size, rank, 'kernel_size')
+    self.strides = conv_utils.normalize_tuple(strides, rank, 'strides')
+    self.padding = conv_utils.normalize_padding(padding)
+    self.data_format = conv_utils.normalize_data_format(data_format)
+    self.dilation_rate = conv_utils.normalize_tuple(
+        dilation_rate, rank, 'dilation_rate')
+    self.activation = activations.get(activation)
+    self.use_bias = use_bias
+    self.kernel_initializer = initializers.get(kernel_initializer)
+    self.bias_initializer = initializers.get(bias_initializer)
+    self.kernel_regularizer = regularizers.get(kernel_regularizer)
+    self.bias_regularizer = regularizers.get(bias_regularizer)
+    self.kernel_constraint = constraints.get(kernel_constraint)
+    self.bias_constraint = constraints.get(bias_constraint)
+    self.input_spec = InputSpec(ndim=self.rank + 2)
+
+  def build(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape)
+    if self.data_format == 'channels_first':
+      channel_axis = 1
+    else:
+      channel_axis = -1
+    if input_shape[channel_axis].value is None:
+      raise ValueError('The channel dimension of the inputs '
+                       'should be defined. Found `None`.')
+    input_dim = input_shape[channel_axis].value
+    kernel_shape = self.kernel_size + (input_dim, self.filters)
+
+    self.kernel = self.add_variable(name='kernel',
+                                    shape=kernel_shape,
+                                    initializer=self.kernel_initializer,
+                                    regularizer=self.kernel_regularizer,
+                                    constraint=self.kernel_constraint,
+                                    trainable=True,
+                                    dtype=self.dtype)
+    if self.use_bias:
+      self.bias = self.add_variable(name='bias',
+                                    shape=(self.filters,),
+                                    initializer=self.bias_initializer,
+                                    regularizer=self.bias_regularizer,
+                                    constraint=self.bias_constraint,
+                                    trainable=True,
+                                    dtype=self.dtype)
+    else:
+      self.bias = None
+    self.input_spec = InputSpec(ndim=self.rank + 2,
+                                axes={channel_axis: input_dim})
+    self._convolution_op = nn_ops.Convolution(
+        input_shape,
+        filter_shape=self.kernel.get_shape(),
+        dilation_rate=self.dilation_rate,
+        strides=self.strides,
+        padding=self.padding.upper(),
+        data_format=conv_utils.convert_data_format(self.data_format,
+                                                   self.rank + 2))
+    self.built = True
+
+  def call(self, inputs):
+    outputs = self._convolution_op(inputs, self.kernel)
+
+    if self.use_bias:
+      if self.data_format == 'channels_first':
+        if self.rank == 1:
+          # nn.bias_add does not accept a 1D input tensor.
+          bias = array_ops.reshape(self.bias, (1, self.filters, 1))
+          outputs += bias
+        if self.rank == 2:
+          outputs = nn.bias_add(outputs, self.bias, data_format='NCHW')
+        if self.rank == 3:
+          # As of Mar 2017, direct addition is significantly slower than
+          # bias_add when computing gradients. To use bias_add, we collapse Z
+          # and Y into a single dimension to obtain a 4D input tensor.
+          outputs_shape = outputs.shape.as_list()
+          if outputs_shape[0] is None:
+            outputs_shape[0] = -1
+          outputs_4d = array_ops.reshape(outputs,
+                                         [outputs_shape[0], outputs_shape[1],
+                                          outputs_shape[2] * outputs_shape[3],
+                                          outputs_shape[4]])
+          outputs_4d = nn.bias_add(outputs_4d, self.bias, data_format='NCHW')
+          outputs = array_ops.reshape(outputs_4d, outputs_shape)
+      else:
+        outputs = nn.bias_add(outputs, self.bias, data_format='NHWC')
+
+    if self.activation is not None:
+      return self.activation(outputs)
+    return outputs
+
+  def compute_output_shape(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape).as_list()
+    if self.data_format == 'channels_last':
+      space = input_shape[1:-1]
+      new_space = []
+      for i in range(len(space)):
+        new_dim = conv_utils.conv_output_length(
+            space[i],
+            self.kernel_size[i],
+            padding=self.padding,
+            stride=self.strides[i],
+            dilation=self.dilation_rate[i])
+        new_space.append(new_dim)
+      return tensor_shape.TensorShape([input_shape[0]] + new_space +
+                                      [self.filters])
+    else:
+      space = input_shape[2:]
+      new_space = []
+      for i in range(len(space)):
+        new_dim = conv_utils.conv_output_length(
+            space[i],
+            self.kernel_size[i],
+            padding=self.padding,
+            stride=self.strides[i],
+            dilation=self.dilation_rate[i])
+        new_space.append(new_dim)
+      return tensor_shape.TensorShape([input_shape[0], self.filters] +
+                                      new_space)
+
+  def get_config(self):
+    config = {
+        'filters': self.filters,
+        'kernel_size': self.kernel_size,
+        'strides': self.strides,
+        'padding': self.padding,
+        'data_format': self.data_format,
+        'dilation_rate': self.dilation_rate,
+        'activation': activations.serialize(self.activation),
+        'use_bias': self.use_bias,
+        'kernel_initializer': initializers.serialize(self.kernel_initializer),
+        'bias_initializer': initializers.serialize(self.bias_initializer),
+        'kernel_regularizer': regularizers.serialize(self.kernel_regularizer),
+        'bias_regularizer': regularizers.serialize(self.bias_regularizer),
+        'activity_regularizer':
+            regularizers.serialize(self.activity_regularizer),
+        'kernel_constraint': constraints.serialize(self.kernel_constraint),
+        'bias_constraint': constraints.serialize(self.bias_constraint)
+    }
+    base_config = super(Conv, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
+
 @tf_export('keras.layers.Conv1D', 'keras.layers.Convolution1D')
-class Conv1D(tf_convolutional_layers.Conv1D, Layer):
+class Conv1D(Conv):
   """1D convolution layer (e.g. temporal convolution).
 
   This layer creates a convolution kernel that is convolved
@@ -73,6 +295,8 @@ class Conv1D(tf_convolutional_layers.Conv1D, Layer):
           where the model should not violate the temporal order.
           See [WaveNet: A Generative Model for Raw Audio, section
             2.1](https://arxiv.org/abs/1609.03499).
+      data_format: A string,
+          one of `channels_last` (default) or `channels_first`.
       dilation_rate: an integer or tuple/list of a single integer, specifying
           the dilation rate to use for dilated convolution.
           Currently, specifying any `dilation_rate` value != 1 is
@@ -104,6 +328,7 @@ class Conv1D(tf_convolutional_layers.Conv1D, Layer):
                kernel_size,
                strides=1,
                padding='valid',
+               data_format='channels_last',
                dilation_rate=1,
                activation=None,
                use_bias=True,
@@ -116,11 +341,12 @@ class Conv1D(tf_convolutional_layers.Conv1D, Layer):
                bias_constraint=None,
                **kwargs):
     super(Conv1D, self).__init__(
+        rank=1,
         filters=filters,
         kernel_size=kernel_size,
         strides=strides,
         padding=padding,
-        data_format='channels_last',
+        data_format=data_format,
         dilation_rate=dilation_rate,
         activation=activations.get(activation),
         use_bias=use_bias,
@@ -133,30 +359,9 @@ class Conv1D(tf_convolutional_layers.Conv1D, Layer):
         bias_constraint=constraints.get(bias_constraint),
         **kwargs)
 
-  def get_config(self):
-    config = {
-        'filters': self.filters,
-        'kernel_size': self.kernel_size,
-        'strides': self.strides,
-        'padding': self.padding,
-        'dilation_rate': self.dilation_rate,
-        'activation': activations.serialize(self.activation),
-        'use_bias': self.use_bias,
-        'kernel_initializer': initializers.serialize(self.kernel_initializer),
-        'bias_initializer': initializers.serialize(self.bias_initializer),
-        'kernel_regularizer': regularizers.serialize(self.kernel_regularizer),
-        'bias_regularizer': regularizers.serialize(self.bias_regularizer),
-        'activity_regularizer':
-            regularizers.serialize(self.activity_regularizer),
-        'kernel_constraint': constraints.serialize(self.kernel_constraint),
-        'bias_constraint': constraints.serialize(self.bias_constraint)
-    }
-    base_config = super(Conv1D, self).get_config()
-    return dict(list(base_config.items()) + list(config.items()))
-
 
 @tf_export('keras.layers.Conv2D', 'keras.layers.Convolution2D')
-class Conv2D(tf_convolutional_layers.Conv2D, Layer):
+class Conv2D(Conv):
   """2D convolution layer (e.g. spatial convolution over images).
 
   This layer creates a convolution kernel that is convolved
@@ -246,9 +451,8 @@ class Conv2D(tf_convolutional_layers.Conv2D, Layer):
                kernel_constraint=None,
                bias_constraint=None,
                **kwargs):
-    if data_format is None:
-      data_format = K.image_data_format()
     super(Conv2D, self).__init__(
+        rank=2,
         filters=filters,
         kernel_size=kernel_size,
         strides=strides,
@@ -266,31 +470,9 @@ class Conv2D(tf_convolutional_layers.Conv2D, Layer):
         bias_constraint=constraints.get(bias_constraint),
         **kwargs)
 
-  def get_config(self):
-    config = {
-        'filters': self.filters,
-        'kernel_size': self.kernel_size,
-        'strides': self.strides,
-        'padding': self.padding,
-        'data_format': self.data_format,
-        'dilation_rate': self.dilation_rate,
-        'activation': activations.serialize(self.activation),
-        'use_bias': self.use_bias,
-        'kernel_initializer': initializers.serialize(self.kernel_initializer),
-        'bias_initializer': initializers.serialize(self.bias_initializer),
-        'kernel_regularizer': regularizers.serialize(self.kernel_regularizer),
-        'bias_regularizer': regularizers.serialize(self.bias_regularizer),
-        'activity_regularizer':
-            regularizers.serialize(self.activity_regularizer),
-        'kernel_constraint': constraints.serialize(self.kernel_constraint),
-        'bias_constraint': constraints.serialize(self.bias_constraint)
-    }
-    base_config = super(Conv2D, self).get_config()
-    return dict(list(base_config.items()) + list(config.items()))
-
 
 @tf_export('keras.layers.Conv3D', 'keras.layers.Convolution3D')
-class Conv3D(tf_convolutional_layers.Conv3D, Layer):
+class Conv3D(Conv):
   """3D convolution layer (e.g. spatial convolution over volumes).
 
   This layer creates a convolution kernel that is convolved
@@ -387,9 +569,8 @@ class Conv3D(tf_convolutional_layers.Conv3D, Layer):
                kernel_constraint=None,
                bias_constraint=None,
                **kwargs):
-    if data_format is None:
-      data_format = K.image_data_format()
     super(Conv3D, self).__init__(
+        rank=3,
         filters=filters,
         kernel_size=kernel_size,
         strides=strides,
@@ -407,32 +588,10 @@ class Conv3D(tf_convolutional_layers.Conv3D, Layer):
         bias_constraint=constraints.get(bias_constraint),
         **kwargs)
 
-  def get_config(self):
-    config = {
-        'filters': self.filters,
-        'kernel_size': self.kernel_size,
-        'strides': self.strides,
-        'padding': self.padding,
-        'data_format': self.data_format,
-        'dilation_rate': self.dilation_rate,
-        'activation': activations.serialize(self.activation),
-        'use_bias': self.use_bias,
-        'kernel_initializer': initializers.serialize(self.kernel_initializer),
-        'bias_initializer': initializers.serialize(self.bias_initializer),
-        'kernel_regularizer': regularizers.serialize(self.kernel_regularizer),
-        'bias_regularizer': regularizers.serialize(self.bias_regularizer),
-        'activity_regularizer':
-            regularizers.serialize(self.activity_regularizer),
-        'kernel_constraint': constraints.serialize(self.kernel_constraint),
-        'bias_constraint': constraints.serialize(self.bias_constraint)
-    }
-    base_config = super(Conv3D, self).get_config()
-    return dict(list(base_config.items()) + list(config.items()))
-
 
 @tf_export('keras.layers.Conv2DTranspose',
            'keras.layers.Convolution2DTranspose')
-class Conv2DTranspose(tf_convolutional_layers.Conv2DTranspose, Layer):
+class Conv2DTranspose(Conv2D):
   """Transposed convolution layer (sometimes called Deconvolution).
 
   The need for transposed convolutions generally arises
@@ -528,8 +687,6 @@ class Conv2DTranspose(tf_convolutional_layers.Conv2DTranspose, Layer):
                kernel_constraint=None,
                bias_constraint=None,
                **kwargs):
-    if data_format is None:
-      data_format = K.image_data_format()
     super(Conv2DTranspose, self).__init__(
         filters=filters,
         kernel_size=kernel_size,
@@ -547,31 +704,123 @@ class Conv2DTranspose(tf_convolutional_layers.Conv2DTranspose, Layer):
         bias_constraint=constraints.get(bias_constraint),
         **kwargs)
 
-  def get_config(self):
-    config = {
-        'filters': self.filters,
-        'kernel_size': self.kernel_size,
-        'strides': self.strides,
-        'padding': self.padding,
-        'data_format': self.data_format,
-        'activation': activations.serialize(self.activation),
-        'use_bias': self.use_bias,
-        'kernel_initializer': initializers.serialize(self.kernel_initializer),
-        'bias_initializer': initializers.serialize(self.bias_initializer),
-        'kernel_regularizer': regularizers.serialize(self.kernel_regularizer),
-        'bias_regularizer': regularizers.serialize(self.bias_regularizer),
-        'activity_regularizer':
-            regularizers.serialize(self.activity_regularizer),
-        'kernel_constraint': constraints.serialize(self.kernel_constraint),
-        'bias_constraint': constraints.serialize(self.bias_constraint)
-    }
-    base_config = super(Conv2DTranspose, self).get_config()
-    return dict(list(base_config.items()) + list(config.items()))
+  def build(self, input_shape):
+    if len(input_shape) != 4:
+      raise ValueError('Inputs should have rank 4. Received input shape: ' +
+                       str(input_shape))
+    if self.data_format == 'channels_first':
+      channel_axis = 1
+    else:
+      channel_axis = -1
+    if input_shape[channel_axis] is None:
+      raise ValueError('The channel dimension of the inputs '
+                       'should be defined. Found `None`.')
+    input_dim = input_shape[channel_axis]
+    self.input_spec = InputSpec(ndim=4, axes={channel_axis: input_dim})
+    kernel_shape = self.kernel_size + (self.filters, input_dim)
+
+    self.kernel = self.add_variable(name='kernel',
+                                    shape=kernel_shape,
+                                    initializer=self.kernel_initializer,
+                                    regularizer=self.kernel_regularizer,
+                                    constraint=self.kernel_constraint,
+                                    trainable=True,
+                                    dtype=self.dtype)
+    if self.use_bias:
+      self.bias = self.add_variable(name='bias',
+                                    shape=(self.filters,),
+                                    initializer=self.bias_initializer,
+                                    regularizer=self.bias_regularizer,
+                                    constraint=self.bias_constraint,
+                                    trainable=True,
+                                    dtype=self.dtype)
+    else:
+      self.bias = None
+    self.built = True
+
+  def call(self, inputs):
+    inputs_shape = array_ops.shape(inputs)
+    batch_size = inputs_shape[0]
+    if self.data_format == 'channels_first':
+      c_axis, h_axis, w_axis = 1, 2, 3
+    else:
+      c_axis, h_axis, w_axis = 3, 1, 2
+
+    height, width = inputs_shape[h_axis], inputs_shape[w_axis]
+    kernel_h, kernel_w = self.kernel_size
+    stride_h, stride_w = self.strides
+
+    # Infer the dynamic output shape:
+    out_height = conv_utils.deconv_output_length(height,
+                                                 kernel_h,
+                                                 self.padding,
+                                                 stride_h)
+    out_width = conv_utils.deconv_output_length(width,
+                                                kernel_w,
+                                                self.padding,
+                                                stride_w)
+    if self.data_format == 'channels_first':
+      output_shape = (batch_size, self.filters, out_height, out_width)
+      strides = (1, 1, stride_h, stride_w)
+    else:
+      output_shape = (batch_size, out_height, out_width, self.filters)
+      strides = (1, stride_h, stride_w, 1)
+
+    output_shape_tensor = array_ops.stack(output_shape)
+    outputs = nn.conv2d_transpose(
+        inputs,
+        self.kernel,
+        output_shape_tensor,
+        strides,
+        padding=self.padding.upper(),
+        data_format=conv_utils.convert_data_format(self.data_format, ndim=4))
+
+    if not context.executing_eagerly():
+      # Infer the static output shape:
+      out_shape = inputs.get_shape().as_list()
+      out_shape[c_axis] = self.filters
+      out_shape[h_axis] = conv_utils.deconv_output_length(out_shape[h_axis],
+                                                          kernel_h,
+                                                          self.padding,
+                                                          stride_h)
+      out_shape[w_axis] = conv_utils.deconv_output_length(out_shape[w_axis],
+                                                          kernel_w,
+                                                          self.padding,
+                                                          stride_w)
+      outputs.set_shape(out_shape)
+
+    if self.use_bias:
+      outputs = nn.bias_add(
+          outputs,
+          self.bias,
+          data_format=conv_utils.convert_data_format(self.data_format, ndim=4))
+
+    if self.activation is not None:
+      return self.activation(outputs)
+    return outputs
+
+  def compute_output_shape(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape).as_list()
+    output_shape = list(input_shape)
+    if self.data_format == 'channels_first':
+      c_axis, h_axis, w_axis = 1, 2, 3
+    else:
+      c_axis, h_axis, w_axis = 3, 1, 2
+
+    kernel_h, kernel_w = self.kernel_size
+    stride_h, stride_w = self.strides
+
+    output_shape[c_axis] = self.filters
+    output_shape[h_axis] = conv_utils.deconv_output_length(
+        output_shape[h_axis], kernel_h, self.padding, stride_h)
+    output_shape[w_axis] = conv_utils.deconv_output_length(
+        output_shape[w_axis], kernel_w, self.padding, stride_w)
+    return tensor_shape.TensorShape(output_shape)
 
 
 @tf_export('keras.layers.Conv3DTranspose',
            'keras.layers.Convolution3DTranspose')
-class Conv3DTranspose(tf_convolutional_layers.Conv3DTranspose, Layer):
+class Conv3DTranspose(Conv3D):
   """Transposed convolution layer (sometimes called Deconvolution).
 
   The need for transposed convolutions generally arises
@@ -678,8 +927,6 @@ class Conv3DTranspose(tf_convolutional_layers.Conv3DTranspose, Layer):
                kernel_constraint=None,
                bias_constraint=None,
                **kwargs):
-    if data_format is None:
-      data_format = K.image_data_format()
     super(Conv3DTranspose, self).__init__(
         filters=filters,
         kernel_size=kernel_size,
@@ -697,6 +944,313 @@ class Conv3DTranspose(tf_convolutional_layers.Conv3DTranspose, Layer):
         bias_constraint=constraints.get(bias_constraint),
         **kwargs)
 
+  def build(self, input_shape):
+    if len(input_shape) != 5:
+      raise ValueError('Inputs should have rank 5, received input shape:',
+                       str(input_shape))
+    if self.data_format == 'channels_first':
+      channel_axis = 1
+    else:
+      channel_axis = -1
+    if input_shape[channel_axis] is None:
+      raise ValueError('The channel dimension of the inputs '
+                       'should be defined, found None: ' + str(input_shape))
+    input_dim = input_shape[channel_axis]
+    kernel_shape = self.kernel_size + (self.filters, input_dim)
+    self.input_spec = InputSpec(ndim=5, axes={channel_axis: input_dim})
+
+    self.kernel = self.add_variable(
+        'kernel',
+        shape=kernel_shape,
+        initializer=self.kernel_initializer,
+        regularizer=self.kernel_regularizer,
+        constraint=self.kernel_constraint,
+        trainable=True,
+        dtype=self.dtype)
+    if self.use_bias:
+      self.bias = self.add_variable(
+          'bias',
+          shape=(self.filters,),
+          initializer=self.bias_initializer,
+          regularizer=self.bias_regularizer,
+          constraint=self.bias_constraint,
+          trainable=True,
+          dtype=self.dtype)
+    else:
+      self.bias = None
+    self.built = True
+
+  def call(self, inputs):
+    inputs_shape = array_ops.shape(inputs)
+    batch_size = inputs_shape[0]
+    if self.data_format == 'channels_first':
+      c_axis, d_axis, h_axis, w_axis = 1, 2, 3, 4
+    else:
+      c_axis, d_axis, h_axis, w_axis = 4, 1, 2, 3
+
+    self.input_spec = InputSpec(ndim=5, axes={c_axis: inputs_shape[c_axis]})
+
+    depth = inputs_shape[d_axis]
+    height = inputs_shape[h_axis]
+    width = inputs_shape[w_axis]
+
+    kernel_d, kernel_h, kernel_w = self.kernel_size
+    stride_d, stride_h, stride_w = self.strides
+
+    # Infer the dynamic output shape:
+    out_depth = conv_utils.deconv_output_length(depth,
+                                                kernel_d,
+                                                self.padding,
+                                                stride_d)
+    out_height = conv_utils.deconv_output_length(height,
+                                                 kernel_h,
+                                                 self.padding,
+                                                 stride_h)
+    out_width = conv_utils.deconv_output_length(width,
+                                                kernel_w,
+                                                self.padding,
+                                                stride_w)
+    if self.data_format == 'channels_first':
+      output_shape = (batch_size, self.filters, out_depth, out_height,
+                      out_width)
+      strides = (1, 1, stride_d, stride_h, stride_w)
+    else:
+      output_shape = (batch_size, out_depth, out_height, out_width,
+                      self.filters)
+      strides = (1, stride_d, stride_h, stride_w, 1)
+
+    output_shape_tensor = array_ops.stack(output_shape)
+    outputs = nn.conv3d_transpose(
+        inputs,
+        self.kernel,
+        output_shape_tensor,
+        strides,
+        data_format=conv_utils.convert_data_format(self.data_format, ndim=5),
+        padding=self.padding.upper())
+
+    if not context.executing_eagerly():
+      # Infer the static output shape:
+      out_shape = inputs.get_shape().as_list()
+      out_shape[c_axis] = self.filters
+      out_shape[d_axis] = conv_utils.deconv_output_length(out_shape[d_axis],
+                                                          kernel_d,
+                                                          self.padding,
+                                                          stride_d)
+      out_shape[h_axis] = conv_utils.deconv_output_length(out_shape[h_axis],
+                                                          kernel_h,
+                                                          self.padding,
+                                                          stride_h)
+      out_shape[w_axis] = conv_utils.deconv_output_length(out_shape[w_axis],
+                                                          kernel_w,
+                                                          self.padding,
+                                                          stride_w)
+      outputs.set_shape(out_shape)
+
+    if self.use_bias:
+      outputs_shape = outputs.shape.as_list()
+      if outputs_shape[0] is None:
+        outputs_shape[0] = -1
+      if self.data_format == 'channels_first':
+        outputs_4d = array_ops.reshape(outputs, [
+            outputs_shape[0], outputs_shape[1],
+            outputs_shape[2] * outputs_shape[3], outputs_shape[4]
+        ])
+      else:
+        outputs_4d = array_ops.reshape(outputs, [
+            outputs_shape[0], outputs_shape[1] * outputs_shape[2],
+            outputs_shape[3], outputs_shape[4]
+        ])
+      outputs_4d = nn.bias_add(
+          outputs_4d,
+          self.bias,
+          data_format=conv_utils.convert_data_format(self.data_format, ndim=4))
+      outputs = array_ops.reshape(outputs_4d, outputs_shape)
+
+    if self.activation is not None:
+      return self.activation(outputs)
+    return outputs
+
+  def compute_output_shape(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape).as_list()
+    output_shape = list(input_shape)
+    if self.data_format == 'channels_first':
+      c_axis, d_axis, h_axis, w_axis = 1, 2, 3, 4
+    else:
+      c_axis, d_axis, h_axis, w_axis = 4, 1, 2, 3
+
+    kernel_d, kernel_h, kernel_w = self.kernel_size
+    stride_d, stride_h, stride_w = self.strides
+
+    output_shape[c_axis] = self.filters
+    output_shape[d_axis] = conv_utils.deconv_output_length(
+        output_shape[d_axis], kernel_d, self.padding, stride_d)
+    output_shape[h_axis] = conv_utils.deconv_output_length(
+        output_shape[h_axis], kernel_h, self.padding, stride_h)
+    output_shape[w_axis] = conv_utils.deconv_output_length(
+        output_shape[w_axis], kernel_w, self.padding, stride_w)
+    return tensor_shape.TensorShape(output_shape)
+
+
+class SeparableConv(Conv):
+  """Abstract base layer for separable nD convolution.
+
+  This layer performs a depthwise convolution that acts separately on
+  channels, followed by a pointwise convolution that mixes channels.
+  If `use_bias` is True and a bias initializer is provided,
+  it adds a bias vector to the output.
+  It then optionally applies an activation function to produce the final output.
+
+  Arguments:
+    rank: An integer, the rank of the convolution, e.g. "2" for 2D convolution.
+    filters: Integer, the dimensionality of the output space (i.e. the number
+      of filters in the convolution).
+    kernel_size: A tuple or list of integers specifying the spatial
+      dimensions of the filters. Can be a single integer to specify the same
+      value for all spatial dimensions.
+    strides: A tuple or list of integers specifying the strides
+      of the convolution. Can be a single integer to specify the same value for
+      all spatial dimensions.
+      Specifying any `stride` value != 1 is incompatible with specifying
+      any `dilation_rate` value != 1.
+    padding: One of `"valid"` or `"same"` (case-insensitive).
+    data_format: A string, one of `channels_last` (default) or `channels_first`.
+      The ordering of the dimensions in the inputs.
+      `channels_last` corresponds to inputs with shape
+      `(batch, ..., channels)` while `channels_first` corresponds to
+      inputs with shape `(batch, channels, ...)`.
+    dilation_rate: An integer or tuple/list of 2 integers, specifying
+      the dilation rate to use for dilated convolution.
+      Can be a single integer to specify the same value for
+      all spatial dimensions.
+      Currently, specifying any `dilation_rate` value != 1 is
+      incompatible with specifying any stride value != 1.
+    depth_multiplier: The number of depthwise convolution output channels for
+      each input channel. The total number of depthwise convolution output
+      channels will be equal to `num_filters_in * depth_multiplier`.
+    activation: Activation function. Set it to None to maintain a
+      linear activation.
+    use_bias: Boolean, whether the layer uses a bias.
+    depthwise_initializer: An initializer for the depthwise convolution kernel.
+    pointwise_initializer: An initializer for the pointwise convolution kernel.
+    bias_initializer: An initializer for the bias vector. If None, the default
+      initializer will be used.
+    depthwise_regularizer: Optional regularizer for the depthwise
+      convolution kernel.
+    pointwise_regularizer: Optional regularizer for the pointwise
+      convolution kernel.
+    bias_regularizer: Optional regularizer for the bias vector.
+    activity_regularizer: Optional regularizer function for the output.
+    depthwise_constraint: Optional projection function to be applied to the
+        depthwise kernel after being updated by an `Optimizer` (e.g. used for
+        norm constraints or value constraints for layer weights). The function
+        must take as input the unprojected variable and must return the
+        projected variable (which must have the same shape). Constraints are
+        not safe to use when doing asynchronous distributed training.
+    pointwise_constraint: Optional projection function to be applied to the
+        pointwise kernel after being updated by an `Optimizer`.
+    bias_constraint: Optional projection function to be applied to the
+        bias after being updated by an `Optimizer`.
+    trainable: Boolean, if `True` also add variables to the graph collection
+      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
+    name: A string, the name of the layer.
+  """
+
+  def __init__(self,
+               rank,
+               filters,
+               kernel_size,
+               strides=1,
+               padding='valid',
+               data_format=None,
+               dilation_rate=1,
+               depth_multiplier=1,
+               activation=None,
+               use_bias=True,
+               depthwise_initializer='glorot_uniform',
+               pointwise_initializer='glorot_uniform',
+               bias_initializer='zeros',
+               depthwise_regularizer=None,
+               pointwise_regularizer=None,
+               bias_regularizer=None,
+               activity_regularizer=None,
+               depthwise_constraint=None,
+               pointwise_constraint=None,
+               bias_constraint=None,
+               trainable=True,
+               name=None,
+               **kwargs):
+    super(SeparableConv, self).__init__(
+        rank=rank,
+        filters=filters,
+        kernel_size=kernel_size,
+        strides=strides,
+        padding=padding,
+        data_format=data_format,
+        dilation_rate=dilation_rate,
+        activation=activations.get(activation),
+        use_bias=use_bias,
+        bias_regularizer=regularizers.get(bias_regularizer),
+        activity_regularizer=regularizers.get(activity_regularizer),
+        bias_constraint=bias_constraint,
+        trainable=trainable,
+        name=name,
+        **kwargs)
+    self.depth_multiplier = depth_multiplier
+    self.depthwise_initializer = initializers.get(depthwise_initializer)
+    self.pointwise_initializer = initializers.get(pointwise_initializer)
+    self.depthwise_regularizer = regularizers.get(depthwise_regularizer)
+    self.pointwise_regularizer = regularizers.get(pointwise_regularizer)
+    self.depthwise_constraint = constraints.get(depthwise_constraint)
+    self.pointwise_constraint = constraints.get(pointwise_constraint)
+
+  def build(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape)
+    if self.data_format == 'channels_first':
+      channel_axis = 1
+    else:
+      channel_axis = -1
+    if input_shape[channel_axis].value is None:
+      raise ValueError('The channel dimension of the inputs '
+                       'should be defined. Found `None`.')
+    input_dim = input_shape[channel_axis].value
+    self.input_spec = InputSpec(ndim=self.rank + 2,
+                                axes={channel_axis: input_dim})
+    depthwise_kernel_shape = self.kernel_size + (input_dim,
+                                                 self.depth_multiplier)
+    pointwise_kernel_shape = (
+        1,) * self.rank + (self.depth_multiplier * input_dim, self.filters)
+
+    self.depthwise_kernel = self.add_variable(
+        name='depthwise_kernel',
+        shape=depthwise_kernel_shape,
+        initializer=self.depthwise_initializer,
+        regularizer=self.depthwise_regularizer,
+        constraint=self.depthwise_constraint,
+        trainable=True,
+        dtype=self.dtype)
+    self.pointwise_kernel = self.add_variable(
+        name='pointwise_kernel',
+        shape=pointwise_kernel_shape,
+        initializer=self.pointwise_initializer,
+        regularizer=self.pointwise_regularizer,
+        constraint=self.pointwise_constraint,
+        trainable=True,
+        dtype=self.dtype)
+    if self.use_bias:
+      self.bias = self.add_variable(name='bias',
+                                    shape=(self.filters,),
+                                    initializer=self.bias_initializer,
+                                    regularizer=self.bias_regularizer,
+                                    constraint=self.bias_constraint,
+                                    trainable=True,
+                                    dtype=self.dtype)
+    else:
+      self.bias = None
+    self.built = True
+
+  def call(self, inputs):
+    raise NotImplementedError
+
   def get_config(self):
     config = {
         'filters': self.filters,
@@ -704,24 +1258,34 @@ class Conv3DTranspose(tf_convolutional_layers.Conv3DTranspose, Layer):
         'strides': self.strides,
         'padding': self.padding,
         'data_format': self.data_format,
+        'dilation_rate': self.dilation_rate,
         'activation': activations.serialize(self.activation),
         'use_bias': self.use_bias,
-        'kernel_initializer': initializers.serialize(self.kernel_initializer),
+        'depthwise_initializer':
+            initializers.serialize(self.depthwise_initializer),
+        'pointwise_initializer':
+            initializers.serialize(self.pointwise_initializer),
         'bias_initializer': initializers.serialize(self.bias_initializer),
-        'kernel_regularizer': regularizers.serialize(self.kernel_regularizer),
+        'depthwise_regularizer':
+            regularizers.serialize(self.depthwise_regularizer),
+        'pointwise_regularizer':
+            regularizers.serialize(self.pointwise_regularizer),
         'bias_regularizer': regularizers.serialize(self.bias_regularizer),
         'activity_regularizer':
             regularizers.serialize(self.activity_regularizer),
-        'kernel_constraint': constraints.serialize(self.kernel_constraint),
+        'depthwise_constraint':
+            constraints.serialize(self.depthwise_constraint),
+        'pointwise_constraint':
+            constraints.serialize(self.pointwise_constraint),
         'bias_constraint': constraints.serialize(self.bias_constraint)
     }
-    base_config = super(Conv3DTranspose, self).get_config()
+    base_config = super(SeparableConv, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
 
 
 @tf_export('keras.layers.SeparableConv1D',
            'keras.layers.SeparableConvolution1D')
-class SeparableConv1D(tf_convolutional_layers.SeparableConv1D, Layer):
+class SeparableConv1D(SeparableConv):
   """Depthwise separable 1D convolution.
 
   This layer performs a depthwise convolution that acts separately on
@@ -801,15 +1365,15 @@ class SeparableConv1D(tf_convolutional_layers.SeparableConv1D, Layer):
                pointwise_constraint=None,
                bias_constraint=None,
                **kwargs):
-    if data_format is None:
-      data_format = K.image_data_format()
     super(SeparableConv1D, self).__init__(
+        rank=1,
         filters=filters,
         kernel_size=kernel_size,
         strides=strides,
         padding=padding,
         data_format=data_format,
         dilation_rate=dilation_rate,
+        depth_multiplier=depth_multiplier,
         activation=activations.get(activation),
         use_bias=use_bias,
         depthwise_initializer=initializers.get(depthwise_initializer),
@@ -824,44 +1388,46 @@ class SeparableConv1D(tf_convolutional_layers.SeparableConv1D, Layer):
         bias_constraint=constraints.get(bias_constraint),
         **kwargs)
 
-  def get_config(self):
-    config = {
-        'filters': self.filters,
-        'kernel_size': self.kernel_size,
-        'strides': self.strides,
-        'padding': self.padding,
-        'data_format': self.data_format,
-        'dilation_rate': self.dilation_rate,
-        'activation': activations.serialize(self.activation),
-        'use_bias': self.use_bias,
-        'depthwise_initializer':
-            initializers.serialize(self.depthwise_initializer),
-        'pointwise_initializer':
-            initializers.serialize(self.pointwise_initializer),
-        'bias_initializer':
-            initializers.serialize(self.bias_initializer),
-        'depthwise_regularizer':
-            regularizers.serialize(self.depthwise_regularizer),
-        'pointwise_regularizer':
-            regularizers.serialize(self.pointwise_regularizer),
-        'bias_regularizer':
-            regularizers.serialize(self.bias_regularizer),
-        'activity_regularizer':
-            regularizers.serialize(self.activity_regularizer),
-        'depthwise_constraint':
-            constraints.serialize(self.depthwise_constraint),
-        'pointwise_constraint':
-            constraints.serialize(self.pointwise_constraint),
-        'bias_constraint':
-            constraints.serialize(self.bias_constraint)
-    }
-    base_config = super(SeparableConv1D, self).get_config()
-    return dict(list(base_config.items()) + list(config.items()))
+  def call(self, inputs):
+    if self.data_format == 'channels_last':
+      strides = (1,) + self.strides * 2 + (1,)
+      spatial_start_dim = 1
+    else:
+      strides = (1, 1) + self.strides * 2
+      spatial_start_dim = 2
+
+    # Explicitly broadcast inputs and kernels to 4D.
+    # TODO(fchollet): refactor when a native separable_conv1d op is available.
+    inputs = array_ops.expand_dims(inputs, spatial_start_dim)
+    depthwise_kernel = array_ops.expand_dims(self.depthwise_kernel, 0)
+    pointwise_kernel = array_ops.expand_dims(self.pointwise_kernel, 0)
+    dilation_rate = (1,) + self.dilation_rate
+
+    outputs = nn.separable_conv2d(
+        inputs,
+        depthwise_kernel,
+        pointwise_kernel,
+        strides=strides,
+        padding=self.padding.upper(),
+        rate=dilation_rate,
+        data_format=conv_utils.convert_data_format(self.data_format, ndim=4))
+
+    if self.use_bias:
+      outputs = nn.bias_add(
+          outputs,
+          self.bias,
+          data_format=conv_utils.convert_data_format(self.data_format, ndim=4))
+
+    outputs = array_ops.squeeze(outputs, [spatial_start_dim])
+
+    if self.activation is not None:
+      return self.activation(outputs)
+    return outputs
 
 
 @tf_export('keras.layers.SeparableConv2D',
            'keras.layers.SeparableConvolution2D')
-class SeparableConv2D(tf_convolutional_layers.SeparableConv2D, Layer):
+class SeparableConv2D(SeparableConv):
   """Depthwise separable 2D convolution.
 
   Separable convolutions consist in first performing
@@ -958,15 +1524,15 @@ class SeparableConv2D(tf_convolutional_layers.SeparableConv2D, Layer):
                pointwise_constraint=None,
                bias_constraint=None,
                **kwargs):
-    if data_format is None:
-      data_format = K.image_data_format()
     super(SeparableConv2D, self).__init__(
+        rank=2,
         filters=filters,
         kernel_size=kernel_size,
         strides=strides,
         padding=padding,
         data_format=data_format,
         dilation_rate=dilation_rate,
+        depth_multiplier=depth_multiplier,
         activation=activations.get(activation),
         use_bias=use_bias,
         depthwise_initializer=initializers.get(depthwise_initializer),
@@ -981,47 +1547,224 @@ class SeparableConv2D(tf_convolutional_layers.SeparableConv2D, Layer):
         bias_constraint=constraints.get(bias_constraint),
         **kwargs)
 
+  def call(self, inputs):
+    # Apply the actual ops.
+    if self.data_format == 'channels_last':
+      strides = (1,) + self.strides + (1,)
+    else:
+      strides = (1, 1) + self.strides
+    outputs = nn.separable_conv2d(
+        inputs,
+        self.depthwise_kernel,
+        self.pointwise_kernel,
+        strides=strides,
+        padding=self.padding.upper(),
+        rate=self.dilation_rate,
+        data_format=conv_utils.convert_data_format(self.data_format, ndim=4))
+
+    if self.use_bias:
+      outputs = nn.bias_add(
+          outputs,
+          self.bias,
+          data_format=conv_utils.convert_data_format(self.data_format, ndim=4))
+
+    if self.activation is not None:
+      return self.activation(outputs)
+    return outputs
+
+
+@tf_export('keras.layers.DepthwiseConv2D')
+class DepthwiseConv2D(Conv2D):
+  """Depthwise separable 2D convolution.
+
+  Depthwise Separable convolutions consists in performing
+  just the first step in a depthwise spatial convolution
+  (which acts on each input channel separately).
+  The `depth_multiplier` argument controls how many
+  output channels are generated per input channel in the depthwise step.
+
+  Arguments:
+    kernel_size: An integer or tuple/list of 2 integers, specifying the
+        width and height of the 2D convolution window.
+        Can be a single integer to specify the same value for
+        all spatial dimensions.
+    strides: An integer or tuple/list of 2 integers,
+        specifying the strides of the convolution along the width and height.
+        Can be a single integer to specify the same value for
+        all spatial dimensions.
+        Specifying any stride value != 1 is incompatible with specifying
+        any `dilation_rate` value != 1.
+    padding: one of `'valid'` or `'same'` (case-insensitive).
+    depth_multiplier: The number of depthwise convolution output channels
+        for each input channel.
+        The total number of depthwise convolution output
+        channels will be equal to `filters_in * depth_multiplier`.
+    data_format: A string,
+        one of `channels_last` (default) or `channels_first`.
+        The ordering of the dimensions in the inputs.
+        `channels_last` corresponds to inputs with shape
+        `(batch, height, width, channels)` while `channels_first`
+        corresponds to inputs with shape
+        `(batch, channels, height, width)`.
+        It defaults to the `image_data_format` value found in your
+        Keras config file at `~/.keras/keras.json`.
+        If you never set it, then it will be 'channels_last'.
+    activation: Activation function to use.
+        If you don't specify anything, no activation is applied
+        (ie. 'linear' activation: `a(x) = x`).
+    use_bias: Boolean, whether the layer uses a bias vector.
+    depthwise_initializer: Initializer for the depthwise kernel matrix.
+    bias_initializer: Initializer for the bias vector.
+    depthwise_regularizer: Regularizer function applied to
+        the depthwise kernel matrix.
+    bias_regularizer: Regularizer function applied to the bias vector.
+    activity_regularizer: Regularizer function applied to
+        the output of the layer (its 'activation').
+    depthwise_constraint: Constraint function applied to
+        the depthwise kernel matrix.
+    bias_constraint: Constraint function applied to the bias vector.
+
+  Input shape:
+    4D tensor with shape:
+    `[batch, channels, rows, cols]` if data_format='channels_first'
+    or 4D tensor with shape:
+    `[batch, rows, cols, channels]` if data_format='channels_last'.
+
+  Output shape:
+    4D tensor with shape:
+    `[batch, filters, new_rows, new_cols]` if data_format='channels_first'
+    or 4D tensor with shape:
+    `[batch, new_rows, new_cols, filters]` if data_format='channels_last'.
+    `rows` and `cols` values might have changed due to padding.
+  """
+
+  def __init__(self,
+               kernel_size,
+               strides=(1, 1),
+               padding='valid',
+               depth_multiplier=1,
+               data_format=None,
+               activation=None,
+               use_bias=True,
+               depthwise_initializer='glorot_uniform',
+               bias_initializer='zeros',
+               depthwise_regularizer=None,
+               bias_regularizer=None,
+               activity_regularizer=None,
+               depthwise_constraint=None,
+               bias_constraint=None,
+               **kwargs):
+    super(DepthwiseConv2D, self).__init__(
+        filters=None,
+        kernel_size=kernel_size,
+        strides=strides,
+        padding=padding,
+        data_format=data_format,
+        activation=activation,
+        use_bias=use_bias,
+        bias_regularizer=bias_regularizer,
+        activity_regularizer=activity_regularizer,
+        bias_constraint=bias_constraint,
+        **kwargs)
+    self.depth_multiplier = depth_multiplier
+    self.depthwise_initializer = initializers.get(depthwise_initializer)
+    self.depthwise_regularizer = regularizers.get(depthwise_regularizer)
+    self.depthwise_constraint = constraints.get(depthwise_constraint)
+    self.bias_initializer = initializers.get(bias_initializer)
+
+  def build(self, input_shape):
+    if len(input_shape) < 4:
+      raise ValueError('Inputs to `DepthwiseConv2D` should have rank 4. '
+                       'Received input shape:', str(input_shape))
+    if self.data_format == 'channels_first':
+      channel_axis = 1
+    else:
+      channel_axis = 3
+    if input_shape[channel_axis] is None:
+      raise ValueError('The channel dimension of the inputs to '
+                       '`DepthwiseConv2D` '
+                       'should be defined. Found `None`.')
+    input_dim = int(input_shape[channel_axis])
+    depthwise_kernel_shape = (self.kernel_size[0],
+                              self.kernel_size[1],
+                              input_dim,
+                              self.depth_multiplier)
+
+    self.depthwise_kernel = self.add_weight(
+        shape=depthwise_kernel_shape,
+        initializer=self.depthwise_initializer,
+        name='depthwise_kernel',
+        regularizer=self.depthwise_regularizer,
+        constraint=self.depthwise_constraint)
+
+    if self.use_bias:
+      self.bias = self.add_weight(shape=(input_dim * self.depth_multiplier,),
+                                  initializer=self.bias_initializer,
+                                  name='bias',
+                                  regularizer=self.bias_regularizer,
+                                  constraint=self.bias_constraint)
+    else:
+      self.bias = None
+    # Set input spec.
+    self.input_spec = InputSpec(ndim=4, axes={channel_axis: input_dim})
+    self.built = True
+
+  def call(self, inputs, training=None):
+    outputs = backend.depthwise_conv2d(
+        inputs,
+        self.depthwise_kernel,
+        strides=self.strides,
+        padding=self.padding,
+        dilation_rate=self.dilation_rate,
+        data_format=self.data_format)
+
+    if self.bias:
+      outputs = backend.bias_add(
+          outputs,
+          self.bias,
+          data_format=self.data_format)
+
+    if self.activation is not None:
+      return self.activation(outputs)
+
+    return outputs
+
+  @shape_type_conversion
+  def compute_output_shape(self, input_shape):
+    if self.data_format == 'channels_first':
+      rows = input_shape[2]
+      cols = input_shape[3]
+      out_filters = input_shape[1] * self.depth_multiplier
+    elif self.data_format == 'channels_last':
+      rows = input_shape[1]
+      cols = input_shape[2]
+      out_filters = input_shape[3] * self.depth_multiplier
+
+    rows = conv_utils.conv_output_length(rows, self.kernel_size[0],
+                                         self.padding,
+                                         self.strides[0])
+    cols = conv_utils.conv_output_length(cols, self.kernel_size[1],
+                                         self.padding,
+                                         self.strides[1])
+    if self.data_format == 'channels_first':
+      return (input_shape[0], out_filters, rows, cols)
+    elif self.data_format == 'channels_last':
+      return (input_shape[0], rows, cols, out_filters)
+
   def get_config(self):
-    config = {
-        'filters':
-            self.filters,
-        'kernel_size':
-            self.kernel_size,
-        'strides':
-            self.strides,
-        'padding':
-            self.padding,
-        'data_format':
-            self.data_format,
-        'dilation_rate':
-            self.dilation_rate,
-        'activation':
-            activations.serialize(self.activation),
-        'use_bias':
-            self.use_bias,
-        'depthwise_initializer':
-            initializers.serialize(self.depthwise_initializer),
-        'pointwise_initializer':
-            initializers.serialize(self.pointwise_initializer),
-        'bias_initializer':
-            initializers.serialize(self.bias_initializer),
-        'depthwise_regularizer':
-            regularizers.serialize(self.depthwise_regularizer),
-        'pointwise_regularizer':
-            regularizers.serialize(self.pointwise_regularizer),
-        'bias_regularizer':
-            regularizers.serialize(self.bias_regularizer),
-        'activity_regularizer':
-            regularizers.serialize(self.activity_regularizer),
-        'depthwise_constraint':
-            constraints.serialize(self.depthwise_constraint),
-        'pointwise_constraint':
-            constraints.serialize(self.pointwise_constraint),
-        'bias_constraint':
-            constraints.serialize(self.bias_constraint)
-    }
-    base_config = super(SeparableConv2D, self).get_config()
-    return dict(list(base_config.items()) + list(config.items()))
+    config = super(DepthwiseConv2D, self).get_config()
+    config.pop('filters')
+    config.pop('kernel_initializer')
+    config.pop('kernel_regularizer')
+    config.pop('kernel_constraint')
+    config['depth_multiplier'] = self.depth_multiplier
+    config['depthwise_initializer'] = initializers.serialize(
+        self.depthwise_initializer)
+    config['depthwise_regularizer'] = regularizers.serialize(
+        self.depthwise_regularizer)
+    config['depthwise_constraint'] = constraints.serialize(
+        self.depthwise_constraint)
+    return config
 
 
 @tf_export('keras.layers.UpSampling1D')
@@ -1051,7 +1794,7 @@ class UpSampling1D(Layer):
     return tensor_shape.TensorShape([input_shape[0], size, input_shape[2]])
 
   def call(self, inputs):
-    output = K.repeat_elements(inputs, self.size, axis=1)
+    output = backend.repeat_elements(inputs, self.size, axis=1)
     return output
 
   def get_config(self):
@@ -1120,7 +1863,8 @@ class UpSampling2D(Layer):
           [input_shape[0], height, width, input_shape[3]])
 
   def call(self, inputs):
-    return K.resize_images(inputs, self.size[0], self.size[1], self.data_format)
+    return backend.resize_images(
+        inputs, self.size[0], self.size[1], self.data_format)
 
   def get_config(self):
     config = {'size': self.size, 'data_format': self.data_format}
@@ -1192,8 +1936,8 @@ class UpSampling3D(Layer):
           [input_shape[0], dim1, dim2, dim3, input_shape[4]])
 
   def call(self, inputs):
-    return K.resize_volumes(inputs, self.size[0], self.size[1], self.size[2],
-                            self.data_format)
+    return backend.resize_volumes(
+        inputs, self.size[0], self.size[1], self.size[2], self.data_format)
 
   def get_config(self):
     config = {'size': self.size, 'data_format': self.data_format}
@@ -1234,7 +1978,7 @@ class ZeroPadding1D(Layer):
     return tensor_shape.TensorShape([input_shape[0], length, input_shape[2]])
 
   def call(self, inputs):
-    return K.temporal_padding(inputs, padding=self.padding)
+    return backend.temporal_padding(inputs, padding=self.padding)
 
   def get_config(self):
     config = {'padding': self.padding}
@@ -1335,7 +2079,7 @@ class ZeroPadding2D(Layer):
           [input_shape[0], rows, cols, input_shape[3]])
 
   def call(self, inputs):
-    return K.spatial_2d_padding(
+    return backend.spatial_2d_padding(
         inputs, padding=self.padding, data_format=self.data_format)
 
   def get_config(self):
@@ -1453,7 +2197,7 @@ class ZeroPadding3D(Layer):
           [input_shape[0], dim1, dim2, dim3, input_shape[4]])
 
   def call(self, inputs):
-    return K.spatial_3d_padding(
+    return backend.spatial_3d_padding(
         inputs, padding=self.padding, data_format=self.data_format)
 
   def get_config(self):
diff --git a/tensorflow/python/keras/_impl/keras/layers/convolutional_recurrent.py b/tensorflow/python/keras/_impl/keras/layers/convolutional_recurrent.py
index d95a0942452afa82e277c358be5c3b2ba061ac98..6b2a1d98fe736744755134aa3aef26c433a604bc 100644
--- a/tensorflow/python/keras/_impl/keras/layers/convolutional_recurrent.py
+++ b/tensorflow/python/keras/_impl/keras/layers/convolutional_recurrent.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
+# pylint: disable=protected-access
 """Convolutional-recurrent layers.
 """
 from __future__ import absolute_import
@@ -26,179 +27,456 @@ from tensorflow.python.keras._impl.keras import constraints
 from tensorflow.python.keras._impl.keras import initializers
 from tensorflow.python.keras._impl.keras import regularizers
 from tensorflow.python.keras._impl.keras.engine import InputSpec
+from tensorflow.python.keras._impl.keras.engine import Layer
 from tensorflow.python.keras._impl.keras.engine.base_layer import shape_type_conversion
-from tensorflow.python.keras._impl.keras.layers.recurrent import Recurrent
+from tensorflow.python.keras._impl.keras.layers.recurrent import _generate_dropout_mask
+from tensorflow.python.keras._impl.keras.layers.recurrent import RNN
 from tensorflow.python.keras._impl.keras.utils import conv_utils
+from tensorflow.python.keras._impl.keras.utils import generic_utils
 from tensorflow.python.util.tf_export import tf_export
 
 
-class ConvRecurrent2D(Recurrent):
-  """Abstract base class for convolutional recurrent layers.
-
-  Do not use in a model -- it's not a functional layer!
+class ConvRNN2D(RNN):
+  """Base class for convolutional-recurrent layers.
 
   Arguments:
-      filters: Integer, the dimensionality of the output space
-          (i.e. the number of output filters in the convolution).
-      kernel_size: An integer or tuple/list of n integers, specifying the
-          dimensions of the convolution window.
-      strides: An integer or tuple/list of n integers,
-          specifying the strides of the convolution.
-          Specifying any stride value != 1 is incompatible with specifying
-          any `dilation_rate` value != 1.
-      padding: One of `"valid"` or `"same"` (case-insensitive).
-      data_format: A string,
-          one of `channels_last` (default) or `channels_first`.
-          The ordering of the dimensions in the inputs.
-          `channels_last` corresponds to inputs with shape
-          `(batch, time, ..., channels)`
-          while `channels_first` corresponds to
-          inputs with shape `(batch, time, channels, ...)`.
-          It defaults to the `image_data_format` value found in your
-          Keras config file at `~/.keras/keras.json`.
-          If you never set it, then it will be "channels_last".
-      dilation_rate: An integer or tuple/list of n integers, specifying
-          the dilation rate to use for dilated convolution.
-          Currently, specifying any `dilation_rate` value != 1 is
-          incompatible with specifying any `strides` value != 1.
-      return_sequences: Boolean. Whether to return the last output
-          in the output sequence, or the full sequence.
-      go_backwards: Boolean (default False).
-          If True, rocess the input sequence backwards.
-      stateful: Boolean (default False). If True, the last state
-          for each sample at index i in a batch will be used as initial
-          state for the sample of index i in the following batch.
+    cell: A RNN cell instance. A RNN cell is a class that has:
+        - a `call(input_at_t, states_at_t)` method, returning
+            `(output_at_t, states_at_t_plus_1)`. The call method of the
+            cell can also take the optional argument `constants`, see
+            section "Note on passing external constants" below.
+        - a `state_size` attribute. This can be a single integer
+            (single state) in which case it is
+            the number of channels of the recurrent state
+            (which should be the same as the number of channels of the cell
+            output). This can also be a list/tuple of integers
+            (one size per state). In this case, the first entry
+            (`state_size[0]`) should be the same as
+            the size of the cell output.
+    return_sequences: Boolean. Whether to return the last output.
+        in the output sequence, or the full sequence.
+    return_state: Boolean. Whether to return the last state
+        in addition to the output.
+    go_backwards: Boolean (default False).
+        If True, process the input sequence backwards and return the
+        reversed sequence.
+    stateful: Boolean (default False). If True, the last state
+        for each sample at index i in a batch will be used as initial
+        state for the sample of index i in the following batch.
+    input_shape: Use this argument to specify the shape of the
+        input when this layer is the first one in a model.
 
   Input shape:
-      5D tensor with shape `(num_samples, timesteps, channels, rows, cols)`.
+    5D tensor with shape:
+    `(samples, timesteps, channels, rows, cols)`
+    if data_format='channels_first' or 5D tensor with shape:
+    `(samples, timesteps, rows, cols, channels)`
+    if data_format='channels_last'.
 
   Output shape:
-      - if `return_sequences`: 5D tensor with shape
-          `(num_samples, timesteps, channels, rows, cols)`.
-      - else, 4D tensor with shape `(num_samples, channels, rows, cols)`.
-
-  # Masking
-      This layer supports masking for input data with a variable number
-      of timesteps. To introduce masks to your data,
-      use an `Embedding` layer with the `mask_zero` parameter
-      set to `True`.
-      **Note:** for the time being, masking is only supported with Theano.
-
-  # Note on using statefulness in RNNs
-      You can set RNN layers to be 'stateful', which means that the states
-      computed for the samples in one batch will be reused as initial states
-      for the samples in the next batch.
-      This assumes a one-to-one mapping between
-      samples in different successive batches.
-
-      To enable statefulness:
-          - specify `stateful=True` in the layer constructor.
-          - specify a fixed batch size for your model, by passing
-              a `batch_input_size=(...)` to the first layer in your model.
-              This is the expected shape of your inputs *including the batch
-              size*.
-              It should be a tuple of integers, e.g. `(32, 10, 100)`.
-
-      To reset the states of your model, call `.reset_states()` on either
-      a specific layer, or on your entire model.
+    - if `return_state`: a list of tensors. The first tensor is
+        the output. The remaining tensors are the last states,
+        each 5D tensor with shape:
+        `(samples, timesteps, filters, new_rows, new_cols)`
+        if data_format='channels_first'
+        or 5D tensor with shape:
+        `(samples, timesteps, new_rows, new_cols, filters)`
+        if data_format='channels_last'.
+        `rows` and `cols` values might have changed due to padding.
+    - if `return_sequences`: 5D tensor with shape:
+        `(samples, timesteps, filters, new_rows, new_cols)`
+        if data_format='channels_first'
+        or 5D tensor with shape:
+        `(samples, timesteps, new_rows, new_cols, filters)`
+        if data_format='channels_last'.
+    - else, 4D tensor with shape:
+        `(samples, filters, new_rows, new_cols)`
+        if data_format='channels_first'
+        or 4D tensor with shape:
+        `(samples, new_rows, new_cols, filters)`
+        if data_format='channels_last'.
+
+  Masking:
+    This layer supports masking for input data with a variable number
+    of timesteps. To introduce masks to your data,
+    use an Embedding layer with the `mask_zero` parameter
+    set to `True`.
+
+  Note on using statefulness in RNNs:
+    You can set RNN layers to be 'stateful', which means that the states
+    computed for the samples in one batch will be reused as initial states
+    for the samples in the next batch. This assumes a one-to-one mapping
+    between samples in different successive batches.
+    To enable statefulness:
+        - specify `stateful=True` in the layer constructor.
+        - specify a fixed batch size for your model, by passing
+             - if sequential model:
+                `batch_input_shape=(...)` to the first layer in your model.
+             - if functional model with 1 or more Input layers:
+                `batch_shape=(...)` to all the first layers in your model.
+                This is the expected shape of your inputs
+                *including the batch size*.
+                It should be a tuple of integers,
+                e.g. `(32, 10, 100, 100, 32)`.
+                Note that the number of rows and columns should be specified
+                too.
+        - specify `shuffle=False` when calling fit().
+    To reset the states of your model, call `.reset_states()` on either
+    a specific layer, or on your entire model.
+
+  Note on specifying the initial state of RNNs:
+    You can specify the initial state of RNN layers symbolically by
+    calling them with the keyword argument `initial_state`. The value of
+    `initial_state` should be a tensor or list of tensors representing
+    the initial state of the RNN layer.
+    You can specify the initial state of RNN layers numerically by
+    calling `reset_states` with the keyword argument `states`. The value of
+    `states` should be a numpy array or list of numpy arrays representing
+    the initial state of the RNN layer.
+
+  Note on passing external constants to RNNs:
+    You can pass "external" constants to the cell using the `constants`
+    keyword argument of `RNN.__call__` (as well as `RNN.call`) method. This
+    requires that the `cell.call` method accepts the same keyword argument
+    `constants`. Such constants can be used to condition the cell
+    transformation on additional static inputs (not changing over time),
+    a.k.a. an attention mechanism.
   """
 
   def __init__(self,
-               filters,
-               kernel_size,
-               strides=(1, 1),
-               padding='valid',
-               data_format=None,
-               dilation_rate=(1, 1),
+               cell,
                return_sequences=False,
+               return_state=False,
                go_backwards=False,
                stateful=False,
+               unroll=False,
                **kwargs):
-    super(ConvRecurrent2D, self).__init__(**kwargs)
-    self.filters = filters
-    self.kernel_size = conv_utils.normalize_tuple(kernel_size, 2, 'kernel_size')
-    self.strides = conv_utils.normalize_tuple(strides, 2, 'strides')
-    self.padding = conv_utils.normalize_padding(padding)
-    self.data_format = conv_utils.normalize_data_format(data_format)
-    self.dilation_rate = conv_utils.normalize_tuple(dilation_rate, 2,
-                                                    'dilation_rate')
-    self.return_sequences = return_sequences
-    self.go_backwards = go_backwards
-    self.stateful = stateful
+    if unroll:
+      raise TypeError('Unrolling isn\'t possible with '
+                      'convolutional RNNs.')
+    if isinstance(cell, (list, tuple)):
+      # The StackedConvRNN2DCells isn't implemented yet.
+      raise TypeError('It is not possible at the moment to'
+                      'stack convolutional cells.')
+    super(ConvRNN2D, self).__init__(cell,
+                                    return_sequences,
+                                    return_state,
+                                    go_backwards,
+                                    stateful,
+                                    unroll,
+                                    **kwargs)
     self.input_spec = [InputSpec(ndim=5)]
-    self.state_spec = None
+    self.states = None
 
   @shape_type_conversion
   def compute_output_shape(self, input_shape):
     if isinstance(input_shape, list):
       input_shape = input_shape[0]
-    if self.data_format == 'channels_first':
+
+    cell = self.cell
+    if cell.data_format == 'channels_first':
       rows = input_shape[3]
       cols = input_shape[4]
-    elif self.data_format == 'channels_last':
+    elif cell.data_format == 'channels_last':
       rows = input_shape[2]
       cols = input_shape[3]
-    rows = conv_utils.conv_output_length(
-        rows,
-        self.kernel_size[0],
-        padding=self.padding,
-        stride=self.strides[0],
-        dilation=self.dilation_rate[0])
-    cols = conv_utils.conv_output_length(
-        cols,
-        self.kernel_size[1],
-        padding=self.padding,
-        stride=self.strides[1],
-        dilation=self.dilation_rate[1])
+    rows = conv_utils.conv_output_length(rows,
+                                         cell.kernel_size[0],
+                                         padding=cell.padding,
+                                         stride=cell.strides[0],
+                                         dilation=cell.dilation_rate[0])
+    cols = conv_utils.conv_output_length(cols,
+                                         cell.kernel_size[1],
+                                         padding=cell.padding,
+                                         stride=cell.strides[1],
+                                         dilation=cell.dilation_rate[1])
+
+    if cell.data_format == 'channels_first':
+      output_shape = input_shape[:2] + (cell.filters, rows, cols)
+    elif cell.data_format == 'channels_last':
+      output_shape = input_shape[:2] + (rows, cols, cell.filters)
+
+    if not self.return_sequences:
+      output_shape = output_shape[:1] + output_shape[2:]
+
+    if self.return_state:
+      output_shape = [output_shape]
+      if cell.data_format == 'channels_first':
+        output_shape += [(input_shape[0], cell.filters, rows, cols)
+                         for _ in range(2)]
+      elif cell.data_format == 'channels_last':
+        output_shape += [(input_shape[0], rows, cols, cell.filters)
+                         for _ in range(2)]
+    return output_shape
+
+  @shape_type_conversion
+  def build(self, input_shape):
+    # Note input_shape will be list of shapes of initial states and
+    # constants if these are passed in __call__.
+    if self._num_constants is not None:
+      constants_shape = input_shape[-self._num_constants:]
+    else:
+      constants_shape = None
+
+    if isinstance(input_shape, list):
+      input_shape = input_shape[0]
+
+    batch_size = input_shape[0] if self.stateful else None
+    self.input_spec[0] = InputSpec(shape=(batch_size, None) + input_shape[2:5])
+
+    # allow cell (if layer) to build before we set or validate state_spec
+    if isinstance(self.cell, Layer):
+      step_input_shape = (input_shape[0],) + input_shape[2:]
+      if constants_shape is not None:
+        self.cell.build([step_input_shape] + constants_shape)
+      else:
+        self.cell.build(step_input_shape)
+
+    # set or validate state_spec
+    if hasattr(self.cell.state_size, '__len__'):
+      state_size = list(self.cell.state_size)
+    else:
+      state_size = [self.cell.state_size]
+
+    if self.state_spec is not None:
+      # initial_state was passed in call, check compatibility
+      if self.cell.data_format == 'channels_first':
+        ch_dim = 1
+      elif self.cell.data_format == 'channels_last':
+        ch_dim = 3
+      if [spec.shape[ch_dim] for spec in self.state_spec] != state_size:
+        raise ValueError(
+            'An initial_state was passed that is not compatible with '
+            '`cell.state_size`. Received `state_spec`={}; '
+            'However `cell.state_size` is '
+            '{}'.format([spec.shape for spec in self.state_spec],
+                        self.cell.state_size))
+    else:
+      if self.cell.data_format == 'channels_first':
+        self.state_spec = [InputSpec(shape=(None, dim, None, None))
+                           for dim in state_size]
+      elif self.cell.data_format == 'channels_last':
+        self.state_spec = [InputSpec(shape=(None, None, None, dim))
+                           for dim in state_size]
+    if self.stateful:
+      self.reset_states()
+    self.built = True
+
+  def get_initial_state(self, inputs):
+    # (samples, timesteps, rows, cols, filters)
+    initial_state = K.zeros_like(inputs)
+    # (samples, rows, cols, filters)
+    initial_state = K.sum(initial_state, axis=1)
+    shape = list(self.cell.kernel_shape)
+    shape[-1] = self.cell.filters
+    initial_state = self.cell.input_conv(initial_state,
+                                         K.zeros(tuple(shape)),
+                                         padding=self.cell.padding)
+
+    if hasattr(self.cell.state_size, '__len__'):
+      return [initial_state for _ in self.cell.state_size]
+    else:
+      return [initial_state]
+
+  def __call__(self, inputs, initial_state=None, constants=None, **kwargs):
+    inputs, initial_state, constants = self._standardize_args(
+        inputs, initial_state, constants)
+
+    if initial_state is None and constants is None:
+      return super(ConvRNN2D, self).__call__(inputs, **kwargs)
+
+    # If any of `initial_state` or `constants` are specified and are Keras
+    # tensors, then add them to the inputs and temporarily modify the
+    # input_spec to include them.
+
+    additional_inputs = []
+    additional_specs = []
+    if initial_state is not None:
+      kwargs['initial_state'] = initial_state
+      additional_inputs += initial_state
+      self.state_spec = []
+      for state in initial_state:
+        shape = K.int_shape(state)
+        self.state_spec.append(InputSpec(shape=shape))
+
+      additional_specs += self.state_spec
+    if constants is not None:
+      kwargs['constants'] = constants
+      additional_inputs += constants
+      self.constants_spec = [InputSpec(shape=K.int_shape(constant))
+                             for constant in constants]
+      self._num_constants = len(constants)
+      additional_specs += self.constants_spec
+    # at this point additional_inputs cannot be empty
+    for tensor in additional_inputs:
+      if K.is_keras_tensor(tensor) != K.is_keras_tensor(additional_inputs[0]):
+        raise ValueError('The initial state or constants of an RNN'
+                         ' layer cannot be specified with a mix of'
+                         ' Keras tensors and non-Keras tensors')
+
+    if K.is_keras_tensor(additional_inputs[0]):
+      # Compute the full input spec, including state and constants
+      full_input = [inputs] + additional_inputs
+      full_input_spec = self.input_spec + additional_specs
+      # Perform the call with temporarily replaced input_spec
+      original_input_spec = self.input_spec
+      self.input_spec = full_input_spec
+      output = super(ConvRNN2D, self).__call__(full_input, **kwargs)
+      self.input_spec = original_input_spec
+      return output
+    else:
+      return super(ConvRNN2D, self).__call__(inputs, **kwargs)
+
+  def call(self,
+           inputs,
+           mask=None,
+           training=None,
+           initial_state=None,
+           constants=None):
+    # note that the .build() method of subclasses MUST define
+    # self.input_spec and self.state_spec with complete input shapes.
+    if isinstance(inputs, list):
+      inputs = inputs[0]
+    if initial_state is not None:
+      pass
+    elif self.stateful:
+      initial_state = self.states
+    else:
+      initial_state = self.get_initial_state(inputs)
+
+    if isinstance(mask, list):
+      mask = mask[0]
+
+    if len(initial_state) != len(self.states):
+      raise ValueError('Layer has ' + str(len(self.states)) +
+                       ' states but was passed ' +
+                       str(len(initial_state)) +
+                       ' initial states.')
+    timesteps = K.int_shape(inputs)[1]
+
+    kwargs = {}
+    if generic_utils.has_arg(self.cell.call, 'training'):
+      kwargs['training'] = training
+
+    if constants:
+      if not generic_utils.has_arg(self.cell.call, 'constants'):
+        raise ValueError('RNN cell does not support constants')
+
+      def step(inputs, states):
+        constants = states[-self._num_constants:]
+        states = states[:-self._num_constants]
+        return self.cell.call(inputs, states, constants=constants,
+                              **kwargs)
+    else:
+      def step(inputs, states):
+        return self.cell.call(inputs, states, **kwargs)
+
+    last_output, outputs, states = K.rnn(step,
+                                         inputs,
+                                         initial_state,
+                                         constants=constants,
+                                         go_backwards=self.go_backwards,
+                                         mask=mask,
+                                         input_length=timesteps)
+    if self.stateful:
+      updates = []
+      for i in range(len(states)):
+        updates.append(K.update(self.states[i], states[i]))
+      self.add_update(updates, inputs=True)
+
     if self.return_sequences:
-      if self.data_format == 'channels_first':
-        output_shape = (input_shape[0], input_shape[1], self.filters, rows,
-                        cols)
-      elif self.data_format == 'channels_last':
-        output_shape = (input_shape[0], input_shape[1], rows, cols,
-                        self.filters)
+      output = outputs
     else:
-      if self.data_format == 'channels_first':
-        output_shape = (input_shape[0], self.filters, rows, cols)
-      elif self.data_format == 'channels_last':
-        output_shape = (input_shape[0], rows, cols, self.filters)
+      output = last_output
+
+    # Properly set learning phase
+    if getattr(last_output, '_uses_learning_phase', False):
+      output._uses_learning_phase = True
 
     if self.return_state:
-      if self.data_format == 'channels_first':
-        output_shape = [output_shape] + [
-            (input_shape[0], self.filters, rows, cols) for _ in range(2)
-        ]
-      elif self.data_format == 'channels_last':
-        output_shape = [output_shape] + [
-            (input_shape[0], rows, cols, self.filters) for _ in range(2)
-        ]
+      if not isinstance(states, (list, tuple)):
+        states = [states]
+      else:
+        states = list(states)
+      return [output] + states
+    else:
+      return output
 
-    return output_shape
+  def reset_states(self, states=None):
+    if not self.stateful:
+      raise AttributeError('Layer must be stateful.')
+    input_shape = self.input_spec[0].shape
+    state_shape = self.compute_output_shape(input_shape)
+    if self.return_state:
+      state_shape = state_shape[0]
+    if self.return_sequences:
+      state_shape = state_shape[:1].concatenate(state_shape[2:])
+    if None in state_shape:
+      raise ValueError('If a RNN is stateful, it needs to know '
+                       'its batch size. Specify the batch size '
+                       'of your input tensors: \n'
+                       '- If using a Sequential model, '
+                       'specify the batch size by passing '
+                       'a `batch_input_shape` '
+                       'argument to your first layer.\n'
+                       '- If using the functional API, specify '
+                       'the time dimension by passing a '
+                       '`batch_shape` argument to your Input layer.\n'
+                       'The same thing goes for the number of rows and '
+                       'columns.')
 
-  def get_config(self):
-    config = {
-        'filters': self.filters,
-        'kernel_size': self.kernel_size,
-        'strides': self.strides,
-        'padding': self.padding,
-        'data_format': self.data_format,
-        'dilation_rate': self.dilation_rate,
-        'return_sequences': self.return_sequences,
-        'go_backwards': self.go_backwards,
-        'stateful': self.stateful
-    }
-    base_config = super(ConvRecurrent2D, self).get_config()
-    return dict(list(base_config.items()) + list(config.items()))
+    # helper function
+    def get_tuple_shape(nb_channels):
+      result = list(state_shape)
+      if self.cell.data_format == 'channels_first':
+        result[1] = nb_channels
+      elif self.cell.data_format == 'channels_last':
+        result[3] = nb_channels
+      else:
+        raise KeyError
+      return tuple(result)
 
+    # initialize state if None
+    if self.states[0] is None:
+      if hasattr(self.cell.state_size, '__len__'):
+        self.states = [K.zeros(get_tuple_shape(dim))
+                       for dim in self.cell.state_size]
+      else:
+        self.states = [K.zeros(get_tuple_shape(self.cell.state_size))]
+    elif states is None:
+      if hasattr(self.cell.state_size, '__len__'):
+        for state, dim in zip(self.states, self.cell.state_size):
+          K.set_value(state, np.zeros(get_tuple_shape(dim)))
+      else:
+        K.set_value(self.states[0],
+                    np.zeros(get_tuple_shape(self.cell.state_size)))
+    else:
+      if not isinstance(states, (list, tuple)):
+        states = [states]
+      if len(states) != len(self.states):
+        raise ValueError('Layer ' + self.name + ' expects ' +
+                         str(len(self.states)) + ' states, ' +
+                         'but it received ' + str(len(states)) +
+                         ' state values. Input received: ' + str(states))
+      for index, (value, state) in enumerate(zip(states, self.states)):
+        if hasattr(self.cell.state_size, '__len__'):
+          dim = self.cell.state_size[index]
+        else:
+          dim = self.cell.state_size
+        if value.shape != get_tuple_shape(dim):
+          raise ValueError('State ' + str(index) +
+                           ' is incompatible with layer ' +
+                           self.name + ': expected shape=' +
+                           str(get_tuple_shape(dim)) +
+                           ', found shape=' + str(value.shape))
+        # TODO(anjalisridhar): consider batch calls to `set_value`.
+        K.set_value(state, value)
 
-@tf_export('keras.layers.ConvLSTM2D')
-class ConvLSTM2D(ConvRecurrent2D):
-  """Convolutional LSTM.
 
-  It is similar to an LSTM layer, but the input transformations
-  and recurrent transformations are both convolutional.
+class ConvLSTM2DCell(Layer):
+  """Cell class for the ConvLSTM2D layer.
 
-  Arguments:
+  # Arguments
       filters: Integer, the dimensionality of the output space
           (i.e. the number of output filters in the convolution).
       kernel_size: An integer or tuple/list of n integers, specifying the
@@ -210,11 +488,6 @@ class ConvLSTM2D(ConvRecurrent2D):
       padding: One of `"valid"` or `"same"` (case-insensitive).
       data_format: A string,
           one of `channels_last` (default) or `channels_first`.
-          The ordering of the dimensions in the inputs.
-          `channels_last` corresponds to inputs with shape
-          `(batch, time, ..., channels)`
-          while `channels_first` corresponds to
-          inputs with shape `(batch, time, channels, ...)`.
           It defaults to the `image_data_format` value found in your
           Keras config file at `~/.keras/keras.json`.
           If you never set it, then it will be "channels_last".
@@ -229,71 +502,32 @@ class ConvLSTM2D(ConvRecurrent2D):
           for the recurrent step.
       use_bias: Boolean, whether the layer uses a bias vector.
       kernel_initializer: Initializer for the `kernel` weights matrix,
-          used for the linear transformation of the inputs..
+          used for the linear transformation of the inputs.
       recurrent_initializer: Initializer for the `recurrent_kernel`
           weights matrix,
-          used for the linear transformation of the recurrent state..
+          used for the linear transformation of the recurrent state.
       bias_initializer: Initializer for the bias vector.
       unit_forget_bias: Boolean.
           If True, add 1 to the bias of the forget gate at initialization.
           Use in combination with `bias_initializer="zeros"`.
-          This is recommended in [Jozefowicz et
-            al.](http://www.jmlr.org/proceedings/papers/v37/jozefowicz15.pdf)
+          This is recommended in [Jozefowicz et al.]
+          (http://www.jmlr.org/proceedings/papers/v37/jozefowicz15.pdf)
       kernel_regularizer: Regularizer function applied to
           the `kernel` weights matrix.
       recurrent_regularizer: Regularizer function applied to
           the `recurrent_kernel` weights matrix.
       bias_regularizer: Regularizer function applied to the bias vector.
-      activity_regularizer: Regularizer function applied to
-          the output of the layer (its "activation")..
       kernel_constraint: Constraint function applied to
           the `kernel` weights matrix.
       recurrent_constraint: Constraint function applied to
           the `recurrent_kernel` weights matrix.
       bias_constraint: Constraint function applied to the bias vector.
-      return_sequences: Boolean. Whether to return the last output
-          in the output sequence, or the full sequence.
-      go_backwards: Boolean (default False).
-          If True, rocess the input sequence backwards.
-      stateful: Boolean (default False). If True, the last state
-          for each sample at index i in a batch will be used as initial
-          state for the sample of index i in the following batch.
       dropout: Float between 0 and 1.
           Fraction of the units to drop for
           the linear transformation of the inputs.
       recurrent_dropout: Float between 0 and 1.
           Fraction of the units to drop for
           the linear transformation of the recurrent state.
-
-  Input shape:
-      - if data_format='channels_first'
-          5D tensor with shape:
-          `(samples,time, channels, rows, cols)`
-      - if data_format='channels_last'
-          5D tensor with shape:
-          `(samples,time, rows, cols, channels)`
-
-   Output shape:
-      - if `return_sequences`
-           - if data_format='channels_first'
-              5D tensor with shape:
-              `(samples, time, filters, output_row, output_col)`
-           - if data_format='channels_last'
-              5D tensor with shape:
-              `(samples, time, output_row, output_col, filters)`
-      - else
-          - if data_format ='channels_first'
-              4D tensor with shape:
-              `(samples, filters, output_row, output_col)`
-          - if data_format='channels_last'
-              4D tensor with shape:
-              `(samples, output_row, output_col, filters)`
-          where o_row and o_col depend on the shape of the filter and
-          the padding
-
-  Raises:
-      ValueError: in case of invalid constructor arguments.
-
   """
 
   def __init__(self,
@@ -313,27 +547,20 @@ class ConvLSTM2D(ConvRecurrent2D):
                kernel_regularizer=None,
                recurrent_regularizer=None,
                bias_regularizer=None,
-               activity_regularizer=None,
                kernel_constraint=None,
                recurrent_constraint=None,
                bias_constraint=None,
-               return_sequences=False,
-               go_backwards=False,
-               stateful=False,
                dropout=0.,
                recurrent_dropout=0.,
                **kwargs):
-    super(ConvLSTM2D, self).__init__(
-        filters,
-        kernel_size,
-        strides=strides,
-        padding=padding,
-        data_format=data_format,
-        dilation_rate=dilation_rate,
-        return_sequences=return_sequences,
-        go_backwards=go_backwards,
-        stateful=stateful,
-        **kwargs)
+    super(ConvLSTM2DCell, self).__init__(**kwargs)
+    self.filters = filters
+    self.kernel_size = conv_utils.normalize_tuple(kernel_size, 2, 'kernel_size')
+    self.strides = conv_utils.normalize_tuple(strides, 2, 'strides')
+    self.padding = conv_utils.normalize_padding(padding)
+    self.data_format = conv_utils.normalize_data_format(data_format)
+    self.dilation_rate = conv_utils.normalize_tuple(dilation_rate, 2,
+                                                    'dilation_rate')
     self.activation = activations.get(activation)
     self.recurrent_activation = activations.get(recurrent_activation)
     self.use_bias = use_bias
@@ -346,7 +573,6 @@ class ConvLSTM2D(ConvRecurrent2D):
     self.kernel_regularizer = regularizers.get(kernel_regularizer)
     self.recurrent_regularizer = regularizers.get(recurrent_regularizer)
     self.bias_regularizer = regularizers.get(bias_regularizer)
-    self.activity_regularizer = regularizers.get(activity_regularizer)
 
     self.kernel_constraint = constraints.get(kernel_constraint)
     self.recurrent_constraint = constraints.get(recurrent_constraint)
@@ -354,45 +580,29 @@ class ConvLSTM2D(ConvRecurrent2D):
 
     self.dropout = min(1., max(0., dropout))
     self.recurrent_dropout = min(1., max(0., recurrent_dropout))
-    self.state_spec = [InputSpec(ndim=4), InputSpec(ndim=4)]
+    self.state_size = (self.filters, self.filters)
+    self._dropout_mask = None
+    self._recurrent_dropout_mask = None
 
-  @shape_type_conversion
   def build(self, input_shape):
-    if isinstance(input_shape, list):
-      input_shape = input_shape[0]
-    batch_size = input_shape[0] if self.stateful else None
-    self.input_spec[0] = InputSpec(shape=(batch_size, None) + input_shape[2:])
-    if self.stateful:
-      self.reset_states()
-    else:
-      # initial states: 2 all-zero tensor of shape (filters)
-      self.states = [None, None]
 
     if self.data_format == 'channels_first':
-      channel_axis = 2
+      channel_axis = 1
     else:
       channel_axis = -1
     if input_shape[channel_axis] is None:
       raise ValueError('The channel dimension of the inputs '
                        'should be defined. Found `None`.')
     input_dim = input_shape[channel_axis]
-    state_shape = [None] * 4
-    state_shape[channel_axis] = input_dim
-    state_shape = tuple(state_shape)
-    self.state_spec = [
-        InputSpec(shape=state_shape),
-        InputSpec(shape=state_shape)
-    ]
     kernel_shape = self.kernel_size + (input_dim, self.filters * 4)
     self.kernel_shape = kernel_shape
     recurrent_kernel_shape = self.kernel_size + (self.filters, self.filters * 4)
 
-    self.kernel = self.add_weight(
-        shape=kernel_shape,
-        initializer=self.kernel_initializer,
-        name='kernel',
-        regularizer=self.kernel_regularizer,
-        constraint=self.kernel_constraint)
+    self.kernel = self.add_weight(shape=kernel_shape,
+                                  initializer=self.kernel_initializer,
+                                  name='kernel',
+                                  regularizer=self.kernel_regularizer,
+                                  constraint=self.kernel_constraint)
     self.recurrent_kernel = self.add_weight(
         shape=recurrent_kernel_shape,
         initializer=self.recurrent_initializer,
@@ -400,25 +610,24 @@ class ConvLSTM2D(ConvRecurrent2D):
         regularizer=self.recurrent_regularizer,
         constraint=self.recurrent_constraint)
     if self.use_bias:
-      self.bias = self.add_weight(
-          shape=(self.filters * 4,),
-          initializer=self.bias_initializer,
-          name='bias',
-          regularizer=self.bias_regularizer,
-          constraint=self.bias_constraint)
+      self.bias = self.add_weight(shape=(self.filters * 4,),
+                                  initializer=self.bias_initializer,
+                                  name='bias',
+                                  regularizer=self.bias_regularizer,
+                                  constraint=self.bias_constraint)
       if self.unit_forget_bias:
         bias_value = np.zeros((self.filters * 4,))
-        bias_value[self.filters:self.filters * 2] = 1.
+        bias_value[self.filters: self.filters * 2] = 1.
         K.set_value(self.bias, bias_value)
     else:
       self.bias = None
 
     self.kernel_i = self.kernel[:, :, :, :self.filters]
     self.recurrent_kernel_i = self.recurrent_kernel[:, :, :, :self.filters]
-    self.kernel_f = self.kernel[:, :, :, self.filters:self.filters * 2]
+    self.kernel_f = self.kernel[:, :, :, self.filters: self.filters * 2]
     self.recurrent_kernel_f = self.recurrent_kernel[:, :, :, self.filters:
                                                     self.filters * 2]
-    self.kernel_c = self.kernel[:, :, :, self.filters * 2:self.filters * 3]
+    self.kernel_c = self.kernel[:, :, :, self.filters * 2: self.filters * 3]
     self.recurrent_kernel_c = self.recurrent_kernel[:, :, :, self.filters * 2:
                                                     self.filters * 3]
     self.kernel_o = self.kernel[:, :, :, self.filters * 3:]
@@ -426,8 +635,8 @@ class ConvLSTM2D(ConvRecurrent2D):
 
     if self.use_bias:
       self.bias_i = self.bias[:self.filters]
-      self.bias_f = self.bias[self.filters:self.filters * 2]
-      self.bias_c = self.bias[self.filters * 2:self.filters * 3]
+      self.bias_f = self.bias[self.filters: self.filters * 2]
+      self.bias_c = self.bias[self.filters * 2: self.filters * 3]
       self.bias_o = self.bias[self.filters * 3:]
     else:
       self.bias_i = None
@@ -436,166 +645,419 @@ class ConvLSTM2D(ConvRecurrent2D):
       self.bias_o = None
     self.built = True
 
-  def get_initial_state(self, inputs):
-    # (samples, timesteps, rows, cols, filters)
-    initial_state = K.zeros_like(inputs)
-    # (samples, rows, cols, filters)
-    initial_state = K.sum(initial_state, axis=1)
-    shape = list(self.kernel_shape)
-    shape[-1] = self.filters
-    initial_state = self.input_conv(
-        initial_state, K.zeros(tuple(shape)), padding=self.padding)
-
-    initial_states = [initial_state for _ in range(2)]
-    return initial_states
+  def call(self, inputs, states, training=None):
+    if 0 < self.dropout < 1 and self._dropout_mask is None:
+      self._dropout_mask = _generate_dropout_mask(
+          K.ones_like(inputs),
+          self.dropout,
+          training=training,
+          count=4)
+    if (0 < self.recurrent_dropout < 1 and
+        self._recurrent_dropout_mask is None):
+      self._recurrent_dropout_mask = _generate_dropout_mask(
+          K.ones_like(states[1]),
+          self.recurrent_dropout,
+          training=training,
+          count=4)
 
-  def reset_states(self):
-    if not self.stateful:
-      raise RuntimeError('Layer must be stateful.')
-    input_shape = self.input_spec[0].shape
+    # dropout matrices for input units
+    dp_mask = self._dropout_mask
+    # dropout matrices for recurrent units
+    rec_dp_mask = self._recurrent_dropout_mask
 
-    if not input_shape[0]:
-      raise ValueError('If a RNN is stateful, a complete '
-                       'input_shape must be provided '
-                       '(including batch size). '
-                       'Got input shape: ' + str(input_shape))
+    h_tm1 = states[0]  # previous memory state
+    c_tm1 = states[1]  # previous carry state
 
-    if self.return_state:
-      output_shape = tuple(self.compute_output_shape(input_shape)[0].as_list())
+    if 0 < self.dropout < 1.:
+      inputs_i = inputs * dp_mask[0]
+      inputs_f = inputs * dp_mask[1]
+      inputs_c = inputs * dp_mask[2]
+      inputs_o = inputs * dp_mask[3]
     else:
-      output_shape = tuple(self.compute_output_shape(input_shape).as_list())
-    if self.return_sequences:
-      output_shape = (input_shape[0],) + output_shape[2:]
-    else:
-      output_shape = (input_shape[0],) + output_shape[1:]
+      inputs_i = inputs
+      inputs_f = inputs
+      inputs_c = inputs
+      inputs_o = inputs
 
-    if hasattr(self, 'states'):
-      K.set_value(self.states[0],
-                  np.zeros(output_shape))
-      K.set_value(self.states[1],
-                  np.zeros(output_shape))
-    else:
-      self.states = [
-          K.zeros(output_shape),
-          K.zeros(output_shape)
-      ]
-
-  def get_constants(self, inputs, training=None):
-    constants = []
-    if self.implementation == 0 and 0 < self.dropout < 1:
-      ones = K.zeros_like(inputs)
-      ones = K.sum(ones, axis=1)
-      ones += 1
-
-      def dropped_inputs():
-        return K.dropout(ones, self.dropout)
-
-      dp_mask = [
-          K.in_train_phase(dropped_inputs, ones, training=training)
-          for _ in range(4)
-      ]
-      constants.append(dp_mask)
-    else:
-      constants.append([K.cast_to_floatx(1.) for _ in range(4)])
-
-    if 0 < self.recurrent_dropout < 1:
-      shape = list(self.kernel_shape)
-      shape[-1] = self.filters
-      ones = K.zeros_like(inputs)
-      ones = K.sum(ones, axis=1)
-      ones = self.input_conv(ones, K.zeros(shape), padding=self.padding)
-      ones += 1.
-
-      def dropped_inputs():  # pylint: disable=function-redefined
-        return K.dropout(ones, self.recurrent_dropout)
-
-      rec_dp_mask = [
-          K.in_train_phase(dropped_inputs, ones, training=training)
-          for _ in range(4)
-      ]
-      constants.append(rec_dp_mask)
+    if 0 < self.recurrent_dropout < 1.:
+      h_tm1_i = h_tm1 * rec_dp_mask[0]
+      h_tm1_f = h_tm1 * rec_dp_mask[1]
+      h_tm1_c = h_tm1 * rec_dp_mask[2]
+      h_tm1_o = h_tm1 * rec_dp_mask[3]
     else:
-      constants.append([K.cast_to_floatx(1.) for _ in range(4)])
-    return constants
+      h_tm1_i = h_tm1
+      h_tm1_f = h_tm1
+      h_tm1_c = h_tm1
+      h_tm1_o = h_tm1
+
+    x_i = self.input_conv(inputs_i, self.kernel_i, self.bias_i,
+                          padding=self.padding)
+    x_f = self.input_conv(inputs_f, self.kernel_f, self.bias_f,
+                          padding=self.padding)
+    x_c = self.input_conv(inputs_c, self.kernel_c, self.bias_c,
+                          padding=self.padding)
+    x_o = self.input_conv(inputs_o, self.kernel_o, self.bias_o,
+                          padding=self.padding)
+    h_i = self.recurrent_conv(h_tm1_i,
+                              self.recurrent_kernel_i)
+    h_f = self.recurrent_conv(h_tm1_f,
+                              self.recurrent_kernel_f)
+    h_c = self.recurrent_conv(h_tm1_c,
+                              self.recurrent_kernel_c)
+    h_o = self.recurrent_conv(h_tm1_o,
+                              self.recurrent_kernel_o)
+
+    i = self.recurrent_activation(x_i + h_i)
+    f = self.recurrent_activation(x_f + h_f)
+    c = f * c_tm1 + i * self.activation(x_c + h_c)
+    o = self.recurrent_activation(x_o + h_o)
+    h = o * self.activation(c)
+
+    if 0 < self.dropout + self.recurrent_dropout:
+      if training is None:
+        h._uses_learning_phase = True
+
+    return h, [h, c]
 
   def input_conv(self, x, w, b=None, padding='valid'):
-    conv_out = K.conv2d(
-        x,
-        w,
-        strides=self.strides,
-        padding=padding,
-        data_format=self.data_format,
-        dilation_rate=self.dilation_rate)
+    conv_out = K.conv2d(x, w, strides=self.strides,
+                        padding=padding,
+                        data_format=self.data_format,
+                        dilation_rate=self.dilation_rate)
     if b is not None:
-      conv_out = K.bias_add(conv_out, b, data_format=self.data_format)
+      conv_out = K.bias_add(conv_out, b,
+                            data_format=self.data_format)
     return conv_out
 
   def recurrent_conv(self, x, w):
-    conv_out = K.conv2d(
-        x, w, strides=(1, 1), padding='same', data_format=self.data_format)
+    conv_out = K.conv2d(x, w, strides=(1, 1),
+                        padding='same',
+                        data_format=self.data_format)
     return conv_out
 
-  def step(self, inputs, states):
-    assert len(states) == 4
-    h_tm1 = states[0]
-    c_tm1 = states[1]
-    dp_mask = states[2]
-    rec_dp_mask = states[3]
-
-    x_i = self.input_conv(
-        inputs * dp_mask[0], self.kernel_i, self.bias_i, padding=self.padding)
-    x_f = self.input_conv(
-        inputs * dp_mask[1], self.kernel_f, self.bias_f, padding=self.padding)
-    x_c = self.input_conv(
-        inputs * dp_mask[2], self.kernel_c, self.bias_c, padding=self.padding)
-    x_o = self.input_conv(
-        inputs * dp_mask[3], self.kernel_o, self.bias_o, padding=self.padding)
-    h_i = self.recurrent_conv(h_tm1 * rec_dp_mask[0], self.recurrent_kernel_i)
-    h_f = self.recurrent_conv(h_tm1 * rec_dp_mask[1], self.recurrent_kernel_f)
-    h_c = self.recurrent_conv(h_tm1 * rec_dp_mask[2], self.recurrent_kernel_c)
-    h_o = self.recurrent_conv(h_tm1 * rec_dp_mask[3], self.recurrent_kernel_o)
+  def get_config(self):
+    config = {'filters': self.filters,
+              'kernel_size': self.kernel_size,
+              'strides': self.strides,
+              'padding': self.padding,
+              'data_format': self.data_format,
+              'dilation_rate': self.dilation_rate,
+              'activation': activations.serialize(self.activation),
+              'recurrent_activation': activations.serialize(
+                  self.recurrent_activation),
+              'use_bias': self.use_bias,
+              'kernel_initializer': initializers.serialize(
+                  self.kernel_initializer),
+              'recurrent_initializer': initializers.serialize(
+                  self.recurrent_initializer),
+              'bias_initializer': initializers.serialize(self.bias_initializer),
+              'unit_forget_bias': self.unit_forget_bias,
+              'kernel_regularizer': regularizers.serialize(
+                  self.kernel_regularizer),
+              'recurrent_regularizer': regularizers.serialize(
+                  self.recurrent_regularizer),
+              'bias_regularizer': regularizers.serialize(self.bias_regularizer),
+              'kernel_constraint': constraints.serialize(
+                  self.kernel_constraint),
+              'recurrent_constraint': constraints.serialize(
+                  self.recurrent_constraint),
+              'bias_constraint': constraints.serialize(self.bias_constraint),
+              'dropout': self.dropout,
+              'recurrent_dropout': self.recurrent_dropout}
+    base_config = super(ConvLSTM2DCell, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
 
-    i = self.recurrent_activation(x_i + h_i)
-    f = self.recurrent_activation(x_f + h_f)
-    c = f * c_tm1 + i * self.activation(x_c + h_c)
-    o = self.recurrent_activation(x_o + h_o)
-    h = o * self.activation(c)
-    return h, [h, c]
+
+@tf_export('keras.layers.ConvLSTM2D')
+class ConvLSTM2D(ConvRNN2D):
+  """Convolutional LSTM.
+
+  It is similar to an LSTM layer, but the input transformations
+  and recurrent transformations are both convolutional.
+
+  Arguments:
+    filters: Integer, the dimensionality of the output space
+        (i.e. the number output of filters in the convolution).
+    kernel_size: An integer or tuple/list of n integers, specifying the
+        dimensions of the convolution window.
+    strides: An integer or tuple/list of n integers,
+        specifying the strides of the convolution.
+        Specifying any stride value != 1 is incompatible with specifying
+        any `dilation_rate` value != 1.
+    padding: One of `"valid"` or `"same"` (case-insensitive).
+    data_format: A string,
+        one of `channels_last` (default) or `channels_first`.
+        The ordering of the dimensions in the inputs.
+        `channels_last` corresponds to inputs with shape
+        `(batch, time, ..., channels)`
+        while `channels_first` corresponds to
+        inputs with shape `(batch, time, channels, ...)`.
+        It defaults to the `image_data_format` value found in your
+        Keras config file at `~/.keras/keras.json`.
+        If you never set it, then it will be "channels_last".
+    dilation_rate: An integer or tuple/list of n integers, specifying
+        the dilation rate to use for dilated convolution.
+        Currently, specifying any `dilation_rate` value != 1 is
+        incompatible with specifying any `strides` value != 1.
+    activation: Activation function to use.
+        If you don't specify anything, no activation is applied
+        (ie. "linear" activation: `a(x) = x`).
+    recurrent_activation: Activation function to use
+        for the recurrent step.
+    use_bias: Boolean, whether the layer uses a bias vector.
+    kernel_initializer: Initializer for the `kernel` weights matrix,
+        used for the linear transformation of the inputs.
+    recurrent_initializer: Initializer for the `recurrent_kernel`
+        weights matrix,
+        used for the linear transformation of the recurrent state.
+    bias_initializer: Initializer for the bias vector.
+    unit_forget_bias: Boolean.
+        If True, add 1 to the bias of the forget gate at initialization.
+        Use in combination with `bias_initializer="zeros"`.
+        This is recommended in [Jozefowicz et al.]
+        (http://www.jmlr.org/proceedings/papers/v37/jozefowicz15.pdf)
+    kernel_regularizer: Regularizer function applied to
+        the `kernel` weights matrix.
+    recurrent_regularizer: Regularizer function applied to
+        the `recurrent_kernel` weights matrix.
+    bias_regularizer: Regularizer function applied to the bias vector.
+    activity_regularizer: Regularizer function applied to.
+    kernel_constraint: Constraint function applied to
+        the `kernel` weights matrix.
+    recurrent_constraint: Constraint function applied to
+        the `recurrent_kernel` weights matrix.
+    bias_constraint: Constraint function applied to the bias vector.
+    return_sequences: Boolean. Whether to return the last output
+        in the output sequence, or the full sequence.
+    go_backwards: Boolean (default False).
+        If True, process the input sequence backwards.
+    stateful: Boolean (default False). If True, the last state
+        for each sample at index i in a batch will be used as initial
+        state for the sample of index i in the following batch.
+    dropout: Float between 0 and 1.
+        Fraction of the units to drop for
+        the linear transformation of the inputs.
+    recurrent_dropout: Float between 0 and 1.
+        Fraction of the units to drop for
+        the linear transformation of the recurrent state.
+
+  Input shape:
+    - if data_format='channels_first'
+        5D tensor with shape:
+        `(samples,time, channels, rows, cols)`
+    - if data_format='channels_last'
+        5D tensor with shape:
+        `(samples,time, rows, cols, channels)`
+
+  Output shape:
+    - if `return_sequences`
+         - if data_format='channels_first'
+            5D tensor with shape:
+            `(samples, time, filters, output_row, output_col)`
+         - if data_format='channels_last'
+            5D tensor with shape:
+            `(samples, time, output_row, output_col, filters)`
+    - else
+        - if data_format ='channels_first'
+            4D tensor with shape:
+            `(samples, filters, output_row, output_col)`
+        - if data_format='channels_last'
+            4D tensor with shape:
+            `(samples, output_row, output_col, filters)`
+        where o_row and o_col depend on the shape of the filter and
+        the padding
+
+  Raises:
+    ValueError: in case of invalid constructor arguments.
+
+  References:
+    - [Convolutional LSTM Network: A Machine Learning Approach for
+    Precipitation Nowcasting](http://arxiv.org/abs/1506.04214v1)
+    The current implementation does not include the feedback loop on the
+    cells output.
+
+  """
+
+  def __init__(self,
+               filters,
+               kernel_size,
+               strides=(1, 1),
+               padding='valid',
+               data_format=None,
+               dilation_rate=(1, 1),
+               activation='tanh',
+               recurrent_activation='hard_sigmoid',
+               use_bias=True,
+               kernel_initializer='glorot_uniform',
+               recurrent_initializer='orthogonal',
+               bias_initializer='zeros',
+               unit_forget_bias=True,
+               kernel_regularizer=None,
+               recurrent_regularizer=None,
+               bias_regularizer=None,
+               activity_regularizer=None,
+               kernel_constraint=None,
+               recurrent_constraint=None,
+               bias_constraint=None,
+               return_sequences=False,
+               go_backwards=False,
+               stateful=False,
+               dropout=0.,
+               recurrent_dropout=0.,
+               **kwargs):
+    cell = ConvLSTM2DCell(filters=filters,
+                          kernel_size=kernel_size,
+                          strides=strides,
+                          padding=padding,
+                          data_format=data_format,
+                          dilation_rate=dilation_rate,
+                          activation=activation,
+                          recurrent_activation=recurrent_activation,
+                          use_bias=use_bias,
+                          kernel_initializer=kernel_initializer,
+                          recurrent_initializer=recurrent_initializer,
+                          bias_initializer=bias_initializer,
+                          unit_forget_bias=unit_forget_bias,
+                          kernel_regularizer=kernel_regularizer,
+                          recurrent_regularizer=recurrent_regularizer,
+                          bias_regularizer=bias_regularizer,
+                          kernel_constraint=kernel_constraint,
+                          recurrent_constraint=recurrent_constraint,
+                          bias_constraint=bias_constraint,
+                          dropout=dropout,
+                          recurrent_dropout=recurrent_dropout)
+    super(ConvLSTM2D, self).__init__(cell,
+                                     return_sequences=return_sequences,
+                                     go_backwards=go_backwards,
+                                     stateful=stateful,
+                                     **kwargs)
+    self.activity_regularizer = regularizers.get(activity_regularizer)
+
+  def call(self, inputs, mask=None, training=None, initial_state=None):
+    return super(ConvLSTM2D, self).call(inputs,
+                                        mask=mask,
+                                        training=training,
+                                        initial_state=initial_state)
+
+  @property
+  def filters(self):
+    return self.cell.filters
+
+  @property
+  def kernel_size(self):
+    return self.cell.kernel_size
+
+  @property
+  def strides(self):
+    return self.cell.strides
+
+  @property
+  def padding(self):
+    return self.cell.padding
+
+  @property
+  def data_format(self):
+    return self.cell.data_format
+
+  @property
+  def dilation_rate(self):
+    return self.cell.dilation_rate
+
+  @property
+  def activation(self):
+    return self.cell.activation
+
+  @property
+  def recurrent_activation(self):
+    return self.cell.recurrent_activation
+
+  @property
+  def use_bias(self):
+    return self.cell.use_bias
+
+  @property
+  def kernel_initializer(self):
+    return self.cell.kernel_initializer
+
+  @property
+  def recurrent_initializer(self):
+    return self.cell.recurrent_initializer
+
+  @property
+  def bias_initializer(self):
+    return self.cell.bias_initializer
+
+  @property
+  def unit_forget_bias(self):
+    return self.cell.unit_forget_bias
+
+  @property
+  def kernel_regularizer(self):
+    return self.cell.kernel_regularizer
+
+  @property
+  def recurrent_regularizer(self):
+    return self.cell.recurrent_regularizer
+
+  @property
+  def bias_regularizer(self):
+    return self.cell.bias_regularizer
+
+  @property
+  def kernel_constraint(self):
+    return self.cell.kernel_constraint
+
+  @property
+  def recurrent_constraint(self):
+    return self.cell.recurrent_constraint
+
+  @property
+  def bias_constraint(self):
+    return self.cell.bias_constraint
+
+  @property
+  def dropout(self):
+    return self.cell.dropout
+
+  @property
+  def recurrent_dropout(self):
+    return self.cell.recurrent_dropout
 
   def get_config(self):
-    config = {
-        'activation':
-            activations.serialize(self.activation),
-        'recurrent_activation':
-            activations.serialize(self.recurrent_activation),
-        'use_bias':
-            self.use_bias,
-        'kernel_initializer':
-            initializers.serialize(self.kernel_initializer),
-        'recurrent_initializer':
-            initializers.serialize(self.recurrent_initializer),
-        'bias_initializer':
-            initializers.serialize(self.bias_initializer),
-        'unit_forget_bias':
-            self.unit_forget_bias,
-        'kernel_regularizer':
-            regularizers.serialize(self.kernel_regularizer),
-        'recurrent_regularizer':
-            regularizers.serialize(self.recurrent_regularizer),
-        'bias_regularizer':
-            regularizers.serialize(self.bias_regularizer),
-        'activity_regularizer':
-            regularizers.serialize(self.activity_regularizer),
-        'kernel_constraint':
-            constraints.serialize(self.kernel_constraint),
-        'recurrent_constraint':
-            constraints.serialize(self.recurrent_constraint),
-        'bias_constraint':
-            constraints.serialize(self.bias_constraint),
-        'dropout':
-            self.dropout,
-        'recurrent_dropout':
-            self.recurrent_dropout
-    }
+    config = {'filters': self.filters,
+              'kernel_size': self.kernel_size,
+              'strides': self.strides,
+              'padding': self.padding,
+              'data_format': self.data_format,
+              'dilation_rate': self.dilation_rate,
+              'activation': activations.serialize(self.activation),
+              'recurrent_activation': activations.serialize(
+                  self.recurrent_activation),
+              'use_bias': self.use_bias,
+              'kernel_initializer': initializers.serialize(
+                  self.kernel_initializer),
+              'recurrent_initializer': initializers.serialize(
+                  self.recurrent_initializer),
+              'bias_initializer': initializers.serialize(self.bias_initializer),
+              'unit_forget_bias': self.unit_forget_bias,
+              'kernel_regularizer': regularizers.serialize(
+                  self.kernel_regularizer),
+              'recurrent_regularizer': regularizers.serialize(
+                  self.recurrent_regularizer),
+              'bias_regularizer': regularizers.serialize(self.bias_regularizer),
+              'activity_regularizer': regularizers.serialize(
+                  self.activity_regularizer),
+              'kernel_constraint': constraints.serialize(
+                  self.kernel_constraint),
+              'recurrent_constraint': constraints.serialize(
+                  self.recurrent_constraint),
+              'bias_constraint': constraints.serialize(self.bias_constraint),
+              'dropout': self.dropout,
+              'recurrent_dropout': self.recurrent_dropout}
     base_config = super(ConvLSTM2D, self).get_config()
+    del base_config['cell']
     return dict(list(base_config.items()) + list(config.items()))
+
+  @classmethod
+  def from_config(cls, config):
+    return cls(**config)
diff --git a/tensorflow/python/keras/_impl/keras/layers/convolutional_recurrent_test.py b/tensorflow/python/keras/_impl/keras/layers/convolutional_recurrent_test.py
index 60137bdd724676af2c89bb7531cf4ea4e529b2a1..9e768b4e9552d126eac9c586d19810634fac9013 100644
--- a/tensorflow/python/keras/_impl/keras/layers/convolutional_recurrent_test.py
+++ b/tensorflow/python/keras/_impl/keras/layers/convolutional_recurrent_test.py
@@ -64,6 +64,7 @@ class ConvLSTMTest(test.TestCase):
           self.assertEqual(len(states), 2)
           model = keras.models.Model(x, states[0])
           state = model.predict(inputs)
+
           self.assertAllClose(
               keras.backend.eval(layer.states[0]), state, atol=1e-4)
 
diff --git a/tensorflow/python/keras/_impl/keras/layers/convolutional_test.py b/tensorflow/python/keras/_impl/keras/layers/convolutional_test.py
index f4a134b96cec0385cb24a208f3403db944b68edc..12b42676759d499c910707cb1b78e788e3c443fd 100644
--- a/tensorflow/python/keras/_impl/keras/layers/convolutional_test.py
+++ b/tensorflow/python/keras/_impl/keras/layers/convolutional_test.py
@@ -961,5 +961,43 @@ class CroppingTest(test.TestCase):
       keras.layers.Cropping3D(cropping=None)
 
 
+class DepthwiseConv2DTest(test.TestCase):
+
+  def _run_test(self, kwargs, arg, values):
+    num_samples = 2
+    stack_size = 3
+    num_row = 7
+    num_col = 6
+
+    test_kwargs = copy.copy(kwargs)
+    for value in values:
+      test_kwargs[arg] = value
+      with self.test_session(use_gpu=True):
+        testing_utils.layer_test(
+            keras.layers.DepthwiseConv2D,
+            kwargs=test_kwargs,
+            input_shape=(num_samples, num_row, num_col, stack_size))
+
+  def test_depthwise_conv2d(self):
+    kwargs = {'kernel_size': (3, 3)}
+
+    self._run_test(kwargs, 'padding', ['valid', 'same'])
+    self._run_test(kwargs, 'strides', [(2, 2)])
+    if test.is_gpu_available(cuda_only=True):
+      self._run_test(kwargs, 'data_format', ['channels_first'])
+    self._run_test(kwargs, 'depth_multiplier', [1, 2])
+
+    kwargs = {'kernel_size': 3,
+              'padding': 'valid',
+              'data_format': 'channels_first',
+              'activation': None,
+              'depthwise_regularizer': 'l2',
+              'bias_regularizer': 'l2',
+              'activity_regularizer': 'l2',
+              'depthwise_constraint': 'unit_norm',
+              'strides': (2, 2),
+             }
+    self._run_test(kwargs, 'depth_multiplier', [1])
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/_impl/keras/layers/core.py b/tensorflow/python/keras/_impl/keras/layers/core.py
index 73e4f15f7e259211c892fdc663e14dcb14aec58d..f64174a23fe086326a7cfb3540954f0195ce01c1 100644
--- a/tensorflow/python/keras/_impl/keras/layers/core.py
+++ b/tensorflow/python/keras/_impl/keras/layers/core.py
@@ -24,6 +24,7 @@ import types as python_types
 import numpy as np
 
 from tensorflow.python.eager import context
+from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.keras._impl.keras import activations
 from tensorflow.python.keras._impl.keras import backend as K
@@ -32,11 +33,14 @@ from tensorflow.python.keras._impl.keras import initializers
 from tensorflow.python.keras._impl.keras import regularizers
 from tensorflow.python.keras._impl.keras.engine import InputSpec
 from tensorflow.python.keras._impl.keras.engine import Layer
-from tensorflow.python.keras._impl.keras.utils.generic_utils import deserialize_keras_object
-from tensorflow.python.keras._impl.keras.utils.generic_utils import func_dump
-from tensorflow.python.keras._impl.keras.utils.generic_utils import func_load
-from tensorflow.python.keras._impl.keras.utils.generic_utils import has_arg
-from tensorflow.python.layers import core as tf_core_layers
+from tensorflow.python.keras._impl.keras.utils import generic_utils
+from tensorflow.python.keras._impl.keras.utils import tf_utils
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gen_math_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn
+from tensorflow.python.ops import nn_ops
+from tensorflow.python.ops import standard_ops
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -75,12 +79,12 @@ class Masking(Layer):
     self.mask_value = mask_value
 
   def compute_mask(self, inputs, mask=None):
-    return K.any(K.not_equal(inputs, self.mask_value), axis=-1)
+    return K.any(math_ops.not_equal(inputs, self.mask_value), axis=-1)
 
   def call(self, inputs):
     boolean_mask = K.any(
-        K.not_equal(inputs, self.mask_value), axis=-1, keepdims=True)
-    return inputs * K.cast(boolean_mask, inputs.dtype)
+        math_ops.not_equal(inputs, self.mask_value), axis=-1, keepdims=True)
+    return inputs * math_ops.cast(boolean_mask, inputs.dtype)
 
   def compute_output_shape(self, input_shape):
     return input_shape
@@ -92,7 +96,7 @@ class Masking(Layer):
 
 
 @tf_export('keras.layers.Dropout')
-class Dropout(tf_core_layers.Dropout, Layer):
+class Dropout(Layer):
   """Applies Dropout to the input.
 
   Dropout consists in randomly setting
@@ -111,23 +115,39 @@ class Dropout(tf_core_layers.Dropout, Layer):
   """
 
   def __init__(self, rate, noise_shape=None, seed=None, **kwargs):
-    # Inheritance call order:
-    # 1) tf.layers.Dropout, 2) keras.layers.Layer, 3) tf.layers.Layer
-    super(Dropout, self).__init__(rate=rate,
-                                  noise_shape=noise_shape,
-                                  seed=seed,
-                                  **kwargs)
+    super(Dropout, self).__init__(**kwargs)
+    self.rate = rate
+    self.noise_shape = noise_shape
+    self.seed = seed
     self.supports_masking = True
 
+  def _get_noise_shape(self, inputs):
+    # Subclasses of `Dropout` may implement `_get_noise_shape(self, inputs)`,
+    # which will override `self.noise_shape`, and allows for custom noise
+    # shapes with dynamically sized inputs.
+    if self.noise_shape is None:
+      return self.noise_shape
+    return nn_ops._get_noise_shape(inputs, self.noise_shape)  # pylint: disable=protected-access
+
   def call(self, inputs, training=None):
     if training is None:
       training = K.learning_phase()
-    output = super(Dropout, self).call(inputs, training=training)
+
+    def dropped_inputs():
+      return nn.dropout(inputs, 1  - self.rate,
+                        noise_shape=self._get_noise_shape(inputs),
+                        seed=self.seed)
+    output = tf_utils.smart_cond(training,
+                                 dropped_inputs,
+                                 lambda: array_ops.identity(inputs))
     # EagerTensor object has no attribute _uses_learning_phase
     if not context.executing_eagerly() and training is K.learning_phase():
       output._uses_learning_phase = True  # pylint: disable=protected-access
     return output
 
+  def compute_output_shape(self, input_shape):
+    return input_shape
+
   def get_config(self):
     config = {
         'rate': self.rate,
@@ -170,7 +190,7 @@ class SpatialDropout1D(Dropout):
     self.input_spec = InputSpec(ndim=3)
 
   def _get_noise_shape(self, inputs):
-    input_shape = K.shape(inputs)
+    input_shape = array_ops.shape(inputs)
     noise_shape = (input_shape[0], 1, input_shape[2])
     return noise_shape
 
@@ -222,7 +242,7 @@ class SpatialDropout2D(Dropout):
     self.input_spec = InputSpec(ndim=4)
 
   def _get_noise_shape(self, inputs):
-    input_shape = K.shape(inputs)
+    input_shape = array_ops.shape(inputs)
     if self.data_format == 'channels_first':
       return (input_shape[0], input_shape[1], 1, 1)
     elif self.data_format == 'channels_last':
@@ -275,7 +295,7 @@ class SpatialDropout3D(Dropout):
     self.input_spec = InputSpec(ndim=5)
 
   def _get_noise_shape(self, inputs):
-    input_shape = K.shape(inputs)
+    input_shape = array_ops.shape(inputs)
     if self.data_format == 'channels_first':
       return (input_shape[0], input_shape[1], 1, 1, 1)
     elif self.data_format == 'channels_last':
@@ -414,7 +434,8 @@ class Reshape(Layer):
     return tensor_shape.TensorShape(output_shape)
 
   def call(self, inputs):
-    return K.reshape(inputs, (K.shape(inputs)[0],) + self.target_shape)
+    return array_ops.reshape(inputs,
+                             (array_ops.shape(inputs)[0],) + self.target_shape)
 
   def get_config(self):
     config = {'target_shape': self.target_shape}
@@ -467,7 +488,7 @@ class Permute(Layer):
     return tensor_shape.TensorShape(output_shape)
 
   def call(self, inputs):
-    return K.permute_dimensions(inputs, (0,) + self.dims)
+    return array_ops.transpose(inputs, perm=(0,) + self.dims)
 
   def get_config(self):
     config = {'dims': self.dims}
@@ -476,7 +497,7 @@ class Permute(Layer):
 
 
 @tf_export('keras.layers.Flatten')
-class Flatten(tf_core_layers.Flatten, Layer):
+class Flatten(Layer):
   """Flattens the input. Does not affect the batch size.
 
   Example:
@@ -492,7 +513,25 @@ class Flatten(tf_core_layers.Flatten, Layer):
       # now: model.output_shape == (None, 65536)
   ```
   """
-  pass
+
+  def __init__(self, **kwargs):
+    super(Flatten, self).__init__(**kwargs)
+    self.input_spec = InputSpec(min_ndim=2)
+
+  def call(self, inputs):
+    outputs = array_ops.reshape(inputs, (array_ops.shape(inputs)[0], -1))
+    if not context.executing_eagerly():
+      outputs.set_shape(self.compute_output_shape(inputs.get_shape()))
+    return outputs
+
+  def compute_output_shape(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape).as_list()
+    output_shape = [input_shape[0]]
+    if all(input_shape[1:]):
+      output_shape += [np.prod(input_shape[1:])]
+    else:
+      output_shape += [None]
+    return tensor_shape.TensorShape(output_shape)
 
 
 @tf_export('keras.layers.RepeatVector')
@@ -608,10 +647,12 @@ class Lambda(Layer):
                         'must be a list, a tuple, or a function.')
       self._output_shape = output_shape
 
-  def _compute_output_shape(self, input_shape):
+  def compute_output_shape(self, input_shape):
     input_shape = tuple(tensor_shape.TensorShape(input_shape).as_list())
 
     if self._output_shape is None:
+      if context.executing_eagerly():
+        raise NotImplementedError
       x = K.placeholder(shape=input_shape)
       x = self.call(x)
       if isinstance(x, list):
@@ -637,7 +678,7 @@ class Lambda(Layer):
 
   def call(self, inputs, mask=None):
     arguments = self.arguments
-    if has_arg(self.function, 'mask'):
+    if generic_utils.has_arg(self.function, 'mask'):
       arguments['mask'] = mask
     return self.function(inputs, **arguments)
 
@@ -648,14 +689,14 @@ class Lambda(Layer):
 
   def get_config(self):
     if isinstance(self.function, python_types.LambdaType):
-      function = func_dump(self.function)
+      function = generic_utils.func_dump(self.function)
       function_type = 'lambda'
     else:
       function = self.function.__name__
       function_type = 'function'
 
     if isinstance(self._output_shape, python_types.LambdaType):
-      output_shape = func_dump(self._output_shape)
+      output_shape = generic_utils.func_dump(self._output_shape)
       output_shape_type = 'lambda'
     elif callable(self._output_shape):
       output_shape = self._output_shape.__name__
@@ -683,26 +724,27 @@ class Lambda(Layer):
     function_type = config.pop('function_type')
     if function_type == 'function':
       # Simple lookup in custom objects
-      function = deserialize_keras_object(
+      function = generic_utils.deserialize_keras_object(
           config['function'],
           custom_objects=custom_objects,
           printable_module_name='function in Lambda layer')
     elif function_type == 'lambda':
       # Unsafe deserialization from bytecode
-      function = func_load(config['function'], globs=globs)
+      function = generic_utils.func_load(config['function'], globs=globs)
     else:
       raise TypeError('Unknown function type:', function_type)
 
     output_shape_type = config.pop('output_shape_type')
     if output_shape_type == 'function':
       # Simple lookup in custom objects
-      output_shape = deserialize_keras_object(
+      output_shape = generic_utils.deserialize_keras_object(
           config['output_shape'],
           custom_objects=custom_objects,
           printable_module_name='output_shape function in Lambda layer')
     elif output_shape_type == 'lambda':
       # Unsafe deserialization from bytecode
-      output_shape = func_load(config['output_shape'], globs=globs)
+      output_shape = generic_utils.func_load(config['output_shape'],
+                                             globs=globs)
     else:
       output_shape = config['output_shape']
 
@@ -722,7 +764,7 @@ class Lambda(Layer):
 
 
 @tf_export('keras.layers.Dense')
-class Dense(tf_core_layers.Dense, Layer):
+class Dense(Layer):
   """Just your regular densely-connected NN layer.
 
   `Dense` implements the operation:
@@ -792,21 +834,74 @@ class Dense(tf_core_layers.Dense, Layer):
     if 'input_shape' not in kwargs and 'input_dim' in kwargs:
       kwargs['input_shape'] = (kwargs.pop('input_dim'),)
 
-    # Inheritance call order:
-    # 1) tf.layers.Dense, 2) keras.layers.Layer, 3) tf.layers.Layer
     super(Dense, self).__init__(
-        units,
-        activation=activations.get(activation),
-        use_bias=use_bias,
-        kernel_initializer=initializers.get(kernel_initializer),
-        bias_initializer=initializers.get(bias_initializer),
-        kernel_regularizer=regularizers.get(kernel_regularizer),
-        bias_regularizer=regularizers.get(bias_regularizer),
-        activity_regularizer=regularizers.get(activity_regularizer),
-        kernel_constraint=constraints.get(kernel_constraint),
-        bias_constraint=constraints.get(bias_constraint),
-        **kwargs)
+        activity_regularizer=regularizers.get(activity_regularizer), **kwargs)
+    self.units = int(units)
+    self.activation = activations.get(activation)
+    self.use_bias = use_bias
+    self.kernel_initializer = initializers.get(kernel_initializer)
+    self.bias_initializer = initializers.get(bias_initializer)
+    self.kernel_regularizer = regularizers.get(kernel_regularizer)
+    self.bias_regularizer = regularizers.get(bias_regularizer)
+    self.kernel_constraint = constraints.get(kernel_constraint)
+    self.bias_constraint = constraints.get(bias_constraint)
+
     self.supports_masking = True
+    self.input_spec = InputSpec(min_ndim=2)
+
+  def build(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape)
+    if input_shape[-1].value is None:
+      raise ValueError('The last dimension of the inputs to `Dense` '
+                       'should be defined. Found `None`.')
+    self.input_spec = InputSpec(min_ndim=2,
+                                axes={-1: input_shape[-1].value})
+    self.kernel = self.add_variable('kernel',
+                                    shape=[input_shape[-1].value, self.units],
+                                    initializer=self.kernel_initializer,
+                                    regularizer=self.kernel_regularizer,
+                                    constraint=self.kernel_constraint,
+                                    dtype=self.dtype,
+                                    trainable=True)
+    if self.use_bias:
+      self.bias = self.add_variable('bias',
+                                    shape=[self.units,],
+                                    initializer=self.bias_initializer,
+                                    regularizer=self.bias_regularizer,
+                                    constraint=self.bias_constraint,
+                                    dtype=self.dtype,
+                                    trainable=True)
+    else:
+      self.bias = None
+    self.built = True
+
+  def call(self, inputs):
+    inputs = ops.convert_to_tensor(inputs, dtype=self.dtype)
+    shape = inputs.get_shape().as_list()
+    if len(shape) > 2:
+      # Broadcasting is required for the inputs.
+      outputs = standard_ops.tensordot(inputs, self.kernel, [[len(shape) - 1],
+                                                             [0]])
+      # Reshape the output back to the original ndim of the input.
+      if not context.executing_eagerly():
+        output_shape = shape[:-1] + [self.units]
+        outputs.set_shape(output_shape)
+    else:
+      outputs = gen_math_ops.mat_mul(inputs, self.kernel)
+    if self.use_bias:
+      outputs = nn.bias_add(outputs, self.bias)
+    if self.activation is not None:
+      return self.activation(outputs)  # pylint: disable=not-callable
+    return outputs
+
+  def compute_output_shape(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape)
+    input_shape = input_shape.with_rank_at_least(2)
+    if input_shape[-1].value is None:
+      raise ValueError(
+          'The innermost dimension of input_shape must be defined, but saw: %s'
+          % input_shape)
+    return input_shape[:-1].concatenate(self.units)
 
   def get_config(self):
     config = {
diff --git a/tensorflow/python/keras/_impl/keras/layers/core_test.py b/tensorflow/python/keras/_impl/keras/layers/core_test.py
index 2ca816adbdcecaf371776d99f3da60d0d8790832..d22d8d12dc4e76998c177dbe96fb87e3fffa5175 100644
--- a/tensorflow/python/keras/_impl/keras/layers/core_test.py
+++ b/tensorflow/python/keras/_impl/keras/layers/core_test.py
@@ -23,6 +23,7 @@ import numpy as np
 from tensorflow.python.framework import test_util as tf_test_util
 from tensorflow.python.keras._impl import keras
 from tensorflow.python.keras._impl.keras import testing_utils
+from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
 
@@ -128,7 +129,6 @@ class CoreLayersTest(test.TestCase):
     testing_utils.layer_test(
         keras.layers.RepeatVector, kwargs={'n': 3}, input_shape=(3, 2))
 
-  @tf_test_util.run_in_graph_and_eager_modes()
   def test_lambda(self):
     testing_utils.layer_test(
         keras.layers.Lambda,
@@ -159,7 +159,7 @@ class CoreLayersTest(test.TestCase):
 
     # test with lambda
     ld = keras.layers.Lambda(
-        lambda x: keras.backend.concatenate([keras.backend.square(x), x]))
+        lambda x: keras.backend.concatenate([math_ops.square(x), x]))
     config = ld.get_config()
     ld = keras.layers.Lambda.from_config(config)
 
@@ -235,4 +235,3 @@ class CoreLayersTest(test.TestCase):
 
 if __name__ == '__main__':
   test.main()
-
diff --git a/tensorflow/python/keras/_impl/keras/layers/embeddings.py b/tensorflow/python/keras/_impl/keras/layers/embeddings.py
index 006ecd3135be25d43133daed1603734ecd1be955..591bab7cd86aefcad58d47eafbd061da4ca04b83 100644
--- a/tensorflow/python/keras/_impl/keras/layers/embeddings.py
+++ b/tensorflow/python/keras/_impl/keras/layers/embeddings.py
@@ -24,6 +24,8 @@ from tensorflow.python.keras._impl.keras import initializers
 from tensorflow.python.keras._impl.keras import regularizers
 from tensorflow.python.keras._impl.keras.engine import Layer
 from tensorflow.python.keras._impl.keras.engine.base_layer import shape_type_conversion
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -100,7 +102,8 @@ class Embedding(Layer):
         kwargs['input_shape'] = (input_length,)
       else:
         kwargs['input_shape'] = (None,)
-    super(Embedding, self).__init__(**kwargs)
+    dtype = kwargs.pop('dtype', K.floatx())
+    super(Embedding, self).__init__(dtype=dtype, **kwargs)
 
     self.input_dim = input_dim
     self.output_dim = output_dim
@@ -118,15 +121,14 @@ class Embedding(Layer):
         initializer=self.embeddings_initializer,
         name='embeddings',
         regularizer=self.embeddings_regularizer,
-        constraint=self.embeddings_constraint,
-        dtype=self.dtype)
+        constraint=self.embeddings_constraint)
     self.built = True
 
   def compute_mask(self, inputs, mask=None):
     if not self.mask_zero:
       return None
     else:
-      return K.not_equal(inputs, 0)
+      return math_ops.not_equal(inputs, 0)
 
   @shape_type_conversion
   def compute_output_shape(self, input_shape):
@@ -152,8 +154,8 @@ class Embedding(Layer):
 
   def call(self, inputs):
     if K.dtype(inputs) != 'int32':
-      inputs = K.cast(inputs, 'int32')
-    out = K.gather(self.embeddings, inputs)
+      inputs = math_ops.cast(inputs, 'int32')
+    out = array_ops.gather(self.embeddings, inputs)
     return out
 
   def get_config(self):
diff --git a/tensorflow/python/keras/_impl/keras/layers/embeddings_test.py b/tensorflow/python/keras/_impl/keras/layers/embeddings_test.py
index 26fd1f1c114587c2f1b3e0155f1259dd5f0dcf60..9f6793eac85854ea82d36b425a883f9abe54f1eb 100644
--- a/tensorflow/python/keras/_impl/keras/layers/embeddings_test.py
+++ b/tensorflow/python/keras/_impl/keras/layers/embeddings_test.py
@@ -26,7 +26,7 @@ from tensorflow.python.platform import test
 
 class EmbeddingTest(test.TestCase):
 
-  @tf_test_util.run_in_graph_and_eager_modes()
+  @tf_test_util.run_in_graph_and_eager_modes(use_gpu=False)
   def test_embedding(self):
     testing_utils.layer_test(
         keras.layers.Embedding,
diff --git a/tensorflow/python/keras/_impl/keras/layers/merge.py b/tensorflow/python/keras/_impl/keras/layers/merge.py
index c660cbd449b11a139f64cfa8b3a35310a597491c..7c87e6c0671138efacbf1bca02fdf6779e21537f 100644
--- a/tensorflow/python/keras/_impl/keras/layers/merge.py
+++ b/tensorflow/python/keras/_impl/keras/layers/merge.py
@@ -23,6 +23,9 @@ from __future__ import print_function
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras.engine.base_layer import Layer
 from tensorflow.python.keras._impl.keras.engine.base_layer import shape_type_conversion
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -127,7 +130,7 @@ class _Merge(Layer):
         for x in inputs:
           x_ndim = K.ndim(x)
           for _ in range(max_ndim - x_ndim):
-            x = K.expand_dims(x, 1)
+            x = array_ops.expand_dims(x, axis=1)
           reshaped_inputs.append(x)
         return self._merge_function(reshaped_inputs)
       else:
@@ -137,19 +140,22 @@ class _Merge(Layer):
         for x in inputs:
           x_ndim = K.ndim(x)
           if x_ndim is None:
-            x_shape = K.shape(x)
+            x_shape = array_ops.shape(x)
             batch_size = x_shape[0]
-            new_shape = K.concatenate([x_shape[1:], K.expand_dims(batch_size)])
-            x_transposed = K.reshape(x,
-                                     K.stack([batch_size,
-                                              K.prod(x_shape[1:])]))
-            x_transposed = K.permute_dimensions(x_transposed, (1, 0))
-            x_transposed = K.reshape(x_transposed, new_shape)
+            new_shape = K.concatenate(
+                [x_shape[1:],
+                 array_ops.expand_dims(batch_size, axis=-1)])
+            x_transposed = array_ops.reshape(
+                x,
+                array_ops.stack(
+                    [batch_size, math_ops.reduce_prod(x_shape[1:])], axis=0))
+            x_transposed = array_ops.transpose(x_transposed, perm=(1, 0))
+            x_transposed = array_ops.reshape(x_transposed, new_shape)
             reshaped_inputs.append(x_transposed)
             transposed = True
           elif x_ndim > 1:
             dims = list(range(1, x_ndim)) + [0]
-            reshaped_inputs.append(K.permute_dimensions(x, dims))
+            reshaped_inputs.append(array_ops.transpose(x, perm=dims))
             transposed = True
           else:
             # We don't transpose inputs if they are 1D vectors or scalars.
@@ -159,17 +165,18 @@ class _Merge(Layer):
         if transposed:
           # If inputs have been transposed, we have to transpose the output too.
           if y_ndim is None:
-            y_shape = K.shape(y)
-            y_ndim = K.shape(y_shape)[0]
+            y_shape = array_ops.shape(y)
+            y_ndim = array_ops.shape(y_shape)[0]
             batch_size = y_shape[y_ndim - 1]
-            new_shape = K.concatenate(
-                [K.expand_dims(batch_size), y_shape[:y_ndim - 1]])
-            y = K.reshape(y, (-1, batch_size))
-            y = K.permute_dimensions(y, (1, 0))
-            y = K.reshape(y, new_shape)
+            new_shape = K.concatenate([
+                array_ops.expand_dims(batch_size, axis=-1), y_shape[:y_ndim - 1]
+            ])
+            y = array_ops.reshape(y, (-1, batch_size))
+            y = array_ops.transpose(y, perm=(1, 0))
+            y = array_ops.reshape(y, new_shape)
           elif y_ndim > 1:
             dims = [y_ndim - 1] + list(range(y_ndim - 1))
-            y = K.permute_dimensions(y, dims)
+            y = array_ops.transpose(y, perm=dims)
         return y
     else:
       return self._merge_function(inputs)
@@ -207,7 +214,7 @@ class _Merge(Layer):
                        'should have the same length.')
     if all([m is None for m in mask]):
       return None
-    masks = [K.expand_dims(m, 0) for m in mask if m is not None]
+    masks = [array_ops.expand_dims(m, axis=0) for m in mask if m is not None]
     return K.all(K.concatenate(masks, axis=0), axis=0, keepdims=False)
 
 
@@ -325,7 +332,7 @@ class Maximum(_Merge):
   def _merge_function(self, inputs):
     output = inputs[0]
     for i in range(1, len(inputs)):
-      output = K.maximum(output, inputs[i])
+      output = math_ops.maximum(output, inputs[i])
     return output
 
 
@@ -340,7 +347,7 @@ class Minimum(_Merge):
   def _merge_function(self, inputs):
     output = inputs[0]
     for i in range(1, len(inputs)):
-      output = K.minimum(output, inputs[i])
+      output = math_ops.minimum(output, inputs[i])
     return output
 
 
@@ -418,10 +425,10 @@ class Concatenate(_Merge):
     for input_i, mask_i in zip(inputs, mask):
       if mask_i is None:
         # Input is unmasked. Append all 1s to masks,
-        masks.append(K.ones_like(input_i, dtype='bool'))
+        masks.append(array_ops.ones_like(input_i, dtype='bool'))
       elif K.ndim(mask_i) < K.ndim(input_i):
         # Mask is smaller than the input, expand it
-        masks.append(K.expand_dims(mask_i))
+        masks.append(array_ops.expand_dims(mask_i, axis=-1))
       else:
         masks.append(mask_i)
     concatenated = K.concatenate(masks, axis=self.axis)
@@ -511,8 +518,8 @@ class Dot(_Merge):
         else:
           axes.append(self.axes[i])
     if self.normalize:
-      x1 = K.l2_normalize(x1, axis=axes[0])
-      x2 = K.l2_normalize(x2, axis=axes[1])
+      x1 = nn.l2_normalize(x1, axis=axes[0])
+      x2 = nn.l2_normalize(x2, axis=axes[1])
     output = K.batch_dot(x1, x2, axes)
     return output
 
diff --git a/tensorflow/python/keras/_impl/keras/layers/noise.py b/tensorflow/python/keras/_impl/keras/layers/noise.py
index e309d160e5a9be97ff5f5356dad9dfaf85430233..72dc7a1ff8b7887ed97ae44bddf8ae3cd32c408d 100644
--- a/tensorflow/python/keras/_impl/keras/layers/noise.py
+++ b/tensorflow/python/keras/_impl/keras/layers/noise.py
@@ -23,6 +23,8 @@ import numpy as np
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras.engine import Layer
 from tensorflow.python.keras._impl.keras.engine.base_layer import shape_type_conversion
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -58,7 +60,7 @@ class GaussianNoise(Layer):
 
     def noised():
       return inputs + K.random_normal(
-          shape=K.shape(inputs), mean=0., stddev=self.stddev)
+          shape=array_ops.shape(inputs), mean=0., stddev=self.stddev)
 
     return K.in_train_phase(noised, inputs, training=training)
 
@@ -104,7 +106,7 @@ class GaussianDropout(Layer):
       def noised():
         stddev = np.sqrt(self.rate / (1.0 - self.rate))
         return inputs * K.random_normal(
-            shape=K.shape(inputs), mean=1.0, stddev=stddev)
+            shape=array_ops.shape(inputs), mean=1.0, stddev=stddev)
 
       return K.in_train_phase(noised, inputs, training=training)
     return inputs
@@ -153,7 +155,7 @@ class AlphaDropout(Layer):
     self.supports_masking = True
 
   def _get_noise_shape(self, inputs):
-    return self.noise_shape if self.noise_shape else K.shape(inputs)
+    return self.noise_shape if self.noise_shape else array_ops.shape(inputs)
 
   def call(self, inputs, training=None):
     if 0. < self.rate < 1.:
@@ -164,9 +166,9 @@ class AlphaDropout(Layer):
         scale = 1.0507009873554804934193349852946
         alpha_p = -alpha * scale
 
-        kept_idx = K.greater_equal(
+        kept_idx = math_ops.greater_equal(
             K.random_uniform(noise_shape, seed=seed), rate)
-        kept_idx = K.cast(kept_idx, K.floatx())
+        kept_idx = math_ops.cast(kept_idx, K.floatx())
 
         # Get affine transformation params
         a = ((1 - rate) * (1 + rate * alpha_p**2))**-0.5
diff --git a/tensorflow/python/keras/_impl/keras/layers/normalization.py b/tensorflow/python/keras/_impl/keras/layers/normalization.py
index 3b44b20bf822429351002c0f81fe8f9596d595d3..b73025a5a8abf46ae5a9c65d7050a08817e5ea67 100644
--- a/tensorflow/python/keras/_impl/keras/layers/normalization.py
+++ b/tensorflow/python/keras/_impl/keras/layers/normalization.py
@@ -19,17 +19,28 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.eager import context
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras import constraints
 from tensorflow.python.keras._impl.keras import initializers
 from tensorflow.python.keras._impl.keras import regularizers
+from tensorflow.python.keras._impl.keras.engine import InputSpec
 from tensorflow.python.keras._impl.keras.engine import Layer
-from tensorflow.python.layers import normalization as tf_normalization_layers
+from tensorflow.python.keras._impl.keras.utils import tf_utils
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn
+from tensorflow.python.ops import state_ops
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.training import distribute as distribute_lib
 from tensorflow.python.util.tf_export import tf_export
 
 
 @tf_export('keras.layers.BatchNormalization')
-class BatchNormalization(tf_normalization_layers.BatchNormalization, Layer):
+class BatchNormalization(Layer):
   """Batch normalization layer (Ioffe and Szegedy, 2014).
 
   Normalize the activations of the previous layer at each batch,
@@ -37,28 +48,63 @@ class BatchNormalization(tf_normalization_layers.BatchNormalization, Layer):
   close to 0 and the activation standard deviation close to 1.
 
   Arguments:
-      axis: Integer, the axis that should be normalized
-          (typically the features axis).
-          For instance, after a `Conv2D` layer with
-          `data_format="channels_first"`,
-          set `axis=1` in `BatchNormalization`.
-      momentum: Momentum for the moving average.
-      epsilon: Small float added to variance to avoid dividing by zero.
-      center: If True, add offset of `beta` to normalized tensor.
-          If False, `beta` is ignored.
-      scale: If True, multiply by `gamma`.
-          If False, `gamma` is not used.
-          When the next layer is linear (also e.g. `nn.relu`),
-          this can be disabled since the scaling
-          will be done by the next layer.
-      beta_initializer: Initializer for the beta weight.
-      gamma_initializer: Initializer for the gamma weight.
-      moving_mean_initializer: Initializer for the moving mean.
-      moving_variance_initializer: Initializer for the moving variance.
-      beta_regularizer: Optional regularizer for the beta weight.
-      gamma_regularizer: Optional regularizer for the gamma weight.
-      beta_constraint: Optional constraint for the beta weight.
-      gamma_constraint: Optional constraint for the gamma weight.
+    axis: Integer, the axis that should be normalized
+        (typically the features axis).
+        For instance, after a `Conv2D` layer with
+        `data_format="channels_first"`,
+        set `axis=1` in `BatchNormalization`.
+    momentum: Momentum for the moving average.
+    epsilon: Small float added to variance to avoid dividing by zero.
+    center: If True, add offset of `beta` to normalized tensor.
+        If False, `beta` is ignored.
+    scale: If True, multiply by `gamma`.
+        If False, `gamma` is not used.
+        When the next layer is linear (also e.g. `nn.relu`),
+        this can be disabled since the scaling
+        will be done by the next layer.
+    beta_initializer: Initializer for the beta weight.
+    gamma_initializer: Initializer for the gamma weight.
+    moving_mean_initializer: Initializer for the moving mean.
+    moving_variance_initializer: Initializer for the moving variance.
+    beta_regularizer: Optional regularizer for the beta weight.
+    gamma_regularizer: Optional regularizer for the gamma weight.
+    beta_constraint: Optional constraint for the beta weight.
+    gamma_constraint: Optional constraint for the gamma weight.
+    renorm: Whether to use Batch Renormalization
+      (https://arxiv.org/abs/1702.03275). This adds extra variables during
+      training. The inference is the same for either value of this parameter.
+    renorm_clipping: A dictionary that may map keys 'rmax', 'rmin', 'dmax' to
+      scalar `Tensors` used to clip the renorm correction. The correction
+      `(r, d)` is used as `corrected_value = normalized_value * r + d`, with
+      `r` clipped to [rmin, rmax], and `d` to [-dmax, dmax]. Missing rmax, rmin,
+      dmax are set to inf, 0, inf, respectively.
+    renorm_momentum: Momentum used to update the moving means and standard
+      deviations with renorm. Unlike `momentum`, this affects training
+      and should be neither too small (which would add noise) nor too large
+      (which would give stale estimates). Note that `momentum` is still applied
+      to get the means and variances for inference.
+    fused: if `None` or `True`, use a faster, fused implementation if possible.
+      If `False`, use the system recommended implementation.
+    trainable: Boolean, if `True` also add variables to the graph collection
+      `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable).
+    virtual_batch_size: An `int`. By default, `virtual_batch_size` is `None`,
+      which means batch normalization is performed across the whole batch. When
+      `virtual_batch_size` is not `None`, instead perform "Ghost Batch
+      Normalization", which creates virtual sub-batches which are each
+      normalized separately (with shared gamma, beta, and moving statistics).
+      Must divide the actual batch size during execution.
+    adjustment: A function taking the `Tensor` containing the (dynamic) shape of
+      the input tensor and returning a pair (scale, bias) to apply to the
+      normalized values (before gamma and beta), only during training. For
+      example, if axis==-1,
+        `adjustment = lambda shape: (
+          tf.random_uniform(shape[-1:], 0.93, 1.07),
+          tf.random_uniform(shape[-1:], -0.1, 0.1))`
+      will scale the normalized value by up to 7% up or down, then shift the
+      result by up to 0.1 (with independent scaling and bias for each feature
+      but shared across all examples), and finally apply gamma and/or beta. If
+      `None`, no adjustment is applied. Cannot be specified if
+      virtual_batch_size is specified.
 
   Input shape:
       Arbitrary. Use the keyword argument `input_shape`
@@ -87,33 +133,507 @@ class BatchNormalization(tf_normalization_layers.BatchNormalization, Layer):
                gamma_regularizer=None,
                beta_constraint=None,
                gamma_constraint=None,
+               renorm=False,
+               renorm_clipping=None,
+               renorm_momentum=0.99,
+               fused=None,
+               trainable=True,
+               virtual_batch_size=None,
+               adjustment=None,
+               name=None,
                **kwargs):
-    self.supports_masking = True
     super(BatchNormalization, self).__init__(
-        axis=axis,
-        momentum=momentum,
-        epsilon=epsilon,
-        center=center,
-        scale=scale,
-        beta_initializer=initializers.get(beta_initializer),
-        gamma_initializer=initializers.get(gamma_initializer),
-        moving_mean_initializer=initializers.get(moving_mean_initializer),
-        moving_variance_initializer=initializers.get(
-            moving_variance_initializer),
-        beta_regularizer=regularizers.get(beta_regularizer),
-        gamma_regularizer=regularizers.get(gamma_regularizer),
-        beta_constraint=constraints.get(beta_constraint),
-        gamma_constraint=constraints.get(gamma_constraint),
-        **kwargs
-    )
+        name=name, trainable=trainable, **kwargs)
+    if isinstance(axis, list):
+      self.axis = axis[:]
+    else:
+      self.axis = axis
+    self.momentum = momentum
+    self.epsilon = epsilon
+    self.center = center
+    self.scale = scale
+    self.beta_initializer = initializers.get(beta_initializer)
+    self.gamma_initializer = initializers.get(gamma_initializer)
+    self.moving_mean_initializer = initializers.get(moving_mean_initializer)
+    self.moving_variance_initializer = initializers.get(
+        moving_variance_initializer)
+    self.beta_regularizer = regularizers.get(beta_regularizer)
+    self.gamma_regularizer = regularizers.get(gamma_regularizer)
+    self.beta_constraint = constraints.get(beta_constraint)
+    self.gamma_constraint = constraints.get(gamma_constraint)
+    self.renorm = renorm
+    self.virtual_batch_size = virtual_batch_size
+    self.adjustment = adjustment
+    if fused is None:
+      fused = True
+    self.supports_masking = True
+
+    self.fused = fused
+    self._bessels_correction_test_only = True
+
+    if renorm:
+      renorm_clipping = renorm_clipping or {}
+      keys = ['rmax', 'rmin', 'dmax']
+      if set(renorm_clipping) - set(keys):
+        raise ValueError('renorm_clipping %s contains keys not in %s' %
+                         (renorm_clipping, keys))
+      self.renorm_clipping = renorm_clipping
+      self.renorm_momentum = renorm_momentum
+
+  def _add_tower_local_variable(self, *args, **kwargs):
+    tower_context = distribute_lib.get_tower_context()
+    with tower_context.tower_local_var_scope('mean'):
+      return self.add_variable(*args, **kwargs)
+
+  def build(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape)
+    if not input_shape.ndims:
+      raise ValueError('Input has undefined rank:', input_shape)
+    ndims = len(input_shape)
+
+    # Convert axis to list and resolve negatives
+    if isinstance(self.axis, int):
+      self.axis = [self.axis]
+
+    if not isinstance(self.axis, list):
+      raise TypeError('axis must be int or list, type given: %s'
+                      % type(self.axis))
+
+    for idx, x in enumerate(self.axis):
+      if x < 0:
+        self.axis[idx] = ndims + x
+
+    # Validate axes
+    for x in self.axis:
+      if x < 0 or x >= ndims:
+        raise ValueError('Invalid axis: %d' % x)
+    if len(self.axis) != len(set(self.axis)):
+      raise ValueError('Duplicate axis: %s' % self.axis)
+
+    if self.virtual_batch_size is not None:
+      if self.virtual_batch_size <= 0:
+        raise ValueError('virtual_batch_size must be a positive integer that '
+                         'divides the true batch size of the input Tensor')
+      # If using virtual batches, the first dimension must be the batch
+      # dimension and cannot be the batch norm axis
+      if 0 in self.axis:
+        raise ValueError('When using virtual_batch_size, the batch dimension '
+                         'must be 0 and thus axis cannot include 0')
+      if self.adjustment is not None:
+        raise ValueError('When using virtual_batch_size, adjustment cannot '
+                         'be specified')
+
+    if self.fused:
+      # Currently fused batch norm doesn't support renorm. It also only supports
+      # an input tensor of rank 4 and a channel dimension on axis 1 or 3.
+      # TODO(yaozhang): if input is not 4D, reshape it to 4D and reshape the
+      # output back to its original shape accordingly.
+      self.fused = (not self.renorm and
+                    ndims == 4 and
+                    self.axis in [[1], [3]] and
+                    self.virtual_batch_size is None and
+                    self.adjustment is None)
+      # TODO(chrisying): fused batch norm is currently not supported for
+      # multi-axis batch norm and by extension virtual batches. In some cases,
+      # it might be possible to use fused batch norm but would require reshaping
+      # the Tensor to 4D with the axis in 1 or 3 (preferred 1) which is
+      # particularly tricky. A compromise might be to just support the most
+      # common use case (turning 5D w/ virtual batch to NCHW)
+
+    if self.fused:
+      if self.axis == [1]:
+        self._data_format = 'NCHW'
+      elif self.axis == [3]:
+        self._data_format = 'NHWC'
+      else:
+        raise ValueError('Unsupported axis, fused batch norm only supports '
+                         'axis == [1] or axis == [3]')
+
+    # Raise parameters of fp16 batch norm to fp32
+    if self.dtype == dtypes.float16 or self.dtype == dtypes.bfloat16:
+      param_dtype = dtypes.float32
+    else:
+      param_dtype = self.dtype or dtypes.float32
+
+    axis_to_dim = {x: input_shape[x].value for x in self.axis}
+    for x in axis_to_dim:
+      if axis_to_dim[x] is None:
+        raise ValueError('Input has undefined `axis` dimension. Input shape: ',
+                         input_shape)
+    self.input_spec = InputSpec(ndim=ndims, axes=axis_to_dim)
+
+    if len(axis_to_dim) == 1 and self.virtual_batch_size is None:
+      # Single axis batch norm (most common/default use-case)
+      param_shape = (list(axis_to_dim.values())[0],)
+    else:
+      # Parameter shape is the original shape but with 1 in all non-axis dims
+      param_shape = [axis_to_dim[i] if i in axis_to_dim
+                     else 1 for i in range(ndims)]
+      if self.virtual_batch_size is not None:
+        # When using virtual batches, add an extra dim at index 1
+        param_shape.insert(1, 1)
+        for idx, x in enumerate(self.axis):
+          self.axis[idx] = x + 1      # Account for added dimension
+
+    if self.scale:
+      self.gamma = self.add_variable(
+          name='gamma',
+          shape=param_shape,
+          dtype=param_dtype,
+          initializer=self.gamma_initializer,
+          regularizer=self.gamma_regularizer,
+          constraint=self.gamma_constraint,
+          trainable=True)
+    else:
+      self.gamma = None
+      if self.fused:
+        self._gamma_const = array_ops.constant(
+            1.0, dtype=param_dtype, shape=param_shape)
+
+    if self.center:
+      self.beta = self.add_variable(
+          name='beta',
+          shape=param_shape,
+          dtype=param_dtype,
+          initializer=self.beta_initializer,
+          regularizer=self.beta_regularizer,
+          constraint=self.beta_constraint,
+          trainable=True)
+    else:
+      self.beta = None
+      if self.fused:
+        self._beta_const = array_ops.constant(
+            0.0, dtype=param_dtype, shape=param_shape)
+
+    try:
+      # Disable variable partitioning when creating the moving mean and variance
+      if hasattr(self, '_scope') and self._scope:
+        partitioner = self._scope.partitioner
+        self._scope.set_partitioner(None)
+      else:
+        partitioner = None
+      self.moving_mean = self._add_tower_local_variable(
+          name='moving_mean',
+          shape=param_shape,
+          dtype=param_dtype,
+          initializer=self.moving_mean_initializer,
+          trainable=False)
+
+      self.moving_variance = self._add_tower_local_variable(
+          name='moving_variance',
+          shape=param_shape,
+          dtype=param_dtype,
+          initializer=self.moving_variance_initializer,
+          trainable=False)
+
+      if self.renorm:
+        # Create variables to maintain the moving mean and standard deviation.
+        # These are used in training and thus are different from the moving
+        # averages above. The renorm variables are colocated with moving_mean
+        # and moving_variance.
+        # NOTE: below, the outer `with device` block causes the current device
+        # stack to be cleared. The nested ones use a `lambda` to set the desired
+        # device and ignore any devices that may be set by the custom getter.
+        def _renorm_variable(name, shape):
+          var = self._add_tower_local_variable(
+              name=name,
+              shape=shape,
+              dtype=param_dtype,
+              initializer=init_ops.zeros_initializer(),
+              trainable=False)
+          return var
+
+        with distribute_lib.get_distribution_strategy().colocate_vars_with(
+            self.moving_mean):
+          self.renorm_mean = _renorm_variable('renorm_mean', param_shape)
+          self.renorm_mean_weight = _renorm_variable('renorm_mean_weight', ())
+        # We initialize renorm_stddev to 0, and maintain the (0-initialized)
+        # renorm_stddev_weight. This allows us to (1) mix the average
+        # stddev with the minibatch stddev early in training, and (2) compute
+        # the unbiased average stddev by dividing renorm_stddev by the weight.
+        with distribute_lib.get_distribution_strategy().colocate_vars_with(
+            self.moving_variance):
+          self.renorm_stddev = _renorm_variable('renorm_stddev', param_shape)
+          self.renorm_stddev_weight = _renorm_variable('renorm_stddev_weight',
+                                                       ())
+    finally:
+      if partitioner:
+        self._scope.set_partitioner(partitioner)
+    self.built = True
+
+  def _assign_moving_average(self, variable, value, momentum):
+    with ops.name_scope(None, 'AssignMovingAvg',
+                        [variable, value, momentum]) as scope:
+      decay = ops.convert_to_tensor(1.0 - momentum, name='decay')
+      if decay.dtype != variable.dtype.base_dtype:
+        decay = math_ops.cast(decay, variable.dtype.base_dtype)
+      update_delta = (variable - value) * decay
+      return state_ops.assign_sub(variable, update_delta, name=scope)
+
+  def _fused_batch_norm(self, inputs, training):
+    """Returns the output of fused batch norm."""
+    beta = self.beta if self.center else self._beta_const
+    gamma = self.gamma if self.scale else self._gamma_const
+
+    def _fused_batch_norm_training():
+      return nn.fused_batch_norm(
+          inputs,
+          gamma,
+          beta,
+          epsilon=self.epsilon,
+          data_format=self._data_format)
+
+    def _fused_batch_norm_inference():
+      return nn.fused_batch_norm(
+          inputs,
+          gamma,
+          beta,
+          mean=self.moving_mean,
+          variance=self.moving_variance,
+          epsilon=self.epsilon,
+          is_training=False,
+          data_format=self._data_format)
+
+    output, mean, variance = tf_utils.smart_cond(
+        training, _fused_batch_norm_training, _fused_batch_norm_inference)
+    if not self._bessels_correction_test_only:
+      # Remove Bessel's correction to be consistent with non-fused batch norm.
+      # Note that the variance computed by fused batch norm is
+      # with Bessel's correction.
+      sample_size = math_ops.cast(
+          array_ops.size(inputs) / array_ops.size(variance), variance.dtype)
+      factor = (sample_size - math_ops.cast(1.0, variance.dtype)) / sample_size
+      variance *= factor
+
+    training_value = tf_utils.constant_value(training)
+    if training_value is None:
+      momentum = tf_utils.smart_cond(training,
+                                     lambda: self.momentum,
+                                     lambda: 1.0)
+    else:
+      momentum = ops.convert_to_tensor(self.momentum)
+    if training_value or training_value is None:
+      mean_update = self._assign_moving_average(self.moving_mean, mean,
+                                                momentum)
+      variance_update = self._assign_moving_average(self.moving_variance,
+                                                    variance, momentum)
+      self.add_update(mean_update, inputs=True)
+      self.add_update(variance_update, inputs=True)
+
+    return output
+
+  def _renorm_correction_and_moments(self, mean, variance, training):
+    """Returns the correction and update values for renorm."""
+    stddev = math_ops.sqrt(variance + self.epsilon)
+    # Compute the average mean and standard deviation, as if they were
+    # initialized with this batch's moments.
+    mixed_renorm_mean = (self.renorm_mean +
+                         (1. - self.renorm_mean_weight) * mean)
+    mixed_renorm_stddev = (self.renorm_stddev +
+                           (1. - self.renorm_stddev_weight) * stddev)
+    # Compute the corrections for batch renorm.
+    r = stddev / mixed_renorm_stddev
+    d = (mean - mixed_renorm_mean) / mixed_renorm_stddev
+    # Ensure the corrections use pre-update moving averages.
+    with ops.control_dependencies([r, d]):
+      mean = array_ops.identity(mean)
+      stddev = array_ops.identity(stddev)
+    rmin, rmax, dmax = [self.renorm_clipping.get(key)
+                        for key in ['rmin', 'rmax', 'dmax']]
+    if rmin is not None:
+      r = math_ops.maximum(r, rmin)
+    if rmax is not None:
+      r = math_ops.minimum(r, rmax)
+    if dmax is not None:
+      d = math_ops.maximum(d, -dmax)
+      d = math_ops.minimum(d, dmax)
+    # When not training, use r=1, d=0.
+    r = tf_utils.smart_cond(training, lambda: r, lambda: array_ops.ones_like(r))
+    d = tf_utils.smart_cond(training,
+                            lambda: d,
+                            lambda: array_ops.zeros_like(d))
+
+    def _update_renorm_variable(var, weight, value):
+      """Updates a moving average and weight, returns the unbiased value."""
+      value = array_ops.identity(value)
+      def _do_update():
+        """Updates the var and weight, returns their updated ratio."""
+        # Update the variables without zero debiasing. The debiasing will be
+        # accomplished by dividing the exponential moving average by the weight.
+        # For example, after a single update, the moving average would be
+        # (1-decay) * value. and the weight will be 1-decay, with their ratio
+        # giving the value.
+        # Make sure the weight is not updated until before r and d computation.
+        with ops.control_dependencies([value]):
+          weight_value = array_ops.constant(1., dtype=weight.dtype)
+        new_var = self._assign_moving_average(var, value, self.renorm_momentum)
+        new_weight = self._assign_moving_average(weight, weight_value,
+                                                 self.renorm_momentum)
+        # TODO(yuefengz): the updates to var and weighted can not be batched
+        # together if we fetch their updated values here. Consider calculating
+        # new values and delaying the updates.
+        return new_var / new_weight
+
+      def _fake_update():
+        return array_ops.identity(var)
+      return tf_utils.smart_cond(training, _do_update, _fake_update)
+
+    # TODO(yuefengz): colocate the operations
+    new_mean = _update_renorm_variable(self.renorm_mean,
+                                       self.renorm_mean_weight, mean)
+    new_stddev = _update_renorm_variable(self.renorm_stddev,
+                                         self.renorm_stddev_weight, stddev)
+    # Make sqrt(moving_variance + epsilon) = new_stddev.
+    new_variance = math_ops.square(new_stddev) - self.epsilon
+
+    return (r, d, new_mean, new_variance)
 
   def call(self, inputs, training=None):
     if training is None:
       training = K.learning_phase()
-    output = super(BatchNormalization, self).call(inputs, training=training)
+
+    in_eager_mode = context.executing_eagerly()
+    if self.virtual_batch_size is not None:
+      # Virtual batches (aka ghost batches) can be simulated by reshaping the
+      # Tensor and reusing the existing batch norm implementation
+      original_shape = [-1] + inputs.shape.as_list()[1:]
+      expanded_shape = [self.virtual_batch_size, -1] + original_shape[1:]
+
+      # Will cause errors if virtual_batch_size does not divide the batch size
+      inputs = array_ops.reshape(inputs, expanded_shape)
+
+      def undo_virtual_batching(outputs):
+        outputs = array_ops.reshape(outputs, original_shape)
+        return outputs
+
+    if self.fused:
+      outputs = self._fused_batch_norm(inputs, training=training)
+      if self.virtual_batch_size is not None:
+        # Currently never reaches here since fused_batch_norm does not support
+        # virtual batching
+        outputs = undo_virtual_batching(outputs)
+      if not context.executing_eagerly() and training is K.learning_phase():
+        outputs._uses_learning_phase = True  # pylint: disable=protected-access
+      return outputs
+
+    # Compute the axes along which to reduce the mean / variance
+    input_shape = inputs.get_shape()
+    ndims = len(input_shape)
+    reduction_axes = [i for i in range(ndims) if i not in self.axis]
+    if self.virtual_batch_size is not None:
+      del reduction_axes[1]     # Do not reduce along virtual batch dim
+
+    # Broadcasting only necessary for single-axis batch norm where the axis is
+    # not the last dimension
+    broadcast_shape = [1] * ndims
+    broadcast_shape[self.axis[0]] = input_shape[self.axis[0]].value
+    def _broadcast(v):
+      if (v is not None and
+          len(v.get_shape()) != ndims and
+          reduction_axes != list(range(ndims - 1))):
+        return array_ops.reshape(v, broadcast_shape)
+      return v
+
+    scale, offset = _broadcast(self.gamma), _broadcast(self.beta)
+
+    def _compose_transforms(scale, offset, then_scale, then_offset):
+      if then_scale is not None:
+        scale *= then_scale
+        offset *= then_scale
+      if then_offset is not None:
+        offset += then_offset
+      return (scale, offset)
+
+    # Determine a boolean value for `training`: could be True, False, or None.
+    training_value = tf_utils.constant_value(training)
+    if training_value is not False:
+      if self.adjustment:
+        adj_scale, adj_bias = self.adjustment(array_ops.shape(inputs))
+        # Adjust only during training.
+        adj_scale = tf_utils.smart_cond(training,
+                                        lambda: adj_scale,
+                                        lambda: array_ops.ones_like(adj_scale))
+        adj_bias = tf_utils.smart_cond(training,
+                                       lambda: adj_bias,
+                                       lambda: array_ops.zeros_like(adj_bias))
+        scale, offset = _compose_transforms(adj_scale, adj_bias, scale, offset)
+
+      # Some of the computations here are not necessary when training==False
+      # but not a constant. However, this makes the code simpler.
+      keep_dims = self.virtual_batch_size is not None or len(self.axis) > 1
+      mean, variance = nn.moments(inputs, reduction_axes, keep_dims=keep_dims)
+
+      moving_mean = self.moving_mean
+      moving_variance = self.moving_variance
+
+      mean = tf_utils.smart_cond(training,
+                                 lambda: mean,
+                                 lambda: moving_mean)
+      variance = tf_utils.smart_cond(training,
+                                     lambda: variance,
+                                     lambda: moving_variance)
+
+      if self.renorm:
+        r, d, new_mean, new_variance = self._renorm_correction_and_moments(
+            mean, variance, training)
+        # When training, the normalized values (say, x) will be transformed as
+        # x * gamma + beta without renorm, and (x * r + d) * gamma + beta
+        # = x * (r * gamma) + (d * gamma + beta) with renorm.
+        r = _broadcast(array_ops.stop_gradient(r, name='renorm_r'))
+        d = _broadcast(array_ops.stop_gradient(d, name='renorm_d'))
+        scale, offset = _compose_transforms(r, d, scale, offset)
+      else:
+        new_mean, new_variance = mean, variance
+
+      if self.virtual_batch_size is not None:
+        # This isn't strictly correct since in ghost batch norm, you are
+        # supposed to sequentially update the moving_mean and moving_variance
+        # with each sub-batch. However, since the moving statistics are only
+        # used during evaluation, it is more efficient to just update in one
+        # step and should not make a significant difference in the result.
+        new_mean = math_ops.reduce_mean(new_mean,
+                                        axis=1, keep_dims=True)
+        new_variance = math_ops.reduce_mean(new_variance,
+                                            axis=1, keep_dims=True)
+
+      def _do_update(var, value):
+        if in_eager_mode and not self.trainable:
+          return
+
+        return self._assign_moving_average(var, value, self.momentum)
+
+      mean_update = tf_utils.smart_cond(
+          training,
+          lambda: _do_update(self.moving_mean, new_mean),
+          lambda: self.moving_mean)
+      variance_update = tf_utils.smart_cond(
+          training,
+          lambda: _do_update(self.moving_variance, new_variance),
+          lambda: self.moving_variance)
+      if not context.executing_eagerly():
+        self.add_update(mean_update, inputs=True)
+        self.add_update(variance_update, inputs=True)
+
+    else:
+      mean, variance = self.moving_mean, self.moving_variance
+
+    outputs = nn.batch_normalization(inputs,
+                                     _broadcast(mean),
+                                     _broadcast(variance),
+                                     offset,
+                                     scale,
+                                     self.epsilon)
+    # If some components of the shape got lost due to adjustments, fix that.
+    outputs.set_shape(input_shape)
+
+    if self.virtual_batch_size is not None:
+      outputs = undo_virtual_batching(outputs)
     if not context.executing_eagerly() and training is K.learning_phase():
-      output._uses_learning_phase = True  # pylint: disable=protected-access
-    return output
+      outputs._uses_learning_phase = True  # pylint: disable=protected-access
+    return outputs
+
+  def compute_output_shape(self, input_shape):
+    return input_shape
 
   def get_config(self):
     config = {
@@ -133,5 +653,19 @@ class BatchNormalization(tf_normalization_layers.BatchNormalization, Layer):
         'beta_constraint': constraints.serialize(self.beta_constraint),
         'gamma_constraint': constraints.serialize(self.gamma_constraint)
     }
+    # Only add TensorFlow-specific parameters if they are set, so as to preserve
+    # model compatibility with external Keras.
+    if self.renorm:
+      config['renorm'] = True
+      config['renorm_clipping'] = self.renorm_clipping
+      config['renorm_momentum'] = self.renorm_momentum
+    if self.virtual_batch_size is not None:
+      config['virtual_batch_size'] = self.virtual_batch_size
+    # Note: adjustment is not serializable.
+    if self.adjustment is not None:
+      logging.warning('The `adjustment` function of this `BatchNormalization` '
+                      'layer cannot be serialized and has been omitted from '
+                      'the layer config. It will not be included when '
+                      're-creating the layer from the saved config.')
     base_config = super(BatchNormalization, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
diff --git a/tensorflow/python/keras/_impl/keras/layers/normalization_test.py b/tensorflow/python/keras/_impl/keras/layers/normalization_test.py
index 2b3628c3f1023612297465bdf3286246261992a2..fa9277e3d1e5bb0b9633abc46a96a11816dddb2d 100644
--- a/tensorflow/python/keras/_impl/keras/layers/normalization_test.py
+++ b/tensorflow/python/keras/_impl/keras/layers/normalization_test.py
@@ -114,6 +114,26 @@ class NormalizationLayersTest(test.TestCase):
         np.testing.assert_allclose(np.mean(out, axis=(0, 2, 3)), 0.0, atol=1e-1)
         np.testing.assert_allclose(np.std(out, axis=(0, 2, 3)), 1.0, atol=1e-1)
 
+  def test_batchnorm_convnet_channel_last(self):
+    with self.test_session():
+      # keras.backend.set_learning_phase(True)
+
+      model = keras.models.Sequential()
+      norm = keras.layers.BatchNormalization(
+          axis=-1, input_shape=(4, 4, 3), momentum=0.8)
+      model.add(norm)
+      model.compile(loss='mse', optimizer='sgd')
+
+      # centered on 5.0, variance 10.0
+      x = np.random.normal(loc=5.0, scale=10.0, size=(1000, 4, 4, 3))
+      model.fit(x, x, epochs=4, verbose=0)
+      out = model.predict(x)
+      out -= np.reshape(keras.backend.eval(norm.beta), (1, 1, 1, 3))
+      out /= np.reshape(keras.backend.eval(norm.gamma), (1, 1, 1, 3))
+
+      np.testing.assert_allclose(np.mean(out, axis=(0, 1, 2)), 0.0, atol=1e-1)
+      np.testing.assert_allclose(np.std(out, axis=(0, 1, 2)), 1.0, atol=1e-1)
+
   def test_shared_batchnorm(self):
     """Test that a BN layer can be shared across different data streams.
     """
diff --git a/tensorflow/python/keras/_impl/keras/layers/pooling.py b/tensorflow/python/keras/_impl/keras/layers/pooling.py
index 15d53379769d8142f5b2755a07479f60751346d2..86bc8a680a529a9ea17592a42207fab58adeebce 100644
--- a/tensorflow/python/keras/_impl/keras/layers/pooling.py
+++ b/tensorflow/python/keras/_impl/keras/layers/pooling.py
@@ -19,16 +19,98 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.framework import tensor_shape
-from tensorflow.python.keras._impl.keras import backend as K
+from tensorflow.python.keras._impl.keras import backend
 from tensorflow.python.keras._impl.keras.engine import InputSpec
 from tensorflow.python.keras._impl.keras.engine import Layer
 from tensorflow.python.keras._impl.keras.utils import conv_utils
-from tensorflow.python.layers import pooling as tf_pooling_layers
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import nn
 from tensorflow.python.util.tf_export import tf_export
 
 
+class Pooling1D(Layer):
+  """Pooling layer for arbitrary pooling functions, for 1D inputs.
+
+  This class only exists for code reuse. It will never be an exposed API.
+
+  Arguments:
+    pool_function: The pooling function to apply, e.g. `tf.nn.max_pool`.
+    pool_size: An integer or tuple/list of a single integer,
+      representing the size of the pooling window.
+    strides: An integer or tuple/list of a single integer, specifying the
+      strides of the pooling operation.
+    padding: A string. The padding method, either 'valid' or 'same'.
+      Case-insensitive.
+    data_format: A string, one of `channels_last` (default) or `channels_first`.
+      The ordering of the dimensions in the inputs.
+      `channels_last` corresponds to inputs with shape
+      `(batch, length, channels)` while `channels_first` corresponds to
+      inputs with shape `(batch, channels, length)`.
+    name: A string, the name of the layer.
+  """
+
+  def __init__(self, pool_function, pool_size, strides,
+               padding='valid', data_format=None,
+               name=None, **kwargs):
+    super(Pooling1D, self).__init__(name=name, **kwargs)
+    if data_format is None:
+      data_format = backend.image_data_format()
+    if strides is None:
+      strides = pool_size
+    self.pool_function = pool_function
+    self.pool_size = conv_utils.normalize_tuple(pool_size, 1, 'pool_size')
+    self.strides = conv_utils.normalize_tuple(strides, 1, 'strides')
+    self.padding = conv_utils.normalize_padding(padding)
+    self.data_format = conv_utils.normalize_data_format(data_format)
+    self.input_spec = InputSpec(ndim=3)
+
+  def call(self, inputs):
+    # There is no TF op for 1D pooling, hence we make the inputs 4D.
+    if self.data_format == 'channels_last':
+      # input is NWC, make it NHWC
+      inputs = array_ops.expand_dims(inputs, 1)
+      # pool on the W dim
+      pool_shape = (1, 1) + self.pool_size + (1,)
+      strides = (1, 1) + self.strides + (1,)
+      data_format = 'NHWC'
+    else:
+      # input is NCW, make it NCHW
+      inputs = array_ops.expand_dims(inputs, 2)
+      # pool on the W dim
+      pool_shape = (1, 1, 1) + self.pool_size
+      strides = (1, 1, 1) + self.strides
+      data_format = 'NCHW'
+
+    outputs = self.pool_function(
+        inputs,
+        ksize=pool_shape,
+        strides=strides,
+        padding=self.padding.upper(),
+        data_format=data_format)
+
+    if self.data_format == 'channels_last':
+      return array_ops.squeeze(outputs, 1)
+    else:
+      return array_ops.squeeze(outputs, 2)
+
+  def compute_output_shape(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape).as_list()
+    length = conv_utils.conv_output_length(input_shape[1], self.pool_size[0],
+                                           self.padding, self.strides[0])
+    return tensor_shape.TensorShape([input_shape[0], length, input_shape[2]])
+
+  def get_config(self):
+    config = {
+        'strides': self.strides,
+        'pool_size': self.pool_size,
+        'padding': self.padding
+    }
+    base_config = super(Pooling1D, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
+
 @tf_export('keras.layers.MaxPool1D', 'keras.layers.MaxPooling1D')
-class MaxPooling1D(tf_pooling_layers.MaxPooling1D, Layer):
+class MaxPooling1D(Pooling1D):
   """Max pooling operation for temporal data.
 
   Arguments:
@@ -45,23 +127,20 @@ class MaxPooling1D(tf_pooling_layers.MaxPooling1D, Layer):
       3D tensor with shape: `(batch_size, downsampled_steps, features)`.
   """
 
-  def __init__(self, pool_size=2, strides=None, padding='valid', **kwargs):
-    if strides is None:
-      strides = pool_size
-    super(MaxPooling1D, self).__init__(pool_size, strides, padding, **kwargs)
+  def __init__(self, pool_size=2, strides=None,
+               padding='valid', data_format=None, **kwargs):
 
-  def get_config(self):
-    config = {
-        'strides': self.strides,
-        'pool_size': self.pool_size,
-        'padding': self.padding
-    }
-    base_config = super(MaxPooling1D, self).get_config()
-    return dict(list(base_config.items()) + list(config.items()))
+    super(MaxPooling1D, self).__init__(
+        nn.max_pool,
+        pool_size=pool_size,
+        strides=strides,
+        padding=padding,
+        data_format=data_format,
+        **kwargs)
 
 
 @tf_export('keras.layers.AveragePooling1D', 'keras.layers.AvgPool1D')
-class AveragePooling1D(tf_pooling_layers.AveragePooling1D, Layer):
+class AveragePooling1D(Pooling1D):
   """Average pooling for temporal data.
 
   Arguments:
@@ -78,24 +157,104 @@ class AveragePooling1D(tf_pooling_layers.AveragePooling1D, Layer):
       3D tensor with shape: `(batch_size, downsampled_steps, features)`.
   """
 
-  def __init__(self, pool_size=2, strides=None, padding='valid', **kwargs):
+  def __init__(self, pool_size=2, strides=None,
+               padding='valid', data_format=None, **kwargs):
+    super(AveragePooling1D, self).__init__(
+        nn.avg_pool,
+        pool_size=pool_size,
+        strides=strides,
+        padding=padding,
+        data_format=data_format,
+        **kwargs)
+
+
+class Pooling2D(Layer):
+  """Pooling layer for arbitrary pooling functions, for 2D inputs (e.g. images).
+
+  This class only exists for code reuse. It will never be an exposed API.
+
+  Arguments:
+    pool_function: The pooling function to apply, e.g. `tf.nn.max_pool`.
+    pool_size: An integer or tuple/list of 2 integers: (pool_height, pool_width)
+      specifying the size of the pooling window.
+      Can be a single integer to specify the same value for
+      all spatial dimensions.
+    strides: An integer or tuple/list of 2 integers,
+      specifying the strides of the pooling operation.
+      Can be a single integer to specify the same value for
+      all spatial dimensions.
+    padding: A string. The padding method, either 'valid' or 'same'.
+      Case-insensitive.
+    data_format: A string, one of `channels_last` (default) or `channels_first`.
+      The ordering of the dimensions in the inputs.
+      `channels_last` corresponds to inputs with shape
+      `(batch, height, width, channels)` while `channels_first` corresponds to
+      inputs with shape `(batch, channels, height, width)`.
+    name: A string, the name of the layer.
+  """
+
+  def __init__(self, pool_function, pool_size, strides,
+               padding='valid', data_format=None,
+               name=None, **kwargs):
+    super(Pooling2D, self).__init__(name=name, **kwargs)
+    if data_format is None:
+      data_format = backend.image_data_format()
     if strides is None:
       strides = pool_size
-    super(AveragePooling1D, self).__init__(pool_size, strides, padding,
-                                           **kwargs)
+    self.pool_function = pool_function
+    self.pool_size = conv_utils.normalize_tuple(pool_size, 2, 'pool_size')
+    self.strides = conv_utils.normalize_tuple(strides, 2, 'strides')
+    self.padding = conv_utils.normalize_padding(padding)
+    self.data_format = conv_utils.normalize_data_format(data_format)
+    self.input_spec = InputSpec(ndim=4)
+
+  def call(self, inputs):
+    if self.data_format == 'channels_last':
+      pool_shape = (1,) + self.pool_size + (1,)
+      strides = (1,) + self.strides + (1,)
+    else:
+      pool_shape = (1, 1) + self.pool_size
+      strides = (1, 1) + self.strides
+    outputs = self.pool_function(
+        inputs,
+        ksize=pool_shape,
+        strides=strides,
+        padding=self.padding.upper(),
+        data_format=conv_utils.convert_data_format(self.data_format, 4))
+    return outputs
+
+  def compute_output_shape(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape).as_list()
+    if self.data_format == 'channels_first':
+      rows = input_shape[2]
+      cols = input_shape[3]
+    else:
+      rows = input_shape[1]
+      cols = input_shape[2]
+    rows = conv_utils.conv_output_length(rows, self.pool_size[0], self.padding,
+                                         self.strides[0])
+    cols = conv_utils.conv_output_length(cols, self.pool_size[1], self.padding,
+                                         self.strides[1])
+    if self.data_format == 'channels_first':
+      return tensor_shape.TensorShape(
+          [input_shape[0], input_shape[1], rows, cols])
+    else:
+      return tensor_shape.TensorShape(
+          [input_shape[0], rows, cols, input_shape[3]])
 
   def get_config(self):
     config = {
-        'strides': self.strides,
         'pool_size': self.pool_size,
-        'padding': self.padding
+        'padding': self.padding,
+        'strides': self.strides,
+        'data_format': self.data_format
     }
-    base_config = super(AveragePooling1D, self).get_config()
+    base_config = super(Pooling2D, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
 
 
 @tf_export('keras.layers.MaxPool2D', 'keras.layers.MaxPooling2D')
-class MaxPooling2D(tf_pooling_layers.MaxPooling2D, Layer):
+class MaxPooling2D(Pooling2D):
   """Max pooling operation for spatial data.
 
   Arguments:
@@ -142,26 +301,14 @@ class MaxPooling2D(tf_pooling_layers.MaxPooling2D, Layer):
                padding='valid',
                data_format=None,
                **kwargs):
-    if data_format is None:
-      data_format = K.image_data_format()
-    if strides is None:
-      strides = pool_size
-    super(MaxPooling2D, self).__init__(pool_size, strides, padding, data_format,
-                                       **kwargs)
-
-  def get_config(self):
-    config = {
-        'pool_size': self.pool_size,
-        'padding': self.padding,
-        'strides': self.strides,
-        'data_format': self.data_format
-    }
-    base_config = super(MaxPooling2D, self).get_config()
-    return dict(list(base_config.items()) + list(config.items()))
+    super(MaxPooling2D, self).__init__(
+        nn.max_pool,
+        pool_size=pool_size, strides=strides,
+        padding=padding, data_format=data_format, **kwargs)
 
 
 @tf_export('keras.layers.AveragePooling2D', 'keras.layers.AvgPool2D')
-class AveragePooling2D(tf_pooling_layers.AveragePooling2D, Layer):
+class AveragePooling2D(Pooling2D):
   """Average pooling operation for spatial data.
 
   Arguments:
@@ -208,12 +355,96 @@ class AveragePooling2D(tf_pooling_layers.AveragePooling2D, Layer):
                padding='valid',
                data_format=None,
                **kwargs):
+    super(AveragePooling2D, self).__init__(
+        nn.avg_pool,
+        pool_size=pool_size, strides=strides,
+        padding=padding, data_format=data_format, **kwargs)
+
+
+class Pooling3D(Layer):
+  """Pooling layer for arbitrary pooling functions, for 3D inputs.
+
+  This class only exists for code reuse. It will never be an exposed API.
+
+  Arguments:
+    pool_function: The pooling function to apply, e.g. `tf.nn.max_pool`.
+    pool_size: An integer or tuple/list of 3 integers:
+      (pool_depth, pool_height, pool_width)
+      specifying the size of the pooling window.
+      Can be a single integer to specify the same value for
+      all spatial dimensions.
+    strides: An integer or tuple/list of 3 integers,
+      specifying the strides of the pooling operation.
+      Can be a single integer to specify the same value for
+      all spatial dimensions.
+    padding: A string. The padding method, either 'valid' or 'same'.
+      Case-insensitive.
+    data_format: A string, one of `channels_last` (default) or `channels_first`.
+      The ordering of the dimensions in the inputs.
+      `channels_last` corresponds to inputs with shape
+      `(batch, depth, height, width, channels)`
+      while `channels_first` corresponds to
+      inputs with shape `(batch, channels, depth, height, width)`.
+    name: A string, the name of the layer.
+  """
+
+  def __init__(self, pool_function, pool_size, strides,
+               padding='valid', data_format='channels_last',
+               name=None, **kwargs):
+    super(Pooling3D, self).__init__(name=name, **kwargs)
     if data_format is None:
-      data_format = K.image_data_format()
+      data_format = backend.image_data_format()
     if strides is None:
       strides = pool_size
-    super(AveragePooling2D, self).__init__(pool_size, strides, padding,
-                                           data_format, **kwargs)
+    self.pool_function = pool_function
+    self.pool_size = conv_utils.normalize_tuple(pool_size, 3, 'pool_size')
+    self.strides = conv_utils.normalize_tuple(strides, 3, 'strides')
+    self.padding = conv_utils.normalize_padding(padding)
+    self.data_format = conv_utils.normalize_data_format(data_format)
+    self.input_spec = InputSpec(ndim=5)
+
+  def call(self, inputs):
+    pool_shape = (1,) + self.pool_size + (1,)
+    strides = (1,) + self.strides + (1,)
+
+    if self.data_format == 'channels_first':
+      # TF does not support `channels_first` with 3D pooling operations,
+      # so we must handle this case manually.
+      # TODO(fchollet): remove this when TF pooling is feature-complete.
+      inputs = array_ops.transpose(inputs, (0, 2, 3, 4, 1))
+
+    outputs = self.pool_function(
+        inputs,
+        ksize=pool_shape,
+        strides=strides,
+        padding=self.padding.upper())
+
+    if self.data_format == 'channels_first':
+      outputs = array_ops.transpose(outputs, (0, 4, 1, 2, 3))
+    return outputs
+
+  def compute_output_shape(self, input_shape):
+    input_shape = tensor_shape.TensorShape(input_shape).as_list()
+    if self.data_format == 'channels_first':
+      len_dim1 = input_shape[2]
+      len_dim2 = input_shape[3]
+      len_dim3 = input_shape[4]
+    else:
+      len_dim1 = input_shape[1]
+      len_dim2 = input_shape[2]
+      len_dim3 = input_shape[3]
+    len_dim1 = conv_utils.conv_output_length(len_dim1, self.pool_size[0],
+                                             self.padding, self.strides[0])
+    len_dim2 = conv_utils.conv_output_length(len_dim2, self.pool_size[1],
+                                             self.padding, self.strides[1])
+    len_dim3 = conv_utils.conv_output_length(len_dim3, self.pool_size[2],
+                                             self.padding, self.strides[2])
+    if self.data_format == 'channels_first':
+      return tensor_shape.TensorShape(
+          [input_shape[0], input_shape[1], len_dim1, len_dim2, len_dim3])
+    else:
+      return tensor_shape.TensorShape(
+          [input_shape[0], len_dim1, len_dim2, len_dim3, input_shape[4]])
 
   def get_config(self):
     config = {
@@ -222,12 +453,12 @@ class AveragePooling2D(tf_pooling_layers.AveragePooling2D, Layer):
         'strides': self.strides,
         'data_format': self.data_format
     }
-    base_config = super(AveragePooling2D, self).get_config()
+    base_config = super(Pooling3D, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
 
 
 @tf_export('keras.layers.MaxPool3D', 'keras.layers.MaxPooling3D')
-class MaxPooling3D(tf_pooling_layers.MaxPooling3D, Layer):
+class MaxPooling3D(Pooling3D):
   """Max pooling operation for 3D data (spatial or spatio-temporal).
 
   Arguments:
@@ -270,26 +501,14 @@ class MaxPooling3D(tf_pooling_layers.MaxPooling3D, Layer):
                padding='valid',
                data_format=None,
                **kwargs):
-    if data_format is None:
-      data_format = K.image_data_format()
-    if strides is None:
-      strides = pool_size
-    super(MaxPooling3D, self).__init__(pool_size, strides, padding, data_format,
-                                       **kwargs)
-
-  def get_config(self):
-    config = {
-        'pool_size': self.pool_size,
-        'padding': self.padding,
-        'strides': self.strides,
-        'data_format': self.data_format
-    }
-    base_config = super(MaxPooling3D, self).get_config()
-    return dict(list(base_config.items()) + list(config.items()))
+    super(MaxPooling3D, self).__init__(
+        nn.max_pool3d,
+        pool_size=pool_size, strides=strides,
+        padding=padding, data_format=data_format, **kwargs)
 
 
 @tf_export('keras.layers.AveragePooling3D', 'keras.layers.AvgPool3D')
-class AveragePooling3D(tf_pooling_layers.AveragePooling3D, Layer):
+class AveragePooling3D(Pooling3D):
   """Average pooling operation for 3D data (spatial or spatio-temporal).
 
   Arguments:
@@ -332,30 +551,18 @@ class AveragePooling3D(tf_pooling_layers.AveragePooling3D, Layer):
                padding='valid',
                data_format=None,
                **kwargs):
-    if data_format is None:
-      data_format = K.image_data_format()
-    if strides is None:
-      strides = pool_size
-    super(AveragePooling3D, self).__init__(pool_size, strides, padding,
-                                           data_format, **kwargs)
-
-  def get_config(self):
-    config = {
-        'pool_size': self.pool_size,
-        'padding': self.padding,
-        'strides': self.strides,
-        'data_format': self.data_format
-    }
-    base_config = super(AveragePooling3D, self).get_config()
-    return dict(list(base_config.items()) + list(config.items()))
+    super(AveragePooling3D, self).__init__(
+        nn.avg_pool3d,
+        pool_size=pool_size, strides=strides,
+        padding=padding, data_format=data_format, **kwargs)
 
 
-class _GlobalPooling1D(Layer):
+class GlobalPooling1D(Layer):
   """Abstract class for different global pooling 1D layers.
   """
 
   def __init__(self, **kwargs):
-    super(_GlobalPooling1D, self).__init__(**kwargs)
+    super(GlobalPooling1D, self).__init__(**kwargs)
     self.input_spec = InputSpec(ndim=3)
 
   def compute_output_shape(self, input_shape):
@@ -368,7 +575,7 @@ class _GlobalPooling1D(Layer):
 
 @tf_export('keras.layers.GlobalAveragePooling1D',
            'keras.layers.GlobalAvgPool1D')
-class GlobalAveragePooling1D(_GlobalPooling1D):
+class GlobalAveragePooling1D(GlobalPooling1D):
   """Global average pooling operation for temporal data.
 
   Input shape:
@@ -380,11 +587,11 @@ class GlobalAveragePooling1D(_GlobalPooling1D):
   """
 
   def call(self, inputs):
-    return K.mean(inputs, axis=1)
+    return backend.mean(inputs, axis=1)
 
 
 @tf_export('keras.layers.GlobalMaxPool1D', 'keras.layers.GlobalMaxPooling1D')
-class GlobalMaxPooling1D(_GlobalPooling1D):
+class GlobalMaxPooling1D(GlobalPooling1D):
   """Global max pooling operation for temporal data.
 
   Input shape:
@@ -396,15 +603,15 @@ class GlobalMaxPooling1D(_GlobalPooling1D):
   """
 
   def call(self, inputs):
-    return K.max(inputs, axis=1)
+    return backend.max(inputs, axis=1)
 
 
-class _GlobalPooling2D(Layer):
+class GlobalPooling2D(Layer):
   """Abstract class for different global pooling 2D layers.
   """
 
   def __init__(self, data_format=None, **kwargs):
-    super(_GlobalPooling2D, self).__init__(**kwargs)
+    super(GlobalPooling2D, self).__init__(**kwargs)
     self.data_format = conv_utils.normalize_data_format(data_format)
     self.input_spec = InputSpec(ndim=4)
 
@@ -420,13 +627,13 @@ class _GlobalPooling2D(Layer):
 
   def get_config(self):
     config = {'data_format': self.data_format}
-    base_config = super(_GlobalPooling2D, self).get_config()
+    base_config = super(GlobalPooling2D, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
 
 
 @tf_export('keras.layers.GlobalAveragePooling2D',
            'keras.layers.GlobalAvgPool2D')
-class GlobalAveragePooling2D(_GlobalPooling2D):
+class GlobalAveragePooling2D(GlobalPooling2D):
   """Global average pooling operation for spatial data.
 
   Arguments:
@@ -456,13 +663,13 @@ class GlobalAveragePooling2D(_GlobalPooling2D):
 
   def call(self, inputs):
     if self.data_format == 'channels_last':
-      return K.mean(inputs, axis=[1, 2])
+      return backend.mean(inputs, axis=[1, 2])
     else:
-      return K.mean(inputs, axis=[2, 3])
+      return backend.mean(inputs, axis=[2, 3])
 
 
 @tf_export('keras.layers.GlobalMaxPool2D', 'keras.layers.GlobalMaxPooling2D')
-class GlobalMaxPooling2D(_GlobalPooling2D):
+class GlobalMaxPooling2D(GlobalPooling2D):
   """Global max pooling operation for spatial data.
 
   Arguments:
@@ -492,17 +699,17 @@ class GlobalMaxPooling2D(_GlobalPooling2D):
 
   def call(self, inputs):
     if self.data_format == 'channels_last':
-      return K.max(inputs, axis=[1, 2])
+      return backend.max(inputs, axis=[1, 2])
     else:
-      return K.max(inputs, axis=[2, 3])
+      return backend.max(inputs, axis=[2, 3])
 
 
-class _GlobalPooling3D(Layer):
+class GlobalPooling3D(Layer):
   """Abstract class for different global pooling 3D layers.
   """
 
   def __init__(self, data_format=None, **kwargs):
-    super(_GlobalPooling3D, self).__init__(**kwargs)
+    super(GlobalPooling3D, self).__init__(**kwargs)
     self.data_format = conv_utils.normalize_data_format(data_format)
     self.input_spec = InputSpec(ndim=5)
 
@@ -518,13 +725,13 @@ class _GlobalPooling3D(Layer):
 
   def get_config(self):
     config = {'data_format': self.data_format}
-    base_config = super(_GlobalPooling3D, self).get_config()
+    base_config = super(GlobalPooling3D, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
 
 
 @tf_export('keras.layers.GlobalAveragePooling3D',
            'keras.layers.GlobalAvgPool3D')
-class GlobalAveragePooling3D(_GlobalPooling3D):
+class GlobalAveragePooling3D(GlobalPooling3D):
   """Global Average pooling operation for 3D data.
 
   Arguments:
@@ -554,13 +761,13 @@ class GlobalAveragePooling3D(_GlobalPooling3D):
 
   def call(self, inputs):
     if self.data_format == 'channels_last':
-      return K.mean(inputs, axis=[1, 2, 3])
+      return backend.mean(inputs, axis=[1, 2, 3])
     else:
-      return K.mean(inputs, axis=[2, 3, 4])
+      return backend.mean(inputs, axis=[2, 3, 4])
 
 
 @tf_export('keras.layers.GlobalMaxPool3D', 'keras.layers.GlobalMaxPooling3D')
-class GlobalMaxPooling3D(_GlobalPooling3D):
+class GlobalMaxPooling3D(GlobalPooling3D):
   """Global Max pooling operation for 3D data.
 
   Arguments:
@@ -590,9 +797,9 @@ class GlobalMaxPooling3D(_GlobalPooling3D):
 
   def call(self, inputs):
     if self.data_format == 'channels_last':
-      return K.max(inputs, axis=[1, 2, 3])
+      return backend.max(inputs, axis=[1, 2, 3])
     else:
-      return K.max(inputs, axis=[2, 3, 4])
+      return backend.max(inputs, axis=[2, 3, 4])
 
 
 # Aliases
diff --git a/tensorflow/python/keras/_impl/keras/layers/pooling_test.py b/tensorflow/python/keras/_impl/keras/layers/pooling_test.py
index bb003c1dddf80e2a745c1268a3a7d045f4e8b036..2c08b647ea0fafb7519240b0c81e8fa77f034f7f 100644
--- a/tensorflow/python/keras/_impl/keras/layers/pooling_test.py
+++ b/tensorflow/python/keras/_impl/keras/layers/pooling_test.py
@@ -27,14 +27,14 @@ from tensorflow.python.platform import test
 
 class GlobalPoolingTest(test.TestCase):
 
-  @tf_test_util.run_in_graph_and_eager_modes(use_gpu=True)
+  @tf_test_util.run_in_graph_and_eager_modes()
   def test_globalpooling_1d(self):
     testing_utils.layer_test(keras.layers.pooling.GlobalMaxPooling1D,
                              input_shape=(3, 4, 5))
     testing_utils.layer_test(
         keras.layers.pooling.GlobalAveragePooling1D, input_shape=(3, 4, 5))
 
-  @tf_test_util.run_in_graph_and_eager_modes(use_gpu=True)
+  @tf_test_util.run_in_graph_and_eager_modes()
   def test_globalpooling_2d(self):
     testing_utils.layer_test(
         keras.layers.pooling.GlobalMaxPooling2D,
@@ -53,7 +53,7 @@ class GlobalPoolingTest(test.TestCase):
         kwargs={'data_format': 'channels_last'},
         input_shape=(3, 5, 6, 4))
 
-  @tf_test_util.run_in_graph_and_eager_modes(use_gpu=True)
+  @tf_test_util.run_in_graph_and_eager_modes()
   def test_globalpooling_3d(self):
     testing_utils.layer_test(
         keras.layers.pooling.GlobalMaxPooling3D,
@@ -75,7 +75,7 @@ class GlobalPoolingTest(test.TestCase):
 
 class Pooling2DTest(test.TestCase):
 
-  @tf_test_util.run_in_graph_and_eager_modes(use_gpu=True)
+  @tf_test_util.run_in_graph_and_eager_modes()
   def test_maxpooling_2d(self):
     pool_size = (3, 3)
     for strides in [(1, 1), (2, 2)]:
@@ -88,7 +88,7 @@ class Pooling2DTest(test.TestCase):
           },
           input_shape=(3, 5, 6, 4))
 
-  @tf_test_util.run_in_graph_and_eager_modes(use_gpu=True)
+  @tf_test_util.run_in_graph_and_eager_modes()
   def test_averagepooling_2d(self):
     testing_utils.layer_test(
         keras.layers.AveragePooling2D,
@@ -122,7 +122,7 @@ class Pooling2DTest(test.TestCase):
 
 class Pooling3DTest(test.TestCase):
 
-  @tf_test_util.run_in_graph_and_eager_modes(use_gpu=True)
+  @tf_test_util.run_in_graph_and_eager_modes()
   def test_maxpooling_3d(self):
     pool_size = (3, 3, 3)
     testing_utils.layer_test(
@@ -141,7 +141,7 @@ class Pooling3DTest(test.TestCase):
         },
         input_shape=(3, 4, 11, 12, 10))
 
-  @tf_test_util.run_in_graph_and_eager_modes(use_gpu=True)
+  @tf_test_util.run_in_graph_and_eager_modes()
   def test_averagepooling_3d(self):
     pool_size = (3, 3, 3)
     testing_utils.layer_test(
@@ -163,7 +163,7 @@ class Pooling3DTest(test.TestCase):
 
 class Pooling1DTest(test.TestCase):
 
-  @tf_test_util.run_in_graph_and_eager_modes(use_gpu=True)
+  @tf_test_util.run_in_graph_and_eager_modes()
   def test_maxpooling_1d(self):
     for padding in ['valid', 'same']:
       for stride in [1, 2]:
@@ -173,7 +173,7 @@ class Pooling1DTest(test.TestCase):
                     'padding': padding},
             input_shape=(3, 5, 4))
 
-  @tf_test_util.run_in_graph_and_eager_modes(use_gpu=True)
+  @tf_test_util.run_in_graph_and_eager_modes()
   def test_averagepooling_1d(self):
     for padding in ['valid', 'same']:
       for stride in [1, 2]:
diff --git a/tensorflow/python/keras/_impl/keras/layers/recurrent.py b/tensorflow/python/keras/_impl/keras/layers/recurrent.py
index 791f9b311300ed05591083d551c040eb25ac8e22..f53db987ff379a70d3fd43bbc3442646635e3bd6 100644
--- a/tensorflow/python/keras/_impl/keras/layers/recurrent.py
+++ b/tensorflow/python/keras/_impl/keras/layers/recurrent.py
@@ -33,6 +33,9 @@ from tensorflow.python.keras._impl.keras.engine import InputSpec
 from tensorflow.python.keras._impl.keras.engine import Layer
 from tensorflow.python.keras._impl.keras.engine.base_layer import shape_type_conversion
 from tensorflow.python.keras._impl.keras.utils.generic_utils import has_arg
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import state_ops
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util.tf_export import tf_export
 
@@ -248,7 +251,7 @@ class RNN(Layer):
           It is also possible for `cell` to be a list of RNN cell instances,
           in which cases the cells get stacked on after the other in the RNN,
           implementing an efficient stacked RNN.
-      return_sequences: Boolean. Whether to return the last output.
+      return_sequences: Boolean. Whether to return the last output
           in the output sequence, or the full sequence.
       return_state: Boolean. Whether to return the last state
           in addition to the output.
@@ -503,9 +506,12 @@ class RNN(Layer):
 
   def get_initial_state(self, inputs):
     # build an all-zero tensor of shape (samples, output_dim)
-    initial_state = K.zeros_like(inputs)  # (samples, timesteps, input_dim)
-    initial_state = K.sum(initial_state, axis=(1, 2))  # (samples,)
-    initial_state = K.expand_dims(initial_state)  # (samples, 1)
+    initial_state = array_ops.zeros_like(inputs)
+    # shape of initial_state = (samples, timesteps, input_dim)
+    initial_state = math_ops.reduce_sum(initial_state, axis=(1, 2))
+    # shape of initial_state = (samples,)
+    initial_state = array_ops.expand_dims(initial_state, axis=-1)
+    # shape of initial_state = (samples, 1)
     if hasattr(self.cell.state_size, '__len__'):
       return [K.tile(initial_state, [1, dim]) for dim in self.cell.state_size]
     else:
@@ -631,7 +637,7 @@ class RNN(Layer):
     if self.stateful:
       updates = []
       for i in range(len(states)):
-        updates.append(K.update(self.states[i], states[i]))
+        updates.append(state_ops.assign(self.states[i], states[i]))
       self.add_update(updates, inputs)
 
     if self.return_sequences:
@@ -791,10 +797,10 @@ class RNN(Layer):
 
   @property
   def losses(self):
-    losses = []
+    layer_losses = super(RNN, self).losses
     if isinstance(self.cell, Layer):
-      losses += self.cell.losses
-    return losses + self._losses
+      return self.cell.losses + layer_losses
+    return layer_losses
 
   @property
   def updates(self):
@@ -907,8 +913,7 @@ class SimpleRNNCell(Layer):
     prev_output = states[0]
     if 0 < self.dropout < 1 and self._dropout_mask is None:
       self._dropout_mask = _generate_dropout_mask(
-          _generate_dropout_ones(inputs,
-                                 K.shape(inputs)[-1]),
+          _generate_dropout_ones(inputs, array_ops.shape(inputs)[-1]),
           self.dropout,
           training=training)
     if (0 < self.recurrent_dropout < 1 and
@@ -1012,7 +1017,7 @@ class SimpleRNN(RNN):
       recurrent_dropout: Float between 0 and 1.
           Fraction of the units to drop for
           the linear transformation of the recurrent state.
-      return_sequences: Boolean. Whether to return the last output.
+      return_sequences: Boolean. Whether to return the last output
           in the output sequence, or the full sequence.
       return_state: Boolean. Whether to return the last state
           in addition to the output.
@@ -1232,6 +1237,9 @@ class GRUCell(Layer):
           batch them into fewer, larger operations. These modes will
           have different performance profiles on different hardware and
           for different applications.
+      reset_after: GRU convention (whether to apply reset gate after or
+          before matrix multiplication). False = "before" (default),
+          True = "after" (CuDNN compatible).
   """
 
   def __init__(self,
@@ -1251,6 +1259,7 @@ class GRUCell(Layer):
                dropout=0.,
                recurrent_dropout=0.,
                implementation=1,
+               reset_after=False,
                **kwargs):
     super(GRUCell, self).__init__(**kwargs)
     self.units = units
@@ -1273,6 +1282,7 @@ class GRUCell(Layer):
     self.dropout = min(1., max(0., dropout))
     self.recurrent_dropout = min(1., max(0., recurrent_dropout))
     self.implementation = implementation
+    self.reset_after = reset_after
     self.state_size = self.units
     self._dropout_mask = None
     self._recurrent_dropout_mask = None
@@ -1294,12 +1304,25 @@ class GRUCell(Layer):
         constraint=self.recurrent_constraint)
 
     if self.use_bias:
-      self.bias = self.add_weight(
-          shape=(self.units * 3,),
-          name='bias',
-          initializer=self.bias_initializer,
-          regularizer=self.bias_regularizer,
-          constraint=self.bias_constraint)
+      if not self.reset_after:
+        bias_shape = (3 * self.units,)
+      else:
+        # separate biases for input and recurrent kernels
+        # Note: the shape is intentionally different from CuDNNGRU biases
+        # `(2 * 3 * self.units,)`, so that we can distinguish the classes
+        # when loading and converting saved weights.
+        bias_shape = (2, 3 * self.units)
+      self.bias = self.add_weight(shape=bias_shape,
+                                  name='bias',
+                                  initializer=self.bias_initializer,
+                                  regularizer=self.bias_regularizer,
+                                  constraint=self.bias_constraint)
+      if not self.reset_after:
+        self.input_bias, self.recurrent_bias = self.bias, None
+      else:
+        self.input_bias = K.flatten(self.bias[0])
+        self.recurrent_bias = K.flatten(self.bias[1])
+
     else:
       self.bias = None
     self.built = True
@@ -1309,8 +1332,7 @@ class GRUCell(Layer):
 
     if 0 < self.dropout < 1 and self._dropout_mask is None:
       self._dropout_mask = _generate_dropout_mask(
-          _generate_dropout_ones(inputs,
-                                 K.shape(inputs)[-1]),
+          _generate_dropout_ones(inputs, array_ops.shape(inputs)[-1]),
           self.dropout,
           training=training,
           count=3)
@@ -1336,13 +1358,15 @@ class GRUCell(Layer):
         inputs_z = inputs
         inputs_r = inputs
         inputs_h = inputs
+
       x_z = K.dot(inputs_z, self.kernel[:, :self.units])
       x_r = K.dot(inputs_r, self.kernel[:, self.units:self.units * 2])
       x_h = K.dot(inputs_h, self.kernel[:, self.units * 2:])
+
       if self.use_bias:
-        x_z = K.bias_add(x_z, self.bias[:self.units])
-        x_r = K.bias_add(x_r, self.bias[self.units:self.units * 2])
-        x_h = K.bias_add(x_h, self.bias[self.units * 2:])
+        x_z = K.bias_add(x_z, self.input_bias[:self.units])
+        x_r = K.bias_add(x_r, self.input_bias[self.units: self.units * 2])
+        x_h = K.bias_add(x_h, self.input_bias[self.units * 2:])
 
       if 0. < self.recurrent_dropout < 1.:
         h_tm1_z = h_tm1 * rec_dp_mask[0]
@@ -1352,42 +1376,70 @@ class GRUCell(Layer):
         h_tm1_z = h_tm1
         h_tm1_r = h_tm1
         h_tm1_h = h_tm1
-      z = self.recurrent_activation(
-          x_z + K.dot(h_tm1_z, self.recurrent_kernel[:, :self.units]))
-      r = self.recurrent_activation(
-          x_r + K.dot(h_tm1_r, self.recurrent_kernel[:, self.units:
-                                                     self.units * 2]))
-
-      hh = self.activation(x_h + K.dot(r * h_tm1_h,
-                                       self.recurrent_kernel[:,
-                                                             self.units * 2:]))
+
+      recurrent_z = K.dot(h_tm1_z, self.recurrent_kernel[:, :self.units])
+      recurrent_r = K.dot(h_tm1_r,
+                          self.recurrent_kernel[:, self.units:self.units * 2])
+      if self.reset_after and self.use_bias:
+        recurrent_z = K.bias_add(recurrent_z, self.recurrent_bias[:self.units])
+        recurrent_r = K.bias_add(recurrent_r,
+                                 self.recurrent_bias[self.units:
+                                                     self.units * 2])
+
+      z = self.recurrent_activation(x_z + recurrent_z)
+      r = self.recurrent_activation(x_r + recurrent_r)
+
+      # reset gate applied after/before matrix multiplication
+      if self.reset_after:
+        recurrent_h = K.dot(h_tm1_h, self.recurrent_kernel[:, self.units * 2:])
+        if self.use_bias:
+          recurrent_h = K.bias_add(recurrent_h,
+                                   self.recurrent_bias[self.units * 2:])
+        recurrent_h = r * recurrent_h
+      else:
+        recurrent_h = K.dot(r * h_tm1_h,
+                            self.recurrent_kernel[:, self.units * 2:])
+
+      hh = self.activation(x_h + recurrent_h)
     else:
       if 0. < self.dropout < 1.:
         inputs *= dp_mask[0]
+
+      # inputs projected by all gate matrices at once
       matrix_x = K.dot(inputs, self.kernel)
       if self.use_bias:
-        matrix_x = K.bias_add(matrix_x, self.bias)
+        # biases: bias_z_i, bias_r_i, bias_h_i
+        matrix_x = K.bias_add(matrix_x, self.input_bias)
+
+      x_z = matrix_x[:, :self.units]
+      x_r = matrix_x[:, self.units: 2 * self.units]
+      x_h = matrix_x[:, 2 * self.units:]
+
       if 0. < self.recurrent_dropout < 1.:
         h_tm1 *= rec_dp_mask[0]
       matrix_inner = K.dot(h_tm1, self.recurrent_kernel[:, :2 * self.units])
 
-      x_z = matrix_x[:, :self.units]
-      x_r = matrix_x[:, self.units:2 * self.units]
       recurrent_z = matrix_inner[:, :self.units]
       recurrent_r = matrix_inner[:, self.units:2 * self.units]
 
       z = self.recurrent_activation(x_z + recurrent_z)
       r = self.recurrent_activation(x_r + recurrent_r)
 
-      x_h = matrix_x[:, 2 * self.units:]
-      recurrent_h = K.dot(r * h_tm1, self.recurrent_kernel[:, 2 * self.units:])
+      if self.reset_after:
+        recurrent_h = r * matrix_inner[:, 2 * self.units:]
+      else:
+        recurrent_h = K.dot(r * h_tm1,
+                            self.recurrent_kernel[:, 2 * self.units:])
+
       hh = self.activation(x_h + recurrent_h)
+    # previous and candidate state mixed by update gate
     h = z * h_tm1 + (1 - z) * hh
     if 0 < self.dropout + self.recurrent_dropout:
       if training is None and not context.executing_eagerly():
         # This would be harmless to set in eager mode, but eager tensors
         # disallow setting arbitrary attributes.
         h._uses_learning_phase = True
+
     return h, [h]
 
   def get_config(self):
@@ -1411,7 +1463,8 @@ class GRUCell(Layer):
         'bias_constraint': constraints.serialize(self.bias_constraint),
         'dropout': self.dropout,
         'recurrent_dropout': self.recurrent_dropout,
-        'implementation': self.implementation
+        'implementation': self.implementation,
+        'reset_after': self.reset_after
     }
     base_config = super(GRUCell, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
@@ -1419,9 +1472,16 @@ class GRUCell(Layer):
 
 @tf_export('keras.layers.GRU')
 class GRU(RNN):
-  """Gated Recurrent Unit - Cho et al.
+  """Gated Recurrent Unit - Cho et al. 2014.
 
-  2014.
+  There are two variants. The default one is based on 1406.1078v3 and
+  has reset gate applied to hidden state before matrix multiplication. The
+  other one is based on original 1406.1078v1 and has the order reversed.
+
+  The second variant is compatible with CuDNNGRU (GPU-only) and allows
+  inference on CPU. Thus it has separate biases for `kernel` and
+  `recurrent_kernel`. Use `'reset_after'=True` and
+  `recurrent_activation='sigmoid'`.
 
   Arguments:
       units: Positive integer, dimensionality of the output space.
@@ -1465,7 +1525,7 @@ class GRU(RNN):
           batch them into fewer, larger operations. These modes will
           have different performance profiles on different hardware and
           for different applications.
-      return_sequences: Boolean. Whether to return the last output.
+      return_sequences: Boolean. Whether to return the last output
           in the output sequence, or the full sequence.
       return_state: Boolean. Whether to return the last state
           in addition to the output.
@@ -1481,6 +1541,9 @@ class GRU(RNN):
           Unrolling can speed-up a RNN,
           although it tends to be more memory-intensive.
           Unrolling is only suitable for short sequences.
+      reset_after: GRU convention (whether to apply reset gate after or
+          before matrix multiplication). False = "before" (default),
+          True = "after" (CuDNN compatible).
 
   """
 
@@ -1507,6 +1570,7 @@ class GRU(RNN):
                go_backwards=False,
                stateful=False,
                unroll=False,
+               reset_after=False,
                **kwargs):
     if implementation == 0:
       logging.warning('`implementation=0` has been deprecated, '
@@ -1528,7 +1592,8 @@ class GRU(RNN):
         bias_constraint=bias_constraint,
         dropout=dropout,
         recurrent_dropout=recurrent_dropout,
-        implementation=implementation)
+        implementation=implementation,
+        reset_after=reset_after)
     super(GRU, self).__init__(
         cell,
         return_sequences=return_sequences,
@@ -1609,6 +1674,10 @@ class GRU(RNN):
   def implementation(self):
     return self.cell.implementation
 
+  @property
+  def reset_after(self):
+    return self.cell.reset_after
+
   def get_config(self):
     config = {
         'units':
@@ -1644,7 +1713,9 @@ class GRU(RNN):
         'recurrent_dropout':
             self.recurrent_dropout,
         'implementation':
-            self.implementation
+            self.implementation,
+        'reset_after':
+            self.reset_after
     }
     base_config = super(GRU, self).get_config()
     del base_config['cell']
@@ -1793,8 +1864,7 @@ class LSTMCell(Layer):
   def call(self, inputs, states, training=None):
     if 0 < self.dropout < 1 and self._dropout_mask is None:
       self._dropout_mask = _generate_dropout_mask(
-          _generate_dropout_ones(inputs,
-                                 K.shape(inputs)[-1]),
+          _generate_dropout_ones(inputs, array_ops.shape(inputs)[-1]),
           self.dropout,
           training=training,
           count=4)
@@ -1926,7 +1996,7 @@ class LSTMCell(Layer):
 
 @tf_export('keras.layers.LSTM')
 class LSTM(RNN):
-  """Long-Short Term Memory layer - Hochreiter 1997.
+  """Long Short-Term Memory layer - Hochreiter 1997.
 
   Arguments:
       units: Positive integer, dimensionality of the output space.
@@ -2176,7 +2246,7 @@ class LSTM(RNN):
 
 
 def _generate_dropout_ones(inputs, dims):
-  return K.ones((K.shape(inputs)[0], dims))
+  return K.ones((array_ops.shape(inputs)[0], dims))
 
 
 def _generate_dropout_mask(ones, rate, training=None, count=1):
@@ -2351,9 +2421,12 @@ class Recurrent(Layer):
 
   def get_initial_state(self, inputs):
     # build an all-zero tensor of shape (samples, output_dim)
-    initial_state = K.zeros_like(inputs)  # (samples, timesteps, input_dim)
-    initial_state = K.sum(initial_state, axis=(1, 2))  # (samples,)
-    initial_state = K.expand_dims(initial_state)  # (samples, 1)
+    initial_state = array_ops.zeros_like(inputs)
+    # shape of initial_state = (samples, timesteps, input_dim)
+    initial_state = math_ops.reduce_sum(initial_state, axis=(1, 2))
+    # shape of initial_state = (samples,)
+    initial_state = array_ops.expand_dims(initial_state, axis=-1)
+    # shape of initial_state = (samples, 1)
     initial_state = K.tile(initial_state, [1,
                                            self.units])  # (samples, output_dim)
     initial_state = [initial_state for _ in range(len(self.states))]
@@ -2456,7 +2529,7 @@ class Recurrent(Layer):
     if self.stateful:
       updates = []
       for i in range(len(states)):
-        updates.append(K.update(self.states[i], states[i]))
+        updates.append(state_ops.assign(self.states[i], states[i]))
       self.add_update(updates, inputs)
 
     # Properly set learning phase
diff --git a/tensorflow/python/keras/_impl/keras/layers/recurrent_test.py b/tensorflow/python/keras/_impl/keras/layers/recurrent_test.py
index de022153f6f07240a0dff70e5faeed5b6d4a8c5f..4c68c18825a47d87806a7a09d4054f974d569e00 100644
--- a/tensorflow/python/keras/_impl/keras/layers/recurrent_test.py
+++ b/tensorflow/python/keras/_impl/keras/layers/recurrent_test.py
@@ -24,6 +24,9 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python.keras._impl import keras
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import state_ops
 from tensorflow.python.platform import test
 
 
@@ -229,6 +232,7 @@ class RNNTest(test.TestCase):
       cell = RNNCellWithConstants(32)
       layer = keras.layers.RNN(cell)
       y = layer(x, constants=c)
+
       model = keras.models.Model([x, c], y)
       model.compile(optimizer='rmsprop', loss='mse')
       model.train_on_batch(
@@ -276,6 +280,20 @@ class RNNTest(test.TestCase):
           np.zeros((6, 32))
       )
 
+    with self.test_session():
+      # Test GRUCell reset_after property.
+      x = keras.Input((None, 5))
+      c = keras.Input((3,))
+      cells = [keras.layers.recurrent.GRUCell(32, reset_after=True)]
+      layer = keras.layers.recurrent.RNN(cells)
+      y = layer(x, constants=c)
+      model = keras.models.Model([x, c], y)
+      model.compile(optimizer='rmsprop', loss='mse')
+      model.train_on_batch(
+          [np.zeros((6, 5, 5)), np.zeros((6, 3))],
+          np.zeros((6, 32))
+      )
+
     with self.test_session():
       # Test stacked RNN serialization
       x_np = np.random.random((6, 5, 5))
@@ -395,8 +413,8 @@ class RNNTest(test.TestCase):
 
     # Test `get_losses_for` and `losses`
     x = keras.Input((None, 1))
-    loss_1 = keras.backend.sum(x)
-    loss_2 = keras.backend.sum(cells[0].kernel)
+    loss_1 = math_ops.reduce_sum(x)
+    loss_2 = math_ops.reduce_sum(cells[0].kernel)
     cells[0].add_loss(loss_1, inputs=x)
     cells[0].add_loss(loss_2)
     self.assertEqual(len(layer.losses), 2)
@@ -410,15 +428,15 @@ class RNNTest(test.TestCase):
     layer.build((None, None, 1))
 
     x = keras.Input((None, 1))
-    update_1 = keras.backend.update_add(
-        cells[0].kernel, x[0, 0, 0] * cells[0].kernel)
-    update_2 = keras.backend.update_add(
-        cells[0].kernel, keras.backend.ones_like(cells[0].kernel))
+    update_1 = state_ops.assign_add(cells[0].kernel,
+                                    x[0, 0, 0] * cells[0].kernel)
+    update_2 = state_ops.assign_add(cells[0].kernel,
+                                    array_ops.ones_like(cells[0].kernel))
     cells[0].add_update(update_1, inputs=x)
     cells[0].add_update(update_2)
     self.assertEqual(len(layer.updates), 2)
-    self.assertEqual(layer.get_updates_for(None), [update_2])
-    self.assertEqual(layer.get_updates_for(x), [update_1])
+    self.assertEqual(len(layer.get_updates_for(None)), 1)
+    self.assertEqual(len(layer.get_updates_for(x)), 1)
 
   def test_rnn_dynamic_trainability(self):
     layer_class = keras.layers.SimpleRNN
@@ -538,6 +556,5 @@ class RNNTest(test.TestCase):
         [tuple(o.as_list()) for o in output_shape],
         expected_output_shape)
 
-
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/_impl/keras/layers/wrappers.py b/tensorflow/python/keras/_impl/keras/layers/wrappers.py
index 76ddd9299dd669da35d89a6fe8fc521ce4c26337..9aee5f03b6d79f0b363f79d2b7a18c0b20a2883f 100644
--- a/tensorflow/python/keras/_impl/keras/layers/wrappers.py
+++ b/tensorflow/python/keras/_impl/keras/layers/wrappers.py
@@ -23,11 +23,12 @@ import copy
 
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.keras._impl.keras import backend as K
+from tensorflow.python.keras._impl.keras.engine import base_layer
 from tensorflow.python.keras._impl.keras.engine import InputSpec
 from tensorflow.python.keras._impl.keras.engine import Layer
 from tensorflow.python.keras._impl.keras.engine.base_layer import shape_type_conversion
 from tensorflow.python.keras._impl.keras.utils.generic_utils import has_arg
-from tensorflow.python.layers import utils as tf_layers_util
+from tensorflow.python.ops import array_ops
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -209,11 +210,11 @@ class TimeDistributed(Wrapper):
       # We can go with reshape-based implementation for performance.
       input_length = input_shape[1]
       if not input_length:
-        input_length = K.shape(inputs)[1]
+        input_length = array_ops.shape(inputs)[1]
       # Shape: (num_samples * timesteps, ...). And track the
       # transformation in self._input_map.
-      input_uid = tf_layers_util.object_list_uid(inputs)
-      inputs = K.reshape(inputs, (-1,) + input_shape[2:])
+      input_uid = base_layer.object_list_uid(inputs)
+      inputs = array_ops.reshape(inputs, (-1,) + input_shape[2:])
       self._input_map[input_uid] = inputs
       # (num_samples * timesteps, ...)
       y = self.layer.call(inputs, **kwargs)
@@ -221,7 +222,7 @@ class TimeDistributed(Wrapper):
         uses_learning_phase = y._uses_learning_phase
       # Shape: (num_samples, timesteps, ...)
       output_shape = self.compute_output_shape(input_shape).as_list()
-      y = K.reshape(y, (-1, input_length) + tuple(output_shape[2:]))
+      y = array_ops.reshape(y, (-1, input_length) + tuple(output_shape[2:]))
 
     # Apply activity regularizer if any:
     if (hasattr(self.layer, 'activity_regularizer') and
diff --git a/tensorflow/python/keras/_impl/keras/losses.py b/tensorflow/python/keras/_impl/keras/losses.py
index 1576ed7b999f65992f46b357c8ebeda8935c68d0..1d634d38013164659f7360fce45704c19083f475 100644
--- a/tensorflow/python/keras/_impl/keras/losses.py
+++ b/tensorflow/python/keras/_impl/keras/losses.py
@@ -24,51 +24,55 @@ import six
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras.utils.generic_utils import deserialize_keras_object
 from tensorflow.python.keras._impl.keras.utils.generic_utils import serialize_keras_object
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn
 from tensorflow.python.util.tf_export import tf_export
 
 
 @tf_export('keras.metrics.mean_squared_error',
            'keras.losses.mean_squared_error')
 def mean_squared_error(y_true, y_pred):
-  return K.mean(K.square(y_pred - y_true), axis=-1)
+  return K.mean(math_ops.square(y_pred - y_true), axis=-1)
 
 
 @tf_export('keras.metrics.mean_absolute_error',
            'keras.losses.mean_absolute_error')
 def mean_absolute_error(y_true, y_pred):
-  return K.mean(K.abs(y_pred - y_true), axis=-1)
+  return K.mean(math_ops.abs(y_pred - y_true), axis=-1)
 
 
 @tf_export('keras.metrics.mean_absolute_percentage_error',
            'keras.losses.mean_absolute_percentage_error')
 def mean_absolute_percentage_error(y_true, y_pred):
-  diff = K.abs((y_true - y_pred) / K.clip(K.abs(y_true), K.epsilon(), None))
+  diff = math_ops.abs(
+      (y_true - y_pred) / K.clip(math_ops.abs(y_true), K.epsilon(), None))
   return 100. * K.mean(diff, axis=-1)
 
 
 @tf_export('keras.metrics.mean_squared_logarithmic_error',
            'keras.losses.mean_squared_logarithmic_error')
 def mean_squared_logarithmic_error(y_true, y_pred):
-  first_log = K.log(K.clip(y_pred, K.epsilon(), None) + 1.)
-  second_log = K.log(K.clip(y_true, K.epsilon(), None) + 1.)
-  return K.mean(K.square(first_log - second_log), axis=-1)
+  first_log = math_ops.log(K.clip(y_pred, K.epsilon(), None) + 1.)
+  second_log = math_ops.log(K.clip(y_true, K.epsilon(), None) + 1.)
+  return K.mean(math_ops.square(first_log - second_log), axis=-1)
 
 
 @tf_export('keras.metrics.squared_hinge', 'keras.losses.squared_hinge')
 def squared_hinge(y_true, y_pred):
-  return K.mean(K.square(K.maximum(1. - y_true * y_pred, 0.)), axis=-1)
+  return K.mean(
+      math_ops.square(math_ops.maximum(1. - y_true * y_pred, 0.)), axis=-1)
 
 
 @tf_export('keras.metrics.hinge', 'keras.losses.hinge')
 def hinge(y_true, y_pred):
-  return K.mean(K.maximum(1. - y_true * y_pred, 0.), axis=-1)
+  return K.mean(math_ops.maximum(1. - y_true * y_pred, 0.), axis=-1)
 
 
 @tf_export('keras.losses.categorical_hinge')
 def categorical_hinge(y_true, y_pred):
-  pos = K.sum(y_true * y_pred, axis=-1)
-  neg = K.max((1. - y_true) * y_pred, axis=-1)
-  return K.maximum(0., neg - pos + 1.)
+  pos = math_ops.reduce_sum(y_true * y_pred, axis=-1)
+  neg = math_ops.reduce_max((1. - y_true) * y_pred, axis=-1)
+  return math_ops.maximum(0., neg - pos + 1.)
 
 
 @tf_export('keras.losses.logcosh')
@@ -89,7 +93,7 @@ def logcosh(y_true, y_pred):
   """
 
   def _logcosh(x):
-    return x + K.softplus(-2. * x) - K.log(2.)
+    return x + nn.softplus(-2. * x) - math_ops.log(2.)
 
   return K.mean(_logcosh(y_pred - y_true), axis=-1)
 
@@ -117,19 +121,19 @@ def binary_crossentropy(y_true, y_pred):
 def kullback_leibler_divergence(y_true, y_pred):
   y_true = K.clip(y_true, K.epsilon(), 1)
   y_pred = K.clip(y_pred, K.epsilon(), 1)
-  return K.sum(y_true * K.log(y_true / y_pred), axis=-1)
+  return math_ops.reduce_sum(y_true * math_ops.log(y_true / y_pred), axis=-1)
 
 
 @tf_export('keras.metrics.poisson', 'keras.losses.poisson')
 def poisson(y_true, y_pred):
-  return K.mean(y_pred - y_true * K.log(y_pred + K.epsilon()), axis=-1)
+  return K.mean(y_pred - y_true * math_ops.log(y_pred + K.epsilon()), axis=-1)
 
 
 @tf_export('keras.metrics.cosine_proximity', 'keras.losses.cosine_proximity')
 def cosine_proximity(y_true, y_pred):
-  y_true = K.l2_normalize(y_true, axis=-1)
-  y_pred = K.l2_normalize(y_pred, axis=-1)
-  return -K.sum(y_true * y_pred, axis=-1)
+  y_true = nn.l2_normalize(y_true, axis=-1)
+  y_pred = nn.l2_normalize(y_pred, axis=-1)
+  return -math_ops.reduce_sum(y_true * y_pred, axis=-1)
 
 
 # Aliases.
diff --git a/tensorflow/python/keras/_impl/keras/metrics.py b/tensorflow/python/keras/_impl/keras/metrics.py
index 82778a3dc4fbdc13bb6682d01e28ff68882b6dd9..747c3e65157ded6b0d227c6d6667b9092d0eed44 100644
--- a/tensorflow/python/keras/_impl/keras/metrics.py
+++ b/tensorflow/python/keras/_impl/keras/metrics.py
@@ -37,37 +37,45 @@ from tensorflow.python.keras._impl.keras.losses import sparse_categorical_crosse
 from tensorflow.python.keras._impl.keras.losses import squared_hinge
 from tensorflow.python.keras._impl.keras.utils.generic_utils import deserialize_keras_object
 from tensorflow.python.keras._impl.keras.utils.generic_utils import serialize_keras_object
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn
 from tensorflow.python.util.tf_export import tf_export
 
 
 @tf_export('keras.metrics.binary_accuracy')
 def binary_accuracy(y_true, y_pred):
-  return K.mean(K.equal(y_true, K.round(y_pred)), axis=-1)
+  return K.mean(math_ops.equal(y_true, math_ops.round(y_pred)), axis=-1)
 
 
 @tf_export('keras.metrics.categorical_accuracy')
 def categorical_accuracy(y_true, y_pred):
-  return K.cast(
-      K.equal(K.argmax(y_true, axis=-1), K.argmax(y_pred, axis=-1)), K.floatx())
+  return math_ops.cast(
+      math_ops.equal(
+          math_ops.argmax(y_true, axis=-1), math_ops.argmax(y_pred, axis=-1)),
+      K.floatx())
 
 
 def sparse_categorical_accuracy(y_true, y_pred):
-  return K.cast(
-      K.equal(
-          K.max(y_true, axis=-1), K.cast(K.argmax(y_pred, axis=-1),
-                                         K.floatx())), K.floatx())
+  return math_ops.cast(
+      math_ops.equal(
+          math_ops.reduce_max(y_true, axis=-1),
+          math_ops.cast(math_ops.argmax(y_pred, axis=-1), K.floatx())),
+      K.floatx())
 
 
 @tf_export('keras.metrics.top_k_categorical_accuracy')
 def top_k_categorical_accuracy(y_true, y_pred, k=5):
-  return K.mean(K.in_top_k(y_pred, K.argmax(y_true, axis=-1), k), axis=-1)
+  return K.mean(
+      nn.in_top_k(y_pred, math_ops.argmax(y_true, axis=-1), k), axis=-1)
 
 
 @tf_export('keras.metrics.sparse_top_k_categorical_accuracy')
 def sparse_top_k_categorical_accuracy(y_true, y_pred, k=5):
   return K.mean(
-      K.in_top_k(y_pred, K.cast(K.max(y_true, axis=-1), 'int32'), k), axis=-1)
-
+      nn.in_top_k(y_pred,
+                  math_ops.cast(math_ops.reduce_max(y_true, axis=-1), 'int32'),
+                  k),
+      axis=-1)
 
 # Aliases
 
diff --git a/tensorflow/python/keras/_impl/keras/metrics_test.py b/tensorflow/python/keras/_impl/keras/metrics_test.py
index 44289ea02abf5ae5f8befbe515552aea3d4b231e..9deaab0c056e4b71205422e56cc79202a8e73593 100644
--- a/tensorflow/python/keras/_impl/keras/metrics_test.py
+++ b/tensorflow/python/keras/_impl/keras/metrics_test.py
@@ -21,6 +21,8 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python.keras._impl import keras
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import state_ops
 from tensorflow.python.platform import test
 
 
@@ -104,16 +106,15 @@ class KerasMetricsTest(test.TestCase):
             The total number of true positives seen this epoch at the
                 completion of the batch.
         """
-        y_true = keras.backend.cast(y_true, 'int32')
-        y_pred = keras.backend.cast(keras.backend.round(y_pred), 'int32')
-        correct_preds = keras.backend.cast(
-            keras.backend.equal(y_pred, y_true), 'int32')
-        true_pos = keras.backend.cast(
-            keras.backend.sum(correct_preds * y_true), 'int32')
+        y_true = math_ops.cast(y_true, 'int32')
+        y_pred = math_ops.cast(math_ops.round(y_pred), 'int32')
+        correct_preds = math_ops.cast(math_ops.equal(y_pred, y_true), 'int32')
+        true_pos = math_ops.cast(
+            math_ops.reduce_sum(correct_preds * y_true), 'int32')
         current_true_pos = self.true_positives * 1
-        self.add_update(keras.backend.update_add(self.true_positives,
-                                                 true_pos),
-                        inputs=[y_true, y_pred])
+        self.add_update(
+            state_ops.assign_add(self.true_positives, true_pos),
+            inputs=[y_true, y_pred])
         return current_true_pos + true_pos
 
     metric_fn = BinaryTruePositives()
diff --git a/tensorflow/python/keras/_impl/keras/model_subclassing_test.py b/tensorflow/python/keras/_impl/keras/model_subclassing_test.py
index 58b144365be6cd8ea5b2ea82e275eacdee6b6c84..bc8698f235aac0f5fb0c3303cc4c70aa1efa08bc 100644
--- a/tensorflow/python/keras/_impl/keras/model_subclassing_test.py
+++ b/tensorflow/python/keras/_impl/keras/model_subclassing_test.py
@@ -22,7 +22,9 @@ import os
 import tempfile
 
 import numpy as np
+import six
 
+from tensorflow.python.eager import context
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import test_util
 from tensorflow.python.keras._impl import keras
@@ -36,6 +38,7 @@ except ImportError:
   h5py = None
 
 
+# pylint: disable=not-callable
 class SimpleTestModel(keras.Model):
 
   def __init__(self, use_bn=False, use_dp=False, num_classes=10):
@@ -104,7 +107,7 @@ class NestedTestModel1(keras.Model):
   def call(self, inputs):
     x = self.dense1(inputs)
     x = self.bn(x)
-    x = self.test_net(x)  # pylint: disable=not-callable
+    x = self.test_net(x)
     return self.dense2(x)
 
 
@@ -161,7 +164,7 @@ def get_nested_model_3(input_dim, num_classes):
       return tensor_shape.TensorShape((input_shape[0], 5))
 
   test_model = Inner()
-  x = test_model(x)  # pylint: disable=not-callable
+  x = test_model(x)
   outputs = keras.layers.Dense(num_classes)(x)
   return keras.Model(inputs, outputs, name='nested_model_3')
 
@@ -574,5 +577,123 @@ class ModelSubclassingTest(test.TestCase):
     self.assertGreater(loss, 0.1)
 
 
+class CustomCallModel(keras.Model):
+
+  def __init__(self):
+    super(CustomCallModel, self).__init__()
+    self.dense1 = keras.layers.Dense(1, activation='relu')
+    self.dense2 = keras.layers.Dense(1, activation='softmax')
+
+  def call(self, first, second, fiddle_with_output='no', training=True):
+    combined = self.dense1(first) + self.dense2(second)
+    if fiddle_with_output == 'yes':
+      return 10. * combined
+    else:
+      return combined
+
+
+class CustomCallSignatureTests(test.TestCase):
+
+  @test_util.run_in_graph_and_eager_modes()
+  def test_no_inputs_in_signature(self):
+    model = CustomCallModel()
+    first = array_ops.ones([2, 3])
+    second = array_ops.ones([2, 5])
+    output = model(first, second)
+    self.evaluate([v.initializer for v in model.variables])
+    expected_output = self.evaluate(model.dense1(first) + model.dense2(second))
+    self.assertAllClose(expected_output, self.evaluate(output))
+    output = model(first, second, fiddle_with_output='yes')
+    self.assertAllClose(10. * expected_output, self.evaluate(output))
+    output = model(first, second=second, training=False)
+    self.assertAllClose(expected_output, self.evaluate(output))
+
+  @test_util.run_in_graph_and_eager_modes()
+  def test_inputs_in_signature(self):
+
+    class HasInputsAndOtherPositional(keras.Model):
+
+      def call(self, inputs, some_other_arg, training=False):
+        return inputs
+
+      def compute_output_shape(self, input_shape):
+        return input_shape
+
+    model = HasInputsAndOtherPositional()
+    with self.assertRaisesRegexp(
+        TypeError, 'everything else as a keyword argument'):
+      x1, x2 = keras.Input((1, 1)), keras.Input((1, 1))
+      model(x1, x2)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def test_kwargs_in_signature(self):
+
+    class HasKwargs(keras.Model):
+
+      def call(self, x, y=3, **key_words):
+        return x
+
+    model = HasKwargs()
+    arg = array_ops.ones([])
+    model(arg, a=3)
+    if not context.executing_eagerly():
+      six.assertCountEqual(self, [arg], model.inputs)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def test_args_in_signature(self):
+
+    class HasArgs(keras.Model):
+
+      def call(self, x, *args, **kwargs):
+        return [x] + list(args)
+
+      def compute_output_shape(self, input_shape):
+        return input_shape
+
+    model = HasArgs()
+    x1, x2, x3 = keras.Input((1, 1)), keras.Input((1, 1)), keras.Input((1, 1))
+    model(x1, x2, x3, a=3)
+    if not context.executing_eagerly():
+      six.assertCountEqual(self, [x1, x2, x3], model.inputs)
+
+  def test_args_and_keywords_in_signature(self):
+
+    class HasArgs(keras.Model):
+
+      def call(self, x, training=True, *args, **kwargs):
+        return x
+
+    with context.graph_mode():
+      model = HasArgs()
+      x1, x2, x3 = keras.Input((1, 1)), keras.Input((1, 1)), keras.Input((1, 1))
+      with self.assertRaisesRegexp(TypeError, 'args and arguments with'):
+        model(x1, x2, x3, a=3)
+
+  def test_training_no_default(self):
+
+    class TrainingNoDefault(keras.Model):
+
+      def call(self, x, training):
+        return x
+
+    with context.graph_mode():
+      model = TrainingNoDefault()
+      arg = array_ops.ones([])
+      model(arg, True)
+      six.assertCountEqual(self, [arg], model.inputs)
+
+  def test_training_no_default_with_positional(self):
+
+    class TrainingNoDefaultWithPositional(keras.Model):
+
+      def call(self, x, training, positional):
+        return x
+
+    with context.graph_mode():
+      model = TrainingNoDefaultWithPositional()
+      x1, x2, x3 = keras.Input((1, 1)), keras.Input((1, 1)), keras.Input((1, 1))
+      with self.assertRaisesRegexp(TypeError, 'after a non-input'):
+        model(x1, x2, x3)
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/_impl/keras/optimizers.py b/tensorflow/python/keras/_impl/keras/optimizers.py
index b715d722b98b9db3bdf0985da0954356a2facdfe..9f383deb725ac69bf2f17f3627010c4e1f567ef0 100644
--- a/tensorflow/python/keras/_impl/keras/optimizers.py
+++ b/tensorflow/python/keras/_impl/keras/optimizers.py
@@ -31,7 +31,10 @@ from tensorflow.python.keras._impl.keras.utils.generic_utils import deserialize_
 from tensorflow.python.keras._impl.keras.utils.generic_utils import serialize_keras_object
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import state_ops
+from tensorflow.python.training import distribute as distribute_lib
 from tensorflow.python.training import optimizer as tf_optimizer_module
+from tensorflow.python.training import training_util
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -116,7 +119,8 @@ class Optimizer(object):
                        'Common ops without gradient: '
                        'K.argmax, K.round, K.eval.')
     if hasattr(self, 'clipnorm') and self.clipnorm > 0:
-      norm = K.sqrt(sum([K.sum(K.square(g)) for g in grads]))
+      norm = K.sqrt(
+          sum([math_ops.reduce_sum(math_ops.square(g)) for g in grads]))
       grads = [clip_norm(g, self.clipnorm, norm) for g in grads]
     if hasattr(self, 'clipvalue') and self.clipvalue > 0:
       grads = [K.clip(g, -self.clipvalue, self.clipvalue) for g in grads]
@@ -202,20 +206,20 @@ class SGD(Optimizer):
 
   def get_updates(self, loss, params):
     grads = self.get_gradients(loss, params)
-    self.updates = [K.update_add(self.iterations, 1)]
+    self.updates = [state_ops.assign_add(self.iterations, 1)]
 
     lr = self.lr
     if self.initial_decay > 0:
-      lr = lr * (1. /  # pylint: disable=g-no-augmented-assignment
-                 (1. + self.decay * K.cast(self.iterations,
-                                           K.dtype(self.decay))))
+      lr = lr * (  # pylint: disable=g-no-augmented-assignment
+          1. / (1. + self.decay * math_ops.cast(self.iterations,
+                                                K.dtype(self.decay))))
     # momentum
     shapes = [K.int_shape(p) for p in params]
     moments = [K.zeros(shape) for shape in shapes]
     self.weights = [self.iterations] + moments
     for p, g, m in zip(params, grads, moments):
       v = self.momentum * m - lr * g  # velocity
-      self.updates.append(K.update(m, v))
+      self.updates.append(state_ops.assign(m, v))
 
       if self.nesterov:
         new_p = p + self.momentum * v - lr * g
@@ -226,7 +230,7 @@ class SGD(Optimizer):
       if getattr(p, 'constraint', None) is not None:
         new_p = p.constraint(new_p)
 
-      self.updates.append(K.update(p, new_p))
+      self.updates.append(state_ops.assign(p, new_p))
     return self.updates
 
   def get_config(self):
@@ -275,25 +279,25 @@ class RMSprop(Optimizer):
     grads = self.get_gradients(loss, params)
     accumulators = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
     self.weights = accumulators
-    self.updates = [K.update_add(self.iterations, 1)]
+    self.updates = [state_ops.assign_add(self.iterations, 1)]
 
     lr = self.lr
     if self.initial_decay > 0:
-      lr = lr * (1. /  # pylint: disable=g-no-augmented-assignment
-                 (1. + self.decay * K.cast(self.iterations,
-                                           K.dtype(self.decay))))
+      lr = lr * (  # pylint: disable=g-no-augmented-assignment
+          1. / (1. + self.decay * math_ops.cast(self.iterations,
+                                                K.dtype(self.decay))))
 
     for p, g, a in zip(params, grads, accumulators):
       # update accumulator
-      new_a = self.rho * a + (1. - self.rho) * K.square(g)
-      self.updates.append(K.update(a, new_a))
+      new_a = self.rho * a + (1. - self.rho) * math_ops.square(g)
+      self.updates.append(state_ops.assign(a, new_a))
       new_p = p - lr * g / (K.sqrt(new_a) + self.epsilon)
 
       # Apply constraints.
       if getattr(p, 'constraint', None) is not None:
         new_p = p.constraint(new_p)
 
-      self.updates.append(K.update(p, new_p))
+      self.updates.append(state_ops.assign(p, new_p))
     return self.updates
 
   def get_config(self):
@@ -337,24 +341,24 @@ class Adagrad(Optimizer):
     shapes = [K.int_shape(p) for p in params]
     accumulators = [K.zeros(shape) for shape in shapes]
     self.weights = accumulators
-    self.updates = [K.update_add(self.iterations, 1)]
+    self.updates = [state_ops.assign_add(self.iterations, 1)]
 
     lr = self.lr
     if self.initial_decay > 0:
-      lr = lr * (1. /  # pylint: disable=g-no-augmented-assignment
-                 (1. + self.decay * K.cast(self.iterations,
-                                           K.dtype(self.decay))))
+      lr = lr * (  # pylint: disable=g-no-augmented-assignment
+          1. / (1. + self.decay * math_ops.cast(self.iterations,
+                                                K.dtype(self.decay))))
 
     for p, g, a in zip(params, grads, accumulators):
-      new_a = a + K.square(g)  # update accumulator
-      self.updates.append(K.update(a, new_a))
+      new_a = a + math_ops.square(g)  # update accumulator
+      self.updates.append(state_ops.assign(a, new_a))
       new_p = p - lr * g / (K.sqrt(new_a) + self.epsilon)
 
       # Apply constraints.
       if getattr(p, 'constraint', None) is not None:
         new_p = p.constraint(new_p)
 
-      self.updates.append(K.update(p, new_p))
+      self.updates.append(state_ops.assign(p, new_p))
     return self.updates
 
   def get_config(self):
@@ -401,18 +405,18 @@ class Adadelta(Optimizer):
     accumulators = [K.zeros(shape) for shape in shapes]
     delta_accumulators = [K.zeros(shape) for shape in shapes]
     self.weights = accumulators + delta_accumulators
-    self.updates = [K.update_add(self.iterations, 1)]
+    self.updates = [state_ops.assign_add(self.iterations, 1)]
 
     lr = self.lr
     if self.initial_decay > 0:
-      lr = lr * (1. /  # pylint: disable=g-no-augmented-assignment
-                 (1. + self.decay * K.cast(self.iterations,
-                                           K.dtype(self.decay))))
+      lr = lr * (  # pylint: disable=g-no-augmented-assignment
+          1. / (1. + self.decay * math_ops.cast(self.iterations,
+                                                K.dtype(self.decay))))
 
     for p, g, a, d_a in zip(params, grads, accumulators, delta_accumulators):
       # update accumulator
-      new_a = self.rho * a + (1. - self.rho) * K.square(g)
-      self.updates.append(K.update(a, new_a))
+      new_a = self.rho * a + (1. - self.rho) * math_ops.square(g)
+      self.updates.append(state_ops.assign(a, new_a))
 
       # use the new accumulator and the *old* delta_accumulator
       update = g * K.sqrt(d_a + self.epsilon) / K.sqrt(new_a + self.epsilon)
@@ -422,11 +426,11 @@ class Adadelta(Optimizer):
       if getattr(p, 'constraint', None) is not None:
         new_p = p.constraint(new_p)
 
-      self.updates.append(K.update(p, new_p))
+      self.updates.append(state_ops.assign(p, new_p))
 
       # update delta_accumulator
-      new_d_a = self.rho * d_a + (1 - self.rho) * K.square(update)
-      self.updates.append(K.update(d_a, new_d_a))
+      new_d_a = self.rho * d_a + (1 - self.rho) * math_ops.square(update)
+      self.updates.append(state_ops.assign(d_a, new_d_a))
     return self.updates
 
   def get_config(self):
@@ -481,17 +485,18 @@ class Adam(Optimizer):
 
   def get_updates(self, loss, params):
     grads = self.get_gradients(loss, params)
-    self.updates = [K.update_add(self.iterations, 1)]
+    self.updates = [state_ops.assign_add(self.iterations, 1)]
 
     lr = self.lr
     if self.initial_decay > 0:
-      lr = lr * (1. /  # pylint: disable=g-no-augmented-assignment
-                 (1. + self.decay * K.cast(self.iterations,
-                                           K.dtype(self.decay))))
+      lr = lr * (  # pylint: disable=g-no-augmented-assignment
+          1. / (1. + self.decay * math_ops.cast(self.iterations,
+                                                K.dtype(self.decay))))
 
-    t = K.cast(self.iterations, K.floatx()) + 1
+    t = math_ops.cast(self.iterations, K.floatx()) + 1
     lr_t = lr * (
-        K.sqrt(1. - K.pow(self.beta_2, t)) / (1. - K.pow(self.beta_1, t)))
+        K.sqrt(1. - math_ops.pow(self.beta_2, t)) /
+        (1. - math_ops.pow(self.beta_1, t)))
 
     ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
     vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
@@ -503,23 +508,23 @@ class Adam(Optimizer):
 
     for p, g, m, v, vhat in zip(params, grads, ms, vs, vhats):
       m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
-      v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)
+      v_t = (self.beta_2 * v) + (1. - self.beta_2) * math_ops.square(g)
       if self.amsgrad:
-        vhat_t = K.maximum(vhat, v_t)
+        vhat_t = math_ops.maximum(vhat, v_t)
         p_t = p - lr_t * m_t / (K.sqrt(vhat_t) + self.epsilon)
-        self.updates.append(K.update(vhat, vhat_t))
+        self.updates.append(state_ops.assign(vhat, vhat_t))
       else:
         p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon)
 
-      self.updates.append(K.update(m, m_t))
-      self.updates.append(K.update(v, v_t))
+      self.updates.append(state_ops.assign(m, m_t))
+      self.updates.append(state_ops.assign(v, v_t))
       new_p = p_t
 
       # Apply constraints.
       if getattr(p, 'constraint', None) is not None:
         new_p = p.constraint(new_p)
 
-      self.updates.append(K.update(p, new_p))
+      self.updates.append(state_ops.assign(p, new_p))
     return self.updates
 
   def get_config(self):
@@ -571,16 +576,16 @@ class Adamax(Optimizer):
 
   def get_updates(self, loss, params):
     grads = self.get_gradients(loss, params)
-    self.updates = [K.update_add(self.iterations, 1)]
+    self.updates = [state_ops.assign_add(self.iterations, 1)]
 
     lr = self.lr
     if self.initial_decay > 0:
-      lr = lr * (1. /  # pylint: disable=g-no-augmented-assignment
-                 (1. + self.decay * K.cast(self.iterations,
-                                           K.dtype(self.decay))))
+      lr = lr * (  # pylint: disable=g-no-augmented-assignment
+          1. / (1. + self.decay * math_ops.cast(self.iterations,
+                                                K.dtype(self.decay))))
 
-    t = K.cast(self.iterations, K.floatx()) + 1
-    lr_t = lr / (1. - K.pow(self.beta_1, t))
+    t = math_ops.cast(self.iterations, K.floatx()) + 1
+    lr_t = lr / (1. - math_ops.pow(self.beta_1, t))
 
     shapes = [K.int_shape(p) for p in params]
     # zero init of 1st moment
@@ -592,18 +597,18 @@ class Adamax(Optimizer):
     for p, g, m, u in zip(params, grads, ms, us):
 
       m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
-      u_t = K.maximum(self.beta_2 * u, K.abs(g))
+      u_t = math_ops.maximum(self.beta_2 * u, math_ops.abs(g))
       p_t = p - lr_t * m_t / (u_t + self.epsilon)
 
-      self.updates.append(K.update(m, m_t))
-      self.updates.append(K.update(u, u_t))
+      self.updates.append(state_ops.assign(m, m_t))
+      self.updates.append(state_ops.assign(u, u_t))
       new_p = p_t
 
       # Apply constraints.
       if getattr(p, 'constraint', None) is not None:
         new_p = p.constraint(new_p)
 
-      self.updates.append(K.update(p, new_p))
+      self.updates.append(state_ops.assign(p, new_p))
     return self.updates
 
   def get_config(self):
@@ -657,16 +662,17 @@ class Nadam(Optimizer):
 
   def get_updates(self, loss, params):
     grads = self.get_gradients(loss, params)
-    self.updates = [K.update_add(self.iterations, 1)]
+    self.updates = [state_ops.assign_add(self.iterations, 1)]
 
-    t = K.cast(self.iterations, K.floatx()) + 1
+    t = math_ops.cast(self.iterations, K.floatx()) + 1
 
     # Due to the recommendations in [2], i.e. warming momentum schedule
     momentum_cache_t = self.beta_1 * (
-        1. - 0.5 * (K.pow(K.cast_to_floatx(0.96), t * self.schedule_decay)))
+        1. - 0.5 *
+        (math_ops.pow(K.cast_to_floatx(0.96), t * self.schedule_decay)))
     momentum_cache_t_1 = self.beta_1 * (
         1. - 0.5 *
-        (K.pow(K.cast_to_floatx(0.96), (t + 1) * self.schedule_decay)))
+        (math_ops.pow(K.cast_to_floatx(0.96), (t + 1) * self.schedule_decay)))
     m_schedule_new = self.m_schedule * momentum_cache_t
     m_schedule_next = self.m_schedule * momentum_cache_t * momentum_cache_t_1
     self.updates.append((self.m_schedule, m_schedule_new))
@@ -682,13 +688,13 @@ class Nadam(Optimizer):
       g_prime = g / (1. - m_schedule_new)
       m_t = self.beta_1 * m + (1. - self.beta_1) * g
       m_t_prime = m_t / (1. - m_schedule_next)
-      v_t = self.beta_2 * v + (1. - self.beta_2) * K.square(g)
-      v_t_prime = v_t / (1. - K.pow(self.beta_2, t))
+      v_t = self.beta_2 * v + (1. - self.beta_2) * math_ops.square(g)
+      v_t_prime = v_t / (1. - math_ops.pow(self.beta_2, t))
       m_t_bar = (
           1. - momentum_cache_t) * g_prime + momentum_cache_t_1 * m_t_prime
 
-      self.updates.append(K.update(m, m_t))
-      self.updates.append(K.update(v, v_t))
+      self.updates.append(state_ops.assign(m, m_t))
+      self.updates.append(state_ops.assign(v, v_t))
 
       p_t = p - self.lr * m_t_bar / (K.sqrt(v_t_prime) + self.epsilon)
       new_p = p_t
@@ -697,7 +703,7 @@ class Nadam(Optimizer):
       if getattr(p, 'constraint', None) is not None:
         new_p = p.constraint(new_p)
 
-      self.updates.append(K.update(p, new_p))
+      self.updates.append(state_ops.assign(p, new_p))
     return self.updates
 
   def get_config(self):
@@ -728,12 +734,27 @@ class TFOptimizer(Optimizer):
     return self.optimizer.compute_gradients(loss, params)
 
   def get_updates(self, loss, params):
-    self.updates = [K.update_add(self.iterations, 1)]
-    if not params:
-      return self.updates
-    grads = self.optimizer.compute_gradients(loss, params)
-    opt_update = self.optimizer.apply_gradients(
-        grads, global_step=self.iterations)
+    if distribute_lib.has_distribution_strategy():
+      self.updates = []
+
+      if not params:
+        # After the model vars have been created, the second call to get_updates
+        # is called with params as an empty list. This ensures that we call
+        # compute_gradients with params=None.
+        grads = self.optimizer.compute_gradients(loss)
+      else:
+        grads = self.optimizer.compute_gradients(loss, params)
+      global_step = training_util.get_global_step()
+      opt_update = self.optimizer.apply_gradients(grads, global_step)
+    else:
+      self.updates = [state_ops.assign_add(self.iterations, 1)]
+      if not params:
+        return self.updates
+
+      grads = self.optimizer.compute_gradients(loss, params)
+      opt_update = self.optimizer.apply_gradients(
+          grads, global_step=self.iterations)
+
     self.updates.append(opt_update)
     return self.updates
 
diff --git a/tensorflow/python/keras/_impl/keras/regularizers.py b/tensorflow/python/keras/_impl/keras/regularizers.py
index 2c30844647acdb78d1ca31d052ec7e5ecc6dcc2a..74c37d370ea630ca3c3e5e0945828f63928572e1 100644
--- a/tensorflow/python/keras/_impl/keras/regularizers.py
+++ b/tensorflow/python/keras/_impl/keras/regularizers.py
@@ -23,6 +23,7 @@ import six
 from tensorflow.python.keras._impl.keras import backend as K
 from tensorflow.python.keras._impl.keras.utils.generic_utils import deserialize_keras_object
 from tensorflow.python.keras._impl.keras.utils.generic_utils import serialize_keras_object
+from tensorflow.python.ops import math_ops
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -55,9 +56,9 @@ class L1L2(Regularizer):
   def __call__(self, x):
     regularization = 0.
     if self.l1:
-      regularization += K.sum(self.l1 * K.abs(x))
+      regularization += math_ops.reduce_sum(self.l1 * math_ops.abs(x))
     if self.l2:
-      regularization += K.sum(self.l2 * K.square(x))
+      regularization += math_ops.reduce_sum(self.l2 * math_ops.square(x))
     return regularization
 
   def get_config(self):
diff --git a/tensorflow/python/keras/_impl/keras/utils/conv_utils.py b/tensorflow/python/keras/_impl/keras/utils/conv_utils.py
index 583079d9626361eb594f16a57af86f103e5ee74d..8882a3a46bcb9de7283a67f001e67ed8644a0cf7 100644
--- a/tensorflow/python/keras/_impl/keras/utils/conv_utils.py
+++ b/tensorflow/python/keras/_impl/keras/utils/conv_utils.py
@@ -21,17 +21,146 @@ from __future__ import print_function
 import numpy as np
 from six.moves import range  # pylint: disable=redefined-builtin
 
-# pylint: disable=unused-import
-from tensorflow.python.keras._impl.keras import backend as K
-from tensorflow.python.layers.utils import conv_input_length
-from tensorflow.python.layers.utils import conv_output_length
-from tensorflow.python.layers.utils import deconv_output_length as deconv_length
-from tensorflow.python.layers.utils import normalize_tuple
+from tensorflow.python.keras._impl.keras import backend
+
+
+def convert_data_format(data_format, ndim):
+  if data_format == 'channels_last':
+    if ndim == 3:
+      return 'NWC'
+    elif ndim == 4:
+      return 'NHWC'
+    elif ndim == 5:
+      return 'NDHWC'
+    else:
+      raise ValueError('Input rank not supported:', ndim)
+  elif data_format == 'channels_first':
+    if ndim == 3:
+      return 'NCW'
+    elif ndim == 4:
+      return 'NCHW'
+    elif ndim == 5:
+      return 'NCDHW'
+    else:
+      raise ValueError('Input rank not supported:', ndim)
+  else:
+    raise ValueError('Invalid data_format:', data_format)
+
+
+def normalize_tuple(value, n, name):
+  """Transforms a single integer or iterable of integers into an integer tuple.
+
+  Arguments:
+    value: The value to validate and convert. Could an int, or any iterable
+      of ints.
+    n: The size of the tuple to be returned.
+    name: The name of the argument being validated, e.g. "strides" or
+      "kernel_size". This is only used to format error messages.
+
+  Returns:
+    A tuple of n integers.
+
+  Raises:
+    ValueError: If something else than an int/long or iterable thereof was
+      passed.
+  """
+  if isinstance(value, int):
+    return (value,) * n
+  else:
+    try:
+      value_tuple = tuple(value)
+    except TypeError:
+      raise ValueError('The `' + name + '` argument must be a tuple of ' +
+                       str(n) + ' integers. Received: ' + str(value))
+    if len(value_tuple) != n:
+      raise ValueError('The `' + name + '` argument must be a tuple of ' +
+                       str(n) + ' integers. Received: ' + str(value))
+    for single_value in value_tuple:
+      try:
+        int(single_value)
+      except (ValueError, TypeError):
+        raise ValueError('The `' + name + '` argument must be a tuple of ' +
+                         str(n) + ' integers. Received: ' + str(value) + ' '
+                         'including element ' + str(single_value) + ' of type' +
+                         ' ' + str(type(single_value)))
+    return value_tuple
+
+
+def conv_output_length(input_length, filter_size, padding, stride, dilation=1):
+  """Determines output length of a convolution given input length.
+
+  Arguments:
+      input_length: integer.
+      filter_size: integer.
+      padding: one of "same", "valid", "full".
+      stride: integer.
+      dilation: dilation rate, integer.
+
+  Returns:
+      The output length (integer).
+  """
+  if input_length is None:
+    return None
+  assert padding in {'same', 'valid', 'full'}
+  dilated_filter_size = filter_size + (filter_size - 1) * (dilation - 1)
+  if padding == 'same':
+    output_length = input_length
+  elif padding == 'valid':
+    output_length = input_length - dilated_filter_size + 1
+  elif padding == 'full':
+    output_length = input_length + dilated_filter_size - 1
+  return (output_length + stride - 1) // stride
+
+
+def conv_input_length(output_length, filter_size, padding, stride):
+  """Determines input length of a convolution given output length.
+
+  Arguments:
+      output_length: integer.
+      filter_size: integer.
+      padding: one of "same", "valid", "full".
+      stride: integer.
+
+  Returns:
+      The input length (integer).
+  """
+  if output_length is None:
+    return None
+  assert padding in {'same', 'valid', 'full'}
+  if padding == 'same':
+    pad = filter_size // 2
+  elif padding == 'valid':
+    pad = 0
+  elif padding == 'full':
+    pad = filter_size - 1
+  return (output_length - 1) * stride - 2 * pad + filter_size
+
+
+def deconv_output_length(input_length, filter_size, padding, stride):
+  """Determines output length of a transposed convolution given input length.
+
+  Arguments:
+      input_length: integer.
+      filter_size: integer.
+      padding: one of "same", "valid", "full".
+      stride: integer.
+
+  Returns:
+      The output length (integer).
+  """
+  if input_length is None:
+    return None
+  input_length *= stride
+  if padding == 'valid':
+    input_length += max(filter_size - stride, 0)
+  elif padding == 'full':
+    input_length -= (stride + filter_size - 2)
+  return input_length
 
 
 def normalize_data_format(value):
   if value is None:
-    value = K.image_data_format()
+    value = backend.image_data_format()
   data_format = value.lower()
   if data_format not in {'channels_first', 'channels_last'}:
     raise ValueError('The `data_format` argument must be one of '
diff --git a/tensorflow/python/keras/_impl/keras/utils/io_utils.py b/tensorflow/python/keras/_impl/keras/utils/io_utils.py
index bbf1d2a3d9c3948271780ec3fad3316b4e6d53c3..f82e3277de70a631c93f0ef3c240f41ddb3390a7 100644
--- a/tensorflow/python/keras/_impl/keras/utils/io_utils.py
+++ b/tensorflow/python/keras/_impl/keras/utils/io_utils.py
@@ -19,9 +19,9 @@ from __future__ import division
 from __future__ import print_function
 
 from collections import defaultdict
-import sys
 
 import numpy as np
+import six
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -160,13 +160,11 @@ def ask_to_proceed_with_overwrite(filepath):
   Returns:
       True if we can proceed with overwrite, False otherwise.
   """
-  get_input = input
-  if sys.version_info[:2] <= (2, 7):
-    get_input = raw_input
-  overwrite = get_input('[WARNING] %s already exists - overwrite? '
-                        '[y/n]' % (filepath))
-  while overwrite not in ['y', 'n']:
-    overwrite = get_input('Enter "y" (overwrite) or "n" (cancel).')
+  overwrite = six.moves.input('[WARNING] %s already exists - overwrite? '
+                              '[y/n]' % (filepath)).strip().lower()
+  while overwrite not in ('y', 'n'):
+    overwrite = six.moves.input('Enter "y" (overwrite) or "n" '
+                                '(cancel).').strip().lower()
   if overwrite == 'n':
     return False
   print('[TIP] Next time specify overwrite=True!')
diff --git a/tensorflow/python/keras/_impl/keras/utils/layer_utils.py b/tensorflow/python/keras/_impl/keras/utils/layer_utils.py
index 4c8009dfd80e1aec457fa03687f2840c7fe4607b..902972ecbb8fd69a9252b7e19e32bee5e33e4f97 100644
--- a/tensorflow/python/keras/_impl/keras/utils/layer_utils.py
+++ b/tensorflow/python/keras/_impl/keras/utils/layer_utils.py
@@ -35,7 +35,7 @@ def count_params(weights):
   Returns:
       The total number of scalars composing the weights
   """
-  return int(np.sum([K.count_params(p) for p in set(weights)]))
+  return int(np.sum([np.prod(p.get_shape().as_list()) for p in set(weights)]))
 
 
 def print_summary(model, line_length=None, positions=None, print_fn=None):
@@ -193,8 +193,7 @@ def print_summary(model, line_length=None, positions=None, print_fn=None):
   else:
     trainable_count = count_params(model.trainable_weights)
 
-  non_trainable_count = int(
-      np.sum([K.count_params(p) for p in set(model.non_trainable_weights)]))
+  non_trainable_count = count_params(model.non_trainable_weights)
 
   print_fn('Total params: {:,}'.format(trainable_count + non_trainable_count))
   print_fn('Trainable params: {:,}'.format(trainable_count))
diff --git a/tensorflow/python/keras/_impl/keras/utils/tf_utils.py b/tensorflow/python/keras/_impl/keras/utils/tf_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..8da5f7777733767f31fad205a23c2f08f9ffbb1c
--- /dev/null
+++ b/tensorflow/python/keras/_impl/keras/utils/tf_utils.py
@@ -0,0 +1,74 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""TensorFlow-related utilities."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import smart_cond as smart_module
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import variables
+
+
+def smart_cond(pred, true_fn=None, false_fn=None, name=None):
+  """Return either `true_fn()` if predicate `pred` is true else `false_fn()`.
+
+  If `pred` is a bool or has a constant value, we return either `true_fn()`
+  or `false_fn()`, otherwise we use `tf.cond` to dynamically route to both.
+
+  Arguments:
+    pred: A scalar determining whether to return the result of `true_fn` or
+      `false_fn`.
+    true_fn: The callable to be performed if pred is true.
+    false_fn: The callable to be performed if pred is false.
+    name: Optional name prefix when using `tf.cond`.
+
+  Returns:
+    Tensors returned by the call to either `true_fn` or `false_fn`.
+
+  Raises:
+    TypeError: If `true_fn` or `false_fn` is not callable.
+  """
+  if isinstance(pred, variables.Variable):
+    return control_flow_ops.cond(
+        pred, true_fn=true_fn, false_fn=false_fn, name=name)
+  return smart_module.smart_cond(
+      pred, true_fn=true_fn, false_fn=false_fn, name=name)
+
+
+def constant_value(pred):
+  """Return the bool value for `pred`, or None if `pred` had a dynamic value.
+
+  Arguments:
+    pred: A scalar, either a Python bool or a TensorFlow boolean variable
+      or tensor, or the Python integer 1 or 0.
+
+  Returns:
+    True or False if `pred` has a constant boolean value, None otherwise.
+
+  Raises:
+    TypeError: If `pred` is not a Variable, Tensor or bool, or Python
+      integer 1 or 0.
+  """
+  # Allow integer booleans.
+  if isinstance(pred, int):
+    if pred == 1:
+      pred = True
+    elif pred == 0:
+      pred = False
+
+  if isinstance(pred, variables.Variable):
+    return None
+  return smart_module.smart_constant_value(pred)
diff --git a/tensorflow/python/keras/layers/__init__.py b/tensorflow/python/keras/layers/__init__.py
index 84ee5040dcd7b118a5c63b6532135913fe238797..b45cafed3186a096333bcdc5cbcec13556918872 100644
--- a/tensorflow/python/keras/layers/__init__.py
+++ b/tensorflow/python/keras/layers/__init__.py
@@ -49,6 +49,7 @@ from tensorflow.python.keras._impl.keras.layers.convolutional import Convolution
 from tensorflow.python.keras._impl.keras.layers.convolutional import Convolution3DTranspose
 from tensorflow.python.keras._impl.keras.layers.convolutional import SeparableConvolution1D
 from tensorflow.python.keras._impl.keras.layers.convolutional import SeparableConvolution2D
+from tensorflow.python.keras._impl.keras.layers.convolutional import DepthwiseConv2D
 
 # Image processing layers.
 from tensorflow.python.keras._impl.keras.layers.convolutional import UpSampling1D
diff --git a/tensorflow/python/kernel_tests/BUILD b/tensorflow/python/kernel_tests/BUILD
index 228d1c245248c972d7d504df10251e5e45076a2e..1827a26902dd06729437893756f4e4ab57926866 100644
--- a/tensorflow/python/kernel_tests/BUILD
+++ b/tensorflow/python/kernel_tests/BUILD
@@ -295,7 +295,6 @@ tf_py_test(
         "//tensorflow/python:nn_grad",
     ],
     data = ["//tensorflow/core:image_testdata"],
-    tags = ["no_windows"],
 )
 
 tf_py_test(
@@ -1030,12 +1029,14 @@ tf_py_test(
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:init_ops",
+        "//tensorflow/python:layers",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python:resource_variable_ops",
         "//tensorflow/python:state_ops",
         "//tensorflow/python:variables",
         "//tensorflow/python/eager:context",
+        "//tensorflow/python/eager:function",
     ],
     tags = ["no_windows"],
 )
@@ -1142,7 +1143,6 @@ tf_py_test(
         "//tensorflow/python:variables",
     ],
     data = ["//tensorflow/core:lmdb_testdata"],
-    tags = ["no_windows"],
 )
 
 cuda_py_test(
@@ -1192,6 +1192,22 @@ cuda_py_test(
     shard_count = 10,
 )
 
+cuda_py_test(
+    name = "inplace_ops_test",
+    size = "small",
+    srcs = ["inplace_ops_test.py"],
+    additional_deps = [
+        "//third_party/py/numpy",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:math_ops",
+    ],
+    shard_count = 10,
+)
+
 cuda_py_test(
     name = "batch_matmul_op_test",
     size = "small",
@@ -1569,7 +1585,7 @@ cuda_py_test(
 
 cuda_py_test(
     name = "init_ops_test",
-    size = "small",
+    size = "medium",
     srcs = ["init_ops_test.py"],
     additional_deps = [
         "//third_party/py/numpy",
@@ -2332,7 +2348,6 @@ cuda_py_test(
         "//tensorflow/python:variables",
     ],
     shard_count = 4,
-    tags = ["no_windows"],
 )
 
 cuda_py_test(
@@ -2463,7 +2478,6 @@ cuda_py_test(
         "//tensorflow/python/eager:context",
     ],
     shard_count = 10,
-    tags = ["no_windows"],
 )
 
 cuda_py_test(
@@ -2523,7 +2537,10 @@ cuda_py_test(
         "//tensorflow/python:sparse_ops",
     ],
     shard_count = 5,
-    tags = ["noasan"],
+    tags = [
+        "noasan",
+        "optonly",  # b/77589990
+    ],
 )
 
 cuda_py_test(
@@ -2726,6 +2743,7 @@ cuda_py_test(
     ],
     data = ["//tensorflow/python/kernel_tests/testdata:self_adjoint_eig_op_test_files"],
     shard_count = 20,
+    tags = ["no_windows"],
 )
 
 cuda_py_test(
@@ -2945,15 +2963,3 @@ tf_py_test(
         "//tensorflow/python/eager:tape",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/python/kernel_tests/array_ops_test.py b/tensorflow/python/kernel_tests/array_ops_test.py
index 64c1760d5e72c8dd2b0b8adb09cc3612f85228b0..5a20eebbc559cf6a3cad97adb8aa056cb88719cb 100644
--- a/tensorflow/python/kernel_tests/array_ops_test.py
+++ b/tensorflow/python/kernel_tests/array_ops_test.py
@@ -780,6 +780,14 @@ class StridedSliceGradTest(test_util.TensorFlowTestCase):
       grad = GradSliceChecker(self, sess, var, np.array(8))
       _ = grad[tuple()]
 
+  def testInt64Indices(self):
+    with self.test_session(use_gpu=True) as sess:
+      a = math_ops.range(3)
+      index = constant_op.constant(1, dtype=dtypes.int64)
+      b = 2 * a[index]
+      grad, = gradients_impl.gradients(b, a)
+      self.assertAllEqual(sess.run(grad), [0, 2, 0])
+
 
 class StridedSliceGradTypeTest(test_util.TensorFlowTestCase):
   """Test varied index types and host located memory."""
@@ -999,30 +1007,38 @@ class SliceAssignTest(test_util.TensorFlowTestCase):
 
 class ShapeSizeRankTest(test_util.TensorFlowTestCase):
 
+  @test_util.run_in_graph_and_eager_modes()
   def testDenseShape(self):
-    with self.test_session():
-      t_value = [[0, 42], [24, 0]]
-      self.assertAllEqual((2, 2), array_ops.shape(t_value).eval())
-      self.assertEqual(4, array_ops.size(t_value).eval())
-      self.assertEqual(2, array_ops.rank(t_value).eval())
+    t_value = [[0, 42], [24, 0]]
+    self.assertAllEqual((2, 2), self.evaluate(array_ops.shape(t_value)))
+    self.assertEqual(4, self.evaluate(array_ops.size(t_value)))
+    self.assertEqual(2, self.evaluate(array_ops.rank(t_value)))
 
-      t = constant_op.constant(t_value)
-      self.assertAllEqual((2, 2), array_ops.shape(t).eval())
-      self.assertEqual(4, array_ops.size(t).eval())
-      self.assertEqual(2, array_ops.rank(t).eval())
+    t = constant_op.constant(t_value)
+    self.assertAllEqual((2, 2), self.evaluate(array_ops.shape(t)))
+    self.assertEqual(4, self.evaluate(array_ops.size(t)))
+    self.assertEqual(2, self.evaluate(array_ops.rank(t)))
 
+  @test_util.run_in_graph_and_eager_modes()
   def testSparseShape(self):
-    with self.test_session():
-      sp_value = sparse_tensor.SparseTensorValue(
-          indices=((0, 1), (1, 0)), values=(42, 24), dense_shape=(2, 2))
-      self.assertAllEqual((2, 2), array_ops.shape(sp_value).eval())
-      self.assertEqual(4, array_ops.size(sp_value).eval())
-      self.assertEqual(2, array_ops.rank(sp_value).eval())
-
-      sp = sparse_tensor.SparseTensor.from_value(sp_value)
-      self.assertAllEqual((2, 2), array_ops.shape(sp).eval())
-      self.assertEqual(4, array_ops.size(sp).eval())
-      self.assertEqual(2, array_ops.rank(sp).eval())
+    sp_value = sparse_tensor.SparseTensorValue(
+        indices=((0, 1), (1, 0)), values=(42, 24), dense_shape=(2, 2))
+    self.assertAllEqual((2, 2), self.evaluate(array_ops.shape(sp_value)))
+    self.assertEqual(4, self.evaluate(array_ops.size(sp_value)))
+    self.assertEqual(2, self.evaluate(array_ops.rank(sp_value)))
+
+    sp = sparse_tensor.SparseTensor.from_value(sp_value)
+    self.assertAllEqual((2, 2), self.evaluate(array_ops.shape(sp)))
+    self.assertEqual(4, self.evaluate(array_ops.size(sp)))
+    self.assertEqual(2, self.evaluate(array_ops.rank(sp)))
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testSizeDtype(self):
+    tensor = [1]
+    self.assertEqual(dtypes.int32, self.evaluate(array_ops.size(tensor)).dtype)
+    self.assertEqual(
+        dtypes.int64,
+        self.evaluate(array_ops.size(tensor, out_type=dtypes.int64)).dtype)
 
 
 @test_util.with_c_api
diff --git a/tensorflow/python/kernel_tests/boosted_trees/BUILD b/tensorflow/python/kernel_tests/boosted_trees/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..30e6289420b36a75589ef25150480e48f8245ec2
--- /dev/null
+++ b/tensorflow/python/kernel_tests/boosted_trees/BUILD
@@ -0,0 +1,76 @@
+# Description:
+#   Kernel tests for Boosted Trees.
+
+package(
+    default_visibility = ["//tensorflow:internal"],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow:tensorflow.bzl", "tf_py_test")
+
+filegroup(
+    name = "all_files",
+    srcs = glob(
+        ["**/*"],
+        exclude = [
+            "**/METADATA",
+            "**/OWNERS",
+        ],
+    ),
+    visibility = ["//tensorflow:__subpackages__"],
+)
+
+tf_py_test(
+    name = "resource_ops_test",
+    size = "small",
+    srcs = ["resource_ops_test.py"],
+    additional_deps = [
+        "//tensorflow/core/kernels/boosted_trees:boosted_trees_proto_py",
+        "//tensorflow/python:boosted_trees_ops",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:resources",
+        "//tensorflow/python:training",
+        "//tensorflow/python:variables",
+    ],
+)
+
+tf_py_test(
+    name = "prediction_ops_test",
+    size = "small",
+    srcs = ["prediction_ops_test.py"],
+    additional_deps = [
+        "//tensorflow/core/kernels/boosted_trees:boosted_trees_proto_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:boosted_trees_ops",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:resources",
+    ],
+)
+
+tf_py_test(
+    name = "stats_ops_test",
+    size = "small",
+    srcs = ["stats_ops_test.py"],
+    additional_deps = [
+        "//tensorflow/python:boosted_trees_ops",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:framework_test_lib",
+    ],
+)
+
+tf_py_test(
+    name = "training_ops_test",
+    size = "small",
+    srcs = ["training_ops_test.py"],
+    additional_deps = [
+        "//tensorflow/core/kernels/boosted_trees:boosted_trees_proto_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:boosted_trees_ops",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:resources",
+    ],
+)
diff --git a/tensorflow/python/kernel_tests/boosted_trees/__init__.py b/tensorflow/python/kernel_tests/boosted_trees/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/tensorflow/python/kernel_tests/boosted_trees/prediction_ops_test.py b/tensorflow/python/kernel_tests/boosted_trees/prediction_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..d132f15e51dbc6cd8e706e36b889352c20792cdf
--- /dev/null
+++ b/tensorflow/python/kernel_tests/boosted_trees/prediction_ops_test.py
@@ -0,0 +1,926 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests boosted_trees prediction kernels."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from google.protobuf import text_format
+from tensorflow.core.kernels.boosted_trees import boosted_trees_pb2
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import boosted_trees_ops
+from tensorflow.python.ops import resources
+from tensorflow.python.platform import googletest
+
+
+class TrainingPredictionOpsTest(test_util.TensorFlowTestCase):
+  """Tests prediction ops for training."""
+
+  def testCachedPredictionOnEmptyEnsemble(self):
+    """Tests that prediction on a dummy ensemble does not fail."""
+    with self.test_session() as session:
+      # Create a dummy ensemble.
+      tree_ensemble = boosted_trees_ops.TreeEnsemble(
+          'ensemble', serialized_proto='')
+      tree_ensemble_handle = tree_ensemble.resource_handle
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      # No previous cached values.
+      cached_tree_ids = [0, 0]
+      cached_node_ids = [0, 0]
+
+      # We have two features: 0 and 1. Values don't matter here on a dummy
+      # ensemble.
+      feature_0_values = [67, 5]
+      feature_1_values = [9, 17]
+
+      # Grow tree ensemble.
+      predict_op = boosted_trees_ops.training_predict(
+          tree_ensemble_handle,
+          max_depth=2,
+          cached_tree_ids=cached_tree_ids,
+          cached_node_ids=cached_node_ids,
+          bucketized_features=[feature_0_values, feature_1_values],
+          logits_dimension=1)
+
+      logits_updates, new_tree_ids, new_node_ids = session.run(predict_op)
+
+      # Nothing changed.
+      self.assertAllClose(cached_tree_ids, new_tree_ids)
+      self.assertAllClose(cached_node_ids, new_node_ids)
+      self.assertAllClose([[0], [0]], logits_updates)
+
+  def testNoCachedPredictionButTreeExists(self):
+    """Tests that predictions are updated once trees are added."""
+    with self.test_session() as session:
+      tree_ensemble_config = boosted_trees_pb2.TreeEnsemble()
+      text_format.Merge("""
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id: 0
+              threshold: 15
+              left_id: 1
+              right_id: 2
+            }
+            metadata {
+              gain: 7.62
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 1.14
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 8.79
+            }
+          }
+        }
+        tree_weights: 0.1
+        tree_metadata {
+          is_finalized: true
+          num_layers_grown: 1
+        }
+        growing_metadata {
+          num_trees_attempted: 1
+          num_layers_attempted: 2
+        }
+      """, tree_ensemble_config)
+
+      # Create existing ensemble with one root split
+      tree_ensemble = boosted_trees_ops.TreeEnsemble(
+          'ensemble', serialized_proto=tree_ensemble_config.SerializeToString())
+      tree_ensemble_handle = tree_ensemble.resource_handle
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      # Two examples, none were cached before.
+      cached_tree_ids = [0, 0]
+      cached_node_ids = [0, 0]
+
+      feature_0_values = [67, 5]
+
+      # Grow tree ensemble.
+      predict_op = boosted_trees_ops.training_predict(
+          tree_ensemble_handle,
+          max_depth=2,
+          cached_tree_ids=cached_tree_ids,
+          cached_node_ids=cached_node_ids,
+          bucketized_features=[feature_0_values],
+          logits_dimension=1)
+
+      logits_updates, new_tree_ids, new_node_ids = session.run(predict_op)
+
+      # We are in the first tree.
+      self.assertAllClose([0, 0], new_tree_ids)
+      self.assertAllClose([2, 1], new_node_ids)
+      self.assertAllClose([[0.1 * 8.79], [0.1 * 1.14]], logits_updates)
+
+  def testCachedPredictionIsCurrent(self):
+    """Tests that prediction based on previous node in the tree works."""
+    with self.test_session() as session:
+      tree_ensemble_config = boosted_trees_pb2.TreeEnsemble()
+      text_format.Merge("""
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id: 1
+              threshold: 15
+              left_id: 1
+              right_id: 2
+            }
+            metadata {
+              gain: 7.62
+              original_leaf {
+                scalar: -2
+              }
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 1.14
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 8.79
+            }
+          }
+        }
+        tree_weights: 0.1
+        tree_metadata {
+          is_finalized: true
+          num_layers_grown: 2
+        }
+        growing_metadata {
+          num_trees_attempted: 1
+          num_layers_attempted: 2
+        }
+      """, tree_ensemble_config)
+
+      # Create existing ensemble with one root split
+      tree_ensemble = boosted_trees_ops.TreeEnsemble(
+          'ensemble', serialized_proto=tree_ensemble_config.SerializeToString())
+      tree_ensemble_handle = tree_ensemble.resource_handle
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      # Two examples, one was cached in node 1 first, another in node 0.
+      cached_tree_ids = [0, 0]
+      cached_node_ids = [1, 2]
+
+      # We have two features: 0 and 1. Values don't matter because trees didn't
+      # change.
+      feature_0_values = [67, 5]
+      feature_1_values = [9, 17]
+
+      # Grow tree ensemble.
+      predict_op = boosted_trees_ops.training_predict(
+          tree_ensemble_handle,
+          max_depth=4,
+          cached_tree_ids=cached_tree_ids,
+          cached_node_ids=cached_node_ids,
+          bucketized_features=[feature_0_values, feature_1_values],
+          logits_dimension=1)
+
+      logits_updates, new_tree_ids, new_node_ids = session.run(predict_op)
+
+      # Nothing changed.
+      self.assertAllClose(cached_tree_ids, new_tree_ids)
+      self.assertAllClose(cached_node_ids, new_node_ids)
+      self.assertAllClose([[0], [0]], logits_updates)
+
+  def testCachedPredictionFromTheSameTree(self):
+    """Tests that prediction based on previous node in the tree works."""
+    with self.test_session() as session:
+      tree_ensemble_config = boosted_trees_pb2.TreeEnsemble()
+      text_format.Merge("""
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id: 1
+              threshold: 15
+              left_id: 1
+              right_id: 2
+            }
+            metadata {
+              gain: 7.62
+              original_leaf {
+                scalar: -2
+              }
+            }
+          }
+          nodes {
+            bucketized_split {
+              feature_id: 1
+              threshold: 7
+              left_id: 3
+              right_id: 4
+            }
+            metadata {
+              gain: 1.4
+              original_leaf {
+                scalar: 7.14
+              }
+            }
+          }
+          nodes {
+            bucketized_split {
+              feature_id: 0
+              threshold: 7
+              left_id: 5
+              right_id: 6
+            }
+            metadata {
+              gain: 2.7
+              original_leaf {
+                scalar: -4.375
+              }
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 1.14
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 8.79
+            }
+          }
+          nodes {
+            leaf {
+              scalar: -5.875
+            }
+          }
+          nodes {
+            leaf {
+              scalar: -2.075
+            }
+          }
+        }
+        tree_weights: 0.1
+        tree_metadata {
+          is_finalized: true
+          num_layers_grown: 2
+        }
+        growing_metadata {
+          num_trees_attempted: 1
+          num_layers_attempted: 2
+        }
+      """, tree_ensemble_config)
+
+      # Create existing ensemble with one root split
+      tree_ensemble = boosted_trees_ops.TreeEnsemble(
+          'ensemble', serialized_proto=tree_ensemble_config.SerializeToString())
+      tree_ensemble_handle = tree_ensemble.resource_handle
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      # Two examples, one was cached in node 1 first, another in node 0.
+      cached_tree_ids = [0, 0]
+      cached_node_ids = [1, 0]
+
+      # We have two features: 0 and 1.
+      feature_0_values = [67, 5]
+      feature_1_values = [9, 17]
+
+      # Grow tree ensemble.
+      predict_op = boosted_trees_ops.training_predict(
+          tree_ensemble_handle,
+          max_depth=4,
+          cached_tree_ids=cached_tree_ids,
+          cached_node_ids=cached_node_ids,
+          bucketized_features=[feature_0_values, feature_1_values],
+          logits_dimension=1)
+
+      logits_updates, new_tree_ids, new_node_ids = session.run(predict_op)
+
+      # We are still in the same tree.
+      self.assertAllClose([0, 0], new_tree_ids)
+      # When using the full tree, the first example will end up in node 4,
+      # the second in node 5.
+      self.assertAllClose([4, 5], new_node_ids)
+      # Full predictions for each instance would be 8.79 and -5.875,
+      # so an update from the previous cached values lr*(7.14 and -2) would be
+      # 1.65 and -3.875, and then multiply them by 0.1 (lr)
+      self.assertAllClose([[0.1 * 1.65], [0.1 * -3.875]], logits_updates)
+
+  def testCachedPredictionFromPreviousTree(self):
+    """Tests the predictions work when we have cache from previous trees."""
+    with self.test_session() as session:
+      tree_ensemble_config = boosted_trees_pb2.TreeEnsemble()
+      text_format.Merge("""
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id: 1
+              threshold: 28
+              left_id: 1
+              right_id: 2
+            }
+            metadata {
+              gain: 7.62
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 1.14
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 8.79
+            }
+          }
+        }
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id: 1
+              threshold: 26
+              left_id: 1
+              right_id: 2
+            }
+          }
+          nodes {
+            bucketized_split {
+              feature_id: 0
+              threshold: 50
+              left_id: 3
+              right_id: 4
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 7
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 5
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 6
+            }
+          }
+        }
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id: 0
+              threshold: 34
+              left_id: 1
+              right_id: 2
+            }
+          }
+          nodes {
+            leaf {
+              scalar: -7.0
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 5.0
+            }
+          }
+        }
+        tree_metadata {
+          is_finalized: true
+        }
+        tree_metadata {
+          is_finalized: true
+        }
+        tree_metadata {
+          is_finalized: false
+        }
+        tree_weights: 0.1
+        tree_weights: 0.1
+        tree_weights: 0.1
+      """, tree_ensemble_config)
+
+      # Create existing ensemble with one root split
+      tree_ensemble = boosted_trees_ops.TreeEnsemble(
+          'ensemble', serialized_proto=tree_ensemble_config.SerializeToString())
+      tree_ensemble_handle = tree_ensemble.resource_handle
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      # Two examples, one was cached in node 1 first, another in node 2.
+      cached_tree_ids = [0, 0]
+      cached_node_ids = [1, 0]
+
+      # We have two features: 0 and 1.
+      feature_0_values = [36, 32]
+      feature_1_values = [11, 27]
+
+      # Grow tree ensemble.
+      predict_op = boosted_trees_ops.training_predict(
+          tree_ensemble_handle,
+          max_depth=2,
+          cached_tree_ids=cached_tree_ids,
+          cached_node_ids=cached_node_ids,
+          bucketized_features=[feature_0_values, feature_1_values],
+          logits_dimension=1)
+
+      logits_updates, new_tree_ids, new_node_ids = session.run(predict_op)
+      # Example 1 will get to node 3 in tree 1 and node 2 of tree 2
+      # Example 2 will get to node 2 in tree 1 and node 1 of tree 2
+
+      # We are in the last tree.
+      self.assertAllClose([2, 2], new_tree_ids)
+      # When using the full tree, the first example will end up in node 4,
+      # the second in node 5.
+      self.assertAllClose([2, 1], new_node_ids)
+      # Example 1: tree 0: 8.79, tree 1: 5.0, tree 2: 5.0 = >
+      #            change = 0.1*(5.0+5.0)
+      # Example 2: tree 0: 1.14, tree 1: 7.0, tree 2: -7 = >
+      #            change= 0.1(1.14+7.0-7.0)
+      self.assertAllClose([[1], [0.114]], logits_updates)
+
+  def testCachedPredictionFromTheSameTreeWithPostPrunedNodes(self):
+    """Tests that prediction based on previous node in the tree works."""
+    with self.test_session() as session:
+      tree_ensemble_config = boosted_trees_pb2.TreeEnsemble()
+      text_format.Merge("""
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id:0
+              threshold: 33
+              left_id: 1
+              right_id: 2
+            }
+            metadata {
+              gain: -0.2
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.01
+            }
+          }
+          nodes {
+            bucketized_split {
+              feature_id: 1
+              threshold: 5
+              left_id: 3
+              right_id: 4
+            }
+            metadata {
+              gain: 0.5
+              original_leaf {
+                scalar: 0.0143
+               }
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.0553
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.0783
+            }
+          }
+        }
+        tree_weights: 1.0
+        tree_metadata {
+          num_layers_grown: 3
+          is_finalized: true
+          post_pruned_nodes_meta {
+            new_node_id: 0
+            logit_change: 0.0
+          }
+          post_pruned_nodes_meta {
+            new_node_id: 1
+            logit_change: 0.0
+          }
+          post_pruned_nodes_meta {
+            new_node_id: 2
+            logit_change: 0.0
+          }
+          post_pruned_nodes_meta {
+            new_node_id: 1
+            logit_change: -0.07
+          }
+          post_pruned_nodes_meta {
+            new_node_id: 1
+            logit_change: -0.083
+          }
+          post_pruned_nodes_meta {
+            new_node_id: 3
+            logit_change: 0.0
+          }
+          post_pruned_nodes_meta {
+            new_node_id: 4
+            logit_change: 0.0
+          }
+          post_pruned_nodes_meta {
+            new_node_id: 1
+            logit_change: -0.22
+          }
+          post_pruned_nodes_meta {
+            new_node_id: 1
+            logit_change: -0.57
+          }
+        }
+        growing_metadata {
+          num_trees_attempted: 1
+          num_layers_attempted: 3
+        }
+      """, tree_ensemble_config)
+
+      # Create existing ensemble.
+      tree_ensemble = boosted_trees_ops.TreeEnsemble(
+          'ensemble', serialized_proto=tree_ensemble_config.SerializeToString())
+      tree_ensemble_handle = tree_ensemble.resource_handle
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      cached_tree_ids = [0, 0, 0, 0, 0, 0]
+      # Leaves 3,4, 7 and 8 got deleted during post-pruning, leaves 5 and 6
+      # changed the ids to 3 and 4 respectively.
+      cached_node_ids = [3, 4, 5, 6, 7, 8]
+
+      # We have two features: 0 and 1.
+      feature_0_values = [12, 17, 35, 36, 23, 11]
+      feature_1_values = [12, 12, 17, 18, 123, 24]
+
+      # Grow tree ensemble.
+      predict_op = boosted_trees_ops.training_predict(
+          tree_ensemble_handle,
+          max_depth=3,
+          cached_tree_ids=cached_tree_ids,
+          cached_node_ids=cached_node_ids,
+          bucketized_features=[feature_0_values, feature_1_values],
+          logits_dimension=1)
+
+      logits_updates, new_tree_ids, new_node_ids = session.run(predict_op)
+
+      # We are still in the same tree.
+      self.assertAllClose([0, 0, 0, 0, 0, 0], new_tree_ids)
+      # Examples from leaves 3,4,7,8 should be in leaf 1, examples from leaf 5
+      # and 6 in leaf 3 and 4.
+      self.assertAllClose([1, 1, 3, 4, 1, 1], new_node_ids)
+
+      cached_values = [[0.08], [0.093], [0.0553], [0.0783], [0.15 + 0.08],
+                       [0.5 + 0.08]]
+      self.assertAllClose([[0.01], [0.01], [0.0553], [0.0783], [0.01], [0.01]],
+                          logits_updates + cached_values)
+
+  def testCachedPredictionFromThePreviousTreeWithPostPrunedNodes(self):
+    """Tests that prediction based on previous node in the tree works."""
+    with self.test_session() as session:
+      tree_ensemble_config = boosted_trees_pb2.TreeEnsemble()
+      text_format.Merge("""
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id:0
+              threshold: 33
+              left_id: 1
+              right_id: 2
+            }
+            metadata {
+              gain: -0.2
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.01
+            }
+          }
+          nodes {
+            bucketized_split {
+              feature_id: 1
+              threshold: 5
+              left_id: 3
+              right_id: 4
+            }
+            metadata {
+              gain: 0.5
+              original_leaf {
+                scalar: 0.0143
+               }
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.0553
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.0783
+            }
+          }
+        }
+        trees {
+          nodes {
+            leaf {
+              scalar: 0.55
+            }
+          }
+        }
+        tree_weights: 1.0
+        tree_weights: 1.0
+        tree_metadata {
+          num_layers_grown: 3
+          is_finalized: true
+          post_pruned_nodes_meta {
+            new_node_id: 0
+            logit_change: 0.0
+          }
+          post_pruned_nodes_meta {
+            new_node_id: 1
+            logit_change: 0.0
+          }
+          post_pruned_nodes_meta {
+            new_node_id: 2
+            logit_change: 0.0
+          }
+          post_pruned_nodes_meta {
+            new_node_id: 1
+            logit_change: -0.07
+          }
+          post_pruned_nodes_meta {
+            new_node_id: 1
+            logit_change: -0.083
+          }
+          post_pruned_nodes_meta {
+            new_node_id: 3
+            logit_change: 0.0
+          }
+          post_pruned_nodes_meta {
+            new_node_id: 4
+            logit_change: 0.0
+          }
+          post_pruned_nodes_meta {
+            new_node_id: 1
+            logit_change: -0.22
+          }
+          post_pruned_nodes_meta {
+            new_node_id: 1
+            logit_change: -0.57
+          }
+        }
+        tree_metadata {
+          num_layers_grown: 1
+          is_finalized: false
+        }
+        growing_metadata {
+          num_trees_attempted: 2
+          num_layers_attempted: 4
+        }
+      """, tree_ensemble_config)
+
+      # Create existing ensemble.
+      tree_ensemble = boosted_trees_ops.TreeEnsemble(
+          'ensemble', serialized_proto=tree_ensemble_config.SerializeToString())
+      tree_ensemble_handle = tree_ensemble.resource_handle
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      cached_tree_ids = [0, 0, 0, 0, 0, 0]
+      # Leaves 3,4, 7 and 8 got deleted during post-pruning, leaves 5 and 6
+      # changed the ids to 3 and 4 respectively.
+      cached_node_ids = [3, 4, 5, 6, 7, 8]
+
+      # We have two features: 0 and 1.
+      feature_0_values = [12, 17, 35, 36, 23, 11]
+      feature_1_values = [12, 12, 17, 18, 123, 24]
+
+      # Grow tree ensemble.
+      predict_op = boosted_trees_ops.training_predict(
+          tree_ensemble_handle,
+          max_depth=3,
+          cached_tree_ids=cached_tree_ids,
+          cached_node_ids=cached_node_ids,
+          bucketized_features=[feature_0_values, feature_1_values],
+          logits_dimension=1)
+
+      logits_updates, new_tree_ids, new_node_ids = session.run(predict_op)
+
+      # We are in the last tree.
+      self.assertAllClose([1, 1, 1, 1, 1, 1], new_tree_ids)
+      # Examples from leaves 3,4,7,8 should be in leaf 1, examples from leaf 5
+      # and 6 in leaf 3 and 4 in tree 0. For tree 1, all of the examples are in
+      # the root node.
+      self.assertAllClose([0, 0, 0, 0, 0, 0], new_node_ids)
+
+      cached_values = [[0.08], [0.093], [0.0553], [0.0783], [0.15 + 0.08],
+                       [0.5 + 0.08]]
+      root = 0.55
+      self.assertAllClose([[root + 0.01], [root + 0.01], [root + 0.0553],
+                           [root + 0.0783], [root + 0.01], [root + 0.01]],
+                          logits_updates + cached_values)
+
+  def testCachedPredictionTheWholeTreeWasPruned(self):
+    """Tests that prediction based on previous node in the tree works."""
+    with self.test_session() as session:
+      tree_ensemble_config = boosted_trees_pb2.TreeEnsemble()
+      text_format.Merge("""
+        trees {
+          nodes {
+            leaf {
+              scalar: 0.00
+            }
+          }
+        }
+        tree_weights: 1.0
+        tree_metadata {
+          num_layers_grown: 1
+          is_finalized: true
+          post_pruned_nodes_meta {
+            new_node_id: 0
+            logit_change: 0.0
+          }
+          post_pruned_nodes_meta {
+            new_node_id: 0
+            logit_change: -6.0
+          }
+          post_pruned_nodes_meta {
+            new_node_id: 0
+            logit_change: 5.0
+          }
+        }
+        growing_metadata {
+          num_trees_attempted: 1
+          num_layers_attempted: 1
+        }
+      """, tree_ensemble_config)
+
+      # Create existing ensemble.
+      tree_ensemble = boosted_trees_ops.TreeEnsemble(
+          'ensemble', serialized_proto=tree_ensemble_config.SerializeToString())
+      tree_ensemble_handle = tree_ensemble.resource_handle
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      cached_tree_ids = [
+          0,
+          0,
+      ]
+      # The predictions were cached in 1 and 2, both were pruned to the root.
+      cached_node_ids = [1, 2]
+
+      # We have two features: 0 and 1.These are not going to be used anywhere.
+      feature_0_values = [12, 17]
+      feature_1_values = [12, 12]
+
+      # Grow tree ensemble.
+      predict_op = boosted_trees_ops.training_predict(
+          tree_ensemble_handle,
+          max_depth=1,
+          cached_tree_ids=cached_tree_ids,
+          cached_node_ids=cached_node_ids,
+          bucketized_features=[feature_0_values, feature_1_values],
+          logits_dimension=1)
+
+      logits_updates, new_tree_ids, new_node_ids = session.run(predict_op)
+
+      # We are in the last tree.
+      self.assertAllClose([0, 0], new_tree_ids)
+      self.assertAllClose([0, 0], new_node_ids)
+
+      self.assertAllClose([[-6.0], [5.0]], logits_updates)
+
+
+class PredictionOpsTest(test_util.TensorFlowTestCase):
+  """Tests prediction ops for inference."""
+
+  def testPredictionMultipleTree(self):
+    """Tests the predictions work when we have multiple trees."""
+    with self.test_session() as session:
+      tree_ensemble_config = boosted_trees_pb2.TreeEnsemble()
+      text_format.Merge("""
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id: 1
+              threshold: 28
+              left_id: 1
+              right_id: 2
+            }
+            metadata {
+              gain: 7.62
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 1.14
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 8.79
+            }
+          }
+        }
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id: 1
+              threshold: 26
+              left_id: 1
+              right_id: 2
+            }
+          }
+          nodes {
+            bucketized_split {
+              feature_id: 0
+              threshold: 50
+              left_id: 3
+              right_id: 4
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 7.0
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 5.0
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 6.0
+            }
+          }
+        }
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id: 0
+              threshold: 34
+              left_id: 1
+              right_id: 2
+            }
+          }
+          nodes {
+            leaf {
+              scalar: -7.0
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 5.0
+            }
+          }
+        }
+        tree_weights: 0.1
+        tree_weights: 0.2
+        tree_weights: 1.0
+      """, tree_ensemble_config)
+
+      # Create existing ensemble with one root split
+      tree_ensemble = boosted_trees_ops.TreeEnsemble(
+          'ensemble', serialized_proto=tree_ensemble_config.SerializeToString())
+      tree_ensemble_handle = tree_ensemble.resource_handle
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      feature_0_values = [36, 32]
+      feature_1_values = [11, 27]
+
+      # Example 1: tree 0: 1.14, tree 1: 5.0, tree 2: 5.0 = >
+      #            logit = 0.1*5.0+0.2*5.0+1*5
+      # Example 2: tree 0: 1.14, tree 1: 7.0, tree 2: -7 = >
+      #            logit= 0.1*1.14+0.2*7.0-1*7.0
+      expected_logits = [[6.114], [-5.486]]
+
+      # Do with parallelization, e.g. EVAL
+      predict_op = boosted_trees_ops.predict(
+          tree_ensemble_handle,
+          bucketized_features=[feature_0_values, feature_1_values],
+          logits_dimension=1,
+          max_depth=2)
+
+      logits = session.run(predict_op)
+      self.assertAllClose(expected_logits, logits)
+
+      # Do without parallelization, e.g. INFER - the result is the same
+      predict_op = boosted_trees_ops.predict(
+          tree_ensemble_handle,
+          bucketized_features=[feature_0_values, feature_1_values],
+          logits_dimension=1,
+          max_depth=2)
+
+      logits = session.run(predict_op)
+      self.assertAllClose(expected_logits, logits)
+
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/python/kernel_tests/boosted_trees/resource_ops_test.py b/tensorflow/python/kernel_tests/boosted_trees/resource_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..a223241e893d6838faec9a48cb4ca9cb3c24a211
--- /dev/null
+++ b/tensorflow/python/kernel_tests/boosted_trees/resource_ops_test.py
@@ -0,0 +1,228 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for boosted_trees resource kernels."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from google.protobuf import text_format
+
+from tensorflow.core.kernels.boosted_trees import boosted_trees_pb2
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import boosted_trees_ops
+from tensorflow.python.ops import resources
+from tensorflow.python.platform import googletest
+
+
+class ResourceOpsTest(test_util.TensorFlowTestCase):
+  """Tests resource_ops."""
+
+  def testCreate(self):
+    with self.test_session():
+      ensemble = boosted_trees_ops.TreeEnsemble('ensemble')
+      resources.initialize_resources(resources.shared_resources()).run()
+      stamp_token = ensemble.get_stamp_token()
+      self.assertEqual(0, stamp_token.eval())
+      (_, num_trees, num_finalized_trees,
+       num_attempted_layers) = ensemble.get_states()
+      self.assertEqual(0, num_trees.eval())
+      self.assertEqual(0, num_finalized_trees.eval())
+      self.assertEqual(0, num_attempted_layers.eval())
+
+  def testCreateWithProto(self):
+    with self.test_session():
+      ensemble_proto = boosted_trees_pb2.TreeEnsemble()
+      text_format.Merge("""
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id: 4
+              left_id: 1
+              right_id: 2
+            }
+            metadata {
+              gain: 7.62
+            }
+          }
+          nodes {
+            bucketized_split {
+              threshold: 21
+              left_id: 3
+              right_id: 4
+            }
+            metadata {
+              gain: 1.4
+              original_leaf {
+                scalar: 7.14
+              }
+            }
+          }
+          nodes {
+            bucketized_split {
+              feature_id: 1
+              threshold: 7
+              left_id: 5
+              right_id: 6
+            }
+            metadata {
+              gain: 2.7
+              original_leaf {
+                scalar: -4.375
+              }
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 6.54
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 7.305
+            }
+          }
+          nodes {
+            leaf {
+              scalar: -4.525
+            }
+          }
+          nodes {
+            leaf {
+              scalar: -4.145
+            }
+          }
+        }
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id: 75
+              threshold: 21
+              left_id: 1
+              right_id: 2
+            }
+            metadata {
+              gain: -1.4
+            }
+          }
+          nodes {
+            leaf {
+              scalar: -0.6
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.165
+            }
+          }
+        }
+        tree_weights: 0.15
+        tree_weights: 1.0
+        tree_metadata {
+          num_layers_grown: 2
+          is_finalized: true
+        }
+        tree_metadata {
+          num_layers_grown: 1
+          is_finalized: false
+        }
+        growing_metadata {
+          num_trees_attempted: 2
+          num_layers_attempted: 6
+        }
+      """, ensemble_proto)
+      ensemble = boosted_trees_ops.TreeEnsemble(
+          'ensemble',
+          stamp_token=7,
+          serialized_proto=ensemble_proto.SerializeToString())
+      resources.initialize_resources(resources.shared_resources()).run()
+      (stamp_token, num_trees, num_finalized_trees,
+       num_attempted_layers) = ensemble.get_states()
+      self.assertEqual(7, stamp_token.eval())
+      self.assertEqual(2, num_trees.eval())
+      self.assertEqual(1, num_finalized_trees.eval())
+      self.assertEqual(6, num_attempted_layers.eval())
+
+  def testSerializeDeserialize(self):
+    with self.test_session():
+      # Initialize.
+      ensemble = boosted_trees_ops.TreeEnsemble('ensemble', stamp_token=5)
+      resources.initialize_resources(resources.shared_resources()).run()
+      (stamp_token, num_trees, num_finalized_trees,
+       num_attempted_layers) = ensemble.get_states()
+      self.assertEqual(5, stamp_token.eval())
+      self.assertEqual(0, num_trees.eval())
+      self.assertEqual(0, num_finalized_trees.eval())
+      self.assertEqual(0, num_attempted_layers.eval())
+
+      # Deserialize.
+      ensemble_proto = boosted_trees_pb2.TreeEnsemble()
+      text_format.Merge("""
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id: 75
+              threshold: 21
+              left_id: 1
+              right_id: 2
+            }
+            metadata {
+              gain: -1.4
+            }
+          }
+          nodes {
+            leaf {
+              scalar: -0.6
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.165
+            }
+          }
+        }
+        tree_weights: 0.5
+        tree_metadata {
+          num_layers_grown: 4  # it's fake intentionally.
+          is_finalized: false
+        }
+        growing_metadata {
+          num_trees_attempted: 1
+          num_layers_attempted: 5
+        }
+      """, ensemble_proto)
+      with ops.control_dependencies([
+          ensemble.deserialize(
+              stamp_token=3,
+              serialized_proto=ensemble_proto.SerializeToString())
+      ]):
+        (stamp_token, num_trees, num_finalized_trees,
+         num_attempted_layers) = ensemble.get_states()
+      self.assertEqual(3, stamp_token.eval())
+      self.assertEqual(1, num_trees.eval())
+      # This reads from metadata, not really counting the layers.
+      self.assertEqual(5, num_attempted_layers.eval())
+      self.assertEqual(0, num_finalized_trees.eval())
+
+      # Serialize.
+      new_ensemble_proto = boosted_trees_pb2.TreeEnsemble()
+      new_stamp_token, new_serialized = ensemble.serialize()
+      self.assertEqual(3, new_stamp_token.eval())
+      new_ensemble_proto.ParseFromString(new_serialized.eval())
+      self.assertProtoEquals(ensemble_proto, new_ensemble_proto)
+
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/python/kernel_tests/boosted_trees/stats_ops_test.py b/tensorflow/python/kernel_tests/boosted_trees/stats_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..a54cc43517f4513b88b94ceb9b401b84b5ca053f
--- /dev/null
+++ b/tensorflow/python/kernel_tests/boosted_trees/stats_ops_test.py
@@ -0,0 +1,289 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for boosted_trees stats kernels."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import boosted_trees_ops
+from tensorflow.python.platform import googletest
+
+
+class StatsOpsTest(test_util.TensorFlowTestCase):
+  """Tests stats_ops."""
+
+  def testCalculateBestGainsWithoutRegularization(self):
+    """Testing Gain calculation without any regularization."""
+    with self.test_session() as sess:
+      max_splits = 7
+      node_id_range = [1, 2]  # node 1 through 2 will be processed.
+      stats_summary_list = [
+          [
+              [[0., 0.], [.08, .09], [0., 0.], [0., 0.]],  # node 0; ignored
+              [[0., 0.], [.15, .36], [.06, .07], [.1, .2]],  # node 1
+              [[0., 0.], [-.33, .58], [0., 0.], [.3, .4]],  # node 2
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 3; ignored
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 4; ignored
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 5; ignored
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 6; ignored
+          ],  # feature 0
+          [
+              [[0., 0.], [0., 0.], [.08, .09], [0., 0.]],  # node 0; ignored
+              [[0., 0.], [.3, .5], [-.05, .06], [.06, .07]],  # node 1
+              [[.1, .1], [.2, .3], [-.4, .5], [.07, .08]],  # node 2
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 3; ignored
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 4; ignored
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 5; ignored
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 6; ignored
+          ],  # feature 1
+      ]  # num_features * shape=[max_splits, num_buckets, 2]
+
+      (node_ids_list, gains_list, thresholds_list, left_node_contribs_list,
+       right_node_contribs_list
+      ) = boosted_trees_ops.calculate_best_gains_per_feature(
+          node_id_range,
+          stats_summary_list,
+          l1=0.0,
+          l2=0.0,
+          tree_complexity=0.0,
+          max_splits=max_splits)
+
+      self.assertAllEqual([[1, 2], [1, 2]], sess.run(node_ids_list))
+      self.assertAllClose([[0.004775, 0.41184], [0.02823, 0.41184]],
+                          sess.run(gains_list))
+      self.assertAllEqual([[1, 1], [1, 1]], sess.run(thresholds_list))
+      # The left node contrib will be later added to the previous node value to
+      # make the left node value, and the same for right node contrib.
+      self.assertAllClose([[[-.416667], [.568966]], [[-.6], [-.75]]],
+                          sess.run(left_node_contribs_list))
+      self.assertAllClose([[[-.592593], [-.75]], [[-.076923], [.568966]]],
+                          sess.run(right_node_contribs_list))
+
+  def testCalculateBestGainsWithL2(self):
+    """Testing Gain calculation with L2."""
+    with self.test_session() as sess:
+      max_splits = 7
+      node_id_range = [1, 2]  # node 1 through 2 will be processed.
+      stats_summary_list = [
+          [
+              [[0., 0.], [.08, .09], [0., 0.], [0., 0.]],  # node 0; ignored
+              [[0., 0.], [.15, .36], [.06, .07], [.1, .2]],  # node 1
+              [[0., 0.], [-.33, .58], [0., 0.], [.3, .4]],  # node 2
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 3; ignored
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 4; ignored
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 5; ignored
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 6; ignored
+          ],  # feature 0
+          [
+              [[0., 0.], [0., 0.], [.08, .09], [0., 0.]],  # node 0; ignored
+              [[0., 0.], [.3, .5], [-.05, .06], [.06, .07]],  # node 1
+              [[.1, .1], [.2, .3], [-.4, .5], [.07, .08]],  # node 2
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 3; ignored
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 4; ignored
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 5; ignored
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 6; ignored
+          ],  # feature 1
+      ]  # num_features * shape=[max_splits, num_buckets, 2]
+
+      (node_ids_list, gains_list, thresholds_list, left_node_contribs_list,
+       right_node_contribs_list
+      ) = boosted_trees_ops.calculate_best_gains_per_feature(
+          node_id_range,
+          stats_summary_list,
+          l1=0.0,
+          l2=0.1,
+          tree_complexity=0.0,
+          max_splits=max_splits)
+
+      self.assertAllEqual([[1, 2], [1, 2]], sess.run(node_ids_list))
+      self.assertAllClose([[0., 0.33931375], [0.01879096, 0.33931375]],
+                          sess.run(gains_list))
+      self.assertAllEqual([[0, 1], [1, 1]], sess.run(thresholds_list))
+      # The left node contrib will be later added to the previous node value to
+      # make the left node value, and the same for right node contrib.
+      self.assertAllClose([[[0.], [.485294]], [[-.5], [-.6]]],
+                          sess.run(left_node_contribs_list))
+      self.assertAllClose([[[-.424658], [-.6]], [[-.043478], [.485294]]],
+                          sess.run(right_node_contribs_list))
+
+  def testCalculateBestGainsWithL1(self):
+    """Testing Gain calculation with L1."""
+    with self.test_session() as sess:
+      max_splits = 7
+      node_id_range = [1, 2]  # node 1 through 2 will be processed.
+      stats_summary_list = [
+          [
+              [[0., 0.], [.08, .09], [0., 0.], [0., 0.]],  # node 0; ignored
+              [[0., 0.], [.15, .36], [.06, .07], [.1, .2]],  # node 1
+              [[0., 0.], [-.33, .58], [0., 0.], [.3, .4]],  # node 2
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 3; ignored
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 4; ignored
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 5; ignored
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 6; ignored
+          ],  # feature 0
+          [
+              [[0., 0.], [0., 0.], [.08, .09], [0., 0.]],  # node 0; ignored
+              [[0., 0.], [.3, .5], [-.05, .06], [.06, .07]],  # node 1
+              [[.1, .1], [.2, .3], [-.4, .5], [.07, .08]],  # node 2
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 3; ignored
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 4; ignored
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 5; ignored
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 6; ignored
+          ],  # feature 1
+      ]  # num_features * shape=[max_splits, num_buckets, 2]
+
+      l1 = 0.1
+      (node_ids_list, gains_list, thresholds_list, left_node_contribs_list,
+       right_node_contribs_list
+      ) = boosted_trees_ops.calculate_best_gains_per_feature(
+          node_id_range,
+          stats_summary_list,
+          l1=l1,
+          l2=0.0,
+          tree_complexity=0.0,
+          max_splits=max_splits)
+
+      self.assertAllEqual([[0, 1], [1, 1]], sess.run(thresholds_list))
+
+      self.assertAllEqual([[1, 2], [1, 2]], sess.run(node_ids_list))
+      self.assertAllClose([[[0.0], [0.3965517]], [[-0.4], [-0.5]]],
+                          sess.run(left_node_contribs_list))
+
+      self.assertAllClose([[[-0.3333333], [-0.5]], [[0.0], [0.396552]]],
+                          sess.run(right_node_contribs_list))
+
+      # Gain should also include an adjustment of the gradient by l1.
+      self.assertAllClose([[0.0, 0.191207], [0.01, 0.191207]],
+                          sess.run(gains_list))
+
+  def testCalculateBestGainsWithTreeComplexity(self):
+    """Testing Gain calculation with L2."""
+    with self.test_session() as sess:
+      max_splits = 7
+      node_id_range = [1, 2]  # node 1 through 2 will be processed.
+      stats_summary_list = [
+          [
+              [[0., 0.], [.08, .09], [0., 0.], [0., 0.]],  # node 0; ignored
+              [[0., 0.], [.15, .36], [.06, .07], [.1, .2]],  # node 1
+              [[0., 0.], [-.33, .58], [0., 0.], [.3, .4]],  # node 2
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 3; ignored
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 4; ignored
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 5; ignored
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 6; ignored
+          ],  # feature 0
+          [
+              [[0., 0.], [0., 0.], [.08, .09], [0., 0.]],  # node 0; ignored
+              [[0., 0.], [.3, .5], [-.05, .06], [.06, .07]],  # node 1
+              [[.1, .1], [.2, .3], [-.4, .5], [.07, .08]],  # node 2
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 3; ignored
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 4; ignored
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 5; ignored
+              [[0., 0.], [0., 0.], [0., 0.], [0., 0.]],  # node 6; ignored
+          ],  # feature 1
+      ]  # num_features * shape=[max_splits, num_buckets, 2]
+
+      l2 = 0.1
+      tree_complexity = 3.
+      (node_ids_list, gains_list, thresholds_list, left_node_contribs_list,
+       right_node_contribs_list
+      ) = boosted_trees_ops.calculate_best_gains_per_feature(
+          node_id_range,
+          stats_summary_list,
+          l1=0.0,
+          l2=l2,
+          tree_complexity=tree_complexity,
+          max_splits=max_splits)
+
+      self.assertAllEqual([[1, 2], [1, 2]], sess.run(node_ids_list))
+
+      self.assertAllClose([[-3., -2.66068625], [-2.98120904, -2.66068625]],
+                          sess.run(gains_list))
+
+      self.assertAllEqual([[0, 1], [1, 1]], sess.run(thresholds_list))
+      # The left node contrib will be later added to the previous node value to
+      # make the left node value, and the same for right node contrib.
+      self.assertAllClose([[[0.], [.485294]], [[-.5], [-.6]]],
+                          sess.run(left_node_contribs_list))
+      self.assertAllClose([[[-.424658], [-.6]], [[-.043478], [.485294]]],
+                          sess.run(right_node_contribs_list))
+
+  def testMakeStatsSummarySimple(self):
+    """Simple test for MakeStatsSummary."""
+    with self.test_session():
+      self.assertAllClose([[[[1., 5.], [2., 6.]], [[3., 7.], [4., 8.]]]],
+                          boosted_trees_ops.make_stats_summary(
+                              node_ids=[0, 0, 1, 1],
+                              gradients=[[1.], [2.], [3.], [4.]],
+                              hessians=[[5.], [6.], [7.], [8.]],
+                              bucketized_features_list=[[0, 1, 0, 1]],
+                              max_splits=2,
+                              num_buckets=2).eval())
+
+  def testMakeStatsSummaryAccumulate(self):
+    """Tests that Summary actually accumulates."""
+    with self.test_session():
+      max_splits = 3
+      num_buckets = 4
+      node_ids = [1, 1, 2, 2, 1, 1, 2, 0]
+      gradients = [[.1], [.2], [.3], [-.4], [-.05], [.06], [.07], [.08]]
+      hessians = [[.2], [.3], [.4], [.5], [.06], [.07], [.08], [.09]]
+
+      # Tests a single feature.
+      bucketized_features = [[3, 1, 2, 0, 1, 2, 0, 1]]
+      result = boosted_trees_ops.make_stats_summary(
+          node_ids, gradients, hessians, bucketized_features, max_splits,
+          num_buckets)  # shape=[max_splits, num_buckets, num_features, 2]
+      self.assertAllClose(
+          [[
+              [[0., 0.], [.08, .09], [0., 0.], [0., 0.]],  # node 0
+              [[0., 0.], [.15, .36], [.06, .07], [.1, .2]],  # node 1
+              [[-.33, .58], [0., 0.], [.3, .4], [0., 0.]],  # node 2
+          ]],
+          result.eval())
+
+  def testMakeStatsSummaryMultipleFeatures(self):
+    """Tests that MakeStatsSummary works for multiple features."""
+    with self.test_session():
+      max_splits = 3
+      num_buckets = 4
+      node_ids = [1, 1, 2, 2, 1, 1, 2, 0]
+      gradients = [[.1], [.2], [.3], [-.4], [-.05], [.06], [.07], [.08]]
+      hessians = [[.2], [.3], [.4], [.5], [.06], [.07], [.08], [.09]]
+
+      # Tests multiple features.
+      # The output from another feature will stored be in 3rd dimension.
+      bucketized_features = [[3, 1, 2, 0, 1, 2, 0, 1], [0, 0, 0, 2, 2, 3, 3, 2]]
+      result = boosted_trees_ops.make_stats_summary(
+          node_ids, gradients, hessians, bucketized_features, max_splits,
+          num_buckets)  # shape=[max_splits, num_buckets, num_features, 2]
+      self.assertAllClose(
+          [
+              [
+                  [[0., 0.], [.08, .09], [0., 0.], [0., 0.]],  # node 0
+                  [[0., 0.], [.15, .36], [.06, .07], [.1, .2]],  # node 1
+                  [[-.33, .58], [0., 0.], [.3, .4], [0., 0.]],  # node 2
+              ],  # feature 0
+              [
+                  [[0., 0.], [0., 0.], [.08, .09], [0., 0.]],  # node 0
+                  [[.3, .5], [0., 0.], [-.05, .06], [.06, .07]],  # node 1
+                  [[.3, .4], [0., 0.], [-.4, .5], [.07, .08]],  # node 2
+              ],  # feature 1
+          ],
+          result.eval())
+
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/python/kernel_tests/boosted_trees/training_ops_test.py b/tensorflow/python/kernel_tests/boosted_trees/training_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..4226ff75c2327d09c0d89b29950605b610672603
--- /dev/null
+++ b/tensorflow/python/kernel_tests/boosted_trees/training_ops_test.py
@@ -0,0 +1,1465 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for boosted_trees training kernels."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from google.protobuf import text_format
+from tensorflow.core.kernels.boosted_trees import boosted_trees_pb2
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import boosted_trees_ops
+from tensorflow.python.ops import resources
+from tensorflow.python.platform import googletest
+
+
+class UpdateTreeEnsembleOpTest(test_util.TensorFlowTestCase):
+  """Tests for growing tree ensemble from split candidates."""
+
+  def testGrowWithEmptyEnsemble(self):
+    """Test growing an empty ensemble."""
+    with self.test_session() as session:
+      # Create empty ensemble.
+      tree_ensemble = boosted_trees_ops.TreeEnsemble('ensemble')
+      tree_ensemble_handle = tree_ensemble.resource_handle
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      feature_ids = [0, 2, 6]
+
+      # Prepare feature inputs.
+      # Note that features 1 & 3 have the same gain but different splits.
+      feature1_nodes = np.array([0], dtype=np.int32)
+      feature1_gains = np.array([7.62], dtype=np.float32)
+      feature1_thresholds = np.array([52], dtype=np.int32)
+      feature1_left_node_contribs = np.array([[-4.375]], dtype=np.float32)
+      feature1_right_node_contribs = np.array([[7.143]], dtype=np.float32)
+
+      feature2_nodes = np.array([0], dtype=np.int32)
+      feature2_gains = np.array([0.63], dtype=np.float32)
+      feature2_thresholds = np.array([23], dtype=np.int32)
+      feature2_left_node_contribs = np.array([[-0.6]], dtype=np.float32)
+      feature2_right_node_contribs = np.array([[0.24]], dtype=np.float32)
+
+      # Feature split with the highest gain.
+      feature3_nodes = np.array([0], dtype=np.int32)
+      feature3_gains = np.array([7.65], dtype=np.float32)
+      feature3_thresholds = np.array([7], dtype=np.int32)
+      feature3_left_node_contribs = np.array([[-4.89]], dtype=np.float32)
+      feature3_right_node_contribs = np.array([[5.3]], dtype=np.float32)
+
+      # Grow tree ensemble.
+      grow_op = boosted_trees_ops.update_ensemble(
+          tree_ensemble_handle,
+          learning_rate=0.1,
+          pruning_mode=boosted_trees_ops.PruningMode.NO_PRUNING,
+          # Tree will be finalized now, since we will reach depth 1.
+          max_depth=1,
+          feature_ids=feature_ids,
+          node_ids=[feature1_nodes, feature2_nodes, feature3_nodes],
+          gains=[feature1_gains, feature2_gains, feature3_gains],
+          thresholds=[
+              feature1_thresholds, feature2_thresholds, feature3_thresholds
+          ],
+          left_node_contribs=[
+              feature1_left_node_contribs, feature2_left_node_contribs,
+              feature3_left_node_contribs
+          ],
+          right_node_contribs=[
+              feature1_right_node_contribs, feature2_right_node_contribs,
+              feature3_right_node_contribs
+          ])
+      session.run(grow_op)
+
+      new_stamp, serialized = session.run(tree_ensemble.serialize())
+
+      tree_ensemble = boosted_trees_pb2.TreeEnsemble()
+      tree_ensemble.ParseFromString(serialized)
+
+      # Note that since the tree is finalized, we added a new dummy tree.
+      expected_result = """
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id: 6
+              threshold: 7
+              left_id: 1
+              right_id: 2
+            }
+            metadata {
+              gain: 7.65
+            }
+          }
+          nodes {
+            leaf {
+              scalar: -0.489
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.53
+            }
+          }
+        }
+        trees {
+          nodes {
+            leaf {
+              scalar: 0.0
+            }
+          }
+        }
+        tree_weights: 1.0
+        tree_weights: 1.0
+        tree_metadata {
+          num_layers_grown: 1
+          is_finalized: true
+        }
+        tree_metadata {
+        }
+        growing_metadata {
+          num_trees_attempted: 1
+          num_layers_attempted: 1
+        }
+      """
+      self.assertEqual(new_stamp, 1)
+      self.assertProtoEquals(expected_result, tree_ensemble)
+
+  def testGrowExistingEnsembleTreeNotFinalized(self):
+    """Test growing an existing ensemble with the last tree not finalized."""
+    with self.test_session() as session:
+      tree_ensemble_config = boosted_trees_pb2.TreeEnsemble()
+      text_format.Merge("""
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id: 4
+              left_id: 1
+              right_id: 2
+            }
+            metadata {
+              gain: 7.62
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.714
+            }
+          }
+          nodes {
+            leaf {
+              scalar: -0.4375
+            }
+          }
+        }
+        tree_weights: 1.0
+        tree_metadata {
+          num_layers_grown: 1
+          is_finalized: false
+        }
+        growing_metadata {
+          num_trees_attempted: 1
+          num_layers_attempted: 1
+        }
+      """, tree_ensemble_config)
+
+      # Create existing ensemble with one root split
+      tree_ensemble = boosted_trees_ops.TreeEnsemble(
+          'ensemble', serialized_proto=tree_ensemble_config.SerializeToString())
+      tree_ensemble_handle = tree_ensemble.resource_handle
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      # Prepare feature inputs.
+      # feature 1 only has a candidate for node 1, feature 2 has candidates
+      # for both nodes and feature 3 only has a candidate for node 2.
+
+      feature_ids = [0, 1, 0]
+
+      feature1_nodes = np.array([1], dtype=np.int32)
+      feature1_gains = np.array([1.4], dtype=np.float32)
+      feature1_thresholds = np.array([21], dtype=np.int32)
+      feature1_left_node_contribs = np.array([[-6.0]], dtype=np.float32)
+      feature1_right_node_contribs = np.array([[1.65]], dtype=np.float32)
+
+      feature2_nodes = np.array([1, 2], dtype=np.int32)
+      feature2_gains = np.array([0.63, 2.7], dtype=np.float32)
+      feature2_thresholds = np.array([23, 7], dtype=np.int32)
+      feature2_left_node_contribs = np.array([[-0.6], [-1.5]], dtype=np.float32)
+      feature2_right_node_contribs = np.array([[0.24], [2.3]], dtype=np.float32)
+
+      feature3_nodes = np.array([2], dtype=np.int32)
+      feature3_gains = np.array([1.7], dtype=np.float32)
+      feature3_thresholds = np.array([3], dtype=np.int32)
+      feature3_left_node_contribs = np.array([[-0.75]], dtype=np.float32)
+      feature3_right_node_contribs = np.array([[1.93]], dtype=np.float32)
+
+      # Grow tree ensemble.
+      grow_op = boosted_trees_ops.update_ensemble(
+          tree_ensemble_handle,
+          learning_rate=0.1,
+          pruning_mode=boosted_trees_ops.PruningMode.NO_PRUNING,
+          # tree is going to be finalized now, since we reach depth 2.
+          max_depth=2,
+          feature_ids=feature_ids,
+          node_ids=[feature1_nodes, feature2_nodes, feature3_nodes],
+          gains=[feature1_gains, feature2_gains, feature3_gains],
+          thresholds=[
+              feature1_thresholds, feature2_thresholds, feature3_thresholds
+          ],
+          left_node_contribs=[
+              feature1_left_node_contribs, feature2_left_node_contribs,
+              feature3_left_node_contribs
+          ],
+          right_node_contribs=[
+              feature1_right_node_contribs, feature2_right_node_contribs,
+              feature3_right_node_contribs
+          ])
+      session.run(grow_op)
+
+      # Expect the split for node 1 to be chosen from feature 1 and
+      # the split for node 2 to be chosen from feature 2.
+      # The grown tree should be finalized as max tree depth is 2 and we have
+      # grown 2 layers.
+      new_stamp, serialized = session.run(tree_ensemble.serialize())
+      tree_ensemble = boosted_trees_pb2.TreeEnsemble()
+      tree_ensemble.ParseFromString(serialized)
+
+      expected_result = """
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id: 4
+              left_id: 1
+              right_id: 2
+            }
+            metadata {
+              gain: 7.62
+            }
+          }
+          nodes {
+            bucketized_split {
+              threshold: 21
+              left_id: 3
+              right_id: 4
+            }
+            metadata {
+              gain: 1.4
+              original_leaf {
+                scalar: 0.714
+              }
+            }
+          }
+          nodes {
+            bucketized_split {
+              feature_id: 1
+              threshold: 7
+              left_id: 5
+              right_id: 6
+            }
+            metadata {
+              gain: 2.7
+              original_leaf {
+                scalar: -0.4375
+              }
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.114
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.879
+            }
+          }
+          nodes {
+            leaf {
+              scalar: -0.5875
+            }
+          }
+          nodes {
+            leaf {
+              scalar: -0.2075
+            }
+          }
+        }
+        trees {
+          nodes {
+            leaf {
+              scalar: 0.0
+            }
+          }
+        }
+        tree_weights: 1.0
+        tree_weights: 1.0
+        tree_metadata {
+          is_finalized: true
+          num_layers_grown: 2
+        }
+        tree_metadata {
+        }
+        growing_metadata {
+          num_trees_attempted: 1
+          num_layers_attempted: 2
+        }
+      """
+      self.assertEqual(new_stamp, 1)
+      self.assertProtoEquals(expected_result, tree_ensemble)
+
+  def testGrowExistingEnsembleTreeFinalized(self):
+    """Test growing an existing ensemble with the last tree finalized."""
+    with self.test_session() as session:
+      tree_ensemble_config = boosted_trees_pb2.TreeEnsemble()
+      text_format.Merge("""
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id: 4
+              left_id: 1
+              right_id: 2
+            }
+            metadata {
+              gain: 7.62
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 7.14
+            }
+          }
+          nodes {
+            leaf {
+              scalar: -4.375
+            }
+          }
+        }
+        trees {
+          nodes {
+            leaf {
+              scalar: 0.0
+            }
+          }
+        }
+        tree_weights: 0.15
+        tree_weights: 1.0
+        tree_metadata {
+          num_layers_grown: 1
+          is_finalized: true
+        }
+        tree_metadata {
+        }
+        growing_metadata {
+          num_trees_attempted: 1
+          num_layers_attempted: 1
+        }
+      """, tree_ensemble_config)
+
+      # Create existing ensemble with one root split
+      tree_ensemble = boosted_trees_ops.TreeEnsemble(
+          'ensemble', serialized_proto=tree_ensemble_config.SerializeToString())
+      tree_ensemble_handle = tree_ensemble.resource_handle
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      # Prepare feature inputs.
+
+      feature_ids = [75]
+
+      feature1_nodes = np.array([0], dtype=np.int32)
+      feature1_gains = np.array([-1.4], dtype=np.float32)
+      feature1_thresholds = np.array([21], dtype=np.int32)
+      feature1_left_node_contribs = np.array([[-6.0]], dtype=np.float32)
+      feature1_right_node_contribs = np.array([[1.65]], dtype=np.float32)
+
+      # Grow tree ensemble.
+      grow_op = boosted_trees_ops.update_ensemble(
+          tree_ensemble_handle,
+          pruning_mode=boosted_trees_ops.PruningMode.NO_PRUNING,
+          learning_rate=0.1,
+          max_depth=2,
+          feature_ids=feature_ids,
+          node_ids=[feature1_nodes],
+          gains=[feature1_gains],
+          thresholds=[feature1_thresholds],
+          left_node_contribs=[feature1_left_node_contribs],
+          right_node_contribs=[feature1_right_node_contribs])
+      session.run(grow_op)
+
+      # Expect a new tree added, with a split on feature 75
+      new_stamp, serialized = session.run(tree_ensemble.serialize())
+      tree_ensemble = boosted_trees_pb2.TreeEnsemble()
+      tree_ensemble.ParseFromString(serialized)
+
+      expected_result = """
+       trees {
+          nodes {
+            bucketized_split {
+              feature_id: 4
+              left_id: 1
+              right_id: 2
+            }
+            metadata {
+              gain: 7.62
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 7.14
+            }
+          }
+          nodes {
+            leaf {
+              scalar: -4.375
+            }
+          }
+        }
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id: 75
+              threshold: 21
+              left_id: 1
+              right_id: 2
+            }
+            metadata {
+              gain: -1.4
+            }
+          }
+          nodes {
+            leaf {
+              scalar: -0.6
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.165
+            }
+          }
+        }
+        tree_weights: 0.15
+        tree_weights: 1.0
+        tree_metadata {
+          num_layers_grown: 1
+          is_finalized: true
+        }
+        tree_metadata {
+          num_layers_grown: 1
+          is_finalized: false
+        }
+        growing_metadata {
+          num_trees_attempted: 2
+          num_layers_attempted: 2
+        }
+      """
+      self.assertEqual(new_stamp, 1)
+      self.assertProtoEquals(expected_result, tree_ensemble)
+
+  def testPrePruning(self):
+    """Test growing an existing ensemble with pre-pruning."""
+    with self.test_session() as session:
+      tree_ensemble_config = boosted_trees_pb2.TreeEnsemble()
+      text_format.Merge("""
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id: 4
+              left_id: 1
+              right_id: 2
+            }
+            metadata {
+              gain: 7.62
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 7.14
+            }
+          }
+          nodes {
+            leaf {
+              scalar: -4.375
+            }
+          }
+        }
+        tree_weights: 0.1
+        tree_metadata {
+          num_layers_grown: 1
+          is_finalized: false
+        }
+        growing_metadata {
+          num_trees_attempted: 1
+          num_layers_attempted: 1
+        }
+      """, tree_ensemble_config)
+
+      # Create existing ensemble with one root split
+      tree_ensemble = boosted_trees_ops.TreeEnsemble(
+          'ensemble', serialized_proto=tree_ensemble_config.SerializeToString())
+      tree_ensemble_handle = tree_ensemble.resource_handle
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      # Prepare feature inputs.
+      # For node 1, the best split is on feature 2 (gain -0.63), but the gain
+      # is negative so node 1 will not be split.
+      # For node 2, the best split is on feature 3, gain is positive.
+
+      feature_ids = [0, 1, 0]
+
+      feature1_nodes = np.array([1], dtype=np.int32)
+      feature1_gains = np.array([-1.4], dtype=np.float32)
+      feature1_thresholds = np.array([21], dtype=np.int32)
+      feature1_left_node_contribs = np.array([[-6.0]], dtype=np.float32)
+      feature1_right_node_contribs = np.array([[1.65]], dtype=np.float32)
+
+      feature2_nodes = np.array([1, 2], dtype=np.int32)
+      feature2_gains = np.array([-0.63, 2.7], dtype=np.float32)
+      feature2_thresholds = np.array([23, 7], dtype=np.int32)
+      feature2_left_node_contribs = np.array([[-0.6], [-1.5]], dtype=np.float32)
+      feature2_right_node_contribs = np.array([[0.24], [2.3]], dtype=np.float32)
+
+      feature3_nodes = np.array([2], dtype=np.int32)
+      feature3_gains = np.array([2.8], dtype=np.float32)
+      feature3_thresholds = np.array([3], dtype=np.int32)
+      feature3_left_node_contribs = np.array([[-0.75]], dtype=np.float32)
+      feature3_right_node_contribs = np.array([[1.93]], dtype=np.float32)
+
+      # Grow tree ensemble.
+      grow_op = boosted_trees_ops.update_ensemble(
+          tree_ensemble_handle,
+          learning_rate=0.1,
+          pruning_mode=boosted_trees_ops.PruningMode.PRE_PRUNING,
+          max_depth=3,
+          feature_ids=feature_ids,
+          node_ids=[feature1_nodes, feature2_nodes, feature3_nodes],
+          gains=[feature1_gains, feature2_gains, feature3_gains],
+          thresholds=[
+              feature1_thresholds, feature2_thresholds, feature3_thresholds
+          ],
+          left_node_contribs=[
+              feature1_left_node_contribs, feature2_left_node_contribs,
+              feature3_left_node_contribs
+          ],
+          right_node_contribs=[
+              feature1_right_node_contribs, feature2_right_node_contribs,
+              feature3_right_node_contribs
+          ])
+      session.run(grow_op)
+
+      # Expect the split for node 1 to be chosen from feature 1 and
+      # the split for node 2 to be chosen from feature 2.
+      # The grown tree should not be finalized as max tree depth is 3 and
+      # it's only grown 2 layers.
+      new_stamp, serialized = session.run(tree_ensemble.serialize())
+      tree_ensemble = boosted_trees_pb2.TreeEnsemble()
+      tree_ensemble.ParseFromString(serialized)
+
+      expected_result = """
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id: 4
+              left_id: 1
+              right_id: 2
+            }
+            metadata {
+              gain: 7.62
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 7.14
+            }
+          }
+          nodes {
+            bucketized_split {
+              feature_id: 0
+              threshold: 3
+              left_id: 3
+              right_id: 4
+            }
+            metadata {
+              gain: 2.8
+              original_leaf {
+                scalar: -4.375
+              }
+            }
+          }
+          nodes {
+            leaf {
+              scalar: -4.45
+            }
+          }
+          nodes {
+            leaf {
+              scalar: -4.182
+            }
+          }
+        }
+        tree_weights: 0.1
+        tree_metadata {
+          is_finalized: false
+          num_layers_grown: 2
+        }
+        growing_metadata {
+          num_trees_attempted: 1
+          num_layers_attempted: 2
+        }
+      """
+      self.assertEqual(new_stamp, 1)
+      self.assertProtoEquals(expected_result, tree_ensemble)
+
+  def testMetadataWhenCantSplitDueToEmptySplits(self):
+    """Test that the metadata is updated even though we can't split."""
+    with self.test_session() as session:
+      tree_ensemble_config = boosted_trees_pb2.TreeEnsemble()
+      text_format.Merge("""
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id: 4
+              left_id: 1
+              right_id: 2
+            }
+            metadata {
+              gain: 7.62
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.714
+            }
+          }
+          nodes {
+            leaf {
+              scalar: -0.4375
+            }
+          }
+        }
+        tree_weights: 1.0
+        tree_metadata {
+          num_layers_grown: 1
+          is_finalized: false
+        }
+        growing_metadata {
+          num_trees_attempted: 1
+          num_layers_attempted: 1
+        }
+      """, tree_ensemble_config)
+
+      # Create existing ensemble with one root split
+      tree_ensemble = boosted_trees_ops.TreeEnsemble(
+          'ensemble', serialized_proto=tree_ensemble_config.SerializeToString())
+      tree_ensemble_handle = tree_ensemble.resource_handle
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      # Prepare feature inputs.
+      # feature 1 only has a candidate for node 1, feature 2 has candidates
+      # for both nodes and feature 3 only has a candidate for node 2.
+
+      # Grow tree ensemble.
+      grow_op = boosted_trees_ops.update_ensemble(
+          tree_ensemble_handle,
+          learning_rate=0.1,
+          pruning_mode=boosted_trees_ops.PruningMode.NO_PRUNING,
+          max_depth=2,
+          # No splits are available.
+          feature_ids=[],
+          node_ids=[],
+          gains=[],
+          thresholds=[],
+          left_node_contribs=[],
+          right_node_contribs=[])
+      session.run(grow_op)
+
+      # Expect no new splits created, but attempted (global) stats updated. Meta
+      # data for this tree should not be updated (we didn't succeed building a
+      # layer.
+      new_stamp, serialized = session.run(tree_ensemble.serialize())
+      tree_ensemble = boosted_trees_pb2.TreeEnsemble()
+      tree_ensemble.ParseFromString(serialized)
+
+      expected_result = """
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id: 4
+              left_id: 1
+              right_id: 2
+            }
+            metadata {
+              gain: 7.62
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.714
+            }
+          }
+          nodes {
+            leaf {
+              scalar: -0.4375
+            }
+          }
+        }
+        tree_weights: 1.0
+        tree_metadata {
+          num_layers_grown: 1
+          is_finalized: false
+        }
+        growing_metadata {
+          num_trees_attempted: 1
+          num_layers_attempted: 2
+        }
+      """
+      self.assertEqual(new_stamp, 1)
+      self.assertProtoEquals(expected_result, tree_ensemble)
+
+  def testMetadataWhenCantSplitDuePrePruning(self):
+    """Test metadata is updated correctly when no split due to prepruning."""
+    with self.test_session() as session:
+      tree_ensemble_config = boosted_trees_pb2.TreeEnsemble()
+      text_format.Merge("""
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id: 4
+              left_id: 1
+              right_id: 2
+            }
+            metadata {
+              gain: 7.62
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 7.14
+            }
+          }
+          nodes {
+            leaf {
+              scalar: -4.375
+            }
+          }
+        }
+        tree_weights: 1.0
+        tree_metadata {
+          num_layers_grown: 1
+          is_finalized: false
+        }
+        growing_metadata {
+          num_trees_attempted: 1
+          num_layers_attempted: 1
+        }
+      """, tree_ensemble_config)
+
+      # Create existing ensemble with one root split
+      tree_ensemble = boosted_trees_ops.TreeEnsemble(
+          'ensemble', serialized_proto=tree_ensemble_config.SerializeToString())
+      tree_ensemble_handle = tree_ensemble.resource_handle
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      # Prepare feature inputs.
+      feature_ids = [0, 1, 0]
+
+      # All the gains are negative.
+      feature1_nodes = np.array([1], dtype=np.int32)
+      feature1_gains = np.array([-1.4], dtype=np.float32)
+      feature1_thresholds = np.array([21], dtype=np.int32)
+      feature1_left_node_contribs = np.array([[-6.0]], dtype=np.float32)
+      feature1_right_node_contribs = np.array([[1.65]], dtype=np.float32)
+
+      feature2_nodes = np.array([1, 2], dtype=np.int32)
+      feature2_gains = np.array([-0.63, -2.7], dtype=np.float32)
+      feature2_thresholds = np.array([23, 7], dtype=np.int32)
+      feature2_left_node_contribs = np.array([[-0.6], [-1.5]], dtype=np.float32)
+      feature2_right_node_contribs = np.array([[0.24], [2.3]], dtype=np.float32)
+
+      feature3_nodes = np.array([2], dtype=np.int32)
+      feature3_gains = np.array([-2.8], dtype=np.float32)
+      feature3_thresholds = np.array([3], dtype=np.int32)
+      feature3_left_node_contribs = np.array([[-0.75]], dtype=np.float32)
+      feature3_right_node_contribs = np.array([[1.93]], dtype=np.float32)
+
+      # Grow tree ensemble.
+      grow_op = boosted_trees_ops.update_ensemble(
+          tree_ensemble_handle,
+          learning_rate=0.1,
+          pruning_mode=boosted_trees_ops.PruningMode.PRE_PRUNING,
+          max_depth=3,
+          feature_ids=feature_ids,
+          node_ids=[feature1_nodes, feature2_nodes, feature3_nodes],
+          gains=[feature1_gains, feature2_gains, feature3_gains],
+          thresholds=[
+              feature1_thresholds, feature2_thresholds, feature3_thresholds
+          ],
+          left_node_contribs=[
+              feature1_left_node_contribs, feature2_left_node_contribs,
+              feature3_left_node_contribs
+          ],
+          right_node_contribs=[
+              feature1_right_node_contribs, feature2_right_node_contribs,
+              feature3_right_node_contribs
+          ])
+      session.run(grow_op)
+
+      # Expect that no new split was created because all the gains were negative
+      # Global metadata should be updated, tree metadata should not be updated.
+      new_stamp, serialized = session.run(tree_ensemble.serialize())
+      tree_ensemble = boosted_trees_pb2.TreeEnsemble()
+      tree_ensemble.ParseFromString(serialized)
+
+      expected_result = """
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id: 4
+              left_id: 1
+              right_id: 2
+            }
+            metadata {
+              gain: 7.62
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 7.14
+            }
+          }
+          nodes {
+            leaf {
+              scalar: -4.375
+            }
+          }
+        }
+        tree_weights: 1.0
+        tree_metadata {
+          num_layers_grown: 1
+          is_finalized: false
+        }
+        growing_metadata {
+          num_trees_attempted: 1
+          num_layers_attempted: 2
+        }
+      """
+      self.assertEqual(new_stamp, 1)
+      self.assertProtoEquals(expected_result, tree_ensemble)
+
+  def testPostPruningOfSomeNodes(self):
+    """Test growing an ensemble with post-pruning."""
+    with self.test_session() as session:
+      # Create empty ensemble.
+      tree_ensemble_config = boosted_trees_pb2.TreeEnsemble()
+      tree_ensemble = boosted_trees_ops.TreeEnsemble(
+          'ensemble', serialized_proto=tree_ensemble_config.SerializeToString())
+      tree_ensemble_handle = tree_ensemble.resource_handle
+
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      # Prepare inputs.
+      # Second feature has larger (but still negative gain).
+      feature_ids = [0, 1]
+
+      feature1_nodes = np.array([0], dtype=np.int32)
+      feature1_gains = np.array([-1.3], dtype=np.float32)
+      feature1_thresholds = np.array([7], dtype=np.int32)
+      feature1_left_node_contribs = np.array([[0.013]], dtype=np.float32)
+      feature1_right_node_contribs = np.array([[0.0143]], dtype=np.float32)
+
+      feature2_nodes = np.array([0], dtype=np.int32)
+      feature2_gains = np.array([-0.2], dtype=np.float32)
+      feature2_thresholds = np.array([33], dtype=np.int32)
+      feature2_left_node_contribs = np.array([[0.01]], dtype=np.float32)
+      feature2_right_node_contribs = np.array([[0.0143]], dtype=np.float32)
+
+      # Grow tree ensemble.
+      grow_op = boosted_trees_ops.update_ensemble(
+          tree_ensemble_handle,
+          learning_rate=1.0,
+          pruning_mode=boosted_trees_ops.PruningMode.POST_PRUNING,
+          max_depth=3,
+          feature_ids=feature_ids,
+          node_ids=[feature1_nodes, feature2_nodes],
+          gains=[feature1_gains, feature2_gains],
+          thresholds=[feature1_thresholds, feature2_thresholds],
+          left_node_contribs=[
+              feature1_left_node_contribs, feature2_left_node_contribs
+          ],
+          right_node_contribs=[
+              feature1_right_node_contribs, feature2_right_node_contribs
+          ])
+
+      session.run(grow_op)
+
+      # Expect the split from second features to be chosen despite the negative
+      # gain.
+      # No pruning happened just yet.
+      new_stamp, serialized = session.run(tree_ensemble.serialize())
+      res_ensemble = boosted_trees_pb2.TreeEnsemble()
+      res_ensemble.ParseFromString(serialized)
+
+      expected_result = """
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id: 1
+              threshold: 33
+              left_id: 1
+              right_id: 2
+            }
+            metadata {
+              gain: -0.2
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.01
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.0143
+            }
+          }
+        }
+        tree_weights: 1.0
+        tree_metadata {
+          num_layers_grown: 1
+          is_finalized: false
+        }
+        growing_metadata {
+          num_trees_attempted: 1
+          num_layers_attempted: 1
+        }
+      """
+      self.assertEqual(new_stamp, 1)
+      self.assertProtoEquals(expected_result, res_ensemble)
+
+      # Prepare the second layer.
+      # Note that node 1 gain is negative and node 2 gain is positive.
+      feature_ids = [3]
+      feature1_nodes = np.array([1, 2], dtype=np.int32)
+      feature1_gains = np.array([-0.2, 0.5], dtype=np.float32)
+      feature1_thresholds = np.array([7, 5], dtype=np.int32)
+      feature1_left_node_contribs = np.array(
+          [[0.07], [0.041]], dtype=np.float32)
+      feature1_right_node_contribs = np.array(
+          [[0.083], [0.064]], dtype=np.float32)
+
+      # Grow tree ensemble.
+      grow_op = boosted_trees_ops.update_ensemble(
+          tree_ensemble_handle,
+          learning_rate=1.0,
+          pruning_mode=boosted_trees_ops.PruningMode.POST_PRUNING,
+          max_depth=3,
+          feature_ids=feature_ids,
+          node_ids=[feature1_nodes],
+          gains=[feature1_gains],
+          thresholds=[feature1_thresholds],
+          left_node_contribs=[feature1_left_node_contribs],
+          right_node_contribs=[feature1_right_node_contribs])
+
+      session.run(grow_op)
+
+      # After adding this layer, the tree will not be finalized
+      new_stamp, serialized = session.run(tree_ensemble.serialize())
+      res_ensemble = boosted_trees_pb2.TreeEnsemble()
+      res_ensemble.ParseFromString(serialized)
+      expected_result = """
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id:1
+              threshold: 33
+              left_id: 1
+              right_id: 2
+            }
+            metadata {
+              gain: -0.2
+            }
+          }
+          nodes {
+            bucketized_split {
+              feature_id: 3
+              threshold: 7
+              left_id: 3
+              right_id: 4
+            }
+            metadata {
+              gain: -0.2
+              original_leaf {
+                scalar: 0.01
+               }
+            }
+          }
+          nodes {
+            bucketized_split {
+              feature_id: 3
+              threshold: 5
+              left_id: 5
+              right_id: 6
+            }
+            metadata {
+              gain: 0.5
+              original_leaf {
+                scalar: 0.0143
+               }
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.08
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.093
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.0553
+            }
+          }
+          nodes {
+            leaf {
+                scalar: 0.0783
+            }
+          }
+        }
+        tree_weights: 1.0
+        tree_metadata {
+          num_layers_grown: 2
+          is_finalized: false
+        }
+        growing_metadata {
+          num_trees_attempted: 1
+          num_layers_attempted: 2
+        }
+       """
+      self.assertEqual(new_stamp, 2)
+
+      self.assertProtoEquals(expected_result, res_ensemble)
+      # Now split the leaf 3, again with negative gain. After this layer, the
+      # tree will be finalized, and post-pruning happens. The leafs 3,4,7,8 will
+      # be pruned out.
+
+      # Prepare the third layer.
+      feature_ids = [92]
+      feature1_nodes = np.array([3], dtype=np.int32)
+      feature1_gains = np.array([-0.45], dtype=np.float32)
+      feature1_thresholds = np.array([11], dtype=np.int32)
+      feature1_left_node_contribs = np.array([[0.15]], dtype=np.float32)
+      feature1_right_node_contribs = np.array([[0.5]], dtype=np.float32)
+
+      # Grow tree ensemble.
+      grow_op = boosted_trees_ops.update_ensemble(
+          tree_ensemble_handle,
+          learning_rate=1.0,
+          pruning_mode=boosted_trees_ops.PruningMode.POST_PRUNING,
+          max_depth=3,
+          feature_ids=feature_ids,
+          node_ids=[feature1_nodes],
+          gains=[feature1_gains],
+          thresholds=[feature1_thresholds],
+          left_node_contribs=[feature1_left_node_contribs],
+          right_node_contribs=[feature1_right_node_contribs])
+
+      session.run(grow_op)
+      # After adding this layer, the tree will be finalized
+      new_stamp, serialized = session.run(tree_ensemble.serialize())
+      res_ensemble = boosted_trees_pb2.TreeEnsemble()
+      res_ensemble.ParseFromString(serialized)
+      # Node that nodes 3, 4, 7 and 8 got deleted, so metadata stores has ids
+      # mapped to their parent node 1, with the respective change in logits.
+      expected_result = """
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id:1
+              threshold: 33
+              left_id: 1
+              right_id: 2
+            }
+            metadata {
+              gain: -0.2
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.01
+            }
+          }
+          nodes {
+            bucketized_split {
+              feature_id: 3
+              threshold: 5
+              left_id: 3
+              right_id: 4
+            }
+            metadata {
+              gain: 0.5
+              original_leaf {
+                scalar: 0.0143
+               }
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.0553
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.0783
+            }
+          }
+        }
+        trees {
+          nodes {
+            leaf {
+            }
+          }
+        }
+        tree_weights: 1.0
+        tree_weights: 1.0
+        tree_metadata {
+          num_layers_grown: 3
+          is_finalized: true
+          post_pruned_nodes_meta {
+            new_node_id: 0
+            logit_change: 0.0
+          }
+          post_pruned_nodes_meta {
+            new_node_id: 1
+            logit_change: 0.0
+          }
+          post_pruned_nodes_meta {
+            new_node_id: 2
+            logit_change: 0.0
+          }
+          post_pruned_nodes_meta {
+            new_node_id: 1
+            logit_change: -0.07
+          }
+          post_pruned_nodes_meta {
+            new_node_id: 1
+            logit_change: -0.083
+          }
+          post_pruned_nodes_meta {
+            new_node_id: 3
+            logit_change: 0.0
+          }
+          post_pruned_nodes_meta {
+            new_node_id: 4
+            logit_change: 0.0
+          }
+          post_pruned_nodes_meta {
+            new_node_id: 1
+            logit_change: -0.22
+          }
+          post_pruned_nodes_meta {
+            new_node_id: 1
+            logit_change: -0.57
+          }
+        }
+        tree_metadata {
+        }
+        growing_metadata {
+          num_trees_attempted: 1
+          num_layers_attempted: 3
+        }
+       """
+      self.assertEqual(new_stamp, 3)
+      self.assertProtoEquals(expected_result, res_ensemble)
+
+  def testPostPruningOfAllNodes(self):
+    """Test growing an ensemble with post-pruning, with all nodes are pruned."""
+    with self.test_session() as session:
+      # Create empty ensemble.
+      # Create empty ensemble.
+      tree_ensemble_config = boosted_trees_pb2.TreeEnsemble()
+      tree_ensemble = boosted_trees_ops.TreeEnsemble(
+          'ensemble', serialized_proto=tree_ensemble_config.SerializeToString())
+      tree_ensemble_handle = tree_ensemble.resource_handle
+
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      # Prepare inputs. All have negative gains.
+      feature_ids = [0, 1]
+
+      feature1_nodes = np.array([0], dtype=np.int32)
+      feature1_gains = np.array([-1.3], dtype=np.float32)
+      feature1_thresholds = np.array([7], dtype=np.int32)
+      feature1_left_node_contribs = np.array([[0.013]], dtype=np.float32)
+      feature1_right_node_contribs = np.array([[0.0143]], dtype=np.float32)
+
+      feature2_nodes = np.array([0], dtype=np.int32)
+      feature2_gains = np.array([-0.62], dtype=np.float32)
+      feature2_thresholds = np.array([33], dtype=np.int32)
+      feature2_left_node_contribs = np.array([[0.01]], dtype=np.float32)
+      feature2_right_node_contribs = np.array([[0.0143]], dtype=np.float32)
+
+      # Grow tree ensemble.
+      grow_op = boosted_trees_ops.update_ensemble(
+          tree_ensemble_handle,
+          learning_rate=1.0,
+          pruning_mode=boosted_trees_ops.PruningMode.POST_PRUNING,
+          max_depth=2,
+          feature_ids=feature_ids,
+          node_ids=[feature1_nodes, feature2_nodes],
+          gains=[feature1_gains, feature2_gains],
+          thresholds=[feature1_thresholds, feature2_thresholds],
+          left_node_contribs=[
+              feature1_left_node_contribs, feature2_left_node_contribs
+          ],
+          right_node_contribs=[
+              feature1_right_node_contribs, feature2_right_node_contribs
+          ])
+
+      session.run(grow_op)
+
+      # Expect the split from feature 2 to be chosen despite the negative gain.
+      # The grown tree should not be finalized as max tree depth is 2 so no
+      # pruning occurs.
+      new_stamp, serialized = session.run(tree_ensemble.serialize())
+      res_ensemble = boosted_trees_pb2.TreeEnsemble()
+      res_ensemble.ParseFromString(serialized)
+
+      expected_result = """
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id: 1
+              threshold: 33
+              left_id: 1
+              right_id: 2
+            }
+            metadata {
+              gain: -0.62
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.01
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 0.0143
+            }
+          }
+        }
+        tree_weights: 1.0
+        tree_metadata {
+          num_layers_grown: 1
+        }
+        growing_metadata {
+          num_trees_attempted: 1
+          num_layers_attempted: 1
+        }
+      """
+      self.assertEqual(new_stamp, 1)
+      self.assertProtoEquals(expected_result, res_ensemble)
+
+      # Prepare inputs.
+      # All have negative gain.
+      feature_ids = [3]
+      feature1_nodes = np.array([1, 2], dtype=np.int32)
+      feature1_gains = np.array([-0.2, -0.5], dtype=np.float32)
+      feature1_thresholds = np.array([77, 79], dtype=np.int32)
+      feature1_left_node_contribs = np.array([[0.023], [0.3]], dtype=np.float32)
+      feature1_right_node_contribs = np.array(
+          [[0.012343], [24]], dtype=np.float32)
+
+      grow_op = boosted_trees_ops.update_ensemble(
+          tree_ensemble_handle,
+          learning_rate=1.0,
+          pruning_mode=boosted_trees_ops.PruningMode.POST_PRUNING,
+          max_depth=2,
+          feature_ids=feature_ids,
+          node_ids=[feature1_nodes],
+          gains=[feature1_gains],
+          thresholds=[feature1_thresholds],
+          left_node_contribs=[feature1_left_node_contribs],
+          right_node_contribs=[feature1_right_node_contribs])
+
+      session.run(grow_op)
+
+      # Expect the split from feature 1 to be chosen despite the negative gain.
+      # The grown tree should be finalized. Since all nodes have negative gain,
+      # the whole tree is pruned.
+      new_stamp, serialized = session.run(tree_ensemble.serialize())
+      res_ensemble = boosted_trees_pb2.TreeEnsemble()
+      res_ensemble.ParseFromString(serialized)
+
+      # Expect the ensemble to be empty as post-pruning will prune
+      # the entire finalized tree.
+      self.assertEqual(new_stamp, 2)
+      self.assertProtoEquals("""
+      trees {
+        nodes {
+          leaf {
+          }
+        }
+      }
+      trees {
+        nodes {
+          leaf {
+          }
+        }
+      }
+      tree_weights: 1.0
+      tree_weights: 1.0
+      tree_metadata{
+        num_layers_grown: 2
+        is_finalized: true
+        post_pruned_nodes_meta {
+          new_node_id: 0
+          logit_change: 0.0
+        }
+        post_pruned_nodes_meta {
+          new_node_id: 0
+          logit_change: -0.01
+        }
+        post_pruned_nodes_meta {
+          new_node_id: 0
+          logit_change: -0.0143
+        }
+        post_pruned_nodes_meta {
+          new_node_id: 0
+          logit_change: -0.033
+        }
+        post_pruned_nodes_meta {
+          new_node_id: 0
+          logit_change: -0.022343
+        }
+        post_pruned_nodes_meta {
+          new_node_id: 0
+          logit_change: -0.3143
+        }
+        post_pruned_nodes_meta {
+          new_node_id: 0
+          logit_change: -24.0143
+        }
+      }
+      tree_metadata {
+      }
+      growing_metadata {
+        num_trees_attempted: 1
+        num_layers_attempted: 2
+      }
+      """, res_ensemble)
+
+  def testPostPruningChangesNothing(self):
+    """Test growing an ensemble with post-pruning with all gains >0."""
+    with self.test_session() as session:
+      # Create empty ensemble.
+      tree_ensemble_config = boosted_trees_pb2.TreeEnsemble()
+      tree_ensemble = boosted_trees_ops.TreeEnsemble(
+          'ensemble', serialized_proto=tree_ensemble_config.SerializeToString())
+      tree_ensemble_handle = tree_ensemble.resource_handle
+
+      resources.initialize_resources(resources.shared_resources()).run()
+
+      # Prepare inputs.
+      # Second feature has larger (but still negative gain).
+      feature_ids = [3, 4]
+
+      feature1_nodes = np.array([0], dtype=np.int32)
+      feature1_gains = np.array([7.62], dtype=np.float32)
+      feature1_thresholds = np.array([52], dtype=np.int32)
+      feature1_left_node_contribs = np.array([[-4.375]], dtype=np.float32)
+      feature1_right_node_contribs = np.array([[7.143]], dtype=np.float32)
+
+      feature2_nodes = np.array([0], dtype=np.int32)
+      feature2_gains = np.array([0.63], dtype=np.float32)
+      feature2_thresholds = np.array([23], dtype=np.int32)
+      feature2_left_node_contribs = np.array([[-0.6]], dtype=np.float32)
+      feature2_right_node_contribs = np.array([[0.24]], dtype=np.float32)
+
+      # Grow tree ensemble.
+      grow_op = boosted_trees_ops.update_ensemble(
+          tree_ensemble_handle,
+          learning_rate=1.0,
+          pruning_mode=boosted_trees_ops.PruningMode.POST_PRUNING,
+          max_depth=1,
+          feature_ids=feature_ids,
+          node_ids=[feature1_nodes, feature2_nodes],
+          gains=[feature1_gains, feature2_gains],
+          thresholds=[feature1_thresholds, feature2_thresholds],
+          left_node_contribs=[
+              feature1_left_node_contribs, feature2_left_node_contribs
+          ],
+          right_node_contribs=[
+              feature1_right_node_contribs, feature2_right_node_contribs
+          ])
+
+      session.run(grow_op)
+
+      # Expect the split from the first feature to be chosen.
+      # Pruning got triggered but changed nothing.
+      new_stamp, serialized = session.run(tree_ensemble.serialize())
+      res_ensemble = boosted_trees_pb2.TreeEnsemble()
+      res_ensemble.ParseFromString(serialized)
+
+      expected_result = """
+        trees {
+          nodes {
+            bucketized_split {
+              feature_id: 3
+              threshold: 52
+              left_id: 1
+              right_id: 2
+            }
+            metadata {
+              gain: 7.62
+            }
+          }
+          nodes {
+            leaf {
+              scalar: -4.375
+            }
+          }
+          nodes {
+            leaf {
+              scalar: 7.143
+            }
+          }
+        }
+        trees {
+          nodes {
+            leaf {
+            }
+          }
+        }
+        tree_weights: 1.0
+        tree_weights: 1.0
+        tree_metadata {
+          num_layers_grown: 1
+          is_finalized: true
+        }
+        tree_metadata {
+        }
+        growing_metadata {
+          num_trees_attempted: 1
+          num_layers_attempted: 1
+        }
+      """
+      self.assertEqual(new_stamp, 1)
+      self.assertProtoEquals(expected_result, res_ensemble)
+
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/python/kernel_tests/clip_ops_test.py b/tensorflow/python/kernel_tests/clip_ops_test.py
index 5c8b71da174b8c38a797f8bf97c432d732d9978f..cb1359be159f5d8983f149cf42b2723dc0581ea8 100644
--- a/tensorflow/python/kernel_tests/clip_ops_test.py
+++ b/tensorflow/python/kernel_tests/clip_ops_test.py
@@ -19,16 +19,34 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import clip_ops
+from tensorflow.python.ops import gradient_checker
 from tensorflow.python.platform import test
 
 
 class ClipTest(test.TestCase):
 
+  def testClipByValueGradient(self):
+    inputs = constant_op.constant([1.0, 2.0, 3.0, 4.0], dtype=dtypes.float32)
+    outputs_1 = clip_ops.clip_by_value(inputs, 0.5, 3.5)
+    min_val = constant_op.constant([0.5, 0.5, 0.5, 0.5], dtype=dtypes.float32)
+    max_val = constant_op.constant([3.5, 3.5, 3.5, 3.5], dtype=dtypes.float32)
+    outputs_2 = clip_ops.clip_by_value(inputs, min_val, max_val)
+    with self.test_session():
+      error_1 = gradient_checker.compute_gradient_error(inputs, [4],
+                                                        outputs_1, [4])
+      self.assertLess(error_1, 1e-4)
+
+      error_2 = gradient_checker.compute_gradient_error(inputs, [4],
+                                                        outputs_2, [4])
+      self.assertLess(error_2, 1e-4)
+
   # ClipByValue test
   def testClipByValue(self):
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       x = constant_op.constant([-5.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[2, 3])
       np_ans = [[-4.4, 2.0, 3.0], [4.0, 4.4, 4.4]]
       clip_value = 4.4
@@ -37,18 +55,84 @@ class ClipTest(test.TestCase):
 
     self.assertAllClose(np_ans, tf_ans)
 
+  # [Tensor, Scalar, Scalar]
+  def testClipByValue0Type(self):
+    for dtype in [dtypes.float16, dtypes.float32, dtypes.float64,
+                  dtypes.int8, dtypes.int16, dtypes.int32, dtypes.int64,
+                  dtypes.uint8, dtypes.uint16]:
+      with self.test_session(use_gpu=True):
+        x = constant_op.constant([1, 2, 3, 4, 5, 6], shape=[2, 3], dtype=dtype)
+        np_ans = [[2, 2, 3], [4, 4, 4]]
+        clip_value_min = 2
+        clip_value_max = 4
+        ans = clip_ops.clip_by_value(x, clip_value_min, clip_value_max)
+        tf_ans = ans.eval()
+
+      self.assertAllClose(np_ans, tf_ans)
+
+  # [Tensor, Tensor, Scalar]
+  def testClipByValue1Type(self):
+    for dtype in [dtypes.float16, dtypes.float32, dtypes.float64,
+                  dtypes.int8, dtypes.int16, dtypes.int32, dtypes.int64,
+                  dtypes.uint8, dtypes.uint16]:
+      with self.test_session(use_gpu=True):
+        x = constant_op.constant([1, 2, 3, 4, 5, 6], shape=[2, 3], dtype=dtype)
+        np_ans = [[2, 2, 3], [4, 4, 4]]
+        clip_value_min = constant_op.constant([2, 2, 2, 3, 3, 3], shape=[2, 3],
+                                              dtype=dtype)
+        clip_value_max = 4
+        ans = clip_ops.clip_by_value(x, clip_value_min, clip_value_max)
+        tf_ans = ans.eval()
+
+      self.assertAllClose(np_ans, tf_ans)
+
+  # [Tensor, Scalar, Tensor]
+  def testClipByValue2Type(self):
+    for dtype in [dtypes.float16, dtypes.float32, dtypes.float64,
+                  dtypes.int8, dtypes.int16, dtypes.int32, dtypes.int64,
+                  dtypes.uint8, dtypes.uint16]:
+      with self.test_session(use_gpu=True):
+        x = constant_op.constant([1, 2, 3, 4, 5, 6], shape=[2, 3], dtype=dtype)
+        np_ans = [[4, 4, 4], [4, 5, 6]]
+        clip_value_min = 4
+        clip_value_max = constant_op.constant([6, 6, 6, 6, 6, 6], shape=[2, 3],
+                                              dtype=dtype)
+        ans = clip_ops.clip_by_value(x, clip_value_min, clip_value_max)
+        tf_ans = ans.eval()
+
+      self.assertAllClose(np_ans, tf_ans)
+
+  # [Tensor, Tensor, Tensor]
+  def testClipByValue3Type(self):
+    for dtype in [dtypes.float16, dtypes.float32, dtypes.float64,
+                  dtypes.int8, dtypes.int16, dtypes.int32, dtypes.int64,
+                  dtypes.uint8, dtypes.uint16]:
+      with self.test_session(use_gpu=True):
+        x = constant_op.constant([1, 2, 3, 4, 5, 6], shape=[2, 3], dtype=dtype)
+        np_ans = [[2, 2, 3], [5, 5, 6]]
+        clip_value_min = constant_op.constant([2, 2, 2, 5, 5, 5], shape=[2, 3],
+                                              dtype=dtype)
+        clip_value_max = constant_op.constant([5, 5, 5, 7, 7, 7], shape=[2, 3],
+                                              dtype=dtype)
+        ans = clip_ops.clip_by_value(x, clip_value_min, clip_value_max)
+        tf_ans = ans.eval()
+
+      self.assertAllClose(np_ans, tf_ans)
+
   def testClipByValueBadShape(self):
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       x = constant_op.constant([-5.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[2, 3, 1])
       # Use a nonsensical shape.
       clip = constant_op.constant([1.0, 2.0])
-      with self.assertRaises(ValueError):
-        _ = clip_ops.clip_by_value(x, -clip, clip)
-      with self.assertRaises(ValueError):
-        _ = clip_ops.clip_by_value(x, 1.0, clip)
+      with self.assertRaises(errors_impl.InvalidArgumentError):
+        ans = clip_ops.clip_by_value(x, -clip, clip)
+        tf_ans = ans.eval()
+      with self.assertRaises(errors_impl.InvalidArgumentError):
+        ans = clip_ops.clip_by_value(x, 1.0, clip)
+        tf_ans = ans.eval()
 
   def testClipByValueNonFinite(self):
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       x = constant_op.constant([float('NaN'), float('Inf'), -float('Inf')])
       np_ans = [float('NaN'), 4.0, -4.0]
       clip_value = 4.0
@@ -60,7 +144,7 @@ class ClipTest(test.TestCase):
   # ClipByNorm tests
   def testClipByNormClipped(self):
     # Norm clipping when clip_norm < 5
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       x = constant_op.constant([-3.0, 0.0, 0.0, 4.0, 0.0, 0.0], shape=[2, 3])
       # Norm of x = sqrt(3^2 + 4^2) = 5
       np_ans = [[-2.4, 0.0, 0.0], [3.2, 0.0, 0.0]]
@@ -76,7 +160,7 @@ class ClipTest(test.TestCase):
     self.assertAllClose(np_ans, tf_ans_tensor)
 
   def testClipByNormBadShape(self):
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       x = constant_op.constant([-3.0, 0.0, 0.0, 4.0, 0.0, 0.0], shape=[2, 3, 1])
       # Use a nonsensical shape.
       clip = constant_op.constant([1.0, 2.0])
@@ -85,7 +169,7 @@ class ClipTest(test.TestCase):
 
   def testClipByNormNotClipped(self):
     # No norm clipping when clip_norm >= 5
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       x = constant_op.constant([-3.0, 0.0, 0.0, 4.0, 0.0, 0.0], shape=[2, 3])
       # Norm of x = sqrt(3^2 + 4^2) = 5
       np_ans = [[-3.0, 0.0, 0.0], [4.0, 0.0, 0.0]]
@@ -97,7 +181,7 @@ class ClipTest(test.TestCase):
 
   def testClipByNormZero(self):
     # No norm clipping when norm = 0
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       x = constant_op.constant([0.0, 0.0, 0.0, 0.0, 0.0, 0.0], shape=[2, 3])
       # Norm = 0, no changes
       np_ans = [[0.0, 0.0, 0.0], [0.0, 0.0, 0.0]]
@@ -109,7 +193,7 @@ class ClipTest(test.TestCase):
 
   def testClipByNormClippedWithDim0(self):
     # Norm clipping when clip_norm < 5
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       x = constant_op.constant([-3.0, 0.0, 0.0, 4.0, 0.0, 3.0], shape=[2, 3])
       # Norm of x[:, 0] = sqrt(3^2 + 4^2) = 5, x[:, 2] = 3
       np_ans = [[-2.4, 0.0, 0.0], [3.2, 0.0, 3.0]]
@@ -121,7 +205,7 @@ class ClipTest(test.TestCase):
 
   def testClipByNormClippedWithDim1(self):
     # Norm clipping when clip_norm < 5
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       x = constant_op.constant([-3.0, 0.0, 0.0, 4.0, 0.0, 3.0], shape=[2, 3])
       # Norm of x[0, :] = 3, x[1, :] = sqrt(3^2 + 4^2) = 5
       np_ans = [[-3.0, 0.0, 0.0], [3.2, 0.0, 2.4]]
@@ -133,7 +217,7 @@ class ClipTest(test.TestCase):
 
   def testClipByNormNotClippedWithAxes(self):
     # No norm clipping when clip_norm >= 5
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       x = constant_op.constant([-3.0, 0.0, 0.0, 4.0, 0.0, 3.0], shape=[2, 3])
       # Norm of x[0, :] = 3, x[1, :] = sqrt(3^2 + 4^2) = 5
       np_ans = [[-3.0, 0.0, 0.0], [4.0, 0.0, 3.0]]
@@ -146,7 +230,7 @@ class ClipTest(test.TestCase):
   # ClipByGlobalNorm tests
   def testClipByGlobalNormClipped(self):
     # Norm clipping when clip_norm < 5
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       x0 = constant_op.constant([-2.0, 0.0, 0.0, 4.0, 0.0, 0.0], shape=[2, 3])
       x1 = constant_op.constant([1.0, -2.0])
       # Global norm of x0 and x1 = sqrt(1 + 4^2 + 2^2 + 2^2) = 5
@@ -167,7 +251,7 @@ class ClipTest(test.TestCase):
 
   def testClipByGlobalNormClippedTensor(self):
     # Norm clipping when clip_norm < 5
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       x0 = constant_op.constant([-2.0, 0.0, 0.0, 4.0, 0.0, 0.0], shape=[2, 3])
       x1 = constant_op.constant([1.0, -2.0])
       # Global norm of x0 and x1 = sqrt(1 + 4^2 + 2^2 + 2^2) = 5
@@ -188,7 +272,7 @@ class ClipTest(test.TestCase):
 
   def testClipByGlobalNormSupportsNone(self):
     # Norm clipping when clip_norm < 5
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       x0 = constant_op.constant([-2.0, 0.0, 0.0, 4.0, 0.0, 0.0], shape=[2, 3])
       x1 = constant_op.constant([1.0, -2.0])
       # Global norm of x0 and x1 = sqrt(1 + 4^2 + 2^2 + 2^2) = 5
@@ -211,7 +295,7 @@ class ClipTest(test.TestCase):
 
   def testClipByGlobalNormWithIndexedSlicesClipped(self):
     # Norm clipping when clip_norm < 5
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       x0 = constant_op.constant([-2.0, 0.0, 0.0, 4.0, 0.0, 0.0], shape=[2, 3])
       x1 = ops.IndexedSlices(
           constant_op.constant([1.0, -2.0]), constant_op.constant([3, 4]))
@@ -244,7 +328,7 @@ class ClipTest(test.TestCase):
 
   def testClipByGlobalNormNotClipped(self):
     # No norm clipping when clip_norm >= 5
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       x0 = constant_op.constant([-2.0, 0.0, 0.0, 4.0, 0.0, 0.0], shape=[2, 3])
       x1 = constant_op.constant([1.0, -2.0])
       # Global norm of x0 and x1 = sqrt(1 + 4^2 + 2^2 + 2^2) = 5
@@ -263,7 +347,7 @@ class ClipTest(test.TestCase):
 
   def testClipByGlobalNormZero(self):
     # No norm clipping when norm = 0
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       x0 = constant_op.constant([0.0, 0.0, 0.0, 0.0, 0.0, 0.0], shape=[2, 3])
       x1 = constant_op.constant([0.0, 0.0])
       # Norm = 0, no changes
@@ -282,7 +366,7 @@ class ClipTest(test.TestCase):
 
   def testClipByAverageNormClipped(self):
     # Norm clipping when average clip_norm < 0.83333333
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       x = constant_op.constant([-3.0, 0.0, 0.0, 4.0, 0.0, 0.0], shape=[2, 3])
       # Average norm of x = sqrt(3^2 + 4^2) / 6 = 0.83333333
       np_ans = [[-2.88, 0.0, 0.0], [3.84, 0.0, 0.0]]
@@ -294,7 +378,7 @@ class ClipTest(test.TestCase):
 
   def testClipByAverageNormClippedTensor(self):
     # Norm clipping when average clip_norm < 0.83333333
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       x = constant_op.constant([-3.0, 0.0, 0.0, 4.0, 0.0, 0.0], shape=[2, 3])
       # Average norm of x = sqrt(3^2 + 4^2) / 6 = 0.83333333
       np_ans = [[-2.88, 0.0, 0.0], [3.84, 0.0, 0.0]]
@@ -306,7 +390,7 @@ class ClipTest(test.TestCase):
 
   def testClipByAverageNormNotClipped(self):
     # No norm clipping when average clip_norm >= 0.83333333
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       x = constant_op.constant([-3.0, 0.0, 0.0, 4.0, 0.0, 0.0], shape=[2, 3])
       # Average norm of x = sqrt(3^2 + 4^2) / 6 = 0.83333333
       np_ans = [[-3.0, 0.0, 0.0], [4.0, 0.0, 0.0]]
@@ -318,7 +402,7 @@ class ClipTest(test.TestCase):
 
   def testClipByAverageNormZero(self):
     # No norm clipping when average clip_norm = 0
-    with self.test_session():
+    with self.test_session(use_gpu=True):
       x = constant_op.constant([0.0, 0.0, 0.0, 0.0, 0.0, 0.0], shape=[2, 3])
       # Average norm = 0, no changes
       np_ans = [[0.0, 0.0, 0.0], [0.0, 0.0, 0.0]]
diff --git a/tensorflow/python/kernel_tests/constant_op_test.py b/tensorflow/python/kernel_tests/constant_op_test.py
index 18796f709566f022258806ce46cc706e8fe34354..749313b00d8b01eac821662ee4cfa61cff4e34b6 100644
--- a/tensorflow/python/kernel_tests/constant_op_test.py
+++ b/tensorflow/python/kernel_tests/constant_op_test.py
@@ -653,12 +653,12 @@ class FillTest(test.TestCase):
     self._compareAll([2, 3], np_ans[0][0], np_ans)
 
   def testFillComplex64(self):
-    np_ans = np.array([[0.15] * 3] * 2).astype(np.complex64)
-    self._compare([2, 3], np_ans[0][0], np_ans, use_gpu=False)
+    np_ans = np.array([[0.15 + 0.3j] * 3] * 2).astype(np.complex64)
+    self._compareAll([2, 3], np_ans[0][0], np_ans)
 
   def testFillComplex128(self):
-    np_ans = np.array([[0.15] * 3] * 2).astype(np.complex128)
-    self._compare([2, 3], np_ans[0][0], np_ans, use_gpu=False)
+    np_ans = np.array([[0.15 + 0.3j] * 3] * 2).astype(np.complex128)
+    self._compareAll([2, 3], np_ans[0][0], np_ans)
 
   def testFillString(self):
     np_ans = np.array([[b"yolo"] * 3] * 2)
diff --git a/tensorflow/python/kernel_tests/conv_ops_3d_test.py b/tensorflow/python/kernel_tests/conv_ops_3d_test.py
index ec8ac74163d093c57e6e4ffbab6977ce732cc3ef..f4616fd661f989c1c3e4939a3d062b0260f8572e 100644
--- a/tensorflow/python/kernel_tests/conv_ops_3d_test.py
+++ b/tensorflow/python/kernel_tests/conv_ops_3d_test.py
@@ -25,6 +25,7 @@ import numpy as np
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import nn_ops
@@ -344,6 +345,8 @@ class Conv3DTest(test.TestCase):
         if data_format == "NCDHW":
           conv = test_util.NCHWToNHWC(conv)
 
+        self.assertEqual(conv.shape, tensor_shape.TensorShape(output_shape))
+
         if test_input:
           jacob_t, jacob_n = gradient_checker.compute_gradient(
               orig_input_tensor, input_shape, conv, output_shape)
diff --git a/tensorflow/python/kernel_tests/cwise_ops_test.py b/tensorflow/python/kernel_tests/cwise_ops_test.py
index 8db0bb6f0dc495e7be2cd717787acf87156f42af..34e77512434ea26d2693f0bb1600ff81dd15e84f 100644
--- a/tensorflow/python/kernel_tests/cwise_ops_test.py
+++ b/tensorflow/python/kernel_tests/cwise_ops_test.py
@@ -2165,5 +2165,47 @@ class AccumulateTest(test.TestCase):
         math_ops.accumulate_n([a], tensor_dtype=np.int32)
 
 
+class PolyvalTest(test.TestCase):
+
+  def _runtest(self, dtype, degree):
+    x = np.random.rand(2, 2).astype(dtype)
+    coeffs = [np.random.rand(2, 2).astype(dtype) for _ in range(degree + 1)]
+    np_val = np.polyval(coeffs, x)
+    with self.test_session():
+      tf_val = math_ops.polyval(coeffs, x)
+      self.assertAllClose(np_val, tf_val.eval())
+
+  def testSimple(self):
+    for dtype in [
+        np.int32, np.float32, np.float64, np.complex64, np.complex128
+    ]:
+      for degree in range(5):
+        self._runtest(dtype, degree)
+
+  def testBroadcast(self):
+    dtype = np.float32
+    degree = 3
+    shapes = [(1,), (2, 1), (1, 2), (2, 2)]
+    for x_shape in shapes:
+      for coeff_shape in shapes:
+        x = np.random.rand(*x_shape).astype(dtype)
+        coeffs = [
+            np.random.rand(*coeff_shape).astype(dtype)
+            for _ in range(degree + 1)
+        ]
+        np_val = np.polyval(coeffs, x)
+        with self.test_session():
+          tf_val = math_ops.polyval(coeffs, x)
+          self.assertAllClose(np_val, tf_val.eval())
+
+  def testEmpty(self):
+    x = np.random.rand(2, 2).astype(np.float32)
+    coeffs = []
+    np_val = np.polyval(coeffs, x)
+    with self.test_session():
+      tf_val = math_ops.polyval(coeffs, x)
+      self.assertAllClose(np_val, tf_val.eval())
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/decode_csv_op_test.py b/tensorflow/python/kernel_tests/decode_csv_op_test.py
index fec52fa9cc7bcab1da67e797c2e121edac8c9345..4f49d726765e6019715a9b40f531b82df7f33126 100644
--- a/tensorflow/python/kernel_tests/decode_csv_op_test.py
+++ b/tensorflow/python/kernel_tests/decode_csv_op_test.py
@@ -78,9 +78,11 @@ class DecodeCSVOpTest(test.TestCase):
     self._test(args, expected_out)
 
   def test2DNoQuoteDelimiter(self):
-    args = {"records": [["1", "2"], ['""', '"']],
-            "record_defaults": [[""]],
-            "use_quote_delim": False}
+    args = {
+        "records": [["1", "2"], ['""', '"']],
+        "record_defaults": [[""]],
+        "use_quote_delim": False
+    }
     expected_out = [[[b"1", b"2"], [b'""', b'"']]]
 
     self._test(args, expected_out)
@@ -88,8 +90,7 @@ class DecodeCSVOpTest(test.TestCase):
   def testDouble(self):
     args = {
         "records": ["1.0", "-1.79e+308", '"1.79e+308"'],
-        "record_defaults": [np.array(
-            [], dtype=np.double)],
+        "record_defaults": [np.array([], dtype=np.double)],
     }
 
     expected_out = [[1.0, -1.79e+308, 1.79e+308]]
@@ -99,8 +100,7 @@ class DecodeCSVOpTest(test.TestCase):
   def testInt64(self):
     args = {
         "records": ["1", "2", '"2147483648"'],
-        "record_defaults": [np.array(
-            [], dtype=np.int64)],
+        "record_defaults": [np.array([], dtype=np.int64)],
     }
 
     expected_out = [[1, 2, 2147483648]]
@@ -173,8 +173,7 @@ class DecodeCSVOpTest(test.TestCase):
   def testWithoutDefaultsError(self):
     args = {
         "records": [",1", "0.2,3", "3.0,"],
-        "record_defaults": [[1.0], np.array(
-            [], dtype=np.int32)]
+        "record_defaults": [[1.0], np.array([], dtype=np.int32)]
     }
 
     self._test(
@@ -183,8 +182,7 @@ class DecodeCSVOpTest(test.TestCase):
   def testWrongFieldIntError(self):
     args = {
         "records": [",1", "0.2,234a", "3.0,2"],
-        "record_defaults": [[1.0], np.array(
-            [], dtype=np.int32)]
+        "record_defaults": [[1.0], np.array([], dtype=np.int32)]
     }
 
     self._test(
@@ -202,8 +200,7 @@ class DecodeCSVOpTest(test.TestCase):
   def testWrongFieldFloatError(self):
     args = {
         "records": [",1", "0.2,2", "3.0adf,3"],
-        "record_defaults": [[1.0], np.array(
-            [], dtype=np.int32)]
+        "record_defaults": [[1.0], np.array([], dtype=np.int32)]
     }
 
     self._test(
@@ -229,6 +226,73 @@ class DecodeCSVOpTest(test.TestCase):
     self._test(
         args, expected_err_re="Quoted field has to end with quote followed.*")
 
+  def testSelectCols(self):
+    args = {
+        "records": [",,", "4,5,6"],
+        "record_defaults": [[1], [2]],
+        "select_cols": [0, 1]
+    }
+    expected_out = [[1, 4], [2, 5]]
+    self._test(args, expected_out)
+
+  def testSelectColsInclLast(self):
+    # The last col is a edge-casey; add test for that
+    args = {
+        "records": [",,", "4,5,6"],
+        "record_defaults": [[0], [1], [2]],
+        "select_cols": [0, 1, 2]
+    }
+    expected_out = [[0, 4], [1, 5], [2, 6]]
+    self._test(args, expected_out)
+
+  def testWrongSelectColsInclLast(self):
+    # The last col is a edge-casey; add test for that
+    args = {
+        "records": [",,", "4,5,6"],
+        "record_defaults": [[0], [1], [2]],
+        "select_cols": [0, 1, 3]
+    }
+    self._test(args, expected_err_re="Expect 3 fields but have 2 in record 0")
+
+  def testWrongSelectColsLen(self):
+    args = {
+        "records": ["1,2,3", "4,5,6"],
+        "record_defaults": [[0], [0], [0]],
+        "select_cols": [0]
+    }
+    with self.assertRaisesWithPredicateMatch(
+        ValueError, "Length of select_cols and record_defaults do not match."):
+      self._test(args)
+
+  def testWrongSelectColsSorting(self):
+    args = {
+        "records": ["1,2,3"],
+        "record_defaults": [[0], [1]],
+        "select_cols": [1, 0]
+    }
+    with self.assertRaisesWithPredicateMatch(
+        ValueError, "select_cols is not strictly increasing."):
+      self._test(args)
+
+  def testWrongSelectColsIndicesNegative(self):
+    args = {
+        "records": ["1,2,3"],
+        "record_defaults": [[0], [1]],
+        "select_cols": [-1, 0]  # -1 is not a valid index
+    }
+    with self.assertRaisesWithPredicateMatch(
+        ValueError, "select_cols contains negative values."):
+      self._test(args)
+
+  def testWrongSelectColsIndicesTooHigh(self):
+    args = {
+        "records": ["1,2,3"],
+        "record_defaults": [[0], [1]],
+        "select_cols": [0, 3]  # 3 is not a valid index
+    }
+    # Only successfully parses one of the columns
+    self._test(args, expected_err_re="Expect 2 fields but have 1 in record 0")
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/distributions/BUILD b/tensorflow/python/kernel_tests/distributions/BUILD
index e220d0569281c6dbe4107fdfb8013e99592f153c..f3cc9636f91f7d3573d8a66d6b1b4936e49a9141 100644
--- a/tensorflow/python/kernel_tests/distributions/BUILD
+++ b/tensorflow/python/kernel_tests/distributions/BUILD
@@ -280,15 +280,3 @@ cuda_py_test(
         "//tensorflow/python:platform_test",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/python/kernel_tests/distributions/uniform_test.py b/tensorflow/python/kernel_tests/distributions/uniform_test.py
index df99a0ed257da20179909eb44eacf7d44528dad2..a8def95b147b6dd4825675769187733b8493b374 100644
--- a/tensorflow/python/kernel_tests/distributions/uniform_test.py
+++ b/tensorflow/python/kernel_tests/distributions/uniform_test.py
@@ -281,6 +281,22 @@ class UniformTest(test.TestCase):
       expected_pdf = [1.0, 0.1]
       self.assertAllClose(expected_pdf, pdf.eval())
 
+  def testUniformFloat64(self):
+    uniform = uniform_lib.Uniform(
+        low=np.float64(0.), high=np.float64(1.))
+
+    self.assertAllClose(
+        [1., 1.],
+        self.evaluate(uniform.prob(np.array([0.5, 0.6], dtype=np.float64))))
+
+    self.assertAllClose(
+        [0.5, 0.6],
+        self.evaluate(uniform.cdf(np.array([0.5, 0.6], dtype=np.float64))))
+
+    self.assertAllClose(0.5, self.evaluate(uniform.mean()))
+    self.assertAllClose(1 / 12., self.evaluate(uniform.variance()))
+    self.assertAllClose(0., self.evaluate(uniform.entropy()))
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/functional_ops_test.py b/tensorflow/python/kernel_tests/functional_ops_test.py
index f5717a5a21a0be82382c5da556ed6f5540591abf..34fb655035d6cadab583c1f66dbeae3f7a0b65b0 100644
--- a/tensorflow/python/kernel_tests/functional_ops_test.py
+++ b/tensorflow/python/kernel_tests/functional_ops_test.py
@@ -24,6 +24,7 @@ from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.client import session
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
 from tensorflow.python.framework import function
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
@@ -39,6 +40,7 @@ import tensorflow.python.ops.tensor_array_grad  # pylint: disable=unused-import
 from tensorflow.python.platform import test
 
 
+# pylint: disable=invalid-name
 def simple_scoped_fn(a, x):
   """Simple function: (a, x) -> 2(x+a), but with "2" as a variable in scope."""
   with variable_scope.variable_scope("body"):
@@ -158,6 +160,13 @@ class FunctionalOpsTest(test.TestCase):
                 values=constant_op.constant([0, 1, 2]),
                 dense_shape=[2, 2]))
 
+  @test_util.run_in_graph_and_eager_modes()
+  def testMapOverScalarErrors(self):
+    with self.assertRaisesRegexp(ValueError, "not scalars"):
+      functional_ops.map_fn(lambda x: x, [1, 2])
+    with self.assertRaisesRegexp(ValueError, "not a scalar"):
+      functional_ops.map_fn(lambda x: x, 1)
+
   def testMap_Scoped(self):
     with self.test_session() as sess:
 
@@ -229,7 +238,7 @@ class FunctionalOpsTest(test.TestCase):
     with self.test_session():
       nums = np.array([1, 2, 3, 4, 5, 6])
       with self.assertRaisesRegexp(
-          TypeError, r"two structures don't have the same sequence type."):
+          TypeError, r"two structures don't have the same nested structure"):
         # lambda emits tuple, but dtype is a list
         functional_ops.map_fn(
             lambda x: ((x + 3) * 2, -(x + 3) * 2),
@@ -316,7 +325,7 @@ class FunctionalOpsTest(test.TestCase):
       initializer = np.array(1.0)
       # Multiply a * 1 each time
       with self.assertRaisesRegexp(
-          ValueError, "two structures don't have the same number of elements"):
+          ValueError, "two structures don't have the same nested structure"):
         functional_ops.scan(lambda a, x: (a, -a), elems, initializer)
 
   def testScan_Scoped(self):
@@ -607,6 +616,276 @@ class FunctionalOpsTest(test.TestCase):
       mul = sess.run(remote_op)
       self.assertEqual(mul, 9)
 
+  def testIf(self):
+
+    @function.Defun(dtypes.float32)
+    def Twice(x):
+      return x * 2
+
+    @function.Defun(dtypes.float32)
+    def Thrice(x):
+      return x * 3 + 1
+
+    with self.test_session(use_gpu=False) as sess:
+
+      def Run(x):
+        return sess.run(
+            functional_ops.If(math_ops.greater(x, 0), [x], Twice, Thrice))[0]
+
+      self.assertAllEqual(Run(9.), 18.)
+      self.assertAllEqual(Run(-8.), -23.)
+      self.assertAllEqual(Run(0.), 1.)
+
+  def testWhile(self):
+
+    @function.Defun(*[dtypes.float32] * 2)
+    def Cond(n, unused_x):
+      return n > 0
+
+    @function.Defun(*[dtypes.float32] * 2)
+    def Body(n, x):
+      return n - 1, x + n
+
+    # TODO(b/65752372): Set `use_gpu=False` because
+    # `functional_ops.While()` does not reliably work on GPU (apparently
+    # because the result of evaluating the condition may be in device
+    # memory, but it is read on the host).
+    with self.test_session(use_gpu=False) as sess:
+
+      def Run(n):
+        return sess.run(functional_ops.While([n, 0.], Cond, Body))[1]
+
+      self.assertAllEqual(Run(20.), 210.)
+      self.assertAllEqual(Run(100.), 5050.)
+
+  def testWhileError(self):
+
+    @function.Defun(*[dtypes.float32] * 2)
+    def Cond(n, unused_x):
+      return n > 0
+
+    @function.Defun(*[dtypes.float32] * 2)
+    def CondReturnsTooManyArgs(n, x):
+      return n > 0, x
+
+    @function.Defun(*[dtypes.float32] * 2)
+    def Body(n, x):
+      return n - 1, x + n
+
+    @function.Defun(*[dtypes.float32] * 2)
+    def BodyReturnsTooManyArgs(n, x):
+      return n - 1, x + n, x
+
+    # TODO(b/65752372): Set `use_gpu=False` because
+    # `functional_ops.While()` does not reliably work on GPU (apparently
+    # because the result of evaluating the condition may be in device
+    # memory, but it is read on the host).
+    with self.test_session(use_gpu=False):
+      with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                   "Expected a single scalar.*got 2 tensors."):
+        functional_ops.While([5., 0.], CondReturnsTooManyArgs, Body)[0].eval()
+      with self.assertRaisesRegexp(
+          errors.InvalidArgumentError,
+          "While loop body returned 3 arguments. Expected: 2"):
+        functional_ops.While([5., 0.], Cond, BodyReturnsTooManyArgs)[0].eval()
+
+  def testWhileInMultipleSubgraphs(self):
+
+    @function.Defun(* [dtypes.float32] * 2)
+    def Cond(n, x):  # pylint: disable=unused-argument
+      return n > 0
+
+    @function.Defun(* [dtypes.float32] * 2)
+    def Body(n, x):
+      return n - 1, x + n
+
+    # TODO(b/65752372): Set `use_gpu=False` because
+    # `functional_ops.While()` does not reliably work on GPU (apparently
+    # because the result of evaluating the condition may be in device
+    # memory, but it is read on the host).
+    with self.test_session(use_gpu=False) as sess:
+      n = array_ops.placeholder(dtypes.float32)
+      _, result = functional_ops.While([n, 0.], Cond, Body)
+      c = constant_op.constant(37.)
+
+      self.assertAllEqual(210., sess.run(result, feed_dict={n: 20.}))
+      self.assertAllEqual(5050., sess.run(result, feed_dict={n: 100.}))
+      # Test that the result is the same when we run a different subgraph.
+      self.assertAllEqual(5050., sess.run([result, c], feed_dict={n: 100.})[0])
+
+  def _tfSum(self, rewrite_with_while):
+    # On GPU, don't rewrite using a while loop.
+    use_gpu = not rewrite_with_while
+    with self.test_session(use_gpu=use_gpu) as sess:
+
+      @function.Defun(dtypes.int32, dtypes.float32)
+      def Body(n, x):
+        return x + math_ops.to_float(n)
+
+      xs = [
+          # 1 + 2  + ... + 20
+          functional_ops.For(
+              1, 21, 1, [0.], Body, rewrite_with_while=rewrite_with_while)[0],
+          # 100 + 99 + ... + 1
+          functional_ops.For(
+              100, 0, -1, [0.], Body, rewrite_with_while=rewrite_with_while)[0],
+      ]
+      xvals = sess.run(xs)
+    self.assertAllEqual(210, xvals[0])
+    self.assertAllEqual(5050, xvals[1])
+
+  def testFor(self):
+    self._tfSum(False)
+
+  def testForWithWhile(self):
+    self._tfSum(True)
+
+  def testForWithWhileNaming(self):
+    g = ops.Graph()
+    with g.as_default():
+
+      @function.Defun(dtypes.int32, dtypes.float32, func_name="TestBody")
+      def TestBody(n, x):
+        return x + math_ops.to_float(n)
+
+      _ = functional_ops.For(
+          1, 21, 1, [0.], TestBody, rewrite_with_while=True)[0]
+
+    names = []
+    for func in g.as_graph_def().library.function:
+      names.append(func.signature.name)
+    self.assertTrue("TestBody" in names)
+    self.assertTrue("TestBody_Cond" in names)
+    self.assertTrue("TestBody_Body" in names)
+
+  def testForCapturedInputs(self):
+    v = variables.Variable(1.0)
+
+    @function.Defun(dtypes.int32)
+    def TestNullary(n):
+      v + math_ops.to_float(n)  # pylint: disable=expression-not-assigned
+
+    @function.Defun(dtypes.int32, dtypes.float32)
+    def TestUnary(n, x):
+      return x + math_ops.to_float(n) + v
+
+    @function.Defun(dtypes.int32, dtypes.float32, dtypes.float32)
+    def TestBinary(n, x, x2):
+      return x + math_ops.to_float(n) + v, x2 + v
+
+    for rewrite_with_while in (True, False):
+      # TODO(b/65752372): Set `use_gpu=False` because
+      # `functional_ops.While()` does not reliably work on GPU (apparently
+      # because the result of evaluating the condition may be in device
+      # memory, but it is read on the host).
+      use_gpu = not rewrite_with_while
+      with self.test_session(use_gpu=use_gpu) as sess:
+        result_nullary = functional_ops.For(
+            1, 10, 1, [], TestNullary,
+            rewrite_with_while=rewrite_with_while)
+        result_unary = functional_ops.For(
+            1, 10, 1, [0.], TestUnary,
+            rewrite_with_while=rewrite_with_while)
+        result_binary = functional_ops.For(
+            1, 10, 1, [0., 0.], TestBinary,
+            rewrite_with_while=rewrite_with_while)
+        sess.run(variables.global_variables_initializer())
+        assert not result_nullary
+        # The nullary variant doesn't return anything so we can't easily run it.
+        # As a total hack, fetch the operation by name and run it.
+        sess.run(ops.get_default_graph().get_operation_by_name(
+            "While" if rewrite_with_while else "For"))
+        assert len(result_unary) == 1
+        self.assertEqual([54.0], sess.run(result_unary))
+        assert len(result_binary) == 2
+        self.assertEqual([54.0, 9.0], sess.run(result_binary))
+
+  def _tfMLP(self, xval, wsval, bsval, rewrite_with_while):
+    # On GPU, don't rewrite using a while loop.
+    use_gpu = not rewrite_with_while
+    with self.test_session(use_gpu=use_gpu):
+
+      @function.Defun(dtypes.int32, *[dtypes.float64] * 3)
+      def MLP(i, a, ws, bs):
+        a = math_ops.tanh(math_ops.matmul(a, ws[i, :]) + bs[i, :])
+        return a, ws, bs
+
+      ret = functional_ops.For(
+          0,
+          wsval.shape[0],
+          1, [xval, wsval, bsval],
+          MLP,
+          rewrite_with_while=rewrite_with_while)[0]
+
+      return ret.eval()
+
+  def _npMLP(self, xval, wsval, bsval):
+    for i in range(wsval.shape[0]):
+      xval = np.tanh(np.dot(xval, wsval[i, :]) + bsval[i, :])
+    return xval
+
+  def _testForMLP(self, rewrite_with_while):
+    # We construct a 5-layer Multi-Layer Perceptron network here.
+    # Each layer have the same number of hidden unites (3), and the
+    # activation function is tanh().  We feed the input (xval) with
+    # batch size 2.
+    xval = np.random.normal(size=(2, 3))
+    wsval = np.random.normal(size=(5, 3, 3))
+    bsval = np.random.normal(size=(5, 3))
+    np_ans = self._npMLP(xval, wsval, bsval)
+    tf_for_ans = self._tfMLP(xval, wsval, bsval, rewrite_with_while)
+    self.assertAllClose(np_ans, tf_for_ans)
+
+  def testForMLP(self):
+    self._testForMLP(False)
+
+  def testForMLPWhile(self):
+    self._testForMLP(True)
+
+  def testForError(self):
+
+    @function.Defun(dtypes.int32, dtypes.float32)
+    def Foo(i, v):
+      return math_ops.to_float(i) + v
+
+    @function.Defun(dtypes.int32, dtypes.float32)
+    def ReturnsTooManyArgs(unused_i, v):
+      return v, v
+
+    with self.test_session(use_gpu=True):
+      with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                   "must be a scalar"):
+        functional_ops.For([0], 10, 1, [0.0], Foo)[0].eval()
+      with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                   "Invalid start/limit/delta"):
+        functional_ops.For(0, 10, -1, [0.0], Foo)[0].eval()
+      with self.assertRaisesRegexp(
+          errors.InvalidArgumentError,
+          "For loop body returned 2 arguments. Expected: 1"):
+        functional_ops.For(0, 10, 1, [0.0], ReturnsTooManyArgs)[0].eval()
+
+  def testGradient(self):
+
+    @function.Defun(dtypes.float32)
+    def Poly(x):
+      # y = 2x^3+3x^2+4x+8
+      return 2 * x * x * x + 3 * x * x + 4 * x + 8
+
+    @function.Defun(dtypes.float32)
+    def Grad(x):
+      # dy/dx = dy/dy * dy/dx = 1.0 * (6x^2+6x+4)
+      return functional_ops.Gradient([x, 1.0], Poly)[0]
+
+    with self.test_session(use_gpu=False) as sess:
+      a = constant_op.constant(0.)
+      avals = [Poly(a), Grad(a)]
+      b = constant_op.constant(1.)
+      bvals = [Poly(b), Grad(b)]
+      self.assertAllEqual(sess.run(avals), [8., 4.])
+      self.assertAllEqual(sess.run(bvals), [17., 16.])
+
 
 if __name__ == "__main__":
   test.main()
+
+# pylint: enable=invalid-name
diff --git a/tensorflow/python/kernel_tests/init_ops_test.py b/tensorflow/python/kernel_tests/init_ops_test.py
index c1755985ee85c62005c8d3d5fb916859193aa5f3..1e5c118cbc3573af0a2ce95239f499a5e52a0c86 100644
--- a/tensorflow/python/kernel_tests/init_ops_test.py
+++ b/tensorflow/python/kernel_tests/init_ops_test.py
@@ -618,7 +618,7 @@ class ConvolutionDeltaOrthogonalInitializerTest(test.TestCase):
     for dtype in [dtypes.float32]:
       for kernel_size in [[3], [8], [3, 5], [2, 4], [3, 3, 3], [2, 2, 2]]:
         tol = 1e-2
-        # Check orthogonality by computing the 2-norms of the inputs and ouputs.
+        # Check orthogonality by computing the 2-norms of the inputs and outputs.
         if len(kernel_size) == 1:
           shape = [4, 32, 64]
           convolution = convolutional.conv1d
@@ -649,6 +649,30 @@ class ConvolutionDeltaOrthogonalInitializerTest(test.TestCase):
               sess.run(outputs_2norm)/(np.sqrt(np.prod(shape))*np.sqrt(3.14)),
               rtol=tol, atol=tol)
 
+  def testNonuniformity(self):
+    value = 0
+    abs_value = 0
+    shape = [3, 3, 10, 10]
+    count = 70
+    tol = 1e-5
+    with self.test_session(use_gpu=True):  # as sess:
+      for i in range(count):
+        x = variable_scope.get_variable("{}".format(i), shape=shape,
+                                        initializer=
+                                        init_ops.convolutional_delta_orthogonal)
+        x.initializer.run()
+        y = x.eval()[1, 1, :, :]
+        determinant = np.linalg.det(y)
+        value += determinant
+        abs_value += np.abs(determinant)
+
+      # Check there is some variation in the signs of the determinants
+      self.assertLess(value, count - tol)
+      self.assertLess(-count + tol, value)
+      # Check all determinants have absolute value 1
+      # Compute the sum of the absolute values of 'count' determinants
+      self.assertAllClose(abs_value, count, rtol=tol, atol=tol)
+
 
 class IdentityInitializerTest(test.TestCase):
 
diff --git a/tensorflow/python/kernel_tests/inplace_ops_test.py b/tensorflow/python/kernel_tests/inplace_ops_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..0f95e13187fcd5cc199d871ea5efdca363b37cd0
--- /dev/null
+++ b/tensorflow/python/kernel_tests/inplace_ops_test.py
@@ -0,0 +1,198 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for inplace_ops."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+from six.moves import xrange  # pylint: disable=redefined-builtin
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import inplace_ops
+from tensorflow.python.platform import test as test_lib
+
+
+class InplaceOpsTest(test_util.TensorFlowTestCase):
+
+  def testBasicUpdate(self):
+    for dtype in [dtypes.float32, dtypes.int32, dtypes.int64]:
+      with self.test_session(use_gpu=True):
+        x = array_ops.ones([7, 3], dtype)
+        y = np.ones([7, 3], dtype.as_numpy_dtype)
+        self.assertAllClose(x.eval(), y)
+        x = inplace_ops.inplace_update(x, [3], array_ops.ones([1, 3], dtype))
+        y[3, :] = 1
+        self.assertAllClose(x.eval(), y)
+        x = inplace_ops.inplace_update(x, [-1],
+                                       array_ops.ones([1, 3], dtype) * 2)
+        y[-1, :] = 2
+        self.assertAllClose(x.eval(), y)
+        x = inplace_ops.inplace_update(x, 5, array_ops.ones([3], dtype) * 7)
+        y[5, :] = 7
+        self.assertAllClose(x.eval(), y)
+
+  def testBasicUpdateBool(self):
+    with self.test_session(use_gpu=True):
+      x = array_ops.ones([7, 3], dtypes.bool)
+      y = np.ones([7, 3], dtypes.bool.as_numpy_dtype)
+      self.assertAllClose(x.eval(), y)
+      x = inplace_ops.inplace_update(x, [3], array_ops.ones([1, 3],
+                                                            dtypes.bool))
+      y[3, :] = True
+      self.assertAllClose(x.eval(), y)
+      x = inplace_ops.inplace_update(x, [-1],
+                                     array_ops.zeros([1, 3], dtypes.bool))
+      y[-1, :] = False
+      self.assertAllClose(x.eval(), y)
+      x = inplace_ops.inplace_update(x, 5, array_ops.zeros([3], dtypes.bool))
+      y[5, :] = False
+      self.assertAllClose(x.eval(), y)
+
+  def testBasicAdd(self):
+    for dtype in [dtypes.float32, dtypes.int32, dtypes.int64]:
+      with self.test_session(use_gpu=True):
+        x = array_ops.ones([7, 3], dtype)
+        y = np.ones([7, 3], dtype.as_numpy_dtype)
+        self.assertAllClose(x.eval(), y)
+        x = array_ops.inplace_add(x, [3], array_ops.ones([1, 3], dtype))
+        y[3, :] += 1
+        self.assertAllClose(x.eval(), y)
+        x = inplace_ops.inplace_add(x, [-1], array_ops.ones([1, 3], dtype) * 2)
+        y[-1, :] += 2
+        self.assertAllClose(x.eval(), y)
+        x = inplace_ops.inplace_add(x, 5, array_ops.ones([3], dtype) * 7)
+        y[5, :] += 7
+        self.assertAllClose(x.eval(), y)
+        x = inplace_ops.inplace_add(x, None, array_ops.ones([7, 3], dtype) * 99)
+        y[:, :] += 99
+        self.assertAllClose(x.eval(), y)
+
+  def testBasicSub(self):
+    for dtype in [dtypes.float32, dtypes.int32, dtypes.int64]:
+      with self.test_session(use_gpu=True):
+        x = array_ops.ones([7, 3], dtype)
+        y = np.ones([7, 3], dtype.as_numpy_dtype)
+        self.assertAllClose(x.eval(), y)
+        x = inplace_ops.inplace_sub(x, [3], array_ops.ones([1, 3], dtype))
+        y[3, :] -= 1
+        self.assertAllClose(x.eval(), y)
+        x = inplace_ops.inplace_sub(x, [-1], array_ops.ones([1, 3], dtype) * 2)
+        y[-1, :] -= 2
+        self.assertAllClose(x.eval(), y)
+        x = inplace_ops.inplace_sub(x, 5, array_ops.ones([3], dtype) * 7)
+        y[5, :] -= 7
+        self.assertAllClose(x.eval(), y)
+        x = inplace_ops.inplace_sub(x, None, array_ops.ones([7, 3], dtype) * 99)
+        y[:, :] -= 99
+        self.assertAllClose(x.eval(), y)
+
+  def testRandom(self):
+    with self.test_session(use_gpu=True):
+      d0, d1, d2 = 100, 3, 5
+      x = array_ops.zeros([d0, d1, d2])
+      y = np.zeros([d0, d1, d2])
+      for _ in xrange(20):
+        idx = np.random.choice(d0, d0 // 10, replace=False)
+        val = np.random.randint(10, size=(d0 // 10, d1, d2))
+        op = np.random.randint(3)
+        if op == 0:
+          x = inplace_ops.inplace_update(x, idx, val)
+          y[idx, :] = val
+        elif op == 1:
+          x = inplace_ops.inplace_add(x, idx, val)
+          y[idx, :] += val
+        elif op == 2:
+          x = inplace_ops.inplace_sub(x, idx, val)
+          y[idx, :] -= val
+        self.assertAllClose(x.eval(), y)
+
+  def testRandom1D(self):
+    with self.test_session(use_gpu=True):
+      d0 = 100
+      x = array_ops.zeros([d0])
+      y = np.zeros([d0])
+      for _ in xrange(20):
+        idx = np.random.choice(d0, d0 // 10, replace=False)
+        val = np.random.randint(10, size=(d0 // 10))
+        op = np.random.randint(3)
+        if op == 0:
+          x = inplace_ops.inplace_update(x, idx, val)
+          y[idx] = val
+        elif op == 1:
+          x = inplace_ops.inplace_add(x, idx, val)
+          y[idx] += val
+        elif op == 2:
+          x = inplace_ops.inplace_sub(x, idx, val)
+          y[idx] -= val
+        self.assertAllClose(x.eval(), y)
+
+  def testAlias(self):
+    with self.test_session(use_gpu=True) as sess:
+      x = array_ops.ones([2, 3])
+      y = inplace_ops.alias_inplace_add(x, [0], [[1, 2, 3]])
+      with ops.control_dependencies([y]):
+        z = array_ops.identity(x)
+        _, vy, vz = sess.run([x, y, z])
+      self.assertAllClose(vy, vz)
+
+  def testError(self):
+    with self.test_session():
+      with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                   "must be a vector"):
+        _ = inplace_ops.inplace_update([[1.]], [[0]], [[10]]).eval()
+      with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                   "x and v shape doesn't match"):
+        _ = inplace_ops.inplace_update([[1.]], [0], [10]).eval()
+      with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                   "i and x shape doesn't match"):
+        _ = inplace_ops.inplace_update([[1.]], [0, 1], [[10]]).eval()
+
+  def testEmpty(self):
+    for dtype in [
+        dtypes.float32, dtypes.float64, dtypes.int32, dtypes.int64, dtypes.bool
+    ]:
+      with self.test_session(use_gpu=True):
+        test_shapes = [(), (1,), (2, 3), (0, 2), (2, 3, 5), (2, 0, 5)]
+        for shape in test_shapes:
+          val = inplace_ops.empty(shape, dtype).eval()
+          self.assertEqual(val.shape, shape)
+          self.assertEqual(val.dtype, dtype.as_numpy_dtype)
+          val = inplace_ops.empty(shape, dtype, init=True).eval()
+          self.assertEqual(val.shape, shape)
+          self.assertEqual(val.dtype, dtype.as_numpy_dtype)
+          self.assertAllEqual(val, np.zeros(shape, dtype.as_numpy_dtype))
+          val = inplace_ops.empty_like(array_ops.zeros(shape, dtype)).eval()
+          self.assertEqual(val.shape, shape)
+          self.assertEqual(val.dtype, dtype.as_numpy_dtype)
+          val = inplace_ops.empty_like(
+              array_ops.zeros(shape, dtype), init=True).eval()
+          self.assertEqual(val.shape, shape)
+          self.assertEqual(val.dtype, dtype.as_numpy_dtype)
+          self.assertAllEqual(val, np.zeros(shape, dtype.as_numpy_dtype))
+
+        val = inplace_ops.empty((1, 2), dtypes.string, init=True).eval()
+        self.assertEqual(val.tolist(), [[b"", b""]])
+
+        val = inplace_ops.empty((1, 2), dtypes.string, init=False).eval()
+        self.assertEqual(val.tolist(), [[b"", b""]])
+
+
+if __name__ == "__main__":
+  test_lib.main()
diff --git a/tensorflow/python/kernel_tests/large_concat_op_test.py b/tensorflow/python/kernel_tests/large_concat_op_test.py
index 66afb6ec014991ca32efd5b0895ff695d3d1015f..184d1dde2aa5e8d786cb85141f8dfb90c0bdad63 100644
--- a/tensorflow/python/kernel_tests/large_concat_op_test.py
+++ b/tensorflow/python/kernel_tests/large_concat_op_test.py
@@ -19,10 +19,12 @@ from __future__ import print_function
 
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 
 
+@test_util.with_c_api
 class LargeConcatOpTest(test.TestCase):
   """Tests that belong in concat_op_test.py, but run over large tensors."""
 
diff --git a/tensorflow/python/kernel_tests/linalg/BUILD b/tensorflow/python/kernel_tests/linalg/BUILD
index fd1b5bab6f5aa072c8821eb053bd8d39391be4d4..9555e510997a6aa07797dffa1a6e4810b0b4e5d2 100644
--- a/tensorflow/python/kernel_tests/linalg/BUILD
+++ b/tensorflow/python/kernel_tests/linalg/BUILD
@@ -140,15 +140,3 @@ cuda_py_test(
     ],
     shard_count = 5,
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_util_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_util_test.py
index e1edffc3d9afec618f8dfcf74bae1b0f1bde2772..7b291e29de41d2fe37257bb42222ac23fc8e1d3f 100644
--- a/tensorflow/python/kernel_tests/linalg/linear_operator_util_test.py
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_util_test.py
@@ -23,6 +23,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import linalg_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops.linalg import linear_operator_util
 from tensorflow.python.platform import test
@@ -94,8 +95,8 @@ class AssertNoEntriesWithModulusZeroTest(test.TestCase):
 class BroadcastMatrixBatchDimsTest(test.TestCase):
 
   def test_zero_batch_matrices_returned_as_empty_list(self):
-    self.assertAllEqual(
-        [], linear_operator_util.broadcast_matrix_batch_dims([]))
+    self.assertAllEqual([],
+                        linear_operator_util.broadcast_matrix_batch_dims([]))
 
   def test_one_batch_matrix_returned_after_tensor_conversion(self):
     arr = rng.rand(2, 3, 4)
@@ -194,6 +195,44 @@ class BroadcastMatrixBatchDimsTest(test.TestCase):
       linear_operator_util.broadcast_matrix_batch_dims([y, x])
 
 
+class CholeskySolveWithBroadcastTest(test.TestCase):
+
+  def test_static_dims_broadcast(self):
+    # batch_shape = [2]
+    chol = rng.rand(3, 3)
+    rhs = rng.rand(2, 3, 7)
+    chol_broadcast = chol + np.zeros((2, 1, 1))
+
+    with self.test_session():
+      result = linear_operator_util.cholesky_solve_with_broadcast(chol, rhs)
+      self.assertAllEqual((2, 3, 7), result.get_shape())
+      expected = linalg_ops.cholesky_solve(chol_broadcast, rhs)
+      self.assertAllEqual(expected.eval(), result.eval())
+
+  def test_dynamic_dims_broadcast_64bit(self):
+    # batch_shape = [2, 2]
+    chol = rng.rand(2, 3, 3)
+    rhs = rng.rand(2, 1, 3, 7)
+    chol_broadcast = chol + np.zeros((2, 2, 1, 1))
+    rhs_broadcast = rhs + np.zeros((2, 2, 1, 1))
+
+    chol_ph = array_ops.placeholder(dtypes.float64)
+    rhs_ph = array_ops.placeholder(dtypes.float64)
+
+    with self.test_session() as sess:
+      result, expected = sess.run(
+          [
+              linear_operator_util.cholesky_solve_with_broadcast(
+                  chol_ph, rhs_ph),
+              linalg_ops.cholesky_solve(chol_broadcast, rhs_broadcast)
+          ],
+          feed_dict={
+              chol_ph: chol,
+              rhs_ph: rhs,
+          })
+      self.assertAllEqual(expected, result)
+
+
 class MatmulWithBroadcastTest(test.TestCase):
 
   def test_static_dims_broadcast(self):
@@ -209,7 +248,7 @@ class MatmulWithBroadcastTest(test.TestCase):
       expected = math_ops.matmul(x, y_broadcast)
       self.assertAllEqual(expected.eval(), result.eval())
 
-  def test_dynamic_dims_broadcast_32bit(self):
+  def test_dynamic_dims_broadcast_64bit(self):
     # batch_shape = [2]
     # for each batch member, we have a 1x3 matrix times a 3x7 matrix ==> 1x7
     x = rng.rand(2, 1, 3)
@@ -221,9 +260,90 @@ class MatmulWithBroadcastTest(test.TestCase):
 
     with self.test_session() as sess:
       result, expected = sess.run(
-          [linear_operator_util.matmul_with_broadcast(x_ph, y_ph),
-           math_ops.matmul(x, y_broadcast)],
-          feed_dict={x_ph: x, y_ph: y})
+          [
+              linear_operator_util.matmul_with_broadcast(x_ph, y_ph),
+              math_ops.matmul(x, y_broadcast)
+          ],
+          feed_dict={
+              x_ph: x,
+              y_ph: y
+          })
+      self.assertAllEqual(expected, result)
+
+
+class MatrixSolveWithBroadcastTest(test.TestCase):
+
+  def test_static_dims_broadcast(self):
+    # batch_shape = [2]
+    matrix = rng.rand(3, 3)
+    rhs = rng.rand(2, 3, 7)
+    matrix_broadcast = matrix + np.zeros((2, 1, 1))
+
+    with self.test_session():
+      result = linear_operator_util.matrix_solve_with_broadcast(matrix, rhs)
+      self.assertAllEqual((2, 3, 7), result.get_shape())
+      expected = linalg_ops.matrix_solve(matrix_broadcast, rhs)
+      self.assertAllEqual(expected.eval(), result.eval())
+
+  def test_dynamic_dims_broadcast_64bit(self):
+    # batch_shape = [2, 2]
+    matrix = rng.rand(2, 3, 3)
+    rhs = rng.rand(2, 1, 3, 7)
+    matrix_broadcast = matrix + np.zeros((2, 2, 1, 1))
+    rhs_broadcast = rhs + np.zeros((2, 2, 1, 1))
+
+    matrix_ph = array_ops.placeholder(dtypes.float64)
+    rhs_ph = array_ops.placeholder(dtypes.float64)
+
+    with self.test_session() as sess:
+      result, expected = sess.run(
+          [
+              linear_operator_util.matrix_solve_with_broadcast(
+                  matrix_ph, rhs_ph),
+              linalg_ops.matrix_solve(matrix_broadcast, rhs_broadcast)
+          ],
+          feed_dict={
+              matrix_ph: matrix,
+              rhs_ph: rhs,
+          })
+      self.assertAllEqual(expected, result)
+
+
+class MatrixTriangularSolveWithBroadcastTest(test.TestCase):
+
+  def test_static_dims_broadcast(self):
+    # batch_shape = [2]
+    matrix = rng.rand(2, 3, 3)
+    rhs = rng.rand(3, 7)
+    rhs_broadcast = rhs + np.zeros((2, 1, 1))
+
+    with self.test_session():
+      result = linear_operator_util.matrix_triangular_solve_with_broadcast(
+          matrix, rhs)
+      self.assertAllEqual((2, 3, 7), result.get_shape())
+      expected = linalg_ops.matrix_triangular_solve(matrix, rhs_broadcast)
+      self.assertAllEqual(expected.eval(), result.eval())
+
+  def test_dynamic_dims_broadcast_64bit(self):
+    # batch_shape = [2]
+    matrix = rng.rand(2, 3, 3)
+    rhs = rng.rand(3, 7)
+    rhs_broadcast = rhs + np.zeros((2, 1, 1))
+
+    matrix_ph = array_ops.placeholder(dtypes.float64)
+    rhs_ph = array_ops.placeholder(dtypes.float64)
+
+    with self.test_session() as sess:
+      result, expected = sess.run(
+          [
+              linear_operator_util.matrix_triangular_solve_with_broadcast(
+                  matrix_ph, rhs_ph),
+              linalg_ops.matrix_triangular_solve(matrix, rhs_broadcast)
+          ],
+          feed_dict={
+              matrix_ph: matrix,
+              rhs_ph: rhs,
+          })
       self.assertAllEqual(expected, result)
 
 
@@ -244,7 +364,7 @@ class AssertCompatibleMatrixDimensionsTest(test.TestCase):
       operator = DomainDimensionStubOperator(3)
       # Should not raise
       linear_operator_util.assert_compatible_matrix_dimensions(
-          operator, x).run()
+          operator, x).run()  # pyformat: disable
 
   def test_incompatible_dimensions_raise(self):
     with self.test_session():
@@ -252,7 +372,7 @@ class AssertCompatibleMatrixDimensionsTest(test.TestCase):
       operator = DomainDimensionStubOperator(3)
       with self.assertRaisesOpError("Incompatible matrix dimensions"):
         linear_operator_util.assert_compatible_matrix_dimensions(
-            operator, x).run()
+            operator, x).run()  # pyformat: disable
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/kernel_tests/list_ops_test.py b/tensorflow/python/kernel_tests/list_ops_test.py
index dbbed39c727f01ed1fae271375575c690958c7d8..6173a1def3365f455e9fd5066a2b40620ec50a93 100644
--- a/tensorflow/python/kernel_tests/list_ops_test.py
+++ b/tensorflow/python/kernel_tests/list_ops_test.py
@@ -33,6 +33,8 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import list_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import variable_scope as vs
 from tensorflow.python.platform import test
 from tensorflow.python.training import server_lib
 
@@ -43,71 +45,83 @@ def scalar_shape():
 
 class ListOpsTest(test_util.TensorFlowTestCase):
 
+  @test_util.run_in_graph_and_eager_modes()
   def testPushPop(self):
     l = list_ops.empty_tensor_list(element_dtype=dtypes.float32,
                                    element_shape=scalar_shape())
     l = list_ops.tensor_list_push_back(l, constant_op.constant(1.0))
     l, e = list_ops.tensor_list_pop_back(l, element_dtype=dtypes.float32)
-    self.assertAllEqual(e, 1.0)
+    self.assertAllEqual(self.evaluate(e), 1.0)
 
+  @test_util.run_in_graph_and_eager_modes()
   def testPushPopGPU(self):
     if not context.num_gpus():
       return
     with context.device("gpu:0"):
       self.testPushPop()
 
+  @test_util.run_in_graph_and_eager_modes()
   def testStack(self):
     l = list_ops.empty_tensor_list(element_dtype=dtypes.float32,
                                    element_shape=scalar_shape())
     l = list_ops.tensor_list_push_back(l, constant_op.constant(1.0))
     l = list_ops.tensor_list_push_back(l, constant_op.constant(2.0))
     t = list_ops.tensor_list_stack(l, element_dtype=dtypes.float32)
-    self.assertAllEqual(t, [1.0, 2.0])
+    self.assertAllEqual(self.evaluate(t), [1.0, 2.0])
 
+  @test_util.run_in_graph_and_eager_modes()
   def testStackGPU(self):
     if not context.num_gpus():
       return
     with context.device("gpu:0"):
       self.testStack()
 
+  @test_util.run_in_graph_and_eager_modes()
   def testTensorListFromTensor(self):
     t = constant_op.constant([1.0, 2.0])
     l = list_ops.tensor_list_from_tensor(t, element_shape=scalar_shape())
     l, e = list_ops.tensor_list_pop_back(l, element_dtype=dtypes.float32)
-    self.assertAllEqual(e, 2.0)
+    self.assertAllEqual(self.evaluate(e), 2.0)
     l, e = list_ops.tensor_list_pop_back(l, element_dtype=dtypes.float32)
-    self.assertAllEqual(e, 1.0)
-    self.assertAllEqual(list_ops.tensor_list_length(l), 0)
+    self.assertAllEqual(self.evaluate(e), 1.0)
+    self.assertAllEqual(self.evaluate(list_ops.tensor_list_length(l)), 0)
 
+  @test_util.run_in_graph_and_eager_modes()
   def testFromTensorGPU(self):
     if not context.num_gpus():
       return
     with context.device("gpu:0"):
       self.testTensorListFromTensor()
 
+  @test_util.run_in_graph_and_eager_modes()
   def testGetSetItem(self):
     t = constant_op.constant([1.0, 2.0])
     l = list_ops.tensor_list_from_tensor(t, element_shape=scalar_shape())
     e0 = list_ops.tensor_list_get_item(l, 0, element_dtype=dtypes.float32)
-    self.assertAllEqual(e0, 1.0)
+    self.assertAllEqual(self.evaluate(e0), 1.0)
     l = list_ops.tensor_list_set_item(l, 0, 3.0)
     t = list_ops.tensor_list_stack(l, element_dtype=dtypes.float32)
-    self.assertAllEqual(t, [3.0, 2.0])
+    self.assertAllEqual(self.evaluate(t), [3.0, 2.0])
 
+  @test_util.run_in_graph_and_eager_modes()
   def testGetSetGPU(self):
     if not context.num_gpus():
       return
     with context.device("gpu:0"):
       self.testGetSetItem()
 
+  @test_util.run_in_graph_and_eager_modes()
   def testUnknownShape(self):
-    l = list_ops.empty_tensor_list(element_dtype=dtypes.float32,
-                                   element_shape=-1)
+    l = list_ops.empty_tensor_list(
+        element_dtype=dtypes.float32, element_shape=-1)
     l = list_ops.tensor_list_push_back(l, constant_op.constant(1.0))
     l = list_ops.tensor_list_push_back(l, constant_op.constant([1.0, 2.0]))
-    _, e = list_ops.tensor_list_pop_back(l, element_dtype=dtypes.float32)
-    self.assertAllEqual(e, [1.0, 2.0])
+    l, e = list_ops.tensor_list_pop_back(l, element_dtype=dtypes.float32)
+    self.assertAllEqual(self.evaluate(e), [1.0, 2.0])
+    l, e = list_ops.tensor_list_pop_back(l, element_dtype=dtypes.float32)
+    self.assertAllEqual(self.evaluate(e), 1.0)
 
+  @test_util.run_in_graph_and_eager_modes()
   def testCPUGPUCopy(self):
     if not context.num_gpus():
       return
@@ -116,15 +130,16 @@ class ListOpsTest(test_util.TensorFlowTestCase):
     with context.device("gpu:0"):
       l_gpu = array_ops.identity(l)
       self.assertAllEqual(
-          list_ops.tensor_list_pop_back(
-              l_gpu, element_dtype=dtypes.float32)[1],
-          2.0)
+          self.evaluate(
+              list_ops.tensor_list_pop_back(
+                  l_gpu, element_dtype=dtypes.float32)[1]), 2.0)
     l_cpu = array_ops.identity(l_gpu)
     self.assertAllEqual(
-        list_ops.tensor_list_pop_back(
-            l_cpu, element_dtype=dtypes.float32)[1],
-        2.0)
+        self.evaluate(
+            list_ops.tensor_list_pop_back(
+                l_cpu, element_dtype=dtypes.float32)[1]), 2.0)
 
+  @test_util.run_in_graph_and_eager_modes()
   def testGraphStack(self):
     with context.graph_mode(), self.test_session():
       tl = list_ops.empty_tensor_list(
@@ -132,9 +147,11 @@ class ListOpsTest(test_util.TensorFlowTestCase):
           element_dtype=dtypes.int32)
       tl = list_ops.tensor_list_push_back(tl, [1])
       self.assertAllEqual(
-          list_ops.tensor_list_stack(tl, element_dtype=dtypes.int32).eval(),
+          self.evaluate(
+              list_ops.tensor_list_stack(tl, element_dtype=dtypes.int32)),
           [[1]])
 
+  @test_util.run_in_graph_and_eager_modes()
   def testGraphStackInLoop(self):
     with context.graph_mode(), self.test_session():
       t1 = list_ops.empty_tensor_list(
@@ -149,9 +166,10 @@ class ListOpsTest(test_util.TensorFlowTestCase):
 
       i, t1 = control_flow_ops.while_loop(lambda i, t1: math_ops.less(i, 4),
                                           body, [i, t1])
-      s1 = list_ops.tensor_list_stack(t1, element_dtype=dtypes.int32).eval()
-      self.assertAllEqual(s1, [0, 1, 2, 3])
+      s1 = list_ops.tensor_list_stack(t1, element_dtype=dtypes.int32)
+      self.assertAllEqual(self.evaluate(s1), [0, 1, 2, 3])
 
+  @test_util.run_in_graph_and_eager_modes()
   def testGraphStackSwitchDtype(self):
     with context.graph_mode(), self.test_session():
       list_ = list_ops.empty_tensor_list(
@@ -169,11 +187,11 @@ class ListOpsTest(test_util.TensorFlowTestCase):
       for _ in range(2):
         list_, m = body(list_, m)
 
-      s1 = list_ops.tensor_list_stack(
-          list_, element_dtype=dtypes.float32).eval()
+      s1 = list_ops.tensor_list_stack(list_, element_dtype=dtypes.float32)
       np_s1 = np.array([[1, 2, 3], [1, 2, 3]], dtype=np.float32)
-      self.assertAllEqual(s1, np_s1)
+      self.assertAllEqual(self.evaluate(s1), np_s1)
 
+  @test_util.run_in_graph_and_eager_modes()
   def testGraphStackInLoopSwitchDtype(self):
     with context.graph_mode(), self.test_session():
       t1 = list_ops.empty_tensor_list(
@@ -193,10 +211,11 @@ class ListOpsTest(test_util.TensorFlowTestCase):
 
       i, m, t1 = control_flow_ops.while_loop(
           lambda i, m, t1: math_ops.less(i, 4), body, [i, m, t1])
-      s1 = list_ops.tensor_list_stack(t1, element_dtype=dtypes.float32).eval()
+      s1 = list_ops.tensor_list_stack(t1, element_dtype=dtypes.float32)
       np_s1 = np.vstack([np.arange(1, 4) * i for i in range(4)])
-      self.assertAllEqual(s1, np_s1)
+      self.assertAllEqual(self.evaluate(s1), np_s1)
 
+  @test_util.run_in_graph_and_eager_modes()
   def testSerialize(self):
     # pylint: disable=g-import-not-at-top
     try:
@@ -226,8 +245,9 @@ class ListOpsTest(test_util.TensorFlowTestCase):
               l_ps, element_dtype=dtypes.float32)
         with ops.device("/job:worker"):
           worker_e = array_ops.identity(e)
-        self.assertAllEqual(worker_e.eval(), [2.0])
+        self.assertAllEqual(self.evaluate(worker_e), [2.0])
 
+  @test_util.run_in_graph_and_eager_modes()
   def testPushPopGradients(self):
     with backprop.GradientTape() as tape:
       l = list_ops.empty_tensor_list(element_dtype=dtypes.float32,
@@ -237,18 +257,21 @@ class ListOpsTest(test_util.TensorFlowTestCase):
       l = list_ops.tensor_list_push_back(l, c)
       l, e = list_ops.tensor_list_pop_back(l, element_dtype=dtypes.float32)
       e = 2 * e
-    self.assertAllEqual(tape.gradient(e, [c])[0], 2.0)
+    self.assertAllEqual(self.evaluate(tape.gradient(e, [c])[0]), 2.0)
 
+  @test_util.run_in_graph_and_eager_modes()
   def testStackFromTensorGradients(self):
     with backprop.GradientTape() as tape:
       c = constant_op.constant([1.0, 2.0])
       tape.watch(c)
       l = list_ops.tensor_list_from_tensor(c, element_shape=scalar_shape())
       c2 = list_ops.tensor_list_stack(
-          l, element_dtype=dtypes.float32)
+          l, element_dtype=dtypes.float32, num_elements=2)
       result = c2 * 2.0
-    self.assertAllEqual(tape.gradient(result, [c])[0], [2.0, 2.0])
+    grad = tape.gradient(result, [c])[0]
+    self.assertAllEqual(self.evaluate(grad), [2.0, 2.0])
 
+  @test_util.run_in_graph_and_eager_modes()
   def testGetSetGradients(self):
     with backprop.GradientTape() as tape:
       c = constant_op.constant([1.0, 2.0])
@@ -261,16 +284,40 @@ class ListOpsTest(test_util.TensorFlowTestCase):
       ee = list_ops.tensor_list_get_item(l, 1, element_dtype=dtypes.float32)
       y = e * e + ee * ee
     grad_c, grad_c2 = tape.gradient(y, [c, c2])
-    self.assertAllEqual(grad_c, [0.0, 4.0])
-    self.assertAllEqual(grad_c2, 6.0)
+    self.assertAllEqual(self.evaluate(grad_c), [0.0, 4.0])
+    self.assertAllEqual(self.evaluate(grad_c2), 6.0)
 
+  @test_util.run_in_graph_and_eager_modes()
   def testSetOutOfBounds(self):
     c = constant_op.constant([1.0, 2.0])
     l = list_ops.tensor_list_from_tensor(c, element_shape=scalar_shape())
     with self.assertRaises(errors.InvalidArgumentError):
-      list_ops.tensor_list_set_item(l, 20, 3.0)
+      self.evaluate(list_ops.tensor_list_set_item(l, 20, 3.0))
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testResourceVariableScatterGather(self):
+    c = constant_op.constant([1.0, 2.0], dtype=dtypes.float32)
+    l = list_ops.tensor_list_from_tensor(c, element_shape=scalar_shape())
+    v = vs.get_variable("var", initializer=[l] * 10, use_resource=True)
+    v_r_0_stacked = list_ops.tensor_list_stack(v[0], dtypes.float32)
+    self.evaluate(v.initializer)
+    self.assertAllEqual([1.0, 2.0], self.evaluate(v_r_0_stacked))
+    v_r_sparse_stacked = list_ops.tensor_list_stack(
+        v.sparse_read(0), dtypes.float32)
+    self.assertAllEqual([1.0, 2.0], self.evaluate(v_r_sparse_stacked))
+    l_new_0 = list_ops.tensor_list_from_tensor(
+        [3.0, 4.0], element_shape=scalar_shape())
+    l_new_1 = list_ops.tensor_list_from_tensor(
+        [5.0, 6.0], element_shape=scalar_shape())
+    updated_v = state_ops.scatter_update(v, [3, 5], [l_new_0, l_new_1])
+    updated_v_elems = array_ops.unstack(updated_v)
+    updated_v_stacked = [
+        list_ops.tensor_list_stack(el, dtypes.float32) for el in updated_v_elems
+    ]
+    expected = ([[1.0, 2.0]] * 3 + [[3.0, 4.0], [1.0, 2.0], [5.0, 6.0]] +
+                [[1.0, 2.0]] * 4)
+    self.assertAllEqual(self.evaluate(updated_v_stacked), expected)
 
 
 if __name__ == "__main__":
-  ops.enable_eager_execution()
   test.main()
diff --git a/tensorflow/python/kernel_tests/manip_ops_test.py b/tensorflow/python/kernel_tests/manip_ops_test.py
index b8200ac0cb1e4315a56181779c70da1126d8fc15..7948a475bbaad5978368f1d68372174e4b7a8ab7 100644
--- a/tensorflow/python/kernel_tests/manip_ops_test.py
+++ b/tensorflow/python/kernel_tests/manip_ops_test.py
@@ -88,6 +88,16 @@ class RollTest(test_util.TensorFlowTestCase):
         x = np.random.rand(3, 2, 1, 1).astype(t)
         self._testAll(x + 1j * x, [2, 1, 1, 0], [0, 3, 1, 2])
 
+  def testNegativeAxis(self):
+    self._testAll(np.random.randint(-100, 100, (5)).astype(np.int32), 3, -1)
+    self._testAll(np.random.randint(-100, 100, (4, 4)).astype(np.int32), 3, -2)
+    # Make sure negative axis shoudl be 0 <= axis + dims < dims
+    with self.test_session():
+      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
+                                   "is out of range"):
+        manip_ops.roll(np.random.randint(-100, 100, (4, 4)).astype(np.int32),
+                       3, -10).eval()
+
   def testRollInputMustVectorHigherRaises(self):
     tensor = 7
     shift = 1
diff --git a/tensorflow/python/kernel_tests/metrics_test.py b/tensorflow/python/kernel_tests/metrics_test.py
index ad802f7e1f72f6cbc3dda1ca98e46e6da4e5110a..55653489aff0a745c5731db4d31864aede97e954 100644
--- a/tensorflow/python/kernel_tests/metrics_test.py
+++ b/tensorflow/python/kernel_tests/metrics_test.py
@@ -1124,40 +1124,91 @@ class AUCTest(test.TestCase):
 
       self.assertAlmostEqual(0.7, auc.eval(), 5)
 
-  def testAUCPRSpecialCase(self):
+  # Regarding the AUC-PR tests: note that the preferred method when
+  # calculating AUC-PR is summation_method='careful_interpolation'.
+  def testCorrectAUCPRSpecialCase(self):
     with self.test_session() as sess:
       predictions = constant_op.constant(
           [0.1, 0.4, 0.35, 0.8], shape=(1, 4), dtype=dtypes_lib.float32)
       labels = constant_op.constant([0, 0, 1, 1], shape=(1, 4))
-      auc, update_op = metrics.auc(labels, predictions, curve='PR')
+      auc, update_op = metrics.auc(labels, predictions, curve='PR',
+                                   summation_method='careful_interpolation')
+
+      sess.run(variables.local_variables_initializer())
+      # expected ~= 0.79726744594
+      expected = 1 - math.log(1.5) / 2
+      self.assertAlmostEqual(expected, sess.run(update_op), delta=1e-3)
+      self.assertAlmostEqual(expected, auc.eval(), delta=1e-3)
+
+  def testCorrectAnotherAUCPRSpecialCase(self):
+    with self.test_session() as sess:
+      predictions = constant_op.constant(
+          [0.1, 0.4, 0.35, 0.8, 0.1, 0.135, 0.81],
+          shape=(1, 7),
+          dtype=dtypes_lib.float32)
+      labels = constant_op.constant([0, 0, 1, 0, 1, 0, 1], shape=(1, 7))
+      auc, update_op = metrics.auc(labels, predictions, curve='PR',
+                                   summation_method='careful_interpolation')
+
+      sess.run(variables.local_variables_initializer())
+      # expected ~= 0.61350593198
+      expected = (2.5 - 2 * math.log(4./3) - 0.25 * math.log(7./5)) / 3
+      self.assertAlmostEqual(expected, sess.run(update_op), delta=1e-3)
+      self.assertAlmostEqual(expected, auc.eval(), delta=1e-3)
+
+  def testThirdCorrectAUCPRSpecialCase(self):
+    with self.test_session() as sess:
+      predictions = constant_op.constant(
+          [0.0, 0.1, 0.2, 0.33, 0.3, 0.4, 0.5],
+          shape=(1, 7),
+          dtype=dtypes_lib.float32)
+      labels = constant_op.constant([0, 0, 0, 0, 1, 1, 1], shape=(1, 7))
+      auc, update_op = metrics.auc(labels, predictions, curve='PR',
+                                   summation_method='careful_interpolation')
+
+      sess.run(variables.local_variables_initializer())
+      # expected ~= 0.90410597584
+      expected = 1 - math.log(4./3) / 3
+      self.assertAlmostEqual(expected, sess.run(update_op), delta=1e-3)
+      self.assertAlmostEqual(expected, auc.eval(), delta=1e-3)
+
+  def testIncorrectAUCPRSpecialCase(self):
+    with self.test_session() as sess:
+      predictions = constant_op.constant(
+          [0.1, 0.4, 0.35, 0.8], shape=(1, 4), dtype=dtypes_lib.float32)
+      labels = constant_op.constant([0, 0, 1, 1], shape=(1, 4))
+      auc, update_op = metrics.auc(labels, predictions, curve='PR',
+                                   summation_method='trapezoidal')
 
       sess.run(variables.local_variables_initializer())
       self.assertAlmostEqual(0.79166, sess.run(update_op), delta=1e-3)
 
       self.assertAlmostEqual(0.79166, auc.eval(), delta=1e-3)
 
-  def testAnotherAUCPRSpecialCase(self):
+  def testAnotherIncorrectAUCPRSpecialCase(self):
     with self.test_session() as sess:
       predictions = constant_op.constant(
           [0.1, 0.4, 0.35, 0.8, 0.1, 0.135, 0.81],
           shape=(1, 7),
           dtype=dtypes_lib.float32)
       labels = constant_op.constant([0, 0, 1, 0, 1, 0, 1], shape=(1, 7))
-      auc, update_op = metrics.auc(labels, predictions, curve='PR')
+      auc, update_op = metrics.auc(labels, predictions, curve='PR',
+                                   summation_method='trapezoidal')
 
       sess.run(variables.local_variables_initializer())
       self.assertAlmostEqual(0.610317, sess.run(update_op), delta=1e-3)
 
       self.assertAlmostEqual(0.610317, auc.eval(), delta=1e-3)
 
-  def testThirdAUCPRSpecialCase(self):
+  def testThirdIncorrectAUCPRSpecialCase(self):
     with self.test_session() as sess:
       predictions = constant_op.constant(
           [0.0, 0.1, 0.2, 0.33, 0.3, 0.4, 0.5],
           shape=(1, 7),
           dtype=dtypes_lib.float32)
       labels = constant_op.constant([0, 0, 0, 0, 1, 1, 1], shape=(1, 7))
-      auc, update_op = metrics.auc(labels, predictions, curve='PR')
+      auc, update_op = metrics.auc(labels, predictions, curve='PR',
+                                   summation_method='trapezoidal')
 
       sess.run(variables.local_variables_initializer())
       self.assertAlmostEqual(0.90277, sess.run(update_op), delta=1e-3)
diff --git a/tensorflow/python/kernel_tests/pooling_ops_test.py b/tensorflow/python/kernel_tests/pooling_ops_test.py
index ed44a1a4d16a94d3aa75a50bf059e33326757c4d..a0c372db7d0a4e76c37c01e1ce24cd8fc9123f7a 100644
--- a/tensorflow/python/kernel_tests/pooling_ops_test.py
+++ b/tensorflow/python/kernel_tests/pooling_ops_test.py
@@ -817,9 +817,6 @@ class PoolingTest(test.TestCase):
           cpu_val, gpu_val, half_rtol=0.01, half_atol=0.01)
 
   def testMaxPoolingWithArgmax(self):
-    # MaxPoolWithArgMax is implemented only on CUDA.
-    if not test.is_gpu_available(cuda_only=True):
-      return
     tensor_input = [1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0]
     with self.test_session(use_gpu=True) as sess:
       t = constant_op.constant(tensor_input, shape=[1, 3, 3, 1])
@@ -836,9 +833,6 @@ class PoolingTest(test.TestCase):
       self.assertAllEqual(argmax.ravel(), [0, 1, 3, 5])
 
   def testMaxPoolingGradWithArgmax(self):
-    # MaxPoolWithArgMax is implemented only on CUDA.
-    if not test.is_gpu_available(cuda_only=True):
-      return
     orig_input = [1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0]
     tensor_input = [11.0, 12.0, 13.0, 14.0]
     tensor_argmax = list(np.array([0, 1, 3, 5], dtype=np.int64))
diff --git a/tensorflow/python/kernel_tests/random/BUILD b/tensorflow/python/kernel_tests/random/BUILD
index 88a4ddf7f29ec772282e7a8e2b59f144f1a968c2..acd7566eec8e3fffd74db33234b03a0c87427a3e 100644
--- a/tensorflow/python/kernel_tests/random/BUILD
+++ b/tensorflow/python/kernel_tests/random/BUILD
@@ -121,15 +121,3 @@ cuda_py_test(
         "//tensorflow/python:random_ops",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/python/kernel_tests/random/random_ops_test.py b/tensorflow/python/kernel_tests/random/random_ops_test.py
index df37dd98ece57ae7c3835ab63b720b29fc19c975..e4b5c3832a2252aedc8820a650b022cd30b7f285 100644
--- a/tensorflow/python/kernel_tests/random/random_ops_test.py
+++ b/tensorflow/python/kernel_tests/random/random_ops_test.py
@@ -228,6 +228,17 @@ class RandomUniformTest(test.TestCase):
         print("count = ", count)
       self.assertTrue(count < count_limit)
 
+  def testUniformIntsWithInvalidShape(self):
+    for dtype in dtypes.int32, dtypes.int64:
+      with self.assertRaisesRegexp(
+          ValueError, "Shape must be rank 0 but is rank 1"):
+        random_ops.random_uniform(
+            [1000], minval=[1, 2], maxval=3, dtype=dtype)
+      with self.assertRaisesRegexp(
+          ValueError, "Shape must be rank 0 but is rank 1"):
+        random_ops.random_uniform(
+            [1000], minval=1, maxval=[2, 3], dtype=dtype)
+
   # Check that uniform ints actually follow a uniform distribution.
   def testUniformInts(self):
     minv = -2
diff --git a/tensorflow/python/kernel_tests/resource_variable_ops_test.py b/tensorflow/python/kernel_tests/resource_variable_ops_test.py
index 742564f9bf671bc0da87c8b6d8e3ee6ed0ef2549..6d330869362b4703b177cbb18666bdad1d9e90d0 100644
--- a/tensorflow/python/kernel_tests/resource_variable_ops_test.py
+++ b/tensorflow/python/kernel_tests/resource_variable_ops_test.py
@@ -87,6 +87,7 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
     with context.eager_mode():
       handle = resource_variable_ops.var_handle_op(
           dtype=dtypes.int32, shape=[1], name="foo")
+      resource_variable_ops.assign_variable_op(handle, 1)
       with self.assertRaisesRegexp(errors.InvalidArgumentError,
                                    "Trying to read variable with wrong dtype. "
                                    "Expected float got int32."):
@@ -173,215 +174,161 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
         resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32))
     self.assertEqual(read, 2)
 
-  @test_util.run_in_graph_and_eager_modes(use_gpu=True)
+  @test_util.run_in_graph_and_eager_modes()
   def testScatterAdd(self):
-    with ops.device("cpu:0"):
-      handle = resource_variable_ops.var_handle_op(
-          dtype=dtypes.int32, shape=[1, 1])
-      self.evaluate(resource_variable_ops.assign_variable_op(
-          handle, constant_op.constant([[1]], dtype=dtypes.int32)))
-      self.evaluate(resource_variable_ops.resource_scatter_add(
-          handle, [0], constant_op.constant([[2]], dtype=dtypes.int32)))
-      read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
-      self.assertEqual(self.evaluate(read), [[3]])
-
-  @test_util.run_in_graph_and_eager_modes(use_gpu=True)
+    handle = resource_variable_ops.var_handle_op(
+        dtype=dtypes.int32, shape=[1, 1])
+    self.evaluate(
+        resource_variable_ops.assign_variable_op(
+            handle, constant_op.constant([[1]], dtype=dtypes.int32)))
+    self.evaluate(
+        resource_variable_ops.resource_scatter_add(
+            handle, [0], constant_op.constant([[2]], dtype=dtypes.int32)))
+    read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
+    self.assertEqual(self.evaluate(read), [[3]])
+
+  @test_util.run_in_graph_and_eager_modes()
   def testScatterSub(self):
-    with ops.device("cpu:0"):
-      handle = resource_variable_ops.var_handle_op(
-          dtype=dtypes.int32, shape=[1, 1])
-      self.evaluate(
-          resource_variable_ops.assign_variable_op(handle,
-                                                   constant_op.constant(
-                                                       [[1]],
-                                                       dtype=dtypes.int32)))
-      self.evaluate(
-          resource_variable_ops.resource_scatter_sub(handle, [0],
-                                                     constant_op.constant(
-                                                         [[2]],
-                                                         dtype=dtypes.int32)))
-      read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
-      self.assertEqual(self.evaluate(read), [[-1]])
-
-  @test_util.run_in_graph_and_eager_modes(use_gpu=True)
+    handle = resource_variable_ops.var_handle_op(
+        dtype=dtypes.int32, shape=[1, 1])
+    self.evaluate(
+        resource_variable_ops.assign_variable_op(
+            handle, constant_op.constant([[1]], dtype=dtypes.int32)))
+    self.evaluate(
+        resource_variable_ops.resource_scatter_sub(
+            handle, [0], constant_op.constant([[2]], dtype=dtypes.int32)))
+    read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
+    self.assertEqual(self.evaluate(read), [[-1]])
+
+  @test_util.run_in_graph_and_eager_modes()
   def testScatterMul(self):
-    with ops.device("cpu:0"):
-      handle = resource_variable_ops.var_handle_op(
-          dtype=dtypes.int32, shape=[1, 1])
-      self.evaluate(
-          resource_variable_ops.assign_variable_op(handle,
-                                                   constant_op.constant(
-                                                       [[1]],
-                                                       dtype=dtypes.int32)))
-      self.evaluate(
-          resource_variable_ops.resource_scatter_mul(handle, [0],
-                                                     constant_op.constant(
-                                                         [[5]],
-                                                         dtype=dtypes.int32)))
-      read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
-      self.assertEqual(self.evaluate(read), [[5]])
-
-  @test_util.run_in_graph_and_eager_modes(use_gpu=True)
+    handle = resource_variable_ops.var_handle_op(
+        dtype=dtypes.int32, shape=[1, 1])
+    self.evaluate(
+        resource_variable_ops.assign_variable_op(
+            handle, constant_op.constant([[1]], dtype=dtypes.int32)))
+    self.evaluate(
+        resource_variable_ops.resource_scatter_mul(
+            handle, [0], constant_op.constant([[5]], dtype=dtypes.int32)))
+    read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
+    self.assertEqual(self.evaluate(read), [[5]])
+
+  @test_util.run_in_graph_and_eager_modes()
   def testScatterDiv(self):
-    with ops.device("cpu:0"):
-      handle = resource_variable_ops.var_handle_op(
-          dtype=dtypes.int32, shape=[1, 1])
-      self.evaluate(
-          resource_variable_ops.assign_variable_op(handle,
-                                                   constant_op.constant(
-                                                       [[6]],
-                                                       dtype=dtypes.int32)))
-      self.evaluate(
-          resource_variable_ops.resource_scatter_div(handle, [0],
-                                                     constant_op.constant(
-                                                         [[3]],
-                                                         dtype=dtypes.int32)))
-      read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
-      self.assertEqual(self.evaluate(read), [[2]])
-
-  @test_util.run_in_graph_and_eager_modes(use_gpu=True)
+    handle = resource_variable_ops.var_handle_op(
+        dtype=dtypes.int32, shape=[1, 1])
+    self.evaluate(
+        resource_variable_ops.assign_variable_op(
+            handle, constant_op.constant([[6]], dtype=dtypes.int32)))
+    self.evaluate(
+        resource_variable_ops.resource_scatter_div(
+            handle, [0], constant_op.constant([[3]], dtype=dtypes.int32)))
+    read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
+    self.assertEqual(self.evaluate(read), [[2]])
+
+  @test_util.run_in_graph_and_eager_modes()
   def testScatterMin(self):
-    with ops.device("cpu:0"):
-      handle = resource_variable_ops.var_handle_op(
-          dtype=dtypes.int32, shape=[1, 1])
-      self.evaluate(
-          resource_variable_ops.assign_variable_op(handle,
-                                                   constant_op.constant(
-                                                       [[6]],
-                                                       dtype=dtypes.int32)))
-      self.evaluate(
-          resource_variable_ops.resource_scatter_min(handle, [0],
-                                                     constant_op.constant(
-                                                         [[3]],
-                                                         dtype=dtypes.int32)))
-      read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
-      self.assertEqual(self.evaluate(read), [[3]])
-
-  @test_util.run_in_graph_and_eager_modes(use_gpu=True)
+    handle = resource_variable_ops.var_handle_op(
+        dtype=dtypes.int32, shape=[1, 1])
+    self.evaluate(
+        resource_variable_ops.assign_variable_op(
+            handle, constant_op.constant([[6]], dtype=dtypes.int32)))
+    self.evaluate(
+        resource_variable_ops.resource_scatter_min(
+            handle, [0], constant_op.constant([[3]], dtype=dtypes.int32)))
+    read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
+    self.assertEqual(self.evaluate(read), [[3]])
+
+  @test_util.run_in_graph_and_eager_modes()
   def testScatterMax(self):
-    with ops.device("cpu:0"):
-      handle = resource_variable_ops.var_handle_op(
-          dtype=dtypes.int32, shape=[1, 1])
-      self.evaluate(
-          resource_variable_ops.assign_variable_op(handle,
-                                                   constant_op.constant(
-                                                       [[6]],
-                                                       dtype=dtypes.int32)))
-      self.evaluate(
-          resource_variable_ops.resource_scatter_max(handle, [0],
-                                                     constant_op.constant(
-                                                         [[3]],
-                                                         dtype=dtypes.int32)))
-      read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
-      self.assertEqual(self.evaluate(read), [[6]])
-
-  @test_util.run_in_graph_and_eager_modes(use_gpu=True)
+    handle = resource_variable_ops.var_handle_op(
+        dtype=dtypes.int32, shape=[1, 1])
+    self.evaluate(
+        resource_variable_ops.assign_variable_op(
+            handle, constant_op.constant([[6]], dtype=dtypes.int32)))
+    self.evaluate(
+        resource_variable_ops.resource_scatter_max(
+            handle, [0], constant_op.constant([[3]], dtype=dtypes.int32)))
+    read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
+    self.assertEqual(self.evaluate(read), [[6]])
+
+  @test_util.run_in_graph_and_eager_modes()
   def testScatterAddScalar(self):
-    with ops.device("cpu:0"):
-      handle = resource_variable_ops.var_handle_op(
-          dtype=dtypes.int32, shape=[1, 1])
-      self.evaluate(
-          resource_variable_ops.assign_variable_op(handle,
-                                                   constant_op.constant(
-                                                       [[1]],
-                                                       dtype=dtypes.int32)))
-      self.evaluate(
-          resource_variable_ops.resource_scatter_add(handle, [0],
-                                                     constant_op.constant(
-                                                         2,
-                                                         dtype=dtypes.int32)))
-      read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
-      self.assertEqual(self.evaluate(read), [[3]])
-
-  @test_util.run_in_graph_and_eager_modes(use_gpu=True)
+    handle = resource_variable_ops.var_handle_op(
+        dtype=dtypes.int32, shape=[1, 1])
+    self.evaluate(
+        resource_variable_ops.assign_variable_op(
+            handle, constant_op.constant([[1]], dtype=dtypes.int32)))
+    self.evaluate(
+        resource_variable_ops.resource_scatter_add(
+            handle, [0], constant_op.constant(2, dtype=dtypes.int32)))
+    read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
+    self.assertEqual(self.evaluate(read), [[3]])
+
+  @test_util.run_in_graph_and_eager_modes()
   def testScatterSubScalar(self):
-    with ops.device("cpu:0"):
-      handle = resource_variable_ops.var_handle_op(
-          dtype=dtypes.int32, shape=[1, 1])
-      self.evaluate(
-          resource_variable_ops.assign_variable_op(handle,
-                                                   constant_op.constant(
-                                                       [[1]],
-                                                       dtype=dtypes.int32)))
-      self.evaluate(
-          resource_variable_ops.resource_scatter_sub(handle, [0],
-                                                     constant_op.constant(
-                                                         2,
-                                                         dtype=dtypes.int32)))
-      read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
-      self.assertEqual(self.evaluate(read), [[-1]])
-
-  @test_util.run_in_graph_and_eager_modes(use_gpu=True)
+    handle = resource_variable_ops.var_handle_op(
+        dtype=dtypes.int32, shape=[1, 1])
+    self.evaluate(
+        resource_variable_ops.assign_variable_op(
+            handle, constant_op.constant([[1]], dtype=dtypes.int32)))
+    self.evaluate(
+        resource_variable_ops.resource_scatter_sub(
+            handle, [0], constant_op.constant(2, dtype=dtypes.int32)))
+    read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
+    self.assertEqual(self.evaluate(read), [[-1]])
+
+  @test_util.run_in_graph_and_eager_modes()
   def testScatterMulScalar(self):
-    with ops.device("cpu:0"):
-      handle = resource_variable_ops.var_handle_op(
-          dtype=dtypes.int32, shape=[1, 1])
-      self.evaluate(
-          resource_variable_ops.assign_variable_op(handle,
-                                                   constant_op.constant(
-                                                       [[1]],
-                                                       dtype=dtypes.int32)))
-      self.evaluate(
-          resource_variable_ops.resource_scatter_mul(handle, [0],
-                                                     constant_op.constant(
-                                                         5,
-                                                         dtype=dtypes.int32)))
-      read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
-      self.assertEqual(self.evaluate(read), [[5]])
-
-  @test_util.run_in_graph_and_eager_modes(use_gpu=True)
+    handle = resource_variable_ops.var_handle_op(
+        dtype=dtypes.int32, shape=[1, 1])
+    self.evaluate(
+        resource_variable_ops.assign_variable_op(
+            handle, constant_op.constant([[1]], dtype=dtypes.int32)))
+    self.evaluate(
+        resource_variable_ops.resource_scatter_mul(
+            handle, [0], constant_op.constant(5, dtype=dtypes.int32)))
+    read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
+    self.assertEqual(self.evaluate(read), [[5]])
+
+  @test_util.run_in_graph_and_eager_modes()
   def testScatterDivScalar(self):
-    with ops.device("cpu:0"):
-      handle = resource_variable_ops.var_handle_op(
-          dtype=dtypes.int32, shape=[1, 1])
-      self.evaluate(
-          resource_variable_ops.assign_variable_op(handle,
-                                                   constant_op.constant(
-                                                       [[6]],
-                                                       dtype=dtypes.int32)))
-      self.evaluate(
-          resource_variable_ops.resource_scatter_div(handle, [0],
-                                                     constant_op.constant(
-                                                         3,
-                                                         dtype=dtypes.int32)))
-      read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
-      self.assertEqual(self.evaluate(read), [[2]])
-
-  @test_util.run_in_graph_and_eager_modes(use_gpu=True)
+    handle = resource_variable_ops.var_handle_op(
+        dtype=dtypes.int32, shape=[1, 1])
+    self.evaluate(
+        resource_variable_ops.assign_variable_op(
+            handle, constant_op.constant([[6]], dtype=dtypes.int32)))
+    self.evaluate(
+        resource_variable_ops.resource_scatter_div(
+            handle, [0], constant_op.constant(3, dtype=dtypes.int32)))
+    read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
+    self.assertEqual(self.evaluate(read), [[2]])
+
+  @test_util.run_in_graph_and_eager_modes()
   def testScatterMinScalar(self):
-    with ops.device("cpu:0"):
-      handle = resource_variable_ops.var_handle_op(
-          dtype=dtypes.int32, shape=[1, 1])
-      self.evaluate(
-          resource_variable_ops.assign_variable_op(handle,
-                                                   constant_op.constant(
-                                                       [[6]],
-                                                       dtype=dtypes.int32)))
-      self.evaluate(
-          resource_variable_ops.resource_scatter_min(handle, [0],
-                                                     constant_op.constant(
-                                                         3,
-                                                         dtype=dtypes.int32)))
-      read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
-      self.assertEqual(self.evaluate(read), [[3]])
-
-  @test_util.run_in_graph_and_eager_modes(use_gpu=True)
+    handle = resource_variable_ops.var_handle_op(
+        dtype=dtypes.int32, shape=[1, 1])
+    self.evaluate(
+        resource_variable_ops.assign_variable_op(
+            handle, constant_op.constant([[6]], dtype=dtypes.int32)))
+    self.evaluate(
+        resource_variable_ops.resource_scatter_min(
+            handle, [0], constant_op.constant(3, dtype=dtypes.int32)))
+    read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
+    self.assertEqual(self.evaluate(read), [[3]])
+
+  @test_util.run_in_graph_and_eager_modes()
   def testScatterMaxScalar(self):
-    with ops.device("cpu:0"):
-      handle = resource_variable_ops.var_handle_op(
-          dtype=dtypes.int32, shape=[1, 1])
-      self.evaluate(
-          resource_variable_ops.assign_variable_op(handle,
-                                                   constant_op.constant(
-                                                       [[6]],
-                                                       dtype=dtypes.int32)))
-      self.evaluate(
-          resource_variable_ops.resource_scatter_max(handle, [0],
-                                                     constant_op.constant(
-                                                         3,
-                                                         dtype=dtypes.int32)))
-      read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
-      self.assertEqual(self.evaluate(read), [[6]])
+    handle = resource_variable_ops.var_handle_op(
+        dtype=dtypes.int32, shape=[1, 1])
+    self.evaluate(
+        resource_variable_ops.assign_variable_op(
+            handle, constant_op.constant([[6]], dtype=dtypes.int32)))
+    self.evaluate(
+        resource_variable_ops.resource_scatter_max(
+            handle, [0], constant_op.constant(3, dtype=dtypes.int32)))
+    read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
+    self.assertEqual(self.evaluate(read), [[6]])
 
   def testScatterUpdateString(self):
     handle = resource_variable_ops.var_handle_op(
@@ -801,6 +748,12 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
       state_ops.scatter_update(v, [1], [3.0])
       self.assertAllEqual([1.0, 3.0], v.numpy())
 
+  def testScatterAddStateOps(self):
+    with context.eager_mode():
+      v = resource_variable_ops.ResourceVariable([1.0, 2.0], name="add")
+      state_ops.scatter_add(v, [1], [3])
+      self.assertAllEqual([1.0, 5.0], v.numpy())
+
   def testScatterUpdateCast(self):
     with context.eager_mode():
       v = resource_variable_ops.ResourceVariable([1.0, 2.0], name="update")
diff --git a/tensorflow/python/kernel_tests/variable_scope_test.py b/tensorflow/python/kernel_tests/variable_scope_test.py
index 86ab9fbb70b5efcf06cc064617df14deb18c1f98..51aa671098905e840b7c96cd5a984887d347adf9 100644
--- a/tensorflow/python/kernel_tests/variable_scope_test.py
+++ b/tensorflow/python/kernel_tests/variable_scope_test.py
@@ -24,11 +24,13 @@ import threading
 import numpy
 
 from tensorflow.python.eager import context
+from tensorflow.python.eager import function
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
+from tensorflow.python.layers import core as core_layers
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import init_ops
@@ -118,6 +120,16 @@ class VariableScopeTest(test.TestCase):
         w = variable_scope.get_variable("w", [])
         self.assertEqual(w.dtype.base_dtype, dtypes.float16)
 
+  def testGetVariableInGraphNestedUnderEagerContext(self):
+    with context.eager_mode():
+
+      @function.defun
+      def f():
+        v = variable_scope.get_variable("should_be_resource", [])
+        self.assertEqual(type(v), resource_variable_ops.ResourceVariable)
+
+      f()
+
   def testEagerVariableStore(self):
     with context.eager_mode():
       store = variable_scope.EagerVariableStore()
@@ -156,6 +168,28 @@ class VariableScopeTest(test.TestCase):
       for v in new_store.variables():
         self.assertEqual(v.numpy(), 1)
 
+  def testEagerVariableStoreWithEagerDefun(self):
+    with context.eager_mode():
+
+      @function.defun
+      def f():
+        x = constant_op.constant([[2.0]])
+        d1 = core_layers.Dense(
+            1, name="my_dense", kernel_initializer=init_ops.ones_initializer())
+        _ = d1(x)  # create variables
+        self.assertEqual(len(d1.variables), 2)
+        v1, v2 = d1.variables
+        d2 = core_layers.Dense(
+            1,
+            name="my_dense",
+            kernel_initializer=init_ops.ones_initializer(),
+            _reuse=True)
+        _ = d2(x)
+        self.assertEqual(len(d2.variables), 2)
+        v3, v4 = d2.variables
+        self.assertAllEqual([v1, v2], [v3, v4])
+      f()
+
   @test_util.run_in_graph_and_eager_modes()
   def testInitFromNonTensorValue(self):
     v = variable_scope.get_variable("v4", initializer=4, dtype=dtypes.int32)
@@ -209,15 +243,15 @@ class VariableScopeTest(test.TestCase):
 
           with variable_scope.variable_scope("not_cached", caching_device=""):
             v2_not_cached = variable_scope.get_variable("v", [])
-            self.assertFalse(v2_not_cached.value().device.startswith(
-                caching_device))
+            self.assertFalse(
+                v2_not_cached.value().device.startswith(caching_device))
 
           with variable_scope.variable_scope(
               "not_cached_identity_device",
               caching_device=lambda op: op.device):
             v2_identity_device = variable_scope.get_variable("v", [])
-            self.assertFalse(v2_identity_device.value().device.startswith(
-                caching_device))
+            self.assertFalse(
+                v2_identity_device.value().device.startswith(caching_device))
 
           with variable_scope.variable_scope("we_will_do_it_live") as vs_live:
             vs_live.set_caching_device("/job:live")
@@ -484,15 +518,19 @@ class VariableScopeTest(test.TestCase):
 
   def testVarScopeGetOrCreateReuse(self):
     with self.test_session():
+
       def test_value(value):
         x = constant_op.constant(value)
-        with variable_scope.variable_scope("testVarScopeGetOrCreateReuse_bar",
-                                           reuse=variable_scope.AUTO_REUSE):
+        with variable_scope.variable_scope(
+            "testVarScopeGetOrCreateReuse_bar",
+            reuse=variable_scope.AUTO_REUSE):
           _ = state_ops.assign(variable_scope.get_variable("var", []), x)
-        with variable_scope.variable_scope("testVarScopeGetOrCreateReuse_bar",
-                                           reuse=variable_scope.AUTO_REUSE):
+        with variable_scope.variable_scope(
+            "testVarScopeGetOrCreateReuse_bar",
+            reuse=variable_scope.AUTO_REUSE):
           _ = variable_scope.get_variable("var", [])
         self.assertEqual(value, x.eval())
+
       test_value(42.)  # Variable is created.
       test_value(13.)  # Variable is reused hereafter.
       test_value(17.)
@@ -551,19 +589,16 @@ class VariableScopeTest(test.TestCase):
       with variable_scope.variable_scope("default") as default:
         with variable_scope.variable_scope(None, "layer"):
           self.assertEqual(
-              variable_scope.get_variable("w", []).name,
-              "default/layer/w:0")
+              variable_scope.get_variable("w", []).name, "default/layer/w:0")
         with variable_scope.variable_scope(None, "layer"):
           self.assertEqual(
-              variable_scope.get_variable("w", []).name,
-              "default/layer_1/w:0")
+              variable_scope.get_variable("w", []).name, "default/layer_1/w:0")
         with variable_scope.variable_scope(default):
           pass
         # No matter the jump in the middle, unique numbering continues.
         with variable_scope.variable_scope(None, "layer"):
           self.assertEqual(
-              variable_scope.get_variable("w", []).name,
-              "default/layer_2/w:0")
+              variable_scope.get_variable("w", []).name, "default/layer_2/w:0")
 
   def testVarOpScopeReuse(self):
     with self.test_session():
@@ -935,12 +970,12 @@ class VariableScopeTest(test.TestCase):
   def testGetCollection(self):
     with self.test_session():
       _ = variable_scope.get_variable("testGetCollection_a", [])
-      _ = variable_scope.get_variable("testGetCollection_b", [],
-                                      trainable=False)
+      _ = variable_scope.get_variable(
+          "testGetCollection_b", [], trainable=False)
       with variable_scope.variable_scope("testGetCollection_foo_") as scope1:
         _ = variable_scope.get_variable("testGetCollection_a", [])
-        _ = variable_scope.get_variable("testGetCollection_b", [],
-                                        trainable=False)
+        _ = variable_scope.get_variable(
+            "testGetCollection_b", [], trainable=False)
         self.assertEqual([
             v.name
             for v in scope1.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
@@ -954,8 +989,8 @@ class VariableScopeTest(test.TestCase):
         ])
       with variable_scope.variable_scope("testGetCollection_foo") as scope2:
         _ = variable_scope.get_variable("testGetCollection_a", [])
-        _ = variable_scope.get_variable("testGetCollection_b", [],
-                                        trainable=False)
+        _ = variable_scope.get_variable(
+            "testGetCollection_b", [], trainable=False)
         self.assertEqual([
             v.name
             for v in scope2.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
@@ -992,22 +1027,22 @@ class VariableScopeTest(test.TestCase):
       with variable_scope.variable_scope(
           "testGetTrainableVariables_foo") as scope:
         _ = variable_scope.get_variable("testGetTrainableVariables_b", [])
-        _ = variable_scope.get_variable("testGetTrainableVariables_c", [],
-                                        trainable=False)
-        self.assertEqual([v.name
-                          for v in scope.trainable_variables()],
-                         ["testGetTrainableVariables_foo/"
-                          "testGetTrainableVariables_b:0"])
+        _ = variable_scope.get_variable(
+            "testGetTrainableVariables_c", [], trainable=False)
+        self.assertEqual(
+            [v.name for v in scope.trainable_variables()],
+            ["testGetTrainableVariables_foo/"
+             "testGetTrainableVariables_b:0"])
 
   def testGetGlobalVariables(self):
     with self.test_session():
       _ = variable_scope.get_variable("testGetGlobalVariables_a", [])
       with variable_scope.variable_scope("testGetGlobalVariables_foo") as scope:
         _ = variable_scope.get_variable("testGetGlobalVariables_b", [])
-        self.assertEqual([v.name
-                          for v in scope.global_variables()],
-                         ["testGetGlobalVariables_foo/"
-                          "testGetGlobalVariables_b:0"])
+        self.assertEqual(
+            [v.name for v in scope.global_variables()],
+            ["testGetGlobalVariables_foo/"
+             "testGetGlobalVariables_b:0"])
 
   def testGetLocalVariables(self):
     with self.test_session():
@@ -1016,10 +1051,8 @@ class VariableScopeTest(test.TestCase):
       with variable_scope.variable_scope("foo") as scope:
         _ = variable_scope.get_variable(
             "b", [], collections=[ops.GraphKeys.LOCAL_VARIABLES])
-        _ = variable_scope.get_variable(
-            "c", [])
-        self.assertEqual([v.name
-                          for v in scope.local_variables()], ["foo/b:0"])
+        _ = variable_scope.get_variable("c", [])
+        self.assertEqual([v.name for v in scope.local_variables()], ["foo/b:0"])
 
   def testGetVariableWithRefDtype(self):
     v = variable_scope.get_variable("v", shape=[3, 4], dtype=dtypes.float32)
@@ -1242,10 +1275,8 @@ class VariableScopeWithCustomGetterTest(test.TestCase):
       with ops.name_scope("prod_getter"):
         return g_0 * g_1
 
-    with variable_scope.variable_scope(
-        "prod_scope", custom_getter=prod_getter):
-      with variable_scope.variable_scope(
-          "sum_scope", custom_getter=sum_getter):
+    with variable_scope.variable_scope("prod_scope", custom_getter=prod_getter):
+      with variable_scope.variable_scope("sum_scope", custom_getter=sum_getter):
         with variable_scope.variable_scope(
             "inner_sum_scope", custom_getter=sum_getter):
           # take sums of sums of products
@@ -1270,9 +1301,8 @@ class VariableScopeWithCustomGetterTest(test.TestCase):
       np_vars, np_v = sess.run([true_vars, v])
       # take products of sums of products
       self.assertAllClose(
-          np_v,
-          (((np_vars[0] * np_vars[1]) + (np_vars[2] * np_vars[3]))
-           + ((np_vars[4] * np_vars[5]) + (np_vars[6] * np_vars[7]))))
+          np_v, (((np_vars[0] * np_vars[1]) + (np_vars[2] * np_vars[3])) + (
+              (np_vars[4] * np_vars[5]) + (np_vars[6] * np_vars[7]))))
 
   def testVariableCreator(self):
 
@@ -1368,7 +1398,11 @@ class VariableScopeMultithreadedTest(test.TestCase):
 
     graph = ops.get_default_graph()
     threads = [
-        threading.Thread(target=thread_fn, args=(i, graph,)) for i in range(2)]
+        threading.Thread(target=thread_fn, args=(
+            i,
+            graph,
+        )) for i in range(2)
+    ]
 
     threads[0].start()
     # Allow thread 0 to finish before starting thread 1.
diff --git a/tensorflow/python/layers/base.py b/tensorflow/python/layers/base.py
index 1e5f26a77f4c923871f780ca31dac1763ddd144c..64db49c900c21d60ba2337f920d6fa2cb9ab7b5f 100644
--- a/tensorflow/python/layers/base.py
+++ b/tensorflow/python/layers/base.py
@@ -12,148 +12,91 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # =============================================================================
-
-# pylint: disable=unused-import,g-bad-import-order
 """Contains the base Layer class, from which all layers inherit."""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import collections
 import copy
-import re
-import weakref
 
-import numpy as np
 from tensorflow.python.eager import context
 from tensorflow.python.estimator import util as estimator_util
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_shape
-from tensorflow.python.layers import utils as layers_util
-from tensorflow.python.framework import tensor_util
-from tensorflow.python.ops import array_ops
+from tensorflow.python.keras._impl.keras.engine import base_layer
 from tensorflow.python.ops import variable_scope as vs
 from tensorflow.python.ops import variables as tf_variables
-from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.training import checkpointable
 from tensorflow.python.util import nest
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export('layers.Layer')
-class Layer(checkpointable.CheckpointableBase):
-  """Base layer class.
+InputSpec = base_layer.InputSpec  # pylint: disable=invalid-name
 
-  This is the class from which all layers inherit, implementing common
-  infrastructure functionality.
 
-  A layer is a class implementing common neural networks operations, such
-  as convolution, batch norm, etc. These operations require managing variables,
-  losses, and updates, as well as applying TensorFlow ops to input tensors.
+@tf_export('layers.Layer')
+class Layer(base_layer.Layer):
+  """Base layer class.
 
-  Users will just instantiate it and then treat it as a callable.
+  It is considered legacy, and we recommend the use of `tf.keras.layers.Layer`
+  instead.
 
-  We recommend that descendants of Layer implement the following methods:
-  * `__init__()`: Save configuration in member variables
-  * `build()`: Called once from `__call__`, when we know the shapes of inputs
-    and `dtype`. Should have the calls to `add_variable()`, and then
-    call the super's `build()` (which sets `self.built = True`, which is
-    nice in case the user wants to call `build()` manually before the
-    first `__call__`).
-  * `call()`: Called in `__call__` after making sure `build()` has been called
-    once. Should actually perform the logic of applying the layer to the
-    input tensors (which should be passed in as the first argument).
+  Arguments:
+    trainable: Boolean, whether the layer's variables should be trainable.
+    name: String name of the layer.
+    dtype: Default dtype of the layer's weights (default of `None` means use the
+      type of the first input).
 
   Read-only properties:
-    `name`: The name of the layer (string).
-    `dtype`: Default dtype of the layer (default of `None` means use the
+    name: The name of the layer (string).
+    dtype: Default dtype of the layer's weights (default of `None` means use the
       type of the first input).
-    `trainable_variables`: List of trainable variables.
-    `non_trainable_variables`: List of non-trainable variables.
-    `variables`: List of all variables of this layer, trainable and
+    trainable_variables: List of trainable variables.
+    non_trainable_variables: List of non-trainable variables.
+    variables: List of all variables of this layer, trainable and
       non-trainable.
-    `updates`: List of update ops of this layer.
-    `losses`: List of losses added by this layer.
+    updates: List of update ops of this layer.
+    losses: List of losses added by this layer.
+    trainable_weights: List of variables to be included in backprop.
+    non_trainable_weights: List of variables that should not be
+      included in backprop.
+    weights: The concatenation of the lists trainable_weights and
+      non_trainable_weights (in this order).
 
   Mutable properties:
-    `trainable`: Whether the layer should be trained (boolean).
-    `input_spec`: Optional (list of) `InputSpec` object(s) specifying the
+    trainable: Whether the layer should be trained (boolean).
+    input_spec: Optional (list of) `InputSpec` object(s) specifying the
       constraints on inputs that can be accepted by the layer.
   """
 
   def __init__(self, trainable=True, name=None, dtype=None,
-               activity_regularizer=None, **kwargs):
-    # We use a kwargs dict here because these kwargs only exist
-    # for compatibility reasons.
-    # The list of kwargs is subject to changes in the future.
-    # We do not want to commit to it or to expose the list to users at all.
-    # Note this is exactly as safe as defining kwargs in the function signature,
-    # the only difference being that the list of valid kwargs is defined
-    # below rather rather in the signature, and default values are defined
-    # in calls to kwargs.get().
-    allowed_kwargs = {
-        '_scope',
-        '_reuse',
-        'input_shape',  # For compatibility with Keras `Sequential` model.
-        'batch_size',  # For compatibility with Keras `Sequential` model.
-    }
-    for kwarg in kwargs:
-      if kwarg not in allowed_kwargs:
-        raise TypeError('Keyword argument not understood:', kwarg)
-
-    # Mutable properties
-    # Indicates whether the layer's weights are updated during training
-    # and whether the layer's updates are run during training
-    self.trainable = trainable
-    # A stateful layer is a layer whose updates are run during inference too,
-    # for instance stateful RNNs.
-    self.stateful = False
-    # Indicates whether `build` needs to be called upon layer call, to create
-    # the layer's weights.
-    self.built = False
-    # Provides information about which inputs are compatible with the layer.
-    self.input_spec = None
-
-    if activity_regularizer and context.executing_eagerly():
-      raise ValueError(
-          ('Activity regularization is not supported when executing eagerly. '
-           'Got activity_regularizer=%s') % (activity_regularizer,))
-    self._activity_regularizer = activity_regularizer
+               **kwargs):
+    # For backwards compatibility, legacy layers do not use `ResourceVariable`
+    # by default.
+    self._use_resource_variables = False
+    scope = kwargs.pop('_scope', None)
+    self._reuse = kwargs.pop('_reuse', None)
+
+    # Avoid an incorrect lint error
     self._trainable_weights = []
-    self._non_trainable_weights = []
-    self._updates = []
-    # When executing eagerly, _losses is a list of zero-argument lambdas which
-    # return tensors. When using graph execution, _losses is a list of ops.
-    self._losses = []
-    self._reuse = kwargs.get('_reuse')
-    self._graph = None  # Will be set at build time.
-    self._dtype = None if dtype is None else dtypes.as_dtype(dtype).name
-    self._call_fn_args = estimator_util.fn_args(self.call)
-    self._compute_previous_mask = ('mask' in self._call_fn_args or
-                                   hasattr(self, 'compute_mask'))
-    self._call_has_scope_arg = 'scope' in self._call_fn_args
-
-    # These lists will be filled via successive calls
-    # to self._add_inbound_node().
-    self._inbound_nodes = []
-    self._outbound_nodes = []
+    self.built = False
 
-    self._init_set_name(name)
+    super(Layer, self).__init__(trainable=trainable, name=name, dtype=dtype,
+                                **kwargs)
 
-    # Determine variable scope.
-    scope = kwargs.get('_scope')
+    self._graph = None
+    self._call_has_scope_arg = 'scope' in self._call_fn_args
     if scope:
       with vs.variable_scope(scope) as captured_scope:
         self._scope = captured_scope
     else:
       self._scope = None
+    self._current_scope = None
 
-    # Set `_batch_input_shape` attribute
-    # for compatibility with Keras `Sequential` model.
-    if 'input_shape' in kwargs:
-      batch_size = kwargs.get('batch_size')
-      self._batch_input_shape = (batch_size,) + tuple(kwargs['input_shape'])
+  @property
+  def graph(self):
+    if context.executing_eagerly():
+      raise RuntimeError('Layer.graph not supported when executing eagerly.')
+    return self._graph
 
   def _init_set_name(self, name):
     # Determine layer name (non-unique).
@@ -166,18 +109,15 @@ class Layer(checkpointable.CheckpointableBase):
       self._name, base_name = self._make_unique_name()
     self._base_name = base_name
 
-  @property
-  def dtype(self):
-    return self._dtype
-
-  @property
-  def name(self):
-    return self._name
-
-  @property
-  def activity_regularizer(self):
-    """Optional regularizer function for the output of this layer."""
-    return self._activity_regularizer
+  def _make_unique_name(self, name_uid_map=None, avoid_names=None,
+                        namespace='', zero_based=False):
+    base_name = base_layer.to_snake_case(self.__class__.__name__)
+    name = base_layer.unique_layer_name(base_name,
+                                        name_uid_map=name_uid_map,
+                                        avoid_names=avoid_names,
+                                        namespace=namespace,
+                                        zero_based=zero_based)
+    return (name, base_name)
 
   @property
   def scope_name(self):
@@ -189,271 +129,16 @@ class Layer(checkpointable.CheckpointableBase):
                        'querying `scope_name`.')
     return self._scope.name
 
-  @property
-  def trainable_weights(self):
-    return self._trainable_weights if self.trainable else []
-
-  @property
-  def non_trainable_weights(self):
-    if self.trainable:
-      return self._non_trainable_weights
-    else:
-      return self._trainable_weights + self._non_trainable_weights
-
-  @property
-  def trainable_variables(self):
-    return self.trainable_weights
-
-  @property
-  def non_trainable_variables(self):
-    return self.non_trainable_weights
-
-  @property
-  def weights(self):
-    """Returns the list of all layer variables/weights.
-
-    Returns:
-      A list of variables.
-    """
-    return self.trainable_weights + self.non_trainable_weights
-
-  @property
-  def variables(self):
-    """Returns the list of all layer variables/weights.
-
-    Returns:
-      A list of variables.
-    """
-    return self.weights
-
-  @property
-  def updates(self):
-    if context.executing_eagerly():
-      raise RuntimeError('Layer.updates not supported in Eager mode.')
-    if not self.trainable and not self.stateful:
-      return []
-    return self._updates
-
-  def add_update(self, updates, inputs=None):
-    """Add update op(s), potentially dependent on layer inputs.
-
-    Weight updates (for instance, the updates of the moving mean and variance
-    in a BatchNormalization layer) may be dependent on the inputs passed
-    when calling a layer. Hence, when reusing the same layer on
-    different inputs `a` and `b`, some entries in `layer.updates` may be
-    dependent on `a` and some on `b`. This method automatically keeps track
-    of dependencies.
-
-    The `get_updates_for` method allows to retrieve the updates relevant to a
-    specific set of inputs.
-
-    This call is ignored in Eager mode.
-
-    Arguments:
-      updates: Update op, or list/tuple of update ops.
-      inputs: If anything other than None is passed, it signals the updates
-        are conditional on some of the layer's inputs,
-        and thus they should only be run where these inputs are available.
-        This is the case for BatchNormalization updates, for instance.
-        If None, the updates will be taken into account unconditionally,
-        and you are responsible for making sure that any dependency they might
-        have is available at runtime.
-        A step counter might fall into this category.
-    """
-    if context.executing_eagerly():
-      return  # Updates already applied when in eager mode.
-
-    updates = _to_list(updates)
-    updates = [x if isinstance(x, ops.Operation)
-               else ops.convert_to_tensor(x) for x in updates]
-    self._updates += updates
-    if inputs is None:
-      for u in updates:
-        u._unconditional_update = True  # pylint: disable=protected-access
-    else:
-      for u in updates:
-        u._unconditional_update = False  # pylint: disable=protected-access
-
-  def get_updates_for(self, inputs):
-    """Retrieves updates relevant to a specific set of inputs.
-
-    Arguments:
-      inputs: Input tensor or list/tuple of input tensors.
-
-    Returns:
-      List of update ops of the layer that depend on `inputs`.
-
-    Raises:
-      RuntimeError: If called in Eager mode.
-    """
-    if context.executing_eagerly():
-      raise RuntimeError('`get_updates_for()` not supported in Eager mode.')
-
-    # Updates disabled if layer is not trainable and not explicitly stateful.
-    if not self.trainable and not self.stateful:
-      return []
-
-    if inputs is None:
-      # Requesting unconditional updates.
-      return [x for x in self.updates if x._unconditional_update]  # pylint: disable=protected-access
-
-    # Requesting input-conditional updates.
-    inputs = nest.flatten(inputs)
-    reachable = layers_util.get_reachable_from_inputs(inputs, self.updates)
-    updates = []
-    for update in self.updates:
-      if update in reachable:
-        updates.append(update)
-    return updates
-
-  @property
-  def losses(self):
-    """Losses which are associated with this `Layer`.
-
-    Note that when executing eagerly, getting this property evaluates
-    regularizers. When using graph execution, variable regularization ops have
-    already been created and are simply returned here.
-
-    Returns:
-      A list of tensors.
-    """
-    if context.executing_eagerly():
-      # _losses may only contain variable regularization losses when executing
-      # eagerly, and they have been saved as lambdas to be executed when
-      # requested.
-      return [regularizer() for regularizer in self._losses]
-    else:
-      return self._losses
-
   def add_loss(self, losses, inputs=None):
-    """Add loss tensor(s), potentially dependent on layer inputs.
-
-    Some losses (for instance, activity regularization losses) may be dependent
-    on the inputs passed when calling a layer. Hence, when reusing the same
-    layer on different inputs `a` and `b`, some entries in `layer.losses` may
-    be dependent on `a` and some on `b`. This method automatically keeps track
-    of dependencies.
-
-    The `get_losses_for` method allows to retrieve the losses relevant to a
-    specific set of inputs.
-
-    Note that `add_loss` is not supported when executing eagerly. Instead,
-    variable regularizers may be added through `add_variable`. Activity
-    regularization is not supported directly (but such losses may be returned
-    from `Layer.call()`).
-
-    Arguments:
-      losses: Loss tensor, or list/tuple of tensors.
-      inputs: If anything other than None is passed, it signals the losses
-        are conditional on some of the layer's inputs,
-        and thus they should only be run where these inputs are available.
-        This is the case for activity regularization losses, for instance.
-        If `None` is passed, the losses are assumed
-        to be unconditional, and will apply across all dataflows of the layer
-        (e.g. weight regularization losses).
-
-    Raises:
-      RuntimeError: If called in Eager mode.
-    """
-    if context.executing_eagerly():
-      # TODO(fchollet): it should be possible (and highly desirable) to support
-      # `add_loss` in eager mode. This allows great convenience and flexibility
-      # in defining custom losses on the fly (e.g. in VAEs).
-      # Simply appending the loss value to `self._losses`
-      # is the correct behavior.
-      # The only caveat is that we need to force the user to only call
-      # `add_loss` from inside a model or Layer's `call` method
-      # (otherwise the loss computation cannot be backproped through).
-      raise RuntimeError('Layer.add_loss not supported in Eager mode.')
-
-    losses = _to_list(losses)
-    self._losses += losses
-    if inputs is None:
-      for loss in losses:
-        loss._unconditional_loss = True  # pylint: disable=protected-access
-    else:
-      for loss in losses:
-        loss._unconditional_loss = False  # pylint: disable=protected-access
+    previous_losses_length = len(self._losses)
+    super(Layer, self).add_loss(losses, inputs=inputs)
     # TODO(fchollet): deprecate collection below.
-    _add_elements_to_collection(losses, ops.GraphKeys.REGULARIZATION_LOSSES)
-
-  def get_losses_for(self, inputs):
-    """Retrieves losses relevant to a specific set of inputs.
-
-    Arguments:
-      inputs: Input tensor or list/tuple of input tensors.
-
-    Returns:
-      List of loss tensors of the layer that depend on `inputs`.
-
-    Raises:
-      RuntimeError: If called in Eager mode.
-    """
-    if context.executing_eagerly():
-      raise RuntimeError('Layer.get_losses_for not supported in Eager mode.')
-
-    if inputs is None:
-      # Requesting unconditional losses.
-      return [x for x in self.losses if x._unconditional_loss]  # pylint: disable=protected-access
-
-    # Requesting input-conditional losses.
-    inputs = nest.flatten(inputs)
-    # Retrieve the set of tensors in the TF graph that depend on `inputs`.
-    # The losses we want to return will be part of this set.
-    # To avoid unnecessary work, we stop the search in case all of
-    # `self.losses` have been retrieved.
-    reachable = layers_util.get_reachable_from_inputs(inputs, self.losses)
-    losses = []
-    for loss in self.losses:
-      if loss in reachable:
-        losses.append(loss)
-    return losses
-
-  def build(self, _):
-    """Creates the variables of the layer."""
-    self.built = True
-
-  def call(self, inputs, **kwargs):  # pylint: disable=unused-argument
-    """The logic of the layer lives here.
+    new_losses = self._losses[previous_losses_length:]
+    _add_elements_to_collection(new_losses, ops.GraphKeys.REGULARIZATION_LOSSES)
 
-    Arguments:
-      inputs: input tensor(s).
-      **kwargs: additional keyword arguments.
-
-    Returns:
-      Output tensor(s).
-    """
-    return inputs
-
-  def _name_scope_name(self, current_variable_scope):
+  def _name_scope(self):
     """Determines op naming for the Layer."""
-    return current_variable_scope.original_name_scope
-
-  def compute_output_shape(self, input_shape):
-    """Computes the output shape of the layer given the input shape.
-
-    Args:
-      input_shape: A (possibly nested tuple of) `TensorShape`.  It need not
-        be fully defined (e.g. the batch size may be unknown).
-
-    Returns:
-      A (possibly nested tuple of) `TensorShape`.
-
-    Raises:
-      TypeError: if `input_shape` is not a (possibly nested tuple of)
-        `TensorShape`.
-      ValueError: if `input_shape` is incomplete or is incompatible with the
-        the layer.
-    """
-    raise NotImplementedError
-
-  def _make_unique_name(self, name_uid_map=None, avoid_names=None,
-                        namespace='', zero_based=False):
-    base_name = _to_snake_case(self.__class__.__name__)
-    name = _unique_layer_name(base_name, name_uid_map=name_uid_map,
-                              avoid_names=avoid_names, namespace=namespace,
-                              zero_based=zero_based)
-    return (name, base_name)
+    return self._current_scope.original_name_scope
 
   def _set_scope(self, scope=None):
     if self._scope is None:
@@ -467,10 +152,11 @@ class Layer(checkpointable.CheckpointableBase):
             scope, default_name=self._base_name) as captured_scope:
           self._scope = captured_scope
 
-  def add_variable(self, name, shape, dtype=None,
-                   initializer=None, regularizer=None,
-                   trainable=True, constraint=None,
-                   partitioner=None):
+  def add_weight(self, name, shape, dtype=None,
+                 initializer=None, regularizer=None,
+                 trainable=True, constraint=None,
+                 use_resource=None,
+                 partitioner=None):
     """Adds a new variable to the layer, or gets an existing one; returns it.
 
     Arguments:
@@ -486,6 +172,7 @@ class Layer(checkpointable.CheckpointableBase):
         then this parameter is ignored and any added variables are also
         marked as non-trainable.
       constraint: constraint instance (callable).
+      use_resource: Whether to use `ResourceVariable`.
       partitioner: (optional) partitioner instance (callable).  If
         provided, when the requested variable is created it will be split
         into multiple partitions according to `partitioner`.  In this case,
@@ -504,10 +191,6 @@ class Layer(checkpointable.CheckpointableBase):
       RuntimeError: If called with partioned variable regularization and
         eager execution is enabled.
     """
-
-    # `init_graph` should point to the graph in which variable initialization
-    # will occur; it should be None if and only if initialization will take
-    # place in the eager context.
     init_graph = None
     if not context.executing_eagerly():
       default_graph = ops.get_default_graph()
@@ -530,71 +213,43 @@ class Layer(checkpointable.CheckpointableBase):
 
     self._set_scope(None)
     reuse = self.built or self._reuse
+    prev_len_trainable = len(self._trainable_weights)
     with vs.variable_scope(
         self._scope, reuse=reuse, auxiliary_name_scope=False) as scope:
-      with ops.name_scope(self._name_scope_name(scope)):
-        variable = self._add_variable_with_custom_getter(
-            name=name,
-            shape=shape,
-            getter=vs.get_variable,
-            # Manage errors in Layer rather than Checkpointable.
-            overwrite=True,
-            initializer=initializer,
+      self._current_scope = scope
+      with ops.name_scope(self._name_scope()):
+        use_resource = (use_resource or
+                        self._use_resource_variables or
+                        scope.use_resource)
+        variable = super(Layer, self).add_weight(
+            name,
+            shape,
             dtype=dtypes.as_dtype(dtype),
+            initializer=initializer or scope.initializer,
+            trainable=trainable,
             constraint=constraint,
-            trainable=trainable and self.trainable,
-            partitioner=partitioner)
-
-        if init_graph is not None:  # pylint: disable=protected-access
-          # The variable was created and initialized in a graph.
-
-          if variable in existing_variables:
-            # To match the behavior of tf.get_variable(), we only apply
-            # regularization if the variable is newly created.
-            return variable
-
+            partitioner=partitioner,
+            use_resource=use_resource,
+            getter=vs.get_variable)
+
+        if regularizer:
+          if context.executing_eagerly() or variable not in existing_variables:
+            self._handle_weight_regularization(name, variable, regularizer)
+
+        if init_graph is not None:
+          # Handle edge case where a custom getter has overridden `trainable`.
+          # There is one known occurrence of this, in unit test
+          # testBasicRNNCellNotTrainable in
+          # contrib.rnn.python.kernel_tests.core_rnn_cell_test
           with init_graph.as_default():
             trainable_variables = tf_variables.trainable_variables()
           if (trainable and self.trainable and
               variable not in trainable_variables):
             # A custom getter / variable scope overrode the trainable flag.
-            trainable = False
-
-          if regularizer:
-            if isinstance(variable, tf_variables.PartitionedVariable):
-              for v in variable:
-                with ops.colocate_with(v.op):
-                  with ops.name_scope(name + '/Regularizer'):
-                    regularization = regularizer(v)
-                if regularization is not None:
-                  self.add_loss(regularization)
-            else:
-              with ops.colocate_with(variable.op):
-                with ops.name_scope(name + '/Regularizer'):
-                  regularization = regularizer(variable)
-              if regularization is not None:
-                self.add_loss(regularization)
-        elif regularizer:  # and initialization took place in an eager context
-          if isinstance(variable, tf_variables.PartitionedVariable):
-            raise RuntimeError(
-                'Partitioned variable regularization is not yet '
-                'supported when executing eagerly. File a feature request '
-                'if this is important to you.')
-          # Save a zero-argument lambda which runs the regularizer on the
-          # variable, to be executed when `Layer.losses` is requested.
-          # This makes losses responsive to variable updates when executing
-          # eagerly.
-          #
-          # TODO(akshayka): Do the same for graphs as well, so that losses
-          # collected in a while_loop can be run outside its control flow
-          # context and so that losses won't be swallowed up by graph functions
-          # (i.e., `.losses()` should always create regularizers).
-          self._losses.append(lambda: regularizer(variable))
-
-    if trainable:
-      self._trainable_weights.append(variable)
-    else:
-      self._non_trainable_weights.append(variable)
+            extra_trainable_vars = self._trainable_weights[prev_len_trainable:]
+            self._trainable_weights = self._trainable_weights[
+                :prev_len_trainable]
+            self._non_trainable_weights += extra_trainable_vars
     return variable
 
   def __call__(self, inputs, *args, **kwargs):
@@ -622,33 +277,14 @@ class Layer(checkpointable.CheckpointableBase):
       ValueError: if the layer's `call` method returns None (an invalid value).
     """
     self._set_scope(kwargs.pop('scope', None))
-    input_list = nest.flatten(inputs)
 
-    build_graph = not context.executing_eagerly()
-    in_deferred_mode = isinstance(input_list[0], _DeferredTensor)
-    # Ensure the Layer, if being reused, is working with inputs from
-    # the same graph as where it was created.
-    if build_graph:
+    if not context.executing_eagerly():
       try:
         # Set layer's "graph" at build time
-        self._graph = ops._get_graph_from_inputs(input_list, graph=self._graph)  # pylint: disable=protected-access
+        self._graph = ops._get_graph_from_inputs(nest.flatten(inputs),  # pylint: disable=protected-access
+                                                 graph=self._graph)
       except ValueError as e:
         raise ValueError('Input graph and Layer graph are not the same: %s' % e)
-    if build_graph or in_deferred_mode:
-      user_kwargs = copy.copy(kwargs)
-
-    # Handle Keras mask propagation from previous layer to current layer.
-    previous_mask = None
-    if (not hasattr(self, '_compute_previous_mask') or
-        self._compute_previous_mask):
-      previous_mask = _collect_previous_mask(inputs)
-      if not hasattr(self, '_call_fn_args'):
-        self._call_fn_args = estimator_util.fn_args(self.call)
-      if ('mask' in self._call_fn_args and 'mask' not in kwargs and
-          not _is_all_none(previous_mask)):
-        # The previous layer generated a mask, and mask was not explicitly pass
-        # to __call__, hence we set previous_mask as the default value.
-        kwargs['mask'] = previous_mask
 
     if self.built:
       try:
@@ -665,133 +301,27 @@ class Layer(checkpointable.CheckpointableBase):
     else:
       scope_context_manager = vs.variable_scope(
           self._scope, reuse=self._reuse, auxiliary_name_scope=False)
-    input_shapes = None
-    with scope_context_manager as scope:
-      with ops.name_scope(self._name_scope_name(scope)):
-        if not self.built:
-          if not build_graph:
-            # Activity regularization is currently unsupported in Eager mode.
-            if self._activity_regularizer:
-              raise ValueError(
-                  'activity_regularizer currently unsupported with '
-                  'eager execution enabled. Found an activity_regularizer in '
-                  '%s(%s).' % (self.__class__.__name__, self))
-          if not build_graph and not in_deferred_mode:
-            # TODO(agarwal): support _keras_history in Eager mode.
-            for x in input_list:
-              if hasattr(x, '_keras_history'):
-                raise ValueError('_keras_history currently unsupported in '
-                                 'Eager mode. Found _keras_history in %s while '
-                                 'executing __call__ for %s(%s)' %
-                                 (x, self.__class_.__name__, self))
-
-          # Check input assumptions set before layer building, e.g. input rank.
-          self._assert_input_compatibility(inputs)
-          if input_list and self._dtype is None:
-            try:
-              self._dtype = input_list[0].dtype.base_dtype.name
-            except AttributeError:
-              pass
-          input_shapes = nest.map_structure(lambda x: x.get_shape(), inputs)
-          self.build(input_shapes)
-        try:
-          # Note: not all sub-classes of Layer call Layer.__init__ (especially
-          # the ones under tensorflow/python/keras). Hence we recompute this
-          # attribute here if it is not set.
-          # TODO(agarwal): Fix the sub-classes and avoid this complexity.
-          call_has_scope_arg = self._call_has_scope_arg
-        except AttributeError:
-          self._call_fn_args = estimator_util.fn_args(self.call)
-          self._call_has_scope_arg = 'scope' in self._call_fn_args
-          call_has_scope_arg = self._call_has_scope_arg
-        if call_has_scope_arg:
-          kwargs['scope'] = scope
-        # Check input assumptions set after layer building, e.g. input shape.
-        if build_graph or in_deferred_mode:
-          self._assert_input_compatibility(inputs)
-
-        if not in_deferred_mode:
-          outputs = self.call(inputs, *args, **kwargs)
-          if outputs is None:
-            raise ValueError('A layer\'s `call` method should return a Tensor '
-                             'or a list of Tensors, not None.')
-        else:
-          # Deferred mode behavior: use `compute_output_shape` to
-          # infer the number of outputs of the layer and their shapes.
-          if input_shapes is None:
-            input_shapes = nest.map_structure(lambda x: x.get_shape(), inputs)
-
-          output_shapes = self.compute_output_shape(input_shapes)
-          output_shapes = nest.flatten(output_shapes)
-          outputs = [
-              # TODO(fchollet): name the deferred tensors?
-              _DeferredTensor(shape=shape, dtype=self._dtype)
-              for shape in output_shapes
-          ]
-          if len(outputs) == 1:
-            outputs = outputs[0]
 
-        if build_graph:
-          # Apply activity regularization.
-          # Note that it should be applied every time the layer creates a new
-          # output, since it is output-specific.
-          if self._activity_regularizer:
-            output_list = nest.flatten(outputs)
-            for output in output_list:
-              with ops.name_scope('ActivityRegularizer'):
-                activity_regularization = self._activity_regularizer(output)
-              self.add_loss(activity_regularization, inputs=inputs)
+    with scope_context_manager as scope:
+      self._current_scope = scope
 
-          # TODO(fchollet): consider enabling masking for Eager mode.
-          if hasattr(self, 'compute_mask'):
-            output_mask = self.compute_mask(inputs, previous_mask)
-            if isinstance(outputs, (list, tuple)):
-              if output_mask is None:
-                output_mask = [None for _ in range(len(outputs))]
-              for x, m in zip(outputs, output_mask):
-                x._keras_mask = m  # pylint: disable=protected-access
-            else:
-              outputs._keras_mask = output_mask  # pylint: disable=protected-access
+      try:
+        call_has_scope_arg = self._call_has_scope_arg
+      except AttributeError:
+        self._call_fn_args = estimator_util.fn_args(self.call)
+        self._call_has_scope_arg = 'scope' in self._call_fn_args
+        call_has_scope_arg = self._call_has_scope_arg
+      if call_has_scope_arg:
+        kwargs['scope'] = scope
 
-    if build_graph:
-      # If all input tensors have history metadata,
-      # we update the output tensors
-      # with corresponding history metadata, thus eventually allowing to use
-      # these tensors to instantiate a Network.
-      if _have_all_keras_metadata(inputs):
-        # If the layer returns tensors from its inputs, unmodified,
-        # we copy them to avoid loss of tensor metadata.
-        output_ls = nest.flatten(outputs)
-        output_ls_copy = []
-        for x in output_ls:
-          if x in input_list:
-            with ops.name_scope(scope.original_name_scope):
-              x = array_ops.identity(x)
-          output_ls_copy.append(x)
-        if len(output_ls_copy) == 1:
-          outputs = output_ls_copy[0]
-        else:
-          outputs = output_ls_copy
+      # Actually call layer
+      outputs = super(Layer, self).__call__(inputs, *args, **kwargs)
 
+    if not context.executing_eagerly():
       # Update global default collections.
       _add_elements_to_collection(self.updates, ops.GraphKeys.UPDATE_OPS)
-
-    if in_deferred_mode or build_graph:
-      if _have_all_keras_metadata(inputs):
-        # Add an inbound node to the layer, so it can keep track of this call.
-        # This updates the layer history of the output tensor(s).
-        self._add_inbound_node(
-            input_tensors=inputs, output_tensors=outputs, arguments=user_kwargs)
-
-    self.built = True
     return outputs
 
-  @property
-  def graph(self):
-    if context.executing_eagerly():
-      raise RuntimeError('Layer.graph not supported in Eager mode.')
-    return self._graph
-
   def __deepcopy__(self, memo):
     no_copy = set(['_graph'])
     shallow_copy = set(['_scope', '_always_reuse_variable_scope'])
@@ -803,658 +333,12 @@ class Layer(checkpointable.CheckpointableBase):
         setattr(result, k, v)
       elif k in shallow_copy:
         setattr(result, k, copy.copy(v))
-      elif _is_tensor_or_tensor_list(v):
+      elif base_layer.is_tensor_or_tensor_list(v):
         setattr(result, k, v)
       else:
         setattr(result, k, copy.deepcopy(v, memo))
     return result
 
-  def apply(self, inputs, *args, **kwargs):
-    """Apply the layer on a input.
-
-    This simply wraps `self.__call__`.
-
-    Arguments:
-      inputs: Input tensor(s).
-      *args: additional positional arguments to be passed to `self.call`.
-      **kwargs: additional keyword arguments to be passed to `self.call`.
-
-    Returns:
-      Output tensor(s).
-    """
-    return self.__call__(inputs, *args, **kwargs)
-
-  def _add_inbound_node(self,
-                        input_tensors,
-                        output_tensors,
-                        arguments=None):
-    """Internal method to create an inbound node for the layer.
-
-    Arguments:
-        input_tensors: list of input tensors.
-        output_tensors: list of output tensors.
-        arguments: dictionary of keyword arguments that were passed to the
-            `call` method of the layer at the call that created the node.
-    """
-    input_tensors = nest.flatten(input_tensors)
-    output_tensors = nest.flatten(output_tensors)
-
-    # Collect input tensor(s) coordinates.
-    inbound_layers = []
-    node_indices = []
-    tensor_indices = []
-    for x in input_tensors:
-      assert hasattr(x, '_keras_history')
-      inbound_layer, node_index, tensor_index = x._keras_history  # pylint: disable=protected-access
-      inbound_layers.append(inbound_layer)
-      node_indices.append(node_index)
-      tensor_indices.append(tensor_index)
-
-    # Create node, add it to inbound nodes.
-    Node(
-        self,
-        inbound_layers=inbound_layers,
-        node_indices=node_indices,
-        tensor_indices=tensor_indices,
-        input_tensors=input_tensors,
-        output_tensors=output_tensors,
-        arguments=arguments)
-
-    # Update tensor history metadata.
-    for i in range(len(output_tensors)):
-      # The metadata attribute consists of 1) a layer instance
-      # 2) a node index for the layer, 3) a tensor index for the node.
-      # The allows layer reuse (multiple nodes per layer) and multi-output
-      # or multi-input layers (e.g. a layer can return multiple tensors,
-      # and each can be sent to a different layer).
-      output_tensors[i]._keras_history = (self, len(self._inbound_nodes) - 1, i)  # pylint: disable=protected-access
-
-  def _get_node_attribute_at_index(self, node_index, attr, attr_name):
-    """Private utility to retrieves an attribute (e.g. inputs) from a node.
-
-    This is used to implement the methods:
-        - get_input_shape_at
-        - get_output_shape_at
-        - get_input_at
-        etc...
-
-    Arguments:
-        node_index: Integer index of the node from which
-            to retrieve the attribute.
-        attr: Exact node attribute name.
-        attr_name: Human-readable attribute name, for error messages.
-
-    Returns:
-        The layer's attribute `attr` at the node of index `node_index`.
-
-    Raises:
-        RuntimeError: If the layer has no inbound nodes, or if called in Eager
-        mode.
-        ValueError: If the index provided does not match any node.
-    """
-    if not self._inbound_nodes:
-      raise RuntimeError('The layer has never been called '
-                         'and thus has no defined ' + attr_name + '.')
-    if not len(self._inbound_nodes) > node_index:
-      raise ValueError('Asked to get ' + attr_name + ' at node ' +
-                       str(node_index) + ', but the layer has only ' +
-                       str(len(self._inbound_nodes)) + ' inbound nodes.')
-    values = getattr(self._inbound_nodes[node_index], attr)
-    if len(values) == 1:
-      return values[0]
-    else:
-      return values
-
-  def get_input_shape_at(self, node_index):
-    """Retrieves the input shape(s) of a layer at a given node.
-
-    Arguments:
-        node_index: Integer, index of the node
-            from which to retrieve the attribute.
-            E.g. `node_index=0` will correspond to the
-            first time the layer was called.
-
-    Returns:
-        A shape tuple
-        (or list of shape tuples if the layer has multiple inputs).
-
-    Raises:
-      RuntimeError: If called in Eager mode.
-    """
-    return self._get_node_attribute_at_index(node_index, 'input_shapes',
-                                             'input shape')
-
-  def get_output_shape_at(self, node_index):
-    """Retrieves the output shape(s) of a layer at a given node.
-
-    Arguments:
-        node_index: Integer, index of the node
-            from which to retrieve the attribute.
-            E.g. `node_index=0` will correspond to the
-            first time the layer was called.
-
-    Returns:
-        A shape tuple
-        (or list of shape tuples if the layer has multiple outputs).
-
-    Raises:
-      RuntimeError: If called in Eager mode.
-    """
-    if context.executing_eagerly():
-      raise RuntimeError(
-          'Layer.get_output_shape_at not supported in Eager mode.')
-    return self._get_node_attribute_at_index(node_index, 'output_shapes',
-                                             'output shape')
-
-  def get_input_at(self, node_index):
-    """Retrieves the input tensor(s) of a layer at a given node.
-
-    Arguments:
-        node_index: Integer, index of the node
-            from which to retrieve the attribute.
-            E.g. `node_index=0` will correspond to the
-            first time the layer was called.
-
-    Returns:
-        A tensor (or list of tensors if the layer has multiple inputs).
-
-    Raises:
-      RuntimeError: If called in Eager mode.
-    """
-    if context.executing_eagerly():
-      raise RuntimeError('Layer.get_input_at not supported in Eager mode.')
-    return self._get_node_attribute_at_index(node_index, 'input_tensors',
-                                             'input')
-
-  def get_output_at(self, node_index):
-    """Retrieves the output tensor(s) of a layer at a given node.
-
-    Arguments:
-        node_index: Integer, index of the node
-            from which to retrieve the attribute.
-            E.g. `node_index=0` will correspond to the
-            first time the layer was called.
-
-    Returns:
-        A tensor (or list of tensors if the layer has multiple outputs).
-
-    Raises:
-      RuntimeError: If called in Eager mode.
-    """
-    return self._get_node_attribute_at_index(node_index, 'output_tensors',
-                                             'output')
-
-  @property
-  def input(self):
-    """Retrieves the input tensor(s) of a layer.
-
-    Only applicable if the layer has exactly one input,
-    i.e. if it is connected to one incoming layer.
-
-    Returns:
-        Input tensor or list of input tensors.
-
-    Raises:
-        AttributeError: if the layer is connected to
-        more than one incoming layers.
-
-    Raises:
-      RuntimeError: If called in Eager mode.
-      AttributeError: If no inbound nodes are found.
-    """
-    if not self._inbound_nodes:
-      raise AttributeError('Layer ' + self.name +
-                           ' is not connected, no input to return.')
-    return self._get_node_attribute_at_index(0, 'input_tensors', 'input')
-
-  @property
-  def output(self):
-    """Retrieves the output tensor(s) of a layer.
-
-    Only applicable if the layer has exactly one output,
-    i.e. if it is connected to one incoming layer.
-
-    Returns:
-      Output tensor or list of output tensors.
-
-    Raises:
-      AttributeError: if the layer is connected to more than one incoming
-        layers.
-      RuntimeError: if called in Eager mode.
-    """
-    if not self._inbound_nodes:
-      raise AttributeError('Layer ' + self.name + ' has no inbound nodes.')
-    return self._get_node_attribute_at_index(0, 'output_tensors', 'output')
-
-  @property
-  def input_shape(self):
-    """Retrieves the input shape(s) of a layer.
-
-    Only applicable if the layer has exactly one input,
-    i.e. if it is connected to one incoming layer, or if all inputs
-    have the same shape.
-
-    Returns:
-        Input shape, as an integer shape tuple
-        (or list of shape tuples, one tuple per input tensor).
-
-    Raises:
-        AttributeError: if the layer has no defined input_shape.
-        RuntimeError: if called in Eager mode.
-    """
-    if not self._inbound_nodes:
-      raise AttributeError('The layer has never been called '
-                           'and thus has no defined input shape.')
-    all_input_shapes = set(
-        [str(node.input_shapes) for node in self._inbound_nodes])
-    if len(all_input_shapes) == 1:
-      input_shapes = self._inbound_nodes[0].input_shapes
-      if len(input_shapes) == 1:
-        return tuple(tensor_shape.TensorShape(input_shapes[0]).as_list())
-      else:
-        return [
-            tuple(tensor_shape.TensorShape(shape).as_list())
-            for shape in input_shapes
-        ]
-    else:
-      raise AttributeError('The layer "' + str(self.name) +
-                           ' has multiple inbound nodes, '
-                           'with different input shapes. Hence '
-                           'the notion of "input shape" is '
-                           'ill-defined for the layer. '
-                           'Use `get_input_shape_at(node_index)` '
-                           'instead.')
-
-  def count_params(self):
-    """Count the total number of scalars composing the weights.
-
-    Returns:
-        An integer count.
-
-    Raises:
-        ValueError: if the layer isn't yet built
-          (in which case its weights aren't yet defined).
-    """
-    if not self.built:
-      if self.__class__.__name__ == 'Sequential':
-        self.build()  # pylint: disable=no-value-for-parameter
-      else:
-        raise ValueError('You tried to call `count_params` on ' + self.name +
-                         ', but the layer isn\'t built. '
-                         'You can build it manually via: `' + self.name +
-                         '.build(batch_input_shape)`.')
-    weight_shapes = [w.get_shape().as_list() for w in self.weights]
-    return int(sum([np.prod(w) for w in weight_shapes]))
-
-  @property
-  def output_shape(self):
-    """Retrieves the output shape(s) of a layer.
-
-    Only applicable if the layer has one output,
-    or if all outputs have the same shape.
-
-    Returns:
-        Output shape, as an integer shape tuple
-        (or list of shape tuples, one tuple per output tensor).
-
-    Raises:
-        AttributeError: if the layer has no defined output shape.
-        RuntimeError: if called in Eager mode.
-    """
-    if not self._inbound_nodes:
-      raise AttributeError('The layer has never been called '
-                           'and thus has no defined output shape.')
-    all_output_shapes = set(
-        [str(node.output_shapes) for node in self._inbound_nodes])
-    if len(all_output_shapes) == 1:
-      output_shapes = self._inbound_nodes[0].output_shapes
-      if len(output_shapes) == 1:
-        return tuple(tensor_shape.TensorShape(output_shapes[0]).as_list())
-      else:
-        return [
-            tuple(tensor_shape.TensorShape(shape).as_list())
-            for shape in output_shapes
-        ]
-    else:
-      raise AttributeError('The layer "%s"'
-                           ' has multiple inbound nodes, '
-                           'with different output shapes. Hence '
-                           'the notion of "output shape" is '
-                           'ill-defined for the layer. '
-                           'Use `get_output_shape_at(node_index)` '
-                           'instead.' % self.name)
-
-  @property
-  def inbound_nodes(self):
-    """Deprecated, do NOT use! Only for compatibility with external Keras."""
-    return self._inbound_nodes
-
-  @property
-  def outbound_nodes(self):
-    """Deprecated, do NOT use! Only for compatibility with external Keras."""
-    return self._outbound_nodes
-
-  def _assert_input_compatibility(self, inputs):
-    """Checks compatibility between the layer and provided inputs.
-
-    This checks that the tensor(s) `inputs` verify the input assumptions
-    of the layer (if any). If not, a clear and actional exception gets raised.
-
-    Arguments:
-        inputs: input tensor or list of input tensors.
-
-    Raises:
-        ValueError: in case of mismatch between
-            the provided inputs and the expectations of the layer.
-    """
-    if not self.input_spec:
-      return
-    if not isinstance(self.input_spec, (list, tuple)):
-      input_spec = nest.flatten(self.input_spec)
-    else:
-      input_spec = self.input_spec
-    inputs = nest.flatten(inputs)
-    if len(inputs) != len(input_spec):
-      raise ValueError('Layer ' + self.name + ' expects ' +
-                       str(len(input_spec)) + ' inputs, '
-                       'but it received ' + str(len(inputs)) +
-                       ' input tensors. Inputs received: ' + str(inputs))
-    for input_index, (x, spec) in enumerate(zip(inputs, input_spec)):
-      if spec is None:
-        continue
-
-      if (spec.ndim is not None or
-          spec.min_ndim is not None or
-          spec.max_ndim is not None):
-        if x.get_shape().ndims is None:
-          raise ValueError('Input ' + str(input_index) + ' of layer ' +
-                           self.name + ' is incompatible with the layer: '
-                           'its rank is undefined, but the layer requires a '
-                           'defined rank.')
-
-      # Check ndim.
-      if spec.ndim is not None:
-        ndim = x.get_shape().ndims
-        if ndim != spec.ndim:
-          raise ValueError('Input ' + str(input_index) + ' of layer ' +
-                           self.name + ' is incompatible with the layer: '
-                           'expected ndim=' + str(spec.ndim) + ', found ndim=' +
-                           str(ndim) + '. Full shape received: ' +
-                           str(x.get_shape().as_list()))
-      if spec.max_ndim is not None:
-        ndim = x.get_shape().ndims
-        if ndim is not None and ndim > spec.max_ndim:
-          raise ValueError('Input ' + str(input_index) + ' of layer ' +
-                           self.name + ' is incompatible with the layer: '
-                           'expected max_ndim=' + str(spec.max_ndim) +
-                           ', found ndim=' + str(ndim))
-      if spec.min_ndim is not None:
-        ndim = x.get_shape().ndims
-        if ndim is not None and ndim < spec.min_ndim:
-          raise ValueError('Input ' + str(input_index) + ' of layer ' +
-                           self.name + ' is incompatible with the layer: '
-                           ': expected min_ndim=' + str(spec.min_ndim) +
-                           ', found ndim=' + str(ndim) +
-                           '. Full shape received: ' +
-                           str(x.get_shape().as_list()))
-      # Check dtype.
-      if spec.dtype is not None:
-        if x.dtype != spec.dtype:
-          raise ValueError('Input ' + str(input_index) + ' of layer ' +
-                           self.name + ' is incompatible with the layer: '
-                           'expected dtype=' + str(spec.dtype) +
-                           ', found dtype=' + str(x.dtype))
-      # Check specific shape axes.
-      if spec.axes:
-        shape = x.get_shape().as_list()
-        if shape is not None:
-          for axis, value in spec.axes.items():
-            if hasattr(value, 'value'):
-              value = value.value
-            if value is not None and shape[int(axis)] not in {value, None}:
-              raise ValueError(
-                  'Input ' + str(input_index) + ' of layer ' + self.name + ' is'
-                  ' incompatible with the layer: expected axis ' + str(axis) +
-                  ' of input shape to have value ' + str(value) +
-                  ' but received input with shape ' + str(shape))
-      # Check shape.
-      if spec.shape is not None:
-        shape = x.get_shape().as_list()
-        if shape is not None:
-          for spec_dim, dim in zip(spec.shape, shape):
-            if spec_dim is not None and dim is not None:
-              if spec_dim != dim:
-                raise ValueError('Input ' + str(input_index) +
-                                 ' is incompatible with layer ' + self.name +
-                                 ': expected shape=' + str(spec.shape) +
-                                 ', found shape=' + str(shape))
-
-
-@tf_export('keras.layers.InputSpec', 'layers.InputSpec')
-class InputSpec(object):
-  """Specifies the ndim, dtype and shape of every input to a layer.
-
-  Every layer should expose (if appropriate) an `input_spec` attribute:
-  a list of instances of InputSpec (one per input tensor).
-
-  A None entry in a shape is compatible with any dimension,
-  a None shape is compatible with any shape.
-
-  Arguments:
-      dtype: Expected DataType of the input.
-      shape: Shape tuple, expected shape of the input
-          (may include None for unchecked axes).
-      ndim: Integer, expected rank of the input.
-      max_ndim: Integer, maximum rank of the input.
-      min_ndim: Integer, minimum rank of the input.
-      axes: Dictionary mapping integer axes to
-          a specific dimension value.
-  """
-
-  def __init__(self,
-               dtype=None,
-               shape=None,
-               ndim=None,
-               max_ndim=None,
-               min_ndim=None,
-               axes=None):
-    self.dtype = dtype
-    self.shape = shape
-    if shape is not None:
-      self.ndim = len(shape)
-    else:
-      self.ndim = ndim
-    self.max_ndim = max_ndim
-    self.min_ndim = min_ndim
-    self.axes = axes or {}
-
-  def __repr__(self):
-    spec = [('dtype=' + str(self.dtype)) if self.dtype else '',
-            ('shape=' + str(self.shape)) if self.shape else '',
-            ('ndim=' + str(self.ndim)) if self.ndim else '',
-            ('max_ndim=' + str(self.max_ndim)) if self.max_ndim else '',
-            ('min_ndim=' + str(self.min_ndim)) if self.min_ndim else '',
-            ('axes=' + str(self.axes)) if self.axes else '']
-    return 'InputSpec(%s)' % ', '.join(x for x in spec if x)
-
-
-class Node(object):
-  """A `Node` describes the connectivity between two layers.
-
-  Each time a layer is connected to some new input,
-  a node is added to `layer._inbound_nodes`.
-  Each time the output of a layer is used by another layer,
-  a node is added to `layer._outbound_nodes`.
-
-  Arguments:
-      outbound_layer: the layer that takes
-          `input_tensors` and turns them into `output_tensors`
-          (the node gets created when the `call`
-          method of the layer was called).
-      inbound_layers: a list of layers, the same length as `input_tensors`,
-          the layers from where `input_tensors` originate.
-      node_indices: a list of integers, the same length as `inbound_layers`.
-          `node_indices[i]` is the origin node of `input_tensors[i]`
-          (necessary since each inbound layer might have several nodes,
-          e.g. if the layer is being shared with a different data stream).
-      tensor_indices: a list of integers,
-          the same length as `inbound_layers`.
-          `tensor_indices[i]` is the index of `input_tensors[i]` within the
-          output of the inbound layer
-          (necessary since each inbound layer might
-          have multiple tensor outputs, with each one being
-          independently manipulable).
-      input_tensors: list of input tensors.
-      output_tensors: list of output tensors.
-      arguments: dictionary of keyword arguments that were passed to the
-          `call` method of the layer at the call that created the node.
-
-  `node_indices` and `tensor_indices` are basically fine-grained coordinates
-  describing the origin of the `input_tensors`.
-
-  A node from layer A to layer B is added to:
-    - A._outbound_nodes
-    - B._inbound_nodes
-  """
-
-  def __init__(self,
-               outbound_layer,
-               inbound_layers,
-               node_indices,
-               tensor_indices,
-               input_tensors,
-               output_tensors,
-               arguments=None):
-    # Layer instance (NOT a list).
-    if isinstance(outbound_layer, list):
-      raise ValueError(
-          '`outbound_layer` should be a layer instance, not a list.')
-    # this is the layer that takes a list of input tensors
-    # and turns them into a list of output tensors.
-    # the current node will be added to
-    # the inbound_nodes of outbound_layer.
-    self.outbound_layer = outbound_layer
-
-    # The following 3 properties describe where
-    # the input tensors come from: which layers,
-    # and for each layer, which node and which
-    # tensor output of each node.
-
-    # List of layer instances.
-    self.inbound_layers = inbound_layers
-    # List of integers, 1:1 mapping with inbound_layers.
-    self.node_indices = node_indices
-    # List of integers, 1:1 mapping with inbound_layers.
-    self.tensor_indices = tensor_indices
-
-    # Following 2 properties:
-    # tensor inputs and outputs of outbound_layer.
-
-    # List of tensors. 1:1 mapping with inbound_layers.
-    self.input_tensors = input_tensors
-    # List of tensors, created by outbound_layer.call().
-    self.output_tensors = output_tensors
-
-    # Following 2 properties: input and output shapes.
-
-    # List of shape tuples, shapes of input_tensors.
-    self.input_shapes = [layers_util.static_shape(x) for x in input_tensors]
-    # List of shape tuples, shapes of output_tensors.
-    self.output_shapes = [layers_util.static_shape(x) for x in output_tensors]
-
-    # Optional keyword arguments to layer's `call`.
-    self.arguments = arguments
-
-    # Add nodes to all layers involved.
-    for layer in inbound_layers:
-      if layer is not None:
-        # For compatibility with external Keras, we use the deprecated
-        # accessor here.
-        layer.outbound_nodes.append(self)
-    # For compatibility with external Keras, we use the deprecated
-    # accessor here.
-    outbound_layer.inbound_nodes.append(self)
-
-  def get_config(self):
-    inbound_names = []
-    for layer in self.inbound_layers:
-      if layer:
-        inbound_names.append(layer.name)
-      else:
-        inbound_names.append(None)
-    return {
-        'outbound_layer': self.outbound_layer.name,
-        'inbound_layers': inbound_names,
-        'node_indices': self.node_indices,
-        'tensor_indices': self.tensor_indices
-    }
-
-
-class _DeferredTensor(object):
-  """Tensor-like object used to build graphs of layers in Eager mode.
-
-  When calling a layer on a DeferredTensor, the layer will not perform any
-  computation and will simply perfom shape inference to return new
-  DeferredTensors with appropriate shape information. Thus DeferredTensor
-  behaves like a graph-mode Tensor when manipulated by layers.
-  """
-
-  def __init__(self, shape, dtype, name=None):
-    self.shape = tensor_shape.TensorShape(shape)
-    if dtype is None:
-      self.dtype = dtypes.as_dtype(np.float32)
-    else:
-      self.dtype = dtypes.as_dtype(dtype)
-    self.name = name
-
-  def get_shape(self):
-    return self.shape
-
-  def __str__(self):
-    return "DeferredTensor('%s', shape=%s, dtype=%s)" % (self.name,
-                                                         self.get_shape(),
-                                                         self.dtype.name)
-
-  def __repr__(self):
-    return "<_DeferredTensor '%s' shape=%s dtype=%s>" % (self.name,
-                                                         self.get_shape(),
-                                                         self.dtype.name)
-
-
-def _is_tensor_or_tensor_list(v):
-  v = nest.flatten(v)
-  if v and isinstance(v[0], ops.Tensor):
-    return True
-  else:
-    return False
-
-
-def _to_snake_case(name):
-  intermediate = re.sub('(.)([A-Z][a-z0-9]+)', r'\1_\2', name)
-  insecure = re.sub('([a-z])([A-Z])', r'\1_\2', intermediate).lower()
-  # If the class is private the name starts with "_" which is not secure
-  # for creating scopes. We prefix the name with "private" in this case.
-  if insecure[0] != '_':
-    return insecure
-  return 'private' + insecure
-
-
-def _to_list(x):
-  """This normalizes a list/tuple or single element into a list.
-
-  If a single element is passed, we return
-  a list of size 1 containing the element.
-
-  Arguments:
-    x: list or tuple or single element.
-
-  Returns:
-    A list.
-  """
-  if isinstance(x, (list, tuple)):
-    return list(x)
-  return [x]
-
 
 def _add_elements_to_collection(elements, collection_list):
   if context.executing_eagerly():
@@ -1470,105 +354,3 @@ def _add_elements_to_collection(elements, collection_list):
       if element not in collection_set:
         collection.append(element)
 
-
-def _is_all_none(iterable_or_element):
-  if not isinstance(iterable_or_element, (list, tuple)):
-    iterable = [iterable_or_element]
-  else:
-    iterable = iterable_or_element
-  # We cannot use Python's `any` because the iterable may return Tensors.
-  for element in iterable:
-    if element is not None:
-      return False
-  return True
-
-
-def _have_all_keras_metadata(iterable_or_element):
-  if not isinstance(iterable_or_element, (list, tuple)):
-    iterable = [iterable_or_element]
-  else:
-    iterable = iterable_or_element
-  return all([hasattr(x, '_keras_history') for x in iterable])
-
-
-def _collect_previous_mask(input_tensors):
-  """Retrieves the output mask(s) of the previous node.
-
-  Arguments:
-      input_tensors: A tensor or list of tensors.
-
-  Returns:
-      A mask tensor or list of mask tensors.
-  """
-  input_tensors = nest.flatten(input_tensors)
-  masks = []
-  for x in input_tensors:
-    if hasattr(x, '_keras_mask'):
-      mask = x._keras_mask  # pylint: disable=protected-access
-      masks.append(mask)
-    else:
-      masks.append(None)
-  if len(masks) == 1:
-    return masks[0]
-  return masks
-
-
-# A global dictionary mapping graph objects to an index of counters used
-# for various layer names in each graph.
-# Allows to give unique autogenerated names to layers, in a graph-specific way.
-PER_GRAPH_LAYER_NAME_UIDS = weakref.WeakKeyDictionary()
-
-
-def _get_default_graph_uid_map():
-  graph = ops.get_default_graph()
-  name_uid_map = PER_GRAPH_LAYER_NAME_UIDS.get(graph, None)
-  if name_uid_map is None:
-    name_uid_map = collections.defaultdict(int)
-    PER_GRAPH_LAYER_NAME_UIDS[graph] = name_uid_map
-  return name_uid_map
-
-
-def _unique_layer_name(name, name_uid_map=None, avoid_names=None, namespace='',
-                       zero_based=False):
-  """Makes a layer name (or arbitrary string) unique within a TensorFlow graph.
-
-  Arguments:
-    name: String name to make unique.
-    name_uid_map: An optional defaultdict(int) to use when creating unique
-      names. If None (default), uses a per-Graph dictionary.
-    avoid_names: An optional set or dict with names which should not be used. If
-      None (default) does not avoid any names.
-    namespace: Gets a name which is unique within the (graph, namespace). Layers
-      which are not Networks use a blank namespace and so get graph-global
-      names.
-    zero_based: If True, name sequences start with no suffix (e.g. "dense",
-      "dense_1"). If False, naming is one-based ("dense_1", "dense_2").
-
-  Returns:
-    Unique string name.
-
-  Example:
-
-  ```python
-  _unique_layer_name('dense')  # dense_1
-  _unique_layer_name('dense')  # dense_2
-  ```
-  """
-  if name_uid_map is None:
-    name_uid_map = _get_default_graph_uid_map()
-  if avoid_names is None:
-    avoid_names = set()
-  proposed_name = None
-  while proposed_name is None or proposed_name in avoid_names:
-    name_key = (namespace, name)
-    if zero_based:
-      number = name_uid_map[name_key]
-      if number:
-        proposed_name = name + '_' + str(number)
-      else:
-        proposed_name = name
-      name_uid_map[name_key] += 1
-    else:
-      name_uid_map[name_key] += 1
-      proposed_name = name + '_' + str(name_uid_map[name_key])
-  return proposed_name
diff --git a/tensorflow/python/layers/base_test.py b/tensorflow/python/layers/base_test.py
index 9ed4afeaba931c47d2a1e65f08489773f0b9eb1b..c05c675263ff4cb57a48db96c43acfaf42af7c9b 100644
--- a/tensorflow/python/layers/base_test.py
+++ b/tensorflow/python/layers/base_test.py
@@ -94,61 +94,6 @@ class BaseLayerTest(test.TestCase):
       with self.assertRaisesRegexp(ValueError, 'activity_regularizer'):
         core_layers.Dense(1, activity_regularizer=lambda *args, **kwargs: 0.)
 
-  def testGetVariable(self):
-    with self.test_session():
-
-      class MyLayer(base_layers.Layer):
-
-        def build(self, input_shape):
-          self.my_var = self.add_variable(
-              'my_var', [2, 2], initializer=init_ops.zeros_initializer())
-
-        def call(self, inputs):
-          return inputs * 2
-
-      layer = MyLayer(name='my_layer')
-      inputs = random_ops.random_uniform((5,), seed=1)
-      layer.apply(inputs)
-      layer.apply(inputs)
-      self.assertEqual([v.name for v in layer.variables],
-                       ['my_layer/my_var:0'])
-
-      # Creating a layer with no scope leads to lazy construction of
-      # the scope at apply() time.  It uses scope "<current scope>/base_name"
-      lazy_layer = MyLayer(_reuse=True)
-      with variable_scope.variable_scope('new_scope'):
-        with variable_scope.variable_scope('my_layer'):
-          variable_scope.get_variable('my_var', [2, 2])
-
-        # Smoke test: it runs.
-        lazy_layer.apply(inputs)
-        # The variables were created outside of the Layer, and
-        # reuse=True, so the Layer does not own them and they are not
-        # stored in its collection.
-        self.assertEqual(lazy_layer.variables, [])
-        self.assertEqual(lazy_layer._scope.name, 'new_scope/my_layer')
-
-      # Creating a layer with no scope leads to lazy construction of
-      # the scope at apply() time. If 'scope' argument is passed to
-      # apply(), it uses that scope when accessing variables.
-      lazy_layer = MyLayer(_reuse=True)
-      with variable_scope.variable_scope('new_scope') as new_scope:
-        variable_scope.get_variable('my_var', [2, 2])
-
-        # Smoke test: it runs.
-        lazy_layer.apply(inputs, scope=new_scope)
-        # The variables were created outside of the Layer, and
-        # reuse=True, so the Layer does not own them and they are not
-        # stored in its collection.
-        self.assertEqual(lazy_layer.variables, [])
-        self.assertEqual(lazy_layer._scope.name, 'new_scope')
-
-      # Checking for graph equality is only done in GRAPH mode.
-      with ops.Graph().as_default():
-        inputs_ng = random_ops.random_uniform((5,), seed=1)
-        with self.assertRaisesRegexp(ValueError, r'graph are not the same'):
-          layer.apply(inputs_ng)
-
   @test_util.run_in_graph_and_eager_modes()
   def testCall(self):
 
@@ -165,38 +110,6 @@ class BaseLayerTest(test.TestCase):
       # op is only supported in GRAPH mode
       self.assertEqual(outputs.op.name, 'my_layer/Square')
 
-  def testFirstCallCanCreateVariablesButSecondCanNotWhenBuildEmpty(self):
-    # Note that this test is only run in Graph mode since with EAGER mode we can
-    # still create a new variable on second call.
-
-    class MyLayer(base_layers.Layer):
-
-      def build(self, _):
-        # Do not mark the layer as built.
-        pass
-
-      def call(self, inputs):
-        self.my_var = self.add_variable('my_var', [2, 2])
-        if self.built:
-          # Skip creating on the first call; try to create after it's
-          # built.  This is expected to fail.
-          self.add_variable('this_will_break_on_second_call', [2, 2])
-        return inputs + math_ops.square(self.my_var)
-
-    layer = MyLayer(name='my_layer')
-    inputs = random_ops.random_uniform((2,), seed=1)
-    outputs = layer.apply(inputs)
-    self.assertEqual(layer.built, True)
-    self.assertEqual(outputs.op.name, 'my_layer/add')
-    self.assertEqual([v.name
-                      for v in layer.variables], ['my_layer/my_var:0'])
-    with self.assertRaisesRegexp(ValueError,
-                                 'my_layer/this_will_break_on_second_call'):
-      layer.apply(inputs)
-    # The list of variables hasn't changed.
-    self.assertEqual([v.name
-                      for v in layer.variables], ['my_layer/my_var:0'])
-
   @test_util.run_in_graph_and_eager_modes()
   def testDeepCopy(self):
 
@@ -645,13 +558,14 @@ class BaseLayerTest(test.TestCase):
 
   def testLayerGraphSetInFirstApply(self):
     with ops.Graph().as_default():
-      layer = core_layers.Dense(1)  # Graph at construction time is ignored
+      # Graph at construction time is ignored
+      layer = core_layers.Dense(1)
     with ops.Graph().as_default():
-      layer.apply(constant_op.constant([[1]]))
+      layer.apply(constant_op.constant([[1.]]))
       # layer is now bound to second Graph
     with ops.Graph().as_default(), self.assertRaisesRegexp(
         ValueError, 'Input graph and Layer graph are not the same'):
-      layer.apply(constant_op.constant([[1]]))
+      layer.apply(constant_op.constant([[1.]]))
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/layers/convolutional.py b/tensorflow/python/layers/convolutional.py
index 2d99b1688f1b2736c0660ba2ac914018b21bf9ed..34a1487e748e41eebae8b87b17c34d0deda8597f 100644
--- a/tensorflow/python/layers/convolutional.py
+++ b/tensorflow/python/layers/convolutional.py
@@ -23,6 +23,7 @@ from __future__ import print_function
 from tensorflow.python.eager import context
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.keras._impl.keras import layers as keras_layers
 from tensorflow.python.layers import base
 from tensorflow.python.layers import utils
 from tensorflow.python.ops import array_ops
@@ -32,201 +33,8 @@ from tensorflow.python.ops import nn_ops
 from tensorflow.python.util.tf_export import tf_export
 
 
-class _Conv(base.Layer):
-  """Abstract nD convolution layer (private, used as implementation base).
-
-  This layer creates a convolution kernel that is convolved
-  (actually cross-correlated) with the layer input to produce a tensor of
-  outputs. If `use_bias` is True (and a `bias_initializer` is provided),
-  a bias vector is created and added to the outputs. Finally, if
-  `activation` is not `None`, it is applied to the outputs as well.
-
-  Arguments:
-    rank: An integer, the rank of the convolution, e.g. "2" for 2D convolution.
-    filters: Integer, the dimensionality of the output space (i.e. the number
-      of filters in the convolution).
-    kernel_size: An integer or tuple/list of n integers, specifying the
-      length of the convolution window.
-    strides: An integer or tuple/list of n integers,
-      specifying the stride length of the convolution.
-      Specifying any stride value != 1 is incompatible with specifying
-      any `dilation_rate` value != 1.
-    padding: One of `"valid"` or `"same"` (case-insensitive).
-    data_format: A string, one of `channels_last` (default) or `channels_first`.
-      The ordering of the dimensions in the inputs.
-      `channels_last` corresponds to inputs with shape
-      `(batch, ..., channels)` while `channels_first` corresponds to
-      inputs with shape `(batch, channels, ...)`.
-    dilation_rate: An integer or tuple/list of n integers, specifying
-      the dilation rate to use for dilated convolution.
-      Currently, specifying any `dilation_rate` value != 1 is
-      incompatible with specifying any `strides` value != 1.
-    activation: Activation function. Set it to None to maintain a
-      linear activation.
-    use_bias: Boolean, whether the layer uses a bias.
-    kernel_initializer: An initializer for the convolution kernel.
-    bias_initializer: An initializer for the bias vector. If None, the default
-      initializer will be used.
-    kernel_regularizer: Optional regularizer for the convolution kernel.
-    bias_regularizer: Optional regularizer for the bias vector.
-    activity_regularizer: Optional regularizer function for the output.
-    kernel_constraint: Optional projection function to be applied to the
-        kernel after being updated by an `Optimizer` (e.g. used to implement
-        norm constraints or value constraints for layer weights). The function
-        must take as input the unprojected variable and must return the
-        projected variable (which must have the same shape). Constraints are
-        not safe to use when doing asynchronous distributed training.
-    bias_constraint: Optional projection function to be applied to the
-        bias after being updated by an `Optimizer`.
-    trainable: Boolean, if `True` also add variables to the graph collection
-      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
-    name: A string, the name of the layer.
-  """
-
-  def __init__(self, rank,
-               filters,
-               kernel_size,
-               strides=1,
-               padding='valid',
-               data_format='channels_last',
-               dilation_rate=1,
-               activation=None,
-               use_bias=True,
-               kernel_initializer=None,
-               bias_initializer=init_ops.zeros_initializer(),
-               kernel_regularizer=None,
-               bias_regularizer=None,
-               activity_regularizer=None,
-               kernel_constraint=None,
-               bias_constraint=None,
-               trainable=True,
-               name=None,
-               **kwargs):
-    super(_Conv, self).__init__(trainable=trainable, name=name,
-                                activity_regularizer=activity_regularizer,
-                                **kwargs)
-    self.rank = rank
-    self.filters = filters
-    self.kernel_size = utils.normalize_tuple(kernel_size, rank, 'kernel_size')
-    self.strides = utils.normalize_tuple(strides, rank, 'strides')
-    self.padding = utils.normalize_padding(padding)
-    self.data_format = utils.normalize_data_format(data_format)
-    self.dilation_rate = utils.normalize_tuple(
-        dilation_rate, rank, 'dilation_rate')
-    self.activation = activation
-    self.use_bias = use_bias
-    self.kernel_initializer = kernel_initializer
-    self.bias_initializer = bias_initializer
-    self.kernel_regularizer = kernel_regularizer
-    self.bias_regularizer = bias_regularizer
-    self.kernel_constraint = kernel_constraint
-    self.bias_constraint = bias_constraint
-    self.input_spec = base.InputSpec(ndim=self.rank + 2)
-
-  def build(self, input_shape):
-    input_shape = tensor_shape.TensorShape(input_shape)
-    if self.data_format == 'channels_first':
-      channel_axis = 1
-    else:
-      channel_axis = -1
-    if input_shape[channel_axis].value is None:
-      raise ValueError('The channel dimension of the inputs '
-                       'should be defined. Found `None`.')
-    input_dim = input_shape[channel_axis].value
-    kernel_shape = self.kernel_size + (input_dim, self.filters)
-
-    self.kernel = self.add_variable(name='kernel',
-                                    shape=kernel_shape,
-                                    initializer=self.kernel_initializer,
-                                    regularizer=self.kernel_regularizer,
-                                    constraint=self.kernel_constraint,
-                                    trainable=True,
-                                    dtype=self.dtype)
-    if self.use_bias:
-      self.bias = self.add_variable(name='bias',
-                                    shape=(self.filters,),
-                                    initializer=self.bias_initializer,
-                                    regularizer=self.bias_regularizer,
-                                    constraint=self.bias_constraint,
-                                    trainable=True,
-                                    dtype=self.dtype)
-    else:
-      self.bias = None
-    self.input_spec = base.InputSpec(ndim=self.rank + 2,
-                                     axes={channel_axis: input_dim})
-    self._convolution_op = nn_ops.Convolution(
-        input_shape,
-        filter_shape=self.kernel.get_shape(),
-        dilation_rate=self.dilation_rate,
-        strides=self.strides,
-        padding=self.padding.upper(),
-        data_format=utils.convert_data_format(self.data_format,
-                                              self.rank + 2))
-    self.built = True
-
-  def call(self, inputs):
-    outputs = self._convolution_op(inputs, self.kernel)
-
-    if self.use_bias:
-      if self.data_format == 'channels_first':
-        if self.rank == 1:
-          # nn.bias_add does not accept a 1D input tensor.
-          bias = array_ops.reshape(self.bias, (1, self.filters, 1))
-          outputs += bias
-        if self.rank == 2:
-          outputs = nn.bias_add(outputs, self.bias, data_format='NCHW')
-        if self.rank == 3:
-          # As of Mar 2017, direct addition is significantly slower than
-          # bias_add when computing gradients. To use bias_add, we collapse Z
-          # and Y into a single dimension to obtain a 4D input tensor.
-          outputs_shape = outputs.shape.as_list()
-          if outputs_shape[0] is None:
-            outputs_shape[0] = -1
-          outputs_4d = array_ops.reshape(outputs,
-                                         [outputs_shape[0], outputs_shape[1],
-                                          outputs_shape[2] * outputs_shape[3],
-                                          outputs_shape[4]])
-          outputs_4d = nn.bias_add(outputs_4d, self.bias, data_format='NCHW')
-          outputs = array_ops.reshape(outputs_4d, outputs_shape)
-      else:
-        outputs = nn.bias_add(outputs, self.bias, data_format='NHWC')
-
-    if self.activation is not None:
-      return self.activation(outputs)
-    return outputs
-
-  def compute_output_shape(self, input_shape):
-    input_shape = tensor_shape.TensorShape(input_shape).as_list()
-    if self.data_format == 'channels_last':
-      space = input_shape[1:-1]
-      new_space = []
-      for i in range(len(space)):
-        new_dim = utils.conv_output_length(
-            space[i],
-            self.kernel_size[i],
-            padding=self.padding,
-            stride=self.strides[i],
-            dilation=self.dilation_rate[i])
-        new_space.append(new_dim)
-      return tensor_shape.TensorShape([input_shape[0]] + new_space +
-                                      [self.filters])
-    else:
-      space = input_shape[2:]
-      new_space = []
-      for i in range(len(space)):
-        new_dim = utils.conv_output_length(
-            space[i],
-            self.kernel_size[i],
-            padding=self.padding,
-            stride=self.strides[i],
-            dilation=self.dilation_rate[i])
-        new_space.append(new_dim)
-      return tensor_shape.TensorShape([input_shape[0], self.filters] +
-                                      new_space)
-
-
 @tf_export('layers.Conv1D')
-class Conv1D(_Conv):
+class Conv1D(keras_layers.Conv1D, base.Layer):
   """1D convolution layer (e.g. temporal convolution).
 
   This layer creates a convolution kernel that is convolved
@@ -294,8 +102,7 @@ class Conv1D(_Conv):
                trainable=True,
                name=None,
                **kwargs):
-    super(Convolution1D, self).__init__(
-        rank=1,
+    super(Conv1D, self).__init__(
         filters=filters,
         kernel_size=kernel_size,
         strides=strides,
@@ -417,7 +224,7 @@ def conv1d(inputs,
 
 
 @tf_export('layers.Conv2D')
-class Conv2D(_Conv):
+class Conv2D(keras_layers.Conv2D, base.Layer):
   """2D convolution layer (e.g. spatial convolution over images).
 
   This layer creates a convolution kernel that is convolved
@@ -493,7 +300,6 @@ class Conv2D(_Conv):
                name=None,
                **kwargs):
     super(Conv2D, self).__init__(
-        rank=2,
         filters=filters,
         kernel_size=kernel_size,
         strides=strides,
@@ -622,7 +428,7 @@ def conv2d(inputs,
 
 
 @tf_export('layers.Conv3D')
-class Conv3D(_Conv):
+class Conv3D(keras_layers.Conv3D, base.Layer):
   """3D convolution layer (e.g. spatial convolution over volumes).
 
   This layer creates a convolution kernel that is convolved
@@ -699,7 +505,6 @@ class Conv3D(_Conv):
                name=None,
                **kwargs):
     super(Conv3D, self).__init__(
-        rank=3,
         filters=filters,
         kernel_size=kernel_size,
         strides=strides,
@@ -828,169 +633,8 @@ def conv3d(inputs,
   return layer.apply(inputs)
 
 
-class _SeparableConv(_Conv):
-  """Abstract base layer for separable nD convolution.
-
-  This layer performs a depthwise convolution that acts separately on
-  channels, followed by a pointwise convolution that mixes channels.
-  If `use_bias` is True and a bias initializer is provided,
-  it adds a bias vector to the output.
-  It then optionally applies an activation function to produce the final output.
-
-  Arguments:
-    rank: An integer, the rank of the convolution, e.g. "2" for 2D convolution.
-    filters: Integer, the dimensionality of the output space (i.e. the number
-      of filters in the convolution).
-    kernel_size: A tuple or list of integers specifying the spatial
-      dimensions of the filters. Can be a single integer to specify the same
-      value for all spatial dimensions.
-    strides: A tuple or list of integers specifying the strides
-      of the convolution. Can be a single integer to specify the same value for
-      all spatial dimensions.
-      Specifying any `stride` value != 1 is incompatible with specifying
-      any `dilation_rate` value != 1.
-    padding: One of `"valid"` or `"same"` (case-insensitive).
-    data_format: A string, one of `channels_last` (default) or `channels_first`.
-      The ordering of the dimensions in the inputs.
-      `channels_last` corresponds to inputs with shape
-      `(batch, ..., channels)` while `channels_first` corresponds to
-      inputs with shape `(batch, channels, ...)`.
-    dilation_rate: An integer or tuple/list of 2 integers, specifying
-      the dilation rate to use for dilated convolution.
-      Can be a single integer to specify the same value for
-      all spatial dimensions.
-      Currently, specifying any `dilation_rate` value != 1 is
-      incompatible with specifying any stride value != 1.
-    depth_multiplier: The number of depthwise convolution output channels for
-      each input channel. The total number of depthwise convolution output
-      channels will be equal to `num_filters_in * depth_multiplier`.
-    activation: Activation function. Set it to None to maintain a
-      linear activation.
-    use_bias: Boolean, whether the layer uses a bias.
-    depthwise_initializer: An initializer for the depthwise convolution kernel.
-    pointwise_initializer: An initializer for the pointwise convolution kernel.
-    bias_initializer: An initializer for the bias vector. If None, the default
-      initializer will be used.
-    depthwise_regularizer: Optional regularizer for the depthwise
-      convolution kernel.
-    pointwise_regularizer: Optional regularizer for the pointwise
-      convolution kernel.
-    bias_regularizer: Optional regularizer for the bias vector.
-    activity_regularizer: Optional regularizer function for the output.
-    depthwise_constraint: Optional projection function to be applied to the
-        depthwise kernel after being updated by an `Optimizer` (e.g. used for
-        norm constraints or value constraints for layer weights). The function
-        must take as input the unprojected variable and must return the
-        projected variable (which must have the same shape). Constraints are
-        not safe to use when doing asynchronous distributed training.
-    pointwise_constraint: Optional projection function to be applied to the
-        pointwise kernel after being updated by an `Optimizer`.
-    bias_constraint: Optional projection function to be applied to the
-        bias after being updated by an `Optimizer`.
-    trainable: Boolean, if `True` also add variables to the graph collection
-      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
-    name: A string, the name of the layer.
-  """
-
-  def __init__(self,
-               rank,
-               filters,
-               kernel_size,
-               strides=1,
-               padding='valid',
-               data_format='channels_last',
-               dilation_rate=1,
-               depth_multiplier=1,
-               activation=None,
-               use_bias=True,
-               depthwise_initializer=None,
-               pointwise_initializer=None,
-               bias_initializer=init_ops.zeros_initializer(),
-               depthwise_regularizer=None,
-               pointwise_regularizer=None,
-               bias_regularizer=None,
-               activity_regularizer=None,
-               depthwise_constraint=None,
-               pointwise_constraint=None,
-               bias_constraint=None,
-               trainable=True,
-               name=None,
-               **kwargs):
-    super(_SeparableConv, self).__init__(
-        rank=rank,
-        filters=filters,
-        kernel_size=kernel_size,
-        strides=strides,
-        padding=padding,
-        data_format=data_format,
-        dilation_rate=dilation_rate,
-        activation=activation,
-        use_bias=use_bias,
-        bias_regularizer=bias_regularizer,
-        activity_regularizer=activity_regularizer,
-        bias_constraint=bias_constraint,
-        trainable=trainable,
-        name=name,
-        **kwargs)
-    self.depth_multiplier = depth_multiplier
-    self.depthwise_initializer = depthwise_initializer
-    self.pointwise_initializer = pointwise_initializer
-    self.depthwise_regularizer = depthwise_regularizer
-    self.pointwise_regularizer = pointwise_regularizer
-    self.depthwise_constraint = depthwise_constraint
-    self.pointwise_constraint = pointwise_constraint
-
-  def build(self, input_shape):
-    input_shape = tensor_shape.TensorShape(input_shape)
-    if self.data_format == 'channels_first':
-      channel_axis = 1
-    else:
-      channel_axis = -1
-    if input_shape[channel_axis].value is None:
-      raise ValueError('The channel dimension of the inputs '
-                       'should be defined. Found `None`.')
-    input_dim = input_shape[channel_axis].value
-    self.input_spec = base.InputSpec(ndim=self.rank + 2,
-                                     axes={channel_axis: input_dim})
-    depthwise_kernel_shape = self.kernel_size + (input_dim,
-                                                 self.depth_multiplier)
-    pointwise_kernel_shape = (
-        1,) * self.rank + (self.depth_multiplier * input_dim, self.filters)
-
-    self.depthwise_kernel = self.add_variable(
-        name='depthwise_kernel',
-        shape=depthwise_kernel_shape,
-        initializer=self.depthwise_initializer,
-        regularizer=self.depthwise_regularizer,
-        constraint=self.depthwise_constraint,
-        trainable=True,
-        dtype=self.dtype)
-    self.pointwise_kernel = self.add_variable(
-        name='pointwise_kernel',
-        shape=pointwise_kernel_shape,
-        initializer=self.pointwise_initializer,
-        regularizer=self.pointwise_regularizer,
-        constraint=self.pointwise_constraint,
-        trainable=True,
-        dtype=self.dtype)
-    if self.use_bias:
-      self.bias = self.add_variable(name='bias',
-                                    shape=(self.filters,),
-                                    initializer=self.bias_initializer,
-                                    regularizer=self.bias_regularizer,
-                                    constraint=self.bias_constraint,
-                                    trainable=True,
-                                    dtype=self.dtype)
-    else:
-      self.bias = None
-    self.built = True
-
-  def call(self, inputs):
-    raise NotImplementedError
-
-
 @tf_export('layers.SeparableConv1D')
-class SeparableConv1D(_SeparableConv):
+class SeparableConv1D(keras_layers.SeparableConv1D, base.Layer):
   """Depthwise separable 1D convolution.
 
   This layer performs a depthwise convolution that acts separately on
@@ -1072,7 +716,6 @@ class SeparableConv1D(_SeparableConv):
                name=None,
                **kwargs):
     super(SeparableConv1D, self).__init__(
-        rank=1,
         filters=filters,
         kernel_size=kernel_size,
         strides=strides,
@@ -1096,45 +739,9 @@ class SeparableConv1D(_SeparableConv):
         name=name,
         **kwargs)
 
-  def call(self, inputs):
-    if self.data_format == 'channels_last':
-      strides = (1,) + self.strides * 2 + (1,)
-      spatial_start_dim = 1
-    else:
-      strides = (1, 1) + self.strides * 2
-      spatial_start_dim = 2
-
-    # Explicitly broadcast inputs and kernels to 4D.
-    # TODO(fchollet): refactor when a native separable_conv1d op is available.
-    inputs = array_ops.expand_dims(inputs, spatial_start_dim)
-    depthwise_kernel = array_ops.expand_dims(self.depthwise_kernel, 0)
-    pointwise_kernel = array_ops.expand_dims(self.pointwise_kernel, 0)
-    dilation_rate = (1,) + self.dilation_rate
-
-    outputs = nn.separable_conv2d(
-        inputs,
-        depthwise_kernel,
-        pointwise_kernel,
-        strides=strides,
-        padding=self.padding.upper(),
-        rate=dilation_rate,
-        data_format=utils.convert_data_format(self.data_format, ndim=4))
-
-    if self.use_bias:
-      outputs = nn.bias_add(
-          outputs,
-          self.bias,
-          data_format=utils.convert_data_format(self.data_format, ndim=4))
-
-    outputs = array_ops.squeeze(outputs, [spatial_start_dim])
-
-    if self.activation is not None:
-      return self.activation(outputs)
-    return outputs
-
 
 @tf_export('layers.SeparableConv2D')
-class SeparableConv2D(_SeparableConv):
+class SeparableConv2D(keras_layers.SeparableConv2D, base.Layer):
   """Depthwise separable 2D convolution.
 
   This layer performs a depthwise convolution that acts separately on
@@ -1221,7 +828,6 @@ class SeparableConv2D(_SeparableConv):
                name=None,
                **kwargs):
     super(SeparableConv2D, self).__init__(
-        rank=2,
         filters=filters,
         kernel_size=kernel_size,
         strides=strides,
@@ -1245,31 +851,6 @@ class SeparableConv2D(_SeparableConv):
         name=name,
         **kwargs)
 
-  def call(self, inputs):
-    # Apply the actual ops.
-    if self.data_format == 'channels_last':
-      strides = (1,) + self.strides + (1,)
-    else:
-      strides = (1, 1) + self.strides
-    outputs = nn.separable_conv2d(
-        inputs,
-        self.depthwise_kernel,
-        self.pointwise_kernel,
-        strides=strides,
-        padding=self.padding.upper(),
-        rate=self.dilation_rate,
-        data_format=utils.convert_data_format(self.data_format, ndim=4))
-
-    if self.use_bias:
-      outputs = nn.bias_add(
-          outputs,
-          self.bias,
-          data_format=utils.convert_data_format(self.data_format, ndim=4))
-
-    if self.activation is not None:
-      return self.activation(outputs)
-    return outputs
-
 
 @tf_export('layers.separable_conv1d')
 def separable_conv1d(inputs,
@@ -1511,7 +1092,7 @@ def separable_conv2d(inputs,
 
 
 @tf_export('layers.Conv2DTranspose')
-class Conv2DTranspose(Conv2D):
+class Conv2DTranspose(keras_layers.Conv2DTranspose, base.Layer):
   """Transposed 2D convolution layer (sometimes called 2D Deconvolution).
 
   The need for transposed convolutions generally arises
@@ -1576,8 +1157,8 @@ class Conv2DTranspose(Conv2D):
                name=None,
                **kwargs):
     super(Conv2DTranspose, self).__init__(
-        filters,
-        kernel_size,
+        filters=filters,
+        kernel_size=kernel_size,
         strides=strides,
         padding=padding,
         data_format=data_format,
@@ -1593,120 +1174,6 @@ class Conv2DTranspose(Conv2D):
         trainable=trainable,
         name=name,
         **kwargs)
-    self.input_spec = base.InputSpec(ndim=4)
-
-  def build(self, input_shape):
-    if len(input_shape) != 4:
-      raise ValueError('Inputs should have rank 4. Received input shape: ' +
-                       str(input_shape))
-    if self.data_format == 'channels_first':
-      channel_axis = 1
-    else:
-      channel_axis = -1
-    if input_shape[channel_axis] is None:
-      raise ValueError('The channel dimension of the inputs '
-                       'should be defined. Found `None`.')
-    input_dim = input_shape[channel_axis]
-    self.input_spec = base.InputSpec(ndim=4, axes={channel_axis: input_dim})
-    kernel_shape = self.kernel_size + (self.filters, input_dim)
-
-    self.kernel = self.add_variable(name='kernel',
-                                    shape=kernel_shape,
-                                    initializer=self.kernel_initializer,
-                                    regularizer=self.kernel_regularizer,
-                                    constraint=self.kernel_constraint,
-                                    trainable=True,
-                                    dtype=self.dtype)
-    if self.use_bias:
-      self.bias = self.add_variable(name='bias',
-                                    shape=(self.filters,),
-                                    initializer=self.bias_initializer,
-                                    regularizer=self.bias_regularizer,
-                                    constraint=self.bias_constraint,
-                                    trainable=True,
-                                    dtype=self.dtype)
-    else:
-      self.bias = None
-    self.built = True
-
-  def call(self, inputs):
-    inputs_shape = array_ops.shape(inputs)
-    batch_size = inputs_shape[0]
-    if self.data_format == 'channels_first':
-      c_axis, h_axis, w_axis = 1, 2, 3
-    else:
-      c_axis, h_axis, w_axis = 3, 1, 2
-
-    height, width = inputs_shape[h_axis], inputs_shape[w_axis]
-    kernel_h, kernel_w = self.kernel_size
-    stride_h, stride_w = self.strides
-
-    # Infer the dynamic output shape:
-    out_height = utils.deconv_output_length(height,
-                                            kernel_h,
-                                            self.padding,
-                                            stride_h)
-    out_width = utils.deconv_output_length(width,
-                                           kernel_w,
-                                           self.padding,
-                                           stride_w)
-    if self.data_format == 'channels_first':
-      output_shape = (batch_size, self.filters, out_height, out_width)
-      strides = (1, 1, stride_h, stride_w)
-    else:
-      output_shape = (batch_size, out_height, out_width, self.filters)
-      strides = (1, stride_h, stride_w, 1)
-
-    output_shape_tensor = array_ops.stack(output_shape)
-    outputs = nn.conv2d_transpose(
-        inputs,
-        self.kernel,
-        output_shape_tensor,
-        strides,
-        padding=self.padding.upper(),
-        data_format=utils.convert_data_format(self.data_format, ndim=4))
-
-    if not context.executing_eagerly():
-      # Infer the static output shape:
-      out_shape = inputs.get_shape().as_list()
-      out_shape[c_axis] = self.filters
-      out_shape[h_axis] = utils.deconv_output_length(out_shape[h_axis],
-                                                     kernel_h,
-                                                     self.padding,
-                                                     stride_h)
-      out_shape[w_axis] = utils.deconv_output_length(out_shape[w_axis],
-                                                     kernel_w,
-                                                     self.padding,
-                                                     stride_w)
-      outputs.set_shape(out_shape)
-
-    if self.use_bias:
-      outputs = nn.bias_add(
-          outputs,
-          self.bias,
-          data_format=utils.convert_data_format(self.data_format, ndim=4))
-
-    if self.activation is not None:
-      return self.activation(outputs)
-    return outputs
-
-  def compute_output_shape(self, input_shape):
-    input_shape = tensor_shape.TensorShape(input_shape).as_list()
-    output_shape = list(input_shape)
-    if self.data_format == 'channels_first':
-      c_axis, h_axis, w_axis = 1, 2, 3
-    else:
-      c_axis, h_axis, w_axis = 3, 1, 2
-
-    kernel_h, kernel_w = self.kernel_size
-    stride_h, stride_w = self.strides
-
-    output_shape[c_axis] = self.filters
-    output_shape[h_axis] = utils.deconv_output_length(
-        output_shape[h_axis], kernel_h, self.padding, stride_h)
-    output_shape[w_axis] = utils.deconv_output_length(
-        output_shape[w_axis], kernel_w, self.padding, stride_w)
-    return tensor_shape.TensorShape(output_shape)
 
 
 @tf_export('layers.conv2d_transpose')
@@ -1806,7 +1273,7 @@ def conv2d_transpose(inputs,
 
 
 @tf_export('layers.Conv3DTranspose')
-class Conv3DTranspose(Conv3D):
+class Conv3DTranspose(keras_layers.Conv3DTranspose, base.Layer):
   """Transposed 3D convolution layer (sometimes called 3D Deconvolution).
 
   Arguments:
@@ -1885,153 +1352,6 @@ class Conv3DTranspose(Conv3D):
         trainable=trainable,
         name=name,
         **kwargs)
-    self.input_spec = base.InputSpec(ndim=5)
-
-  def build(self, input_shape):
-    if len(input_shape) != 5:
-      raise ValueError('Inputs should have rank 5, received input shape:',
-                       str(input_shape))
-    if self.data_format == 'channels_first':
-      channel_axis = 1
-    else:
-      channel_axis = -1
-    if input_shape[channel_axis] is None:
-      raise ValueError('The channel dimension of the inputs '
-                       'should be defined, found None: ' + str(input_shape))
-    input_dim = input_shape[channel_axis]
-    kernel_shape = self.kernel_size + (self.filters, input_dim)
-
-    self.kernel = self.add_variable(
-        'kernel',
-        shape=kernel_shape,
-        initializer=self.kernel_initializer,
-        regularizer=self.kernel_regularizer,
-        constraint=self.kernel_constraint,
-        trainable=True,
-        dtype=self.dtype)
-    if self.use_bias:
-      self.bias = self.add_variable(
-          'bias',
-          shape=(self.filters,),
-          initializer=self.bias_initializer,
-          regularizer=self.bias_regularizer,
-          constraint=self.bias_constraint,
-          trainable=True,
-          dtype=self.dtype)
-    else:
-      self.bias = None
-    self.built = True
-
-  def call(self, inputs):
-    inputs_shape = array_ops.shape(inputs)
-    batch_size = inputs_shape[0]
-    if self.data_format == 'channels_first':
-      c_axis, d_axis, h_axis, w_axis = 1, 2, 3, 4
-    else:
-      c_axis, d_axis, h_axis, w_axis = 4, 1, 2, 3
-
-    self.input_spec = base.InputSpec(ndim=5,
-                                     axes={c_axis: inputs_shape[c_axis]})
-
-    depth = inputs_shape[d_axis]
-    height = inputs_shape[h_axis]
-    width = inputs_shape[w_axis]
-
-    kernel_d, kernel_h, kernel_w = self.kernel_size
-    stride_d, stride_h, stride_w = self.strides
-
-    # Infer the dynamic output shape:
-    out_depth = utils.deconv_output_length(depth,
-                                           kernel_d,
-                                           self.padding,
-                                           stride_d)
-    out_height = utils.deconv_output_length(height,
-                                            kernel_h,
-                                            self.padding,
-                                            stride_h)
-    out_width = utils.deconv_output_length(width,
-                                           kernel_w,
-                                           self.padding,
-                                           stride_w)
-    if self.data_format == 'channels_first':
-      output_shape = (batch_size, self.filters, out_depth, out_height,
-                      out_width)
-      strides = (1, 1, stride_d, stride_h, stride_w)
-    else:
-      output_shape = (batch_size, out_depth, out_height, out_width,
-                      self.filters)
-      strides = (1, stride_d, stride_h, stride_w, 1)
-
-    output_shape_tensor = array_ops.stack(output_shape)
-    outputs = nn.conv3d_transpose(
-        inputs,
-        self.kernel,
-        output_shape_tensor,
-        strides,
-        data_format=utils.convert_data_format(self.data_format, ndim=5),
-        padding=self.padding.upper())
-
-    if not context.executing_eagerly():
-      # Infer the static output shape:
-      out_shape = inputs.get_shape().as_list()
-      out_shape[c_axis] = self.filters
-      out_shape[d_axis] = utils.deconv_output_length(out_shape[d_axis],
-                                                     kernel_d,
-                                                     self.padding,
-                                                     stride_d)
-      out_shape[h_axis] = utils.deconv_output_length(out_shape[h_axis],
-                                                     kernel_h,
-                                                     self.padding,
-                                                     stride_h)
-      out_shape[w_axis] = utils.deconv_output_length(out_shape[w_axis],
-                                                     kernel_w,
-                                                     self.padding,
-                                                     stride_w)
-      outputs.set_shape(out_shape)
-
-    if self.use_bias:
-      outputs_shape = outputs.shape.as_list()
-      if outputs_shape[0] is None:
-        outputs_shape[0] = -1
-      if self.data_format == 'channels_first':
-        outputs_4d = array_ops.reshape(outputs, [
-            outputs_shape[0], outputs_shape[1],
-            outputs_shape[2] * outputs_shape[3], outputs_shape[4]
-        ])
-      else:
-        outputs_4d = array_ops.reshape(outputs, [
-            outputs_shape[0], outputs_shape[1] * outputs_shape[2],
-            outputs_shape[3], outputs_shape[4]
-        ])
-      outputs_4d = nn.bias_add(
-          outputs_4d,
-          self.bias,
-          data_format=utils.convert_data_format(self.data_format, ndim=4))
-      outputs = array_ops.reshape(outputs_4d, outputs_shape)
-
-    if self.activation is not None:
-      return self.activation(outputs)
-    return outputs
-
-  def compute_output_shape(self, input_shape):
-    input_shape = tensor_shape.TensorShape(input_shape).as_list()
-    output_shape = list(input_shape)
-    if self.data_format == 'channels_first':
-      c_axis, d_axis, h_axis, w_axis = 1, 2, 3, 4
-    else:
-      c_axis, d_axis, h_axis, w_axis = 4, 1, 2, 3
-
-    kernel_d, kernel_h, kernel_w = self.kernel_size
-    stride_d, stride_h, stride_w = self.strides
-
-    output_shape[c_axis] = self.filters
-    output_shape[d_axis] = utils.deconv_output_length(
-        output_shape[d_axis], kernel_d, self.padding, stride_d)
-    output_shape[h_axis] = utils.deconv_output_length(
-        output_shape[h_axis], kernel_h, self.padding, stride_h)
-    output_shape[w_axis] = utils.deconv_output_length(
-        output_shape[w_axis], kernel_w, self.padding, stride_w)
-    return tensor_shape.TensorShape(output_shape)
 
 
 @tf_export('layers.conv3d_transpose')
diff --git a/tensorflow/python/layers/core.py b/tensorflow/python/layers/core.py
index e598d9f83ab21f2dd5fabb3dd37fa0bfb5f003a4..6d8e9eac878bb2eb65bfa29e872a0576a39af662 100644
--- a/tensorflow/python/layers/core.py
+++ b/tensorflow/python/layers/core.py
@@ -27,23 +27,14 @@ import six
 from six.moves import xrange  # pylint: disable=redefined-builtin
 import numpy as np
 
-from tensorflow.python.eager import context
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_shape
+from tensorflow.python.keras._impl.keras import layers as keras_layers
 from tensorflow.python.layers import base
-from tensorflow.python.layers import utils
-from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import init_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import gen_math_ops
-from tensorflow.python.ops import nn
-from tensorflow.python.ops import nn_ops
-from tensorflow.python.ops import standard_ops
 from tensorflow.python.util.tf_export import tf_export
 
 
 @tf_export('layers.Dense')
-class Dense(base.Layer):
+class Dense(keras_layers.Dense, base.Layer):
   """Densely-connected layer class.
 
   This layer implements the operation:
@@ -108,73 +99,19 @@ class Dense(base.Layer):
                trainable=True,
                name=None,
                **kwargs):
-    super(Dense, self).__init__(trainable=trainable, name=name,
+    super(Dense, self).__init__(units=units,
+                                activation=activation,
+                                use_bias=use_bias,
+                                kernel_initializer=kernel_initializer,
+                                bias_initializer=bias_initializer,
+                                kernel_regularizer=kernel_regularizer,
+                                bias_regularizer=bias_regularizer,
                                 activity_regularizer=activity_regularizer,
+                                kernel_constraint=kernel_constraint,
+                                bias_constraint=bias_constraint,
+                                trainable=trainable,
+                                name=name,
                                 **kwargs)
-    self.units = units
-    self.activation = activation
-    self.use_bias = use_bias
-    self.kernel_initializer = kernel_initializer
-    self.bias_initializer = bias_initializer
-    self.kernel_regularizer = kernel_regularizer
-    self.bias_regularizer = bias_regularizer
-    self.kernel_constraint = kernel_constraint
-    self.bias_constraint = bias_constraint
-    self.input_spec = base.InputSpec(min_ndim=2)
-
-  def build(self, input_shape):
-    input_shape = tensor_shape.TensorShape(input_shape)
-    if input_shape[-1].value is None:
-      raise ValueError('The last dimension of the inputs to `Dense` '
-                       'should be defined. Found `None`.')
-    self.input_spec = base.InputSpec(min_ndim=2,
-                                     axes={-1: input_shape[-1].value})
-    self.kernel = self.add_variable('kernel',
-                                    shape=[input_shape[-1].value, self.units],
-                                    initializer=self.kernel_initializer,
-                                    regularizer=self.kernel_regularizer,
-                                    constraint=self.kernel_constraint,
-                                    dtype=self.dtype,
-                                    trainable=True)
-    if self.use_bias:
-      self.bias = self.add_variable('bias',
-                                    shape=[self.units,],
-                                    initializer=self.bias_initializer,
-                                    regularizer=self.bias_regularizer,
-                                    constraint=self.bias_constraint,
-                                    dtype=self.dtype,
-                                    trainable=True)
-    else:
-      self.bias = None
-    self.built = True
-
-  def call(self, inputs):
-    inputs = ops.convert_to_tensor(inputs, dtype=self.dtype)
-    shape = inputs.get_shape().as_list()
-    if len(shape) > 2:
-      # Broadcasting is required for the inputs.
-      outputs = standard_ops.tensordot(inputs, self.kernel, [[len(shape) - 1],
-                                                             [0]])
-      # Reshape the output back to the original ndim of the input.
-      if not context.executing_eagerly():
-        output_shape = shape[:-1] + [self.units]
-        outputs.set_shape(output_shape)
-    else:
-      outputs = gen_math_ops.mat_mul(inputs, self.kernel)
-    if self.use_bias:
-      outputs = nn.bias_add(outputs, self.bias)
-    if self.activation is not None:
-      return self.activation(outputs)  # pylint: disable=not-callable
-    return outputs
-
-  def compute_output_shape(self, input_shape):
-    input_shape = tensor_shape.TensorShape(input_shape)
-    input_shape = input_shape.with_rank_at_least(2)
-    if input_shape[-1].value is None:
-      raise ValueError(
-          'The innermost dimension of input_shape must be defined, but saw: %s'
-          % input_shape)
-    return input_shape[:-1].concatenate(self.units)
 
 
 @tf_export('layers.dense')
@@ -254,7 +191,7 @@ def dense(
 
 
 @tf_export('layers.Dropout')
-class Dropout(base.Layer):
+class Dropout(keras_layers.Dropout, base.Layer):
   """Applies Dropout to the input.
 
   Dropout consists in randomly setting a fraction `rate` of input units to 0
@@ -282,31 +219,14 @@ class Dropout(base.Layer):
                seed=None,
                name=None,
                **kwargs):
-    super(Dropout, self).__init__(name=name, **kwargs)
-    self.rate = rate
-    self.noise_shape = noise_shape
-    self.seed = seed
-
-  def _get_noise_shape(self, inputs):
-    # Subclasses of `Dropout` may implement `_get_noise_shape(self, inputs)`,
-    # which will override `self.noise_shape`, and allows for custom noise
-    # shapes with dynamically sized inputs.
-    if self.noise_shape is None:
-      return self.noise_shape
-    return nn_ops._get_noise_shape(inputs, self.noise_shape)
+    super(Dropout, self).__init__(rate=rate,
+                                  noise_shape=noise_shape,
+                                  seed=seed,
+                                  name=name,
+                                  **kwargs)
 
   def call(self, inputs, training=False):
-
-    def dropped_inputs():
-      return nn.dropout(inputs, 1  - self.rate,
-                        noise_shape=self._get_noise_shape(inputs),
-                        seed=self.seed)
-    return utils.smart_cond(training,
-                            dropped_inputs,
-                            lambda: array_ops.identity(inputs))
-
-  def compute_output_shape(self, input_shape):
-    return input_shape
+    return super(Dropout, self).call(inputs, training=training)
 
 
 @tf_export('layers.dropout')
@@ -352,7 +272,7 @@ def dropout(inputs,
 
 
 @tf_export('layers.Flatten')
-class Flatten(base.Layer):
+class Flatten(keras_layers.Flatten, base.Layer):
   """Flattens an input tensor while preserving the batch axis (axis 0).
 
   Examples:
@@ -367,25 +287,7 @@ class Flatten(base.Layer):
     # now `y` has shape `(None, None)`
   ```
   """
-
-  def __init__(self, **kwargs):
-    super(Flatten, self).__init__(**kwargs)
-    self.input_spec = base.InputSpec(min_ndim=2)
-
-  def call(self, inputs):
-    outputs = array_ops.reshape(inputs, (array_ops.shape(inputs)[0], -1))
-    if not context.executing_eagerly():
-      outputs.set_shape(self.compute_output_shape(inputs.get_shape()))
-    return outputs
-
-  def compute_output_shape(self, input_shape):
-    input_shape = tensor_shape.TensorShape(input_shape).as_list()
-    output_shape = [input_shape[0]]
-    if all(input_shape[1:]):
-      output_shape += [np.prod(input_shape[1:])]
-    else:
-      output_shape += [None]
-    return tensor_shape.TensorShape(output_shape)
+  pass
 
 
 @tf_export('layers.flatten')
diff --git a/tensorflow/python/layers/normalization.py b/tensorflow/python/layers/normalization.py
index 29fb92ccb59aef83448cff8fd1bd759c4fda5abf..33284b0d695272db5a4e0d757d6f24b1930068de 100644
--- a/tensorflow/python/layers/normalization.py
+++ b/tensorflow/python/layers/normalization.py
@@ -24,26 +24,14 @@ import six
 from six.moves import xrange  # pylint: disable=redefined-builtin
 import numpy as np
 
-from tensorflow.python.eager import context
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_shape
+from tensorflow.python.keras._impl.keras import layers as keras_layers
 from tensorflow.python.layers import base
-from tensorflow.python.layers import utils
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import nn
-from tensorflow.python.ops import gen_resource_variable_ops
-from tensorflow.python.ops import resource_variable_ops
-from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import init_ops
-from tensorflow.python.ops import state_ops
-from tensorflow.python.training import moving_averages
 from tensorflow.python.util.tf_export import tf_export
 
 
 @tf_export('layers.BatchNormalization')
-class BatchNormalization(base.Layer):
+class BatchNormalization(keras_layers.BatchNormalization, base.Layer):
   """Batch Normalization layer from http://arxiv.org/abs/1502.03167.
 
   "Batch Normalization: Accelerating Deep Network Training by Reducing
@@ -143,489 +131,31 @@ class BatchNormalization(base.Layer):
                name=None,
                **kwargs):
     super(BatchNormalization, self).__init__(
-        name=name, trainable=trainable, **kwargs)
-    if isinstance(axis, list):
-      self.axis = axis[:]
-    else:
-      self.axis = axis
-    self.momentum = momentum
-    self.epsilon = epsilon
-    self.center = center
-    self.scale = scale
-    self.beta_initializer = beta_initializer
-    self.gamma_initializer = gamma_initializer
-    self.moving_mean_initializer = moving_mean_initializer
-    self.moving_variance_initializer = moving_variance_initializer
-    self.beta_regularizer = beta_regularizer
-    self.gamma_regularizer = gamma_regularizer
-    self.beta_constraint = beta_constraint
-    self.gamma_constraint = gamma_constraint
-    self.renorm = renorm
-    self.virtual_batch_size = virtual_batch_size
-    self.adjustment = adjustment
-    if fused is None:
-      fused = True
-
-    self.fused = fused
-    self._bessels_correction_test_only = True
-
-    if renorm:
-      renorm_clipping = renorm_clipping or {}
-      keys = ['rmax', 'rmin', 'dmax']
-      if set(renorm_clipping) - set(keys):
-        raise ValueError('renorm_clipping %s contains keys not in %s' %
-                         (renorm_clipping, keys))
-      self.renorm_clipping = renorm_clipping
-      self.renorm_momentum = renorm_momentum
-
-  def build(self, input_shape):
-    input_shape = tensor_shape.TensorShape(input_shape)
-    if not input_shape.ndims:
-      raise ValueError('Input has undefined rank:', input_shape)
-    ndims = len(input_shape)
-
-    # Convert axis to list and resolve negatives
-    if isinstance(self.axis, int):
-      self.axis = [self.axis]
-
-    if not isinstance(self.axis, list):
-      raise TypeError('axis must be int or list, type given: %s'
-                      % type(self.axis))
-
-    for idx, x in enumerate(self.axis):
-      if x < 0:
-        self.axis[idx] = ndims + x
-
-    # Validate axes
-    for x in self.axis:
-      if x < 0 or x >= ndims:
-        raise ValueError('Invalid axis: %d' % x)
-    if len(self.axis) != len(set(self.axis)):
-      raise ValueError('Duplicate axis: %s' % self.axis)
-
-    if self.virtual_batch_size is not None:
-      if self.virtual_batch_size <= 0:
-        raise ValueError('virtual_batch_size must be a positive integer that '
-                         'divides the true batch size of the input Tensor')
-      # If using virtual batches, the first dimension must be the batch
-      # dimension and cannot be the batch norm axis
-      if 0 in self.axis:
-        raise ValueError('When using virtual_batch_size, the batch dimension '
-                         'must be 0 and thus axis cannot include 0')
-      if self.adjustment is not None:
-        raise ValueError('When using virtual_batch_size, adjustment cannot '
-                         'be specified')
-
-    if self.fused:
-      # Currently fused batch norm doesn't support renorm. It also only supports
-      # an input tensor of rank 4 and a channel dimension on axis 1 or 3.
-      # TODO(yaozhang): if input is not 4D, reshape it to 4D and reshape the
-      # output back to its original shape accordingly.
-      self.fused = (not self.renorm and
-                    ndims == 4 and
-                    self.axis in [[1], [3]] and
-                    self.virtual_batch_size is None and
-                    self.adjustment is None)
-      # TODO(chrisying): fused batch norm is currently not supported for
-      # multi-axis batch norm and by extension virtual batches. In some cases,
-      # it might be possible to use fused batch norm but would require reshaping
-      # the Tensor to 4D with the axis in 1 or 3 (preferred 1) which is
-      # particularly tricky. A compromise might be to just support the most
-      # common use case (turning 5D w/ virtual batch to NCHW)
-
-    if self.fused:
-      if self.axis == [1]:
-        self._data_format = 'NCHW'
-      elif self.axis == [3]:
-        self._data_format = 'NHWC'
-      else:
-        raise ValueError('Unsupported axis, fused batch norm only supports '
-                         'axis == [1] or axis == [3]')
-
-    # Raise parameters of fp16 batch norm to fp32
-    if self.dtype == dtypes.float16 or self.dtype == dtypes.bfloat16:
-      param_dtype = dtypes.float32
-    else:
-      param_dtype = self.dtype or dtypes.float32
-
-    axis_to_dim = {x: input_shape[x].value for x in self.axis}
-    for x in axis_to_dim:
-      if axis_to_dim[x] is None:
-        raise ValueError('Input has undefined `axis` dimension. Input shape: ',
-                         input_shape)
-    self.input_spec = base.InputSpec(ndim=ndims, axes=axis_to_dim)
-
-    if len(axis_to_dim) == 1 and self.virtual_batch_size is None:
-      # Single axis batch norm (most common/default use-case)
-      param_shape = (list(axis_to_dim.values())[0],)
-    else:
-      # Parameter shape is the original shape but with 1 in all non-axis dims
-      param_shape = [axis_to_dim[i] if i in axis_to_dim
-                     else 1 for i in range(ndims)]
-      if self.virtual_batch_size is not None:
-        # When using virtual batches, add an extra dim at index 1
-        param_shape.insert(1, 1)
-        for idx, x in enumerate(self.axis):
-          self.axis[idx] = x + 1      # Account for added dimension
-
-    if self.scale:
-      self.gamma = self.add_variable(
-          name='gamma',
-          shape=param_shape,
-          dtype=param_dtype,
-          initializer=self.gamma_initializer,
-          regularizer=self.gamma_regularizer,
-          constraint=self.gamma_constraint,
-          trainable=True)
-    else:
-      self.gamma = None
-      if self.fused:
-        self._gamma_const = array_ops.constant(
-            1.0, dtype=param_dtype, shape=param_shape)
-
-    if self.center:
-      self.beta = self.add_variable(
-          name='beta',
-          shape=param_shape,
-          dtype=param_dtype,
-          initializer=self.beta_initializer,
-          regularizer=self.beta_regularizer,
-          constraint=self.beta_constraint,
-          trainable=True)
-    else:
-      self.beta = None
-      if self.fused:
-        self._beta_const = array_ops.constant(
-            0.0, dtype=param_dtype, shape=param_shape)
-
-    # Disable variable partitioning when creating the moving mean and variance
-    try:
-      if self._scope:
-        partitioner = self._scope.partitioner
-        self._scope.set_partitioner(None)
-      else:
-        partitioner = None
-      self.moving_mean = self.add_variable(
-          name='moving_mean',
-          shape=param_shape,
-          dtype=param_dtype,
-          initializer=self.moving_mean_initializer,
-          trainable=False)
-
-      self.moving_variance = self.add_variable(
-          name='moving_variance',
-          shape=param_shape,
-          dtype=param_dtype,
-          initializer=self.moving_variance_initializer,
-          trainable=False)
-
-      if self.renorm:
-        # Create variables to maintain the moving mean and standard deviation.
-        # These are used in training and thus are different from the moving
-        # averages above. The renorm variables are colocated with moving_mean
-        # and moving_variance.
-        # NOTE: below, the outer `with device` block causes the current device
-        # stack to be cleared. The nested ones use a `lambda` to set the desired
-        # device and ignore any devices that may be set by the custom getter.
-        def _renorm_variable(name, shape):
-          var = self.add_variable(
-              name=name,
-              shape=shape,
-              dtype=param_dtype,
-              initializer=init_ops.zeros_initializer(),
-              trainable=False)
-          return var
-
-        with ops.device(None):
-          device = (
-              self.moving_mean.device if context.executing_eagerly() else
-              (lambda _: self.moving_mean.device))
-          with ops.device(device):
-            self.renorm_mean = _renorm_variable('renorm_mean', param_shape)
-            self.renorm_mean_weight = _renorm_variable('renorm_mean_weight', ())
-          # We initialize renorm_stddev to 0, and maintain the (0-initialized)
-          # renorm_stddev_weight. This allows us to (1) mix the average
-          # stddev with the minibatch stddev early in training, and (2) compute
-          # the unbiased average stddev by dividing renorm_stddev by the weight.
-          device = (
-              self.moving_variance.device if context.executing_eagerly() else
-              (lambda _: self.moving_variance.device))
-          with ops.device(device):
-            self.renorm_stddev = _renorm_variable('renorm_stddev', param_shape)
-            self.renorm_stddev_weight = _renorm_variable(
-                'renorm_stddev_weight', ())
-    finally:
-      if partitioner:
-        self._scope.set_partitioner(partitioner)
-    self.built = True
-
-  def _assign_moving_average(self, variable, value, momentum):
-    with ops.name_scope(None, 'AssignMovingAvg',
-                        [variable, value, momentum]) as scope:
-      with ops.colocate_with(variable):
-        decay = ops.convert_to_tensor(1.0 - momentum, name='decay')
-        if decay.dtype != variable.dtype.base_dtype:
-          decay = math_ops.cast(decay, variable.dtype.base_dtype)
-        update_delta = (variable - value) * decay
-        return state_ops.assign_sub(variable, update_delta, name=scope)
-
-  def _fused_batch_norm(self, inputs, training):
-    """Returns the output of fused batch norm."""
-    beta = self.beta if self.center else self._beta_const
-    gamma = self.gamma if self.scale else self._gamma_const
-
-    def _fused_batch_norm_training():
-      return nn.fused_batch_norm(
-          inputs,
-          gamma,
-          beta,
-          epsilon=self.epsilon,
-          data_format=self._data_format)
-
-    def _fused_batch_norm_inference():
-      return nn.fused_batch_norm(
-          inputs,
-          gamma,
-          beta,
-          mean=self.moving_mean,
-          variance=self.moving_variance,
-          epsilon=self.epsilon,
-          is_training=False,
-          data_format=self._data_format)
-
-    output, mean, variance = utils.smart_cond(
-        training, _fused_batch_norm_training, _fused_batch_norm_inference)
-    if not self._bessels_correction_test_only:
-      # Remove Bessel's correction to be consistent with non-fused batch norm.
-      # Note that the variance computed by fused batch norm is
-      # with Bessel's correction.
-      sample_size = math_ops.cast(
-          array_ops.size(inputs) / array_ops.size(variance), variance.dtype)
-      factor = (sample_size - math_ops.cast(1.0, variance.dtype)) / sample_size
-      variance *= factor
-
-    training_value = utils.constant_value(training)
-    if training_value is None:
-      momentum = utils.smart_cond(training, lambda: self.momentum, lambda: 1.0)
-    else:
-      momentum = ops.convert_to_tensor(self.momentum)
-    if training_value or training_value is None:
-      mean_update = self._assign_moving_average(self.moving_mean, mean,
-                                                momentum)
-      variance_update = self._assign_moving_average(self.moving_variance,
-                                                    variance, momentum)
-      self.add_update(mean_update, inputs=inputs)
-      self.add_update(variance_update, inputs=inputs)
-
-    return output
-
-  def _renorm_correction_and_moments(self, mean, variance, training):
-    """Returns the correction and update values for renorm."""
-    stddev = math_ops.sqrt(variance + self.epsilon)
-    # Compute the average mean and standard deviation, as if they were
-    # initialized with this batch's moments.
-    mixed_renorm_mean = (self.renorm_mean +
-                         (1. - self.renorm_mean_weight) * mean)
-    mixed_renorm_stddev = (self.renorm_stddev +
-                           (1. - self.renorm_stddev_weight) * stddev)
-    # Compute the corrections for batch renorm.
-    r = stddev / mixed_renorm_stddev
-    d = (mean - mixed_renorm_mean) / mixed_renorm_stddev
-    # Ensure the corrections use pre-update moving averages.
-    with ops.control_dependencies([r, d]):
-      mean = array_ops.identity(mean)
-      stddev = array_ops.identity(stddev)
-    rmin, rmax, dmax = [self.renorm_clipping.get(key)
-                        for key in ['rmin', 'rmax', 'dmax']]
-    if rmin is not None:
-      r = math_ops.maximum(r, rmin)
-    if rmax is not None:
-      r = math_ops.minimum(r, rmax)
-    if dmax is not None:
-      d = math_ops.maximum(d, -dmax)
-      d = math_ops.minimum(d, dmax)
-    # When not training, use r=1, d=0.
-    r = utils.smart_cond(training, lambda: r, lambda: array_ops.ones_like(r))
-    d = utils.smart_cond(training, lambda: d, lambda: array_ops.zeros_like(d))
-
-    def _update_renorm_variable(var, weight, value):
-      """Updates a moving average and weight, returns the unbiased value."""
-      value = array_ops.identity(value)
-      def _do_update():
-        """Updates the var and weight, returns their updated ratio."""
-        # Update the variables without zero debiasing. The debiasing will be
-        # accomplished by dividing the exponential moving average by the weight.
-        # For example, after a single update, the moving average would be
-        # (1-decay) * value. and the weight will be 1-decay, with their ratio
-        # giving the value.
-        # Make sure the weight is not updated until before r and d computation.
-        with ops.control_dependencies([value]):
-          weight_value = array_ops.constant(1., dtype=weight.dtype)
-        new_var = self._assign_moving_average(var, value, self.renorm_momentum)
-        new_weight = self._assign_moving_average(weight, weight_value,
-                                                 self.renorm_momentum)
-        # TODO(yuefengz): the updates to var and weighted can not be batched
-        # together if we fetch their updated values here. Consider calculating
-        # new values and delaying the updates.
-        return new_var / new_weight
-
-      def _fake_update():
-        return array_ops.identity(var)
-      return utils.smart_cond(training, _do_update, _fake_update)
-
-    with ops.colocate_with(self.moving_mean):
-      new_mean = _update_renorm_variable(self.renorm_mean,
-                                         self.renorm_mean_weight,
-                                         mean)
-    with ops.colocate_with(self.moving_variance):
-      new_stddev = _update_renorm_variable(self.renorm_stddev,
-                                           self.renorm_stddev_weight,
-                                           stddev)
-      # Make sqrt(moving_variance + epsilon) = new_stddev.
-      new_variance = math_ops.square(new_stddev) - self.epsilon
-
-    return (r, d, new_mean, new_variance)
+        axis=axis,
+        momentum=momentum,
+        epsilon=epsilon,
+        center=center,
+        scale=scale,
+        beta_initializer=beta_initializer,
+        gamma_initializer=gamma_initializer,
+        moving_mean_initializer=moving_mean_initializer,
+        moving_variance_initializer=moving_variance_initializer,
+        beta_regularizer=beta_regularizer,
+        gamma_regularizer=gamma_regularizer,
+        beta_constraint=beta_constraint,
+        gamma_constraint=gamma_constraint,
+        renorm=renorm,
+        renorm_clipping=renorm_clipping,
+        renorm_momentum=renorm_momentum,
+        fused=fused,
+        trainable=trainable,
+        virtual_batch_size=virtual_batch_size,
+        adjustment=adjustment,
+        name=name,
+        **kwargs)
 
   def call(self, inputs, training=False):
-    in_eager_mode = context.executing_eagerly()
-    if self.virtual_batch_size is not None:
-      # Virtual batches (aka ghost batches) can be simulated by reshaping the
-      # Tensor and reusing the existing batch norm implementation
-      original_shape = [-1] + inputs.shape.as_list()[1:]
-      expanded_shape = [self.virtual_batch_size, -1] + original_shape[1:]
-
-      # Will cause errors if virtual_batch_size does not divide the batch size
-      inputs = array_ops.reshape(inputs, expanded_shape)
-
-      def undo_virtual_batching(outputs):
-        outputs = array_ops.reshape(outputs, original_shape)
-        return outputs
-
-    if self.fused:
-      outputs = self._fused_batch_norm(inputs, training=training)
-      if self.virtual_batch_size is not None:
-        # Currently never reaches here since fused_batch_norm does not support
-        # virtual batching
-        return undo_virtual_batching(outputs)
-      return outputs
-
-    # Compute the axes along which to reduce the mean / variance
-    input_shape = inputs.get_shape()
-    ndims = len(input_shape)
-    reduction_axes = [i for i in range(ndims) if i not in self.axis]
-    if self.virtual_batch_size is not None:
-      del reduction_axes[1]     # Do not reduce along virtual batch dim
-
-    # Broadcasting only necessary for single-axis batch norm where the axis is
-    # not the last dimension
-    broadcast_shape = [1] * ndims
-    broadcast_shape[self.axis[0]] = input_shape[self.axis[0]].value
-    def _broadcast(v):
-      if (v is not None and
-          len(v.get_shape()) != ndims and
-          reduction_axes != list(range(ndims - 1))):
-        return array_ops.reshape(v, broadcast_shape)
-      return v
-
-    scale, offset = _broadcast(self.gamma), _broadcast(self.beta)
-
-    def _compose_transforms(scale, offset, then_scale, then_offset):
-      if then_scale is not None:
-        scale *= then_scale
-        offset *= then_scale
-      if then_offset is not None:
-        offset += then_offset
-      return (scale, offset)
-
-    # Determine a boolean value for `training`: could be True, False, or None.
-    training_value = utils.constant_value(training)
-    if training_value is not False:
-      if self.adjustment:
-        adj_scale, adj_bias = self.adjustment(array_ops.shape(inputs))
-        # Adjust only during training.
-        adj_scale = utils.smart_cond(training,
-                                     lambda: adj_scale,
-                                     lambda: array_ops.ones_like(adj_scale))
-        adj_bias = utils.smart_cond(training,
-                                    lambda: adj_bias,
-                                    lambda: array_ops.zeros_like(adj_bias))
-        scale, offset = _compose_transforms(adj_scale, adj_bias, scale, offset)
-
-      # Some of the computations here are not necessary when training==False
-      # but not a constant. However, this makes the code simpler.
-      keep_dims = self.virtual_batch_size is not None or len(self.axis) > 1
-      mean, variance = nn.moments(inputs, reduction_axes, keep_dims=keep_dims)
-
-      moving_mean = self.moving_mean
-      moving_variance = self.moving_variance
-
-      mean = utils.smart_cond(training,
-                              lambda: mean,
-                              lambda: moving_mean)
-      variance = utils.smart_cond(training,
-                                  lambda: variance,
-                                  lambda: moving_variance)
-
-      if self.renorm:
-        r, d, new_mean, new_variance = self._renorm_correction_and_moments(
-            mean, variance, training)
-        # When training, the normalized values (say, x) will be transformed as
-        # x * gamma + beta without renorm, and (x * r + d) * gamma + beta
-        # = x * (r * gamma) + (d * gamma + beta) with renorm.
-        r = _broadcast(array_ops.stop_gradient(r, name='renorm_r'))
-        d = _broadcast(array_ops.stop_gradient(d, name='renorm_d'))
-        scale, offset = _compose_transforms(r, d, scale, offset)
-      else:
-        new_mean, new_variance = mean, variance
-
-      if self.virtual_batch_size is not None:
-        # This isn't strictly correct since in ghost batch norm, you are
-        # supposed to sequentially update the moving_mean and moving_variance
-        # with each sub-batch. However, since the moving statistics are only
-        # used during evaluation, it is more efficient to just update in one
-        # step and should not make a significant difference in the result.
-        new_mean = math_ops.reduce_mean(new_mean,
-                                        axis=1, keep_dims=True)
-        new_variance = math_ops.reduce_mean(new_variance,
-                                            axis=1, keep_dims=True)
-
-      def _do_update(var, value):
-        if in_eager_mode and not self.trainable:
-          return
-
-        return self._assign_moving_average(var, value, self.momentum)
-
-      mean_update = utils.smart_cond(
-          training,
-          lambda: _do_update(self.moving_mean, new_mean),
-          lambda: self.moving_mean)
-      variance_update = utils.smart_cond(
-          training,
-          lambda: _do_update(self.moving_variance, new_variance),
-          lambda: self.moving_variance)
-      if not context.executing_eagerly():
-        self.add_update(mean_update, inputs=inputs)
-        self.add_update(variance_update, inputs=inputs)
-
-    else:
-      mean, variance = self.moving_mean, self.moving_variance
-
-    outputs = nn.batch_normalization(inputs,
-                                     _broadcast(mean),
-                                     _broadcast(variance),
-                                     offset,
-                                     scale,
-                                     self.epsilon)
-    # If some components of the shape got lost due to adjustments, fix that.
-    outputs.set_shape(input_shape)
-
-    if self.virtual_batch_size is not None:
-      return undo_virtual_batching(outputs)
-
-    return outputs
-
-  def compute_output_shape(self, input_shape):
-    return input_shape
+    return super(BatchNormalization, self).call(inputs, training=training)
 
 
 @tf_export('layers.batch_normalization')
diff --git a/tensorflow/python/layers/pooling.py b/tensorflow/python/layers/pooling.py
index 50503ce093fbc251b11c4d5cbccb2a2683d92e7a..75abe56f51f2a206ea3e5a5dad032446c150293a 100644
--- a/tensorflow/python/layers/pooling.py
+++ b/tensorflow/python/layers/pooling.py
@@ -13,92 +13,19 @@
 # limitations under the License.
 # =============================================================================
 
-# pylint: disable=unused-import,g-bad-import-order
 """Contains the pooling layer classes and their functional aliases.
 """
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.eager import context
-from tensorflow.python.framework import tensor_shape
+from tensorflow.python.keras._impl.keras import layers as keras_layers
 from tensorflow.python.layers import base
-from tensorflow.python.layers import utils
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import nn
 from tensorflow.python.util.tf_export import tf_export
 
 
-class _Pooling1D(base.Layer):
-  """Pooling layer for arbitrary pooling functions, for 1D inputs.
-
-  This class only exists for code reuse. It will never be an exposed API.
-
-  Arguments:
-    pool_function: The pooling function to apply, e.g. `tf.nn.max_pool`.
-    pool_size: An integer or tuple/list of a single integer,
-      representing the size of the pooling window.
-    strides: An integer or tuple/list of a single integer, specifying the
-      strides of the pooling operation.
-    padding: A string. The padding method, either 'valid' or 'same'.
-      Case-insensitive.
-    data_format: A string, one of `channels_last` (default) or `channels_first`.
-      The ordering of the dimensions in the inputs.
-      `channels_last` corresponds to inputs with shape
-      `(batch, length, channels)` while `channels_first` corresponds to
-      inputs with shape `(batch, channels, length)`.
-    name: A string, the name of the layer.
-  """
-
-  def __init__(self, pool_function, pool_size, strides,
-               padding='valid', data_format='channels_last',
-               name=None, **kwargs):
-    super(_Pooling1D, self).__init__(name=name, **kwargs)
-    self.pool_function = pool_function
-    self.pool_size = utils.normalize_tuple(pool_size, 1, 'pool_size')
-    self.strides = utils.normalize_tuple(strides, 1, 'strides')
-    self.padding = utils.normalize_padding(padding)
-    self.data_format = utils.normalize_data_format(data_format)
-    self.input_spec = base.InputSpec(ndim=3)
-
-  def call(self, inputs):
-    # There is no TF op for 1D pooling, hence we make the inputs 4D.
-    if self.data_format == 'channels_last':
-      # input is NWC, make it NHWC
-      inputs = array_ops.expand_dims(inputs, 1)
-      # pool on the W dim
-      pool_shape = (1, 1) + self.pool_size + (1,)
-      strides = (1, 1) + self.strides + (1,)
-      data_format = 'NHWC'
-    else:
-      # input is NCW, make it NCHW
-      inputs = array_ops.expand_dims(inputs, 2)
-      # pool on the W dim
-      pool_shape = (1, 1, 1) + self.pool_size
-      strides = (1, 1, 1) + self.strides
-      data_format = 'NCHW'
-
-    outputs = self.pool_function(
-        inputs,
-        ksize=pool_shape,
-        strides=strides,
-        padding=self.padding.upper(),
-        data_format=data_format)
-
-    if self.data_format == 'channels_last':
-      return array_ops.squeeze(outputs, 1)
-    else:
-      return array_ops.squeeze(outputs, 2)
-
-  def compute_output_shape(self, input_shape):
-    input_shape = tensor_shape.TensorShape(input_shape).as_list()
-    length = utils.conv_output_length(input_shape[1], self.pool_size[0],
-                                      self.padding, self.strides[0])
-    return tensor_shape.TensorShape([input_shape[0], length, input_shape[2]])
-
-
 @tf_export('layers.AveragePooling1D')
-class AveragePooling1D(_Pooling1D):
+class AveragePooling1D(keras_layers.AveragePooling1D, base.Layer):
   """Average Pooling layer for 1D inputs.
 
   Arguments:
@@ -119,8 +46,9 @@ class AveragePooling1D(_Pooling1D):
   def __init__(self, pool_size, strides,
                padding='valid', data_format='channels_last',
                name=None, **kwargs):
+    if strides is None:
+      raise ValueError('Argument `strides` must not be None.')
     super(AveragePooling1D, self).__init__(
-        nn.avg_pool,
         pool_size=pool_size,
         strides=strides,
         padding=padding,
@@ -165,7 +93,7 @@ def average_pooling1d(inputs, pool_size, strides,
 
 
 @tf_export('layers.MaxPooling1D')
-class MaxPooling1D(_Pooling1D):
+class MaxPooling1D(keras_layers.MaxPooling1D, base.Layer):
   """Max Pooling layer for 1D inputs.
 
   Arguments:
@@ -186,8 +114,9 @@ class MaxPooling1D(_Pooling1D):
   def __init__(self, pool_size, strides,
                padding='valid', data_format='channels_last',
                name=None, **kwargs):
+    if strides is None:
+      raise ValueError('Argument `strides` must not be None.')
     super(MaxPooling1D, self).__init__(
-        nn.max_pool,
         pool_size=pool_size,
         strides=strides,
         padding=padding,
@@ -231,79 +160,8 @@ def max_pooling1d(inputs, pool_size, strides,
   return layer.apply(inputs)
 
 
-class _Pooling2D(base.Layer):
-  """Pooling layer for arbitrary pooling functions, for 2D inputs (e.g. images).
-
-  This class only exists for code reuse. It will never be an exposed API.
-
-  Arguments:
-    pool_function: The pooling function to apply, e.g. `tf.nn.max_pool`.
-    pool_size: An integer or tuple/list of 2 integers: (pool_height, pool_width)
-      specifying the size of the pooling window.
-      Can be a single integer to specify the same value for
-      all spatial dimensions.
-    strides: An integer or tuple/list of 2 integers,
-      specifying the strides of the pooling operation.
-      Can be a single integer to specify the same value for
-      all spatial dimensions.
-    padding: A string. The padding method, either 'valid' or 'same'.
-      Case-insensitive.
-    data_format: A string, one of `channels_last` (default) or `channels_first`.
-      The ordering of the dimensions in the inputs.
-      `channels_last` corresponds to inputs with shape
-      `(batch, height, width, channels)` while `channels_first` corresponds to
-      inputs with shape `(batch, channels, height, width)`.
-    name: A string, the name of the layer.
-  """
-
-  def __init__(self, pool_function, pool_size, strides,
-               padding='valid', data_format='channels_last',
-               name=None, **kwargs):
-    super(_Pooling2D, self).__init__(name=name, **kwargs)
-    self.pool_function = pool_function
-    self.pool_size = utils.normalize_tuple(pool_size, 2, 'pool_size')
-    self.strides = utils.normalize_tuple(strides, 2, 'strides')
-    self.padding = utils.normalize_padding(padding)
-    self.data_format = utils.normalize_data_format(data_format)
-    self.input_spec = base.InputSpec(ndim=4)
-
-  def call(self, inputs):
-    if self.data_format == 'channels_last':
-      pool_shape = (1,) + self.pool_size + (1,)
-      strides = (1,) + self.strides + (1,)
-    else:
-      pool_shape = (1, 1) + self.pool_size
-      strides = (1, 1) + self.strides
-    outputs = self.pool_function(
-        inputs,
-        ksize=pool_shape,
-        strides=strides,
-        padding=self.padding.upper(),
-        data_format=utils.convert_data_format(self.data_format, 4))
-    return outputs
-
-  def compute_output_shape(self, input_shape):
-    input_shape = tensor_shape.TensorShape(input_shape).as_list()
-    if self.data_format == 'channels_first':
-      rows = input_shape[2]
-      cols = input_shape[3]
-    else:
-      rows = input_shape[1]
-      cols = input_shape[2]
-    rows = utils.conv_output_length(rows, self.pool_size[0], self.padding,
-                                    self.strides[0])
-    cols = utils.conv_output_length(cols, self.pool_size[1], self.padding,
-                                    self.strides[1])
-    if self.data_format == 'channels_first':
-      return tensor_shape.TensorShape(
-          [input_shape[0], input_shape[1], rows, cols])
-    else:
-      return tensor_shape.TensorShape(
-          [input_shape[0], rows, cols, input_shape[3]])
-
-
 @tf_export('layers.AveragePooling2D')
-class AveragePooling2D(_Pooling2D):
+class AveragePooling2D(keras_layers.AveragePooling2D, base.Layer):
   """Average pooling layer for 2D inputs (e.g. images).
 
   Arguments:
@@ -328,8 +186,9 @@ class AveragePooling2D(_Pooling2D):
   def __init__(self, pool_size, strides,
                padding='valid', data_format='channels_last',
                name=None, **kwargs):
+    if strides is None:
+      raise ValueError('Argument `strides` must not be None.')
     super(AveragePooling2D, self).__init__(
-        nn.avg_pool,
         pool_size=pool_size, strides=strides,
         padding=padding, data_format=data_format, name=name, **kwargs)
 
@@ -373,7 +232,7 @@ def average_pooling2d(inputs,
 
 
 @tf_export('layers.MaxPooling2D')
-class MaxPooling2D(_Pooling2D):
+class MaxPooling2D(keras_layers.MaxPooling2D, base.Layer):
   """Max pooling layer for 2D inputs (e.g. images).
 
   Arguments:
@@ -398,8 +257,9 @@ class MaxPooling2D(_Pooling2D):
   def __init__(self, pool_size, strides,
                padding='valid', data_format='channels_last',
                name=None, **kwargs):
+    if strides is None:
+      raise ValueError('Argument `strides` must not be None.')
     super(MaxPooling2D, self).__init__(
-        nn.max_pool,
         pool_size=pool_size, strides=strides,
         padding=padding, data_format=data_format, name=name, **kwargs)
 
@@ -442,90 +302,8 @@ def max_pooling2d(inputs,
   return layer.apply(inputs)
 
 
-class _Pooling3D(base.Layer):
-  """Pooling layer for arbitrary pooling functions, for 3D inputs.
-
-  This class only exists for code reuse. It will never be an exposed API.
-
-  Arguments:
-    pool_function: The pooling function to apply, e.g. `tf.nn.max_pool`.
-    pool_size: An integer or tuple/list of 3 integers:
-      (pool_depth, pool_height, pool_width)
-      specifying the size of the pooling window.
-      Can be a single integer to specify the same value for
-      all spatial dimensions.
-    strides: An integer or tuple/list of 3 integers,
-      specifying the strides of the pooling operation.
-      Can be a single integer to specify the same value for
-      all spatial dimensions.
-    padding: A string. The padding method, either 'valid' or 'same'.
-      Case-insensitive.
-    data_format: A string, one of `channels_last` (default) or `channels_first`.
-      The ordering of the dimensions in the inputs.
-      `channels_last` corresponds to inputs with shape
-      `(batch, depth, height, width, channels)`
-      while `channels_first` corresponds to
-      inputs with shape `(batch, channels, depth, height, width)`.
-    name: A string, the name of the layer.
-  """
-
-  def __init__(self, pool_function, pool_size, strides,
-               padding='valid', data_format='channels_last',
-               name=None, **kwargs):
-    super(_Pooling3D, self).__init__(name=name, **kwargs)
-    self.pool_function = pool_function
-    self.pool_size = utils.normalize_tuple(pool_size, 3, 'pool_size')
-    self.strides = utils.normalize_tuple(strides, 3, 'strides')
-    self.padding = utils.normalize_padding(padding)
-    self.data_format = utils.normalize_data_format(data_format)
-    self.input_spec = base.InputSpec(ndim=5)
-
-  def call(self, inputs):
-    pool_shape = (1,) + self.pool_size + (1,)
-    strides = (1,) + self.strides + (1,)
-
-    if self.data_format == 'channels_first':
-      # TF does not support `channels_first` with 3D pooling operations,
-      # so we must handle this case manually.
-      # TODO(fchollet): remove this when TF pooling is feature-complete.
-      inputs = array_ops.transpose(inputs, (0, 2, 3, 4, 1))
-
-    outputs = self.pool_function(
-        inputs,
-        ksize=pool_shape,
-        strides=strides,
-        padding=self.padding.upper())
-
-    if self.data_format == 'channels_first':
-      outputs = array_ops.transpose(outputs, (0, 4, 1, 2, 3))
-    return outputs
-
-  def compute_output_shape(self, input_shape):
-    input_shape = tensor_shape.TensorShape(input_shape).as_list()
-    if self.data_format == 'channels_first':
-      len_dim1 = input_shape[2]
-      len_dim2 = input_shape[3]
-      len_dim3 = input_shape[4]
-    else:
-      len_dim1 = input_shape[1]
-      len_dim2 = input_shape[2]
-      len_dim3 = input_shape[3]
-    len_dim1 = utils.conv_output_length(len_dim1, self.pool_size[0],
-                                        self.padding, self.strides[0])
-    len_dim2 = utils.conv_output_length(len_dim2, self.pool_size[1],
-                                        self.padding, self.strides[1])
-    len_dim3 = utils.conv_output_length(len_dim3, self.pool_size[2],
-                                        self.padding, self.strides[2])
-    if self.data_format == 'channels_first':
-      return tensor_shape.TensorShape(
-          [input_shape[0], input_shape[1], len_dim1, len_dim2, len_dim3])
-    else:
-      return tensor_shape.TensorShape(
-          [input_shape[0], len_dim1, len_dim2, len_dim3, input_shape[4]])
-
-
 @tf_export('layers.AveragePooling3D')
-class AveragePooling3D(_Pooling3D):
+class AveragePooling3D(keras_layers.AveragePooling3D, base.Layer):
   """Average pooling layer for 3D inputs (e.g. volumes).
 
   Arguments:
@@ -552,8 +330,9 @@ class AveragePooling3D(_Pooling3D):
   def __init__(self, pool_size, strides,
                padding='valid', data_format='channels_last',
                name=None, **kwargs):
+    if strides is None:
+      raise ValueError('Argument `strides` must not be None.')
     super(AveragePooling3D, self).__init__(
-        nn.avg_pool3d,
         pool_size=pool_size, strides=strides,
         padding=padding, data_format=data_format, name=name, **kwargs)
 
@@ -599,7 +378,7 @@ def average_pooling3d(inputs,
 
 
 @tf_export('layers.MaxPooling3D')
-class MaxPooling3D(_Pooling3D):
+class MaxPooling3D(keras_layers.MaxPooling3D, base.Layer):
   """Max pooling layer for 3D inputs (e.g. volumes).
 
   Arguments:
@@ -626,8 +405,9 @@ class MaxPooling3D(_Pooling3D):
   def __init__(self, pool_size, strides,
                padding='valid', data_format='channels_last',
                name=None, **kwargs):
+    if strides is None:
+      raise ValueError('Argument `strides` must not be None.')
     super(MaxPooling3D, self).__init__(
-        nn.max_pool3d,
         pool_size=pool_size, strides=strides,
         padding=padding, data_format=data_format, name=name, **kwargs)
 
diff --git a/tensorflow/python/layers/utils_test.py b/tensorflow/python/layers/utils_test.py
index c941aad7bc63dbb891fbe78cd2a47dd6805bf231..7e94dda648166780af002ce6b979a751a0ced846 100644
--- a/tensorflow/python/layers/utils_test.py
+++ b/tensorflow/python/layers/utils_test.py
@@ -19,7 +19,6 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.layers import utils
-from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 
 
@@ -89,33 +88,5 @@ class ConvUtilsTest(test.TestCase):
     self.assertEqual(6, utils.deconv_output_length(4, 2, 'full', 2))
 
 
-class GraphUtilsTest(test.TestCase):
-
-  def testGetReachableFromInputs(self):
-
-    with self.test_session():
-      pl_1 = array_ops.placeholder(shape=None, dtype='float32')
-      pl_2 = array_ops.placeholder(shape=None, dtype='float32')
-      pl_3 = array_ops.placeholder(shape=None, dtype='float32')
-      x_1 = pl_1 + pl_2
-      x_2 = pl_2 * 2
-      x_3 = pl_3 + 1
-      x_4 = x_1 + x_2
-      x_5 = x_3 * pl_1
-
-      self.assertEqual(
-          utils.get_reachable_from_inputs([pl_1]),
-          {pl_1, x_1, x_4, x_5})
-      self.assertEqual(
-          utils.get_reachable_from_inputs([pl_1, pl_2]),
-          {pl_1, pl_2, x_1, x_2, x_4, x_5})
-      self.assertEqual(
-          utils.get_reachable_from_inputs([pl_3]),
-          {pl_3, x_3, x_5})
-      self.assertEqual(
-          utils.get_reachable_from_inputs([x_3]),
-          {x_3, x_5})
-
-
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/lib/core/py_exception_registry.cc b/tensorflow/python/lib/core/py_exception_registry.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6637de632b48e4dfc8219543161464b10dcdbe12
--- /dev/null
+++ b/tensorflow/python/lib/core/py_exception_registry.cc
@@ -0,0 +1,50 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/python/lib/core/py_exception_registry.h"
+
+#include <Python.h>
+
+namespace tensorflow {
+
+PyExceptionRegistry* PyExceptionRegistry::singleton_ = nullptr;
+
+void PyExceptionRegistry::Init(PyObject* code_to_exc_type_map) {
+  DCHECK(singleton_ == nullptr) << "PyExceptionRegistry::Init() already called";
+  singleton_ = new PyExceptionRegistry;
+
+  DCHECK(PyDict_Check(code_to_exc_type_map));
+  PyObject* key;
+  PyObject* value;
+  Py_ssize_t pos = 0;
+  while (PyDict_Next(code_to_exc_type_map, &pos, &key, &value)) {
+    TF_Code code = static_cast<TF_Code>(PyLong_AsLong(key));
+    singleton_->exc_types_[code] = value;
+    // The exception classes should also have the lifetime of the process, but
+    // incref just in case.
+    Py_INCREF(value);
+  }
+}
+
+PyObject* PyExceptionRegistry::Lookup(TF_Code code) {
+  DCHECK(singleton_ != nullptr) << "Must call PyExceptionRegistry::Init() "
+                                   "before PyExceptionRegistry::Lookup()";
+  DCHECK_NE(code, TF_OK);
+  DCHECK(singleton_->exc_types_.find(code) != singleton_->exc_types_.end())
+      << "Unknown error code passed to PyExceptionRegistry::Lookup: " << code;
+  return singleton_->exc_types_[code];
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/python/lib/core/py_exception_registry.h b/tensorflow/python/lib/core/py_exception_registry.h
new file mode 100644
index 0000000000000000000000000000000000000000..2b0f23b548c16130dee9a8ec086ae0283f1506e1
--- /dev/null
+++ b/tensorflow/python/lib/core/py_exception_registry.h
@@ -0,0 +1,73 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_PYTHON_LIB_CORE_PY_EXCEPTION_REGISTRY_H_
+#define TENSORFLOW_PYTHON_LIB_CORE_PY_EXCEPTION_REGISTRY_H_
+
+#include <map>
+
+#include "tensorflow/c/c_api.h"
+#include "tensorflow/core/platform/logging.h"
+
+#ifndef PyObject_HEAD
+struct _object;
+typedef _object PyObject;
+#endif
+
+namespace tensorflow {
+
+// Global registry mapping C API error codes to the corresponding custom Python
+// exception type. This is used to expose the exception types to C extension
+// code (i.e. so we can raise custom exceptions via SWIG).
+//
+// Init() must be called exactly once at the beginning of the process before
+// Lookup() can be used.
+//
+// Example usage:
+//   TF_Status* status = TF_NewStatus();
+//   TF_Foo(..., status);
+//
+//   if (TF_GetCode(status) != TF_OK) {
+//     PyObject* exc_type = PyExceptionRegistry::Lookup(TF_GetCode(status));
+//     // Arguments to OpError base class. Set `node_def` and `op` to None.
+//     PyObject* args =
+//       Py_BuildValue("sss", nullptr, nullptr, TF_Message(status));
+//     PyErr_SetObject(exc_type, args);
+//     Py_DECREF(args);
+//     TF_DeleteStatus(status);
+//     return NULL;
+//   }
+class PyExceptionRegistry {
+ public:
+  // Initializes the process-wide registry. Should be called exactly once near
+  // the beginning of the process. The arguments are the various Python
+  // exception types (e.g. `cancelled_exc` corresponds to
+  // errors.CancelledError).
+  static void Init(PyObject* code_to_exc_type_map);
+
+  // Returns the Python exception type corresponding to `code`. Init() must be
+  // called before using this function. `code` should not be TF_OK.
+  static PyObject* Lookup(TF_Code code);
+
+ private:
+  static PyExceptionRegistry* singleton_;
+  PyExceptionRegistry() = default;
+
+  // Maps error codes to the corresponding Python exception type.
+  std::map<TF_Code, PyObject*> exc_types_;
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_PYTHON_LIB_CORE_PY_EXCEPTION_REGISTRY_H_
diff --git a/tensorflow/python/lib/core/py_exception_registry.i b/tensorflow/python/lib/core/py_exception_registry.i
new file mode 100644
index 0000000000000000000000000000000000000000..e872b74985e03e203c8aeb8fdec8a3e67f03e1f9
--- /dev/null
+++ b/tensorflow/python/lib/core/py_exception_registry.i
@@ -0,0 +1,28 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+%include "tensorflow/python/platform/base.i"
+
+%{
+#include "tensorflow/python/lib/core/py_exception_registry.h"
+%}
+
+%ignoreall
+
+%unignore tensorflow::PyExceptionRegistry;
+%unignore tensorflow::PyExceptionRegistry::Init;
+
+%include "tensorflow/python/lib/core/py_exception_registry.h"
+%unignoreall
diff --git a/tensorflow/python/lib/core/py_seq_tensor.cc b/tensorflow/python/lib/core/py_seq_tensor.cc
index 8247d354db62532c10c5acc9875cc08289cd31bf..32ea737a99067877e7f527e44d261a0b7c2eb07e 100644
--- a/tensorflow/python/lib/core/py_seq_tensor.cc
+++ b/tensorflow/python/lib/core/py_seq_tensor.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/python/lib/core/numpy.h"
 #include "tensorflow/python/lib/core/py_util.h"
@@ -77,9 +78,9 @@ string PyRepr(PyObject* obj) {
 bool IsPyDimension(PyObject* obj) {
   const char* tp_name = obj->ob_type->tp_name;
   if (strcmp(tp_name, "Dimension") != 0) return false;
-  bool ret =
-      StringPiece(PyRepr(PyType(obj)))
-          .ends_with("tensorflow.python.framework.tensor_shape.Dimension'>");
+  bool ret = str_util::EndsWith(
+      PyRepr(PyType(obj)),
+      "tensorflow.python.framework.tensor_shape.Dimension'>");
   return ret;
 }
 
diff --git a/tensorflow/python/lib/io/tf_record.py b/tensorflow/python/lib/io/tf_record.py
index 6fcf9c91d831e3a89552b522040e8e8647114a2f..bf2d6f68b55d78f9570d3854804e3d1316176c99 100644
--- a/tensorflow/python/lib/io/tf_record.py
+++ b/tensorflow/python/lib/io/tf_record.py
@@ -78,8 +78,7 @@ def tf_record_iterator(path, options=None):
   try:
     while True:
       try:
-        with errors.raise_exception_on_not_ok_status() as status:
-          reader.GetNext(status)
+        reader.GetNext()
       except errors.OutOfRangeError:
         break
       yield reader.record()
diff --git a/tensorflow/python/ops/array_grad.py b/tensorflow/python/ops/array_grad.py
index 3c6a5c9e562ff9765c2ef47555871c94cd6feb1e..57d2657838760a6f0041dac850913035540dc939 100644
--- a/tensorflow/python/ops/array_grad.py
+++ b/tensorflow/python/ops/array_grad.py
@@ -255,10 +255,15 @@ def _SliceGrad(op, grad):
 @ops.RegisterGradient("StridedSlice")
 def _StridedSliceGrad(op, grad):
   """Gradient for StridedSlice op."""
-  x = array_ops.shape(op.inputs[0])
   begin = op.inputs[1]
   end = op.inputs[2]
   strides = op.inputs[3]
+  # StridedSliceGrad requires `x`, `begin`, `end` and `strides` to be of the
+  # same dtype so we build a shape of the same type as other args.
+  # Note that the choice of `begin` for specifying `out_type` is arbitrary.
+  # We could choose any of {begin|end|strides}.dtype since they are required to
+  # be the same.
+  x = array_ops.shape(op.inputs[0], out_type=begin.dtype)
 
   return array_ops.strided_slice_grad(
       x,
diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py
index 9106461c6001e3a843bb694e389693236fbd442f..9e136937f6673b313d46fc79615f5df37587b452 100644
--- a/tensorflow/python/ops/array_ops.py
+++ b/tensorflow/python/ops/array_ops.py
@@ -144,6 +144,7 @@ def identity(input, name=None):  # pylint: disable=redefined-builtin
 
 # pylint: disable=redefined-builtin,protected-access
 @tf_export("expand_dims")
+@deprecation.deprecated_args(None, "Use the `axis` argument instead", "dim")
 def expand_dims(input, axis=None, name=None, dim=None):
   """Inserts a dimension of 1 into a tensor's shape.
 
@@ -193,11 +194,7 @@ def expand_dims(input, axis=None, name=None, dim=None):
   Raises:
     ValueError: if both `dim` and `axis` are specified.
   """
-  # TODO(aselle): Remove argument dim
-  if dim is not None:
-    if axis is not None:
-      raise ValueError("can't specify both 'dim' and 'axis'")
-    axis = dim
+  axis = deprecation.deprecated_argument_lookup("axis", axis, "dim", dim)
   return gen_array_ops.expand_dims(input, axis, name)
 
 
@@ -387,7 +384,10 @@ def size_internal(input, name=None, optimize=True, out_type=dtypes.int32):
   """
   if context.executing_eagerly() and not isinstance(
       input, (sparse_tensor.SparseTensor, sparse_tensor.SparseTensorValue)):
-    return np.prod(ops.convert_to_tensor(input)._shape_tuple())  # pylint: disable=protected-access
+    input = ops.convert_to_tensor(input)
+    np_out_type = out_type.as_numpy_dtype
+    num_elements = np.prod(input._shape_tuple(), dtype=np_out_type)  # pylint: disable=protected-acces:
+    return ops.convert_to_tensor(num_elements, dtype=out_type)
   with ops.name_scope(name, "Size", [input]) as name:
     if isinstance(input, (sparse_tensor.SparseTensor,
                           sparse_tensor.SparseTensorValue)):
@@ -957,6 +957,11 @@ def _autopacking_helper(list_or_tuple, dtype, name):
   Returns:
     A `tf.Tensor` with value equivalent to `list_or_tuple`.
   """
+  if context.executing_eagerly():
+    # NOTE: Fast path when all the items are tensors, this doesn't do any type
+    # checking.
+    if all(ops.is_dense_tensor_like(elem) for elem in list_or_tuple):
+      return gen_array_ops.pack(list_or_tuple, name=name)
   must_pack = False
   converted_elems = []
   with ops.name_scope(name) as scope:
@@ -1558,6 +1563,16 @@ def matrix_transpose(a, name="matrix_transpose", conjugate=False):
 # pylint: enable=invalid-name
 
 
+def _constant_if_small(value, shape, dtype, name):
+  try:
+    if np.prod(shape) < 1000:
+      return constant(value, shape=shape, dtype=dtype, name=name)
+  except TypeError:
+    # Happens when shape is a Tensor, list with Tensor elements, etc.
+    pass
+  return None
+
+
 @tf_export("zeros")
 def zeros(shape, dtype=dtypes.float32, name=None):
   """Creates a tensor with all elements set to zero.
@@ -1588,8 +1603,15 @@ def zeros(shape, dtype=dtypes.float32, name=None):
       zero = ""
     else:
       zero = 0
+
     if not isinstance(shape, ops.Tensor):
       try:
+        # Create a constant if it won't be very big. Otherwise create a fill op
+        # to prevent serialized GraphDefs from becoming too large.
+        output = _constant_if_small(zero, shape, dtype, name)
+        if output is not None:
+          return output
+
         # Go through tensor shapes to get int64-if-needed semantics
         shape = constant_op._tensor_shape_tensor_conversion_function(
             tensor_shape.TensorShape(shape))
@@ -1721,6 +1743,12 @@ def ones(shape, dtype=dtypes.float32, name=None):
     one = True if dtype == dtypes.bool else 1
     if not isinstance(shape, ops.Tensor):
       try:
+        # Create a constant if it won't be very big. Otherwise create a fill op
+        # to prevent serialized GraphDefs from becoming too large.
+        output = _constant_if_small(one, shape, dtype, name)
+        if output is not None:
+          return output
+
         # Go through tensor shapes to get int64-if-needed semantics
         shape = constant_op._tensor_shape_tensor_conversion_function(
             tensor_shape.TensorShape(shape))
diff --git a/tensorflow/python/ops/batch_norm_benchmark.py b/tensorflow/python/ops/batch_norm_benchmark.py
index 5d68b47aeaef3a90973387ecd5b265eef1e96a5f..d83b81909755df8d187232e15ecda48b1cbf4557 100644
--- a/tensorflow/python/ops/batch_norm_benchmark.py
+++ b/tensorflow/python/ops/batch_norm_benchmark.py
@@ -25,6 +25,7 @@ import time
 from tensorflow.python.client import session as session_lib
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_nn_ops
 from tensorflow.python.ops import gradients_impl
@@ -39,7 +40,7 @@ from tensorflow.python.platform import test
 def batch_norm_op(tensor, mean, variance, beta, gamma, scale):
   """Fused kernel for batch normalization."""
   # _batch_norm_with_global_normalization is deprecated in v9
-  ops.get_default_graph().graph_def_versions.producer = 8
+  test_util.set_producer_version(ops.get_default_graph(), 8)
   # pylint: disable=protected-access
   return gen_nn_ops._batch_norm_with_global_normalization(
       tensor, mean, variance, beta, gamma, 0.001, scale)
diff --git a/tensorflow/python/ops/boosted_trees_ops.py b/tensorflow/python/ops/boosted_trees_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..174d00987f9f76b4b07be73e5c29435bed7dfa06
--- /dev/null
+++ b/tensorflow/python/ops/boosted_trees_ops.py
@@ -0,0 +1,160 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Ops for boosted_trees."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gen_boosted_trees_ops
+from tensorflow.python.ops import resources
+
+# Re-exporting ops used by other modules.
+# pylint: disable=unused-import
+from tensorflow.python.ops.gen_boosted_trees_ops import boosted_trees_calculate_best_gains_per_feature as calculate_best_gains_per_feature
+from tensorflow.python.ops.gen_boosted_trees_ops import boosted_trees_make_stats_summary as make_stats_summary
+from tensorflow.python.ops.gen_boosted_trees_ops import boosted_trees_predict as predict
+from tensorflow.python.ops.gen_boosted_trees_ops import boosted_trees_training_predict as training_predict
+from tensorflow.python.ops.gen_boosted_trees_ops import boosted_trees_update_ensemble as update_ensemble
+# pylint: enable=unused-import
+
+from tensorflow.python.training import saver
+
+
+class PruningMode(object):
+  NO_PRUNING, PRE_PRUNING, POST_PRUNING = range(0, 3)
+
+
+class _TreeEnsembleSavable(saver.BaseSaverBuilder.SaveableObject):
+  """SaveableObject implementation for TreeEnsemble."""
+
+  def __init__(self, resource_handle, create_op, name):
+    """Creates a _TreeEnsembleSavable object.
+
+    Args:
+      resource_handle: handle to the decision tree ensemble variable.
+      create_op: the op to initialize the variable.
+      name: the name to save the tree ensemble variable under.
+    """
+    stamp_token, serialized = (
+        gen_boosted_trees_ops.boosted_trees_serialize_ensemble(resource_handle))
+    # slice_spec is useful for saving a slice from a variable.
+    # It's not meaningful the tree ensemble variable. So we just pass an empty
+    # value.
+    slice_spec = ''
+    specs = [
+        saver.BaseSaverBuilder.SaveSpec(stamp_token, slice_spec,
+                                        name + '_stamp'),
+        saver.BaseSaverBuilder.SaveSpec(serialized, slice_spec,
+                                        name + '_serialized'),
+    ]
+    super(_TreeEnsembleSavable, self).__init__(resource_handle, specs, name)
+    self._resource_handle = resource_handle
+    self._create_op = create_op
+
+  def restore(self, restored_tensors, unused_restored_shapes):
+    """Restores the associated tree ensemble from 'restored_tensors'.
+
+    Args:
+      restored_tensors: the tensors that were loaded from a checkpoint.
+      unused_restored_shapes: the shapes this object should conform to after
+        restore. Not meaningful for trees.
+
+    Returns:
+      The operation that restores the state of the tree ensemble variable.
+    """
+    with ops.control_dependencies([self._create_op]):
+      return gen_boosted_trees_ops.boosted_trees_deserialize_ensemble(
+          self._resource_handle,
+          stamp_token=restored_tensors[0],
+          tree_ensemble_serialized=restored_tensors[1])
+
+
+class TreeEnsemble(object):
+  """Creates TreeEnsemble resource."""
+
+  def __init__(self, name, stamp_token=0, is_local=False, serialized_proto=''):
+    with ops.name_scope(name, 'TreeEnsemble') as name:
+      self._resource_handle = (
+          gen_boosted_trees_ops.boosted_trees_ensemble_resource_handle_op(
+              container='', shared_name=name, name=name))
+      create_op = gen_boosted_trees_ops.boosted_trees_create_ensemble(
+          self.resource_handle,
+          stamp_token,
+          tree_ensemble_serialized=serialized_proto)
+      is_initialized_op = (
+          gen_boosted_trees_ops.is_boosted_trees_ensemble_initialized(
+              self._resource_handle))
+      # Adds the variable to the savable list.
+      if not is_local:
+        saveable = _TreeEnsembleSavable(self.resource_handle, create_op,
+                                        self.resource_handle.name)
+        ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, saveable)
+      resources.register_resource(
+          self.resource_handle,
+          create_op,
+          is_initialized_op,
+          is_shared=not is_local)
+
+  @property
+  def resource_handle(self):
+    return self._resource_handle
+
+  def get_stamp_token(self):
+    """Returns the current stamp token of the resource."""
+    stamp_token, _, _, _ = (
+        gen_boosted_trees_ops.boosted_trees_get_ensemble_states(
+            self.resource_handle))
+    return stamp_token
+
+  def get_states(self):
+    """Returns states of the tree ensemble.
+
+    Returns:
+      stamp_token, num_trees, num_finalized_trees, num_attempted_layers.
+    """
+    stamp_token, num_trees, num_finalized_trees, num_attempted_layers = (
+        gen_boosted_trees_ops.boosted_trees_get_ensemble_states(
+            self.resource_handle))
+    # Use identity to give names.
+    return (array_ops.identity(stamp_token, name='stamp_token'),
+            array_ops.identity(num_trees, name='num_trees'),
+            array_ops.identity(num_finalized_trees, name='num_finalized_trees'),
+            array_ops.identity(
+                num_attempted_layers, name='num_attempted_layers'))
+
+  def serialize(self):
+    """Serializes the ensemble into proto and returns the serialized proto.
+
+    Returns:
+      stamp_token: int64 scalar Tensor to denote the stamp of the resource.
+      serialized_proto: string scalar Tensor of the serialized proto.
+    """
+    return gen_boosted_trees_ops.boosted_trees_serialize_ensemble(
+        self.resource_handle)
+
+  def deserialize(self, stamp_token, serialized_proto):
+    """Deserialize the input proto and resets the ensemble from it.
+
+    Args:
+      stamp_token: int64 scalar Tensor to denote the stamp of the resource.
+      serialized_proto: string scalar Tensor of the serialized proto.
+
+    Returns:
+      Operation (for dependencies).
+    """
+    return gen_boosted_trees_ops.boosted_trees_deserialize_ensemble(
+        self.resource_handle, stamp_token, serialized_proto)
diff --git a/tensorflow/python/ops/clip_ops.py b/tensorflow/python/ops/clip_ops.py
index 49f8c665313562cb20dbe4494103ded16646c741..0829aa67ed5236a7c2af89fc104f1d203c8a0f23 100644
--- a/tensorflow/python/ops/clip_ops.py
+++ b/tensorflow/python/ops/clip_ops.py
@@ -26,6 +26,8 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gen_array_ops
+from tensorflow.python.ops import gen_math_ops
 from tensorflow.python.ops import gen_nn_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.util.tf_export import tf_export
@@ -58,18 +60,34 @@ def clip_by_value(t, clip_value_min, clip_value_max,
   """
   with ops.name_scope(name, "clip_by_value",
                       [t, clip_value_min, clip_value_max]) as name:
-    t = ops.convert_to_tensor(t, name="t")
-
-    # Go through list of tensors, for each value in each tensor clip
-    t_min = math_ops.minimum(t, clip_value_max)
-    # Assert that the shape is compatible with the initial shape,
-    # to prevent unintentional broadcasting.
-    _ = t.shape.merge_with(t_min.shape)
-
-    t_max = math_ops.maximum(t_min, clip_value_min, name=name)
-    _ = t.shape.merge_with(t_max.shape)
-
-  return t_max
+    return gen_math_ops.clip_by_value(t,
+                                      clip_value_min,
+                                      clip_value_max,
+                                      name=name)
+
+@ops.RegisterGradient("ClipByValue")
+def _ClipByValueGrad(op, grad):
+  """Returns grad of clip_by_value."""
+  x = op.inputs[0]
+  y = op.inputs[1]
+  z = op.inputs[2]
+  gdtype = grad.dtype
+  sx = array_ops.shape(x)
+  sy = array_ops.shape(y)
+  sz = array_ops.shape(z)
+  gradshape = array_ops.shape(grad)
+  zeros = array_ops.zeros(gradshape, gdtype)
+  xymask = math_ops.less(x, y)
+  xzmask = math_ops.greater(x, z)
+  rx, ry = gen_array_ops.broadcast_gradient_args(sx, sy)
+  rx, rz = gen_array_ops.broadcast_gradient_args(sx, sz)
+  xgrad = array_ops.where(math_ops.logical_or(xymask, xzmask), zeros, grad)
+  ygrad = array_ops.where(xymask, grad, zeros)
+  zgrad = array_ops.where(xzmask, grad, zeros)
+  gx = array_ops.reshape(math_ops.reduce_sum(xgrad, rx), sx)
+  gy = array_ops.reshape(math_ops.reduce_sum(ygrad, ry), sy)
+  gz = array_ops.reshape(math_ops.reduce_sum(zgrad, rz), sz)
+  return (gx, gy, gz)
 
 
 @tf_export("clip_by_norm")
diff --git a/tensorflow/python/ops/control_flow_grad.py b/tensorflow/python/ops/control_flow_grad.py
index 45955554cab130597e106660ff1fb4cdf7e9aeb1..6a551deb5ba55871b3a3fb144a6ecd2a3cbfcbd8 100644
--- a/tensorflow/python/ops/control_flow_grad.py
+++ b/tensorflow/python/ops/control_flow_grad.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 
 from six.moves import xrange  # pylint: disable=redefined-builtin
 
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.ops import control_flow_ops
@@ -74,6 +75,11 @@ def _SwitchGrad(op, *grad):
     # At this point, we have created zero_grad guarded by the right switch.
     # Unfortunately, we may still get None here for not trainable data types.
     if zero_grad is None:
+      # For resource variables we get None always on the other branch, so bypass
+      # this.
+      if op.inputs[0].dtype == dtypes.resource:
+        return merge(
+            [grad[op_ctxt.branch]] * 2, name="cond_resource_grad")[0], None
       return None, None
     return merge(grad, name="cond_grad")[0], None
   else:
diff --git a/tensorflow/python/ops/control_flow_ops.py b/tensorflow/python/ops/control_flow_ops.py
index 1278768d8bdc9f039f19cf032f8ee09442ea34a9..fb53d9ffea174ae4af6f664efb7e36f63e409124 100644
--- a/tensorflow/python/ops/control_flow_ops.py
+++ b/tensorflow/python/ops/control_flow_ops.py
@@ -833,6 +833,9 @@ class GradLoopState(object):
     if outer_grad_state:
       outer_forward_ctxt = outer_grad_state.forward_context
     else:
+      if not hasattr(forward_ctxt, 'outer_context'):
+        raise ValueError("Failed to call gradients on a while loop without"
+                         "properly serializing graph via MetaGraphDef")
       outer_forward_ctxt = forward_ctxt.outer_context
 
     # Add the forward loop counter.
@@ -1595,6 +1598,16 @@ class ControlFlowContext(object):
     last_context = self._context_stack.pop()
     graph._set_control_flow_context(last_context)
 
+  def EnterGradientColocation(self, op, gradient_uid):
+    """Start building a gradient colocated with an op."""
+    if self._outer_context:
+      self._outer_context.EnterGradientColocation(op, gradient_uid)
+
+  def ExitGradientColocation(self, op, gradient_uid):
+    """Start building a gradient colocated with an op."""
+    if self._outer_context:
+      self._outer_context.ExitGradientColocation(op, gradient_uid)
+
   def ExitResult(self, result):
     """Make a list of tensors available in the outer context."""
     if self._outer_context:
@@ -3181,12 +3194,18 @@ def while_loop(cond,
         body = lambda i, lv: (i + 1, orig_body(*lv))
 
     if context.executing_eagerly():
+      try_to_pack = len(loop_vars) == 1
+      packed = False  # whether the body result was packed into a 1-item tuple
+
       while cond(*loop_vars):
         loop_vars = body(*loop_vars)
+        if try_to_pack and not isinstance(loop_vars, (list, _basetuple)):
+          packed = True
+          loop_vars = (loop_vars,)
       if maximum_iterations is not None:
         return loop_vars[1]
       else:
-        return loop_vars
+        return loop_vars[0] if packed else loop_vars
 
     if shape_invariants is not None:
       if maximum_iterations is not None:
diff --git a/tensorflow/python/ops/control_flow_ops_test.py b/tensorflow/python/ops/control_flow_ops_test.py
index f22f3059d139d1bb7c7db57a2939184f1089f397..289df6f3016e9df6a42d694ae854b4f22fdf84f9 100644
--- a/tensorflow/python/ops/control_flow_ops_test.py
+++ b/tensorflow/python/ops/control_flow_ops_test.py
@@ -947,5 +947,28 @@ class CaseTest(test_util.TensorFlowTestCase):
         sess.run(output, feed_dict={x: 4})
 
 
+@test_util.with_c_api
+class WhileLoopTestCase(test_util.TensorFlowTestCase):
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testWhileLoopWithSingleVariable(self):
+    i = constant_op.constant(0)
+    c = lambda i: math_ops.less(i, 10)
+    b = lambda i: math_ops.add(i, 1)
+    r = control_flow_ops.while_loop(c, b, [i])
+
+    self.assertEqual(self.evaluate(r), 10)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testEagerWhileLoopWithSingleVariable_bodyReturnsTuple(self):
+    i = constant_op.constant(0)
+    c = lambda i: math_ops.less(i, 10)
+    b = lambda i: (math_ops.add(i, 1),)
+    r = control_flow_ops.while_loop(c, b, [i])
+
+    # Expect a tuple since that is what the body returns.
+    self.assertEqual(self.evaluate(r), (10,))
+
+
 if __name__ == "__main__":
   googletest.main()
diff --git a/tensorflow/python/ops/ctc_ops.py b/tensorflow/python/ops/ctc_ops.py
index 4b57e2de790af13499bc73cfcfa98e999eab1603..908e7939027933327bcdeb21d598bc0b5ca5ff0f 100644
--- a/tensorflow/python/ops/ctc_ops.py
+++ b/tensorflow/python/ops/ctc_ops.py
@@ -218,7 +218,7 @@ def ctc_greedy_decoder(inputs, sequence_length, merge_repeated=True):
         The rows store: `[batch, time]`.
       `decoded.values`: Values vector, size `(total_decoded_outputs)`.
         The vector stores the decoded classes.
-      `decoded.shape`: Shape vector, size `(2)`.
+      `decoded.dense_shape`: Shape vector, size `(2)`.
         The shape values are: `[batch_size, max_decoded_length]`
     neg_sum_logits: A `float` matrix `(batch_size x 1)` containing, for the
         sequence found, the negative of the sum of the greatest logit at each
@@ -265,7 +265,7 @@ def ctc_beam_search_decoder(inputs, sequence_length, beam_width=100,
         The rows store: [batch, time].
       `decoded[j].values`: Values vector, size `(total_decoded_outputs[j])`.
         The vector stores the decoded classes for beam j.
-      `decoded[j].shape`: Shape vector, size `(2)`.
+      `decoded[j].dense_shape`: Shape vector, size `(2)`.
         The shape values are: `[batch_size, max_decoded_length[j]]`.
     log_probability: A `float` matrix `(batch_size x top_paths)` containing
         sequence log-probabilities.
diff --git a/tensorflow/python/ops/custom_gradient.py b/tensorflow/python/ops/custom_gradient.py
index 9eacac1b3704c43cbeb5ecd0cbe827cac3a7cc8b..dfa07abfc6474833143ce65ac5df65049e01cab8 100644
--- a/tensorflow/python/ops/custom_gradient.py
+++ b/tensorflow/python/ops/custom_gradient.py
@@ -95,7 +95,7 @@ def custom_gradient(f):
     if not context.executing_eagerly():
       if kwargs:
         raise ValueError(
-            "The custom_gradient decorator currently suports keywords "
+            "The custom_gradient decorator currently supports keywords "
             "arguments only when eager execution is enabled.")
       name = "CustomGradient-%s" % ops.uid()
       args = [ops.convert_to_tensor(x) for x in args]
diff --git a/tensorflow/python/ops/data_flow_ops.py b/tensorflow/python/ops/data_flow_ops.py
index d2cc87555f6321432261b32f08431c23ce707eff..cb725199a8501d2a894f18a9b57c23de4e349374 100644
--- a/tensorflow/python/ops/data_flow_ops.py
+++ b/tensorflow/python/ops/data_flow_ops.py
@@ -1769,7 +1769,9 @@ class StagingArea(BaseStagingArea):
     its capacity.
 
     Args:
-      values: Tensor (or a tuple of Tensors) to place into the staging area.
+      values: A single tensor, a list or tuple of tensors, or a dictionary with
+        tensor values. The number of elements must match the length of the
+        list provided to the dtypes argument when creating the StagingArea.
       name: A name for the operation (optional).
 
     Returns:
@@ -1780,11 +1782,12 @@ class StagingArea(BaseStagingArea):
     """
     with ops.name_scope(name, "%s_put" % self._name,
                         self._scope_vals(values)) as scope:
+      
+      if not isinstance(values, (list, tuple, dict)):
+        values = [values]
 
       # Hard-code indices for this staging area
-      indices = (
-          list(six.moves.range(len(values)))
-          if isinstance(values, (list, tuple)) else None)
+      indices = list(six.moves.range(len(values)))
       vals, _ = self._check_put_dtypes(values, indices)
 
       with ops.colocate_with(self._coloc_op):
diff --git a/tensorflow/python/ops/distributions/BUILD b/tensorflow/python/ops/distributions/BUILD
index 50b956a267320b40cb39fbff8b5965a6420146d7..9d9ede7ad75f4eafa91ad051458afbcb6dc8f7b5 100644
--- a/tensorflow/python/ops/distributions/BUILD
+++ b/tensorflow/python/ops/distributions/BUILD
@@ -26,15 +26,3 @@ py_library(
         "@six_archive//:six",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/python/ops/distributions/uniform.py b/tensorflow/python/ops/distributions/uniform.py
index ec623b55eb0067e16599c18c9c504635da863907..0891bffdd554828586c5b23919f955f685632694 100644
--- a/tensorflow/python/ops/distributions/uniform.py
+++ b/tensorflow/python/ops/distributions/uniform.py
@@ -166,7 +166,8 @@ class Uniform(distribution.Distribution):
     return self.low + self.range() * samples
 
   def _prob(self, x):
-    broadcasted_x = x * array_ops.ones(self.batch_shape_tensor())
+    broadcasted_x = x * array_ops.ones(
+        self.batch_shape_tensor(), dtype=x.dtype)
     return array_ops.where(
         math_ops.is_nan(broadcasted_x),
         broadcasted_x,
diff --git a/tensorflow/python/ops/functional_ops.py b/tensorflow/python/ops/functional_ops.py
index a840b1eddfc6922dc310490e8166efd73480c437..161f6f36596279ee0dc7d04d454d670167ba798b 100644
--- a/tensorflow/python/ops/functional_ops.py
+++ b/tensorflow/python/ops/functional_ops.py
@@ -1,4 +1,4 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -27,22 +27,24 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.core.framework import attr_value_pb2
 from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import function
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import gen_functional_ops
+from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import tensor_array_ops
 from tensorflow.python.ops import variable_scope as vs
-# go/tf-wildcard-import
-# pylint: disable=wildcard-import
-from tensorflow.python.ops.gen_functional_ops import *
-# pylint: enable=wildcard-import
 # pylint: disable=unused-import
-from tensorflow.python.ops.gen_functional_ops import symbolic_gradient
+from tensorflow.python.ops.gen_functional_ops import remote_call
 # pylint: enable=unused-import
+from tensorflow.python.ops.gen_functional_ops import symbolic_gradient
 from tensorflow.python.util import nest
 from tensorflow.python.util.tf_export import tf_export
 
@@ -365,7 +367,15 @@ def map_fn(fn, elems, dtype=None, parallel_iterations=10, back_prop=True,
     dtype_flat = output_flatten(dtype)
 
     # Convert elems to tensor array. n may be known statically.
-    n = elems_flat[0].shape[0].value or array_ops.shape(elems_flat[0])[0]
+    static_shape = elems_flat[0].shape
+    if static_shape.ndims is not None and static_shape.ndims < 1:
+      if len(elems_flat) == 1:
+        raise ValueError("elems must be a 1+ dimensional Tensor, not a scalar")
+      else:
+        raise ValueError(
+            "elements in elems must be 1+ dimensional Tensors, not scalars"
+        )
+    n = static_shape[0].value or array_ops.shape(elems_flat[0])[0]
 
     # TensorArrays are always flat
     elems_ta = [
@@ -634,3 +644,249 @@ def scan(fn, elems, initializer=None, parallel_iterations=10, back_prop=True,
       varscope.set_caching_device(None)
 
     return output_pack(results_flat)
+
+
+# pylint: disable=invalid-name
+def If(cond, inputs, then_branch, else_branch, name=None):
+  r"""output = Cond(inputs) ? then_branch(inputs) : else_branch(inputs).
+
+  Args:
+    cond: A `Tensor`. A scalar. If the scalar is not a boolean, the scalar is
+      converted to a boolean according to the following rule: if the
+      scalar is a numerical value, non-zero means True and zero means
+      False; if the scalar is a string, non-empty means True and empty
+      means False.
+    inputs: A list of input tensors.
+    then_branch: A function takes 'inputs' and returns a list of tensors,
+        whose types are the same as what else_branch returns.
+    else_branch: A function takes 'inputs' and returns a list of tensors.
+        whose types are the same as what then_branch returns.
+    name: A name for the operation (optional).
+
+  Returns:
+    A list of tensors returned by either then_branch(inputs)
+    or else_branch(inputs).
+  """
+  # pylint: disable=protected-access
+  return gen_functional_ops._if(
+      cond,
+      inputs, [_.type for _ in then_branch.definition.signature.output_arg],
+      then_branch,
+      else_branch,
+      name=name)
+
+
+def Gradient(inputs, f, name=None):
+  r"""Computes the gradient function for function f via backpropagation.
+
+  Args:
+    inputs: A list of tensors of size N + M.
+    f: The function we want to compute the gradient for.
+
+      The function 'f' must be a numerical function which takes N inputs and
+      produces M outputs. Its gradient function 'g', which is  a function
+      taking N + M inputs and produces N outputs.
+
+      I.e. if we have
+         (y1, y2, ..., yM) = f(x1, x2, ..., xN),
+      then, g is
+         (dL/dx1, dL/dx2, ..., dL/dxN) = g(x1, x2, ..., xN,
+                                           dL/dy1, dL/dy2, ..., dL/dyM),
+
+      where L is a scalar-value function of (x1, x2, ..., xN) (e.g., the
+      loss function). dL/dxi is the partial derivative of L with respect
+      to xi.
+
+    name: A name for the operation (optional).
+
+  Returns:
+    A list of tensors of size N.
+  """
+  # TODO(zhifengc): Pretty-print the above spec in latex.
+  # TODO(zhfiengc): Needs some math expert to say the comment above better.
+  tlist = [_.type for _ in f.definition.signature.input_arg]
+  return symbolic_gradient(input=inputs, Tout=tlist, f=f, name=name)
+
+
+# pylint: disable=invalid-name,protected-access
+def While(input_, cond, body, name=None, hostmem=None):
+  r"""output = input; While (Cond(output)) { output = Body(output) }.
+
+  Args:
+    input_: A list of `Tensor` objects.
+      A list of input tensors whose types are T.
+    cond: . A function takes 'input' and returns a tensor.  If the tensor is
+      a scalar of non-boolean, the scalar is converted to a boolean
+      according to the following rule: if the scalar is a numerical
+      value, non-zero means True and zero means False; if the scalar is
+      a string, non-empty means True and empty means False. If the
+      tensor is not a scalar, non-emptiness means True and False
+      otherwise.
+    body: . A funcion takes a list of tensors and returns another
+      list tensors. Both lists have the same types as specified
+      by T.
+    name: A name for the operation (optional).
+    hostmem: A list of integer. If i is in the list, input[i] is a
+      host memory tensor.
+
+  Returns:
+    A list of `Tensor` objects. Has the same type as `input`.
+    A list of output tensors whose types are T.
+  """
+  ret = gen_functional_ops._while(input_, cond, body, name=name)
+  if hostmem:
+    input_attr = attr_value_pb2.AttrValue()
+    input_attr.list.i.extend(hostmem)
+    ret[0].op._set_attr("_input_hostmem", input_attr)  # pylint: disable=protected-access
+
+    output_attr = attr_value_pb2.AttrValue()
+    output_attr.list.i.extend(hostmem)
+    ret[0].op._set_attr("_output_hostmem", output_attr)  # pylint: disable=protected-access
+  return ret
+
+
+# b/36459430
+#
+# Ideally, we do not need this rewrite For loop into a While loop.
+# However, today, if a While runs on GPU and the condition returns a
+# boolean, the While kernel crashes. Even if we fix the crash, the
+# bool needs to be copied between GPU and CPU. So, a for loop is much
+# preferred when running on GPU.
+#
+# On the other hand, For op has no directly XLA kernel. So, when we run
+# a for loop, we need to rewrite it using a While op.
+#
+# It should be possible and probably better to write a XLA C++ kernel
+# implementing the logic in _ForUsingWhile.
+def _ForUsingWhile(start,
+                   limit,
+                   delta,
+                   inputs,
+                   forbody,
+                   name=None,
+                   hostmem=None):
+  """Helper to implement a For loop using a While."""
+  # To support negative delta (e.g., range(100, 0, -3)), we iterate
+  # over the range(n) and use iter * delta + start as the real
+  # iteration index. (e.g., for i in range(34): iter = i * (-3) +
+  # 100).
+  d = math_ops.abs(delta)
+  # XLA on TPUs doesn't support integer division
+  n = math_ops.cast(
+      math_ops.cast((math_ops.abs(limit - start) + d - 1), dtypes.float32) /
+      math_ops.cast(d, dtypes.float32), dtypes.int32)
+
+  # Carried loop variables ("extra_args") are implicitly added to the input list
+  # of the WhileBody function. WhileCond does not call forbody, and so does not
+  # depend on any of forbody's extra_args. Since WhileCond and WhileBody
+  # must have identical inputs, we have to augment the cond signature to take
+  # the same types as the carried loop variables.
+  body_sig = [dtypes.int32] * 4 + list(forbody.declared_input_types)[1:]
+  cond_sig = body_sig + [t.dtype for t in forbody.captured_inputs]
+
+  cond_name = "%s_Cond" % forbody.name
+
+  @function.Defun(*cond_sig, func_name=cond_name)
+  def WhileCond(i, n, *args):
+    del args
+    return i < n
+
+  body_name = "%s_Body" % forbody.name
+
+  @function.Defun(*body_sig, func_name=body_name)
+  def WhileBody(i, n, start, delta, *args):
+    """A While wrapper for forbody that handles loop-carried captured inputs."""
+    for_result = forbody(start + i * delta, *args)
+    # Nullary functions return an Operation. Normal functions can't do this
+    # because their return values are converted to Tensors.
+    if isinstance(for_result, ops.Operation):
+      for_result = ()
+    # Unary functions return a single Tensor value.
+    elif isinstance(for_result, ops.Tensor):
+      for_result = (for_result,)
+    extra_args = tuple(function.get_extra_args())
+    return (i + 1, n, start, delta) + tuple(for_result) + extra_args
+
+  if hostmem is not None:
+    hostmem = [(4 + _) for _ in hostmem]
+
+  results = While(
+      input_=[0, n, start, delta] + inputs + WhileBody.captured_inputs,
+      cond=WhileCond,
+      body=WhileBody,
+      name=name,
+      hostmem=hostmem)
+  # Slice off the loop-carried captured inputs.
+  return list(results[4:len(results) - len(WhileBody.captured_inputs)])
+
+
+def For(start,
+        limit,
+        delta,
+        inputs,
+        body,
+        name=None,
+        hostmem=None,
+        rewrite_with_while=None):
+  r"""out = input; for i in range(start, limit, delta) out = body(i, out).
+
+  Args:
+    start: A `Tensor` of type `int32`.
+    limit: A `Tensor` of type `int32`.
+    delta: A `Tensor` of type `int32`.
+    inputs: A list of `Tensor` objects.
+      A list of input tensors whose types are T.
+    body: A function takes a list of tensors and returns another
+      list of tensors. Both lists have the same types as (int32, T...).
+    name: A name for the operation (optional).
+    hostmem: A list of integer. If i is in the list, inputs[i] is a
+      host memory tensor. In other words, (i+1)-th argument of the body
+      function is expecting a host memory.
+    rewrite_with_while: If True, using While op to implement the For.
+
+  Returns:
+    A list of `Tensor` objects. Has the same type as `input`.
+    A list of output tensors whose types are T.
+  """
+  if rewrite_with_while:
+    return _ForUsingWhile(start, limit, delta, inputs, body, name, hostmem)
+  if body.captured_inputs:
+    wrapper_name = "%s_BodyWrapper" % body.name
+
+    @function.Defun(*body.declared_input_types, func_name=wrapper_name)
+    def BodyWrapper(*args):
+      """A wrapper for body that handles loop-carried captured inputs."""
+      body_result = body(*args)
+      extra_args = tuple(function.get_extra_args())
+      # Nullary functions return an Operation. Normal functions can't do this
+      # because their return values are converted to Tensors.
+      if isinstance(body_result, ops.Operation):
+        return extra_args
+      # Unary functions return a single Tensor value.
+      elif not isinstance(body_result, tuple):
+        return (body_result,) + extra_args
+      # N-ary functions return a tuple of Tensors.
+      else:
+        return body_result + extra_args
+
+    inputs += BodyWrapper.captured_inputs
+    ret = gen_functional_ops._for(
+        start, limit, delta, inputs, BodyWrapper, name=name)
+    # Slice off the loop-carried captured inputs.
+    ret = ret[:-len(BodyWrapper.captured_inputs)]
+  else:
+    ret = gen_functional_ops._for(start, limit, delta, inputs, body, name=name)
+  if hostmem:
+    num_for_params = 3  # start/limit/delta
+
+    input_attr = attr_value_pb2.AttrValue()
+    input_attr.list.i.extend([num_for_params + i for i in hostmem])
+    ret[0].op._set_attr("_input_hostmem", input_attr)  # pylint: disable=protected-access
+
+    output_attr = attr_value_pb2.AttrValue()
+    output_attr.list.i.extend(hostmem)
+    ret[0].op._set_attr("_output_hostmem", output_attr)  # pylint: disable=protected-access
+  return ret
+
+
+# pylint: enable=invalid-name,protected-access
diff --git a/tensorflow/python/ops/gradients_impl.py b/tensorflow/python/ops/gradients_impl.py
index 44473ec69c8ac6cf565f635621eebff7bc403225..13420b7f0ee5f2c186ff99409588b827b281c95f 100644
--- a/tensorflow/python/ops/gradients_impl.py
+++ b/tensorflow/python/ops/gradients_impl.py
@@ -208,7 +208,10 @@ def _AsList(x):
   return x if isinstance(x, (list, tuple)) else [x]
 
 
-def _DefaultGradYs(grad_ys, ys, colocate_gradients_with_ops):
+def _DefaultGradYs(grad_ys,
+                   ys,
+                   colocate_gradients_with_ops,
+                   gradient_uid="__unsupported__"):
   """Fill in default values for grad_ys.
 
   Args:
@@ -216,6 +219,9 @@ def _DefaultGradYs(grad_ys, ys, colocate_gradients_with_ops):
     ys: List of tensors.
     colocate_gradients_with_ops: If True, try colocating gradients with
       the corresponding op.
+    gradient_uid: A unique identifier within the graph indicating
+      which invocation of gradients is being executed. Used to cluster
+      ops for compilation.
 
   Returns:
     A list of gradients to use, without None.
@@ -231,7 +237,7 @@ def _DefaultGradYs(grad_ys, ys, colocate_gradients_with_ops):
   for i in xrange(len(grad_ys)):
     grad_y = grad_ys[i]
     y = ys[i]
-    with _maybe_colocate_with(y.op, colocate_gradients_with_ops):
+    with _maybe_colocate_with(y.op, gradient_uid, colocate_gradients_with_ops):
       if grad_y is None:
         if y.dtype.is_complex:
           raise TypeError(
@@ -338,10 +344,10 @@ def _StopOps(from_ops, stop_gradient_ops, pending_count):
 
 
 @contextlib.contextmanager
-def _maybe_colocate_with(op, colocate_gradients_with_ops):
+def _maybe_colocate_with(op, gradient_uid, colocate_gradients_with_ops):  # pylint: disable=invalid-name
   """Context to colocate with `op` if `colocate_gradients_with_ops`."""
   if colocate_gradients_with_ops:
-    with ops.colocate_with(op):
+    with ops._colocate_with_for_gradient(op, gradient_uid):  # pylint: disable=protected-access
       yield
   else:
     yield
@@ -506,6 +512,9 @@ def _GradientsHelper(ys, xs, grad_ys, name, colocate_gradients_with_ops,
   with ops.name_scope(
       name, "gradients",
       list(ys) + list(xs) + list(stop_gradients) + list(grad_ys)) as grad_scope:
+    # Get a uid for this call to gradients that can be used to help
+    # cluster ops for compilation.
+    gradient_uid = ops.get_default_graph().unique_name("uid")
     ys = ops.convert_n_to_tensor_or_indexed_slices(ys, name="y")
     xs = [
         x.handle if resource_variable_ops.is_resource_variable(x) else x
@@ -513,7 +522,8 @@ def _GradientsHelper(ys, xs, grad_ys, name, colocate_gradients_with_ops,
     ]
     xs = ops.internal_convert_n_to_tensor_or_indexed_slices(
         xs, name="x", as_ref=True)
-    grad_ys = _DefaultGradYs(grad_ys, ys, colocate_gradients_with_ops)
+    grad_ys = _DefaultGradYs(grad_ys, ys, colocate_gradients_with_ops,
+                             gradient_uid)
 
     # The approach we take here is as follows: Create a list of all ops in the
     # subgraph between the ys and xs.  Visit these ops in reverse order of ids
@@ -570,10 +580,11 @@ def _GradientsHelper(ys, xs, grad_ys, name, colocate_gradients_with_ops,
     while queue:
       # generate gradient subgraph for op.
       op = queue.popleft()
-      with _maybe_colocate_with(op, colocate_gradients_with_ops):
+      with _maybe_colocate_with(op, gradient_uid, colocate_gradients_with_ops):
         if loop_state:
           loop_state.EnterGradWhileContext(op, before=True)
-        out_grads = _AggregatedGrads(grads, op, loop_state, aggregation_method)
+        out_grads = _AggregatedGrads(grads, op, gradient_uid, loop_state,
+                                     aggregation_method)
         if loop_state:
           loop_state.ExitGradWhileContext(op, before=True)
 
@@ -633,7 +644,10 @@ def _GradientsHelper(ys, xs, grad_ys, name, colocate_gradients_with_ops,
               if gate_gradients and len([x for x in in_grads
                                          if x is not None]) > 1:
                 with ops.device(None):
-                  with ops.colocate_with(None, ignore_existing=True):
+                  with ops._colocate_with_for_gradient(  # pylint: disable=protected-access
+                      None,
+                      gradient_uid,
+                      ignore_existing=True):
                     in_grads = control_flow_ops.tuple(in_grads)
           _LogOpGradients(op, out_grads, in_grads)
         else:
@@ -789,7 +803,7 @@ def _LogOpGradients(op, out_grads, in_grads):
                ", ".join([x.name for x in in_grads if _FilterGrad(x)]))
 
 
-def _MultiDeviceAddN(tensor_list):
+def _MultiDeviceAddN(tensor_list, gradient_uid):
   """Adds tensors from potentially multiple devices."""
   # Basic function structure comes from control_flow_ops.group().
   # Sort tensors according to their devices.
@@ -808,7 +822,10 @@ def _MultiDeviceAddN(tensor_list):
 
   for dev in sorted(six.iterkeys(tensors_on_device), key=DeviceKey):
     tensors = tensors_on_device[dev]
-    with ops.colocate_with(tensors[0].op, ignore_existing=True):
+    with ops._colocate_with_for_gradient(  # pylint: disable=protected-access
+        tensors[0].op,
+        gradient_uid,
+        ignore_existing=True):
       summands.append(math_ops.add_n(tensors))
 
   return math_ops.add_n(summands)
@@ -834,12 +851,19 @@ class AggregationMethod(object):
   EXPERIMENTAL_ACCUMULATE_N = 2
 
 
-def _AggregatedGrads(grads, op, loop_state, aggregation_method=None):
+def _AggregatedGrads(grads,
+                     op,
+                     gradient_uid,
+                     loop_state,
+                     aggregation_method=None):
   """Get the aggregated gradients for op.
 
   Args:
     grads: The map of memoized gradients.
     op: The op to get gradients for.
+    gradient_uid: A unique identifier within the graph indicating
+      which invocation of gradients is being executed. Used to cluster
+      ops for compilation.
     loop_state: An object for maintaining the state of the while loops in the
                 graph. It is of type ControlFlowState. None if the graph
                 contains no while loops.
@@ -916,7 +940,7 @@ def _AggregatedGrads(grads, op, loop_state, aggregation_method=None):
             out_grads[i] = running_sum
         else:
           used = "add_n"
-          out_grads[i] = _MultiDeviceAddN(out_grad)
+          out_grads[i] = _MultiDeviceAddN(out_grad, gradient_uid)
         logging.vlog(2, "  _AggregatedGrads %d x %s using %s", len(out_grad),
                      tensor_shape, used)
       else:
diff --git a/tensorflow/python/ops/gradients_test.py b/tensorflow/python/ops/gradients_test.py
index c94f1396b28e2124c6e5123cf711ac86abf174ab..0603d3b6706b960a0fa9d9b33d383dd0c9063780 100644
--- a/tensorflow/python/ops/gradients_test.py
+++ b/tensorflow/python/ops/gradients_test.py
@@ -44,6 +44,7 @@ from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import math_grad  # pylint: disable=unused-import
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_grad  # pylint: disable=unused-import
+from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import state_grad  # pylint: disable=unused-import
 from tensorflow.python.ops import tensor_array_grad  # pylint: disable=unused-import
 from tensorflow.python.ops import tensor_array_ops
@@ -810,5 +811,29 @@ class OnlyRealGradientsTest(test_util.TensorFlowTestCase):
       gradients.gradients(y, x)
 
 
+class ResourceCondTest(test_util.TensorFlowTestCase):
+
+  def testBasic(self):
+    gamma = resource_variable_ops.ResourceVariable(
+        np.random.random((3,)),
+        dtype="float32", name="gamma")
+
+    inputs = array_ops.ones(shape=(3,), dtype="float32")
+
+    def TestFn():
+      output = inputs + gamma
+      return output
+
+    training = array_ops.placeholder_with_default(True, shape=())
+    output = control_flow_ops.cond(
+        training, TestFn, lambda: inputs)
+
+    loss = output
+
+    grads = gradients.gradients(
+        loss, [gamma])
+    self.assertTrue(None not in grads)
+
+
 if __name__ == "__main__":
   googletest.main()
diff --git a/tensorflow/python/ops/hidden_ops.txt b/tensorflow/python/ops/hidden_ops.txt
new file mode 100644
index 0000000000000000000000000000000000000000..e1217e984c8f6723ca171b0fdaa9fa8aed43d75a
--- /dev/null
+++ b/tensorflow/python/ops/hidden_ops.txt
@@ -0,0 +1,395 @@
+# array_ops
+BatchToSpace
+BroadcastArgs
+BroadcastGradientArgs
+ConcatOffset
+Concat
+ConcatV2
+ConjugateTranspose
+Const
+DebugGradientIdentity
+DebugGradientRefIdentity
+EditDistance
+ExpandDims
+ListDiff
+MirrorPad
+MirrorPadGrad
+OneHot
+Pack
+Pad
+PadV2
+ParallelConcat
+Placeholder
+RefIdentity
+Reverse
+Snapshot
+SpaceToBatch
+Split
+SplitV
+Squeeze
+Slice
+TileGrad  # Exported through array_grad instead of array_ops.
+ZerosLike  # TODO(josh11b): Use this instead of the Python version.
+Unique
+UniqueV2
+UniqueWithCounts
+UniqueWithCountsV2
+Unpack
+
+# candidate_sampling_ops
+AllCandidateSampler
+ComputeAccidentalHits
+FixedUnigramCandidateSampler
+LearnedUnigramCandidateSampler
+LogUniformCandidateSampler
+ThreadUnsafeUnigramCandidateSampler
+UniformCandidateSampler
+
+# checkpoint_ops
+GenerateVocabRemapping
+LoadAndRemapMatrix
+
+
+# control_flow_ops
+Switch
+Merge
+RefMerge
+Exit
+RefExit
+
+# ctc_ops
+CTCLoss
+CTCGreedyDecoder
+CTCBeamSearchDecoder
+
+# data_flow_ops
+Barrier
+BarrierClose
+BarrierIncompleteSize
+BarrierInsertMany
+BarrierReadySize
+BarrierTakeMany
+DeleteSessionTensor
+FakeQueue
+FIFOQueue
+FIFOQueueV2
+GetSessionHandle
+GetSessionHandleV2
+GetSessionTensor
+HashTable
+HashTableV2
+InitializeTable
+InitializeTableV2
+InitializeTableFromTextFile
+InitializeTableFromTextFileV2
+LookupTableExport
+LookupTableExportV2
+LookupTableFind
+LookupTableFindV2
+LookupTableImport
+LookupTableImportV2
+LookupTableInsert
+LookupTableInsertV2
+LookupTableSize
+LookupTableSizeV2
+MutableDenseHashTable
+MutableDenseHashTableV2
+MutableHashTable
+MutableHashTableV2
+MutableHashTableOfTensors
+MutableHashTableOfTensorsV2
+Mutex
+MutexAcquire
+MutexRelease
+PaddingFIFOQueue
+PaddingFIFOQueueV2
+PriorityQueue
+PriorityQueueV2
+QueueClose
+QueueCloseV2
+QueueDequeue
+QueueDequeueV2
+QueueDequeueMany
+QueueDequeueManyV2
+QueueDequeueUpTo
+QueueDequeueUpToV2
+QueueEnqueue
+QueueEnqueueV2
+QueueEnqueueMany
+QueueEnqueueManyV2
+QueueSize
+QueueSizeV2
+RandomShuffleQueue
+RandomShuffleQueueV2
+Stack
+StackClose
+StackPop
+StackPush
+StackV2
+StackCloseV2
+StackPopV2
+StackPushV2
+TensorArray
+TensorArrayClose
+TensorArrayCloseV2
+TensorArrayConcat
+TensorArrayConcatV2
+TensorArrayGather
+TensorArrayGatherV2
+TensorArrayGrad
+TensorArrayGradV2
+TensorArrayPack
+TensorArrayPackV2
+TensorArrayRead
+TensorArrayReadV2
+TensorArrayScatter
+TensorArrayScatterV2
+TensorArraySize
+TensorArraySizeV2
+TensorArraySplit
+TensorArraySplitV2
+TensorArrayUnpack
+TensorArrayUnpackV2
+TensorArrayV2
+TensorArrayWrite
+TensorArrayWriteV2
+TensorArrayV3
+TensorArrayCloseV3
+TensorArrayConcatV3
+TensorArrayGatherV3
+TensorArrayGradV3
+TensorArrayReadV3
+TensorArrayPackV3
+TensorArrayScatterV3
+TensorArraySizeV3
+TensorArraySplitV3
+TensorArrayUnpackV3
+TensorArrayWriteV3
+
+# functional_ops
+SymbolicGradient
+
+# image_ops
+AdjustContrastv2
+NonMaxSuppression
+NonMaxSuppressionV2
+RandomCrop
+ResizeBilinearGrad
+ResizeBicubicGrad
+ResizeNearestNeighborGrad
+SampleDistortedBoundingBox
+SampleDistortedBoundingBoxV2
+ScaleImageGrad
+
+# io_ops
+FixedLengthRecordReader
+IdentityReader
+ReaderNumRecordsProduced
+ReaderNumWorkUnitsCompleted
+ReaderRead
+ReaderReadUpTo
+ReaderReset
+ReaderRestoreState
+ReaderSerializeState
+ReaderWorkQueueLength
+FixedLengthRecordReaderV2
+IdentityReaderV2
+ReaderNumRecordsProducedV2
+ReaderNumWorkUnitsCompletedV2
+ReaderReadV2
+ReaderReadUpToV2
+ReaderResetV2
+ReaderRestoreStateV2
+ReaderSerializeStateV2
+ReaderWorkQueueLengthV2
+Restore
+RestoreSlice
+Save
+SaveSlices
+ShardedFilename
+ShardedFilespec
+TextLineReader
+TFRecordReader
+WholeFileReader
+TextLineReaderV2
+TFRecordReaderV2
+WholeFileReaderV2
+LMDBReader
+DecodeCSV
+
+# linalg_ops
+BatchCholesky
+BatchCholeskyGrad
+BatchMatrixDeterminant
+BatchMatrixInverse
+BatchMatrixSolve
+BatchMatrixSolveLs
+BatchMatrixTriangularSolve
+BatchSelfAdjointEig
+BatchSelfAdjointEigV2
+BatchSvd
+LogMatrixDeterminant
+MatrixExponential
+MatrixLogarithm
+MatrixSolveLs
+SelfAdjointEig
+SelfAdjointEigV2
+Svd
+
+# logging_ops
+Assert
+AudioSummary
+AudioSummaryV2
+HistogramSummary
+ImageSummary
+MergeSummary
+Print
+ScalarSummary
+TensorSummary
+TensorSummaryV2
+
+# math_ops
+Abs
+AccumulateNV2
+AddN
+AddV2
+All
+Any
+BatchMatMul
+BatchFFT
+BatchFFT2D
+BatchFFT3D
+BatchIFFT
+BatchIFFT2D
+BatchIFFT3D
+Bucketize
+ClipByValue
+Complex
+ComplexAbs
+Conj
+FloorDiv
+FloorMod
+HistogramFixedWidth
+Max
+Mean
+Min
+Mul
+Neg
+Pow
+Prod
+Range
+RealDiv
+Select
+SparseMatMul
+Sub
+Sum
+MatMul
+Sigmoid
+Tanh
+SigmoidGrad
+TanhGrad
+InvGrad
+ReciprocalGrad
+SqrtGrad
+RsqrtGrad
+TruncateDiv
+TruncateMod
+
+# nn_ops
+AvgPoolGrad  # "*Grad" accessible through nn_grad instead of nn_ops.
+AvgPool3DGrad
+BatchNormWithGlobalNormalization
+BatchNormWithGlobalNormalizationGrad
+FusedBatchNorm
+FusedBatchNormV2
+SoftmaxCrossEntropyWithLogits
+SparseSoftmaxCrossEntropyWithLogits
+LRNGrad
+MaxPoolGrad
+MaxPoolGradWithArgmax
+MaxPoolGradGrad
+MaxPoolGradGradWithArgmax
+MaxPool3DGrad
+MaxPool3DGradGrad
+ReluGrad
+Relu6Grad
+EluGrad
+SeluGrad
+SoftplusGrad
+SoftsignGrad
+TopK
+TopKV2
+BiasAdd
+BiasAddV1
+Relu6
+AvgPool
+MaxPool
+MaxPoolV2
+Softmax
+LogSoftmax
+FractionalAvgPoolGrad
+FractionalMaxPoolGrad
+InTopK
+InTopKV2
+
+# parsing_ops
+ParseExample
+ParseSingleSequenceExample
+
+# random_ops
+RandomGamma
+RandomPoisson
+RandomUniform
+RandomUniformInt
+RandomShuffle
+RandomStandardNormal
+ParameterizedTruncatedNormal
+TruncatedNormal
+
+# script_ops
+PyFunc
+PyFuncStateless
+EagerPyFunc
+
+# sdca_ops
+
+# state_ops
+Variable
+VariableV2
+TemporaryVariable
+DestroyTemporaryVariable
+
+# sparse_ops
+AddSparseToTensorsMap
+AddManySparseToTensorsMap
+TakeManySparseFromTensorsMap
+DeserializeManySparse
+DeserializeSparse
+SerializeManySparse
+SerializeSparse
+SparseAdd
+SparseAddGrad
+SparseConcat
+SparseCross
+SparseFillEmptyRows
+SparseFillEmptyRowsGrad
+SparseSplit
+SparseSelectLastK
+SparseReorder
+SparseReshape
+SparseToDense
+SparseTensorDenseAdd
+SparseTensorDenseMatMul
+
+# string_ops
+StringSplit
+
+# user_ops
+Fact
+
+# training_ops
+# (None)
+
+# word2vec deprecated ops
+NegTrain
+Skipgram
diff --git a/tensorflow/python/ops/image_ops_impl.py b/tensorflow/python/ops/image_ops_impl.py
index 3369fe3c9b37ca05311c5548dbfa3228ba04ee80..601010bce9efaf1bcc864ce28a4c0bb8f8622823 100644
--- a/tensorflow/python/ops/image_ops_impl.py
+++ b/tensorflow/python/ops/image_ops_impl.py
@@ -269,17 +269,7 @@ def random_flip_up_down(image, seed=None):
   Raises:
     ValueError: if the shape of `image` not supported.
   """
-  with ops.name_scope(None, 'random_flip_up_down', [image]) as scope:
-    image = ops.convert_to_tensor(image, name='image')
-    image = _Assert3DImage(image)
-    uniform_random = random_ops.random_uniform([], 0, 1.0, seed=seed)
-    mirror_cond = math_ops.less(uniform_random, .5)
-    result = control_flow_ops.cond(
-        mirror_cond,
-        lambda: array_ops.reverse(image, [0]),
-        lambda: image,
-        name=scope)
-    return fix_image_flip_shape(image, result)
+  return _random_flip(image, 0, seed, 'random_flip_up_down')
 
 
 @tf_export('image.random_flip_left_right')
@@ -301,14 +291,34 @@ def random_flip_left_right(image, seed=None):
   Raises:
     ValueError: if the shape of `image` not supported.
   """
-  with ops.name_scope(None, 'random_flip_left_right', [image]) as scope:
+  return _random_flip(image, 1, seed, 'random_flip_left_right')
+
+
+def _random_flip(image, flip_index, seed, scope_name):
+  """Randomly (50% chance) flip an image along axis `flip_index`.
+    Args:
+      image: A 3-D tensor of shape `[height, width, channels].`
+      flip_index: The dimension along which to flip the image.
+                  Vertical: 0, Horizontal: 1
+      seed: A Python integer. Used to create a random seed. See
+        @{tf.set_random_seed}
+        for behavior.
+      scope_name: Name of the scope in which the ops are added.
+
+    Returns:
+      A 3-D tensor of the same type and shape as `image`.
+
+    Raises:
+      ValueError: if the shape of `image` not supported.
+  """
+  with ops.name_scope(None, scope_name, [image]) as scope:
     image = ops.convert_to_tensor(image, name='image')
     image = _Assert3DImage(image)
     uniform_random = random_ops.random_uniform([], 0, 1.0, seed=seed)
     mirror_cond = math_ops.less(uniform_random, .5)
     result = control_flow_ops.cond(
         mirror_cond,
-        lambda: array_ops.reverse(image, [1]),
+        lambda: array_ops.reverse(image, [flip_index]),
         lambda: image,
         name=scope)
     return fix_image_flip_shape(image, result)
@@ -332,16 +342,7 @@ def flip_left_right(image):
   Raises:
     ValueError: if the shape of `image` not supported.
   """
-  with ops.name_scope(None, 'flip_left_right', [image]):
-    image = ops.convert_to_tensor(image, name='image')
-    image = _AssertAtLeast3DImage(image)
-    shape = image.get_shape()
-    if shape.ndims == 3 or shape.ndims is None:
-      return fix_image_flip_shape(image, array_ops.reverse(image, [1]))
-    elif shape.ndims == 4:
-      return array_ops.reverse(image, [2])
-    else:
-      raise ValueError('\'image\' must have either 3 or 4 dimensions.')
+  return _flip(image, 1, 'flip_left_right')
 
 
 @tf_export('image.flip_up_down')
@@ -362,14 +363,35 @@ def flip_up_down(image):
   Raises:
     ValueError: if the shape of `image` not supported.
   """
-  with ops.name_scope(None, 'flip_up_down', [image]):
+  return _flip(image, 0, 'flip_up_down')
+
+
+def _flip(image, flip_index, scope_name):
+  """Flip an image either horizontally or vertically.
+
+  Outputs the contents of `image` flipped along the dimension `flip_index`.
+
+  See also `reverse()`.
+
+  Args:
+    image: 4-D Tensor of shape `[batch, height, width, channels]` or
+           3-D Tensor of shape `[height, width, channels]`.
+    flip_index: 0 For vertical, 1 for horizontal.
+
+  Returns:
+    A tensor of the same type and shape as `image`.
+
+  Raises:
+    ValueError: if the shape of `image` not supported.
+  """
+  with ops.name_scope(None, scope_name, [image]):
     image = ops.convert_to_tensor(image, name='image')
     image = _AssertAtLeast3DImage(image)
     shape = image.get_shape()
     if shape.ndims == 3 or shape.ndims is None:
-      return fix_image_flip_shape(image, array_ops.reverse(image, [0]))
+      return fix_image_flip_shape(image, array_ops.reverse(image, [flip_index]))
     elif shape.ndims == 4:
-      return array_ops.reverse(image, [1])
+      return array_ops.reverse(image, [flip_index+1])
     else:
       raise ValueError('\'image\' must have either 3 or 4 dimensions.')
 
diff --git a/tensorflow/python/ops/init_ops.py b/tensorflow/python/ops/init_ops.py
index 40ab22951b1aa04a61e09aac155b6449ae358d7b..9dfe5ffbf42bcfc9657739b6fc6ad1f3c4823a7d 100644
--- a/tensorflow/python/ops/init_ops.py
+++ b/tensorflow/python/ops/init_ops.py
@@ -532,8 +532,7 @@ class Orthogonal(Initializer):
     q, r = linalg_ops.qr(a, full_matrices=False)
     # Make Q uniform
     d = array_ops.diag_part(r)
-    ph = d / math_ops.abs(d)
-    q *= ph
+    q *= math_ops.sign(d)
     if num_rows < num_cols:
       q = array_ops.matrix_transpose(q)
     return self.gain * array_ops.reshape(q, shape)
@@ -579,7 +578,11 @@ class ConvolutionDeltaOrthogonal(Initializer):
     a = random_ops.random_normal([shape[-1], shape[-1]],
                                  dtype=dtype, seed=self.seed)
     # Compute the qr factorization
-    q, _ = linalg_ops.qr(a, full_matrices=False)
+    q, r = linalg_ops.qr(a, full_matrices=False)
+    # Make Q uniform
+    d = array_ops.diag_part(r)
+    # ph = d / math_ops.abs(d)
+    q *= math_ops.sign(d)
     q = q[:shape[-2], :]
     q *= math_ops.sqrt(math_ops.cast(self.gain, dtype=dtype))
     if len(shape) == 3:
diff --git a/tensorflow/python/ops/inplace_ops.py b/tensorflow/python/ops/inplace_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..e5b000086b05219d23cd88935948f88f2cc718bf
--- /dev/null
+++ b/tensorflow/python/ops/inplace_ops.py
@@ -0,0 +1,227 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Inplace operations.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gen_array_ops
+from tensorflow.python.ops import math_ops
+
+
+def _inplace_helper(x, i, v, op):
+  """Applies an inplace op on (x, i, v).
+
+  op is one of gen_array_ops.alias_inplace_update,
+  gen_array_ops.alias_inplace_add, or gen_array_ops.alias_inplace_sub.
+
+  If i is None, x and v must be the same shape. Computes
+    x op v;
+  If i is a scalar, x has a rank 1 higher than v's. Computes
+    x[i, :] op v;
+  Otherwise, x and v must have the same rank. Computes
+    x[i, :] op v;
+
+  Args:
+    x: A Tensor.
+    i: None, a scalar or a vector.
+    v: A Tensor.
+    op: alias_inplace_update, alias_inplace_add, or alias_inplace_sub.
+
+  Returns:
+    Returns x.
+
+  """
+  x = ops.convert_to_tensor(x)
+  v = ops.convert_to_tensor(v, x.dtype)
+  if i is None:
+    # Full tensor.
+    return array_ops.reshape(
+        op(array_ops.reshape(x, [1, -1]), [0], array_ops.reshape(v, [1, -1])),
+        array_ops.shape(x))
+  i = math_ops.to_int32(i)
+  if i.get_shape().ndims == 0:
+    # Single 0-dim update.
+    return op(x, array_ops.reshape(i, [1]), array_ops.expand_dims(v, 0))
+  return op(x, i, v)
+
+
+def alias_inplace_update(x, i, v):
+  """Applies an inplace update on input x at index i with value v. Aliases x.
+
+  If i is None, x and v must be the same shape. Computes
+    x = v;
+  If i is a scalar, x has a rank 1 higher than v's. Computes
+    x[i, :] = v;
+  Otherwise, x and v must have the same rank. Computes
+    x[i, :] = v;
+
+  Args:
+    x: A Tensor.
+    i: None, a scalar or a vector.
+    v: A Tensor.
+
+  Returns:
+    Returns x.
+
+  """
+  return _inplace_helper(x, i, v, gen_array_ops.inplace_update)
+
+
+def alias_inplace_add(x, i, v):
+  """Applies an inplace add on input x at index i with value v. Aliases x.
+
+  If i is None, x and v must be the same shape. Computes
+    x += v;
+  If i is a scalar, x has a rank 1 higher than v's. Computes
+    x[i, :] += v;
+  Otherwise, x and v must have the same rank. Computes
+    x[i, :] += v;
+
+  Args:
+    x: A Tensor.
+    i: None, a scalar or a vector.
+    v: A Tensor.
+
+  Returns:
+    Returns x.
+
+  """
+  return _inplace_helper(x, i, v, gen_array_ops.inplace_add)
+
+
+def alias_inplace_sub(x, i, v):
+  """Applies an inplace sub on input x at index i with value v. Aliases x.
+
+  If i is None, x and v must be the same shape. Computes
+    x -= v;
+  If i is a scalar, x has a rank 1 higher than v's. Computes
+    x[i, :] -= v;
+  Otherwise, x and v must have the same rank. Computes
+    x[i, :] -= v;
+
+  Args:
+    x: A Tensor.
+    i: None, a scalar or a vector.
+    v: A Tensor.
+
+  Returns:
+    Returns x.
+
+  """
+  return _inplace_helper(x, i, v, gen_array_ops.inplace_sub)
+
+
+def empty_like(x, init=None):
+  """Returns a non-initialized tensor with the same shape and dtype as x.
+
+  Args:
+    x: A Tensor.
+    init: Initialize the returned tensor with the default value of
+      x.dtype(), if True. Otherwise, do not initialize. Defaults to
+      None.
+
+  Returns:
+    A tensor y, whose dtype and shape are the same as those of x.
+    y is guaranteed not to be an alias of x. Upon return, y may contain
+    arbitrary data.
+
+  """
+  x = ops.convert_to_tensor(x)
+  return gen_array_ops.empty(array_ops.shape(x), x.dtype, init=init)
+
+
+def inplace_update(x, i, v):
+  """Applies an inplace update on input x at index i with value v.
+
+  Note that this function is not actually inplace - it allocates
+  a copy of x.  The utility is not avoiding memory copies but rather
+  specifying a sparse update.
+
+  If i is None, x and v must be the same shape. Computes
+    y = x; y = v;
+  If i is a scalar, x has a rank 1 higher than v's. Computes
+    y = x; y[i, :] = v;
+  Otherwise, x and v must have the same rank. Computes
+    y = x; y[i, :] = v;
+
+  Args:
+    x: A Tensor.
+    i: None, a scalar or a vector.
+    v: A Tensor.
+
+  Returns:
+    Returns y, which is guaranteed not to be an alias of x.
+
+  """
+  return alias_inplace_update(gen_array_ops.deep_copy(x), i, v)
+
+
+def inplace_add(x, i, v):
+  """Applies an inplace add on input x at index i with value v.
+
+  Note that this function is not actually inplace - it allocates
+  a copy of x.  The utility is not avoiding memory copies but rather
+  specifying a sparse update.
+
+  If i is None, x and v must be the same shape. Computes
+    y = x; y += v;
+  If i is a scalar, x has a rank 1 higher than v's. Computes
+    y = x; y[i, :] += v;
+  Otherwise, x and v must have the same rank. Computes
+    y = x; y[i, :] += v;
+
+  Args:
+    x: A Tensor.
+    i: None, a scalar or a vector.
+    v: A Tensor.
+
+  Returns:
+    Returns y, which is guaranteed not to be an alias of x.
+
+  """
+  return alias_inplace_add(gen_array_ops.deep_copy(x), i, v)
+
+
+def inplace_sub(x, i, v):
+  """Applies an inplace sub on input x at index i with value v.
+
+  Note that this function is not actually inplace - it allocates
+  a copy of x.  The utility is not avoiding memory copies but rather
+  specifying a sparse update.
+
+  If i is None, x and v must be the same shape. Computes
+    y = x; y -= v;
+  If i is a scalar, x has a rank 1 higher than v's. Computes
+    y = x; y[i, :] -= v;
+  Otherwise, x and v must have the same rank. Computes
+    y = x; y[i, :] -= v;
+
+  Args:
+    x: A Tensor.
+    i: None, a scalar or a vector.
+    v: A Tensor.
+
+  Returns:
+    Returns y, which is guaranteed not to be an alias of x.
+
+  """
+  return alias_inplace_sub(gen_array_ops.deep_copy(x), i, v)
+
+empty = gen_array_ops.empty
diff --git a/tensorflow/python/ops/linalg/BUILD b/tensorflow/python/ops/linalg/BUILD
index ce8c1580fe5ee614558bfd52afde0d9c5088abe6..07659ef44c443ad15876781d6c6254ae3bc38660 100644
--- a/tensorflow/python/ops/linalg/BUILD
+++ b/tensorflow/python/ops/linalg/BUILD
@@ -34,15 +34,3 @@ py_library(
         "//tensorflow/python:special_math_ops",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/python/ops/linalg/linear_operator.py b/tensorflow/python/ops/linalg/linear_operator.py
index c7513d5b40c5a4bb11501c90e08a9dc3a38c2e09..193c787baa2ac68feec7e5d8bb03b251fc78d781 100644
--- a/tensorflow/python/ops/linalg/linear_operator.py
+++ b/tensorflow/python/ops/linalg/linear_operator.py
@@ -166,8 +166,7 @@ class LinearOperator(object):
         meaning the quadratic form `x^H A x` has positive real part for all
         nonzero `x`.  Note that we do not require the operator to be
         self-adjoint to be positive-definite.  See:
-        https://en.wikipedia.org/wiki/Positive-definite_matrix\
-            #Extension_for_non_symmetric_matrices
+        https://en.wikipedia.org/wiki/Positive-definite_matrix#Extension_for_non-symmetric_matrices
       is_square:  Expect that this operator acts like square [batch] matrices.
       name: A name for this `LinearOperator`.
 
diff --git a/tensorflow/python/ops/linalg/linear_operator_composition.py b/tensorflow/python/ops/linalg/linear_operator_composition.py
index ecd30e4d7e4dd7cfd4b109ad6e60aacb172700f6..0292bc51dcf9809941087dd4aa1ea4c760c064d1 100644
--- a/tensorflow/python/ops/linalg/linear_operator_composition.py
+++ b/tensorflow/python/ops/linalg/linear_operator_composition.py
@@ -134,8 +134,7 @@ class LinearOperatorComposition(linear_operator.LinearOperator):
         meaning the quadratic form `x^H A x` has positive real part for all
         nonzero `x`.  Note that we do not require the operator to be
         self-adjoint to be positive-definite.  See:
-        https://en.wikipedia.org/wiki/Positive-definite_matrix\
-            #Extension_for_non_symmetric_matrices
+        https://en.wikipedia.org/wiki/Positive-definite_matrix#Extension_for_non-symmetric_matrices
       is_square:  Expect that this operator acts like square [batch] matrices.
       name: A name for this `LinearOperator`.  Default is the individual
         operators names joined with `_o_`.
diff --git a/tensorflow/python/ops/linalg/linear_operator_diag.py b/tensorflow/python/ops/linalg/linear_operator_diag.py
index e180e830263c44fb5ae290d307f1ef80106c31d5..5beaea65a5171ad7e92042a2afa81c0507e51d0e 100644
--- a/tensorflow/python/ops/linalg/linear_operator_diag.py
+++ b/tensorflow/python/ops/linalg/linear_operator_diag.py
@@ -132,8 +132,7 @@ class LinearOperatorDiag(linear_operator.LinearOperator):
         meaning the quadratic form `x^H A x` has positive real part for all
         nonzero `x`.  Note that we do not require the operator to be
         self-adjoint to be positive-definite.  See:
-        https://en.wikipedia.org/wiki/Positive-definite_matrix\
-            #Extension_for_non_symmetric_matrices
+        https://en.wikipedia.org/wiki/Positive-definite_matrix#Extension_for_non-symmetric_matrices
       is_square:  Expect that this operator acts like square [batch] matrices.
       name: A name for this `LinearOperator`.
 
diff --git a/tensorflow/python/ops/linalg/linear_operator_full_matrix.py b/tensorflow/python/ops/linalg/linear_operator_full_matrix.py
index f979fb37d6c69a2683af08a1f6722b98da0b6650..5ba3b090ae9decaba239b31226db84c2d7b254bd 100644
--- a/tensorflow/python/ops/linalg/linear_operator_full_matrix.py
+++ b/tensorflow/python/ops/linalg/linear_operator_full_matrix.py
@@ -125,8 +125,7 @@ class LinearOperatorFullMatrix(linear_operator.LinearOperator):
         meaning the quadratic form `x^H A x` has positive real part for all
         nonzero `x`.  Note that we do not require the operator to be
         self-adjoint to be positive-definite.  See:
-        https://en.wikipedia.org/wiki/Positive-definite_matrix\
-            #Extension_for_non_symmetric_matrices
+        https://en.wikipedia.org/wiki/Positive-definite_matrix#Extension_for_non-symmetric_matrices
       is_square:  Expect that this operator acts like square [batch] matrices.
       name: A name for this `LinearOperator`.
 
diff --git a/tensorflow/python/ops/linalg/linear_operator_identity.py b/tensorflow/python/ops/linalg/linear_operator_identity.py
index 50f3d407e85e4cca22ad6326931b5a2a736819a8..45929eb4e2e91218784a9fabba23b57851ae3cc8 100644
--- a/tensorflow/python/ops/linalg/linear_operator_identity.py
+++ b/tensorflow/python/ops/linalg/linear_operator_identity.py
@@ -236,8 +236,7 @@ class LinearOperatorIdentity(BaseLinearOperatorIdentity):
         meaning the quadratic form `x^H A x` has positive real part for all
         nonzero `x`.  Note that we do not require the operator to be
         self-adjoint to be positive-definite.  See:
-        https://en.wikipedia.org/wiki/Positive-definite_matrix\
-            #Extension_for_non_symmetric_matrices
+        https://en.wikipedia.org/wiki/Positive-definite_matrix#Extension_for_non-symmetric_matrices
       is_square:  Expect that this operator acts like square [batch] matrices.
       assert_proper_shapes:  Python `bool`.  If `False`, only perform static
         checks that initialization and method arguments have proper shape.
@@ -576,8 +575,7 @@ class LinearOperatorScaledIdentity(BaseLinearOperatorIdentity):
         meaning the quadratic form `x^H A x` has positive real part for all
         nonzero `x`.  Note that we do not require the operator to be
         self-adjoint to be positive-definite.  See:
-        https://en.wikipedia.org/wiki/Positive-definite_matrix\
-            #Extension_for_non_symmetric_matrices
+        https://en.wikipedia.org/wiki/Positive-definite_matrix#Extension_for_non-symmetric_matrices
       is_square:  Expect that this operator acts like square [batch] matrices.
       assert_proper_shapes:  Python `bool`.  If `False`, only perform static
         checks that initialization and method arguments have proper shape.
diff --git a/tensorflow/python/ops/linalg/linear_operator_lower_triangular.py b/tensorflow/python/ops/linalg/linear_operator_lower_triangular.py
index a5130188b681813e1ccd4818dabdffeeb663e20a..c4d386ccb4efc7dede8310243e517fe2f6b45bd9 100644
--- a/tensorflow/python/ops/linalg/linear_operator_lower_triangular.py
+++ b/tensorflow/python/ops/linalg/linear_operator_lower_triangular.py
@@ -133,8 +133,7 @@ class LinearOperatorLowerTriangular(linear_operator.LinearOperator):
         meaning the quadratic form `x^H A x` has positive real part for all
         nonzero `x`.  Note that we do not require the operator to be
         self-adjoint to be positive-definite.  See:
-        https://en.wikipedia.org/wiki/Positive-definite_matrix\
-            #Extension_for_non_symmetric_matrices
+        https://en.wikipedia.org/wiki/Positive-definite_matrix#Extension_for_non-symmetric_matrices
       is_square:  Expect that this operator acts like square [batch] matrices.
       name: A name for this `LinearOperator`.
 
diff --git a/tensorflow/python/ops/linalg/linear_operator_util.py b/tensorflow/python/ops/linalg/linear_operator_util.py
index 427bd1e890305618264a6a588be4e4ffade33c01..9dd40765c20222c6998260547b7e8fa341e65437 100644
--- a/tensorflow/python/ops/linalg/linear_operator_util.py
+++ b/tensorflow/python/ops/linalg/linear_operator_util.py
@@ -23,6 +23,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import linalg_ops
 from tensorflow.python.ops import math_ops
 
 
@@ -102,6 +103,22 @@ def assert_is_batch_matrix(tensor):
         "%s" % tensor)
 
 
+def shape_tensor(shape, name=None):
+  """Convert Tensor using default type, unless empty list or tuple."""
+  # Works just like random_ops._ShapeTensor.
+  if isinstance(shape, (tuple, list)) and not shape:
+    dtype = dtypes.int32
+  else:
+    dtype = None
+  return ops.convert_to_tensor(shape, dtype=dtype, name=name)
+
+
+################################################################################
+# Broadcasting versions of common linear algebra functions.
+# TODO(b/77519145) Do this more efficiently in some special cases.
+################################################################################
+
+
 def broadcast_matrix_batch_dims(batch_matrices, name=None):
   """Broadcast leading dimensions of zero or more [batch] matrices.
 
@@ -170,7 +187,8 @@ def broadcast_matrix_batch_dims(batch_matrices, name=None):
     bcast_batch_shape = batch_matrices[0].get_shape()[:-2]
     for mat in batch_matrices[1:]:
       bcast_batch_shape = array_ops.broadcast_static_shape(
-          bcast_batch_shape, mat.get_shape()[:-2])
+          bcast_batch_shape,
+          mat.get_shape()[:-2])
     if bcast_batch_shape.is_fully_defined():
       # The [1, 1] at the end will broadcast with anything.
       bcast_shape = bcast_batch_shape.concatenate([1, 1])
@@ -183,7 +201,8 @@ def broadcast_matrix_batch_dims(batch_matrices, name=None):
     bcast_batch_shape = array_ops.shape(batch_matrices[0])[:-2]
     for mat in batch_matrices[1:]:
       bcast_batch_shape = array_ops.broadcast_dynamic_shape(
-          bcast_batch_shape, array_ops.shape(mat)[:-2])
+          bcast_batch_shape,
+          array_ops.shape(mat)[:-2])
     bcast_shape = array_ops.concat([bcast_batch_shape, [1, 1]], axis=0)
     for i, mat in enumerate(batch_matrices):
       batch_matrices[i] = _broadcast_to_shape(mat, bcast_shape)
@@ -195,6 +214,13 @@ def _broadcast_to_shape(x, shape):
   return x + array_ops.zeros(shape=shape, dtype=x.dtype)
 
 
+def cholesky_solve_with_broadcast(chol, rhs, name=None):
+  """Solve systems of linear equations."""
+  with ops.name_scope(name, "CholeskySolveWithBroadcast", [chol, rhs]):
+    chol, rhs = broadcast_matrix_batch_dims([chol, rhs])
+    return linalg_ops.cholesky_solve(chol, rhs)
+
+
 def matmul_with_broadcast(a,
                           b,
                           transpose_a=False,
@@ -206,6 +232,11 @@ def matmul_with_broadcast(a,
                           name=None):
   """Multiplies matrix `a` by matrix `b`, producing `a @ b`.
 
+  Works identically to `tf.matmul`, but broadcasts batch dims
+  of `a` and `b` (by replicating) if they are determined statically to be
+  different, or if static shapes are not fully defined.  Thus, this may result
+  in an inefficient replication of data.
+
   The inputs must be matrices (or tensors of rank > 2, representing batches of
   matrices).
 
@@ -276,7 +307,7 @@ def matmul_with_broadcast(a,
     ValueError: If transpose_a and adjoint_a, or transpose_b and adjoint_b
       are both set to True.
   """
-  with ops.name_scope(name, "MatMulWithBroadcast", [a, b]) as name:
+  with ops.name_scope(name, "MatMulWithBroadcast", [a, b]):
     a, b = broadcast_matrix_batch_dims([a, b])
     return math_ops.matmul(
         a,
@@ -289,11 +320,43 @@ def matmul_with_broadcast(a,
         b_is_sparse=b_is_sparse)
 
 
-def shape_tensor(shape, name=None):
-  """Convert Tensor using default type, unless empty list or tuple."""
-  # Works just like random_ops._ShapeTensor.
-  if isinstance(shape, (tuple, list)) and not shape:
-    dtype = dtypes.int32
-  else:
-    dtype = None
-  return ops.convert_to_tensor(shape, dtype=dtype, name=name)
+def matrix_solve_with_broadcast(matrix, rhs, adjoint=False, name=None):
+  """Solve systems of linear equations."""
+  with ops.name_scope(name, "MatrixSolveWithBroadcast", [matrix, rhs]):
+    matrix, rhs = broadcast_matrix_batch_dims([matrix, rhs])
+    return linalg_ops.matrix_solve(matrix, rhs, adjoint=adjoint)
+
+
+def matrix_triangular_solve_with_broadcast(matrix,
+                                           rhs,
+                                           lower=True,
+                                           adjoint=False,
+                                           name=None):
+  """Solves triangular systems of linear equations with by backsubstitution.
+
+  Works identically to `tf.matrix_triangular_solve`, but broadcasts batch dims
+  of `matrix` and `rhs` (by replicating) if they are determined statically to be
+  different, or if static shapes are not fully defined.  Thus, this may result
+  in an inefficient replication of data.
+
+  Args:
+    matrix: A Tensor. Must be one of the following types:
+      `float64`, `float32`, `complex64`, `complex128`. Shape is `[..., M, M]`.
+    rhs: A `Tensor`. Must have the same `dtype` as `matrix`.
+      Shape is `[..., M, K]`.
+    lower: An optional `bool`. Defaults to `True`. Indicates whether the
+      innermost matrices in `matrix` are lower or upper triangular.
+    adjoint: An optional `bool`. Defaults to `False`. Indicates whether to solve
+      with matrix or its (block-wise) adjoint.
+    name: A name for the operation (optional).
+
+  Returns:
+    `Tensor` with same `dtype` as `matrix` and shape `[..., M, K]`.
+  """
+  with ops.name_scope(name, "MatrixTriangularSolve", [matrix, rhs]):
+    matrix, rhs = broadcast_matrix_batch_dims([matrix, rhs])
+    return linalg_ops.matrix_triangular_solve(
+        matrix,
+        rhs,
+        lower=lower,
+        adjoint=adjoint)
diff --git a/tensorflow/python/ops/list_ops.py b/tensorflow/python/ops/list_ops.py
index bba59ebcef9c7caf1a53d724767999ae7ac079e5..bdf0774bbf834ec10f68423e89e3b8b9b96ad9a1 100644
--- a/tensorflow/python/ops/list_ops.py
+++ b/tensorflow/python/ops/list_ops.py
@@ -54,8 +54,8 @@ def _TensorListStackGrad(unused_op, dtensor):
 @ops.RegisterGradient("TensorListFromTensor")
 def _TensorListFromTensorGrad(op, dlist):
   """Gradient for TensorListFromTensor."""
-  if op.inputs[0].shape[0] is not None:
-    num_elements = op.inputs[0].shape[0]
+  if op.inputs[0].shape[0].value is not None:
+    num_elements = op.inputs[0].shape[0].value
   else:
     num_elements = None
   if dlist is None:
@@ -63,9 +63,10 @@ def _TensorListFromTensorGrad(op, dlist):
         element_dtype=op.inputs[0].dtype,
         element_shape=gen_list_ops.tensor_list_element_shape(
             op.outputs[0], shape_type=dtypes.int32))
-  return gen_list_ops.tensor_list_stack(
-      dlist, element_dtype=op.inputs[0].dtype,
-      num_elements=num_elements)
+  tensor_grad = gen_list_ops.tensor_list_stack(
+      dlist, element_dtype=op.inputs[0].dtype, num_elements=num_elements)
+  shape_grad = None
+  return tensor_grad, shape_grad
 
 
 @ops.RegisterGradient("TensorListGetItem")
diff --git a/tensorflow/python/ops/losses/BUILD b/tensorflow/python/ops/losses/BUILD
index 07741e0c3c3ea8a9bb7d790b901e743907794dc0..4aea0265a72dcd2b2358f063fb0a51a5877076e7 100644
--- a/tensorflow/python/ops/losses/BUILD
+++ b/tensorflow/python/ops/losses/BUILD
@@ -43,15 +43,3 @@ py_test(
         "//tensorflow/python:framework_for_generated_wrappers",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/python/ops/losses/losses_impl.py b/tensorflow/python/ops/losses/losses_impl.py
index 34ca1adc3e13dc67560fb21d70c16cd42dc40552..19a8eaf22cdff132b3351f4a94f27ceca9b48cc6 100644
--- a/tensorflow/python/ops/losses/losses_impl.py
+++ b/tensorflow/python/ops/losses/losses_impl.py
@@ -29,6 +29,7 @@ from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import weights_broadcast_ops
 from tensorflow.python.ops.losses import util
 from tensorflow.python.util.deprecation import deprecated_args
+from tensorflow.python.util.deprecation import deprecated_argument_lookup
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -306,11 +307,8 @@ def cosine_distance(
     ValueError: If `predictions` shape doesn't match `labels` shape, or
       `axis`, `labels`, `predictions` or `weights` is `None`.
   """
-  if dim is not None:
-    if axis is not None:
-      raise ValueError("Cannot specify both 'axis' and 'dim'")
-    axis = dim
-  if axis is None and dim is None:
+  axis = deprecated_argument_lookup("axis", axis, "dim", dim)
+  if axis is None:
     raise ValueError("You must specify 'axis'.")
   if labels is None:
     raise ValueError("labels must not be None.")
diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py
index 276897ab99e5e8770b72cb1eb27d07fb8dbc08bb..a38ecb2acb4145aec9bdd700e6a2fc179783e3c3 100644
--- a/tensorflow/python/ops/math_ops.py
+++ b/tensorflow/python/ops/math_ops.py
@@ -174,6 +174,7 @@ from tensorflow.python.ops.gen_math_ops import *
 # pylint: enable=wildcard-import
 from tensorflow.python.util import compat
 from tensorflow.python.util import deprecation
+from tensorflow.python.util import nest
 from tensorflow.python.util.tf_export import tf_export
 
 # Aliases for some automatically-generated names.
@@ -184,7 +185,6 @@ arg_min = deprecation.deprecated(None, "Use `argmin` instead")(arg_min)  # pylin
 tf_export("arg_max")(arg_max)
 tf_export("arg_min")(arg_min)
 
-
 # This is set by resource_variable_ops.py. It is included in this way since
 # there is a circular dependency between math_ops and resource_variable_ops
 _resource_variable_type = None
@@ -1343,8 +1343,7 @@ def _ReductionDims(x, axis, reduction_indices):
   else:
     # Fast path: avoid creating Rank and Range ops if ndims is known.
     if isinstance(x, ops.Tensor) and x._rank() is not None:  # pylint: disable=protected-access
-      return constant_op.constant(
-          np.arange(x._rank()), dtype=dtypes.int32)  # pylint: disable=protected-access
+      return constant_op.constant(np.arange(x._rank()), dtype=dtypes.int32)  # pylint: disable=protected-access
     if (isinstance(x, sparse_tensor.SparseTensor) and
         x.dense_shape.get_shape().is_fully_defined()):
       rank = x.dense_shape.get_shape()[0].value  # sparse.dense_shape is 1-D.
@@ -1403,10 +1402,11 @@ def reduce_sum(input_tensor,
     keep_dims: Deprecated alias for `keepdims`.
 
   Returns:
-    The reduced tensor.
+    The reduced tensor, of the same dtype as the input_tensor.
 
   @compatibility(numpy)
-  Equivalent to np.sum
+  Equivalent to np.sum appart the fact that numpy upcast uint8 and int32 to
+  int64 while tensorflow returns the same dtype as the input.
   @end_compatibility
   """
   keepdims = deprecation.deprecated_argument_lookup("keepdims", keepdims,
@@ -1522,7 +1522,7 @@ def reduce_mean(input_tensor,
     input_tensor: The tensor to reduce. Should have numeric type.
     axis: The dimensions to reduce. If `None` (the default),
       reduces all dimensions. Must be in the range
-      `[-rank(input_tensor), rank(input_tensor)]`.
+      `[-rank(input_tensor), rank(input_tensor))`.
     keepdims: If true, retains reduced dimensions with length 1.
     name: A name for the operation (optional).
     reduction_indices: The old (deprecated) name for axis.
@@ -1632,7 +1632,7 @@ def reduce_min(input_tensor,
   tensor with a single element is returned.
 
   Args:
-    input_tensor: The tensor to reduce. Should have numeric type.
+    input_tensor: The tensor to reduce. Should have real numeric type.
     axis: The dimensions to reduce. If `None` (the default),
       reduces all dimensions. Must be in the range
       `[-rank(input_tensor), rank(input_tensor))`.
@@ -1681,7 +1681,7 @@ def reduce_max(input_tensor,
   tensor with a single element is returned.
 
   Args:
-    input_tensor: The tensor to reduce. Should have numeric type.
+    input_tensor: The tensor to reduce. Should have real numeric type.
     axis: The dimensions to reduce. If `None` (the default),
       reduces all dimensions. Must be in the range
       `[-rank(input_tensor), rank(input_tensor))`.
@@ -2273,10 +2273,11 @@ def accumulate_n(inputs, shape=None, tensor_dtype=None, name=None):
     ValueError: If `inputs` don't all have same shape and dtype or the shape
     cannot be inferred.
   """
+
   def _input_error():
-    return ValueError(
-        "inputs must be a list of at least one Tensor with the "
-        "same dtype and shape")
+    return ValueError("inputs must be a list of at least one Tensor with the "
+                      "same dtype and shape")
+
   if not inputs or not isinstance(inputs, (list, tuple)):
     raise _input_error()
   inputs = ops.convert_n_to_tensor_or_indexed_slices(inputs)
@@ -2294,8 +2295,8 @@ def accumulate_n(inputs, shape=None, tensor_dtype=None, name=None):
 
   # tensor_dtype is for safety only; operator's output type computed in C++
   if tensor_dtype is not None and tensor_dtype != inputs[0].dtype:
-    raise TypeError("tensor_dtype is {}, but input is of type {}"
-                    .format(tensor_dtype, inputs[0].dtype))
+    raise TypeError("tensor_dtype is {}, but input is of type {}".format(
+        tensor_dtype, inputs[0].dtype))
 
   if len(inputs) == 1 and name is None:
     return inputs[0]
@@ -2761,14 +2762,14 @@ def sparse_segment_sum(data, indices, segment_ids, name=None,
         name=name)
   else:
     return gen_math_ops.sparse_segment_sum(
-        data=data,
-        indices=indices,
-        segment_ids=segment_ids,
-        name=name)
+        data=data, indices=indices, segment_ids=segment_ids, name=name)
 
 
 @tf_export("sparse_segment_mean")
-def sparse_segment_mean(data, indices, segment_ids, name=None,
+def sparse_segment_mean(data,
+                        indices,
+                        segment_ids,
+                        name=None,
                         num_segments=None):
   r"""Computes the mean along sparse segments of a tensor.
 
@@ -2805,14 +2806,14 @@ def sparse_segment_mean(data, indices, segment_ids, name=None,
         name=name)
   else:
     return gen_math_ops.sparse_segment_mean(
-        data=data,
-        indices=indices,
-        segment_ids=segment_ids,
-        name=name)
+        data=data, indices=indices, segment_ids=segment_ids, name=name)
 
 
 @tf_export("sparse_segment_sqrt_n")
-def sparse_segment_sqrt_n(data, indices, segment_ids, name=None,
+def sparse_segment_sqrt_n(data,
+                          indices,
+                          segment_ids,
+                          name=None,
                           num_segments=None):
   r"""Computes the sum along sparse segments of a tensor divided by the sqrt(N).
 
@@ -2842,10 +2843,7 @@ def sparse_segment_sqrt_n(data, indices, segment_ids, name=None,
         name=name)
   else:
     return gen_math_ops.sparse_segment_sqrt_n(
-        data=data,
-        indices=indices,
-        segment_ids=segment_ids,
-        name=name)
+        data=data, indices=indices, segment_ids=segment_ids, name=name)
 
 
 @tf_export("tensordot", "linalg.tensordot")
@@ -3016,6 +3014,47 @@ def tensordot(a, b, axes, name=None):
       return product
 
 
+@tf_export("math.polyval")
+def polyval(coeffs, x, name=None):
+  r"""Computes the elementwise value of a polynomial.
+
+  If `x` is a tensor and `coeffs` is a list n + 1 tensors, this function returns
+  the value of the n-th order polynomial
+
+     p(x) = coeffs[n-1] + coeffs[n-2] * x + ...  + coeffs[0] * x**(n-1)
+
+  evaluated using Horner's method, i.e.
+
+     p(x) = coeffs[n-1] + x * (coeffs[n-2] + ... + x * (coeffs[1] +
+            x * coeffs[0]))
+
+  Args:
+    coeffs: A list of `Tensor` representing the coefficients of the polynomial.
+    x: A `Tensor` representing the variable of the polynomial.
+    name: A name for the operation (optional).
+
+  Returns:
+    A `tensor` of the shape as the expression p(x) with usual broadcasting rules
+    for element-wise addition and multiplication applied.
+
+  @compatibility(numpy)
+  Equivalent to numpy.polyval.
+  @end_compatibility
+  """
+
+  with ops.name_scope(name, "polyval", nest.flatten(coeffs) + [x]) as name:
+    x = ops.convert_to_tensor(x, name="x")
+    if len(coeffs) < 1:
+      return array_ops.zeros_like(x, name=name)
+    coeffs = [
+        ops.convert_to_tensor(coeff, name=("coeff_%d" % index))
+        for index, coeff in enumerate(coeffs)
+    ]
+    p = coeffs[0]
+    for c in coeffs[1:]:
+      p = c + p * x
+    return p
+
 # FFT ops were moved to tf.spectral. tf.fft symbols were part of the TensorFlow
 # 1.0 API so we leave these here for backwards compatibility.
 fft = gen_spectral_ops.fft
diff --git a/tensorflow/python/ops/math_ops_test.py b/tensorflow/python/ops/math_ops_test.py
index 9f85188b3513563a7444f7a0e908f11af985498b..05bcee8801259e4bc6c20c3f61cf20025ba5ea33 100644
--- a/tensorflow/python/ops/math_ops_test.py
+++ b/tensorflow/python/ops/math_ops_test.py
@@ -155,9 +155,7 @@ class RoundTest(test_util.TensorFlowTestCase):
 
   @test_util.run_in_graph_and_eager_modes()
   def testRounding(self):
-    x = [0.49, 0.7, -0.3, -0.8]
-    # TODO(nolivia): Remove this when RoundOp is forwards compatible
-    # x = np.arange(-5.0, 5.0, .25)
+    x = np.arange(-5.0, 5.0, .25)
     for dtype in [np.float32, np.double, np.int32]:
       x_np = np.array(x, dtype=dtype)
       with test_util.device(use_gpu=True):
diff --git a/tensorflow/python/ops/metrics_impl.py b/tensorflow/python/ops/metrics_impl.py
index 9ec49545796cfa7a603b31c23bfd0d495639898d..47eea6ef6b58abd4819544e29783048964104922 100644
--- a/tensorflow/python/ops/metrics_impl.py
+++ b/tensorflow/python/ops/metrics_impl.py
@@ -33,6 +33,7 @@ from tensorflow.python.ops import sparse_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import weights_broadcast_ops
+from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util.deprecation import deprecated
 from tensorflow.python.util.tf_export import tf_export
 
@@ -626,10 +627,16 @@ def auc(labels,
     curve: Specifies the name of the curve to be computed, 'ROC' [default] or
       'PR' for the Precision-Recall-curve.
     name: An optional variable_scope name.
-    summation_method: Specifies the Riemann summation method used, 'trapezoidal'
-      [default] that applies the trapezoidal rule, 'minoring' that applies
-      left summation for increasing intervals and right summation for decreasing
-      intervals or 'majoring' that applies the opposite.
+    summation_method: Specifies the Riemann summation method used
+      (https://en.wikipedia.org/wiki/Riemann_sum): 'trapezoidal' [default] that
+      applies the trapezoidal rule; 'careful_interpolation', a variant of it
+      differing only by a more correct interpolation scheme for PR-AUC -
+      interpolating (true/false) positives but not the ratio that is precision;
+      'minoring' that applies left summation for increasing intervals and right
+      summation for decreasing intervals; 'majoring' that does the opposite.
+      Note that 'careful_interpolation' is strictly preferred to 'trapezoidal'
+      (to be deprecated soon) as it applies the same method for ROC, and a
+      better one (see Davis & Goadrich 2006 for details) for the PR curve.
 
   Returns:
     auc: A scalar `Tensor` representing the current area-under-curve.
@@ -664,8 +671,62 @@ def auc(labels,
     # Add epsilons to avoid dividing by 0.
     epsilon = 1.0e-6
 
+    def interpolate_pr_auc(tp, fp, fn):
+      """Interpolation formula inspired by section 4 of Davis & Goadrich 2006.
+
+      Note here we derive & use a closed formula not present in the paper
+      - as follows:
+      Modeling all of TP (true positive weight),
+      FP (false positive weight) and their sum P = TP + FP (positive weight)
+      as varying linearly within each interval [A, B] between successive
+      thresholds, we get
+        Precision = (TP_A + slope * (P - P_A)) / P
+      with slope = dTP / dP = (TP_B - TP_A) / (P_B - P_A).
+      The area within the interval is thus (slope / total_pos_weight) times
+        int_A^B{Precision.dP} = int_A^B{(TP_A + slope * (P - P_A)) * dP / P}
+        int_A^B{Precision.dP} = int_A^B{slope * dP + intercept * dP / P}
+      where intercept = TP_A - slope * P_A = TP_B - slope * P_B, resulting in
+        int_A^B{Precision.dP} = TP_B - TP_A + intercept * log(P_B / P_A)
+      Bringing back the factor (slope / total_pos_weight) we'd put aside, we get
+         slope * [dTP + intercept *  log(P_B / P_A)] / total_pos_weight
+      where dTP == TP_B - TP_A.
+      Note that when P_A == 0 the above calculation simplifies into
+        int_A^B{Precision.dTP} = int_A^B{slope * dTP} = slope * (TP_B - TP_A)
+      which is really equivalent to imputing constant precision throughout the
+      first bucket having >0 true positives.
+
+      Args:
+        tp: true positive counts
+        fp: false positive counts
+        fn: false negative counts
+      Returns:
+        pr_auc: an approximation of the area under the P-R curve.
+      """
+      dtp = tp[:num_thresholds - 1] - tp[1:]
+      p = tp + fp
+      prec_slope = _safe_div(dtp, p[:num_thresholds - 1] - p[1:], 'prec_slope')
+      intercept = tp[1:] - math_ops.multiply(prec_slope, p[1:])
+      safe_p_ratio = array_ops.where(
+          math_ops.logical_and(p[:num_thresholds - 1] > 0, p[1:] > 0),
+          _safe_div(p[:num_thresholds - 1], p[1:], 'recall_relative_ratio'),
+          array_ops.ones_like(p[1:]))
+      return math_ops.reduce_sum(
+          _safe_div(
+              prec_slope * (dtp + intercept * math_ops.log(safe_p_ratio)),
+              tp[1:] + fn[1:],
+              name='pr_auc_increment'),
+          name='interpolate_pr_auc')
+
     def compute_auc(tp, fn, tn, fp, name):
       """Computes the roc-auc or pr-auc based on confusion counts."""
+      if curve == 'PR':
+        if summation_method == 'trapezoidal':
+          logging.warning(
+              'Trapezoidal rule is known to produce incorrect PR-AUCs; '
+              'please switch to "careful_interpolation" instead.')
+        elif summation_method == 'careful_interpolation':
+          # This one is a bit tricky and is handled separately.
+          return interpolate_pr_auc(tp, fp, fn)
       rec = math_ops.div(tp + epsilon, tp + fn + epsilon)
       if curve == 'ROC':
         fp_rate = math_ops.div(fp, fp + tn + epsilon)
@@ -675,7 +736,9 @@ def auc(labels,
         prec = math_ops.div(tp + epsilon, tp + fp + epsilon)
         x = rec
         y = prec
-      if summation_method == 'trapezoidal':
+      if summation_method in ('trapezoidal', 'careful_interpolation'):
+        # Note that the case ('PR', 'careful_interpolation') has been handled
+        # above.
         return math_ops.reduce_sum(
             math_ops.multiply(x[:num_thresholds - 1] - x[1:],
                               (y[:num_thresholds - 1] + y[1:]) / 2.),
@@ -923,8 +986,8 @@ def mean_per_class_accuracy(labels,
         weights = array_ops.reshape(weights, [-1])
       weights = math_ops.to_float(weights)
 
-      is_correct = is_correct * weights
-      ones = ones * weights
+      is_correct *= weights
+      ones *= weights
 
     update_total_op = state_ops.scatter_add(total, labels, ones)
     update_count_op = state_ops.scatter_add(count, labels, is_correct)
diff --git a/tensorflow/python/ops/nn.py b/tensorflow/python/ops/nn.py
index ee1a00623a734e18d4aebe6c84f77ba53ee1050c..244702d13beaa4a9ba86a47622afe40c1db777e3 100644
--- a/tensorflow/python/ops/nn.py
+++ b/tensorflow/python/ops/nn.py
@@ -126,8 +126,6 @@ from tensorflow.python.ops.nn_impl import *
 from tensorflow.python.ops.nn_ops import *
 from tensorflow.python.ops.candidate_sampling_ops import *
 from tensorflow.python.ops.embedding_ops import *
-from tensorflow.python.ops.rnn import *
-from tensorflow.python.ops import rnn_cell
 # pylint: enable=wildcard-import,unused-import
 
 
diff --git a/tensorflow/python/ops/nn_ops.py b/tensorflow/python/ops/nn_ops.py
index a74de39eab34a1a27df90f70adf0f4c68ec29465..07ca32953f796466964d4555e45052fcf3c53ce0 100644
--- a/tensorflow/python/ops/nn_ops.py
+++ b/tensorflow/python/ops/nn_ops.py
@@ -1808,7 +1808,7 @@ def softmax_cross_entropy_with_logits_v2(
   or `float64`).
 
   Backpropagation will happen into both `logits` and `labels`.  To disallow
-  backpropagation into `labels`, pass label tensors through a `stop_gradients`
+  backpropagation into `labels`, pass label tensors through @{tf.stop_gradient}
   before feeding it to this function.
 
   **Note that to avoid confusion, it is required to pass only named arguments to
@@ -1836,8 +1836,10 @@ def softmax_cross_entropy_with_logits_v2(
                       [logits, labels]) as name:
     logits = ops.convert_to_tensor(logits, name="logits")
     labels = ops.convert_to_tensor(labels, name="labels")
+    convert_to_float32 = (
+        logits.dtype == dtypes.float16 or logits.dtype == dtypes.bfloat16)
     precise_logits = math_ops.cast(
-        logits, dtypes.float32) if (logits.dtype == dtypes.float16) else logits
+        logits, dtypes.float32) if convert_to_float32 else logits
     # labels and logits must be of the same type
     labels = math_ops.cast(labels, precise_logits.dtype)
     input_rank = array_ops.rank(precise_logits)
@@ -1883,8 +1885,8 @@ def softmax_cross_entropy_with_logits_v2(
       del shape[dim]
       cost.set_shape(shape)
 
-    if logits.dtype == dtypes.float16:
-      return math_ops.cast(cost, dtypes.float16)
+    if convert_to_float32:
+      return math_ops.cast(cost, logits.dtype)
     else:
       return cost
 
@@ -1893,7 +1895,7 @@ _XENT_DEPRECATION = """
 Future major versions of TensorFlow will allow gradients to flow
 into the labels input on backprop by default.
 
-See tf.nn.softmax_cross_entropy_with_logits_v2.
+See @{tf.nn.softmax_cross_entropy_with_logits_v2}.
 """
 
 
diff --git a/tensorflow/python/ops/nn_test.py b/tensorflow/python/ops/nn_test.py
index af9dae2aa64f0994f403ac81dcba800699d3c960..46a5f4fae6b15766c21011ebeae5437262192df7 100644
--- a/tensorflow/python/ops/nn_test.py
+++ b/tensorflow/python/ops/nn_test.py
@@ -852,6 +852,57 @@ class ComputeSampledLogitsTest(test_lib.TestCase):
       self.assertAllClose(exp_sampled_softmax_loss,
                           got_sampled_softmax_loss.eval(), 1e-4)
 
+  def testSampledSoftmaxLossBf16(self):
+    # A simple test to verify the numerics for bfloat16.
+    def _SoftmaxCrossEntropyWithLogits(logits, targets):
+      # logits, targets: float arrays of the same shape.
+      assert logits.shape == targets.shape
+      stable_exp_logits = np.exp(
+          logits - np.amax(logits, axis=1, keepdims=True))
+      pred = stable_exp_logits / np.sum(stable_exp_logits, 1, keepdims=True)
+      return -np.sum(targets * np.log(pred + 1.0e-20), axis=1)
+
+    np.random.seed(0)
+    num_classes = 5
+    batch_size = 3
+    labels = [0, 1, 2]
+    sampled = [1, 0, 2, 3]
+    (weights, biases, hidden_acts, _, exp_logits,
+     exp_labels) = self._GenerateTestData(
+         num_classes=num_classes,
+         dim=10,
+         batch_size=batch_size,
+         num_true=1,
+         labels=labels,
+         sampled=sampled,
+         subtract_log_q=True)
+    exp_sampled_softmax_loss = _SoftmaxCrossEntropyWithLogits(
+        exp_logits, exp_labels)
+
+    with self.test_session():
+      true_exp_bf16 = np.full(
+          [batch_size, 1], fill_value=0.5, dtype=dtypes.bfloat16.as_numpy_dtype)
+      sampled_exp_bf16 = np.full(
+          [len(sampled)], fill_value=0.5, dtype=dtypes.bfloat16.as_numpy_dtype)
+      sampled_vals_bf16 = (sampled, true_exp_bf16, sampled_exp_bf16)
+
+      got_sampled_softmax_loss = math_ops.cast(
+          nn_impl.sampled_softmax_loss(
+              weights=constant_op.constant(weights, dtype=dtypes.bfloat16),
+              biases=constant_op.constant(biases, dtype=dtypes.bfloat16),
+              labels=constant_op.constant(
+                  labels, shape=(batch_size, 1), dtype=dtypes.bfloat16),
+              inputs=constant_op.constant(hidden_acts, dtype=dtypes.bfloat16),
+              num_sampled=4,
+              num_classes=num_classes,
+              num_true=1,
+              sampled_values=sampled_vals_bf16,
+              remove_accidental_hits=False,
+              partition_strategy="div"), dtypes.float32)
+
+      self.assertAllClose(exp_sampled_softmax_loss,
+                          got_sampled_softmax_loss.eval(), 1e-1)
+
 
 class CReluTest(test_lib.TestCase):
 
@@ -1030,6 +1081,42 @@ class DataFormatDimMapTest(test_lib.TestCase):
     self._test([1, -3, -2], [2, 2, 3])
     self._test([[1, -3], [1, -1]], [[2, 2], [2, 1]])
 
+  def testNHWCtoNCHW(self):
+    x_val = [1, -3, -2]
+    y_val_expected = [2, 2, 3]
+    x = constant_op.constant(x_val)
+    y = nn_ops.data_format_dim_map(x, src_format="NHWC", dst_format="NCHW")
+    with self.test_session(use_gpu=test_lib.is_gpu_available()) as sess:
+      y_val = sess.run(y)
+      self.assertAllEqual(y_val, y_val_expected)
+
+  def testNHWCtoHWNC(self):
+    x_val = [-4, -3, -2, -1, 0, 1, 2, 3]
+    y_val_expected = [2, 0, 1, 3, 2, 0, 1, 3]
+    x = constant_op.constant(x_val)
+    y = nn_ops.data_format_dim_map(x, src_format="NHWC", dst_format="HWNC")
+    with self.test_session(use_gpu=test_lib.is_gpu_available()) as sess:
+      y_val = sess.run(y)
+      self.assertAllEqual(y_val, y_val_expected)
+
+  def testNHWCtoWHCN(self):
+    x_val = [-4, -3, -2, -1, 0, 1, 2, 3]
+    y_val_expected = [3, 1, 0, 2, 3, 1, 0, 2]
+    x = constant_op.constant(x_val)
+    y = nn_ops.data_format_dim_map(x, src_format="NHWC", dst_format="WHCN")
+    with self.test_session(use_gpu=test_lib.is_gpu_available()) as sess:
+      y_val = sess.run(y)
+      self.assertAllEqual(y_val, y_val_expected)
+
+  def testArbitraryASCII(self):
+    x_val = [-4, -3, -2, -1, 0, 1, 2, 3]
+    y_val_expected = [3, 2, 1, 0, 3, 2, 1, 0]
+    x = constant_op.constant(x_val)
+    y = nn_ops.data_format_dim_map(x, src_format="qwer", dst_format="rewq")
+    with self.test_session(use_gpu=test_lib.is_gpu_available()) as sess:
+      y_val = sess.run(y)
+      self.assertAllEqual(y_val, y_val_expected)
+
 
 class DataFormatVectorPermuteTest(test_lib.TestCase):
 
diff --git a/tensorflow/python/ops/parsing_ops.py b/tensorflow/python/ops/parsing_ops.py
index 075b38d743d13329e646c0b268e938b5c5704e47..d8d9af545f17fe3e0133b51b1eab82f7732dc299 100644
--- a/tensorflow/python/ops/parsing_ops.py
+++ b/tensorflow/python/ops/parsing_ops.py
@@ -1176,8 +1176,13 @@ def _parse_single_sequence_example_raw(serialized,
 
 # Swap `name` and `na_value` for backward compatibility.
 @tf_export("decode_csv")
-def decode_csv(records, record_defaults, field_delim=",",
-               use_quote_delim=True, name=None, na_value=""):
+def decode_csv(records,
+               record_defaults,
+               field_delim=",",
+               use_quote_delim=True,
+               name=None,
+               na_value="",
+               select_cols=None):
   """Convert CSV records to tensors. Each column maps to one tensor.
 
   RFC 4180 format is expected for the CSV records.
@@ -1200,19 +1205,32 @@ def decode_csv(records, record_defaults, field_delim=",",
       Bullet 5).
     name: A name for the operation (optional).
     na_value: Additional string to recognize as NA/NaN.
+    select_cols: Optional sorted list of column indices to select. If specified,
+      only this subset of columns will be parsed and returned.
 
   Returns:
     A list of `Tensor` objects. Has the same type as `record_defaults`.
     Each tensor will have the same shape as records.
+
+  Raises:
+    ValueError: If any of the arguments is malformed.
   """
-  # TODO(martinwicke), remove the wrapper when new Python API generator is done.
+  if select_cols is not None and any(select_cols[i] >= select_cols[i + 1]
+                                     for i in range(len(select_cols) - 1)):
+    raise ValueError("select_cols is not strictly increasing.")
+  if select_cols is not None and select_cols[0] < 0:
+    raise ValueError("select_cols contains negative values.")
+  if select_cols is not None and len(select_cols) != len(record_defaults):
+    raise ValueError("Length of select_cols and record_defaults do not match.")
   return gen_parsing_ops.decode_csv(
       records=records,
       record_defaults=record_defaults,
       field_delim=field_delim,
       use_quote_delim=use_quote_delim,
       na_value=na_value,
-      name=name)
+      name=name,
+      select_cols=select_cols,
+  )
 
 
 # TODO(b/70890287): Combine the implementation of this op and
diff --git a/tensorflow/python/ops/resource_variable_ops.py b/tensorflow/python/ops/resource_variable_ops.py
index df873da98e7fac7accc99a229ffb53a60a74c9bb..508ba9bfeeb4dcb627288357c1c7b6ab4ef14c5c 100644
--- a/tensorflow/python/ops/resource_variable_ops.py
+++ b/tensorflow/python/ops/resource_variable_ops.py
@@ -72,7 +72,12 @@ def _eager_safe_variable_handle(shape, dtype, shared_name, name, graph_mode):
     # know the shape and dtype of the variable pointed to by a handle. Since
     # shape inference doesn't run in eager mode we copy this data here for when
     # the handle is captured by an eager mode function.
-    handle._handle_data = h._handle_data  # pylint: disable=protected-access
+    # pylint: disable=protected-access
+    if h._handle_data is None:
+      ops.set_shape_and_handle_data_for_outputs(h.op)
+    handle._handle_data = h._handle_data
+    # pylint: enable=protected-access
+
   # Clean up our reference cycles to avoid making the garbage collector run.
   # pylint: disable=protected-access
   # OrderedDict, constructed on Graph creation, makes a simple reference loop
@@ -171,7 +176,9 @@ class ResourceVariable(variables.Variable):
   to see all modifications to the value of the variable which happen in any
   operation on which the read_value depends on (either directly, indirectly, or
   via a control dependency) and guaranteed to not see any modification to the
-  value of the variable on which the read_value operation does not depend on.
+  value of the variable from operations that depend on the read_value operation.
+  Updates from operations that have no dependency relationship to the read_value
+  operation might or might not be visible to read_value.
 
   For example, if there is more than one assignment to a ResourceVariable in
   a single session.run call there is a well-defined value for each operation
@@ -1087,6 +1094,11 @@ ops.register_proto_function(
     proto_type=variable_pb2.VariableDef,
     to_proto=_to_proto_fn,
     from_proto=_from_proto_fn)
+ops.register_proto_function(
+    ops.GraphKeys.GLOBAL_STEP,
+    proto_type=variable_pb2.VariableDef,
+    to_proto=_to_proto_fn,
+    from_proto=_from_proto_fn)
 
 
 def is_resource_variable(var):
diff --git a/tensorflow/python/ops/rnn_cell_impl.py b/tensorflow/python/ops/rnn_cell_impl.py
index fe380c44dafdad6dc25d50102bacba610132674d..54f4e0f2407393e1a617633d886b43ab59cade29 100644
--- a/tensorflow/python/ops/rnn_cell_impl.py
+++ b/tensorflow/python/ops/rnn_cell_impl.py
@@ -599,9 +599,9 @@ class BasicLSTMCell(LayerRNNCell):
     Args:
       inputs: `2-D` tensor with shape `[batch_size, input_size]`.
       state: An `LSTMStateTuple` of state tensors, each shaped
-        `[batch_size, self.state_size]`, if `state_is_tuple` has been set to
+        `[batch_size, num_units]`, if `state_is_tuple` has been set to
         `True`.  Otherwise, a `Tensor` shaped
-        `[batch_size, 2 * self.state_size]`.
+        `[batch_size, 2 * num_units]`.
 
     Returns:
       A pair containing the new hidden state, and the new state (either a
@@ -1206,7 +1206,16 @@ class DeviceWrapper(RNNCell):
 
 @tf_export("nn.rnn_cell.MultiRNNCell")
 class MultiRNNCell(RNNCell):
-  """RNN cell composed sequentially of multiple simple cells."""
+  """RNN cell composed sequentially of multiple simple cells.
+
+  Example:
+
+  ```python
+  num_units = [128, 64]
+  cells = [BasicLSTMCell(num_units=n) for n in num_units]
+  stacked_rnn_cell = MultiRNNCell(cells)
+  ```
+  """
 
   def __init__(self, cells, state_is_tuple=True):
     """Create a RNN cell composed sequentially of a number of RNNCells.
diff --git a/tensorflow/python/ops/script_ops.py b/tensorflow/python/ops/script_ops.py
index 1b4111bca630ffa122ed590b0e3d54b796ab6b7a..96fb0247157851b8bd931142b048b3df4da65503 100644
--- a/tensorflow/python/ops/script_ops.py
+++ b/tensorflow/python/ops/script_ops.py
@@ -334,7 +334,11 @@ def py_func(func, inp, Tout, stateful=True, name=None):
     result = func(*[x.numpy() for x in inp])
     result = nest.flatten(result)
 
-    return [x if x is None else ops.convert_to_tensor(x) for x in result]
+    result = [x if x is None else ops.convert_to_tensor(x) for x in result]
+    if len(result) == 1:
+      # Mimic the automatic unwrapping in graph-mode py_func
+      result, = result
+    return result
 
   return _internal_py_func(
       func=func, inp=inp, Tout=Tout, stateful=stateful, eager=False, name=name)
diff --git a/tensorflow/python/ops/state_ops.py b/tensorflow/python/ops/state_ops.py
index 01fc3182bc6f7b4f85d0df540bb26308d9fec72f..f6a11ca625b46cd088c3764039a10bc72619d1f8 100644
--- a/tensorflow/python/ops/state_ops.py
+++ b/tensorflow/python/ops/state_ops.py
@@ -423,3 +423,55 @@ def scatter_nd_update(ref, indices, updates, use_locking=True, name=None):
       ref.handle, indices, ops.convert_to_tensor(updates, dtype=ref.dtype),
       use_locking, name)]):
     return ref.read_value()
+
+
+@tf_export("scatter_add")
+def scatter_add(ref, indices, updates, use_locking=False, name=None):
+  # pylint: disable=line-too-long
+  r"""Adds sparse updates to the variable referenced by `resource`.
+
+  This operation computes
+
+  ```python
+      # Scalar indices
+      ref[indices, ...] += updates[...]
+
+      # Vector indices (for each i)
+      ref[indices[i], ...] += updates[i, ...]
+
+      # High rank indices (for each i, ..., j)
+      ref[indices[i, ..., j], ...] += updates[i, ..., j, ...]
+  ```
+
+  This operation outputs `ref` after the update is done.
+  This makes it easier to chain operations that need to use the updated value.
+  Duplicate entries are handled correctly: if multiple `indices` reference
+  the same location, their contributions add.
+
+  Requires `updates.shape = indices.shape + ref.shape[1:]`.
+
+  <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+  <img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt>
+  </div>
+
+  Args:
+    ref: A `Variable`.
+    indices: A `Tensor`. Must be one of the following types: `int32`, `int64`.
+      A tensor of indices into the first dimension of `ref`.
+    updates: A `Tensor`. Must have the same type as `ref`.
+      A tensor of updated values to store in `ref`.
+    use_locking: An optional `bool`. Defaults to `True`.
+      If True, the assignment will be protected by a lock;
+      otherwise the behavior is undefined, but may exhibit less contention.
+    name: A name for the operation (optional).
+
+  Returns:
+    Same as `ref`.  Returned as a convenience for operations that want
+    to use the updated values after the update is done.
+  """
+  if ref.dtype._is_ref_dtype:
+    return gen_state_ops.scatter_add(ref, indices, updates,
+                                     use_locking=use_locking, name=name)
+  return ref._lazy_read(gen_resource_variable_ops.resource_scatter_add(  # pylint: disable=protected-access
+      ref.handle, indices, ops.convert_to_tensor(updates, ref.dtype),
+      name=name))
diff --git a/tensorflow/python/ops/summary_op_util.py b/tensorflow/python/ops/summary_op_util.py
index 37b80d5e20bf06c041a669c14ac6d88201af2180..a793f634bda06ad43991fb978f865a2c5fe25437 100644
--- a/tensorflow/python/ops/summary_op_util.py
+++ b/tensorflow/python/ops/summary_op_util.py
@@ -23,6 +23,7 @@ import re
 
 from tensorflow.python.framework import ops
 from tensorflow.python.platform import tf_logging
+from tensorflow.python.training import distribute
 
 
 def collect(val, collections, default_collections):
@@ -42,6 +43,16 @@ def collect(val, collections, default_collections):
 _INVALID_TAG_CHARACTERS = re.compile(r'[^-/\w\.]')
 
 
+def skip_summary():
+  # If using multiple towers in distributed strategy, skip summaries on all
+  # towers except the first one (tower_id=0).
+  # TODO(priyag): Add a new optional argument that will provide multiple
+  # alternatives to override default behavior. (e.g. run on last tower,
+  # compute sum or mean across towers).
+  tower_context = distribute.get_tower_context()
+  return tower_context and tower_context.tower_id > 0
+
+
 def clean_tag(name):
   """Cleans a tag. Removes illegal characters for instance.
 
diff --git a/tensorflow/python/ops/summary_ops.py b/tensorflow/python/ops/summary_ops.py
index 037bc9845a3f734f65b73b0c4b4ca19fb653731d..ec4d4a6e9242107fd7f4bebe1416198457e32cee 100644
--- a/tensorflow/python/ops/summary_ops.py
+++ b/tensorflow/python/ops/summary_ops.py
@@ -18,6 +18,7 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.core.framework import summary_pb2
+from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import gen_logging_ops
 from tensorflow.python.ops import summary_op_util
@@ -71,6 +72,8 @@ def tensor_summary(name,
 
   serialized_summary_metadata = summary_metadata.SerializeToString()
 
+  if summary_op_util.skip_summary():
+    return constant_op.constant("")
   with summary_op_util.summary_scope(
       name, family, values=[tensor]) as (tag, scope):
     val = gen_logging_ops.tensor_summary_v2(
diff --git a/tensorflow/contrib/summary/summary_ops.py b/tensorflow/python/ops/summary_ops_v2.py
similarity index 90%
rename from tensorflow/contrib/summary/summary_ops.py
rename to tensorflow/python/ops/summary_ops_v2.py
index bc763fe655edc455e2538e536d6efab314c8228c..12f361c513fcebf8ce4b9c367d101b11ab10260b 100644
--- a/tensorflow/contrib/summary/summary_ops.py
+++ b/tensorflow/python/ops/summary_ops_v2.py
@@ -31,7 +31,7 @@ from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
-from tensorflow.python.layers import utils
+from tensorflow.python.framework import smart_cond
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gen_summary_ops
@@ -108,8 +108,10 @@ class SummaryWriter(object):
   - @{tf.contrib.summary.create_db_writer}
   """
 
-  def  __init__(self, resource):
+  def  __init__(self, resource, init_op_fn):
     self._resource = resource
+    # TODO(nickfelt): cache constructed ops in graph mode
+    self._init_op_fn = init_op_fn
     if context.executing_eagerly() and self._resource is not None:
       self._resource_deleter = resource_variable_ops.EagerResourceDeleter(
           handle=self._resource, handle_device="cpu:0")
@@ -129,10 +131,32 @@ class SummaryWriter(object):
       yield self
       # Flushes the summary writer in eager mode or in graph functions, but not
       # in legacy graph mode (you're on your own there).
-      with ops.device("cpu:0"):
-        gen_summary_ops.flush_summary_writer(self._resource)
+      self.flush()
       context.context().summary_writer_resource = old
 
+  def init(self):
+    """Operation to initialize the summary writer resource."""
+    if self._resource is not None:
+      return self._init_op_fn()
+
+  def _flush(self):
+    return _flush_fn(writer=self)
+
+  def flush(self):
+    """Operation to force the summary writer to flush any buffered data."""
+    if self._resource is not None:
+      return self._flush()
+
+  def _close(self):
+    with ops.control_dependencies([self.flush()]):
+      with ops.device("cpu:0"):
+        return gen_summary_ops.close_summary_writer(self._resource)
+
+  def close(self):
+    """Operation to flush and close the summary writer resource."""
+    if self._resource is not None:
+      return self._close()
+
 
 def initialize(
     graph=None,  # pylint: disable=redefined-outer-name
@@ -178,7 +202,7 @@ def create_file_writer(logdir,
                        flush_millis=None,
                        filename_suffix=None,
                        name=None):
-  """Creates a summary file writer in the current context.
+  """Creates a summary file writer in the current context under the given name.
 
   Args:
     logdir: a string, or None. If a string, creates a summary file writer
@@ -186,18 +210,20 @@ def create_file_writer(logdir,
      a mock object which acts like a summary writer but does nothing,
      useful to use as a context manager.
     max_queue: the largest number of summaries to keep in a queue; will
-     flush once the queue gets bigger than this.
-    flush_millis: the largest interval between flushes.
-    filename_suffix: optional suffix for the event file name.
+     flush once the queue gets bigger than this. Defaults to 10.
+    flush_millis: the largest interval between flushes. Defaults to 120,000.
+    filename_suffix: optional suffix for the event file name. Defaults to `.v2`.
     name: Shared name for this SummaryWriter resource stored to default
-      Graph.
+      Graph. Defaults to the provided logdir prefixed with `logdir:`. Note: if a
+      summary writer resource with this shared name already exists, the returned
+      SummaryWriter wraps that resource and the other arguments have no effect.
 
   Returns:
     Either a summary writer or an empty object which can be used as a
     summary writer.
   """
   if logdir is None:
-    return SummaryWriter(None)
+    return SummaryWriter(None, None)
   with ops.device("cpu:0"):
     if max_queue is None:
       max_queue = constant_op.constant(10)
@@ -205,6 +231,8 @@ def create_file_writer(logdir,
       flush_millis = constant_op.constant(2 * 60 * 1000)
     if filename_suffix is None:
       filename_suffix = constant_op.constant(".v2")
+    if name is None:
+      name = "logdir:" + logdir
     return _make_summary_writer(
         name,
         gen_summary_ops.create_summary_file_writer,
@@ -267,13 +295,12 @@ def create_db_writer(db_uri,
 
 def _make_summary_writer(name, factory, **kwargs):
   resource = gen_summary_ops.summary_writer(shared_name=name)
+  init_op_fn = lambda: factory(resource, **kwargs)
   # TODO(apassos): Consider doing this instead.
-  # node = factory(resource, **kwargs)
   # if not context.executing_eagerly():
-  #   ops.get_default_session().run(node)
-  ops.add_to_collection(_SUMMARY_WRITER_INIT_COLLECTION_NAME,
-                        factory(resource, **kwargs))
-  return SummaryWriter(resource)
+  #   ops.get_default_session().run(init_op)
+  ops.add_to_collection(_SUMMARY_WRITER_INIT_COLLECTION_NAME, init_op_fn())
+  return SummaryWriter(resource, init_op_fn)
 
 
 def _cleanse_string(name, pattern, value):
@@ -341,7 +368,7 @@ def summary_writer_function(name, tensor, function, family=None):
   if context.context().summary_writer_resource is None:
     return control_flow_ops.no_op()
   with ops.device("cpu:0"):
-    op = utils.smart_cond(
+    op = smart_cond.smart_cond(
         should_record_summaries(), record, _nothing, name="")
     ops.add_to_collection(ops.GraphKeys._SUMMARY_COLLECTION, op)  # pylint: disable=protected-access
   return op
@@ -538,7 +565,14 @@ def flush(writer=None, name=None):
     writer = context.context().summary_writer_resource
     if writer is None:
       return control_flow_ops.no_op()
-  return gen_summary_ops.flush_summary_writer(writer, name=name)
+  else:
+    if isinstance(writer, SummaryWriter):
+      writer = writer._resource  # pylint: disable=protected-access
+  with ops.device("cpu:0"):
+    return gen_summary_ops.flush_summary_writer(writer, name=name)
+
+
+_flush_fn = flush  # for within SummaryWriter.flush()
 
 
 def eval_dir(model_dir, name=None):
diff --git a/tensorflow/python/ops/variable_scope.py b/tensorflow/python/ops/variable_scope.py
index c35735ca656b21d43f758830e68e5777d654f271..ba213ef884165f7f72094d27932913e39c9a5901 100644
--- a/tensorflow/python/ops/variable_scope.py
+++ b/tensorflow/python/ops/variable_scope.py
@@ -307,6 +307,17 @@ class _VariableStore(object):
       raise ValueError(
           "Passed a custom_getter which is not callable: %s" % custom_getter)
 
+    with ops.init_scope():
+      if context.executing_eagerly():
+        # Variable creation and initialization takes place in `init_scope`s;
+        # as such, if an `init_scope` lifts us into the eager context, then we
+        # need to use `ResourceVariable`s.
+        use_resource = True
+
+    # Note that it's fine to reuse eager variables whose initialization was
+    # lifted from a function-building graph into the eager context (that's why
+    # the following clause is not wrapped in an `init_scope`); lifted variables
+    # are tracked by the graph's `VariableStore`.
     if context.executing_eagerly():
       if not self._store_eager_variables and reuse:
         raise RuntimeError(
@@ -315,7 +326,6 @@ class _VariableStore(object):
             " EagerVariableStore for example usage.")
       if self._store_eager_variables:
         reuse = AUTO_REUSE
-      use_resource = True
 
     # If a *_ref type is passed in an error would be triggered further down the
     # stack. We prevent this using base_dtype to get a non-ref version of the
@@ -1164,7 +1174,7 @@ class _VariableScopeStore(threading.local):
       self.variable_scopes_count[scope_name] = 1
 
   def close_variable_subscopes(self, scope_name):
-    for k in self.variable_scopes_count:
+    for k in list(self.variable_scopes_count.keys()):
       if not scope_name or k.startswith(scope_name + "/"):
         self.variable_scopes_count[k] = 0
 
diff --git a/tensorflow/python/platform/base.i b/tensorflow/python/platform/base.i
index dbefca2be9615b18418a92f4cbe0b1a0b2917449..478dd46f7e6965f8727e5741f2ccdfdc69247980 100644
--- a/tensorflow/python/platform/base.i
+++ b/tensorflow/python/platform/base.i
@@ -229,3 +229,25 @@ _COPY_TYPEMAPS(unsigned int, mode_t);
 %define final %enddef
 %define override %enddef
 #endif
+
+// Typemaps to automatically raise a Python exception from bad output TF_Status.
+// TODO(b/77295559): expand this to all TF_Status* output params and deprecate
+// raise_exception_on_not_ok_status (currently it only affects the C API).
+%typemap(in, numinputs=0) TF_Status* status (TF_Status* status) {
+  $1 = TF_NewStatus();
+}
+
+%typemap(freearg) (TF_Status* status) {
+ TF_DeleteStatus($1);
+}
+
+%typemap(argout) TF_Status* status {
+  TF_Code code = TF_GetCode($1);
+  if (code != TF_OK) {
+    PyObject* exc = tensorflow::PyExceptionRegistry::Lookup(code);
+    // Arguments to OpError.
+    PyObject* exc_args = Py_BuildValue("sss", nullptr, nullptr, TF_Message($1));
+    SWIG_SetErrorObj(exc, exc_args);
+    SWIG_fail;
+  }
+}
diff --git a/tensorflow/python/profiler/BUILD b/tensorflow/python/profiler/BUILD
index c815aad0a065eaba4a0dc52487b5ee67e271a146..0654104a3436366bb5fe88e2c3415cc957cbfde8 100644
--- a/tensorflow/python/profiler/BUILD
+++ b/tensorflow/python/profiler/BUILD
@@ -156,18 +156,3 @@ py_test(
         "@com_google_pprof//:pprof_proto_py",
     ],
 )
-
-# -----------------------------------------------------------------------------
-# Google-internal targets.  These must be at the end for syncrepo.
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/python/profiler/internal/BUILD b/tensorflow/python/profiler/internal/BUILD
index 362a1c49e64118134a4039ae3a5d939ed0b6d730..994206cd63a915de93bc109e7b217ad997c787a7 100644
--- a/tensorflow/python/profiler/internal/BUILD
+++ b/tensorflow/python/profiler/internal/BUILD
@@ -70,18 +70,3 @@ cuda_py_test(
         "no_pip",
     ],
 )
-
-# -----------------------------------------------------------------------------
-# Google-internal targets.  These must be at the end for syncrepo.
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/python/pywrap_tfe.i b/tensorflow/python/pywrap_tfe.i
index 39fabb9c1bc646a09557293c1f645a8b97f5bbdd..5ee55301df986998b22b8b57b5f01b1f6b4918ac 100644
--- a/tensorflow/python/pywrap_tfe.i
+++ b/tensorflow/python/pywrap_tfe.i
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+%include "tensorflow/python/platform/base.i"
+
 %ignore "";
 
 %rename("%s") TFE_NewContext;
@@ -118,9 +120,9 @@ limitations under the License.
 
 }
 %typemap(out) (TFE_Context*) {
-  if ($1 == nullptr) {
-    SWIG_fail;
-  } else {
+  // When the TFE_Context* returned is a nullptr, we expect the status is not
+  // OK. This will raise an error (happens in another typemap).
+  if ($1 != nullptr) {
     $result = PyCapsule_New($1, nullptr, TFE_DeleteContextCapsule);
   }
 }
diff --git a/tensorflow/python/saved_model/BUILD b/tensorflow/python/saved_model/BUILD
index 30e0a099d8b2e30cff36b69164ba9f1789dd8916..2609a5d222659f6ebf775d6baa48bd7bc39fd7f6 100644
--- a/tensorflow/python/saved_model/BUILD
+++ b/tensorflow/python/saved_model/BUILD
@@ -235,15 +235,3 @@ py_test(
 
 # -----------------------------------------------------------------------------
 # Google-internal targets.  These must be at the end for syncrepo.
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/python/summary/summary.py b/tensorflow/python/summary/summary.py
index 97f2ddfdfc49e415bdcff428d6bd3f5b61cc3f20..1286ed670390350ff1695b748714e4b2be60352e 100644
--- a/tensorflow/python/summary/summary.py
+++ b/tensorflow/python/summary/summary.py
@@ -50,6 +50,7 @@ from tensorflow.core.util.event_pb2 import TaggedRunMetadata
 
 
 from tensorflow.python.eager import context as _context
+from tensorflow.python.framework import constant_op as _constant_op
 from tensorflow.python.framework import dtypes as _dtypes
 from tensorflow.python.framework import ops as _ops
 from tensorflow.python.ops import gen_logging_ops as _gen_logging_ops
@@ -98,6 +99,8 @@ def scalar(name, tensor, collections=None, family=None):
   Raises:
     ValueError: If tensor has the wrong shape or type.
   """
+  if _summary_op_util.skip_summary():
+    return _constant_op.constant('')
   with _summary_op_util.summary_scope(
       name, family, values=[tensor]) as (tag, scope):
     val = _gen_logging_ops.scalar_summary(tags=tag, values=tensor, name=scope)
@@ -151,6 +154,8 @@ def image(name, tensor, max_outputs=3, collections=None, family=None):
     A scalar `Tensor` of type `string`. The serialized `Summary` protocol
     buffer.
   """
+  if _summary_op_util.skip_summary():
+    return _constant_op.constant('')
   with _summary_op_util.summary_scope(
       name, family, values=[tensor]) as (tag, scope):
     val = _gen_logging_ops.image_summary(
@@ -189,6 +194,8 @@ def histogram(name, values, collections=None, family=None):
     A scalar `Tensor` of type `string`. The serialized `Summary` protocol
     buffer.
   """
+  if _summary_op_util.skip_summary():
+    return _constant_op.constant('')
   with _summary_op_util.summary_scope(
       name, family, values=[values],
       default_name='HistogramSummary') as (tag, scope):
@@ -234,6 +241,8 @@ def audio(name, tensor, sample_rate, max_outputs=3, collections=None,
     A scalar `Tensor` of type `string`. The serialized `Summary` protocol
     buffer.
   """
+  if _summary_op_util.skip_summary():
+    return _constant_op.constant('')
   with _summary_op_util.summary_scope(
       name, family=family, values=[tensor]) as (tag, scope):
     sample_rate = _ops.convert_to_tensor(
@@ -282,6 +291,8 @@ def merge(inputs, collections=None, name=None):
     raise RuntimeError(
         'Merging tf.summary.* ops is not compatible with eager execution. '
         'Use tf.contrib.summary instead.')
+  if _summary_op_util.skip_summary():
+    return _constant_op.constant('')
   name = _summary_op_util.clean_tag(name)
   with _ops.name_scope(name, 'Merge', inputs):
     val = _gen_logging_ops.merge_summary(inputs=inputs, name=name)
diff --git a/tensorflow/python/summary/writer/event_file_writer_v2.py b/tensorflow/python/summary/writer/event_file_writer_v2.py
new file mode 100644
index 0000000000000000000000000000000000000000..5c66c0f7a8fd4c65d3539d257b4e4fa89f839a98
--- /dev/null
+++ b/tensorflow/python/summary/writer/event_file_writer_v2.py
@@ -0,0 +1,140 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Writes events to disk in a logdir."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import summary_ops_v2
+from tensorflow.python.platform import gfile
+
+
+class EventFileWriterV2(object):
+  """Writes `Event` protocol buffers to an event file via the graph.
+
+  The `EventFileWriterV2` class is backed by the summary file writer in the v2
+  summary API (currently in tf.contrib.summary), so it uses a shared summary
+  writer resource and graph ops to write events.
+
+  As with the original EventFileWriter, this class will asynchronously write
+  Event protocol buffers to the backing file. The Event file is encoded using
+  the tfrecord format, which is similar to RecordIO.
+  """
+
+  def __init__(self, session, logdir, max_queue=10, flush_secs=120,
+               filename_suffix=''):
+    """Creates an `EventFileWriterV2` and an event file to write to.
+
+    On construction, this calls `tf.contrib.summary.create_file_writer` within
+    the graph from `session.graph` to look up a shared summary writer resource
+    for `logdir` if one exists, and create one if not. Creating the summary
+    writer resource in turn creates a new event file in `logdir` to be filled
+    with `Event` protocol buffers passed to `add_event`. Graph ops to control
+    this writer resource are added to `session.graph` during this init call;
+    stateful methods on this class will call `session.run()` on these ops.
+
+    Note that because the underlying resource is shared, it is possible that
+    other parts of the code using the same session may interact independently
+    with the resource, e.g. by flushing or even closing it. It is the caller's
+    responsibility to avoid any undesirable sharing in this regard.
+
+    The remaining arguments to the constructor (`flush_secs`, `max_queue`, and
+    `filename_suffix`) control the construction of the shared writer resource
+    if one is created. If an existing resource is reused, these arguments have
+    no effect.  See `tf.contrib.summary.create_file_writer` for details.
+
+    Args:
+      session: A `tf.Session`. Session that will hold shared writer resource.
+        The writer ops will be added to session.graph during this init call.
+      logdir: A string. Directory where event file will be written.
+      max_queue: Integer. Size of the queue for pending events and summaries.
+      flush_secs: Number. How often, in seconds, to flush the
+        pending events and summaries to disk.
+      filename_suffix: A string. Every event file's name is suffixed with
+        `filename_suffix`.
+    """
+    self._session = session
+    self._logdir = logdir
+    self._closed = False
+    if not gfile.IsDirectory(self._logdir):
+      gfile.MakeDirs(self._logdir)
+
+    with self._session.graph.as_default():
+      with ops.name_scope('filewriter'):
+        file_writer = summary_ops_v2.create_file_writer(
+            logdir=self._logdir,
+            max_queue=max_queue,
+            flush_millis=flush_secs * 1000,
+            filename_suffix=filename_suffix)
+        with summary_ops_v2.always_record_summaries(), file_writer.as_default():
+          self._event_placeholder = array_ops.placeholder_with_default(
+              constant_op.constant('unused', dtypes.string),
+              shape=[])
+          self._add_event_op = summary_ops_v2.import_event(
+              self._event_placeholder)
+        self._init_op = file_writer.init()
+        self._flush_op = file_writer.flush()
+        self._close_op = file_writer.close()
+      self._session.run(self._init_op)
+
+  def get_logdir(self):
+    """Returns the directory where event file will be written."""
+    return self._logdir
+
+  def reopen(self):
+    """Reopens the EventFileWriter.
+
+    Can be called after `close()` to add more events in the same directory.
+    The events will go into a new events file.
+
+    Does nothing if the EventFileWriter was not closed.
+    """
+    if self._closed:
+      self._closed = False
+      self._session.run(self._init_op)
+
+  def add_event(self, event):
+    """Adds an event to the event file.
+
+    Args:
+      event: An `Event` protocol buffer.
+    """
+    if not self._closed:
+      event_pb = event.SerializeToString()
+      self._session.run(
+          self._add_event_op, feed_dict={self._event_placeholder: event_pb})
+
+  def flush(self):
+    """Flushes the event file to disk.
+
+    Call this method to make sure that all pending events have been written to
+    disk.
+    """
+    self._session.run(self._flush_op)
+
+  def close(self):
+    """Flushes the event file to disk and close the file.
+
+    Call this method when you do not need the summary writer anymore.
+    """
+    if not self._closed:
+      self.flush()
+      self._session.run(self._close_op)
+      self._closed = True
diff --git a/tensorflow/python/summary/writer/writer.py b/tensorflow/python/summary/writer/writer.py
index 57f78c156b1334a5486b29f2ddec957e49156e73..aca084fc9168e710316e4c988594cff69e54ebab 100644
--- a/tensorflow/python/summary/writer/writer.py
+++ b/tensorflow/python/summary/writer/writer.py
@@ -32,6 +32,7 @@ from tensorflow.python.platform import gfile
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.summary import plugin_asset
 from tensorflow.python.summary.writer.event_file_writer import EventFileWriter
+from tensorflow.python.summary.writer.event_file_writer_v2 import EventFileWriterV2
 from tensorflow.python.util.tf_export import tf_export
 
 _PLUGINS_DIR = "plugins"
@@ -286,6 +287,11 @@ class FileWriter(SummaryToEventTransformer):
   file contents asynchronously. This allows a training program to call methods
   to add data to the file directly from the training loop, without slowing down
   training.
+
+  When constructed with a `tf.Session` parameter, a `FileWriter` instead forms
+  a compatibility layer over new graph-based summaries (`tf.contrib.summary`)
+  to facilitate the use of new summary writing with pre-existing code that
+  expects a `FileWriter` instance.
   """
 
   def __init__(self,
@@ -294,10 +300,11 @@ class FileWriter(SummaryToEventTransformer):
                max_queue=10,
                flush_secs=120,
                graph_def=None,
-               filename_suffix=None):
-    """Creates a `FileWriter` and an event file.
+               filename_suffix=None,
+               session=None):
+    """Creates a `FileWriter`, optionally shared within the given session.
 
-    On construction the summary writer creates a new event file in `logdir`.
+    Typically, constructing a file writer creates a new event file in `logdir`.
     This event file will contain `Event` protocol buffers constructed when you
     call one of the following functions: `add_summary()`, `add_session_log()`,
     `add_event()`, or `add_graph()`.
@@ -317,13 +324,16 @@ class FileWriter(SummaryToEventTransformer):
     writer = tf.summary.FileWriter(<some-directory>, sess.graph)
     ```
 
-    The other arguments to the constructor control the asynchronous writes to
-    the event file:
-
-    *  `flush_secs`: How often, in seconds, to flush the added summaries
-       and events to disk.
-    *  `max_queue`: Maximum number of summaries or events pending to be
-       written to disk before one of the 'add' calls block.
+    The `session` argument to the constructor makes the returned `FileWriter` a
+    a compatibility layer over new graph-based summaries (`tf.contrib.summary`).
+    Crucially, this means the underlying writer resource and events file will
+    be shared with any other `FileWriter` using the same `session` and `logdir`,
+    and with any `tf.contrib.summary.SummaryWriter` in this session using the
+    the same shared resource name (which by default scoped to the logdir). If
+    no such resource exists, one will be created using the remaining arguments
+    to this constructor, but if one already exists those arguments are ignored.
+    In either case, ops will be added to `session.graph` to control the
+    underlying file writer resource. See `tf.contrib.summary` for more details.
 
     Args:
       logdir: A string. Directory where event file will be written.
@@ -334,6 +344,7 @@ class FileWriter(SummaryToEventTransformer):
       graph_def: DEPRECATED: Use the `graph` argument instead.
       filename_suffix: A string. Every event file's name is suffixed with
         `suffix`.
+      session: A `tf.Session` object. See details above.
 
     Raises:
       RuntimeError: If called with eager execution enabled.
@@ -347,9 +358,12 @@ class FileWriter(SummaryToEventTransformer):
       raise RuntimeError(
           "tf.summary.FileWriter is not compatible with eager execution. "
           "Use tf.contrib.summary instead.")
-
-    event_writer = EventFileWriter(logdir, max_queue, flush_secs,
-                                   filename_suffix)
+    if session is not None:
+      event_writer = EventFileWriterV2(
+          session, logdir, max_queue, flush_secs, filename_suffix)
+    else:
+      event_writer = EventFileWriter(logdir, max_queue, flush_secs,
+                                     filename_suffix)
     super(FileWriter, self).__init__(event_writer, graph, graph_def)
 
   def __enter__(self):
diff --git a/tensorflow/python/summary/writer/writer_test.py b/tensorflow/python/summary/writer/writer_test.py
index 88ade0aac33f1cd8f9d8cb30344aabca76a13511..dc990c2602427049ecdb7588ff217207a69cbcd2 100644
--- a/tensorflow/python/summary/writer/writer_test.py
+++ b/tensorflow/python/summary/writer/writer_test.py
@@ -29,10 +29,12 @@ from tensorflow.core.protobuf import config_pb2
 from tensorflow.core.protobuf import meta_graph_pb2
 from tensorflow.core.util import event_pb2
 from tensorflow.core.util.event_pb2 import SessionLog
+from tensorflow.python.client import session
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import meta_graph
 from tensorflow.python.framework import ops
+from tensorflow.python.ops import summary_ops_v2
 from tensorflow.python.platform import gfile
 from tensorflow.python.platform import test
 from tensorflow.python.summary import plugin_asset
@@ -42,7 +44,10 @@ from tensorflow.python.summary.writer import writer_cache
 from tensorflow.python.util import compat
 
 
-class SummaryWriterTestCase(test.TestCase):
+class FileWriterTestCase(test.TestCase):
+
+  def _FileWriter(self, *args, **kwargs):
+    return writer.FileWriter(*args, **kwargs)
 
   def _TestDir(self, test_name):
     test_dir = os.path.join(self.get_temp_dir(), test_name)
@@ -96,7 +101,7 @@ class SummaryWriterTestCase(test.TestCase):
 
   def testAddingSummaryGraphAndRunMetadata(self):
     test_dir = self._CleanTestDir("basics")
-    sw = writer.FileWriter(test_dir)
+    sw = self._FileWriter(test_dir)
 
     sw.add_session_log(event_pb2.SessionLog(status=SessionLog.START), 1)
     sw.add_summary(
@@ -171,7 +176,7 @@ class SummaryWriterTestCase(test.TestCase):
     test_dir = self._CleanTestDir("basics_named_graph")
     with ops.Graph().as_default() as g:
       constant_op.constant([12], name="douze")
-    sw = writer.FileWriter(test_dir, graph=g)
+    sw = self._FileWriter(test_dir, graph=g)
     sw.close()
     self._assertEventsWithGraph(test_dir, g, True)
 
@@ -179,7 +184,7 @@ class SummaryWriterTestCase(test.TestCase):
     test_dir = self._CleanTestDir("basics_positional_graph")
     with ops.Graph().as_default() as g:
       constant_op.constant([12], name="douze")
-    sw = writer.FileWriter(test_dir, g)
+    sw = self._FileWriter(test_dir, g)
     sw.close()
     self._assertEventsWithGraph(test_dir, g, True)
 
@@ -188,7 +193,7 @@ class SummaryWriterTestCase(test.TestCase):
     with ops.Graph().as_default() as g:
       constant_op.constant([12], name="douze")
     gd = g.as_graph_def()
-    sw = writer.FileWriter(test_dir, graph_def=gd)
+    sw = self._FileWriter(test_dir, graph_def=gd)
     sw.close()
     self._assertEventsWithGraph(test_dir, g, False)
 
@@ -197,7 +202,7 @@ class SummaryWriterTestCase(test.TestCase):
     with ops.Graph().as_default() as g:
       constant_op.constant([12], name="douze")
     gd = g.as_graph_def()
-    sw = writer.FileWriter(test_dir, gd)
+    sw = self._FileWriter(test_dir, gd)
     sw.close()
     self._assertEventsWithGraph(test_dir, g, False)
 
@@ -207,18 +212,18 @@ class SummaryWriterTestCase(test.TestCase):
       with ops.Graph().as_default() as g:
         constant_op.constant([12], name="douze")
       gd = g.as_graph_def()
-      sw = writer.FileWriter(test_dir, graph=g, graph_def=gd)
+      sw = self._FileWriter(test_dir, graph=g, graph_def=gd)
       sw.close()
 
   def testNeitherGraphNorGraphDef(self):
     with self.assertRaises(TypeError):
       test_dir = self._CleanTestDir("basics_string_instead_of_graph")
-      sw = writer.FileWriter(test_dir, "string instead of graph object")
+      sw = self._FileWriter(test_dir, "string instead of graph object")
       sw.close()
 
   def testCloseAndReopen(self):
     test_dir = self._CleanTestDir("close_and_reopen")
-    sw = writer.FileWriter(test_dir)
+    sw = self._FileWriter(test_dir)
     sw.add_session_log(event_pb2.SessionLog(status=SessionLog.START), 1)
     sw.close()
     # Sleep at least one second to make sure we get a new event file name.
@@ -261,7 +266,7 @@ class SummaryWriterTestCase(test.TestCase):
 
   def testNonBlockingClose(self):
     test_dir = self._CleanTestDir("non_blocking_close")
-    sw = writer.FileWriter(test_dir)
+    sw = self._FileWriter(test_dir)
     # Sleep 1.2 seconds to make sure event queue is empty.
     time.sleep(1.2)
     time_before_close = time.time()
@@ -270,7 +275,7 @@ class SummaryWriterTestCase(test.TestCase):
 
   def testWithStatement(self):
     test_dir = self._CleanTestDir("with_statement")
-    with writer.FileWriter(test_dir) as sw:
+    with self._FileWriter(test_dir) as sw:
       sw.add_session_log(event_pb2.SessionLog(status=SessionLog.START), 1)
     event_paths = sorted(glob.glob(os.path.join(test_dir, "event*")))
     self.assertEquals(1, len(event_paths))
@@ -280,7 +285,7 @@ class SummaryWriterTestCase(test.TestCase):
   # protocol buffers correctly.
   def testAddingSummariesFromSessionRunCalls(self):
     test_dir = self._CleanTestDir("global_step")
-    sw = writer.FileWriter(test_dir)
+    sw = self._FileWriter(test_dir)
     with self.test_session():
       i = constant_op.constant(1, dtype=dtypes.int32, shape=[])
       l = constant_op.constant(2, dtype=dtypes.int64, shape=[])
@@ -327,7 +332,7 @@ class SummaryWriterTestCase(test.TestCase):
 
   def testPluginMetadataStrippedFromSubsequentEvents(self):
     test_dir = self._CleanTestDir("basics")
-    sw = writer.FileWriter(test_dir)
+    sw = self._FileWriter(test_dir)
 
     sw.add_session_log(event_pb2.SessionLog(status=SessionLog.START), 1)
 
@@ -386,7 +391,7 @@ class SummaryWriterTestCase(test.TestCase):
 
   def testFileWriterWithSuffix(self):
     test_dir = self._CleanTestDir("test_suffix")
-    sw = writer.FileWriter(test_dir, filename_suffix="_test_suffix")
+    sw = self._FileWriter(test_dir, filename_suffix="_test_suffix")
     for _ in range(10):
       sw.add_summary(
           summary_pb2.Summary(value=[
@@ -400,9 +405,178 @@ class SummaryWriterTestCase(test.TestCase):
     for filename in event_filenames:
       self.assertTrue(filename.endswith("_test_suffix"))
 
+  def testPluginAssetSerialized(self):
+    class ExamplePluginAsset(plugin_asset.PluginAsset):
+      plugin_name = "example"
+
+      def assets(self):
+        return {"foo.txt": "foo!", "bar.txt": "bar!"}
+
+    with ops.Graph().as_default() as g:
+      plugin_asset.get_plugin_asset(ExamplePluginAsset)
+
+      logdir = self.get_temp_dir()
+      fw = self._FileWriter(logdir)
+      fw.add_graph(g)
+    plugin_dir = os.path.join(logdir, writer._PLUGINS_DIR, "example")
+
+    with gfile.Open(os.path.join(plugin_dir, "foo.txt"), "r") as f:
+      content = f.read()
+    self.assertEqual(content, "foo!")
+
+    with gfile.Open(os.path.join(plugin_dir, "bar.txt"), "r") as f:
+      content = f.read()
+    self.assertEqual(content, "bar!")
 
-class SummaryWriterCacheTest(test.TestCase):
-  """SummaryWriterCache tests."""
+
+class SessionBasedFileWriterTestCase(FileWriterTestCase):
+  """Tests for FileWriter behavior when passed a Session argument."""
+
+  def _FileWriter(self, *args, **kwargs):
+    if "session" not in kwargs:
+      # Pass in test_session() as the session. It will be cached during this
+      # test method invocation so that any other use of test_session() with no
+      # graph should result in re-using the same underlying Session.
+      with self.test_session() as sess:
+        kwargs["session"] = sess
+        return writer.FileWriter(*args, **kwargs)
+    return writer.FileWriter(*args, **kwargs)
+
+  def _createTaggedSummary(self, tag):
+    summary = summary_pb2.Summary()
+    summary.value.add(tag=tag)
+    return summary
+
+  def testSharing_withOtherSessionBasedFileWriters(self):
+    logdir = self.get_temp_dir()
+    with session.Session() as sess:
+      # Initial file writer
+      writer1 = writer.FileWriter(session=sess, logdir=logdir)
+      writer1.add_summary(self._createTaggedSummary("one"), 1)
+      writer1.flush()
+
+      # File writer, should share file with writer1
+      writer2 = writer.FileWriter(session=sess, logdir=logdir)
+      writer2.add_summary(self._createTaggedSummary("two"), 2)
+      writer2.flush()
+
+      # File writer with different logdir (shouldn't be in this logdir at all)
+      writer3 = writer.FileWriter(session=sess, logdir=logdir + "-other")
+      writer3.add_summary(self._createTaggedSummary("three"), 3)
+      writer3.flush()
+
+      # File writer in a different session (should be in separate file)
+      time.sleep(1.1)  # Ensure filename has a different timestamp
+      with session.Session() as other_sess:
+        writer4 = writer.FileWriter(session=other_sess, logdir=logdir)
+        writer4.add_summary(self._createTaggedSummary("four"), 4)
+        writer4.flush()
+
+      # One more file writer, should share file with writer1
+      writer5 = writer.FileWriter(session=sess, logdir=logdir)
+      writer5.add_summary(self._createTaggedSummary("five"), 5)
+      writer5.flush()
+
+    event_paths = iter(sorted(glob.glob(os.path.join(logdir, "event*"))))
+
+    # First file should have tags "one", "two", and "five"
+    events = summary_iterator.summary_iterator(next(event_paths))
+    self.assertEqual("brain.Event:2", next(events).file_version)
+    self.assertEqual("one", next(events).summary.value[0].tag)
+    self.assertEqual("two", next(events).summary.value[0].tag)
+    self.assertEqual("five", next(events).summary.value[0].tag)
+    self.assertRaises(StopIteration, lambda: next(events))
+
+    # Second file should have just "four"
+    events = summary_iterator.summary_iterator(next(event_paths))
+    self.assertEqual("brain.Event:2", next(events).file_version)
+    self.assertEqual("four", next(events).summary.value[0].tag)
+    self.assertRaises(StopIteration, lambda: next(events))
+
+    # No more files
+    self.assertRaises(StopIteration, lambda: next(event_paths))
+
+    # Just check that the other logdir file exists to be sure we wrote it
+    self.assertTrue(glob.glob(os.path.join(logdir + "-other", "event*")))
+
+  def testSharing_withExplicitSummaryFileWriters(self):
+    logdir = self.get_temp_dir()
+    with session.Session() as sess:
+      # Initial file writer via FileWriter(session=?)
+      writer1 = writer.FileWriter(session=sess, logdir=logdir)
+      writer1.add_summary(self._createTaggedSummary("one"), 1)
+      writer1.flush()
+
+      # Next one via create_file_writer(), should use same file
+      writer2 = summary_ops_v2.create_file_writer(logdir=logdir)
+      with summary_ops_v2.always_record_summaries(), writer2.as_default():
+        summary2 = summary_ops_v2.scalar("two", 2.0, step=2)
+      sess.run(writer2.init())
+      sess.run(summary2)
+      sess.run(writer2.flush())
+
+      # Next has different shared name, should be in separate file
+      time.sleep(1.1)  # Ensure filename has a different timestamp
+      writer3 = summary_ops_v2.create_file_writer(logdir=logdir, name="other")
+      with summary_ops_v2.always_record_summaries(), writer3.as_default():
+        summary3 = summary_ops_v2.scalar("three", 3.0, step=3)
+      sess.run(writer3.init())
+      sess.run(summary3)
+      sess.run(writer3.flush())
+
+      # Next uses a second session, should be in separate file
+      time.sleep(1.1)  # Ensure filename has a different timestamp
+      with session.Session() as other_sess:
+        writer4 = summary_ops_v2.create_file_writer(logdir=logdir)
+        with summary_ops_v2.always_record_summaries(), writer4.as_default():
+          summary4 = summary_ops_v2.scalar("four", 4.0, step=4)
+        other_sess.run(writer4.init())
+        other_sess.run(summary4)
+        other_sess.run(writer4.flush())
+
+        # Next via FileWriter(session=?) uses same second session, should be in
+        # same separate file. (This checks sharing in the other direction)
+        writer5 = writer.FileWriter(session=other_sess, logdir=logdir)
+        writer5.add_summary(self._createTaggedSummary("five"), 5)
+        writer5.flush()
+
+      # One more via create_file_writer(), should use same file
+      writer6 = summary_ops_v2.create_file_writer(logdir=logdir)
+      with summary_ops_v2.always_record_summaries(), writer6.as_default():
+        summary6 = summary_ops_v2.scalar("six", 6.0, step=6)
+      sess.run(writer6.init())
+      sess.run(summary6)
+      sess.run(writer6.flush())
+
+    event_paths = iter(sorted(glob.glob(os.path.join(logdir, "event*"))))
+
+    # First file should have tags "one", "two", and "six"
+    events = summary_iterator.summary_iterator(next(event_paths))
+    self.assertEqual("brain.Event:2", next(events).file_version)
+    self.assertEqual("one", next(events).summary.value[0].tag)
+    self.assertEqual("two", next(events).summary.value[0].tag)
+    self.assertEqual("six", next(events).summary.value[0].tag)
+    self.assertRaises(StopIteration, lambda: next(events))
+
+    # Second file should have just "three"
+    events = summary_iterator.summary_iterator(next(event_paths))
+    self.assertEqual("brain.Event:2", next(events).file_version)
+    self.assertEqual("three", next(events).summary.value[0].tag)
+    self.assertRaises(StopIteration, lambda: next(events))
+
+    # Third file should have "four" and "five"
+    events = summary_iterator.summary_iterator(next(event_paths))
+    self.assertEqual("brain.Event:2", next(events).file_version)
+    self.assertEqual("four", next(events).summary.value[0].tag)
+    self.assertEqual("five", next(events).summary.value[0].tag)
+    self.assertRaises(StopIteration, lambda: next(events))
+
+    # No more files
+    self.assertRaises(StopIteration, lambda: next(event_paths))
+
+
+class FileWriterCacheTest(test.TestCase):
+  """FileWriterCache tests."""
 
   def _test_dir(self, test_name):
     """Create an empty dir to use for tests.
@@ -448,32 +622,5 @@ class SummaryWriterCacheTest(test.TestCase):
       self.assertFalse(sw1 == sw2)
 
 
-class ExamplePluginAsset(plugin_asset.PluginAsset):
-  plugin_name = "example"
-
-  def assets(self):
-    return {"foo.txt": "foo!", "bar.txt": "bar!"}
-
-
-class PluginAssetsTest(test.TestCase):
-
-  def testPluginAssetSerialized(self):
-    with ops.Graph().as_default() as g:
-      plugin_asset.get_plugin_asset(ExamplePluginAsset)
-
-      logdir = self.get_temp_dir()
-      fw = writer.FileWriter(logdir)
-      fw.add_graph(g)
-    plugin_dir = os.path.join(logdir, writer._PLUGINS_DIR, "example")
-
-    with gfile.Open(os.path.join(plugin_dir, "foo.txt"), "r") as f:
-      content = f.read()
-    self.assertEqual(content, "foo!")
-
-    with gfile.Open(os.path.join(plugin_dir, "bar.txt"), "r") as f:
-      content = f.read()
-    self.assertEqual(content, "bar!")
-
-
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/tensorflow.i b/tensorflow/python/tensorflow.i
index 82b908ac0e95643d1daf5ed062be44a58cfea97f..26e8acd8977734768accb1f9c7e37431c337ee34 100644
--- a/tensorflow/python/tensorflow.i
+++ b/tensorflow/python/tensorflow.i
@@ -25,6 +25,7 @@ limitations under the License.
 %include "tensorflow/python/util/tfprof.i"
 
 %include "tensorflow/python/lib/core/py_func.i"
+%include "tensorflow/python/lib/core/py_exception_registry.i"
 
 %include "tensorflow/python/lib/io/py_record_reader.i"
 %include "tensorflow/python/lib/io/py_record_writer.i"
@@ -54,4 +55,3 @@ limitations under the License.
 %include "tensorflow/python/grappler/tf_optimizer.i"
 %include "tensorflow/python/grappler/cost_analyzer.i"
 %include "tensorflow/python/grappler/model_analyzer.i"
-
diff --git a/tensorflow/python/tools/BUILD b/tensorflow/python/tools/BUILD
index 1de1adcfbc35e2b760f362cb9784dd415b9a4dc4..cc2884a4f66ef66a4c28814f357bac56a919b1c3 100644
--- a/tensorflow/python/tools/BUILD
+++ b/tensorflow/python/tools/BUILD
@@ -28,7 +28,7 @@ py_library(
     name = "saved_model_utils",
     srcs = ["saved_model_utils.py"],
     srcs_version = "PY2AND3",
-    deps = ["//tensorflow:tensorflow_py"],
+    deps = ["//tensorflow/contrib/saved_model:reader"],
 )
 
 py_library(
@@ -38,11 +38,12 @@ py_library(
     deps = [
         ":saved_model_utils",
         "//tensorflow/core:protos_all_py",
-        "//tensorflow/python",  # TODO(b/34059704): remove when fixed
         "//tensorflow/python:client",
         "//tensorflow/python:framework",
+        "//tensorflow/python:parsing_ops",
         "//tensorflow/python:platform",
         "//tensorflow/python:training",
+        "//tensorflow/python/saved_model:loader",
         "@six_archive//:six",
     ],
 )
@@ -52,14 +53,7 @@ py_binary(
     srcs = ["freeze_graph.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":saved_model_utils",
-        "//tensorflow/core:protos_all_py",
-        "//tensorflow/python",  # TODO(b/34059704): remove when fixed
-        "//tensorflow/python:client",
-        "//tensorflow/python:framework",
-        "//tensorflow/python:platform",
-        "//tensorflow/python:training",
-        "@six_archive//:six",
+        ":freeze_graph_lib",
     ],
 )
 
@@ -258,17 +252,3 @@ py_test(
         "//tensorflow/core:protos_all_py",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-            "bin/**",
-            "gen/**",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/python/tools/optimize_for_inference.py b/tensorflow/python/tools/optimize_for_inference.py
index 902748d55efedf2166cbeb0d8e0fcff0d18ed152..dac6a06a89c7596dd66d0ed7a2e5a59a0ba9b9dd 100644
--- a/tensorflow/python/tools/optimize_for_inference.py
+++ b/tensorflow/python/tools/optimize_for_inference.py
@@ -87,7 +87,9 @@ def main(unused_args):
   output_graph_def = optimize_for_inference_lib.optimize_for_inference(
       input_graph_def,
       FLAGS.input_names.split(","),
-      FLAGS.output_names.split(","), FLAGS.placeholder_type_enum)
+      FLAGS.output_names.split(","),
+      FLAGS.placeholder_type_enum,
+      FLAGS.toco_compatible)
 
   if FLAGS.frozen_graph:
     f = gfile.FastGFile(FLAGS.output, "w")
@@ -138,6 +140,14 @@ def parse_args():
       type=int,
       default=dtypes.float32.as_datatype_enum,
       help="The AttrValue enum to use for placeholders.")
+  parser.add_argument(
+      "--toco_compatible",
+      type=bool,
+      default=False,
+      help="""\
+      If true, only use ops compatible with Tensorflow
+      Lite Optimizing Converter.\
+      """)
   return parser.parse_known_args()
 
 
diff --git a/tensorflow/python/tools/optimize_for_inference_lib.py b/tensorflow/python/tools/optimize_for_inference_lib.py
index 9c1927122252f45ddfa8092045c7589fa0f45532..bb90d1cd6e33aacf4bb7498fb9c9e7ecfb447c04 100644
--- a/tensorflow/python/tools/optimize_for_inference_lib.py
+++ b/tensorflow/python/tools/optimize_for_inference_lib.py
@@ -87,7 +87,7 @@ EPSILON_ATTR = {
 
 
 def optimize_for_inference(input_graph_def, input_node_names, output_node_names,
-                           placeholder_type_enum):
+                           placeholder_type_enum, toco_compatible=False):
   """Applies a series of inference optimizations on the input graph.
 
   Args:
@@ -98,6 +98,8 @@ def optimize_for_inference(input_graph_def, input_node_names, output_node_names,
       results.
     placeholder_type_enum: The AttrValue enum for the placeholder data type, or
         a list that specifies one value per input node name.
+    toco_compatible: Boolean, if True, only runs optimizations that result in
+      TOCO compatible graph operations (default=False).
 
   Returns:
     An optimized version of the input graph.
@@ -110,8 +112,9 @@ def optimize_for_inference(input_graph_def, input_node_names, output_node_names,
   optimized_graph_def = graph_util.remove_training_nodes(
       optimized_graph_def, output_node_names)
   optimized_graph_def = fold_batch_norms(optimized_graph_def)
-  optimized_graph_def = fuse_resize_and_conv(optimized_graph_def,
-                                             output_node_names)
+  if not toco_compatible:
+    optimized_graph_def = fuse_resize_and_conv(optimized_graph_def,
+                                               output_node_names)
   ensure_graph_is_valid(optimized_graph_def)
   return optimized_graph_def
 
diff --git a/tensorflow/python/training/basic_session_run_hooks.py b/tensorflow/python/training/basic_session_run_hooks.py
index aae757b99aa9abb2fca112dcc781fc31e367649d..094a9e886ba87b639c143983d77949d664c8255a 100644
--- a/tensorflow/python/training/basic_session_run_hooks.py
+++ b/tensorflow/python/training/basic_session_run_hooks.py
@@ -859,6 +859,7 @@ class ProfilerHook(session_run_hook.SessionRunHook):
           showing the sizes and lifetimes of tensors.
     """
     self._output_file = os.path.join(output_dir, "timeline-{}.json")
+    self._file_writer = SummaryWriterCache.get(output_dir)
     self._show_dataflow = show_dataflow
     self._show_memory = show_memory
     self._timer = SecondOrStepTimer(
@@ -889,6 +890,8 @@ class ProfilerHook(session_run_hook.SessionRunHook):
       self._save(global_step,
                  self._output_file.format(global_step),
                  run_values.run_metadata.step_stats)
+      self._file_writer.add_run_metadata(run_values.run_metadata,
+                                         "step_%d" % global_step)
 
     self._next_step = global_step + 1
 
diff --git a/tensorflow/python/training/basic_session_run_hooks_test.py b/tensorflow/python/training/basic_session_run_hooks_test.py
index 2547661e5250e94136a100aa8c30c9dbb7455018..f39a5261a93c3dc2df3a2364010448df116461e9 100644
--- a/tensorflow/python/training/basic_session_run_hooks_test.py
+++ b/tensorflow/python/training/basic_session_run_hooks_test.py
@@ -1274,6 +1274,19 @@ class ProfilerHookTest(test.TestCase):
         sess.run(self.train_op)  # Saved.
         self.assertEqual(3, self._count_timeline_files())
 
+  def test_run_metadata_saves_in_first_step(self):
+    writer_cache.FileWriterCache.clear()
+    fake_summary_writer.FakeSummaryWriter.install()
+    fake_writer = writer_cache.FileWriterCache.get(self.output_dir)
+    with self.graph.as_default():
+      hook = basic_session_run_hooks.ProfilerHook(
+          save_secs=2, output_dir=self.output_dir)
+      with monitored_session.SingularMonitoredSession(hooks=[hook]) as sess:
+        sess.run(self.train_op)  # Saved.
+        self.assertEqual(
+            list(fake_writer._added_run_metadata.keys()), ['step_1'])
+    fake_summary_writer.FakeSummaryWriter.uninstall()
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/training/checkpointable.py b/tensorflow/python/training/checkpointable.py
index d0650eb127640a5cfb28f9c238343791bfa1746c..9bf48df22eb6c933aeb5362b68e9efd97d263d16 100644
--- a/tensorflow/python/training/checkpointable.py
+++ b/tensorflow/python/training/checkpointable.py
@@ -94,12 +94,13 @@ class _CheckpointPosition(object):
 
   def restore(self, checkpointable):
     """Restore this value into `checkpointable`."""
-    if self.bind_object(checkpointable):
-      # This object's correspondence with a checkpointed object is new, so
-      # process deferred restorations for it and its dependencies.
-      restore_ops = checkpointable._restore_from_checkpoint_position(self)  # pylint: disable=protected-access
-      if restore_ops:
-        self._checkpoint.restore_ops.extend(restore_ops)
+    with ops.init_scope():
+      if self.bind_object(checkpointable):
+        # This object's correspondence with a checkpointed object is new, so
+        # process deferred restorations for it and its dependencies.
+        restore_ops = checkpointable._restore_from_checkpoint_position(self)  # pylint: disable=protected-access
+        if restore_ops:
+          self._checkpoint.restore_ops.extend(restore_ops)
 
   def bind_object(self, checkpointable):
     """Set a checkpoint<->object correspondence and process slot variables.
@@ -409,28 +410,29 @@ class CheckpointableBase(object):
            "Checkpointable._add_variable called to create another with "
            "that name. Variable names must be unique within a Checkpointable "
            "object.") % (name,))
-    if context.executing_eagerly():
-      # If this is a variable with a single Tensor stored in the checkpoint, we
-      # can set that value as an initializer rather than initializing and then
-      # assigning (when executing eagerly). This call returns None if there is
-      # nothing to restore.
-      checkpoint_initializer = self._preload_simple_restoration(
-          name=name, shape=shape)
-    else:
-      checkpoint_initializer = None
-    if (checkpoint_initializer is not None
-        and not (
-            isinstance(initializer, CheckpointInitialValue)
-            and initializer.restore_uid > checkpoint_initializer.restore_uid)):
-      # If multiple Checkpointable objects are "creating" the same variable via
-      # the magic of custom getters, the one with the highest restore UID (the
-      # one called last) has to make the final initializer. If another custom
-      # getter interrupts this process by overwriting the initializer, then
-      # we'll catch that when we call _track_checkpointable. So this is "best
-      # effort" to set the initializer with the highest restore UID.
-      initializer = checkpoint_initializer
-      shape = None
-
+    with ops.init_scope():
+      if context.executing_eagerly():
+        # If this is a variable with a single Tensor stored in the checkpoint,
+        # we can set that value as an initializer rather than initializing and
+        # then assigning (when executing eagerly). This call returns None if
+        # there is nothing to restore.
+        checkpoint_initializer = self._preload_simple_restoration(
+            name=name, shape=shape)
+      else:
+        checkpoint_initializer = None
+      if (checkpoint_initializer is not None
+          and not (
+              isinstance(initializer, CheckpointInitialValue)
+              and (initializer.restore_uid
+                   > checkpoint_initializer.restore_uid))):
+        # If multiple Checkpointable objects are "creating" the same variable
+        # via the magic of custom getters, the one with the highest restore UID
+        # (the one called last) has to make the final initializer. If another
+        # custom getter interrupts this process by overwriting the initializer,
+        # then we'll catch that when we call _track_checkpointable. So this is
+        # "best effort" to set the initializer with the highest restore UID.
+        initializer = checkpoint_initializer
+        shape = None
     new_variable = getter(
         name=name, shape=shape, dtype=dtype, initializer=initializer,
         **kwargs_for_getter)
@@ -560,6 +562,7 @@ class CheckpointableBase(object):
       checkpointable: The Checkpointable object to restore (inheriting from
         `CheckpointableBase`).
     """
+    self._maybe_initialize_checkpointable()
     deferred_dependencies_list = self._deferred_dependencies.pop(name, ())
     for checkpoint_position in sorted(
         deferred_dependencies_list,
diff --git a/tensorflow/python/training/device_setter.py b/tensorflow/python/training/device_setter.py
index d31c375b4ce48dcb9bc2918514707636a647c675..be80c3657158b52d063b5d2b7731f25d184794a0 100644
--- a/tensorflow/python/training/device_setter.py
+++ b/tensorflow/python/training/device_setter.py
@@ -25,14 +25,13 @@ from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import server_lib
 from tensorflow.python.util.tf_export import tf_export
 
-# This is a tuple of PS ops used by tf.estimator.Esitmator which should work in
+# This is a tuple of PS ops used by tf.estimator.Estimator which should work in
 # almost all of cases.
-STANDARD_PS_OPS = (
-    "Variable", "VariableV2", "AutoReloadVariable", "MutableHashTable",
-    "MutableHashTableV2", "MutableHashTableOfTensors",
-    "MutableHashTableOfTensorsV2", "MutableDenseHashTable",
-    "MutableDenseHashTableV2", "VarHandleOp"
-)
+STANDARD_PS_OPS = ("Variable", "VariableV2", "AutoReloadVariable",
+                   "MutableHashTable", "MutableHashTableV2",
+                   "MutableHashTableOfTensors", "MutableHashTableOfTensorsV2",
+                   "MutableDenseHashTable", "MutableDenseHashTableV2",
+                   "VarHandleOp", "BoostedTreesEnsembleResourceHandleOp")
 
 
 class _RoundRobinStrategy(object):
diff --git a/tensorflow/python/training/distribute.py b/tensorflow/python/training/distribute.py
index 9261e132302043e97f2adb696fbde2dd01c897ce..c6b2dcdf98bd8436357a26c2ff69072b5009bed3 100644
--- a/tensorflow/python/training/distribute.py
+++ b/tensorflow/python/training/distribute.py
@@ -23,8 +23,11 @@ import threading
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops.losses import losses_impl
+from tensorflow.python.platform import tf_logging
 from tensorflow.python.training import device_util
 from tensorflow.python.util import nest
 
@@ -99,6 +102,7 @@ _update_device = threading.local()
 
 
 def get_update_device():
+  """Get the current device if in a `DistributionStrategy.update()` call."""
   try:
     return _update_device.current
   except AttributeError:
@@ -126,16 +130,18 @@ class UpdateContext(object):
 
 
 def get_tower_context():
-  """Returns the current TowerContext or None.
+  """Returns the current TowerContext or None if in a cross-tower context.
 
   Note that execution:
-  1. starts in the default (single-tower) tower context;
-  2. switches to cross-tower context when entering a
-     `with DistributionStrategy.scope():` block;
+  1. starts in the default (single-tower) tower context (this function
+     will return the default TowerContext object);
+  2. switches to cross-tower context (in which case this will return
+     None) when entering a `with DistributionStrategy.scope():` block;
   3. switches to a (non-default) tower context inside
      `call_for_each_tower(fn, ...)`;
   4. if `fn` calls `get_tower_context()->merge_call(merge_fn, ...)`, then
-     inside `merge_fn` you are back in the cross-tower context.
+     inside `merge_fn` you are back in the cross-tower context (and again
+     this function will return None).
 
   Note that you can also go directly from step 1 to 4 to switch to a
   cross-tower context for the default `DistributionStrategy`. You may
@@ -188,6 +194,9 @@ def get_cross_tower_context():
 def get_distribution_strategy():
   """Returns the current `DistributionStrategy` object.
 
+  Prefer to use `get_tower_context()` or `get_cross_tower_context()`
+  instead when possible.
+
   Returns:
     A `DistributionStrategy` object. Inside a
     `with distribution_strategy.scope()` block, it returns
@@ -279,17 +288,22 @@ def _require_distribution_strategy_scope(distribution_strategy):
 class _CurrentDistributionContext(object):
   """Context manager for setting the `DistributionStrategy` and var creator."""
 
-  def __init__(self, distribution_strategy, var_creator_scope):
+  def __init__(self, distribution_strategy, var_creator_scope, var_scope=None):
     self._context = _CrossTowerThreadMode(distribution_strategy)
     self._var_creator_scope = var_creator_scope
+    self._var_scope = var_scope
 
   def __enter__(self):
     _push_per_thread_mode(self._context)
+    if self._var_scope:
+      self._var_scope.__enter__()
     self._var_creator_scope.__enter__()
     return self._context.distribution_strategy
 
   def __exit__(self, exception_type, exception_value, traceback):
     self._var_creator_scope.__exit__(exception_type, exception_value, traceback)
+    if self._var_scope:
+      self._var_scope.__exit__(exception_type, exception_value, traceback)
     _pop_per_thread_mode()
 
 
@@ -362,7 +376,9 @@ class DistributionStrategy(object):
     update. Allreduce is an algorithm for performing a reduction on
     values from multiple devices and making the result available on
     all of those devices.
-  * TODO(josh11b): Future: partitioned variables
+  * In the future we will have support for TensorFlows' partitioned
+    variables, where a single variable is split across multiple
+    devices.
 
   We have then a few approaches we want to support:
   * Code written (as if) with no knowledge of class `DistributionStrategy`.
@@ -376,7 +392,6 @@ class DistributionStrategy(object):
     ```
     with my_distribution.scope():
       iterator = my_distribution.distribute_dataset(dataset)
-      # TODO(josh11b): iterator = dataset.make_one_shot_iterator()
       tower_train_ops = my_distribution.call_for_each_tower(
           tower_fn, iterator.get_next())
       train_op = tf.group(my_distribution.unwrap(tower_train_ops))
@@ -388,6 +403,10 @@ class DistributionStrategy(object):
     using `my_distribution`'s policy, and library functions called by
     `tower_fn` can use the `get_tower_context()` API to get enhanced
     behavior in this case.
+
+    Note that in the future we will add support for initializable
+    Dataset iterators, at which point this example code will change.
+
   * If you want to write a distributed algorithm, you may use any of
     the `DistributionStrategy` APIs inside a
     `with my_distribution.scope():` block of code.
@@ -401,19 +420,19 @@ class DistributionStrategy(object):
     different across devices, and "Mirrored" when the value are the same.
   * Unwrapping and merging: Consider calling a function `fn` on
     multiple devices, like `call_for_each_tower(fn, w)` with an
-    argument `w that is a wrapped value. This means `w` will have a
+    argument `w` that is a wrapped value. This means `w` will have a
     map taking tower device `d0` to `w0`, tower device `d1` to `w1`,
     etc. `call_for_each_tower()` unwraps `w` before calling `fn`, so
     it calls `fn(w0)` on `d0`, `fn(w1)` on `d1`, etc.  It then merges
     the return values from `fn()`, which can possibly result in
     wrapped values. For example, let's say `fn()` returns a tuple with
-    three components: (x, a, v0) from tower 0, (x, b, v1) on tower 1,
+    three components: `(x, a, v0)` from tower 0, `(x, b, v1)` on tower 1,
     etc. If the first component is the same object `x` from every
     tower, then the first component of the merged result will also be
     `x`. If the second component is different (`a`, `b`, ...)  from
     each tower, then the merged value will have a wrapped map from
     tower device to the different values. If the third component is
-    the members of a mirrored variable (`v` maps `d0` to `v0, `d1` to
+    the members of a mirrored variable (`v` maps `d0` to `v0`, `d1` to
     `v1`, etc.), then the merged result will be that mirrored variable
     (`v`).
   * Tower context vs. Cross-tower context: _tower context_ is when we
@@ -500,7 +519,7 @@ class DistributionStrategy(object):
 
   Steps 3 and 4 are done automatically by class `Optimizer` if you call
   its `apply_gradients` method in a tower context. Otherwise you can
-  manually call its `distributed_apply` method in a cross-tower context.
+  manually call its `_distributed_apply` method in a cross-tower context.
 
   Another thing you might want to do in the middle of your tower function
   is an all-reduce of some intermediate value, using `d.reduce()` or
@@ -519,14 +538,13 @@ class DistributionStrategy(object):
   in the distributed vs. single tower cases.
   """
 
-  # TODO(josh11b): Raise an exception if variable paritioning requested before
+  # TODO(josh11b): Raise an exception if variable partitioning requested before
   #   we add support.
   # TODO(josh11b): Also `parameter_device_index` property?
   # TODO(josh11b): `map()`
   # TODO(josh11b): ClusterSpec/ClusterResolver
   # TODO(josh11b): Partitioned computations, state; sharding
   # TODO(josh11b): Model parallelism: "towers" with multiple devices; shuffling
-  # TODO(josh11b): Tower-local variables
   # TODO(josh11b): List of towers with their worker and parameter devices
   #   (where the parameter devices may overlap in the ps case).
 
@@ -549,17 +567,69 @@ class DistributionStrategy(object):
       kwargs["use_resource"] = True
       return self._create_variable(*args, **kwargs)
 
+    def disable_partitioned_variables(getter, *args, **kwargs):
+      if kwargs.pop("partitioner", None) is not None:
+        tf_logging.log_first_n(
+            tf_logging.WARN, "Partitioned variables are disabled when using "
+            "DistributionStrategy.", 1)
+      return getter(*args, **kwargs)
+
     return _CurrentDistributionContext(
-        self, variable_scope.variable_creator_scope(creator_with_resource_vars))
+        self, variable_scope.variable_creator_scope(creator_with_resource_vars),
+        variable_scope.variable_scope(
+            variable_scope.get_variable_scope(),
+            custom_getter=disable_partitioned_variables))
 
   def _create_variable(self, next_creator, *args, **kwargs):
     # Note: should support "colocate_with" argument.
     raise NotImplementedError("must be implemented in descendants")
 
+  def tower_local_var_scope(self, reduce_method):
+    """Inside this scope, new variables will not be mirrored.
+
+    There will still be one component variable per tower, but there is
+    no requirement that they stay in sync. Instead, when saving them
+    or calling `fetch()`, we use the value that results when calling
+    `reduce()` on all the towers' variables.
+
+    Note: tower-local implies not trainable. Instead, it is expected
+    that each tower will directly update (using `assign_add()` or
+    whatever) its local variable instance but only the aggregated
+    value (accessible using `fetch()`) will be exported from the
+    model. When it is acceptable to only aggregate on export, we
+    greatly reduce communication overhead by using tower-local
+    variables.
+
+    Note: All component variables will be initialized to the same
+    value, using the initialization expression from the first tower.
+    The values will match even if the initialization expression uses
+    random numbers.
+
+    Args:
+      reduce_method: String used as a `method_string` to `reduce()`
+        to get the value to save when checkpointing.
+
+    Returns:
+      A context manager.
+    """
+    def create_tower_local_variable(next_creator, *args, **kwargs):
+      _require_distribution_strategy_scope(self)
+      kwargs["use_resource"] = True
+      kwargs["tower_local_reduce_method"] = reduce_method
+      return next_creator(*args, **kwargs)
+
+    _require_distribution_strategy_scope(self)
+    return variable_scope.variable_creator_scope(create_tower_local_variable)
+
   def colocate_vars_with(self, colocate_with_variable):
-    """Controls which devices variables will be created on.
+    """Scope that controls which devices variables will be created on.
+
+    No operations should be added to the graph inside this scope, it
+    should only be used when creating variables (some implementations
+    work by changing variable creation, others work by using a
+    tf.colocate_with() scope).
 
-    Note this may only be used inside `self.scope()`.
+    This may only be used inside `self.scope()`.
 
     Example usage:
 
@@ -905,6 +975,10 @@ class DistributionStrategy(object):
   def _worker_device_index(self):
     raise NotImplementedError("must be implemented in descendants")
 
+  def configure(self, session_config=None):
+    """Find the best configuration given a tensorflow session config."""
+    del session_config
+
 
 # A note about the difference between the context managers
 # `TowerContext` (defined here) and `_CurrentDistributionContext`
@@ -979,6 +1053,10 @@ class TowerContext(object):
     finally:
       _pop_per_thread_mode()
 
+  def tower_local_var_scope(self, reduce_method):
+    """Alias for distribution_strategy.tower_local_var_scope()."""
+    return self._distribution_strategy.tower_local_var_scope(reduce_method)
+
   @property
   def is_single_tower(self):
     """Returns whether there is a single tower or multiple."""
@@ -1025,20 +1103,27 @@ class _DefaultDistributionStrategy(DistributionStrategy):
 
     def creator(next_creator, *args, **kwargs):
       _require_distribution_strategy_scope(self)
+      if kwargs.pop("tower_local_reduce_method", None) is not None:
+        kwargs["trainable"] = False
       return next_creator(*args, **kwargs)
 
     return _CurrentDistributionContext(
         self, variable_scope.variable_creator_scope(creator))
 
-  def colocate_vars_with(self, colocate_with_variable):
-    """Does not require `self.scope`."""
-    def create_colocated_variable(next_creator, *args, **kwargs):
+  def tower_local_var_scope(self, reduce_method):
+    """Does not set to resource variables."""
+    def create_tower_local_variable(next_creator, *args, **kwargs):
       _require_distribution_strategy_scope(self)
-      with ops.colocate_with(colocate_with_variable):
-        return next_creator(*args, **kwargs)
+      kwargs["tower_local_reduce_method"] = reduce_method
+      return next_creator(*args, **kwargs)
 
     _require_distribution_strategy_scope(self)
-    return variable_scope.variable_creator_scope(create_colocated_variable)
+    return variable_scope.variable_creator_scope(create_tower_local_variable)
+
+  def colocate_vars_with(self, colocate_with_variable):
+    """Does not require `self.scope`."""
+    _require_distribution_strategy_scope(self)
+    return ops.colocate_with(colocate_with_variable)
 
   def distribute_dataset(self, dataset):
     # TODO(josh11b): Support for this when executing eagerly is currently only
@@ -1108,6 +1193,24 @@ class _DefaultDistributionStrategy(DistributionStrategy):
     raise RuntimeError("worker_device_index() method unsupported by "
                        "_DefaultDistributionStrategy.")
 
+# ------------------------------------------------------------------------------
+# Common operations
+
+
+def increment_var(v, amount=1):
+  """`v += amount`, distributed-aware version."""
+  def update(vu):
+    if isinstance(vu, resource_variable_ops.ResourceVariable):
+      return vu.assign_add(amount, read_value=False)
+    else:
+      return state_ops.assign_add(vu, amount)
+
+  def merge_fn(dist, vm):
+    return dist.group(dist.update(vm, update))
+
+  tower_context = get_tower_context()
+  return tower_context.merge_call(merge_fn, v)
+
 
 # ------------------------------------------------------------------------------
 # Singletons
@@ -1116,3 +1219,23 @@ _default_distribution_strategy = _DefaultDistributionStrategy()
 _default_tower_context = TowerContext(
     _default_distribution_strategy, tower_id=0)
 _default_tower_mode = _DefaultTowerThreadMode()
+
+
+# ------------------------------------------------------------------------------
+# We haven't yet implemented deserialization for DistributedVariables.
+# So here we catch any attempts to deserialize variables
+# when using distribution strategies.
+# pylint: disable=protected-access
+_original_from_proto = resource_variable_ops._from_proto_fn
+
+
+def _from_proto_fn(v, import_scope=None):
+  if has_distribution_strategy():
+    raise NotImplementedError(
+        "Deserialization of variables is not yet supported when using"
+        "distributed strategies.")
+  else:
+    return _original_from_proto(v, import_scope=import_scope)
+
+resource_variable_ops._from_proto_fn = _from_proto_fn
+# pylint: enable=protected-access
diff --git a/tensorflow/python/training/input.py b/tensorflow/python/training/input.py
index 44f00a96deff64012705c4c81b185a9c4fac2295..caa26581e8a0041dd1b157ab6b1f8236344582e8 100644
--- a/tensorflow/python/training/input.py
+++ b/tensorflow/python/training/input.py
@@ -515,8 +515,7 @@ def _store_sparse_tensors(tensor_list, enqueue_many, keep_input,
     def _sparse_values_to_keep(t, keep_input):
       """Convert a per-row `keep_input` vector to a per-value one."""
       # Get the rows of every value in the sparse Tensor.
-      row_values = array_ops.reshape(
-          t.indices, [array_ops.shape(t.indices)[0], -1])[:, 0]
+      row_values = t.indices[:, 0]
       # The value should be kept iff the row should be kept.
       return array_ops.gather(keep_input, row_values)
     if keep_input.shape.ndims == 1:
diff --git a/tensorflow/python/training/momentum_test.py b/tensorflow/python/training/momentum_test.py
index 297a8bbde5447cff9465be36c0bb71f2490c60fc..7bd57ad3d854534e196fa7b72bebbd7195e6bca8 100644
--- a/tensorflow/python/training/momentum_test.py
+++ b/tensorflow/python/training/momentum_test.py
@@ -237,7 +237,17 @@ class MomentumOptimizerTest(test.TestCase):
   @test_util.run_in_graph_and_eager_modes(reset_test=True)
   def testMinimizeSparseResourceVariable(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
-      var0 = resource_variable_ops.ResourceVariable([[1.0, 2.0]], dtype=dtype)
+      # This test invokes the ResourceSparseApplyMomentum operation, which
+      # did not have a registered GPU kernel as of April 2018. With graph
+      # execution, the placement algorithm notices this and automatically
+      # places the variable in CPU (host) memory. With eager execution,
+      # the variable would be placed in GPU memory if available, which
+      # would then conflict with the future invocation of the
+      # ResourceSparseApplyMomentum operation.
+      # To work around this discrepancy, for now we force the variable
+      # to be placed on CPU.
+      with ops.device("/cpu:0"):
+        var0 = resource_variable_ops.ResourceVariable([[1.0, 2.0]], dtype=dtype)
 
       # pylint: disable=cell-var-from-loop
       def loss():
@@ -256,7 +266,17 @@ class MomentumOptimizerTest(test.TestCase):
 
   @test_util.run_in_graph_and_eager_modes(reset_test=True)
   def testMinimizeWith2DIndiciesForEmbeddingLookup(self):
-    var0 = resource_variable_ops.ResourceVariable(array_ops.ones([2, 2]))
+    # This test invokes the ResourceSparseApplyMomentum operation, which
+    # did not have a registered GPU kernel as of April 2018. With graph
+    # execution, the placement algorithm notices this and automatically
+    # places the variable in CPU (host) memory. With eager execution,
+    # the variable would be placed in GPU memory if available, which
+    # would then conflict with the future invocation of the
+    # ResourceSparseApplyMomentum operation.
+    # To work around this discrepancy, for now we force the variable
+    # to be placed on CPU.
+    with ops.device("/cpu:0"):
+      var0 = resource_variable_ops.ResourceVariable(array_ops.ones([2, 2]))
 
     def loss():
       return math_ops.reduce_sum(embedding_ops.embedding_lookup(var0, [[1]]))
diff --git a/tensorflow/python/training/monitored_session.py b/tensorflow/python/training/monitored_session.py
index 2d4f09a60a518471b4f1c8104bf606953f0f296d..4ce6f6d00267410626f7d7a9e2251d3f40b6bb6e 100644
--- a/tensorflow/python/training/monitored_session.py
+++ b/tensorflow/python/training/monitored_session.py
@@ -350,8 +350,8 @@ def MonitoredTrainingSession(master='',  # pylint: disable=invalid-name
   elif save_summaries_steps == USE_DEFAULT:
     save_summaries_steps = None
 
-  if save_checkpoint_steps == USE_DEFAULT and \
-    save_checkpoint_secs == USE_DEFAULT:
+  if (save_checkpoint_steps == USE_DEFAULT and
+      save_checkpoint_secs == USE_DEFAULT):
     save_checkpoint_steps = None
     save_checkpoint_secs = 600
   elif save_checkpoint_secs == USE_DEFAULT:
diff --git a/tensorflow/python/training/optimizer.py b/tensorflow/python/training/optimizer.py
index bf79714f9682e60b97788b8b470821cfe9290886..f126d3847b6b0b43495c63b31ca915c107ede969 100644
--- a/tensorflow/python/training/optimizer.py
+++ b/tensorflow/python/training/optimizer.py
@@ -35,11 +35,28 @@ from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.training import checkpointable
+from tensorflow.python.training import distribute as distribute_lib
 from tensorflow.python.training import slot_creator
 from tensorflow.python.util import nest
 from tensorflow.python.util.tf_export import tf_export
 
 
+def get_filtered_grad_fn(grad_fn):
+  # `distributed_context.join()` requires that its arguments are parallel
+  # across threads, and in particular that `grads_and_vars` has the same
+  # variables in the same order.
+
+  # When computing gradients in eager mode with multiple threads, you
+  # can get extra variables with a gradient of `None`. This happens when
+  # those variables are accessed in another thread during the gradient
+  # computation. To get a consistent set of variables, we filter out
+  # those with `None` gradients.
+  def filtered_grad_fn(x=None):
+    return [(g, v) for g, v in grad_fn(x) if g is not None]
+
+  return filtered_grad_fn
+
+
 def _deduplicate_indexed_slices(values, indices):
   """Sums `values` associated with any non-unique `indices`.
 
@@ -335,6 +352,13 @@ class Optimizer(
     #   ... }
     self._deferred_slot_restorations = {}
 
+    # TODO(isaprykin): When using a DistributionStrategy, and when an
+    # optimizer is created in each tower, it might be dangerous to
+    # rely on some Optimer methods.  When such methods are called on a
+    # per-tower optimizer, an exception needs to be thrown.  We do
+    # allow creation per-tower optimizers however, because the
+    # compute_gradients()->apply_gradients() sequence is safe.
+
   def get_name(self):
     return self._name
 
@@ -447,14 +471,33 @@ class Optimizer(
         if var_list is not None:
           tape.watch(var_list)
         loss_value = loss()
+
+        # Scale loss if using a "mean" loss reduction and multiple towers.
+        # Have to be careful to call distribute_lib.get_loss_reduction()
+        # *after* loss() is evaluated, so we know what loss reduction it uses.
+        # TODO(josh11b): Test that we handle weight decay in a reasonable way.
+        if distribute_lib.get_loss_reduction() == "mean":
+          num_towers = distribute_lib.get_distribution_strategy().num_towers
+          if num_towers > 1:
+            loss_value *= (1. / num_towers)
+
       if var_list is None:
         var_list = tape.watched_variables()
       grads = tape.gradient(loss_value, var_list, grad_loss)
       return list(zip(grads, var_list))
+
+    # Non-callable/Tensor loss case
     if context.executing_eagerly():
       raise RuntimeError(
           "`loss` passed to Optimizer.compute_gradients should "
           "be a function when eager execution is enabled.")
+
+    # Scale loss if using a "mean" loss reduction and multiple towers.
+    if distribute_lib.get_loss_reduction() == "mean":
+      num_towers = distribute_lib.get_distribution_strategy().num_towers
+      if num_towers > 1:
+        loss *= (1. / num_towers)
+
     if gate_gradients not in [Optimizer.GATE_NONE, Optimizer.GATE_OP,
                               Optimizer.GATE_GRAPH]:
       raise ValueError("gate_gradients must be one of: Optimizer.GATE_NONE, "
@@ -510,11 +553,25 @@ class Optimizer(
     Raises:
       TypeError: If `grads_and_vars` is malformed.
       ValueError: If none of the variables have gradients.
+      RuntimeError: If you should use `_distributed_apply()` instead.
     """
     # This is a default implementation of apply_gradients() that can be shared
     # by most optimizers.  It relies on the subclass implementing the following
     # methods: _create_slots(), _prepare(), _apply_dense(), and _apply_sparse().
 
+    # Handle DistributionStrategy case.
+    if distribute_lib.get_cross_tower_context():
+      raise RuntimeError("Use `_distributed_apply()` instead of "
+                         "`apply_gradients()` in a cross-tower context.")
+    # TODO(isaprykin): Get rid of `has_distribution_strategy()` check by
+    # always calling _distributed_apply(), using the default distribution
+    # as needed.
+    if distribute_lib.has_distribution_strategy():
+      grads_and_vars = get_filtered_grad_fn(lambda _: grads_and_vars)()
+      return distribute_lib.get_tower_context().merge_call(
+          self._distributed_apply, grads_and_vars, global_step, name)
+
+    # No DistributionStrategy case.
     grads_and_vars = tuple(grads_and_vars)  # Make sure repeat iteration works.
     if not grads_and_vars:
       raise ValueError("No variables provided.")
@@ -582,6 +639,90 @@ class Optimizer(
 
       return apply_updates
 
+  def _distributed_apply(self,
+                         distribution,
+                         grads_and_vars,
+                         global_step=None,
+                         name=None):
+    """A version of `apply_gradients` for cross-tower context.
+
+    This is a version of `apply_gradients()` for when you are using a
+    `DistributionStrategy` and are in a cross-tower context. If in a
+    tower context, use `apply_gradients()` as normal.
+
+    Args:
+      distribution: A `DistributionStrategy` object.
+      grads_and_vars: List of (gradient, variable) pairs as returned by
+        `compute_gradients()`, and then aggregated across towers.
+      global_step: Optional (mirrored) `Variable` to increment by one
+        after the variables have been updated.
+      name: Optional name for the returned operation.  Default to the
+        name passed to the `Optimizer` constructor.
+
+    Returns:
+      An `Operation` that applies the specified gradients across all
+      towers. If `global_step` was not None, that operation also
+      increments `global_step`.
+    """
+    reduced_grads = distribution.batch_reduce("sum", grads_and_vars)
+    var_list = [v for _, v in grads_and_vars]
+    grads_and_vars = zip(reduced_grads, var_list)
+    # Note that this is called in a cross-tower context.
+    self._create_slots(var_list)
+
+    def update(v, g):
+      """Apply gradients to a replica variable."""
+      assert v is not None
+
+      try:
+        # Convert the grad to Tensor or IndexedSlices if necessary.
+        g = ops.convert_to_tensor_or_indexed_slices(g)
+      except TypeError:
+        raise TypeError("Gradient must be convertible to a Tensor"
+                        " or IndexedSlices, or None: %s" % g)
+      if not isinstance(g, (ops.Tensor, ops.IndexedSlices)):
+        raise TypeError(
+            "Gradient must be a Tensor, IndexedSlices, or None: %s" % g)
+      p = _get_processor(v)
+
+      scope_name = "" if context.executing_eagerly() else v.op.name
+      # device_policy is set because non-mirrored tensors will be read in
+      # `update_op`. `_resource_apply_dense`, `lr_t`, `beta1_t` and `beta2_t`
+      # is an example.
+      with ops.name_scope("update_" + scope_name):
+        return p.update_op(self, g)
+
+    with ops.name_scope(name, self._name) as name:
+      self._prepare()
+
+      update_ops = [
+          op
+          for grad, var in grads_and_vars
+          for op in distribution.unwrap(distribution.update(var, update, grad))
+      ]
+
+      def finish(self, update_ops):
+        return self._finish(update_ops, "update")
+
+      non_slot_devices = distribution.non_slot_devices(var_list)
+      finish_updates = distribution.update_non_slot(
+          non_slot_devices, finish, self, update_ops)
+      if global_step is None:
+        apply_updates = distribution.group(finish_updates, name=name)
+      else:
+        with ops.control_dependencies(distribution.unwrap(finish_updates)):
+          apply_updates = distribution.group(distribution.update(
+              global_step, state_ops.assign_add, 1, name=name))
+
+      if not context.executing_eagerly():
+        if isinstance(apply_updates, ops.Tensor):
+          apply_updates = apply_updates.op
+        train_op = ops.get_collection_ref(ops.GraphKeys.TRAIN_OP)
+        if apply_updates not in train_op:
+          train_op.append(apply_updates)
+
+      return apply_updates
+
   def get_slot(self, var, name):
     """Return a slot named `name` created for `var` by the Optimizer.
 
@@ -599,9 +740,25 @@ class Optimizer(
     Returns:
       The `Variable` for the slot if it was created, `None` otherwise.
     """
+    # pylint: disable=protected-access
     named_slots = self._slots.get(name, None)
     if not named_slots:
       return None
+
+    if hasattr(var, "_mirrored_container"):
+      # NOTE: If this isn't patched, then there is no `handle` in
+      # `_resource_apply_dense`.
+      mirrored_container = var._mirrored_container()
+      assert mirrored_container is not None
+      if context.executing_eagerly():
+        key = mirrored_container._unique_id
+      else:
+        key = (mirrored_container.graph, mirrored_container._shared_name)
+      # pylint: enable=protected-access
+      mirrored_slot = named_slots.get(key, None)
+      if mirrored_slot is None: return None
+      return mirrored_slot.get(device=var.device)
+
     return named_slots.get(_var_key(var), None)
 
   def get_slot_names(self):
@@ -645,6 +802,7 @@ class Optimizer(
 
   def _create_non_slot_variable(self, initial_value, name, colocate_with):
     """Add an extra variable, not associated with a slot."""
+    # Recommendation: Use OptimizerV2 if your optimizer uses non-slot variables.
     eager = context.executing_eagerly()
     graph = None if eager else colocate_with.graph
 
@@ -652,20 +810,21 @@ class Optimizer(
     v = self._non_slot_dict.get(key, None)
     if v is None:
       self._maybe_initialize_checkpointable()
-      with ops.colocate_with(colocate_with):
+      distribution_strategy = distribute_lib.get_distribution_strategy()
+      with distribution_strategy.colocate_vars_with(colocate_with):
         if eager:
           restored_initial_value = self._preload_simple_restoration(
               name=name, shape=None)
           if restored_initial_value is not None:
             initial_value = restored_initial_value
         v = variable_scope.variable(initial_value, name=name, trainable=False)
-        # Restore this variable by name if necessary, but don't add a
-        # Checkpointable dependency. Optimizers return the current graph's
-        # non-slot variables from _checkpoint_dependencies explicitly rather
-        # than unconditionally adding dependencies (since there may be multiple
-        # non-slot variables with the same name in different graphs, trying to
-        # save all of them would result in errors).
-        self._handle_deferred_dependencies(name=name, checkpointable=v)
+      # Restore this variable by name if necessary, but don't add a
+      # Checkpointable dependency. Optimizers return the current graph's
+      # non-slot variables from _checkpoint_dependencies explicitly rather
+      # than unconditionally adding dependencies (since there may be multiple
+      # non-slot variables with the same name in different graphs, trying to
+      # save all of them would result in errors).
+      self._handle_deferred_dependencies(name=name, checkpointable=v)
       self._non_slot_dict[key] = v
 
     return v
@@ -694,7 +853,13 @@ class Optimizer(
     return self._get_non_slot_variable(name, graph=graph)
 
   def _get_non_slot_variable(self, name, graph=None):
-    return self._non_slot_dict.get((name, graph), None)
+    non_slot = self._non_slot_dict.get((name, graph), None)
+    if hasattr(non_slot, "_mirrored_container"):
+      # This is a mirrored non-slot.  In order to enable code like `_finish`
+      # to assign to a non-slot, return the current context replica.
+      return non_slot.get()
+    else:
+      return non_slot
 
   def _non_slot_variables(self):
     """Additional variables created by the `Optimizer`.
diff --git a/tensorflow/python/training/saver.py b/tensorflow/python/training/saver.py
index ba0d0384758f25cc2cc6264b9b73e47f15359721..e40b8d22ed2ab0f4c9ff65e953f0f1cf681c8068 100644
--- a/tensorflow/python/training/saver.py
+++ b/tensorflow/python/training/saver.py
@@ -91,17 +91,27 @@ class BaseSaverBuilder(object):
   class SaveSpec(object):
     """Class used to describe tensor slices that need to be saved."""
 
-    def __init__(self, tensor, slice_spec, name):
+    def __init__(self, tensor, slice_spec, name, dtype=None):
       """Creates a `SaveSpec` object.
 
       Args:
         tensor: the tensor to save or callable that produces a tensor to save.
         slice_spec: the slice to be saved. See `Variable.SaveSliceInfo`.
         name: the name to save the tensor under.
+        dtype: The data type of the Tensor. Required if `tensor` is callable.
+          Used for error checking in the restore op.
       """
       self._tensor = tensor
       self.slice_spec = slice_spec
       self.name = name
+      if callable(self._tensor):
+        if dtype is None:
+          raise AssertionError(
+              "When passing a callable `tensor` to a SaveSpec, an explicit "
+              "dtype must be provided.")
+        self.dtype = dtype
+      else:
+        self.dtype = tensor.dtype
 
     @property
     def tensor(self):
@@ -117,14 +127,27 @@ class BaseSaverBuilder(object):
         op: the "producer" object that this class wraps; it produces a list of
           tensors to save.  E.g., a "Variable" object saving its backing tensor.
         specs: a list of SaveSpec, each element of which describes one tensor to
-          save under this object.
+          save under this object. All Tensors must be on the same device.
         name: the name to save the object under.
       """
       self.op = op
       self.specs = specs
       self.name = name
-      # The device of this saveable. All tensors must be on the same device.
-      self.device = specs[0].tensor.device
+      self._device = None
+
+    @property
+    def device(self):
+      """The device for SaveSpec Tensors."""
+      # Note that SaveSpec.tensor runs Tensor-gathering ops when executing
+      # eagerly, making this call potentially very expensive.
+      #
+      # TODO(allenl): Consider another way to gather device information. Lower
+      # priority since this property isn't part of the normal save()/restore()
+      # workflow, but does come up when some alternative builders are passed to
+      # the Saver.
+      if self._device is None:
+        self._device = self.specs[0].tensor.device
+      return self._device
 
     def restore(self, restored_tensors, restored_shapes):
       """Restores this object from 'restored_tensors'.
@@ -148,7 +171,7 @@ class BaseSaverBuilder(object):
     """SaveableObject implementation that handles Variables."""
 
     def __init__(self, var, slice_spec, name):
-      spec = BaseSaverBuilder.SaveSpec(var, slice_spec, name)
+      spec = BaseSaverBuilder.SaveSpec(var, slice_spec, name, dtype=var.dtype)
       super(BaseSaverBuilder.VariableSaveable, self).__init__(var, [spec], name)
 
     def restore(self, restored_tensors, restored_shapes):
@@ -186,7 +209,8 @@ class BaseSaverBuilder(object):
         raise ValueError(
             "Saveable is neither a resource variable nor a read operation."
             " Got: %s" % repr(var))
-      spec = BaseSaverBuilder.SaveSpec(tensor, slice_spec, name)
+      spec = BaseSaverBuilder.SaveSpec(tensor, slice_spec, name,
+                                       dtype=var.dtype)
       super(BaseSaverBuilder.ResourceVariableSaveable, self).__init__(
           var, [spec], name)
 
@@ -295,7 +319,7 @@ class BaseSaverBuilder(object):
               filename_tensor,
               [spec.name],
               [spec.slice_spec],
-              [spec.tensor.dtype])[0])
+              [spec.dtype])[0])
 
     return tensors
   # pylint: enable=unused-argument
@@ -854,7 +878,7 @@ class BulkSaverBuilder(BaseSaverBuilder):
     restore_specs = []
     for saveable in saveables:
       for spec in saveable.specs:
-        restore_specs.append((spec.name, spec.slice_spec, spec.tensor.dtype))
+        restore_specs.append((spec.name, spec.slice_spec, spec.dtype))
 
     names, slices, dtypes = zip(*restore_specs)
     # Load all tensors onto CPU 0 for compatibility with existing code.
@@ -1924,12 +1948,22 @@ def import_meta_graph(meta_graph_or_file, clear_devices=False,
   else:
     meta_graph_def = meta_graph_or_file
 
-  meta_graph.import_scoped_meta_graph(meta_graph_def,
-                                      clear_devices=clear_devices,
-                                      import_scope=import_scope,
-                                      **kwargs)
+  imported_vars = meta_graph.import_scoped_meta_graph(
+      meta_graph_def,
+      clear_devices=clear_devices,
+      import_scope=import_scope,
+      **kwargs)
+
   if meta_graph_def.HasField("saver_def"):
-    return Saver(saver_def=meta_graph_def.saver_def, name=import_scope)
+    # Infer the scope that is prepended by `import_scoped_meta_graph`.
+    scope = import_scope
+    var_names = list(imported_vars.keys())
+    if var_names:
+      sample_key = var_names[0]
+      sample_var = imported_vars[sample_key]
+      scope = sample_var.name[:-len(sample_key)]
+
+    return Saver(saver_def=meta_graph_def.saver_def, name=scope)
   else:
     if variables._all_saveable_objects():  # pylint: disable=protected-access
       # Return the default saver instance for all graph variables.
diff --git a/tensorflow/python/training/saver_test.py b/tensorflow/python/training/saver_test.py
index 7de778f298e0fb0d62d45abdd280b673f1068213..14dda7997948ead7b12dee953a0b2ee3b2ee8fc9 100644
--- a/tensorflow/python/training/saver_test.py
+++ b/tensorflow/python/training/saver_test.py
@@ -2341,6 +2341,38 @@ class MetaGraphTest(test.TestCase):
               10, size=[1, 10])
       })
 
+  def testImportIntoImplicitNamescope(self):
+    # Test that we can import a meta graph into an implicit namescope.
+    test_dir = self._get_test_dir("import_into_namescope")
+    filename = os.path.join(test_dir, "ckpt")
+    image = array_ops.placeholder(dtypes.float32, [None, 784], name="image")
+    label = array_ops.placeholder(dtypes.float32, [None, 10], name="label")
+    with session.Session() as sess:
+      weights = variables.Variable(
+          random_ops.random_uniform([784, 10]), name="weights")
+      bias = variables.Variable(array_ops.zeros([10]), name="bias")
+      logit = nn_ops.relu(math_ops.matmul(image, weights) + bias, name="logits")
+      nn_ops.softmax(logit, name="prediction")
+      cost = nn_ops.softmax_cross_entropy_with_logits(labels=label,
+                                                      logits=logit, name="cost")
+      adam.AdamOptimizer().minimize(cost, name="optimize")
+      saver = saver_module.Saver()
+      sess.run(variables.global_variables_initializer())
+      saver.save(sess, filename)
+
+    graph = ops_lib.Graph()
+    with session.Session(graph=graph) as sess:
+      with ops_lib.name_scope("new_model"):
+        new_saver = saver_module.import_meta_graph(
+            filename + ".meta", graph=graph)
+
+      new_saver.restore(sess, filename)
+      sess.run(["new_model/optimize"], {
+          "new_model/image:0": np.random.random([1, 784]),
+          "new_model/label:0": np.random.randint(
+              10, size=[1, 10])
+      })
+
   def testClearDevicesOnImport(self):
     # Test that we import a graph without its devices and run successfully.
     with ops_lib.Graph().as_default():
@@ -2948,6 +2980,37 @@ class CheckpointableCompatibilityTests(test.TestCase):
       self.assertEqual(42., self.evaluate(v.non_dep_variable))
       self.assertEqual(42., self.evaluate(v.mirrored))
 
+  def testSingleTensorEvaluation(self):
+
+    class _CountingSaveable(saver_module.BaseSaverBuilder.SaveableObject):
+
+      def __init__(self, name):
+        self.eval_count = 0
+        def _tensor():
+          self.eval_count += 1
+          return constant_op.constant([1.])
+        dummy_op = constant_op.constant([2.])
+        super(_CountingSaveable, self).__init__(
+            dummy_op,
+            [saver_module.BaseSaverBuilder.SaveSpec(
+                _tensor, "", name, dtype=dummy_op.dtype)],
+            name)
+
+      def restore(self, restored_tensors, restored_shapes):
+        """Restore the same value into both variables."""
+        pass
+
+    with context.eager_mode():
+      v = _CountingSaveable("foo")
+      saver = saver_module.Saver(var_list=[v])
+      test_dir = self.get_temp_dir()
+      prefix = os.path.join(test_dir, "ckpt")
+      with self.test_session() as sess:
+        save_path = saver.save(sess, prefix)
+        self.assertEqual(1, v.eval_count)
+        saver.restore(sess, save_path)
+        self.assertEqual(1, v.eval_count)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/training/session_manager.py b/tensorflow/python/training/session_manager.py
index 360e02fb44c1062f71bb50449b9ef381510a9c69..a00ceb90211e371c3b2f2b32f2042d1556158595 100644
--- a/tensorflow/python/training/session_manager.py
+++ b/tensorflow/python/training/session_manager.py
@@ -229,10 +229,14 @@ class SessionManager(object):
     up to `max_wait_secs`, for recovery to succeed.
 
     If the model cannot be recovered successfully then it is initialized by
-    either running the provided `init_op`, or calling the provided `init_fn`.
-    The local_init_op is also run after init_op and init_fn, regardless of
+    running the `init_op` and calling `init_fn` if they are provided.
+    The `local_init_op` is also run after init_op and init_fn, regardless of
     whether the model was recovered successfully, but only if
-    ready_for_local_init_op passes.
+    `ready_for_local_init_op` passes.
+
+    If the model is recovered from a checkpoint it is assumed that all
+    global variables have been initialized, in particular neither `init_op`
+    nor `init_fn` will be executed.
 
     It is an error if the model cannot be recovered and no `init_op`
     or `init_fn` or `local_init_op` are passed.
diff --git a/tensorflow/python/training/slot_creator.py b/tensorflow/python/training/slot_creator.py
index 9ac52dd0715d7ed15e2e57ed286be973614b01e5..258a6f045d7c1b491ce00bdf8dd0ae6ad500ba68 100644
--- a/tensorflow/python/training/slot_creator.py
+++ b/tensorflow/python/training/slot_creator.py
@@ -40,12 +40,12 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.eager import context
-from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
+from tensorflow.python.training import distribute as distribute_lib
 
 
 def _create_slot_var(primary, val, scope, validate_shape, shape, dtype):
@@ -112,7 +112,8 @@ def create_slot(primary, val, name, colocate_with_primary=True):
     prefix = primary.op.name
   with variable_scope.variable_scope(None, prefix + "/" + name):
     if colocate_with_primary:
-      with ops.colocate_with(primary):
+      distribution_strategy = distribute_lib.get_distribution_strategy()
+      with distribution_strategy.colocate_vars_with(primary):
         return _create_slot_var(primary, val, "", validate_shape, None, None)
     else:
       return _create_slot_var(primary, val, "", validate_shape, None, None)
@@ -148,7 +149,8 @@ def create_slot_with_initializer(primary, initializer, shape, dtype, name,
     prefix = primary.op.name
   with variable_scope.variable_scope(None, prefix + "/" + name):
     if colocate_with_primary:
-      with ops.colocate_with(primary):
+      distribution_strategy = distribute_lib.get_distribution_strategy()
+      with distribution_strategy.colocate_vars_with(primary):
         return _create_slot_var(primary, initializer, "", validate_shape, shape,
                                 dtype)
     else:
diff --git a/tensorflow/python/util/nest.py b/tensorflow/python/util/nest.py
index 23c2c48f4b5a165bd6e356a6243b234619af1c4c..5622431bc9974fcc7f355472618ee8b59863764c 100644
--- a/tensorflow/python/util/nest.py
+++ b/tensorflow/python/util/nest.py
@@ -60,15 +60,7 @@ def _is_namedtuple(instance, strict=False):
   Returns:
     True if `instance` is a `namedtuple`.
   """
-  # Attemp to limit the test to plain namedtuple (not stuff inheriting from it).
-  if not isinstance(instance, tuple):
-    return False
-  if strict and instance.__class__.__base__ != tuple:
-    return False
-  return (
-      hasattr(instance, "_fields") and
-      isinstance(instance._fields, _collections.Sequence) and
-      all(isinstance(f, _six.string_types) for f in instance._fields))
+  return _pywrap_tensorflow.IsNamedtuple(instance, strict)
 
 
 def _sequence_like(instance, args):
@@ -157,76 +149,7 @@ def flatten(nest):
 
 def _same_namedtuples(nest1, nest2):
   """Returns True if the two namedtuples have the same name and fields."""
-  if nest1._fields != nest2._fields:
-    return False
-  if nest1.__class__.__name__ != nest2.__class__.__name__:
-    return False
-  return True
-
-
-def _recursive_assert_same_structure(nest1, nest2, check_types):
-  """Helper function for `assert_same_structure`.
-
-  See `assert_same_structure` for further information about namedtuples.
-
-  Args:
-    nest1: An arbitrarily nested structure.
-    nest2: An arbitrarily nested structure.
-    check_types: If `True` (default) types of sequences are checked as
-        well, including the keys of dictionaries. If set to `False`, for example
-        a list and a tuple of objects will look the same if they have the same
-        size. Note that namedtuples with identical name and fields are always
-        considered to have the same shallow structure.
-
-  Returns:
-    True if `nest1` and `nest2` have the same structure.
-
-  Raises:
-    ValueError: If the two structure don't have the same nested structre.
-    TypeError: If the two structure don't have the same sequence type.
-    ValueError: If the two dictionaries don't have the same set of keys.
-  """
-  is_sequence_nest1 = is_sequence(nest1)
-  if is_sequence_nest1 != is_sequence(nest2):
-    raise ValueError(
-        "The two structures don't have the same nested structure.\n\n"
-        "First structure: %s\n\nSecond structure: %s." % (nest1, nest2))
-
-  if not is_sequence_nest1:
-    return  # finished checking
-
-  if check_types:
-    type_nest1 = type(nest1)
-    type_nest2 = type(nest2)
-
-    # Duck-typing means that nest should be fine with two different namedtuples
-    # with identical name and fields.
-    if _is_namedtuple(nest1, True) and _is_namedtuple(nest2, True):
-      if not _same_namedtuples(nest1, nest2):
-        raise TypeError(
-            "The two namedtuples don't have the same sequence type. First "
-            "structure has type %s, while second structure has type %s."
-            % (type_nest1, type_nest2))
-    else:
-      if type_nest1 != type_nest2:
-        raise TypeError(
-            "The two structures don't have the same sequence type. First "
-            "structure has type %s, while second structure has type %s."
-            % (type_nest1, type_nest2))
-
-    if isinstance(nest1, dict):
-      keys1 = set(_six.iterkeys(nest1))
-      keys2 = set(_six.iterkeys(nest2))
-      if keys1 != keys2:
-        raise ValueError(
-            "The two dictionaries don't have the same set of keys. First "
-            "structure has keys {}, while second structure has keys {}."
-            .format(keys1, keys2))
-
-  nest1_as_sequence = [n for n in _yield_value(nest1)]
-  nest2_as_sequence = [n for n in _yield_value(nest2)]
-  for n1, n2 in zip(nest1_as_sequence, nest2_as_sequence):
-    _recursive_assert_same_structure(n1, n2, check_types)
+  return _pywrap_tensorflow.SameNamedtuples(nest1, nest2)
 
 
 def assert_same_structure(nest1, nest2, check_types=True):
@@ -257,14 +180,7 @@ def assert_same_structure(nest1, nest2, check_types=True):
     TypeError: If the two structures differ in the type of sequence in any of
       their substructures. Only possible if `check_types` is `True`.
   """
-  len_nest1 = len(flatten(nest1)) if is_sequence(nest1) else 1
-  len_nest2 = len(flatten(nest2)) if is_sequence(nest2) else 1
-  if len_nest1 != len_nest2:
-    raise ValueError("The two structures don't have the same number of "
-                     "elements.\n\nFirst structure (%i elements): %s\n\n"
-                     "Second structure (%i elements): %s"
-                     % (len_nest1, nest1, len_nest2, nest2))
-  _recursive_assert_same_structure(nest1, nest2, check_types)
+  _pywrap_tensorflow.AssertSameStructure(nest1, nest2, check_types)
 
 
 def flatten_dict_items(dictionary):
diff --git a/tensorflow/python/util/nest_test.py b/tensorflow/python/util/nest_test.py
index 4439d6241ea9607b194cbb17304dbb77dc9f57a8..2f12b25354a905b2aafa870c28f1e9c0b693e888 100644
--- a/tensorflow/python/util/nest_test.py
+++ b/tensorflow/python/util/nest_test.py
@@ -19,11 +19,14 @@ from __future__ import division
 from __future__ import print_function
 
 import collections
+import time
 
 import numpy as np
+from six.moves import xrange  # pylint: disable=redefined-builtin
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
@@ -32,6 +35,9 @@ from tensorflow.python.util import nest
 
 class NestTest(test.TestCase):
 
+  PointXY = collections.namedtuple("Point", ["x", "y"])  # pylint: disable=invalid-name
+
+  @test_util.assert_no_new_pyobjects_executing_eagerly
   def testFlattenAndPack(self):
     structure = ((3, 4), 5, (6, 7, (9, 10), 8))
     flat = ["a", "b", "c", "d", "e", "f", "g", "h"]
@@ -39,8 +45,8 @@ class NestTest(test.TestCase):
     self.assertEqual(
         nest.pack_sequence_as(structure, flat), (("a", "b"), "c",
                                                  ("d", "e", ("f", "g"), "h")))
-    point = collections.namedtuple("Point", ["x", "y"])
-    structure = (point(x=4, y=2), ((point(x=1, y=0),),))
+    structure = (NestTest.PointXY(x=4, y=2),
+                 ((NestTest.PointXY(x=1, y=0),),))
     flat = [4, 2, 1, 0]
     self.assertEqual(nest.flatten(structure), flat)
     restructured_from_flat = nest.pack_sequence_as(structure, flat)
@@ -66,6 +72,7 @@ class NestTest(test.TestCase):
     with self.assertRaises(ValueError):
       nest.pack_sequence_as([5, 6, [7, 8]], ["a", "b", "c"])
 
+  @test_util.assert_no_new_pyobjects_executing_eagerly
   def testFlattenDictOrder(self):
     """`flatten` orders dicts by key, including OrderedDicts."""
     ordered = collections.OrderedDict([("d", 3), ("b", 1), ("a", 0), ("c", 2)])
@@ -87,12 +94,14 @@ class NestTest(test.TestCase):
         ordered_reconstruction)
     self.assertEqual({"d": 3, "b": 1, "a": 0, "c": 2}, plain_reconstruction)
 
+  Abc = collections.namedtuple("A", ("b", "c"))  # pylint: disable=invalid-name
+
+  @test_util.assert_no_new_pyobjects_executing_eagerly
   def testFlattenAndPack_withDicts(self):
     # A nice messy mix of tuples, lists, dicts, and `OrderedDict`s.
-    named_tuple = collections.namedtuple("A", ("b", "c"))
     mess = [
         "z",
-        named_tuple(3, 4),
+        NestTest.Abc(3, 4),
         {
             "c": [
                 1,
@@ -111,7 +120,7 @@ class NestTest(test.TestCase):
 
     structure_of_mess = [
         14,
-        named_tuple("a", True),
+        NestTest.Abc("a", True),
         {
             "c": [
                 0,
@@ -157,6 +166,7 @@ class NestTest(test.TestCase):
       nest.pack_sequence_as(["hello", "world"],
                             ["and", "goodbye", "again"])
 
+  @test_util.assert_no_new_pyobjects_executing_eagerly
   def testIsSequence(self):
     self.assertFalse(nest.is_sequence("1234"))
     self.assertTrue(nest.is_sequence([1, 3, [4, 5]]))
@@ -186,6 +196,23 @@ class NestTest(test.TestCase):
         ValueError, "Key had [0-9]* elements, but value had [0-9]* elements"):
       nest.flatten_dict_items(another_bad_dictionary)
 
+  # pylint does not correctly recognize these as class names and
+  # suggests to use variable style under_score naming.
+  # pylint: disable=invalid-name
+  Named0ab = collections.namedtuple("named_0", ("a", "b"))
+  Named1ab = collections.namedtuple("named_1", ("a", "b"))
+  SameNameab = collections.namedtuple("same_name", ("a", "b"))
+  SameNameab2 = collections.namedtuple("same_name", ("a", "b"))
+  SameNamexy = collections.namedtuple("same_name", ("x", "y"))
+  SameName1xy = collections.namedtuple("same_name_1", ("x", "y"))
+  SameName1xy2 = collections.namedtuple("same_name_1", ("x", "y"))
+  NotSameName = collections.namedtuple("not_same_name", ("a", "b"))
+  # pylint: enable=invalid-name
+
+  class SameNamedType1(SameNameab):
+    pass
+
+  @test_util.assert_no_new_pyobjects_executing_eagerly
   def testAssertSameStructure(self):
     structure1 = (((1, 2), 3), 4, (5, 6))
     structure2 = ((("foo1", "foo2"), "foo3"), "foo4", ("foo5", "foo6"))
@@ -198,23 +225,32 @@ class NestTest(test.TestCase):
 
     with self.assertRaisesRegexp(
         ValueError,
-        ("don't have the same number of elements\\.\n\n"
-         "First structure \\(6 elements\\):.*?"
-         "\n\nSecond structure \\(2 elements\\):")):
+        ("The two structures don't have the same nested structure\\.\n\n"
+         "First structure:.*?\n\n"
+         "Second structure:.*\n\n"
+         "More specifically: Substructure "
+         r'"type=tuple str=\(\(1, 2\), 3\)" is a sequence, while '
+         'substructure "type=str str=spam" is not')):
       nest.assert_same_structure(structure1, structure_different_num_elements)
 
     with self.assertRaisesRegexp(
         ValueError,
-        ("don't have the same number of elements\\.\n\n"
-         "First structure \\(2 elements\\):.*?"
-         "\n\nSecond structure \\(1 elements\\):")):
+        ("The two structures don't have the same nested structure\\.\n\n"
+         "First structure:.*?\n\n"
+         "Second structure:.*\n\n"
+         r'More specifically: Substructure "type=list str=\[0, 1\]" '
+         r'is a sequence, while substructure "type=ndarray str=\[0 1\]" '
+         "is not")):
       nest.assert_same_structure([0, 1], np.array([0, 1]))
 
     with self.assertRaisesRegexp(
         ValueError,
-        ("don't have the same number of elements\\.\n\n"
-         "First structure \\(1 elements\\):.*"
-         "\n\nSecond structure \\(2 elements\\):")):
+        ("The two structures don't have the same nested structure\\.\n\n"
+         "First structure:.*?\n\n"
+         "Second structure:.*\n\n"
+         r'More specifically: Substructure "type=list str=\[0, 1\]" '
+         'is a sequence, while substructure "type=int str=0" '
+         "is not")):
       nest.assert_same_structure(0, [0, 1])
 
     self.assertRaises(TypeError, nest.assert_same_structure, (0, 1), [0, 1])
@@ -225,21 +261,21 @@ class NestTest(test.TestCase):
          "First structure: .*?\n\nSecond structure: ")):
       nest.assert_same_structure(structure1, structure_different_nesting)
 
-    named_type_0 = collections.namedtuple("named_0", ("a", "b"))
-    named_type_1 = collections.namedtuple("named_1", ("a", "b"))
     self.assertRaises(TypeError, nest.assert_same_structure, (0, 1),
-                      named_type_0("a", "b"))
+                      NestTest.Named0ab("a", "b"))
 
-    nest.assert_same_structure(named_type_0(3, 4), named_type_0("a", "b"))
+    nest.assert_same_structure(NestTest.Named0ab(3, 4),
+                               NestTest.Named0ab("a", "b"))
 
     self.assertRaises(TypeError, nest.assert_same_structure,
-                      named_type_0(3, 4), named_type_1(3, 4))
+                      NestTest.Named0ab(3, 4), NestTest.Named1ab(3, 4))
 
     with self.assertRaisesRegexp(
         ValueError,
         ("don't have the same nested structure\\.\n\n"
          "First structure: .*?\n\nSecond structure: ")):
-      nest.assert_same_structure(named_type_0(3, 4), named_type_0([3], 4))
+      nest.assert_same_structure(NestTest.Named0ab(3, 4),
+                                 NestTest.Named0ab([3], 4))
 
     with self.assertRaisesRegexp(
         ValueError,
@@ -258,36 +294,33 @@ class NestTest(test.TestCase):
                                  "don't have the same set of keys"):
       nest.assert_same_structure({"a": 1}, {"b": 1})
 
-    same_name_type_0 = collections.namedtuple("same_name", ("a", "b"))
-    same_name_type_1 = collections.namedtuple("same_name", ("a", "b"))
-    nest.assert_same_structure(same_name_type_0(0, 1), same_name_type_1(2, 3))
+    nest.assert_same_structure(NestTest.SameNameab(0, 1),
+                               NestTest.SameNameab2(2, 3))
 
     # This assertion is expected to pass: two namedtuples with the same
     # name and field names are considered to be identical.
-    same_name_type_2 = collections.namedtuple("same_name_1", ("x", "y"))
-    same_name_type_3 = collections.namedtuple("same_name_1", ("x", "y"))
     nest.assert_same_structure(
-        same_name_type_0(same_name_type_2(0, 1), 2),
-        same_name_type_1(same_name_type_3(2, 3), 4))
+        NestTest.SameNameab(NestTest.SameName1xy(0, 1), 2),
+        NestTest.SameNameab2(NestTest.SameName1xy2(2, 3), 4))
 
     expected_message = "The two structures don't have the same.*"
     with self.assertRaisesRegexp(ValueError, expected_message):
-      nest.assert_same_structure(same_name_type_0(0, same_name_type_1(1, 2)),
-                                 same_name_type_1(same_name_type_0(0, 1), 2))
+      nest.assert_same_structure(
+          NestTest.SameNameab(0, NestTest.SameNameab2(1, 2)),
+          NestTest.SameNameab2(NestTest.SameNameab(0, 1), 2))
 
-    same_name_type_1 = collections.namedtuple("not_same_name", ("a", "b"))
     self.assertRaises(TypeError, nest.assert_same_structure,
-                      same_name_type_0(0, 1), same_name_type_1(2, 3))
+                      NestTest.SameNameab(0, 1), NestTest.NotSameName(2, 3))
 
-    same_name_type_1 = collections.namedtuple("same_name", ("x", "y"))
     self.assertRaises(TypeError, nest.assert_same_structure,
-                      same_name_type_0(0, 1), same_name_type_1(2, 3))
+                      NestTest.SameNameab(0, 1), NestTest.SameNamexy(2, 3))
 
-    class SameNamedType1(collections.namedtuple("same_name", ("a", "b"))):
-      pass
     self.assertRaises(TypeError, nest.assert_same_structure,
-                      same_name_type_0(0, 1), SameNamedType1(2, 3))
+                      NestTest.SameNameab(0, 1), NestTest.SameNamedType1(2, 3))
 
+  EmptyNT = collections.namedtuple("empty_nt", "")  # pylint: disable=invalid-name
+
+  @test_util.assert_no_new_pyobjects_executing_eagerly
   def testMapStructure(self):
     structure1 = (((1, 2), 3), 4, (5, 6))
     structure2 = (((7, 8), 9), 10, (11, 12))
@@ -310,9 +343,8 @@ class NestTest(test.TestCase):
     self.assertEqual((), nest.map_structure(lambda x: x + 1, ()))
     self.assertEqual([], nest.map_structure(lambda x: x + 1, []))
     self.assertEqual({}, nest.map_structure(lambda x: x + 1, {}))
-    empty_nt = collections.namedtuple("empty_nt", "")
-    self.assertEqual(empty_nt(), nest.map_structure(lambda x: x + 1,
-                                                    empty_nt()))
+    self.assertEqual(NestTest.EmptyNT(), nest.map_structure(lambda x: x + 1,
+                                                            NestTest.EmptyNT()))
 
     # This is checking actual equality of types, empty list != empty tuple
     self.assertNotEqual((), nest.map_structure(lambda x: x + 1, []))
@@ -352,10 +384,12 @@ class NestTest(test.TestCase):
     with self.assertRaisesRegexp(ValueError, "Only valid keyword argument"):
       nest.map_structure(lambda x: None, structure1, check_types=False, foo="a")
 
+  ABTuple = collections.namedtuple("ab_tuple", "a, b")  # pylint: disable=invalid-name
+
+  @test_util.assert_no_new_pyobjects_executing_eagerly
   def testMapStructureWithStrings(self):
-    ab_tuple = collections.namedtuple("ab_tuple", "a, b")
-    inp_a = ab_tuple(a="foo", b=("bar", "baz"))
-    inp_b = ab_tuple(a=2, b=(1, 3))
+    inp_a = NestTest.ABTuple(a="foo", b=("bar", "baz"))
+    inp_b = NestTest.ABTuple(a=2, b=(1, 3))
     out = nest.map_structure(lambda string, repeats: string * repeats,
                              inp_a,
                              inp_b)
@@ -363,8 +397,8 @@ class NestTest(test.TestCase):
     self.assertEqual("bar", out.b[0])
     self.assertEqual("bazbazbaz", out.b[1])
 
-    nt = ab_tuple(a=("something", "something_else"),
-                  b="yet another thing")
+    nt = NestTest.ABTuple(a=("something", "something_else"),
+                          b="yet another thing")
     rev_nt = nest.map_structure(lambda x: x[::-1], nt)
     # Check the output is the correct structure, and all strings are reversed.
     nest.assert_same_structure(nt, rev_nt)
@@ -431,10 +465,8 @@ class NestTest(test.TestCase):
 
     # This assertion is expected to pass: two namedtuples with the same
     # name and field names are considered to be identical.
-    same_name_type_0 = collections.namedtuple("same_name", ("a", "b"))
-    same_name_type_1 = collections.namedtuple("same_name", ("a", "b"))
-    inp_shallow = same_name_type_0(1, 2)
-    inp_deep = same_name_type_1(1, [1, 2, 3])
+    inp_shallow = NestTest.SameNameab(1, 2)
+    inp_deep = NestTest.SameNameab2(1, [1, 2, 3])
     nest.assert_shallow_structure(inp_shallow, inp_deep, check_types=False)
     nest.assert_shallow_structure(inp_shallow, inp_deep, check_types=True)
 
@@ -466,7 +498,7 @@ class NestTest(test.TestCase):
                      [1, {"c": 2}, 3, (4, 5)])
 
     # Namedtuples.
-    ab_tuple = collections.namedtuple("ab_tuple", "a, b")
+    ab_tuple = NestTest.ABTuple
     input_tree = ab_tuple(a=[0, 1], b=2)
     shallow_tree = ab_tuple(a=0, b=1)
     input_tree_flattened_as_shallow_tree = nest.flatten_up_to(shallow_tree,
@@ -681,5 +713,31 @@ class NestTest(test.TestCase):
           list(nest.flatten_with_joined_string_paths(inputs)), expected)
 
 
+class NestBenchmark(test.Benchmark):
+
+  def run_and_report(self, s1, s2, name):
+    burn_iter, test_iter = 100, 30000
+
+    for _ in xrange(burn_iter):
+      nest.assert_same_structure(s1, s2)
+
+    t0 = time.time()
+    for _ in xrange(test_iter):
+      nest.assert_same_structure(s1, s2)
+    t1 = time.time()
+
+    self.report_benchmark(iters=test_iter, wall_time=(t1 - t0) / test_iter,
+                          name=name)
+
+  def benchmark_assert_structure(self):
+    s1 = (((1, 2), 3), 4, (5, 6))
+    s2 = ((("foo1", "foo2"), "foo3"), "foo4", ("foo5", "foo6"))
+    self.run_and_report(s1, s2, "assert_same_structure_6_elem")
+
+    s1 = (((1, 2), 3), 4, (5, 6)) * 10
+    s2 = ((("foo1", "foo2"), "foo3"), "foo4", ("foo5", "foo6")) * 10
+    self.run_and_report(s1, s2, "assert_same_structure_60_elem")
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/util/tf_inspect.py b/tensorflow/python/util/tf_inspect.py
index 4ab8a72a83b466c38c50b1c76004e7a6fe942a04..286028b8bbe6bc89b9d7e479a8f1b5c06d7ba5cb 100644
--- a/tensorflow/python/util/tf_inspect.py
+++ b/tensorflow/python/util/tf_inspect.py
@@ -18,12 +18,22 @@ from __future__ import division
 from __future__ import print_function
 
 import inspect as _inspect
+import six
+from collections import namedtuple
 
 from tensorflow.python.util import tf_decorator
 
 ArgSpec = _inspect.ArgSpec
 
 
+if six.PY3:
+  FullArgSpec = _inspect.FullArgSpec
+else:
+  FullArgSpec = namedtuple(
+      'FullArgSpec', ['args', 'varargs', 'varkw', 'defaults',
+                      'kwonlyargs', 'kwonlydefaults', 'annotations'])
+
+
 def currentframe():
   """TFDecorator-aware replacement for inspect.currentframe."""
   return _inspect.stack()[1][0]
@@ -55,13 +65,26 @@ def getfullargspec(obj):  # pylint: disable=redefined-builtin
     obj: A callable, possibly decorated.
 
   Returns:
-    The `FullArgSpec` (`ArgSpec` in Python 2) that describes the signature of
+    The `FullArgSpec` that describes the signature of
     the outermost decorator that changes the callable's signature. If the
-    callable is not decorated, `inspect.getfullargspec()`
-    (`inspect.getargspec()` in Python 2) will be called directly on the
-    callable.
+    callable is not decorated, `inspect.getfullargspec()` will be called
+    directly on the callable.
   """
-  spec_fn = getattr(_inspect, 'getfullargspec', getattr(_inspect, 'getargspec'))
+  if six.PY2:
+    def spec_fn(target):
+      argspecs = _inspect.getargspec(target)
+      fullargspecs = FullArgSpec(
+          args=argspecs.args,
+          varargs=argspecs.varargs,
+          varkw=argspecs.keywords,
+          defaults=argspecs.defaults,
+          kwonlyargs=[],
+          kwonlydefaults=None,
+          annotations={})
+      return fullargspecs
+  else:
+    spec_fn = _inspect.getfullargspec
+
   decorators, target = tf_decorator.unwrap(obj)
   return next((d.decorator_argspec for d in decorators
                if d.decorator_argspec is not None), spec_fn(target))
diff --git a/tensorflow/python/util/util.cc b/tensorflow/python/util/util.cc
index a41fa7df253bcf4bce280574b89ed0dda8330521..70aee4a3f663c862ecb09444866a0294333ee27a 100644
--- a/tensorflow/python/util/util.cc
+++ b/tensorflow/python/util/util.cc
@@ -16,6 +16,7 @@ limitations under the License.
 
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/logging.h"
+#include "tensorflow/python/lib/core/safe_ptr.h"
 
 namespace tensorflow {
 namespace swig {
@@ -27,6 +28,113 @@ PyObject* CollectionsSequenceType = nullptr;
 
 bool WarnedThatSetIsNotSequence = false;
 
+bool IsString(PyObject* o) {
+  return PyBytes_Check(o) ||
+#if PY_MAJOR_VERSION < 3
+         PyString_Check(o) ||
+#endif
+         PyUnicode_Check(o);
+}
+
+// Equivalent to Python's 'o.__class__.__name__'
+// Note that '__class__' attribute is set only in new-style classes.
+// A lot of tensorflow code uses __class__ without checks, so it seems like
+// we only support new-style classes.
+StringPiece GetClassName(PyObject* o) {
+  // __class__ is equivalent to type() for new style classes.
+  // type() is equivalent to PyObject_Type()
+  // (https://docs.python.org/3.5/c-api/object.html#c.PyObject_Type)
+  // PyObject_Type() is equivalent to o->ob_type except for Py_INCREF, which
+  // we don't need here.
+  PyTypeObject* type = o->ob_type;
+
+  // __name__ is the value of `tp_name` after the last '.'
+  // (https://docs.python.org/2/c-api/typeobj.html#c.PyTypeObject.tp_name)
+  StringPiece name(type->tp_name);
+  size_t pos = name.rfind('.');
+  if (pos != StringPiece::npos) {
+    name.remove_prefix(pos + 1);
+  }
+  return name;
+}
+
+string PyObjectToString(PyObject* o) {
+  if (o == nullptr) {
+    return "<null object>";
+  }
+  PyObject* str = PyObject_Str(o);
+  if (str) {
+#if PY_MAJOR_VERSION < 3
+    string s(PyString_AS_STRING(str));
+#else
+    string s(PyUnicode_AsUTF8(str));
+#endif
+    Py_DECREF(str);
+    return tensorflow::strings::StrCat("type=", GetClassName(o), " str=", s);
+  } else {
+    return "<failed to execute str() on object>";
+  }
+}
+
+// Implements the same idea as tensorflow.util.nest._yield_value
+// During construction we check if the iterable is a dictionary.
+// If so, we construct a sequence from its sorted keys that will be used
+// for iteration.
+// If not, we construct a sequence directly from the iterable.
+// At each step, we get the next element from the sequence and use it
+// either as a key or return it directly.
+//
+// 'iterable' must not be modified while ValIterator is used.
+class ValIterator {
+ public:
+  explicit ValIterator(PyObject* iterable) : dict_(nullptr), index_(0) {
+    if (PyDict_Check(iterable)) {
+      dict_ = iterable;
+      // PyDict_Keys returns a list, which can be used with
+      // PySequence_Fast_GET_ITEM.
+      seq_ = PyDict_Keys(iterable);
+      // Iterate through dictionaries in a deterministic order by sorting the
+      // keys. Notice this means that we ignore the original order of
+      // `OrderedDict` instances. This is intentional, to avoid potential
+      // bugs caused by mixing ordered and plain dicts (e.g., flattening
+      // a dict but using a corresponding `OrderedDict` to pack it back).
+      PyList_Sort(seq_);
+    } else {
+      seq_ = PySequence_Fast(iterable, "");
+    }
+    size_ = PySequence_Fast_GET_SIZE(seq_);
+  }
+
+  ~ValIterator() { Py_DECREF(seq_); }
+
+  // Return a borrowed reference to the next element from iterable.
+  // Return nullptr when iteration is over.
+  PyObject* next() {
+    PyObject* element = nullptr;
+    if (index_ < size_) {
+      // Both PySequence_Fast_GET_ITEM and PyDict_GetItem return borrowed
+      // references.
+      element = PySequence_Fast_GET_ITEM(seq_, index_);
+      ++index_;
+      if (dict_ != nullptr) {
+        element = PyDict_GetItem(dict_, element);
+        if (element == nullptr) {
+          PyErr_SetString(PyExc_RuntimeError,
+                          "Dictionary was modified during iteration over it");
+          return nullptr;
+        }
+      }
+    }
+    return element;
+  }
+
+ private:
+  PyObject* seq_;
+  PyObject* dict_;
+  Py_ssize_t size_;
+  Py_ssize_t index_;
+};
+
 // Returns 1 if `o` is considered a sequence for the purposes of Flatten().
 // Returns 0 otherwise.
 // Returns -1 if an error occurred.
@@ -38,7 +146,7 @@ int IsSequenceHelper(PyObject* o) {
                     "so consider avoiding using them.";
     WarnedThatSetIsNotSequence = true;
   }
-  if (CollectionsSequenceType == nullptr) {
+  if (TF_PREDICT_FALSE(CollectionsSequenceType == nullptr)) {
     PyErr_SetString(
         PyExc_RuntimeError,
         tensorflow::strings::StrCat(
@@ -49,11 +157,7 @@ int IsSequenceHelper(PyObject* o) {
   }
   int is_instance = PyObject_IsInstance(o, CollectionsSequenceType);
   if (is_instance == -1) return -1;
-  return static_cast<int>(is_instance != 0 && !PyBytes_Check(o) &&
-#if PY_MAJOR_VERSION < 3
-                          !PyString_Check(o) &&
-#endif
-                          !PyUnicode_Check(o));
+  return static_cast<int>(is_instance != 0 && !IsString(o));
 }
 
 bool FlattenHelper(PyObject* nested, PyObject* list) {
@@ -75,12 +179,16 @@ bool FlattenHelper(PyObject* nested, PyObject* list) {
       // while the method is running.
       PyObject* key = PyList_GET_ITEM(keys, i);
       PyObject* val = PyDict_GetItem(nested, key);
-      if (Py_EnterRecursiveCall(" in Flatten")) {
+      if (Py_EnterRecursiveCall(" in flatten")) {
         Py_DECREF(keys);
         return false;
       }
-      FlattenHelper(val, list);
+      const bool success = FlattenHelper(val, list);
       Py_LeaveRecursiveCall();
+      if (!success) {
+        Py_DECREF(keys);
+        return false;
+      }
     }
     Py_DECREF(keys);
     return true;
@@ -90,13 +198,159 @@ bool FlattenHelper(PyObject* nested, PyObject* list) {
   PyObject* item;
   PyObject* iterator = PyObject_GetIter(nested);
   while ((item = PyIter_Next(iterator)) != nullptr) {
-    FlattenHelper(item, list);
+    if (Py_EnterRecursiveCall(" in flatten")) {
+      Py_DECREF(iterator);
+      Py_DECREF(item);
+      return false;
+    }
+    bool success = FlattenHelper(item, list);
+    Py_LeaveRecursiveCall();
+    if (!success) {
+      Py_DECREF(iterator);
+      Py_DECREF(item);
+      return false;
+    }
     Py_DECREF(item);
   }
   Py_DECREF(iterator);
   return true;
 }
 
+// Sets error using keys of 'dict1' and 'dict2'.
+// 'dict1' and 'dict2' are assumed to be Python dictionaries.
+void SetDifferentKeysError(PyObject* dict1, PyObject* dict2, string* error_msg,
+                           bool* is_type_error) {
+  PyObject* k1 = PyDict_Keys(dict1);
+  PyObject* k2 = PyDict_Keys(dict2);
+  *is_type_error = false;
+  *error_msg = tensorflow::strings::StrCat(
+      "The two dictionaries don't have the same set of keys. "
+      "First structure has keys ",
+      PyObjectToString(k1), ", while second structure has keys ",
+      PyObjectToString(k2));
+  Py_DECREF(k1);
+  Py_DECREF(k2);
+}
+
+// Returns true iff there were no "internal" errors. In other words,
+// errors that has nothing to do with structure checking.
+// If an "internal" error occured, the appropriate Python error will be
+// set and the caller can propage it directly to the user.
+//
+// Both `error_msg` and `is_type_error` must be non-null. `error_msg` must
+// be empty.
+// Leaves `error_msg` empty if structures matched. Else, fills `error_msg`
+// with appropriate error and sets `is_type_error` to true iff
+// the error to be raised should be TypeError.
+bool AssertSameStructureHelper(PyObject* o1, PyObject* o2, bool check_types,
+                               string* error_msg, bool* is_type_error) {
+  DCHECK(error_msg);
+  DCHECK(is_type_error);
+  const bool is_seq1 = IsSequence(o1);
+  const bool is_seq2 = IsSequence(o2);
+  if (PyErr_Occurred()) return false;
+  if (is_seq1 != is_seq2) {
+    string seq_str = is_seq1 ? PyObjectToString(o1) : PyObjectToString(o2);
+    string non_seq_str = is_seq1 ? PyObjectToString(o2) : PyObjectToString(o1);
+    *is_type_error = false;
+    *error_msg = tensorflow::strings::StrCat(
+        "Substructure \"", seq_str, "\" is a sequence, while substructure \"",
+        non_seq_str, "\" is not");
+    return true;
+  }
+
+  // Got to scalars, so finished checking. Structures are the same.
+  if (!is_seq1) return true;
+
+  if (check_types) {
+    const PyTypeObject* type1 = o1->ob_type;
+    const PyTypeObject* type2 = o2->ob_type;
+
+    // We treat two different namedtuples with identical name and fields
+    // as having the same type.
+    const PyObject* o1_tuple = IsNamedtuple(o1, true);
+    if (o1_tuple == nullptr) return false;
+    const PyObject* o2_tuple = IsNamedtuple(o2, true);
+    if (o2_tuple == nullptr) {
+      Py_DECREF(o1_tuple);
+      return false;
+    }
+    bool both_tuples = o1_tuple == Py_True && o2_tuple == Py_True;
+    Py_DECREF(o1_tuple);
+    Py_DECREF(o2_tuple);
+
+    if (both_tuples) {
+      const PyObject* same_tuples = SameNamedtuples(o1, o2);
+      if (same_tuples == nullptr) return false;
+      bool not_same_tuples = same_tuples != Py_True;
+      Py_DECREF(same_tuples);
+      if (not_same_tuples) {
+        *is_type_error = true;
+        *error_msg = tensorflow::strings::StrCat(
+            "The two namedtuples don't have the same sequence type. "
+            "First structure ",
+            PyObjectToString(o1), " has type ", type1->tp_name,
+            ", while second structure ", PyObjectToString(o2), " has type ",
+            type2->tp_name);
+        return true;
+      }
+    } else if (type1 != type2) {
+      *is_type_error = true;
+      *error_msg = tensorflow::strings::StrCat(
+          "The two namedtuples don't have the same sequence type. "
+          "First structure ",
+          PyObjectToString(o1), " has type ", type1->tp_name,
+          ", while second structure ", PyObjectToString(o2), " has type ",
+          type2->tp_name);
+      return true;
+    }
+
+    if (PyDict_Check(o1)) {
+      if (PyDict_Size(o1) != PyDict_Size(o2)) {
+        SetDifferentKeysError(o1, o2, error_msg, is_type_error);
+        return true;
+      }
+
+      PyObject* key;
+      Py_ssize_t pos = 0;
+      while (PyDict_Next(o1, &pos, &key, nullptr)) {
+        if (PyDict_GetItem(o2, key) == nullptr) {
+          SetDifferentKeysError(o1, o2, error_msg, is_type_error);
+          return true;
+        }
+      }
+    }
+  }
+
+  ValIterator iter1(o1);
+  ValIterator iter2(o2);
+
+  while (true) {
+    PyObject* v1 = iter1.next();
+    PyObject* v2 = iter2.next();
+    if (v1 != nullptr && v2 != nullptr) {
+      if (Py_EnterRecursiveCall(" in assert_same_structure")) {
+        return false;
+      }
+      bool no_internal_errors = AssertSameStructureHelper(
+          v1, v2, check_types, error_msg, is_type_error);
+      Py_LeaveRecursiveCall();
+      if (!no_internal_errors) return false;
+      if (!error_msg->empty()) return true;
+    } else if (v1 == nullptr && v2 == nullptr) {
+      // Done with all recursive calls. Structure matched.
+      return true;
+    } else {
+      *is_type_error = false;
+      *error_msg = tensorflow::strings::StrCat(
+          "The two structures don't have the same number of elements. ",
+          "First structure: ", PyObjectToString(o1),
+          ". Second structure: ", PyObjectToString(o2));
+      return true;
+    }
+  }
+}
+
 }  // anonymous namespace
 
 void RegisterSequenceClass(PyObject* sequence_class) {
@@ -123,5 +377,107 @@ PyObject* Flatten(PyObject* nested) {
     return nullptr;
   }
 }
+
+PyObject* IsNamedtuple(PyObject* o, bool strict) {
+  // Must be subclass of tuple
+  if (!PyTuple_Check(o)) {
+    Py_RETURN_FALSE;
+  }
+
+  // If strict, o.__class__.__base__ must be tuple
+  if (strict) {
+    PyObject* klass = PyObject_GetAttrString(o, "__class__");
+    if (klass == nullptr) return nullptr;
+    PyObject* base = PyObject_GetAttrString(klass, "__base__");
+    Py_DECREF(klass);
+    if (base == nullptr) return nullptr;
+
+    const PyTypeObject* base_type = reinterpret_cast<PyTypeObject*>(base);
+    // built-in object types are singletons
+    bool tuple_base = base_type == &PyTuple_Type;
+    Py_DECREF(base);
+    if (!tuple_base) {
+      Py_RETURN_FALSE;
+    }
+  }
+
+  if (TF_PREDICT_FALSE(CollectionsSequenceType == nullptr)) {
+    PyErr_SetString(
+        PyExc_RuntimeError,
+        tensorflow::strings::StrCat(
+            "collections.Sequence type has not been set. "
+            "Please call RegisterSequenceClass before using this module")
+            .c_str());
+    return nullptr;
+  }
+
+  // o must have attribute '_fields' and every element in
+  // '_fields' must be a string.
+  int has_fields = PyObject_HasAttrString(o, "_fields");
+  if (!has_fields) {
+    Py_RETURN_FALSE;
+  }
+
+  Safe_PyObjectPtr fields = make_safe(PyObject_GetAttrString(o, "_fields"));
+  int is_instance = PyObject_IsInstance(fields.get(), CollectionsSequenceType);
+  if (is_instance == 0) {
+    Py_RETURN_FALSE;
+  } else if (is_instance == -1) {
+    return nullptr;
+  }
+
+  Safe_PyObjectPtr seq = make_safe(PySequence_Fast(fields.get(), ""));
+  const Py_ssize_t s = PySequence_Fast_GET_SIZE(seq.get());
+  for (Py_ssize_t i = 0; i < s; ++i) {
+    // PySequence_Fast_GET_ITEM returns borrowed ref
+    PyObject* elem = PySequence_Fast_GET_ITEM(seq.get(), i);
+    if (!IsString(elem)) {
+      Py_RETURN_FALSE;
+    }
+  }
+
+  Py_RETURN_TRUE;
+}
+
+PyObject* SameNamedtuples(PyObject* o1, PyObject* o2) {
+  PyObject* f1 = PyObject_GetAttrString(o1, "_fields");
+  PyObject* f2 = PyObject_GetAttrString(o2, "_fields");
+  if (f1 == nullptr || f2 == nullptr) {
+    Py_XDECREF(f1);
+    Py_XDECREF(f2);
+    PyErr_SetString(
+        PyExc_RuntimeError,
+        "Expected namedtuple-like objects (that have _fields attr)");
+    return nullptr;
+  }
+
+  if (PyObject_RichCompareBool(f1, f2, Py_NE)) {
+    Py_RETURN_FALSE;
+  }
+
+  if (GetClassName(o1).compare(GetClassName(o2)) == 0) {
+    Py_RETURN_TRUE;
+  } else {
+    Py_RETURN_FALSE;
+  }
+}
+
+PyObject* AssertSameStructure(PyObject* o1, PyObject* o2, bool check_types) {
+  string error_msg;
+  bool is_type_error = false;
+  AssertSameStructureHelper(o1, o2, check_types, &error_msg, &is_type_error);
+  if (!error_msg.empty()) {
+    PyErr_SetString(
+        is_type_error ? PyExc_TypeError : PyExc_ValueError,
+        tensorflow::strings::StrCat(
+            "The two structures don't have the same nested structure.\n\n",
+            "First structure: ", PyObjectToString(o1), "\n\nSecond structure: ",
+            PyObjectToString(o2), "\n\nMore specifically: ", error_msg)
+            .c_str());
+    return nullptr;
+  }
+  Py_RETURN_NONE;
+}
+
 }  // namespace swig
 }  // namespace tensorflow
diff --git a/tensorflow/python/util/util.h b/tensorflow/python/util/util.h
index 2af71dc753760e7efaf28cc500d5296a31957a04..c325baa5f86820846dd09780b4208667f3aad5e1 100644
--- a/tensorflow/python/util/util.h
+++ b/tensorflow/python/util/util.h
@@ -33,6 +33,57 @@ namespace swig {
 //   dict.
 bool IsSequence(PyObject* o);
 
+// Implements the same interface as tensorflow.util.nest._is_namedtuple
+// Returns Py_True iff `instance` should be considered a `namedtuple`.
+//
+// Args:
+//   instance: An instance of a Python object.
+//   strict: If True, `instance` is considered to be a `namedtuple` only if
+//       it is a "plain" namedtuple. For instance, a class inheriting
+//       from a `namedtuple` will be considered to be a `namedtuple`
+//       iff `strict=False`.
+//
+// Returns:
+//   True if `instance` is a `namedtuple`.
+PyObject* IsNamedtuple(PyObject* o, bool strict);
+
+// Implements the same interface as tensorflow.util.nest._same_namedtuples
+// Returns Py_True iff the two namedtuples have the same name and fields.
+// Raises RuntimeError if `o1` or `o2` don't look like namedtuples (don't have
+// '_fields' attribute).
+PyObject* SameNamedtuples(PyObject* o1, PyObject* o2);
+
+// Asserts that two structures are nested in the same way.
+//
+// Note that namedtuples with identical name and fields are always considered
+// to have the same shallow structure (even with `check_types=True`).
+// For intance, this code will print `True`:
+//
+// ```python
+// def nt(a, b):
+//   return collections.namedtuple('foo', 'a b')(a, b)
+// print(assert_same_structure(nt(0, 1), nt(2, 3)))
+// ```
+//
+// Args:
+//  nest1: an arbitrarily nested structure.
+//  nest2: an arbitrarily nested structure.
+//  check_types: if `true`, types of sequences are checked as
+//      well, including the keys of dictionaries. If set to `false`, for example
+//      a list and a tuple of objects will look the same if they have the same
+//      size. Note that namedtuples with identical name and fields are always
+//      considered to have the same shallow structure.
+//
+// Raises:
+//  ValueError: If the two structures do not have the same number of elements or
+//    if the two structures are not nested in the same way.
+//  TypeError: If the two structures differ in the type of sequence in any of
+//    their substructures. Only possible if `check_types` is `True`.
+//
+// Returns:
+//  Py_None on success, nullptr on error.
+PyObject* AssertSameStructure(PyObject* o1, PyObject* o2, bool check_types);
+
 // Implements the same interface as tensorflow.util.nest.flatten
 //
 // Returns a flat list from a given nested structure.
diff --git a/tensorflow/python/util/util.i b/tensorflow/python/util/util.i
index d69084fc0091ac79cf3f5cf3d70af419cf78f936..b7f201b6fe6fd18af2bb833df2d08bfedb23a185 100644
--- a/tensorflow/python/util/util.i
+++ b/tensorflow/python/util/util.i
@@ -34,6 +34,15 @@ limitations under the License.
 %unignore tensorflow::swig::IsSequence;
 %noexception tensorflow::swig::IsSequence;
 
+%unignore tensorflow::swig::IsNamedtuple;
+%noexception tensorflow::swig::IsNamedtuple;
+
+%unignore tensorflow::swig::SameNamedtuples;
+%noexception tensorflow::swig::SameNamedtuples;
+
+%unignore tensorflow::swig::AssertSameStructure;
+%noexception tensorflow::swig::AssertSameStructure;
+
 %unignore tensorflow::swig::Flatten;
 %noexception tensorflow::swig::Flatten;
 
diff --git a/tensorflow/stream_executor/BUILD b/tensorflow/stream_executor/BUILD
index 1865240014e2da5068a4ef377a5934de62dd54b6..80fc9ff2926c53b557a7ba9e242d597a89acf79f 100644
--- a/tensorflow/stream_executor/BUILD
+++ b/tensorflow/stream_executor/BUILD
@@ -33,6 +33,7 @@ cc_library(
     }),
     visibility = ["//visibility:public"],
     deps = [
+        "//tensorflow/compiler/xla:statusor",
         "//tensorflow/core:lib",
         "@local_config_cuda//cuda:cuda_headers",
     ],
@@ -45,6 +46,7 @@ cc_library(
     visibility = ["//visibility:public"],
     deps = [
         "//tensorflow/core:lib",
+        "//tensorflow/compiler/xla:statusor",
         "@local_config_cuda//cuda:cuda_headers",
     ] + if_static([":stream_executor_impl"]),
 )
@@ -56,7 +58,10 @@ cc_library(
             [
                 "cuda/*.cc",
             ],
-            exclude = ["cuda/cuda_platform_id.cc"],
+            exclude = [
+                "cuda/*_test.cc",
+                "cuda/cuda_platform_id.cc",
+            ],
         ),
     ),
     copts = select({
diff --git a/tensorflow/stream_executor/cuda/cuda_blas.cc b/tensorflow/stream_executor/cuda/cuda_blas.cc
index c563f8f931b0a5689268329386d1252f2a45bdd1..1c550dbb13657d39aecb3f7373efd12bae80aabb 100644
--- a/tensorflow/stream_executor/cuda/cuda_blas.cc
+++ b/tensorflow/stream_executor/cuda/cuda_blas.cc
@@ -2076,12 +2076,6 @@ bool CUDABlas::DoBlasGemvWithProfilingImpl(
     const DeviceMemory<T> &a, int lda, const DeviceMemory<T> &x, int incx,
     const T &beta, DeviceMemory<T> *y, int incy,
     blas::ProfileResult *output_profile_result) {
-  struct TimerDeleter {
-    void operator()(CUDATimer *t) {
-      t->Destroy();
-      delete t;
-    }
-  };
   std::unique_ptr<CUDATimer, TimerDeleter> timer;
   if (output_profile_result != nullptr) {
     timer.reset(new CUDATimer(parent_));
@@ -2114,12 +2108,6 @@ bool CUDABlas::DoBlasGemmWithProfilingImpl(
     uint64 n, uint64 k, const ParamType &alpha, const DeviceMemory<T> &a,
     int lda, const DeviceMemory<T> &b, int ldb, const ParamType &beta,
     DeviceMemory<T> *c, int ldc, blas::ProfileResult *output_profile_result) {
-  struct TimerDeleter {
-    void operator()(CUDATimer *t) {
-      t->Destroy();
-      delete t;
-    }
-  };
   std::unique_ptr<CUDATimer, TimerDeleter> timer;
   if (output_profile_result != nullptr) {
     timer.reset(new CUDATimer(parent_));
@@ -2188,12 +2176,6 @@ bool CUDABlas::DoBlasGemmWithAlgorithmImpl(
     return false;
   }
 
-  struct TimerDeleter {
-    void operator()(CUDATimer *t) {
-      t->Destroy();
-      delete t;
-    }
-  };
   std::unique_ptr<CUDATimer, TimerDeleter> timer;
   if (output_profile_result != nullptr) {
     timer.reset(new CUDATimer(parent_));
diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.cc b/tensorflow/stream_executor/cuda/cuda_dnn.cc
index ab5e6590e0fcdb2f19a0a3a85e64e6b144a97363..1dc7f991b38c00d9df06322b869911d2866dd965 100644
--- a/tensorflow/stream_executor/cuda/cuda_dnn.cc
+++ b/tensorflow/stream_executor/cuda/cuda_dnn.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include <memory>
 
 #include "third_party/eigen3/Eigen/Core"
+#include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/util/env_var.h"
 #include "tensorflow/stream_executor/cuda/cuda_activation.h"
 #include "tensorflow/stream_executor/cuda/cuda_diagnostics.h"
@@ -27,6 +28,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/cuda/cuda_platform_id.h"
 #include "tensorflow/stream_executor/cuda/cuda_stream.h"
 #include "tensorflow/stream_executor/cuda/cuda_timer.h"
+#include "tensorflow/stream_executor/cuda/cudnn_version.h"
 #include "tensorflow/stream_executor/dnn.h"
 #include "tensorflow/stream_executor/lib/env.h"
 #include "tensorflow/stream_executor/lib/error.h"
@@ -55,15 +57,6 @@ NarrowT CheckedNarrowing(const WideT& wide) {
   return narrow;
 }
 
-// Returns the "Compatibility" version number from the CuDNN version number.
-// This is the number that tries to indicate ABI compatibility.
-//
-// For example, if cudnn_version is 5107, the compatibility version
-// number will be 5100.
-size_t cudnnCompatibilityVersion(size_t cudnn_version) {
-  return (cudnn_version / 100) * 100;
-}
-
 }  // namespace
 
 namespace perftools {
@@ -109,6 +102,22 @@ string ToString(cudnnStatus_t status) {
   }
 }
 
+#if CUDNN_VERSION >= 6000
+string ToString(libraryPropertyType type) {
+  switch (type) {
+    case MAJOR_VERSION:
+      return "MAJOR_VERSION";
+    case MINOR_VERSION:
+      return "MINOR_VERSION";
+    case PATCH_LEVEL:
+      return "PATCH_LEVEL";
+    default:
+      return port::StrCat(
+          "<unknown libraryPropertyType: ", static_cast<int>(type), ">");
+  }
+}
+#endif
+
 template <typename T>
 cudnnDataType_t GetCudnnDataType();
 
@@ -288,6 +297,8 @@ CUDNN_DNN_ROUTINE_EACH_R7(PERFTOOLS_GPUTOOLS_CUDNN_WRAP)
 
 namespace {
 
+cudnnDataType_t GetRnnComputeType(dnn::DataType data_type);
+
 cudnnHandle_t ToHandle(void* opaque_handle) {
   return static_cast<cudnnHandle_t>(opaque_handle);
 }
@@ -360,6 +371,51 @@ cudnnConvolutionBwdFilterAlgo_t ToConvBackwardFilterAlgo(
   }
 }
 
+#if CUDNN_VERSION >= 6000
+port::Status GetCudnnProperty(libraryPropertyType type, int* value) {
+  cudnnStatus_t status = cudnnGetProperty(type, value);
+  if (status != CUDNN_STATUS_SUCCESS) {
+    const string error =
+        port::StrCat("cudnnGetProperty failed for type: ", ToString(type),
+                     " with status: ", ToString(status));
+    LOG(ERROR) << error;
+    return port::Status{port::error::INTERNAL, error};
+  }
+  return port::Status::OK();
+}
+
+cudnnRNNAlgo_t ToCudnnRNNAlgo(const dnn::AlgorithmDesc& algorithm) {
+  if (algorithm.is_default()) {
+    return CUDNN_RNN_ALGO_STANDARD;
+  } else {
+    cudnnRNNAlgo_t algo = static_cast<cudnnRNNAlgo_t>(algorithm.algo_id());
+    switch (algo) {
+      case CUDNN_RNN_ALGO_STANDARD:
+      case CUDNN_RNN_ALGO_PERSIST_STATIC:
+      case CUDNN_RNN_ALGO_PERSIST_DYNAMIC:
+        return algo;
+      default:
+        LOG(FATAL) << "Unsupported Cudnn RNN algorithm: "
+                   << algorithm.algo_id();
+    }
+  }
+}
+#endif
+
+port::Status GetLoadedCudnnVersion(CudnnVersion* version) {
+#if CUDNN_VERSION >= 6000
+  TF_RETURN_IF_ERROR(GetCudnnProperty(MAJOR_VERSION, &version->major_version));
+  TF_RETURN_IF_ERROR(GetCudnnProperty(MINOR_VERSION, &version->minor_version));
+  TF_RETURN_IF_ERROR(GetCudnnProperty(PATCH_LEVEL, &version->patch_level));
+#else
+  size_t loaded_version = ::cudnnGetVersion();
+  version->major_version = loaded_version / 1000;
+  version->minor_version = (loaded_version / 100) % 10;
+  version->patch_level = loaded_version % 100;
+#endif
+  return port::Status::OK();
+}
+
 }  // namespace
 
 CudnnSupport::CudnnSupport(CUDAExecutor* parent)
@@ -376,24 +432,19 @@ port::Status CudnnSupport::Init() {
   auto status = wrap::cudnnCreate(
       parent_, reinterpret_cast<cudnnHandle_t*>(&dnn_handle_));
   if (status == CUDNN_STATUS_SUCCESS) {
-    // Check whether loaded version of CuDNN matches what the source
-    // was built with.
-    size_t loaded_version = ::cudnnGetVersion();
-    size_t loaded_compat_version = cudnnCompatibilityVersion(loaded_version);
-    size_t compiled_compat_version = cudnnCompatibilityVersion(CUDNN_VERSION);
-    bool library_loaded_matches_source =
-        (loaded_compat_version == compiled_compat_version);
-    if (!library_loaded_matches_source) {
-      const string error =
-          port::StrCat("Loaded runtime CuDNN library: ", loaded_version,
-                       " (compatibility version ", loaded_compat_version,
-                       ") but source was compiled with ", CUDNN_VERSION,
-                       " (compatibility version ", compiled_compat_version,
-                       ").  If using a binary install, upgrade your CuDNN "
-                       "library to match.  If building from sources, "
-                       "make sure the library loaded at runtime matches a "
-                       "compatible version specified during compile "
-                       "configuration.");
+    CudnnVersion source_version(CUDNN_MAJOR, CUDNN_MINOR, CUDNN_PATCHLEVEL);
+
+    CudnnVersion loaded_version;
+    TF_RETURN_IF_ERROR(GetLoadedCudnnVersion(&loaded_version));
+    if (!IsSourceCompatibleWithCudnnLibrary(source_version, loaded_version)) {
+      const tensorflow::string error = port::StrCat(
+          "Loaded runtime CuDNN library: ", loaded_version.ToString(),
+          " but source was compiled with: ", source_version.ToString(),
+          ".  CuDNN library major and minor version needs to match or have "
+          "higher minor version in case of CuDNN 7.0 or later version. If "
+          "using a binary install, upgrade your CuDNN library.  If building "
+          "from sources, make sure the library loaded at runtime is compatible "
+          "with the version specified during compile configuration.");
       LOG(ERROR) << error;
       return port::Status{port::error::INTERNAL, error};
     }
@@ -426,6 +477,13 @@ port::Status CudnnSupport::Init() {
                                    ToString(status))};
 }
 
+port::StatusOr<std::tuple<int, int, int>> CudnnSupport::GetVersion() {
+  CudnnVersion version;
+  TF_RETURN_IF_ERROR(GetLoadedCudnnVersion(&version));
+  return std::make_tuple(version.major_version, version.minor_version,
+                         version.patch_level);
+}
+
 // Turns a BatchDescriptor structure into a cudnn tensor handle within a scope.
 class ScopedTensorDescriptor {
  public:
@@ -1092,6 +1150,8 @@ class CudnnRnnDescriptor : public CudnnDescriptorCommon<dnn::RnnDescriptor> {
                      cudnnRNNInputMode_t input_mode,
                      cudnnDirectionMode_t direction_mode,
                      cudnnRNNMode_t rnn_mode, cudnnDataType_t data_type,
+                     cudnnDataType_t compute_type,
+                     const dnn::AlgorithmConfig& algorithm_config,
                      float dropout, uint64 seed,
                      ScratchAllocator* state_allocator)
       : parent_(parent),
@@ -1102,7 +1162,9 @@ class CudnnRnnDescriptor : public CudnnDescriptorCommon<dnn::RnnDescriptor> {
         input_mode_(input_mode),
         direction_mode_(direction_mode),
         rnn_mode_(rnn_mode),
-        data_type_(data_type) {
+        data_type_(data_type),
+        compute_type_(compute_type),
+        algorithm_config_(algorithm_config) {
     // Create the dropout handle.
     cudnn_dropout_desc_.reset(new CudnnDropoutDescriptor(
         parent, cudnn_handle, dropout, seed, state_allocator));
@@ -1116,18 +1178,20 @@ class CudnnRnnDescriptor : public CudnnDescriptorCommon<dnn::RnnDescriptor> {
     CUDNN_RETURN_IF_FAIL(status, "Unable to create RNN descriptor");
 #if CUDNN_VERSION >= 6000
     // TODO: allow the user to choose an algorithm.
-    cudnnRNNAlgo_t rnn_algo = CUDNN_RNN_ALGO_STANDARD;
+    cudnnRNNAlgo_t rnn_algo = ToCudnnRNNAlgo(algorithm_config_.algorithm());
     status = wrap::cudnnSetRNNDescriptor_v6(
         parent, cudnn_handle, rnn_desc_ /*rnnDesc*/, hidden_size /*hiddenSize*/,
         num_layers /*numLayers*/, dropout_handle() /*dropoutDesc*/,
         input_mode /*inputMode*/, direction_mode /*direction*/,
-        rnn_mode /*mode*/, rnn_algo /*algo*/, data_type /*dataType*/);
+        rnn_mode /*mode*/, rnn_algo /*algo*/, compute_type /*dataType*/);
 #else
+    CHECK(algorithm_config_.is_default())
+        << "Non-default algorithm not supported for CUDA version < 6.0";
     status = wrap::cudnnSetRNNDescriptor(
         parent, rnn_desc_ /*rnnDesc*/, hidden_size /*hiddenSize*/,
         num_layers /*numLayers*/, dropout_handle() /*dropoutDesc*/,
         input_mode /*inputMode*/, direction_mode /*direction*/,
-        rnn_mode /*mode*/, data_type /*dataType*/);
+        rnn_mode /*mode*/, compute_type /*dataType*/);
 #endif
     CUDNN_RETURN_IF_FAIL(status, "Unable to update RNN descriptor");
 
@@ -1138,9 +1202,7 @@ class CudnnRnnDescriptor : public CudnnDescriptorCommon<dnn::RnnDescriptor> {
       SetFailure(cudnn_params_desc_->Status());
       return;
     }
-    if (data_type == CUDNN_DATA_HALF) {
-      set_use_tensor_op_math(true);
-    }
+    set_use_tensor_op_math(algorithm_config_.algorithm().tensor_ops_enabled());
   }
   ~CudnnRnnDescriptor() override {
     if (rnn_desc_) {
@@ -1174,6 +1236,10 @@ class CudnnRnnDescriptor : public CudnnDescriptorCommon<dnn::RnnDescriptor> {
   cudnnDirectionMode_t direction_mode() const { return direction_mode_; }
   cudnnRNNMode_t rnn_mode() const { return rnn_mode_; }
   cudnnDataType_t data_type() const { return data_type_; }
+  cudnnDataType_t compute_type() const { return compute_type_; }
+  const dnn::AlgorithmConfig& algorithm_config() const {
+    return algorithm_config_;
+  }
   int64 ParamsSizeInBytes() const override {
     return cudnn_params_desc_->params_size_in_bytes();
   }
@@ -1204,6 +1270,8 @@ class CudnnRnnDescriptor : public CudnnDescriptorCommon<dnn::RnnDescriptor> {
   cudnnDirectionMode_t direction_mode_;
   cudnnRNNMode_t rnn_mode_;
   cudnnDataType_t data_type_;
+  cudnnDataType_t compute_type_;
+  dnn::AlgorithmConfig algorithm_config_;
   std::unique_ptr<CudnnDropoutDescriptor> cudnn_dropout_desc_;
   std::unique_ptr<CudnnRnnParamsDescriptor> cudnn_params_desc_;
   SE_DISALLOW_COPY_AND_ASSIGN(CudnnRnnDescriptor);
@@ -1576,7 +1644,8 @@ bool CudnnSupport::DoRnnForwardImpl(
     const CudnnRnnStateTensorDescriptor& output_c_desc,
     DeviceMemory<T>* output_c_data, bool is_training,
     ScratchAllocator* reserve_space_allocator,
-    ScratchAllocator* workspace_allocator) {
+    ScratchAllocator* workspace_allocator,
+    dnn::ProfileResult* output_profile_result) {
   // extract model parameters
   RnnModelDims model_dims;
   bool res = ExtractAndCheckRnnForward(
@@ -1633,9 +1702,24 @@ bool CudnnSupport::DoRnnForwardImpl(
     }
   }
 
+  std::unique_ptr<CUDATimer, TimerDeleter> timer;
+  const bool is_profiling = output_profile_result != nullptr;
+  if (is_profiling) {
+    timer.reset(new CUDATimer(parent_));
+    if (!timer->Init()) {
+      return false;
+    }
+    // The start and stop of the timer should be as close to the Cudnn call as
+    // possible. It is still possible for other threads to issue workload on
+    // to this stream. So it could take multiple profiling measurements.
+    if (!timer->Start(AsCUDAStream(stream))) {
+      return false;
+    }
+  }
   // make the forward call
+  cudnnStatus_t status;
   if (!is_training) {
-    cudnnStatus_t status = wrap::cudnnRNNForwardInference(
+    status = wrap::cudnnRNNForwardInference(
         parent_, ToHandle(dnn_handle_) /*handle*/,
         rnn_desc.handle() /*rnnDesc*/, model_dims.seq_length /*seqLength*/,
         input_desc.handles() /*xDesc*/, input_data.opaque() /*x*/,
@@ -1647,13 +1731,8 @@ bool CudnnSupport::DoRnnForwardImpl(
         output_c_desc.handle() /*cyDesc*/, output_c_data->opaque() /*cy*/,
         workspace.opaque() /*workspace*/,
         workspace.size() /*workSpaceSizeInBytes*/);
-    if (status != CUDNN_STATUS_SUCCESS) {
-      LOG(ERROR) << "Failed to call cudnnRNNForwardInference: "
-                 << ToString(status);
-      return false;
-    }
   } else {
-    cudnnStatus_t status = wrap::cudnnRNNForwardTraining(
+    status = wrap::cudnnRNNForwardTraining(
         parent_, ToHandle(dnn_handle_) /*handle*/,
         rnn_desc.handle() /*rnnDesc*/, model_dims.seq_length /*seqLength*/,
         input_desc.handles() /*xDesc*/, input_data.opaque() /*x*/,
@@ -1667,8 +1746,24 @@ bool CudnnSupport::DoRnnForwardImpl(
         workspace.size() /*workSpaceSizeInBytes*/,
         reserve_space.opaque() /*reserveSpace*/,
         reserve_space.size() /*reserveSpaceSizeInBytes*/);
-    if (status != CUDNN_STATUS_SUCCESS) {
-      LOG(ERROR) << "Failed to call cudnnRNNForwardTraining"
+  }
+  if (is_profiling) {
+    if (!timer->Stop(AsCUDAStream(stream))) {
+      return false;
+    }
+    if (status == CUDNN_STATUS_SUCCESS) {
+      auto algo_desc = rnn_desc.algorithm_config().algorithm();
+      output_profile_result->set_algorithm(algo_desc);
+      output_profile_result->set_elapsed_time_in_ms(
+          timer->GetElapsedMilliseconds());
+    }
+  }
+  if (status != CUDNN_STATUS_SUCCESS) {
+    // Silently return when we are profiling.
+    if (!is_profiling) {
+      LOG(ERROR) << "Failed to call "
+                 << (is_training ? "cudnnRNNForwardTraining "
+                                 : "cudnnRNNForwardInference ")
                  << ToString(status);
       return false;
     }
@@ -1700,7 +1795,8 @@ bool CudnnSupport::DoRnnBackwardImpl(
     DeviceMemory<T>* input_c_backprop_data,
     DeviceMemory<T>* params_backprop_data,
     DeviceMemory<uint8>* reserve_space_data,
-    ScratchAllocator* workspace_allocator) {
+    ScratchAllocator* workspace_allocator,
+    dnn::ProfileResult* output_profile_result) {
   // extract model parameters
   RnnModelDims model_dims;
   bool res = ExtractAndCheckRnnForward(
@@ -1729,6 +1825,20 @@ bool CudnnSupport::DoRnnBackwardImpl(
     return false;
   }
 
+  std::unique_ptr<CUDATimer, TimerDeleter> timer;
+  const bool is_profiling = output_profile_result != nullptr;
+  if (is_profiling) {
+    timer.reset(new CUDATimer(parent_));
+    if (!timer->Init()) {
+      return false;
+    }
+    // The start and stop of the timer should be as close to the Cudnn call as
+    // possible. It is still possible for other threads to issue workload on
+    // to this stream. So it could take multiple profiling measurements.
+    if (!timer->Start(AsCUDAStream(stream))) {
+      return false;
+    }
+  }
   // make the backward data call
   cudnnStatus_t status = wrap::cudnnRNNBackwardData(
       parent_, ToHandle(dnn_handle_) /*handle*/, rnn_desc.handle() /*rnnDesc*/,
@@ -1749,7 +1859,11 @@ bool CudnnSupport::DoRnnBackwardImpl(
       workspace.size() /*workSpaceSizeInBytes*/,
       reserve_space_data->opaque() /*reserveSpace*/,
       reserve_space_data->size() /*reserveSpaceSizeInBytes*/);
+
   if (status != CUDNN_STATUS_SUCCESS) {
+    if (is_profiling) {
+      timer->Stop(AsCUDAStream(stream));
+    }
     LOG(ERROR) << "Failed to call cudnnRNNBackwardData: " << ToString(status);
     return false;
   }
@@ -1771,11 +1885,23 @@ bool CudnnSupport::DoRnnBackwardImpl(
         reserve_space_data->opaque() /*reserveSpace*/,
         reserve_space_data->size() /*reserveSpaceSizeInBytes*/);
     if (status != CUDNN_STATUS_SUCCESS) {
+      if (is_profiling) {
+        timer->Stop(AsCUDAStream(stream));
+      }
       LOG(ERROR) << "Failed to call cudnnRNNBackwardWeights: "
                  << ToString(status);
       return false;
     }
   }
+  if (is_profiling) {
+    if (!timer->Stop(AsCUDAStream(stream))) {
+      return false;
+    }
+    auto algo_desc = rnn_desc.algorithm_config().algorithm();
+    output_profile_result->set_algorithm(algo_desc);
+    output_profile_result->set_elapsed_time_in_ms(
+        timer->GetElapsedMilliseconds());
+  }
 
   return true;
 }
@@ -1787,15 +1913,17 @@ CudnnSupport::createRnnDescriptor(int num_layers, int hidden_size,
                                   int input_size, dnn::RnnInputMode input_mode,
                                   dnn::RnnDirectionMode direction_mode,
                                   dnn::RnnMode rnn_mode,
-                                  dnn::DataType data_type, float dropout,
-                                  uint64 seed,
+                                  dnn::DataType data_type,
+                                  const dnn::AlgorithmConfig& algorithm_config,
+                                  float dropout, uint64 seed,
                                   ScratchAllocator* state_allocator) {
 #if CUDNN_VERSION >= 5000
   mutex_lock lock{dnn_handle_mutex_};
   std::unique_ptr<CudnnRnnDescriptor> rnn_desc(new CudnnRnnDescriptor(
       parent_, ToHandle(dnn_handle_), num_layers, hidden_size, input_size,
       ToCudnnRnnInputMode(input_mode), ToCudnnRnnDirectionMode(direction_mode),
-      ToCudnnRnnMode(rnn_mode), ToCudnnDataType(data_type), dropout, seed,
+      ToCudnnRnnMode(rnn_mode), ToCudnnDataType(data_type),
+      GetRnnComputeType(data_type), algorithm_config, dropout, seed,
       state_allocator));
   if (!rnn_desc->ok()) {
     return rnn_desc->Status();
@@ -1872,7 +2000,8 @@ bool CudnnSupport::DoRnnForward(
     const dnn::RnnStateTensorDescriptor& output_c_desc,
     DeviceMemory<Eigen::half>* output_c_data, bool is_training,
     ScratchAllocator* reserve_space_allocator,
-    ScratchAllocator* workspace_allocator) {
+    ScratchAllocator* workspace_allocator,
+    dnn::ProfileResult* output_profile_result) {
 #if CUDNN_VERSION >= 5000
   const CudnnRnnDescriptor& cudnn_rnn_desc =
       static_cast<const CudnnRnnDescriptor&>(rnn_desc);
@@ -1893,7 +2022,8 @@ bool CudnnSupport::DoRnnForward(
       stream, cudnn_rnn_desc, cudnn_input_desc, input_data, cudnn_input_h_desc,
       input_h_data, cudnn_input_c_desc, input_c_data, params, cudnn_output_desc,
       output_data, cudnn_output_h_desc, output_h_data, cudnn_output_c_desc,
-      output_c_data, is_training, reserve_space_allocator, workspace_allocator);
+      output_c_data, is_training, reserve_space_allocator, workspace_allocator,
+      output_profile_result);
 #else
   return false;
 #endif  // CUDNN_VERSION
@@ -1914,7 +2044,8 @@ bool CudnnSupport::DoRnnForward(
     const dnn::RnnStateTensorDescriptor& output_c_desc,
     DeviceMemory<float>* output_c_data, bool is_training,
     ScratchAllocator* reserve_space_allocator,
-    ScratchAllocator* workspace_allocator) {
+    ScratchAllocator* workspace_allocator,
+    dnn::ProfileResult* output_profile_result) {
 #if CUDNN_VERSION >= 5000
   const CudnnRnnDescriptor& cudnn_rnn_desc =
       static_cast<const CudnnRnnDescriptor&>(rnn_desc);
@@ -1935,7 +2066,8 @@ bool CudnnSupport::DoRnnForward(
       stream, cudnn_rnn_desc, cudnn_input_desc, input_data, cudnn_input_h_desc,
       input_h_data, cudnn_input_c_desc, input_c_data, params, cudnn_output_desc,
       output_data, cudnn_output_h_desc, output_h_data, cudnn_output_c_desc,
-      output_c_data, is_training, reserve_space_allocator, workspace_allocator);
+      output_c_data, is_training, reserve_space_allocator, workspace_allocator,
+      output_profile_result);
 #else
   return false;
 #endif  // CUDNN_VERSION
@@ -1957,7 +2089,8 @@ bool CudnnSupport::DoRnnForward(
     const dnn::RnnStateTensorDescriptor& output_c_desc,
     DeviceMemory<double>* output_c_data, bool is_training,
     ScratchAllocator* reserve_space_allocator,
-    ScratchAllocator* workspace_allocator) {
+    ScratchAllocator* workspace_allocator,
+    dnn::ProfileResult* output_profile_result) {
 #if CUDNN_VERSION >= 5000
   const CudnnRnnDescriptor& cudnn_rnn_desc =
       static_cast<const CudnnRnnDescriptor&>(rnn_desc);
@@ -1978,7 +2111,8 @@ bool CudnnSupport::DoRnnForward(
       stream, cudnn_rnn_desc, cudnn_input_desc, input_data, cudnn_input_h_desc,
       input_h_data, cudnn_input_c_desc, input_c_data, params, cudnn_output_desc,
       output_data, cudnn_output_h_desc, output_h_data, cudnn_output_c_desc,
-      output_c_data, is_training, reserve_space_allocator, workspace_allocator);
+      output_c_data, is_training, reserve_space_allocator, workspace_allocator,
+      output_profile_result);
 #else
   return false;
 #endif  // CUDNN_VERSION
@@ -2007,7 +2141,8 @@ bool CudnnSupport::DoRnnBackward(
     DeviceMemory<Eigen::half>* input_c_backprop_data,
     DeviceMemory<Eigen::half>* params_backprop_data,
     DeviceMemory<uint8>* reserve_space_data,
-    ScratchAllocator* workspace_allocator) {
+    ScratchAllocator* workspace_allocator,
+    dnn::ProfileResult* output_profile_result) {
 #if CUDNN_VERSION >= 5000
   const CudnnRnnDescriptor& cudnn_rnn_desc =
       static_cast<const CudnnRnnDescriptor&>(rnn_desc);
@@ -2031,7 +2166,7 @@ bool CudnnSupport::DoRnnBackward(
       output_c_data, output_backprop_data, output_h_backprop_data,
       output_c_backprop_data, input_backprop_data, input_h_backprop_data,
       input_c_backprop_data, params_backprop_data, reserve_space_data,
-      workspace_allocator);
+      workspace_allocator, output_profile_result);
 #else
   return false;
 #endif  // CUDNN_VERSION
@@ -2059,7 +2194,8 @@ bool CudnnSupport::DoRnnBackward(
     DeviceMemory<float>* input_c_backprop_data,
     DeviceMemory<float>* params_backprop_data,
     DeviceMemory<uint8>* reserve_space_data,
-    ScratchAllocator* workspace_allocator) {
+    ScratchAllocator* workspace_allocator,
+    dnn::ProfileResult* output_profile_result) {
 #if CUDNN_VERSION >= 5000
   const CudnnRnnDescriptor& cudnn_rnn_desc =
       static_cast<const CudnnRnnDescriptor&>(rnn_desc);
@@ -2083,7 +2219,7 @@ bool CudnnSupport::DoRnnBackward(
       output_c_data, output_backprop_data, output_h_backprop_data,
       output_c_backprop_data, input_backprop_data, input_h_backprop_data,
       input_c_backprop_data, params_backprop_data, reserve_space_data,
-      workspace_allocator);
+      workspace_allocator, output_profile_result);
 #else
   return false;
 #endif  // CUDNN_VERSION
@@ -2112,7 +2248,8 @@ bool CudnnSupport::DoRnnBackward(
     DeviceMemory<double>* input_c_backprop_data,
     DeviceMemory<double>* params_backprop_data,
     DeviceMemory<uint8>* reserve_space_data,
-    ScratchAllocator* workspace_allocator) {
+    ScratchAllocator* workspace_allocator,
+    dnn::ProfileResult* output_profile_result) {
 #if CUDNN_VERSION >= 5000
   const CudnnRnnDescriptor& cudnn_rnn_desc =
       static_cast<const CudnnRnnDescriptor&>(rnn_desc);
@@ -2136,7 +2273,7 @@ bool CudnnSupport::DoRnnBackward(
       output_c_data, output_backprop_data, output_h_backprop_data,
       output_c_backprop_data, input_backprop_data, input_h_backprop_data,
       input_c_backprop_data, params_backprop_data, reserve_space_data,
-      workspace_allocator);
+      workspace_allocator, output_profile_result);
 #else
   return false;
 #endif  // CUDNN_VERSION
@@ -2331,6 +2468,33 @@ cudnnDataType_t GetConvComputeType<double>() {
   return CUDNN_DATA_DOUBLE;
 }
 
+// A helper struct to decide whether to use FP32 as the internal compute type
+// for rnn when the input data type is FP16. By default it is turned on,
+// users can explicitly disable them (choose to use FP16 as the internal compute
+// type) through an env-var "TF_FP16_RNN_USE_FP32_COMPUTE=0".
+struct RnnDoFP32ComputationFP16Input {
+  static constexpr const char* kName = "TF_FP16_RNN_USE_FP32_COMPUTE";
+  static constexpr bool kDefaultFlag = true;
+};
+
+// A helper function to return the internal compute type for
+// RNNs in cudnn.
+cudnnDataType_t GetRnnComputeType(dnn::DataType data_type) {
+  switch (data_type) {
+    case dnn::DataType::kFloat:
+      return CUDNN_DATA_FLOAT;
+    case dnn::DataType::kDouble:
+      return CUDNN_DATA_DOUBLE;
+    case dnn::DataType::kHalf:
+      if (CudnnEnvVar<RnnDoFP32ComputationFP16Input>::IsEnabled()) {
+        return CUDNN_DATA_FLOAT;
+      } else {
+        return CUDNN_DATA_HALF;
+      }
+    default:
+      LOG(FATAL) << "Invalid RNN data type: " << static_cast<int>(data_type);
+  }
+}
 }  // namespace
 
 template <class T>
@@ -2710,6 +2874,30 @@ bool CudnnSupport::GetConvolveAlgorithms(
   return true;
 }
 
+bool CudnnSupport::GetRnnAlgorithms(
+    std::vector<dnn::AlgorithmDesc>* out_algorithms) {
+  std::vector<dnn::AlgorithmDesc::Index> algo_types = {
+  // clang-format off
+#if CUDNN_VERSION >= 6000
+    CUDNN_RNN_ALGO_STANDARD,
+    CUDNN_RNN_ALGO_PERSIST_STATIC,
+    CUDNN_RNN_ALGO_PERSIST_DYNAMIC,
+#endif
+    // clang-format on
+  };
+
+  out_algorithms->clear();
+  for (auto i : algo_types) {
+    out_algorithms->push_back({i, /*use_tensor_ops=*/false});
+#if CUDNN_VERSION >= 7100
+    if (RnnTensorOpMathEnabled()) {
+      out_algorithms->push_back({i, /*use_tensor_ops=*/true});
+    }
+#endif
+  }
+  return true;
+}
+
 bool CudnnSupport::GetConvolveBackwardDataAlgorithms(
     bool with_winograd_nonfused, int cc_major, int cc_minor,
     std::vector<dnn::AlgorithmDesc>* out_algorithms) {
diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.h b/tensorflow/stream_executor/cuda/cuda_dnn.h
index 48d56f71e3195a897b6216ab9f5709326d1b86d3..0e5368aca82e7fa4d0281062dd3fbb59bebc0a3d 100644
--- a/tensorflow/stream_executor/cuda/cuda_dnn.h
+++ b/tensorflow/stream_executor/cuda/cuda_dnn.h
@@ -46,12 +46,14 @@ class CudnnSupport : public dnn::DnnSupport {
   ~CudnnSupport() override;
 
   port::Status Init() override;
+  port::StatusOr<std::tuple<int, int, int>> GetVersion() override;
 
   port::StatusOr<std::unique_ptr<dnn::RnnDescriptor>> createRnnDescriptor(
       int num_layers, int hidden_size, int input_size,
       dnn::RnnInputMode input_mode, dnn::RnnDirectionMode direction_mode,
-      dnn::RnnMode rnn_mode, dnn::DataType data_type, float dropout,
-      uint64 seed, ScratchAllocator* state_allocator) override;
+      dnn::RnnMode rnn_mode, dnn::DataType data_type,
+      const dnn::AlgorithmConfig& algorithm_config, float dropout, uint64 seed,
+      ScratchAllocator* state_allocator) override;
 
   port::StatusOr<std::unique_ptr<dnn::RnnSequenceTensorDescriptor>>
   createRnnSequenceTensorDescriptor(int seq_length, int batch_size,
@@ -77,7 +79,8 @@ class CudnnSupport : public dnn::DnnSupport {
                     const dnn::RnnStateTensorDescriptor& output_c_desc,
                     DeviceMemory<Eigen::half>* output_c_data, bool is_training,
                     ScratchAllocator* reserve_space_allocator,
-                    ScratchAllocator* workspace_allocator) override;
+                    ScratchAllocator* workspace_allocator,
+                    dnn::ProfileResult* output_profile_result) override;
 
   bool DoRnnForward(Stream* stream, const dnn::RnnDescriptor& rnn_desc,
                     const dnn::RnnSequenceTensorDescriptor& input_desc,
@@ -94,7 +97,8 @@ class CudnnSupport : public dnn::DnnSupport {
                     const dnn::RnnStateTensorDescriptor& output_c_desc,
                     DeviceMemory<float>* output_c_data, bool is_training,
                     ScratchAllocator* reserve_space_allocator,
-                    ScratchAllocator* workspace_allocator) override;
+                    ScratchAllocator* workspace_allocator,
+                    dnn::ProfileResult* output_profile_result) override;
 
   bool DoRnnForward(Stream* stream, const dnn::RnnDescriptor& rnn_desc,
                     const dnn::RnnSequenceTensorDescriptor& input_desc,
@@ -111,7 +115,8 @@ class CudnnSupport : public dnn::DnnSupport {
                     const dnn::RnnStateTensorDescriptor& output_c_desc,
                     DeviceMemory<double>* output_c_data, bool is_training,
                     ScratchAllocator* reserve_space_allocator,
-                    ScratchAllocator* workspace_allocator) override;
+                    ScratchAllocator* workspace_allocator,
+                    dnn::ProfileResult* output_profile_result) override;
 
   bool DoRnnBackward(Stream* stream, const dnn::RnnDescriptor& rnn_desc,
                      const dnn::RnnSequenceTensorDescriptor& input_desc,
@@ -135,7 +140,8 @@ class CudnnSupport : public dnn::DnnSupport {
                      DeviceMemory<Eigen::half>* input_c_backprop_data,
                      DeviceMemory<Eigen::half>* params_backprop_data,
                      DeviceMemory<uint8>* reserve_space_data,
-                     ScratchAllocator* workspace_allocator) override;
+                     ScratchAllocator* workspace_allocator,
+                     dnn::ProfileResult* output_profile_result) override;
 
   bool DoRnnBackward(Stream* stream, const dnn::RnnDescriptor& rnn_desc,
                      const dnn::RnnSequenceTensorDescriptor& input_desc,
@@ -159,7 +165,8 @@ class CudnnSupport : public dnn::DnnSupport {
                      DeviceMemory<float>* input_c_backprop_data,
                      DeviceMemory<float>* params_backprop_data,
                      DeviceMemory<uint8>* reserve_space_data,
-                     ScratchAllocator* workspace_allocator) override;
+                     ScratchAllocator* workspace_allocator,
+                     dnn::ProfileResult* output_profile_result) override;
 
   bool DoRnnBackward(Stream* stream, const dnn::RnnDescriptor& rnn_desc,
                      const dnn::RnnSequenceTensorDescriptor& input_desc,
@@ -183,12 +190,16 @@ class CudnnSupport : public dnn::DnnSupport {
                      DeviceMemory<double>* input_c_backprop_data,
                      DeviceMemory<double>* params_backprop_data,
                      DeviceMemory<uint8>* reserve_space_data,
-                     ScratchAllocator* workspace_allocator) override;
+                     ScratchAllocator* workspace_allocator,
+                     dnn::ProfileResult* output_profile_result) override;
 
   bool GetConvolveAlgorithms(
       bool with_winograd_nonfused, int cc_major, int cc_minor,
       std::vector<dnn::AlgorithmDesc>* out_algorithms) override;
 
+  bool GetRnnAlgorithms(
+      std::vector<dnn::AlgorithmDesc>* out_algorithms) override;
+
   bool GetConvolveBackwardDataAlgorithms(
       bool with_winograd_nonfused, int cc_major, int cc_minor,
       std::vector<dnn::AlgorithmDesc>* out_algorithms) override;
@@ -746,7 +757,8 @@ class CudnnSupport : public dnn::DnnSupport {
                         const CudnnRnnStateTensorDescriptor& output_c_desc,
                         DeviceMemory<T>* output_c_data, bool is_training,
                         ScratchAllocator* reserve_space_allocator,
-                        ScratchAllocator* workspace_allocator);
+                        ScratchAllocator* workspace_allocator,
+                        dnn::ProfileResult* output_profile_result);
 
   template <class T>
   bool DoRnnBackwardImpl(Stream* stream, const CudnnRnnDescriptor& rnn_desc,
@@ -771,7 +783,8 @@ class CudnnSupport : public dnn::DnnSupport {
                          DeviceMemory<T>* input_c_backprop_data,
                          DeviceMemory<T>* params_backprop_data,
                          DeviceMemory<uint8>* reserve_space_data,
-                         ScratchAllocator* workspace_allocator);
+                         ScratchAllocator* workspace_allocator,
+                         dnn::ProfileResult* output_profile_result);
 
   SE_DISALLOW_COPY_AND_ASSIGN(CudnnSupport);
 };
diff --git a/tensorflow/stream_executor/cuda/cuda_timer.h b/tensorflow/stream_executor/cuda/cuda_timer.h
index 4a2714dc1fada4f6081f7edc421660bef300a5b7..2abc55ec9411d1cc90ddcd46fe27d19472e7ef49 100644
--- a/tensorflow/stream_executor/cuda/cuda_timer.h
+++ b/tensorflow/stream_executor/cuda/cuda_timer.h
@@ -77,6 +77,13 @@ class CUDATimer : public internal::TimerInterface {
                          // executing in a stream.
 };
 
+struct TimerDeleter {
+  void operator()(CUDATimer *t) {
+    t->Destroy();
+    delete t;
+  }
+};
+
 }  // namespace cuda
 }  // namespace gputools
 }  // namespace perftools
diff --git a/tensorflow/stream_executor/cuda/cudnn_version.cc b/tensorflow/stream_executor/cuda/cudnn_version.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5591801aae2526d528289f9b2267d864cf766045
--- /dev/null
+++ b/tensorflow/stream_executor/cuda/cudnn_version.cc
@@ -0,0 +1,42 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/stream_executor/cuda/cudnn_version.h"
+
+namespace perftools {
+namespace gputools {
+namespace cuda {
+
+bool IsSourceCompatibleWithCudnnLibrary(CudnnVersion source_version,
+                                        CudnnVersion loaded_version) {
+  // Major version is neither forward or backward compatible and therefore major
+  // versions needs to match between source and library.
+  //
+  // Minor version is backward-compatible beginning with CuDNN 7 and therefore
+  // minor version of library needs to be same or higher.
+  //
+  // Patch releases are always forward and backward compatible and therefore
+  // need not match.
+  if (loaded_version.major_version != source_version.major_version) {
+    return false;
+  }
+  return ((loaded_version.minor_version == source_version.minor_version) ||
+          (source_version.major_version >= 7 &&
+           loaded_version.minor_version >= source_version.minor_version));
+}
+
+}  // namespace cuda
+}  // namespace gputools
+}  // namespace perftools
diff --git a/tensorflow/stream_executor/cuda/cudnn_version.h b/tensorflow/stream_executor/cuda/cudnn_version.h
new file mode 100644
index 0000000000000000000000000000000000000000..2ed02e1700ced5087bfebacb6314cbc8771e3612
--- /dev/null
+++ b/tensorflow/stream_executor/cuda/cudnn_version.h
@@ -0,0 +1,52 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDNN_VERSION_H_
+#define TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDNN_VERSION_H_
+
+#include <string>
+
+#include "tensorflow/core/lib/strings/strcat.h"
+
+namespace perftools {
+namespace gputools {
+namespace cuda {
+
+struct CudnnVersion {
+  CudnnVersion() = default;
+
+  CudnnVersion(int major, int minor, int patch)
+      : major_version(major), minor_version(minor), patch_level(patch) {}
+
+  tensorflow::string ToString() const {
+    return tensorflow::strings::StrCat(major_version, ".", minor_version, ".",
+                                       patch_level);
+  }
+
+  int major_version;
+  int minor_version;
+  int patch_level;
+};
+
+// Returns true if the given source CuDNN version is compatible with the given
+// loaded version.
+bool IsSourceCompatibleWithCudnnLibrary(CudnnVersion source_version,
+                                        CudnnVersion loaded_version);
+
+}  // namespace cuda
+}  // namespace gputools
+}  // namespace perftools
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_CUDA_CUDNN_VERSION_H_
diff --git a/tensorflow/stream_executor/cuda/cudnn_version_test.cc b/tensorflow/stream_executor/cuda/cudnn_version_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..230adafeb112f682b5ece4778921e18a4ad25f87
--- /dev/null
+++ b/tensorflow/stream_executor/cuda/cudnn_version_test.cc
@@ -0,0 +1,75 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/stream_executor/cuda/cudnn_version.h"
+
+#include "testing/base/public/gunit.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace perftools {
+namespace gputools {
+namespace cuda {
+namespace {
+
+TEST(CuDNNVersion, ToString) {
+  CudnnVersion version(7, 0, 12);
+  EXPECT_EQ(version.ToString(), "7.0.12");
+}
+
+TEST(IsSourceCompatibleWithCudnnLibraryTest, Basic) {
+  // Returns true if both major and minor versions are matching and even if the
+  // patch versions are not matching.
+  EXPECT_TRUE(IsSourceCompatibleWithCudnnLibrary(
+      /*source_version=*/CudnnVersion(7, 0, 12),
+      /*loaded_version=*/CudnnVersion(7, 0, 14)));
+  EXPECT_TRUE(IsSourceCompatibleWithCudnnLibrary(
+      /*source_version=*/CudnnVersion(6, 1, 14),
+      /*loaded_version=*/CudnnVersion(6, 1, 00)));
+
+  // Returns false if major versions are not matching as they are neither
+  // forward or backward compatible.
+  EXPECT_FALSE(IsSourceCompatibleWithCudnnLibrary(
+      /*source_version=*/CudnnVersion(7, 0, 12),
+      /*loaded_version=*/CudnnVersion(6, 1, 14)));
+  EXPECT_FALSE(IsSourceCompatibleWithCudnnLibrary(
+      /*source_version=*/CudnnVersion(8, 1, 15),
+      /*loaded_version=*/CudnnVersion(7, 0, 14)));
+
+  // Returns true if the loaded version is equal or higher because minor version
+  // are backward compatible with CuDNN version 7.
+  EXPECT_TRUE(IsSourceCompatibleWithCudnnLibrary(
+      /*source_version=*/CudnnVersion(7, 0, 14),
+      /*loaded_version=*/CudnnVersion(7, 1, 14)));
+  EXPECT_TRUE(IsSourceCompatibleWithCudnnLibrary(
+      /*source_version=*/CudnnVersion(7, 0, 14),
+      /*loaded_version=*/CudnnVersion(7, 1, 15)));
+  EXPECT_FALSE(IsSourceCompatibleWithCudnnLibrary(
+      /*source_version=*/CudnnVersion(7, 1, 15),
+      /*loaded_version=*/CudnnVersion(7, 0, 14)));
+
+  // Returns false if minor versions are not matching for version 6. Before
+  // version 7, minor versions are also neither forward or backward compatible.
+  EXPECT_FALSE(IsSourceCompatibleWithCudnnLibrary(
+      /*source_version=*/CudnnVersion(6, 0, 14),
+      /*loaded_version=*/CudnnVersion(6, 1, 15)));
+  EXPECT_FALSE(IsSourceCompatibleWithCudnnLibrary(
+      /*source_version=*/CudnnVersion(6, 1, 14),
+      /*loaded_version=*/CudnnVersion(6, 0, 14)));
+}
+
+}  // namespace
+}  // namespace cuda
+}  // namespace gputools
+}  // namespace perftools
diff --git a/tensorflow/stream_executor/dnn.cc b/tensorflow/stream_executor/dnn.cc
index 44144a06139bf8661432cb930e53ba5218aac823..0a3c4bcf503b36f3ca7811970e94fdd7930b7c1a 100644
--- a/tensorflow/stream_executor/dnn.cc
+++ b/tensorflow/stream_executor/dnn.cc
@@ -28,6 +28,10 @@ bool DnnSupport::GetConvolveAlgorithms(
   return false;
 }
 
+bool DnnSupport::GetRnnAlgorithms(std::vector<AlgorithmDesc>* out_algorithms) {
+  return false;
+}
+
 bool DnnSupport::GetConvolveBackwardDataAlgorithms(
     bool with_winograd_nonfused, int cc_major, int cc_minor,
     std::vector<AlgorithmDesc>* out_algorithms) {
diff --git a/tensorflow/stream_executor/dnn.h b/tensorflow/stream_executor/dnn.h
index b41536e638873412a31a0cdbbd3ba3a818dd9cf2..3c47d2c2e852055988eb6d99e2081ba935920704 100644
--- a/tensorflow/stream_executor/dnn.h
+++ b/tensorflow/stream_executor/dnn.h
@@ -25,6 +25,7 @@ limitations under the License.
 #include <functional>
 #include <limits>
 #include <memory>
+#include <tuple>
 
 #include "tensorflow/stream_executor/device_memory.h"
 #include "tensorflow/stream_executor/lib/array_slice.h"
@@ -885,6 +886,12 @@ class DnnSupport {
 
   virtual port::Status Init() = 0;
 
+  // Gets the version of the backing library, as a {major, minor, patch} tuple.
+  virtual port::StatusOr<std::tuple<int, int, int>> GetVersion() {
+    return port::UnimplementedError(
+        "DnnSupport::GetVersion not implemented on this platform.");
+  }
+
   // Performs a single-precision forward batch normalization operation onto
   // the stream.
   //
@@ -1195,6 +1202,9 @@ class DnnSupport {
       bool with_winograd_nonfused, int cc_major, int cc_minor,
       std::vector<AlgorithmDesc>* out_algorithms);
 
+  // Returns a list of supported rnn algorithms.
+  virtual bool GetRnnAlgorithms(std::vector<AlgorithmDesc>* out_algorithms);
+
   // Version of DoConvolve that uses pre-quantized 8 bit coefficients.
   // coefficient_scales specifies the scaling of each column of coefficients:
   // original float coefficient[row * num_columns + column] =
@@ -2001,6 +2011,7 @@ class DnnSupport {
                       dnn::RnnInputMode input_mode,
                       dnn::RnnDirectionMode direction_mode,
                       dnn::RnnMode rnn_mode, dnn::DataType data_type,
+                      const dnn::AlgorithmConfig& algorithm_config,
                       float dropout, uint64 seed,
                       ScratchAllocator* state_allocator) {
     return port::Status{port::error::UNIMPLEMENTED,
@@ -2076,7 +2087,8 @@ class DnnSupport {
                             DeviceMemory<Eigen::half>* output_c_data,
                             bool is_training,
                             ScratchAllocator* reserve_space_allocator,
-                            ScratchAllocator* workspace_allocator) {
+                            ScratchAllocator* workspace_allocator,
+                            dnn::ProfileResult* output_profile_result) {
     return false;
   }
 
@@ -2096,7 +2108,8 @@ class DnnSupport {
                             DeviceMemory<float>* output_c_data,
                             bool is_training,
                             ScratchAllocator* reserve_space_allocator,
-                            ScratchAllocator* workspace_allocator) {
+                            ScratchAllocator* workspace_allocator,
+                            dnn::ProfileResult* output_profile_result) {
     return false;
   }
 
@@ -2116,7 +2129,8 @@ class DnnSupport {
                             DeviceMemory<double>* output_c_data,
                             bool is_training,
                             ScratchAllocator* reserve_space_allocator,
-                            ScratchAllocator* workspace_allocator) {
+                            ScratchAllocator* workspace_allocator,
+                            dnn::ProfileResult* output_profile_result) {
     return false;
   }
   // Enqueue a backward operation of the RNN model onto the stream.
@@ -2183,7 +2197,8 @@ class DnnSupport {
       DeviceMemory<Eigen::half>* input_c_backprop_data,
       DeviceMemory<Eigen::half>* params_backprop_data,
       DeviceMemory<uint8>* reserve_space_data,
-      ScratchAllocator* workspace_allocator) {
+      ScratchAllocator* workspace_allocator,
+      dnn::ProfileResult* output_profile_result) {
     return false;
   }
 
@@ -2210,7 +2225,8 @@ class DnnSupport {
       DeviceMemory<float>* input_c_backprop_data,
       DeviceMemory<float>* params_backprop_data,
       DeviceMemory<uint8>* reserve_space_data,
-      ScratchAllocator* workspace_allocator) {
+      ScratchAllocator* workspace_allocator,
+      dnn::ProfileResult* output_profile_result) {
     return false;
   }
 
@@ -2237,7 +2253,8 @@ class DnnSupport {
       DeviceMemory<double>* input_c_backprop_data,
       DeviceMemory<double>* params_backprop_data,
       DeviceMemory<uint8>* reserve_space_data,
-      ScratchAllocator* workspace_allocator) {
+      ScratchAllocator* workspace_allocator,
+      dnn::ProfileResult* output_profile_result) {
     return false;
   }
 
diff --git a/tensorflow/stream_executor/kernel.cc b/tensorflow/stream_executor/kernel.cc
index 81e531efb31ea7d8d6ac03b56aea6aa5f01d64d1..636199cfa2762b7c42dd350dfd294762e3159299 100644
--- a/tensorflow/stream_executor/kernel.cc
+++ b/tensorflow/stream_executor/kernel.cc
@@ -21,6 +21,7 @@ limitations under the License.
 
 #include "tensorflow/stream_executor/platform/port.h"
 
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/stream_executor/lib/demangle.h"
 #include "tensorflow/stream_executor/platform.h"
 #include "tensorflow/stream_executor/platform/logging.h"
@@ -96,7 +97,7 @@ static const char *kStubPrefix = "__device_stub_";
 void KernelBase::set_name(port::StringPiece name) {
   name_ = name.ToString();
   port::StringPiece stubless_name = name;
-  if (name.starts_with(kStubPrefix)) {
+  if (tensorflow::str_util::StartsWith(name, kStubPrefix)) {
     stubless_name.remove_prefix(strlen(kStubPrefix));
   }
   demangled_name_ = port::Demangle(stubless_name.data());
diff --git a/tensorflow/stream_executor/lib/statusor.h b/tensorflow/stream_executor/lib/statusor.h
index 138738ecab54986fd7d5cd76839d59da55623b1f..3b97929b37d82dc6fc00c70f617c5868b28d547e 100644
--- a/tensorflow/stream_executor/lib/statusor.h
+++ b/tensorflow/stream_executor/lib/statusor.h
@@ -14,238 +14,19 @@ limitations under the License.
 ==============================================================================*/
 
 // IWYU pragma: private, include "perftools/gputools/executor/stream_executor.h"
-//
-// StatusOr<T> is the union of a Status object and a T
-// object. StatusOr models the concept of an object that is either a
-// usable value, or an error Status explaining why such a value is
-// not present. To this end, StatusOr<T> does not allow its Status
-// value to be Status::OK. Further, StatusOr<T*> does not allow the
-// contained pointer to be NULL.
-//
-// The primary use-case for StatusOr<T> is as the return value of a
-// function which may fail.
-//
-// Example client usage for a StatusOr<T>, where T is not a pointer:
-//
-//  StatusOr<float> result = DoBigCalculationThatCouldFail();
-//  if (result.ok()) {
-//    float answer = result.ValueOrDie();
-//    printf("Big calculation yielded: %f", answer);
-//  } else {
-//    LOG(ERROR) << result.status();
-//  }
-//
-// Example client usage for a StatusOr<T*>:
-//
-//  StatusOr<Foo*> result = FooFactory::MakeNewFoo(arg);
-//  if (result.ok()) {
-//    std::unique_ptr<Foo> foo(result.ValueOrDie());
-//    foo->DoSomethingCool();
-//  } else {
-//    LOG(ERROR) << result.status();
-//  }
-//
-// Example client usage for a StatusOr<std::unique_ptr<T>>:
-//
-//  StatusOr<std::unique_ptr<Foo>> result = FooFactory::MakeNewFoo(arg);
-//  if (result.ok()) {
-//    std::unique_ptr<Foo> foo = std::move(result.ValueOrDie());
-//    foo->DoSomethingCool();
-//  } else {
-//    LOG(ERROR) << result.status();
-//  }
-//
-// Example factory implementation returning StatusOr<T*>:
-//
-//  StatusOr<Foo*> FooFactory::MakeNewFoo(int arg) {
-//    if (arg <= 0) {
-//      return Status(port::error::INVALID_ARGUMENT,
-//                            "Arg must be positive");
-//    } else {
-//      return new Foo(arg);
-//    }
-//  }
-//
 
 #ifndef TENSORFLOW_STREAM_EXECUTOR_LIB_STATUSOR_H_
 #define TENSORFLOW_STREAM_EXECUTOR_LIB_STATUSOR_H_
 
-#include <new>
-#include "tensorflow/stream_executor/platform/port.h"
-#include <type_traits>
-#include <utility>
-
-#include "tensorflow/stream_executor/lib/error.h"
-#include "tensorflow/stream_executor/lib/status.h"
-#include "tensorflow/stream_executor/platform/logging.h"
-#include "tensorflow/stream_executor/platform/port.h"
+#include "tensorflow/compiler/xla/statusor.h"
 
 namespace perftools {
 namespace gputools {
 namespace port {
 
-template<typename T>
-class StatusOr {
-  template<typename U> friend class StatusOr;
-
- public:
-  // Construct a new StatusOr with Status::UNKNOWN status
-  StatusOr() : status_(error::UNKNOWN, "") {}
-
-  // Construct a new StatusOr with the given non-ok status. After calling
-  // this constructor, calls to ValueOrDie() is invalid.
-  //
-  // NOTE: Not explicit - we want to use StatusOr<T> as a return
-  // value, so it is convenient and sensible to be able to do 'return
-  // Status()' when the return type is StatusOr<T>.
-  //
-  // REQUIRES: status != Status::OK.
-  // In optimized builds, passing Status::OK here will have the effect
-  // of passing PosixErrorSpace::EINVAL as a fallback.
-  StatusOr(const Status& status);  // NOLINT
-
-  // Construct a new StatusOr with the given value. If T is a plain pointer,
-  // value must not be NULL. After calling this constructor, calls to
-  // ValueOrDie() will succeed, and calls to status() will return OK.
-  //
-  // NOTE: Not explicit - we want to use StatusOr<T> as a return type
-  // so it is convenient and sensible to be able to do 'return T()'
-  // when the return type is StatusOr<T>.
-  //
-  // REQUIRES: if T is a plain pointer, value != NULL.
-  // In optimized builds, passing a NULL pointer here will have
-  // the effect of passing PosixErrorSpace::EINVAL as a fallback.
-  StatusOr(const T& value);  // NOLINT
-
-  // Conversion copy constructor, T must be copy constructible from U
-  template <typename U>
-  StatusOr(const StatusOr<U>& other)  // NOLINT
-      : status_(other.status_),
-        value_(other.value_) {}
-
-  // Conversion assignment operator, T must be assignable from U
-  template <typename U>
-  StatusOr& operator=(const StatusOr<U>& other) {
-    status_ = other.status_;
-    value_ = other.value_;
-    return *this;
-  }
-
-  // Rvalue-reference overloads of the other constructors and assignment
-  // operators, to support move-only types and avoid unnecessary copying.
-  StatusOr(T&& value);  // NOLINT
-
-  // Move conversion operator to avoid unnecessary copy.
-  // T must be assignable from U.
-  // Not marked with explicit so the implicit conversion can happen.
-  template <typename U>
-  StatusOr(StatusOr<U>&& other)  // NOLINT
-      : status_(std::move(other.status_)),
-        value_(std::move(other.value_)) {}
-
-  // Move assignment operator to avoid unnecessary copy.
-  // T must be assignable from U
-  template <typename U>
-  StatusOr& operator=(StatusOr<U>&& other) {
-    status_ = std::move(other.status_);
-    value_ = std::move(other.value_);
-    return *this;
-  }
-
-  // Returns a reference to our status. If this contains a T, then
-  // returns Status::OK.
-  const Status& status() const { return status_; }
-
-  // Returns this->status().ok()
-  bool ok() const { return status_.ok(); }
-
-  // Returns a reference to our current value, requires that this->ok().
-  // If you need to initialize a T object from the stored value,
-  // ConsumeValueOrDie() may be more efficient.
-  const T& ValueOrDie() const;
-  T& ValueOrDie();
-
-  // Returns our current value, requires this->ok(). Use this if
-  // you would otherwise want to say std::move(s.ValueOrDie()), for example
-  // if you need to initialize a T object from the stored value and you don't
-  // need subsequent access to the stored value. It uses T's move constructor,
-  // if it has one, so it will work with move-only types, and will often be
-  // more efficient than ValueOrDie, but may leave the stored value
-  // in an arbitrary valid state.
-  T ConsumeValueOrDie();
-
- private:
-  Status status_;
-  T value_;
-
-  void CheckValueNotNull(const T& value);
-
-  template <typename U>
-  struct IsNull {
-    // For non-pointer U, a reference can never be NULL.
-    static inline bool IsValueNull(const U& t) { return false; }
-  };
-
-  template <typename U>
-  struct IsNull<U*> {
-    static inline bool IsValueNull(const U* t) { return t == NULL; }
-  };
-};
-
-////////////////////////////////////////////////////////////////////////////////
-// Implementation details for StatusOr<T>
-
-template <typename T>
-StatusOr<T>::StatusOr(const T& value)
-    : status_(), value_(value) {
-  CheckValueNotNull(value);
-}
-
-template <typename T>
-const T& StatusOr<T>::ValueOrDie() const {
-  TF_CHECK_OK(status_);
-  return value_;
-}
-
-template <typename T>
-T& StatusOr<T>::ValueOrDie() {
-  TF_CHECK_OK(status_);
-  return value_;
-}
-
-template <typename T>
-T StatusOr<T>::ConsumeValueOrDie() {
-  TF_CHECK_OK(status_);
-  return std::move(value_);
-}
-
-template <typename T>
-StatusOr<T>::StatusOr(const Status& status)
-    : status_(status) {
-  assert(!status.ok());
-  if (status.ok()) {
-    status_ =
-        Status(error::INTERNAL,
-               "Status::OK is not a valid constructor argument to StatusOr<T>");
-  }
-}
-
-template <typename T>
-StatusOr<T>::StatusOr(T&& value)
-    : status_() {
-  CheckValueNotNull(value);
-  value_ = std::move(value);
-}
-
+// Use XLA's StatusOr so we don't duplicate code.
 template <typename T>
-void StatusOr<T>::CheckValueNotNull(const T& value) {
-  assert(!IsNull<T>::IsValueNull(value));
-  if (IsNull<T>::IsValueNull(value)) {
-    status_ =
-        Status(error::INTERNAL,
-               "NULL is not a valid constructor argument to StatusOr<T*>");
-  }
-}
+using StatusOr = ::xla::StatusOr<T>;
 
 }  // namespace port
 }  // namespace gputools
diff --git a/tensorflow/stream_executor/lib/str_util.h b/tensorflow/stream_executor/lib/str_util.h
index 4dd6f3b0ccf112b281dd50467e9a16a672dbbbfb..5dd3d06affa424f0919f107c76ba40feeb165122 100644
--- a/tensorflow/stream_executor/lib/str_util.h
+++ b/tensorflow/stream_executor/lib/str_util.h
@@ -29,7 +29,7 @@ using tensorflow::str_util::Split;
 // Returns a copy of the input string 'str' with the given 'suffix'
 // removed. If the suffix doesn't match, returns a copy of the original string.
 inline string StripSuffixString(port::StringPiece str, port::StringPiece suffix) {
-  if (str.ends_with(suffix)) {
+  if (tensorflow::str_util::EndsWith(str, suffix)) {
     str.remove_suffix(suffix.size());
   }
   return str.ToString();
diff --git a/tensorflow/stream_executor/stream.cc b/tensorflow/stream_executor/stream.cc
index 1e3afde2687657e417e9e2cb3f5e2aaf0600da7a..fe498507a8019c3b0994a98fb52301a1e9e52764 100644
--- a/tensorflow/stream_executor/stream.cc
+++ b/tensorflow/stream_executor/stream.cc
@@ -4795,7 +4795,8 @@ Stream &Stream::ThenRnnForward(
     const dnn::RnnStateTensorDescriptor &output_c_desc,
     DeviceMemory<Eigen::half> *output_c_data, bool is_training,
     ScratchAllocator *reserve_space_allocator,
-    ScratchAllocator *workspace_allocator) {
+    ScratchAllocator *workspace_allocator,
+    dnn::ProfileResult *output_profile_result) {
   // TODO(zhengxq): add VLOG PARAM calls.
   if (ok()) {
     if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
@@ -4803,7 +4804,8 @@ Stream &Stream::ThenRnnForward(
           this, rnn_desc, input_desc, input_data, input_h_desc, input_h_data,
           input_c_desc, input_c_data, params, output_desc, output_data,
           output_h_desc, output_h_data, output_c_desc, output_c_data,
-          is_training, reserve_space_allocator, workspace_allocator));
+          is_training, reserve_space_allocator, workspace_allocator,
+          output_profile_result));
     } else {
       SetError();
       LOG(WARNING) << "Attempting to call ThenRnnForward without DNN support";
@@ -4827,7 +4829,8 @@ Stream &Stream::ThenRnnForward(
     const dnn::RnnStateTensorDescriptor &output_c_desc,
     DeviceMemory<float> *output_c_data, bool is_training,
     ScratchAllocator *reserve_space_allocator,
-    ScratchAllocator *workspace_allocator) {
+    ScratchAllocator *workspace_allocator,
+    dnn::ProfileResult *output_profile_result) {
   // TODO(zhengxq): add VLOG PARAM calls.
   if (ok()) {
     if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
@@ -4835,7 +4838,8 @@ Stream &Stream::ThenRnnForward(
           this, rnn_desc, input_desc, input_data, input_h_desc, input_h_data,
           input_c_desc, input_c_data, params, output_desc, output_data,
           output_h_desc, output_h_data, output_c_desc, output_c_data,
-          is_training, reserve_space_allocator, workspace_allocator));
+          is_training, reserve_space_allocator, workspace_allocator,
+          output_profile_result));
     } else {
       SetError();
       LOG(WARNING) << "Attempting to call ThenRnnForward without DNN support";
@@ -4860,7 +4864,8 @@ Stream &Stream::ThenRnnForward(
     const dnn::RnnStateTensorDescriptor &output_c_desc,
     DeviceMemory<double> *output_c_data, bool is_training,
     ScratchAllocator *reserve_space_allocator,
-    ScratchAllocator *workspace_allocator) {
+    ScratchAllocator *workspace_allocator,
+    dnn::ProfileResult *output_profile_result) {
   // TODO(zhengxq): add VLOG PARAM calls.
   if (ok()) {
     if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
@@ -4868,7 +4873,8 @@ Stream &Stream::ThenRnnForward(
           this, rnn_desc, input_desc, input_data, input_h_desc, input_h_data,
           input_c_desc, input_c_data, params, output_desc, output_data,
           output_h_desc, output_h_data, output_c_desc, output_c_data,
-          is_training, reserve_space_allocator, workspace_allocator));
+          is_training, reserve_space_allocator, workspace_allocator,
+          output_profile_result));
     } else {
       SetError();
       LOG(WARNING) << "Attempting to call ThenRnnForward without DNN support";
@@ -4900,7 +4906,8 @@ Stream &Stream::ThenRnnBackward(
     DeviceMemory<Eigen::half> *input_c_backprop_data,
     DeviceMemory<Eigen::half> *params_backprop_data,
     DeviceMemory<uint8> *reserve_space_data,
-    ScratchAllocator *workspace_allocator) {
+    ScratchAllocator *workspace_allocator,
+    dnn::ProfileResult *output_profile_result) {
   // TODO(zhengxq): add VLOG PARAM calls.
   if (ok()) {
     if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
@@ -4910,7 +4917,8 @@ Stream &Stream::ThenRnnBackward(
           output_h_desc, output_h_data, output_c_desc, output_c_data,
           output_backprop_data, output_h_backprop_data, output_c_backprop_data,
           input_backprop_data, input_h_backprop_data, input_c_backprop_data,
-          params_backprop_data, reserve_space_data, workspace_allocator));
+          params_backprop_data, reserve_space_data, workspace_allocator,
+          output_profile_result));
     } else {
       SetError();
       LOG(WARNING) << "Attempting to call ThenRnnBackward without DNN support";
@@ -4941,7 +4949,8 @@ Stream &Stream::ThenRnnBackward(
     DeviceMemory<float> *input_c_backprop_data,
     DeviceMemory<float> *params_backprop_data,
     DeviceMemory<uint8> *reserve_space_data,
-    ScratchAllocator *workspace_allocator) {
+    ScratchAllocator *workspace_allocator,
+    dnn::ProfileResult *output_profile_result) {
   // TODO(zhengxq): add VLOG PARAM calls.
   if (ok()) {
     if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
@@ -4951,7 +4960,8 @@ Stream &Stream::ThenRnnBackward(
           output_h_desc, output_h_data, output_c_desc, output_c_data,
           output_backprop_data, output_h_backprop_data, output_c_backprop_data,
           input_backprop_data, input_h_backprop_data, input_c_backprop_data,
-          params_backprop_data, reserve_space_data, workspace_allocator));
+          params_backprop_data, reserve_space_data, workspace_allocator,
+          output_profile_result));
     } else {
       SetError();
       LOG(WARNING) << "Attempting to call ThenRnnBackward without DNN support";
@@ -4983,7 +4993,8 @@ Stream &Stream::ThenRnnBackward(
     DeviceMemory<double> *input_c_backprop_data,
     DeviceMemory<double> *params_backprop_data,
     DeviceMemory<uint8> *reserve_space_data,
-    ScratchAllocator *workspace_allocator) {
+    ScratchAllocator *workspace_allocator,
+    dnn::ProfileResult *output_profile_result) {
   // TODO(zhengxq): add VLOG PARAM calls.
   if (ok()) {
     if (dnn::DnnSupport *dnn = parent_->AsDnn()) {
@@ -4993,7 +5004,8 @@ Stream &Stream::ThenRnnBackward(
           output_h_desc, output_h_data, output_c_desc, output_c_data,
           output_backprop_data, output_h_backprop_data, output_c_backprop_data,
           input_backprop_data, input_h_backprop_data, input_c_backprop_data,
-          params_backprop_data, reserve_space_data, workspace_allocator));
+          params_backprop_data, reserve_space_data, workspace_allocator,
+          output_profile_result));
     } else {
       SetError();
       LOG(WARNING) << "Attempting to call ThenRnnBackward without DNN support";
diff --git a/tensorflow/stream_executor/stream.h b/tensorflow/stream_executor/stream.h
index d7d11315699b85cae4d479b79bc8fc2717b2d8fb..4af426001f27a7983ce3a0832e53a1ce2c0ebd8e 100644
--- a/tensorflow/stream_executor/stream.h
+++ b/tensorflow/stream_executor/stream.h
@@ -1802,7 +1802,8 @@ class Stream {
                          DeviceMemory<Eigen::half> *output_c_data,
                          bool is_training,
                          ScratchAllocator *reserve_space_allocator,
-                         ScratchAllocator *workspace_allocator);
+                         ScratchAllocator *workspace_allocator,
+                         dnn::ProfileResult *output_profile_result);
 
   Stream &ThenRnnForward(const dnn::RnnDescriptor &rnn_desc,
                          const dnn::RnnSequenceTensorDescriptor &input_desc,
@@ -1819,7 +1820,8 @@ class Stream {
                          const dnn::RnnStateTensorDescriptor &output_c_desc,
                          DeviceMemory<float> *output_c_data, bool is_training,
                          ScratchAllocator *reserve_space_allocator,
-                         ScratchAllocator *workspace_allocator);
+                         ScratchAllocator *workspace_allocator,
+                         dnn::ProfileResult *output_profile_result);
 
   Stream &ThenRnnForward(const dnn::RnnDescriptor &rnn_desc,
                          const dnn::RnnSequenceTensorDescriptor &input_desc,
@@ -1836,7 +1838,8 @@ class Stream {
                          const dnn::RnnStateTensorDescriptor &output_c_desc,
                          DeviceMemory<double> *output_c_data, bool is_training,
                          ScratchAllocator *reserve_space_allocator,
-                         ScratchAllocator *workspace_allocator);
+                         ScratchAllocator *workspace_allocator,
+                         dnn::ProfileResult *output_profile_result);
 
   // Enqueue a backward operation of the RNN model onto the stream.
   // See DnnSupport::DoRnnBackward for more details.
@@ -1863,7 +1866,8 @@ class Stream {
       DeviceMemory<Eigen::half> *input_c_backprop_data,
       DeviceMemory<Eigen::half> *params_backprop_data,
       DeviceMemory<uint8> *reserve_space_data,
-      ScratchAllocator *workspace_allocator);
+      ScratchAllocator *workspace_allocator,
+      dnn::ProfileResult *output_profile_result);
 
   Stream &ThenRnnBackward(const dnn::RnnDescriptor &rnn_desc,
                           const dnn::RnnSequenceTensorDescriptor &input_desc,
@@ -1887,7 +1891,8 @@ class Stream {
                           DeviceMemory<float> *input_c_backprop_data,
                           DeviceMemory<float> *params_backprop_data,
                           DeviceMemory<uint8> *reserve_space_data,
-                          ScratchAllocator *workspace_allocator);
+                          ScratchAllocator *workspace_allocator,
+                          dnn::ProfileResult *output_profile_result);
 
   Stream &ThenRnnBackward(const dnn::RnnDescriptor &rnn_desc,
                           const dnn::RnnSequenceTensorDescriptor &input_desc,
@@ -1911,7 +1916,8 @@ class Stream {
                           DeviceMemory<double> *input_c_backprop_data,
                           DeviceMemory<double> *params_backprop_data,
                           DeviceMemory<uint8> *reserve_space_data,
-                          ScratchAllocator *workspace_allocator);
+                          ScratchAllocator *workspace_allocator,
+                          dnn::ProfileResult *output_profile_result);
 
   // Enqueue onto the stream a operation that transforms a tensor.
   // See DnnSupport::DoTransformTensor for more details.
diff --git a/tensorflow/stream_executor/stream_executor_pimpl.cc b/tensorflow/stream_executor/stream_executor_pimpl.cc
index afca1c2e597b55b1b8d0b76d4e79995d6f6af822..f55fa68402953b0e084c7f688b9481cfa0ec1b7e 100644
--- a/tensorflow/stream_executor/stream_executor_pimpl.cc
+++ b/tensorflow/stream_executor/stream_executor_pimpl.cc
@@ -305,6 +305,15 @@ bool StreamExecutor::GetConvolveAlgorithms(
                                             cc_minor, out_algorithms);
 }
 
+bool StreamExecutor::GetRnnAlgorithms(
+    std::vector<dnn::AlgorithmDesc> *out_algorithms) {
+  dnn::DnnSupport *dnn_support = AsDnn();
+  if (!dnn_support) {
+    return false;
+  }
+  return dnn_support->GetRnnAlgorithms(out_algorithms);
+}
+
 bool StreamExecutor::GetConvolveBackwardDataAlgorithms(
     bool with_winograd_nonfused,
     std::vector<dnn::AlgorithmDesc> *out_algorithms) {
@@ -344,7 +353,8 @@ port::StatusOr<std::unique_ptr<dnn::RnnDescriptor>>
 StreamExecutor::createRnnDescriptor(
     int num_layers, int hidden_size, int input_size,
     dnn::RnnInputMode input_mode, dnn::RnnDirectionMode direction_mode,
-    dnn::RnnMode rnn_mode, dnn::DataType data_type, float dropout, uint64 seed,
+    dnn::RnnMode rnn_mode, dnn::DataType data_type,
+    const dnn::AlgorithmConfig &algorithm_config, float dropout, uint64 seed,
     ScratchAllocator *state_allocator) {
   dnn::DnnSupport *dnn_support = AsDnn();
   if (!dnn_support) {
@@ -353,7 +363,7 @@ StreamExecutor::createRnnDescriptor(
   }
   return dnn_support->createRnnDescriptor(
       num_layers, hidden_size, input_size, input_mode, direction_mode, rnn_mode,
-      data_type, dropout, seed, state_allocator);
+      data_type, algorithm_config, dropout, seed, state_allocator);
 }
 
 port::StatusOr<std::unique_ptr<dnn::RnnSequenceTensorDescriptor>>
diff --git a/tensorflow/stream_executor/stream_executor_pimpl.h b/tensorflow/stream_executor/stream_executor_pimpl.h
index a2a77218cbbafeeb9d4d8ca04b2e0a8a5024ebf9..69d0374d73d116ed8e19ace2329f6edb360bbea5 100644
--- a/tensorflow/stream_executor/stream_executor_pimpl.h
+++ b/tensorflow/stream_executor/stream_executor_pimpl.h
@@ -349,10 +349,14 @@ class StreamExecutor {
   // platform that underlies this interface.
   bool SupportsDnn() const;
 
-  // Get the list of supported algorithms for the forward convolution opeartion.
+  // Returns the list of supported algorithms for the forward convolution
+  // operation.
   bool GetConvolveAlgorithms(bool with_winograd_nonfused,
                              std::vector<dnn::AlgorithmDesc> *out_algorithms);
 
+  // Returns the list of supported algorithms for rnn operation.
+  bool GetRnnAlgorithms(std::vector<dnn::AlgorithmDesc> *out_algorithms);
+
   // Get the list of supported algorithms for the backward convolution on data.
   bool GetConvolveBackwardDataAlgorithms(
       bool with_winograd_nonfused,
@@ -372,8 +376,9 @@ class StreamExecutor {
   port::StatusOr<std::unique_ptr<dnn::RnnDescriptor>> createRnnDescriptor(
       int num_layers, int hidden_size, int input_size,
       dnn::RnnInputMode input_mode, dnn::RnnDirectionMode direction_mode,
-      dnn::RnnMode rnn_mode, dnn::DataType data_type, float dropout,
-      uint64 seed, ScratchAllocator *state_allocator);
+      dnn::RnnMode rnn_mode, dnn::DataType data_type,
+      const dnn::AlgorithmConfig &algorithm_config, float dropout, uint64 seed,
+      ScratchAllocator *state_allocator);
 
   // Create a RNN sequence descriptor that specifies either the input or output
   // sequence. The caller retains the ownership of the returned descriptor.
diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl
index fcc57d506e38205d8da605653ed67fb645102c35..528f811b40ad7711407c856af804cbe2829d8b32 100644
--- a/tensorflow/tensorflow.bzl
+++ b/tensorflow/tensorflow.bzl
@@ -304,6 +304,7 @@ def tf_cc_shared_object(
           clean_dep("//tensorflow:darwin"): [
               "-Wl,-install_name,@rpath/" + name.split("/")[-1],
           ],
+          clean_dep("//tensorflow:windows"): [],
           "//conditions:default": [
               "-Wl,-soname," + name.split("/")[-1],
           ],
@@ -342,6 +343,22 @@ register_extension_info(
     label_regex_for_dep = "{extension_name}.*",
 )
 
+# A simple wrap around native.cc_binary rule.
+# When using this rule, you should realize it doesn't link to any tensorflow
+# dependencies by default.
+def tf_native_cc_binary(name,
+                        copts=tf_copts(),
+                        **kwargs):
+  native.cc_binary(
+      name=name,
+      copts=copts,
+      **kwargs)
+
+register_extension_info(
+    extension_name = "tf_native_cc_binary",
+    label_regex_for_dep = "{extension_name}.*",
+)
+
 def tf_gen_op_wrapper_cc(name,
                          out_ops_file,
                          pkg="",
@@ -622,9 +639,12 @@ def tf_cc_test(name,
       linkopts=select({
         clean_dep("//tensorflow:android"): [
             "-pie",
-          ],
+        ],
         clean_dep("//tensorflow:windows"): [],
         clean_dep("//tensorflow:windows_msvc"): [],
+        clean_dep("//tensorflow:darwin"): [
+            "-lm",
+        ],
         "//conditions:default": [
             "-lpthread",
             "-lm"
@@ -790,7 +810,33 @@ def tf_cc_test_mkl(srcs,
                    tags=[],
                    size="medium",
                    args=None):
-  if_mkl(tf_cc_tests(srcs, deps, name, linkstatic=linkstatic, tags=tags, size=size, args=args, nocopts="-fno-exceptions"))
+  for src in srcs:
+    native.cc_test(
+      name=src_to_test_name(src),
+      srcs=if_mkl([src]) + tf_binary_additional_srcs(),
+      copts=tf_copts(),
+      linkopts=select({
+        clean_dep("//tensorflow:android"): [
+            "-pie",
+          ],
+        clean_dep("//tensorflow:windows"): [],
+        clean_dep("//tensorflow:windows_msvc"): [],
+        "//conditions:default": [
+            "-lpthread",
+            "-lm"
+        ],
+      }) + _rpath_linkopts(src_to_test_name(src)),
+      deps=deps + if_mkl(
+          [
+              "//third_party/mkl:intel_binary_blob",
+          ],
+      ),
+      linkstatic=linkstatic,
+      tags=tags,
+      size=size,
+      args=args,
+      nocopts="-fno-exceptions")
+
 
 def tf_cc_tests_gpu(srcs,
                     deps,
@@ -910,6 +956,7 @@ def tf_cuda_library(deps=None, cuda_deps=None, copts=tf_copts(), **kwargs):
   if 'linkstatic' not in kwargs or kwargs['linkstatic'] != 1:
     enable_text_relocation_linkopt = select({
           clean_dep("//tensorflow:darwin"): [],
+          clean_dep("//tensorflow:windows"): [],
           "//conditions:default": ['-Wl,-z,notext'],})
     if 'linkopts' in kwargs:
       kwargs['linkopts'] += enable_text_relocation_linkopt
@@ -1008,16 +1055,12 @@ register_extension_info(
 def tf_mkl_kernel_library(name,
                           prefix=None,
                           srcs=None,
-                          gpu_srcs=None,
                           hdrs=None,
                           deps=None,
                           alwayslink=1,
                           copts=tf_copts(),
-                          nocopts="-fno-exceptions",
-                          **kwargs):
+                          nocopts="-fno-exceptions"):
   """A rule to build MKL-based TensorFlow kernel libraries."""
-  gpu_srcs = gpu_srcs  # unused argument
-  kwargs = kwargs  # unused argument
 
   if not bool(srcs):
     srcs = []
@@ -1030,16 +1073,15 @@ def tf_mkl_kernel_library(name,
     hdrs = hdrs + native.glob(
         [prefix + "*.h"])
 
-  if_mkl(
-      native.cc_library(
-          name=name,
-          srcs=srcs,
-          hdrs=hdrs,
-          deps=deps,
-          alwayslink=alwayslink,
-          copts=copts,
-          nocopts=nocopts
-      ))
+  native.cc_library(
+      name=name,
+      srcs=if_mkl(srcs),
+      hdrs=hdrs,
+      deps=deps,
+      alwayslink=alwayslink,
+      copts=copts,
+      nocopts=nocopts
+  )
 
 register_extension_info(
     extension_name = "tf_mkl_kernel_library",
@@ -1178,6 +1220,20 @@ def tf_custom_op_library_additional_deps():
       "@protobuf_archive//:protobuf_headers",
       clean_dep("//third_party/eigen3"),
       clean_dep("//tensorflow/core:framework_headers_lib"),
+  ] + if_windows(["//tensorflow/python:pywrap_tensorflow_import_lib"])
+
+# A list of targets that contains the implemenation of
+# tf_custom_op_library_additional_deps. It's used to generate a DEF file for
+# exporting symbols from _pywrap_tensorflow.dll on Windows.
+def tf_custom_op_library_additional_deps_impl():
+  return [
+      "@protobuf_archive//:protobuf",
+      "@nsync//:nsync_cpp",
+      # for //third_party/eigen3
+      clean_dep("//third_party/eigen3"),
+      # for //tensorflow/core:framework_headers_lib
+      clean_dep("//tensorflow/core:framework"),
+      clean_dep("//tensorflow/core:reader_base"),
   ]
 
 # Traverse the dependency graph along the "deps" attribute of the
@@ -1264,6 +1320,7 @@ def tf_custom_op_library(name, srcs=[], gpu_srcs=[], deps=[], linkopts=[]):
       deps=deps + if_cuda(cuda_deps),
       data=[name + "_check_deps"],
       copts=tf_copts(is_external=True),
+      features = ["windows_export_all_symbols"],
       linkopts=linkopts + select({
           "//conditions:default": [
               "-lm",
@@ -1410,7 +1467,8 @@ def tf_py_wrap_cc(name,
       ]) + tf_extension_copts()),
       linkopts=tf_extension_linkopts() + extra_linkopts,
       linkstatic=1,
-      deps=deps + extra_deps)
+      deps=deps + extra_deps,
+      **kwargs)
   native.genrule(
       name="gen_" + cc_library_pyd_name,
       srcs=[":" + cc_library_name],
diff --git a/tensorflow/tools/api/generator/BUILD b/tensorflow/tools/api/generator/BUILD
index d9b0260c9f254f0b609ecc9094789085bb6586d4..a1c569951e99162c8048b7b760c25df7b2f29420 100644
--- a/tensorflow/tools/api/generator/BUILD
+++ b/tensorflow/tools/api/generator/BUILD
@@ -5,18 +5,6 @@ licenses(["notice"])  # Apache 2.0
 
 exports_files(["LICENSE"])
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 py_binary(
     name = "create_python_api",
     srcs = ["create_python_api.py"],
@@ -44,6 +32,7 @@ genrule(
     # api/module1/module2/__init__.py and api/module3/__init__.py.
     # keep sorted
     outs = [
+        # BEGIN GENERATED FILES
         "api/__init__.py",
         "api/app/__init__.py",
         "api/bitwise/__init__.py",
@@ -105,6 +94,7 @@ genrule(
         "api/logging/__init__.py",
         "api/losses/__init__.py",
         "api/manip/__init__.py",
+        "api/math/__init__.py",
         "api/metrics/__init__.py",
         "api/nn/__init__.py",
         "api/nn/rnn_cell/__init__.py",
@@ -128,6 +118,7 @@ genrule(
         "api/train/__init__.py",
         "api/train/queue_runner/__init__.py",
         "api/user_ops/__init__.py",
+        # END GENERATED FILES
     ],
     cmd = "$(location create_python_api) $(OUTS)",
     tools = ["create_python_api"],
diff --git a/tensorflow/tools/api/generator/create_python_api.py b/tensorflow/tools/api/generator/create_python_api.py
index 183c4731b8176ece16a70bac421291fd76d748cb..c7748f5b7a7c1013f11e053e7a36ddfd9594c6ea 100644
--- a/tensorflow/tools/api/generator/create_python_api.py
+++ b/tensorflow/tools/api/generator/create_python_api.py
@@ -67,18 +67,23 @@ def format_import(source_module_name, source_name, dest_name):
       return 'import %s as %s' % (source_name, dest_name)
 
 
-class _ModuleImportsBuilder(object):
+class _ModuleInitCodeBuilder(object):
   """Builds a map from module name to imports included in that module."""
 
   def __init__(self):
-    self.module_imports = collections.defaultdict(list)
-    self._seen_api_names = set()
+    self.module_imports = collections.defaultdict(
+        lambda: collections.defaultdict(set))
+    self._dest_import_to_id = collections.defaultdict(int)
+    # Names that start with underscore in the root module.
+    self._underscore_names_in_root = []
 
   def add_import(
-      self, dest_module_name, source_module_name, source_name, dest_name):
+      self, symbol_id, dest_module_name, source_module_name, source_name,
+      dest_name):
     """Adds this import to module_imports.
 
     Args:
+      symbol_id: (number) Unique identifier of the symbol to import.
       dest_module_name: (string) Module name to add import to.
       source_module_name: (string) Module to import from.
       source_name: (string) Name of the symbol to import.
@@ -89,40 +94,74 @@ class _ModuleImportsBuilder(object):
         dest_name has already been added to dest_module_name.
     """
     import_str = format_import(source_module_name, source_name, dest_name)
-    if import_str in self.module_imports[dest_module_name]:
-      return
 
     # Check if we are trying to expose two different symbols with same name.
     full_api_name = dest_name
     if dest_module_name:
       full_api_name = dest_module_name + '.' + full_api_name
-    if full_api_name in self._seen_api_names:
+    if (full_api_name in self._dest_import_to_id and
+        symbol_id != self._dest_import_to_id[full_api_name] and
+        symbol_id != -1):
       raise SymbolExposedTwiceError(
           'Trying to export multiple symbols with same name: %s.' %
           full_api_name)
-    self._seen_api_names.add(full_api_name)
+    self._dest_import_to_id[full_api_name] = symbol_id
 
-    self.module_imports[dest_module_name].append(import_str)
+    if not dest_module_name and dest_name.startswith('_'):
+      self._underscore_names_in_root.append(dest_name)
 
+    # The same symbol can be available in multiple modules.
+    # We store all possible ways of importing this symbol and later pick just
+    # one.
+    self.module_imports[dest_module_name][full_api_name].add(import_str)
 
-def get_api_imports():
-  """Get a map from destination module to formatted imports.
+  def build(self):
+    """Get a map from destination module to __init__.py code for that module.
+
+    Returns:
+      A dictionary where
+        key: (string) destination module (for e.g. tf or tf.consts).
+        value: (string) text that should be in __init__.py files for
+          corresponding modules.
+    """
+    module_text_map = {}
+    for dest_module, dest_name_to_imports in self.module_imports.items():
+      # Sort all possible imports for a symbol and pick the first one.
+      imports_list = [
+          sorted(imports)[0]
+          for _, imports in dest_name_to_imports.items()]
+      module_text_map[dest_module] = '\n'.join(sorted(imports_list))
+
+    # Expose exported symbols with underscores in root module
+    # since we import from it using * import.
+    underscore_names_str = ', '.join(
+        '\'%s\'' % name for name in self._underscore_names_in_root)
+    module_text_map[''] += '''
+_names_with_underscore = [%s]
+__all__ = [s for s in dir() if not s.startswith('_')]
+__all__.extend([s for s in _names_with_underscore])
+''' % underscore_names_str
+
+    return module_text_map
+
+
+def get_api_init_text():
+  """Get a map from destination module to __init__.py code for that module.
 
   Returns:
     A dictionary where
       key: (string) destination module (for e.g. tf or tf.consts).
-      value: List of strings representing module imports
-          (for e.g. 'from foo import bar') and constant
-          assignments (for e.g. 'FOO = 123').
+      value: (string) text that should be in __init__.py files for
+        corresponding modules.
   """
-  module_imports_builder = _ModuleImportsBuilder()
-  visited_symbols = set()
+  module_code_builder = _ModuleInitCodeBuilder()
 
   # Traverse over everything imported above. Specifically,
   # we want to traverse over TensorFlow Python modules.
   for module in sys.modules.values():
     # Only look at tensorflow modules.
-    if not module or 'tensorflow.' not in module.__name__:
+    if (not module or not hasattr(module, "__name__") or
+        'tensorflow.' not in module.__name__):
       continue
     # Do not generate __init__.py files for contrib modules for now.
     if '.contrib.' in module.__name__ or module.__name__.endswith('.contrib'):
@@ -130,8 +169,6 @@ def get_api_imports():
 
     for module_contents_name in dir(module):
       attr = getattr(module, module_contents_name)
-      if id(attr) in visited_symbols:
-        continue
 
       # If attr is _tf_api_constants attribute, then add the constants.
       if module_contents_name == _API_CONSTANTS_ATTR:
@@ -139,30 +176,25 @@ def get_api_imports():
           for export in exports:
             names = export.split('.')
             dest_module = '.'.join(names[:-1])
-            module_imports_builder.add_import(
-                dest_module, module.__name__, value, names[-1])
+            module_code_builder.add_import(
+                -1, dest_module, module.__name__, value, names[-1])
         continue
 
       _, attr = tf_decorator.unwrap(attr)
       # If attr is a symbol with _tf_api_names attribute, then
       # add import for it.
       if hasattr(attr, '__dict__') and _API_NAMES_ATTR in attr.__dict__:
-        # If the same symbol is available using multiple names, only create
-        # imports for it once.
-        if id(attr) in visited_symbols:
-          continue
-        visited_symbols.add(id(attr))
-
         for export in attr._tf_api_names:  # pylint: disable=protected-access
           names = export.split('.')
           dest_module = '.'.join(names[:-1])
-          module_imports_builder.add_import(
-              dest_module, module.__name__, module_contents_name, names[-1])
+          module_code_builder.add_import(
+              id(attr), dest_module, module.__name__, module_contents_name,
+              names[-1])
 
   # Import all required modules in their parent modules.
   # For e.g. if we import 'foo.bar.Value'. Then, we also
   # import 'bar' in 'foo'.
-  imported_modules = set(module_imports_builder.module_imports.keys())
+  imported_modules = set(module_code_builder.module_imports.keys())
   for module in imported_modules:
     if not module:
       continue
@@ -175,11 +207,11 @@ def get_api_imports():
         parent_module += ('.' + module_split[submodule_index-1] if parent_module
                           else module_split[submodule_index-1])
         import_from += '.' + parent_module
-      module_imports_builder.add_import(
-          parent_module, import_from, module_split[submodule_index],
-          module_split[submodule_index])
+      module_code_builder.add_import(
+          -1, parent_module, import_from,
+          module_split[submodule_index], module_split[submodule_index])
 
-  return module_imports_builder.module_imports
+  return module_code_builder.build()
 
 
 def create_api_files(output_files):
@@ -195,16 +227,19 @@ def create_api_files(output_files):
   """
   module_name_to_file_path = {}
   for output_file in output_files:
+    # Convert path separators to '/' for easier parsing below.
+    normalized_output_file = output_file.replace(os.sep, '/')
     if _API_DIR not in output_file:
       raise ValueError(
           'Output files must be in api/ directory, found %s.' % output_file)
     # Get the module name that corresponds to output_file.
     # First get module directory under _API_DIR.
     module_dir = os.path.dirname(
-        output_file[output_file.rfind(_API_DIR)+len(_API_DIR):])
+        normalized_output_file[
+            normalized_output_file.rfind(_API_DIR)+len(_API_DIR):])
     # Convert / to .
     module_name = module_dir.replace('/', '.').strip('.')
-    module_name_to_file_path[module_name] = output_file
+    module_name_to_file_path[module_name] = os.path.normpath(output_file)
 
   # Create file for each expected output in genrule.
   for module, file_path in module_name_to_file_path.items():
@@ -212,11 +247,11 @@ def create_api_files(output_files):
       os.makedirs(os.path.dirname(file_path))
     open(file_path, 'a').close()
 
-  module_imports = get_api_imports()
+  module_text_map = get_api_init_text()
 
   # Add imports to output files.
   missing_output_files = []
-  for module, exports in module_imports.items():
+  for module, text in module_text_map.items():
     # Make sure genrule output file list is in sync with API exports.
     if module not in module_name_to_file_path:
       module_file_path = '"api/%s/__init__.py"' %  (
@@ -224,7 +259,7 @@ def create_api_files(output_files):
       missing_output_files.append(module_file_path)
       continue
     with open(module_name_to_file_path[module], 'w') as fp:
-      fp.write(_GENERATED_FILE_HEADER + '\n'.join(exports))
+      fp.write(_GENERATED_FILE_HEADER + text)
 
   if missing_output_files:
     raise ValueError(
@@ -241,6 +276,16 @@ if __name__ == '__main__':
   parser = argparse.ArgumentParser()
   parser.add_argument(
       'outputs', metavar='O', type=str, nargs='+',
-      help='Python files that we expect this script to output.')
+      help='If a single file is passed in, then we we assume it contains a '
+      'semicolon-separated list of Python files that we expect this script to '
+      'output. If multiple files are passed in, then we assume output files '
+      'are listed directly as arguments.')
   args = parser.parse_args()
-  main(args.outputs)
+  if len(args.outputs) == 1:
+    # If we only get a single argument, then it must be a file containing
+    # list of outputs.
+    with open(args.outputs[0]) as output_list_file:
+      outputs = [line.strip() for line in output_list_file.read().split(';')]
+  else:
+    outputs = args.outputs
+  main(outputs)
diff --git a/tensorflow/tools/api/generator/create_python_api_test.py b/tensorflow/tools/api/generator/create_python_api_test.py
index 2760779e6e0a909cb077f534db40710ab6a11b32..218c8120453c8dca6e81146eb06e8243a3cd424d 100644
--- a/tensorflow/tools/api/generator/create_python_api_test.py
+++ b/tensorflow/tools/api/generator/create_python_api_test.py
@@ -56,7 +56,7 @@ class CreatePythonApiTest(test.TestCase):
     del sys.modules[_MODULE_NAME]
 
   def testFunctionImportIsAdded(self):
-    imports = create_python_api.get_api_imports()
+    imports = create_python_api.get_api_init_text()
     expected_import = (
         'from test.tensorflow.test_module import test_op as test_op1')
     self.assertTrue(
@@ -69,14 +69,14 @@ class CreatePythonApiTest(test.TestCase):
         msg='%s not in %s' % (expected_import, str(imports)))
 
   def testClassImportIsAdded(self):
-    imports = create_python_api.get_api_imports()
+    imports = create_python_api.get_api_init_text()
     expected_import = 'from test.tensorflow.test_module import TestClass'
     self.assertTrue(
         'TestClass' in str(imports),
         msg='%s not in %s' % (expected_import, str(imports)))
 
   def testConstantIsAdded(self):
-    imports = create_python_api.get_api_imports()
+    imports = create_python_api.get_api_init_text()
     expected = 'from test.tensorflow.test_module import _TEST_CONSTANT'
     self.assertTrue(expected in str(imports),
                     msg='%s not in %s' % (expected, str(imports)))
diff --git a/tensorflow/tools/api/golden/BUILD b/tensorflow/tools/api/golden/BUILD
index 08436396a6c04a59461b6800b908c29aabb91a1b..ebdf42df2c01a60b1cadd0368647adc4121db7ef 100644
--- a/tensorflow/tools/api/golden/BUILD
+++ b/tensorflow/tools/api/golden/BUILD
@@ -10,15 +10,3 @@ filegroup(
     name = "api_golden",
     srcs = glob(["*.pbtxt"]),
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-boosted-trees-classifier.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.-boosted-trees-classifier.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fd9be8c75914ed37f5f36c4df5a14bd00caee20e
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.estimator.-boosted-trees-classifier.pbtxt
@@ -0,0 +1,54 @@
+path: "tensorflow.estimator.BoostedTreesClassifier"
+tf_class {
+  is_instance: "<class \'tensorflow.python.estimator.canned.boosted_trees.BoostedTreesClassifier\'>"
+  is_instance: "<class \'tensorflow.python.estimator.estimator.Estimator\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "config"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "model_dir"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "model_fn"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "params"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'feature_columns\', \'n_batches_per_layer\', \'model_dir\', \'n_classes\', \'weight_column\', \'label_vocabulary\', \'n_trees\', \'max_depth\', \'learning_rate\', \'l1_regularization\', \'l2_regularization\', \'tree_complexity\', \'config\'], varargs=None, keywords=None, defaults=[\'None\', \'<object object instance>\', \'None\', \'None\', \'100\', \'6\', \'0.1\', \'0.0\', \'0.0\', \'0.0\', \'None\'], "
+  }
+  member_method {
+    name: "evaluate"
+    argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "export_savedmodel"
+    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'strip_default_attrs\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'False\'], "
+  }
+  member_method {
+    name: "get_variable_names"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_variable_value"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "latest_checkpoint"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "predict"
+    argspec: "args=[\'self\', \'input_fn\', \'predict_keys\', \'hooks\', \'checkpoint_path\', \'yield_single_examples\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\'], "
+  }
+  member_method {
+    name: "train"
+    argspec: "args=[\'self\', \'input_fn\', \'hooks\', \'steps\', \'max_steps\', \'saving_listeners\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-boosted-trees-regressor.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.-boosted-trees-regressor.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6b305be43f845ec15f9c160d5ea4823c6ae68897
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.estimator.-boosted-trees-regressor.pbtxt
@@ -0,0 +1,54 @@
+path: "tensorflow.estimator.BoostedTreesRegressor"
+tf_class {
+  is_instance: "<class \'tensorflow.python.estimator.canned.boosted_trees.BoostedTreesRegressor\'>"
+  is_instance: "<class \'tensorflow.python.estimator.estimator.Estimator\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "config"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "model_dir"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "model_fn"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "params"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'feature_columns\', \'n_batches_per_layer\', \'model_dir\', \'label_dimension\', \'weight_column\', \'n_trees\', \'max_depth\', \'learning_rate\', \'l1_regularization\', \'l2_regularization\', \'tree_complexity\', \'config\'], varargs=None, keywords=None, defaults=[\'None\', \'<object object instance>\', \'None\', \'100\', \'6\', \'0.1\', \'0.0\', \'0.0\', \'0.0\', \'None\'], "
+  }
+  member_method {
+    name: "evaluate"
+    argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "export_savedmodel"
+    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'strip_default_attrs\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'False\'], "
+  }
+  member_method {
+    name: "get_variable_names"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_variable_value"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "latest_checkpoint"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "predict"
+    argspec: "args=[\'self\', \'input_fn\', \'predict_keys\', \'hooks\', \'checkpoint_path\', \'yield_single_examples\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\'], "
+  }
+  member_method {
+    name: "train"
+    argspec: "args=[\'self\', \'input_fn\', \'hooks\', \'steps\', \'max_steps\', \'saving_listeners\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-run-config.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.-run-config.pbtxt
index 759ff752b0ea6b710a2d20fd9ad665b3e6e6ea82..05e603efb7cbad8c4c42a7a15074d2634af8d21c 100644
--- a/tensorflow/tools/api/golden/tensorflow.estimator.-run-config.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.estimator.-run-config.pbtxt
@@ -6,10 +6,6 @@ tf_class {
     name: "cluster_spec"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "distribute"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "evaluation_master"
     mtype: "<type \'property\'>"
@@ -82,9 +78,13 @@ tf_class {
     name: "tf_random_seed"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "train_distribute"
+    mtype: "<type \'property\'>"
+  }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'model_dir\', \'tf_random_seed\', \'save_summary_steps\', \'save_checkpoints_steps\', \'save_checkpoints_secs\', \'session_config\', \'keep_checkpoint_max\', \'keep_checkpoint_every_n_hours\', \'log_step_count_steps\', \'distribute\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'100\', \'<object object instance>\', \'<object object instance>\', \'None\', \'5\', \'10000\', \'100\', \'None\'], "
+    argspec: "args=[\'self\', \'model_dir\', \'tf_random_seed\', \'save_summary_steps\', \'save_checkpoints_steps\', \'save_checkpoints_secs\', \'session_config\', \'keep_checkpoint_max\', \'keep_checkpoint_every_n_hours\', \'log_step_count_steps\', \'train_distribute\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'100\', \'<object object instance>\', \'<object object instance>\', \'None\', \'5\', \'10000\', \'100\', \'None\'], "
   }
   member_method {
     name: "replace"
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.pbtxt
index a7a6cc1e49ddfe07569dff035e38931a0510addd..4946f2c51a62af85d61b8e38e982c59dd0d61e36 100644
--- a/tensorflow/tools/api/golden/tensorflow.estimator.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.estimator.pbtxt
@@ -8,6 +8,14 @@ tf_module {
     name: "BaselineRegressor"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "BoostedTreesClassifier"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "BoostedTreesRegressor"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "DNNClassifier"
     mtype: "<type \'type\'>"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.-model.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.-model.pbtxt
index 7be2f4f61f6b9637f372591e49efc0c93c7a8c0a..7713d78b8a505d464800ada0c84ca126213d95d0 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.-model.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.-model.pbtxt
@@ -3,7 +3,6 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.training.Model\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.network.Network\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -14,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -74,10 +69,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "state_updates"
     mtype: "<type \'property\'>"
@@ -128,7 +119,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.-sequential.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.-sequential.pbtxt
index bf361cf8054571c0b056e1373acb838aaea87173..69b81f75fa078856b4ed9fccd1037059efd90a0b 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.-sequential.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.-sequential.pbtxt
@@ -4,7 +4,6 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.training.Model\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.network.Network\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -15,10 +14,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -75,10 +70,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "state_updates"
     mtype: "<type \'property\'>"
@@ -133,7 +124,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-activation.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-activation.pbtxt
index db8f626b98b70fd99f38e696aa16c72e74e86e25..96272d1b7d61430188bbbf2680bd2beb9f1e9675 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-activation.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-activation.pbtxt
@@ -2,7 +2,6 @@ path: "tensorflow.keras.layers.Activation"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.core.Activation\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -13,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -103,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-activity-regularization.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-activity-regularization.pbtxt
index 809b3a5430449176a0d7423ec7f4499ceb620890..8fd55c8686de77ec764e9d564c78d0df4f545915 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-activity-regularization.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-activity-regularization.pbtxt
@@ -2,7 +2,6 @@ path: "tensorflow.keras.layers.ActivityRegularization"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.core.ActivityRegularization\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -13,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -103,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-add.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-add.pbtxt
index 68d41bb6cc258ca87d4664ac0fb9d5649f89ebaf..47d1532c3c8cf248f6e9a9a35e10b9559286263f 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-add.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-add.pbtxt
@@ -3,7 +3,6 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.merge.Add\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.merge._Merge\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -14,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -66,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -104,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-alpha-dropout.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-alpha-dropout.pbtxt
index 970b777e514194db4ac49fe58bea737b35436217..797d422a90a5ad21d0014a0003b11d281c25e579 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-alpha-dropout.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-alpha-dropout.pbtxt
@@ -2,7 +2,6 @@ path: "tensorflow.keras.layers.AlphaDropout"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.noise.AlphaDropout\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -13,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -103,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-average-pooling1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-average-pooling1-d.pbtxt
index 529c64ab293d596012aefd42e0695bd1eb7e44d1..269be1455b6bf3bbe325f3928584960578e3793d 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-average-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-average-pooling1-d.pbtxt
@@ -1,10 +1,8 @@
 path: "tensorflow.keras.layers.AveragePooling1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.AveragePooling1D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling.AveragePooling1D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling._Pooling1D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.Pooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -15,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -67,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -93,7 +83,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'pool_size\', \'strides\', \'padding\'], varargs=None, keywords=kwargs, defaults=[\'2\', \'None\', \'valid\'], "
+    argspec: "args=[\'self\', \'pool_size\', \'strides\', \'padding\', \'data_format\'], varargs=None, keywords=kwargs, defaults=[\'2\', \'None\', \'valid\', \'None\'], "
   }
   member_method {
     name: "add_loss"
@@ -105,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-average-pooling2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-average-pooling2-d.pbtxt
index 7e7c330d74fe3b71ecd0eb87e34719e47ae70784..344813621534dbb5de3719088c06313e55519dd7 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-average-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-average-pooling2-d.pbtxt
@@ -1,10 +1,8 @@
 path: "tensorflow.keras.layers.AveragePooling2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.AveragePooling2D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling.AveragePooling2D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling._Pooling2D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.Pooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -15,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -67,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -105,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-average-pooling3-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-average-pooling3-d.pbtxt
index ada8466d7473072b1878861ab36ec40b07fa1914..979008d0edb7d0f9d9c1246e1fcc7d2e2871d28c 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-average-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-average-pooling3-d.pbtxt
@@ -1,10 +1,8 @@
 path: "tensorflow.keras.layers.AveragePooling3D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.AveragePooling3D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling.AveragePooling3D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling._Pooling3D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.Pooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -15,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -67,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -105,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-average.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-average.pbtxt
index 2a5c1cd530a7a532f6cdd3c184f4ee7eb88d23d3..0ffdffd4cdee14fbfeb68f2575300632ab21d7a6 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-average.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-average.pbtxt
@@ -3,7 +3,6 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.merge.Average\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.merge._Merge\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -14,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -66,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -104,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-avg-pool1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-avg-pool1-d.pbtxt
index 9a2cb29815d59f3761ea25e9ea36ff6489c85b88..6b00f110eea2aaa551bffe8ec225042c5469210e 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-avg-pool1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-avg-pool1-d.pbtxt
@@ -1,10 +1,8 @@
 path: "tensorflow.keras.layers.AvgPool1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.AveragePooling1D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling.AveragePooling1D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling._Pooling1D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.Pooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -15,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -67,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -93,7 +83,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'pool_size\', \'strides\', \'padding\'], varargs=None, keywords=kwargs, defaults=[\'2\', \'None\', \'valid\'], "
+    argspec: "args=[\'self\', \'pool_size\', \'strides\', \'padding\', \'data_format\'], varargs=None, keywords=kwargs, defaults=[\'2\', \'None\', \'valid\', \'None\'], "
   }
   member_method {
     name: "add_loss"
@@ -105,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-avg-pool2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-avg-pool2-d.pbtxt
index f5e991ea42e5ee2723b64574d4598dc8463f1c8c..caff5a2f1db61c6958980446d3bd54009776e1a4 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-avg-pool2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-avg-pool2-d.pbtxt
@@ -1,10 +1,8 @@
 path: "tensorflow.keras.layers.AvgPool2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.AveragePooling2D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling.AveragePooling2D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling._Pooling2D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.Pooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -15,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -67,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -105,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-avg-pool3-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-avg-pool3-d.pbtxt
index 31732214a62524017e39776cdfb9ab629746e8ae..4a7239492177aae2ee098fc3033904d3d1a31ba4 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-avg-pool3-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-avg-pool3-d.pbtxt
@@ -1,10 +1,8 @@
 path: "tensorflow.keras.layers.AvgPool3D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.AveragePooling3D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling.AveragePooling3D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling._Pooling3D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.Pooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -15,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -67,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -105,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-batch-normalization.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-batch-normalization.pbtxt
index 422eddf10db6763e10405dba5537ca161d1b8994..9804394fa53d6b3c1ff136a73212863143bbdb39 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-batch-normalization.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-batch-normalization.pbtxt
@@ -1,9 +1,7 @@
 path: "tensorflow.keras.layers.BatchNormalization"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.normalization.BatchNormalization\'>"
-  is_instance: "<class \'tensorflow.python.layers.normalization.BatchNormalization\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -14,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -66,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -92,7 +82,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'axis\', \'momentum\', \'epsilon\', \'center\', \'scale\', \'beta_initializer\', \'gamma_initializer\', \'moving_mean_initializer\', \'moving_variance_initializer\', \'beta_regularizer\', \'gamma_regularizer\', \'beta_constraint\', \'gamma_constraint\'], varargs=None, keywords=kwargs, defaults=[\'-1\', \'0.99\', \'0.001\', \'True\', \'True\', \'zeros\', \'ones\', \'zeros\', \'ones\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'axis\', \'momentum\', \'epsilon\', \'center\', \'scale\', \'beta_initializer\', \'gamma_initializer\', \'moving_mean_initializer\', \'moving_variance_initializer\', \'beta_regularizer\', \'gamma_regularizer\', \'beta_constraint\', \'gamma_constraint\', \'renorm\', \'renorm_clipping\', \'renorm_momentum\', \'fused\', \'trainable\', \'virtual_batch_size\', \'adjustment\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'-1\', \'0.99\', \'0.001\', \'True\', \'True\', \'zeros\', \'ones\', \'zeros\', \'ones\', \'None\', \'None\', \'None\', \'None\', \'False\', \'None\', \'0.99\', \'None\', \'True\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "add_loss"
@@ -104,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-bidirectional.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-bidirectional.pbtxt
index 9053a37916314198842bc21b0608a9b69a64c264..5e5b04c7c695c6d88e7f42b77290a582be087763 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-bidirectional.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-bidirectional.pbtxt
@@ -3,7 +3,6 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.wrappers.Bidirectional\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.wrappers.Wrapper\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -18,10 +17,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -70,10 +65,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable"
     mtype: "<type \'property\'>"
@@ -112,11 +103,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-concatenate.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-concatenate.pbtxt
index 3d536d2182fc4480a2ee5fba177543ca21fbd5ac..b8eb4079b9eea3b054f9c2ad4298f6a1669de79e 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-concatenate.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-concatenate.pbtxt
@@ -3,7 +3,6 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.merge.Concatenate\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.merge._Merge\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -14,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -66,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -104,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt
index 6a7da1aef8db64ad11bb5a5ba357f33eeb99170b..3fdb101425d0f010f77cd70bc3721af269a36b0a 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt
@@ -1,22 +1,49 @@
 path: "tensorflow.keras.layers.ConvLSTM2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional_recurrent.ConvLSTM2D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional_recurrent.ConvRecurrent2D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.recurrent.Recurrent\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional_recurrent.ConvRNN2D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.recurrent.RNN\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "activation"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "activity_regularizer"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "bias_constraint"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "bias_initializer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "bias_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "data_format"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dilation_rate"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dropout"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
   member {
-    name: "graph"
+    name: "filters"
     mtype: "<type \'property\'>"
   }
   member {
@@ -35,6 +62,22 @@ tf_class {
     name: "input_shape"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "kernel_constraint"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "kernel_initializer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "kernel_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "kernel_size"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "losses"
     mtype: "<type \'property\'>"
@@ -68,7 +111,35 @@ tf_class {
     mtype: "<type \'property\'>"
   }
   member {
-    name: "scope_name"
+    name: "padding"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "recurrent_activation"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "recurrent_constraint"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "recurrent_dropout"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "recurrent_initializer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "recurrent_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "states"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "strides"
     mtype: "<type \'property\'>"
   }
   member {
@@ -79,10 +150,18 @@ tf_class {
     name: "trainable_weights"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "unit_forget_bias"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "updates"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "use_bias"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "variables"
     mtype: "<type \'property\'>"
@@ -105,11 +184,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -143,10 +222,6 @@ tf_class {
     name: "get_config"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
-  member_method {
-    name: "get_constants"
-    argspec: "args=[\'self\', \'inputs\', \'training\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
   member_method {
     name: "get_initial_state"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
@@ -187,28 +262,12 @@ tf_class {
     name: "get_weights"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
-  member_method {
-    name: "input_conv"
-    argspec: "args=[\'self\', \'x\', \'w\', \'b\', \'padding\'], varargs=None, keywords=None, defaults=[\'None\', \'valid\'], "
-  }
-  member_method {
-    name: "preprocess_input"
-    argspec: "args=[\'self\', \'inputs\', \'training\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "recurrent_conv"
-    argspec: "args=[\'self\', \'x\', \'w\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "reset_states"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'states\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "set_weights"
     argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
   }
-  member_method {
-    name: "step"
-    argspec: "args=[\'self\', \'inputs\', \'states\'], varargs=None, keywords=None, defaults=None"
-  }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv1-d.pbtxt
index 801a0339720919f8b3f6beee0f045d58b2c0a371..0be42471e35eeba224376b24aa846db69e011274 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv1-d.pbtxt
@@ -1,10 +1,8 @@
 path: "tensorflow.keras.layers.Conv1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv1D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional.Conv1D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional._Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -15,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -67,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -93,7 +83,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'dilation_rate\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'1\', \'valid\', \'1\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'1\', \'valid\', \'channels_last\', \'1\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "add_loss"
@@ -105,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv2-d-transpose.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv2-d-transpose.pbtxt
index 13352e264a5305190717bb973a3f2bce4d7f4fff..39ba31a70942811cbd36caf33c9bda90a5449703 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv2-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv2-d-transpose.pbtxt
@@ -1,11 +1,9 @@
 path: "tensorflow.keras.layers.Conv2DTranspose"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv2DTranspose\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional.Conv2DTranspose\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional.Conv2D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional._Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv2D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -16,10 +14,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -68,10 +62,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -106,11 +96,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv2-d.pbtxt
index f400e4a15c362037e85ac375cee98bb5f6358669..26d9d8c476f4e429ffe112cd490ad277b478c65b 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv2-d.pbtxt
@@ -1,10 +1,8 @@
 path: "tensorflow.keras.layers.Conv2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv2D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional.Conv2D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional._Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -15,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -67,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -105,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv3-d-transpose.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv3-d-transpose.pbtxt
index b3a9f573b8ba652d2544b21f36f65fe81a6ebb50..43611017fa37077c7ff05e690dfd50a0e6e5ae1e 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv3-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv3-d-transpose.pbtxt
@@ -1,11 +1,9 @@
 path: "tensorflow.keras.layers.Conv3DTranspose"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv3DTranspose\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional.Conv3DTranspose\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional.Conv3D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional._Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv3D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -16,10 +14,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -68,10 +62,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -106,11 +96,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv3-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv3-d.pbtxt
index a9be09c0abd19aeb4df30116ef2befc3948bfbf4..fa4925ab99d719104c5ca1a0003c25d85f78f3df 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv3-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-conv3-d.pbtxt
@@ -1,10 +1,8 @@
 path: "tensorflow.keras.layers.Conv3D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv3D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional.Conv3D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional._Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -15,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -67,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -105,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution1-d.pbtxt
index be1ef5eb928d16cc6bf78c289aa20d815c728b23..c5c5d5e7c083dca63972cb7033e675aa483d8039 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution1-d.pbtxt
@@ -1,10 +1,8 @@
 path: "tensorflow.keras.layers.Convolution1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv1D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional.Conv1D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional._Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -15,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -67,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -93,7 +83,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'dilation_rate\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'1\', \'valid\', \'1\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'1\', \'valid\', \'channels_last\', \'1\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "add_loss"
@@ -105,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt
index 30034f7eaf6d9073695353e5c8d9ead0cc8de7cc..36dc2d2e9a70fe7a1d32352a14dabbade2a2efc2 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt
@@ -1,11 +1,9 @@
 path: "tensorflow.keras.layers.Convolution2DTranspose"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv2DTranspose\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional.Conv2DTranspose\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional.Conv2D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional._Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv2D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -16,10 +14,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -68,10 +62,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -106,11 +96,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution2-d.pbtxt
index 189b38054c004facfeeff8ad2ae87848b89040f2..23ec74370bab010d2b5ba257502a32c8ca7e4a57 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution2-d.pbtxt
@@ -1,10 +1,8 @@
 path: "tensorflow.keras.layers.Convolution2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv2D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional.Conv2D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional._Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -15,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -67,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -105,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt
index a76d85c629c1fe620dafd62a0f0e05e9009109e2..0e4089c5785ccdcafbb8b3dc1ca75ffbe49d9434 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt
@@ -1,11 +1,9 @@
 path: "tensorflow.keras.layers.Convolution3DTranspose"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv3DTranspose\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional.Conv3DTranspose\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional.Conv3D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional._Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv3D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -16,10 +14,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -68,10 +62,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -106,11 +96,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution3-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution3-d.pbtxt
index 782195d4ad5883d8c0ea6a657cc10258f2080a55..23ddbe1a925e33432727d13dc875972136083056 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution3-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-convolution3-d.pbtxt
@@ -1,10 +1,8 @@
 path: "tensorflow.keras.layers.Convolution3D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv3D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional.Conv3D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional._Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -15,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -67,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -105,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-cropping1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-cropping1-d.pbtxt
index 2cb7a39ea595e1ff699b96554cb135377d20a488..e04ab6bea85baa1252c9c43f891f9ed5a9dedb87 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-cropping1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-cropping1-d.pbtxt
@@ -2,7 +2,6 @@ path: "tensorflow.keras.layers.Cropping1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Cropping1D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -13,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -103,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-cropping2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-cropping2-d.pbtxt
index 80803306992bba3b601824a93cb3086ef3947369..655314afffd1e4fb0987c913eb69edc4254c77ef 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-cropping2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-cropping2-d.pbtxt
@@ -2,7 +2,6 @@ path: "tensorflow.keras.layers.Cropping2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Cropping2D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -13,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -103,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-cropping3-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-cropping3-d.pbtxt
index 678f40bbc23db15ff7c1138169478fb4412a449d..d5215f1330a2fd0db4696a7b743d931e164a5a9f 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-cropping3-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-cropping3-d.pbtxt
@@ -2,7 +2,6 @@ path: "tensorflow.keras.layers.Cropping3D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Cropping3D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -13,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -103,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-dense.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-dense.pbtxt
index fac826109b6a32305ece86c4990f08afe2236ce8..310a3c3b918684803c86d8e2ea1731604a041cdb 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-dense.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-dense.pbtxt
@@ -1,9 +1,7 @@
 path: "tensorflow.keras.layers.Dense"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.core.Dense\'>"
-  is_instance: "<class \'tensorflow.python.layers.core.Dense\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -14,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -66,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -104,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2d67b5f720209e3648e69659d44c3d8c4e639231
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt
@@ -0,0 +1,177 @@
+path: "tensorflow.keras.layers.DepthwiseConv2D"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.DepthwiseConv2D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv2D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'kernel_size\', \'strides\', \'padding\', \'depth_multiplier\', \'data_format\', \'activation\', \'use_bias\', \'depthwise_initializer\', \'bias_initializer\', \'depthwise_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'depthwise_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'(1, 1)\', \'valid\', \'1\', \'None\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\', \'training\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'instance\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-dot.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-dot.pbtxt
index 285d544af2d69d564afdec748598b39b6b95670f..0e493a7f2bffc772e9cc9cd5afdc4c092fb92118 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-dot.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-dot.pbtxt
@@ -3,7 +3,6 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.merge.Dot\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.merge._Merge\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -14,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -66,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -104,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-dropout.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-dropout.pbtxt
index b77976974cccb96fc2373c093d2bdf279560c46f..14726b4b6cecc39da98af4220211a1c0351b2ff1 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-dropout.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-dropout.pbtxt
@@ -1,9 +1,7 @@
 path: "tensorflow.keras.layers.Dropout"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.core.Dropout\'>"
-  is_instance: "<class \'tensorflow.python.layers.core.Dropout\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -14,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -66,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -104,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-e-l-u.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-e-l-u.pbtxt
index b07714d3f2d158496e0482f8611e55ea0fb0fd51..32a50455ed8a2a146a81fde0d884ebc867b8d0ed 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-e-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-e-l-u.pbtxt
@@ -2,7 +2,6 @@ path: "tensorflow.keras.layers.ELU"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.advanced_activations.ELU\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -13,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -103,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-embedding.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-embedding.pbtxt
index e67d4ddfc47077d62319ab097e5333a373cbfc80..2f615d81124a7ef5e1bd7181a10abfb1b7a8df24 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-embedding.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-embedding.pbtxt
@@ -2,7 +2,6 @@ path: "tensorflow.keras.layers.Embedding"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.embeddings.Embedding\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -13,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -103,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-flatten.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-flatten.pbtxt
index b2a668e5a88d312656f48ddd0e9f7aa9f6306991..82dc878a8c7f7f011df4dd3fa0445217fa250a98 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-flatten.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-flatten.pbtxt
@@ -1,9 +1,7 @@
 path: "tensorflow.keras.layers.Flatten"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.core.Flatten\'>"
-  is_instance: "<class \'tensorflow.python.layers.core.Flatten\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -14,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -66,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -104,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-g-r-u-cell.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-g-r-u-cell.pbtxt
index 1fd3febad26df16576dedca1df7560bf230c08ec..d79d02b95433a7399f27ff1354cba315f8a2c3ae 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-g-r-u-cell.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-g-r-u-cell.pbtxt
@@ -2,7 +2,6 @@ path: "tensorflow.keras.layers.GRUCell"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.recurrent.GRUCell\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -13,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -91,7 +82,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'units\', \'activation\', \'recurrent_activation\', \'use_bias\', \'kernel_initializer\', \'recurrent_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'recurrent_regularizer\', \'bias_regularizer\', \'kernel_constraint\', \'recurrent_constraint\', \'bias_constraint\', \'dropout\', \'recurrent_dropout\', \'implementation\'], varargs=None, keywords=kwargs, defaults=[\'tanh\', \'hard_sigmoid\', \'True\', \'glorot_uniform\', \'orthogonal\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'0.0\', \'0.0\', \'1\'], "
+    argspec: "args=[\'self\', \'units\', \'activation\', \'recurrent_activation\', \'use_bias\', \'kernel_initializer\', \'recurrent_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'recurrent_regularizer\', \'bias_regularizer\', \'kernel_constraint\', \'recurrent_constraint\', \'bias_constraint\', \'dropout\', \'recurrent_dropout\', \'implementation\', \'reset_after\'], varargs=None, keywords=kwargs, defaults=[\'tanh\', \'hard_sigmoid\', \'True\', \'glorot_uniform\', \'orthogonal\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'0.0\', \'0.0\', \'1\', \'False\'], "
   }
   member_method {
     name: "add_loss"
@@ -103,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-g-r-u.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-g-r-u.pbtxt
index f5f41d879dcb840551c00a7272bbcfbe51dbee89..1d38ae64bb86d11ecd352371608a11f6736bb0ac 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-g-r-u.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-g-r-u.pbtxt
@@ -3,7 +3,6 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.recurrent.GRU\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.recurrent.RNN\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -34,10 +33,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "implementation"
     mtype: "<type \'property\'>"
@@ -123,7 +118,7 @@ tf_class {
     mtype: "<type \'property\'>"
   }
   member {
-    name: "scope_name"
+    name: "reset_after"
     mtype: "<type \'property\'>"
   }
   member {
@@ -160,7 +155,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'units\', \'activation\', \'recurrent_activation\', \'use_bias\', \'kernel_initializer\', \'recurrent_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'recurrent_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'recurrent_constraint\', \'bias_constraint\', \'dropout\', \'recurrent_dropout\', \'implementation\', \'return_sequences\', \'return_state\', \'go_backwards\', \'stateful\', \'unroll\'], varargs=None, keywords=kwargs, defaults=[\'tanh\', \'hard_sigmoid\', \'True\', \'glorot_uniform\', \'orthogonal\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'0.0\', \'0.0\', \'1\', \'False\', \'False\', \'False\', \'False\', \'False\'], "
+    argspec: "args=[\'self\', \'units\', \'activation\', \'recurrent_activation\', \'use_bias\', \'kernel_initializer\', \'recurrent_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'recurrent_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'recurrent_constraint\', \'bias_constraint\', \'dropout\', \'recurrent_dropout\', \'implementation\', \'return_sequences\', \'return_state\', \'go_backwards\', \'stateful\', \'unroll\', \'reset_after\'], varargs=None, keywords=kwargs, defaults=[\'tanh\', \'hard_sigmoid\', \'True\', \'glorot_uniform\', \'orthogonal\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'0.0\', \'0.0\', \'1\', \'False\', \'False\', \'False\', \'False\', \'False\', \'False\'], "
   }
   member_method {
     name: "add_loss"
@@ -172,11 +167,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-gaussian-dropout.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-gaussian-dropout.pbtxt
index f4f1a5d51c5d5689918af4facf907f79d9ca71ec..135de9cd95141a93d1a45b80ada66e339c484c89 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-gaussian-dropout.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-gaussian-dropout.pbtxt
@@ -2,7 +2,6 @@ path: "tensorflow.keras.layers.GaussianDropout"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.noise.GaussianDropout\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -13,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -103,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-gaussian-noise.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-gaussian-noise.pbtxt
index e502df5e177d422403d0643c18a9588afb9d9713..5db6e433ee02b8050822f76bb762329055c11aa4 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-gaussian-noise.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-gaussian-noise.pbtxt
@@ -2,7 +2,6 @@ path: "tensorflow.keras.layers.GaussianNoise"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.noise.GaussianNoise\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -13,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -103,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt
index 9c8d5bfcd8966384230e7d5cdcc1cac53a0eab9a..bf0dba0a925b4fb3a88173f4a6dfcc565c6edd91 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt
@@ -1,9 +1,8 @@
 path: "tensorflow.keras.layers.GlobalAveragePooling1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalAveragePooling1D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling._GlobalPooling1D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalPooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -14,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -66,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -104,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt
index 8dd65f1f248daaf120780f19050c45d297b7902e..6da98036094228603f12f65f66d958e2e4b9daeb 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt
@@ -1,9 +1,8 @@
 path: "tensorflow.keras.layers.GlobalAveragePooling2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalAveragePooling2D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling._GlobalPooling2D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalPooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -14,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -66,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -104,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt
index 5e30571cc730ee23767a044036b590460deec00b..345593dec812a240251c0c07da759e131fefaad9 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt
@@ -1,9 +1,8 @@
 path: "tensorflow.keras.layers.GlobalAveragePooling3D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalAveragePooling3D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling._GlobalPooling3D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalPooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -14,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -66,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -104,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt
index ba90fa454696d1cb4e77d80a2dc77ff65def4714..5d3be9085e51520c48a8d33b1dde6480a0039bad 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt
@@ -1,9 +1,8 @@
 path: "tensorflow.keras.layers.GlobalAvgPool1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalAveragePooling1D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling._GlobalPooling1D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalPooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -14,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -66,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -104,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt
index 8823857758307c208527b144c0cc73b566f2f115..0b79a87e0507ecc795f14a63684dd5b5d7dafc1e 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt
@@ -1,9 +1,8 @@
 path: "tensorflow.keras.layers.GlobalAvgPool2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalAveragePooling2D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling._GlobalPooling2D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalPooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -14,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -66,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -104,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt
index 500ced852ba6b19502769ba9052f2e364af7e283..68cdbac652f74b72d1cb769fbefbee750025767d 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt
@@ -1,9 +1,8 @@
 path: "tensorflow.keras.layers.GlobalAvgPool3D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalAveragePooling3D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling._GlobalPooling3D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalPooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -14,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -66,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -104,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pool1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pool1-d.pbtxt
index cf2717ed46b56e639fb774c1e922648e1653ec0d..d5872b444fa9ae617e6ad55bc39f43ec4be7d92f 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pool1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pool1-d.pbtxt
@@ -1,9 +1,8 @@
 path: "tensorflow.keras.layers.GlobalMaxPool1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalMaxPooling1D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling._GlobalPooling1D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalPooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -14,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -66,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -104,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pool2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pool2-d.pbtxt
index a86ff1a46997f19b11e6ef03be432b45687a2df2..4b0cf9a5d38f868d9fdd16a042daea6d0f56fdf8 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pool2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pool2-d.pbtxt
@@ -1,9 +1,8 @@
 path: "tensorflow.keras.layers.GlobalMaxPool2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalMaxPooling2D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling._GlobalPooling2D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalPooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -14,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -66,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -104,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pool3-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pool3-d.pbtxt
index e01cc7c1b09ad6a40380613d54b771c6a1c89c1c..4c1adb2131f204121cf74c9a77d346902632fef2 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pool3-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pool3-d.pbtxt
@@ -1,9 +1,8 @@
 path: "tensorflow.keras.layers.GlobalMaxPool3D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalMaxPooling3D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling._GlobalPooling3D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalPooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -14,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -66,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -104,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt
index 259c1fb37c787f5318570b7aca6935d2f0ed997f..815f1cf580562e62db99862e51ba02e2b2051b57 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt
@@ -1,9 +1,8 @@
 path: "tensorflow.keras.layers.GlobalMaxPooling1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalMaxPooling1D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling._GlobalPooling1D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalPooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -14,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -66,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -104,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt
index 0c41bf97f763f1e40e8fac714709ccac1483a00b..e027dd6cc282b75a6a6c2a78878fa57d6706b547 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt
@@ -1,9 +1,8 @@
 path: "tensorflow.keras.layers.GlobalMaxPooling2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalMaxPooling2D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling._GlobalPooling2D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalPooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -14,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -66,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -104,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt
index bec8817aa393ba2d8a6410408938402366cbb01d..c647b24a23258b96a3c1780fd2f3e499658cfe7d 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt
@@ -1,9 +1,8 @@
 path: "tensorflow.keras.layers.GlobalMaxPooling3D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalMaxPooling3D\'>"
-  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling._GlobalPooling3D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.GlobalPooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -14,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -66,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -104,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-input-layer.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-input-layer.pbtxt
index 17be86222901c0f5a9a18c0e5f1c5bcac6c06a17..75d70734b4144627f02a8f619991356e38889389 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-input-layer.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-input-layer.pbtxt
@@ -2,7 +2,6 @@ path: "tensorflow.keras.layers.InputLayer"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.input_layer.InputLayer\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -13,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -103,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-input-spec.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-input-spec.pbtxt
index 3aeef347ae1f96a3ef40493cc6b722a887e81786..29edabe0483a21d7db35eec04d6ae7a855a82da1 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-input-spec.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-input-spec.pbtxt
@@ -1,6 +1,6 @@
 path: "tensorflow.keras.layers.InputSpec"
 tf_class {
-  is_instance: "<class \'tensorflow.python.layers.base.InputSpec\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.InputSpec\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt
index 6d2a8c56196d9b3c80f570c7f1d3ac803253fff6..0ed383a3554f81c3db490cf5d242546a14b64d15 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt
@@ -2,7 +2,6 @@ path: "tensorflow.keras.layers.LSTMCell"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.recurrent.LSTMCell\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -13,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -103,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-l-s-t-m.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-l-s-t-m.pbtxt
index 490b5b618c65e28f1ae2e01e8d35e7f3973cc180..6d14c9c8f69286552271ca6dab5271a5af48593f 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-l-s-t-m.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-l-s-t-m.pbtxt
@@ -3,7 +3,6 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.recurrent.LSTM\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.recurrent.RNN\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -34,10 +33,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "implementation"
     mtype: "<type \'property\'>"
@@ -122,10 +117,6 @@ tf_class {
     name: "recurrent_regularizer"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "states"
     mtype: "<type \'property\'>"
@@ -176,11 +167,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-lambda.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-lambda.pbtxt
index 21a65b838af35e2f540eacab823513e7bf54b434..ddf96aba34bf574f4b9046ed932d8f136864f157 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-lambda.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-lambda.pbtxt
@@ -2,7 +2,6 @@ path: "tensorflow.keras.layers.Lambda"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.core.Lambda\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -13,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -103,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-layer.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-layer.pbtxt
index 127b04738e70c11b2dc1071cf174cf5de23c5133..aca282d62427f5c8186fa3ac86daebd6fba09ce5 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-layer.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-layer.pbtxt
@@ -1,7 +1,6 @@
 path: "tensorflow.keras.layers.Layer"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -12,10 +11,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -64,10 +59,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -90,7 +81,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\'], varargs=None, keywords=kwargs, defaults=None"
+    argspec: "args=[\'self\', \'trainable\', \'name\', \'dtype\'], varargs=None, keywords=kwargs, defaults=[\'True\', \'None\', \'None\'], "
   }
   member_method {
     name: "add_loss"
@@ -102,11 +93,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-leaky-re-l-u.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-leaky-re-l-u.pbtxt
index 87e49f2ed5b5d73aee5e9aa2511485b1f3f4bcd9..b9c53b43c87bd8c1047663001159d7286360a008 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-leaky-re-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-leaky-re-l-u.pbtxt
@@ -2,7 +2,6 @@ path: "tensorflow.keras.layers.LeakyReLU"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.advanced_activations.LeakyReLU\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -13,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -103,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-locally-connected1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-locally-connected1-d.pbtxt
index 1aa3aad3246b83931a47e69a4aa76fdf2b5aee22..2ee566d03b4631ebcff6bcb0b93ec274849c5b67 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-locally-connected1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-locally-connected1-d.pbtxt
@@ -2,7 +2,6 @@ path: "tensorflow.keras.layers.LocallyConnected1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.local.LocallyConnected1D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -13,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -103,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-locally-connected2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-locally-connected2-d.pbtxt
index 5e9dc7d4774c651a186a4e320d0cfd088e87b6b3..db0d0e816a6d863f13c7eb085edf269d71e2f252 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-locally-connected2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-locally-connected2-d.pbtxt
@@ -2,7 +2,6 @@ path: "tensorflow.keras.layers.LocallyConnected2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.local.LocallyConnected2D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -13,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -103,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-masking.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-masking.pbtxt
index 0d101e5b68cdb2cdf24ed472c724cfc885e3d95d..82008b89d038e26155b8ae952c2943557ed8c35e 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-masking.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-masking.pbtxt
@@ -2,7 +2,6 @@ path: "tensorflow.keras.layers.Masking"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.core.Masking\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -13,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -103,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pool1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pool1-d.pbtxt
index c85cd49ac8ce2c1fc0759671865b7174cd1c1480..31a34a17d04129c8dfe8ec6f98b9fdbc110e13f4 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pool1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pool1-d.pbtxt
@@ -1,10 +1,8 @@
 path: "tensorflow.keras.layers.MaxPool1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.MaxPooling1D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling.MaxPooling1D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling._Pooling1D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.Pooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -15,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -67,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -93,7 +83,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'pool_size\', \'strides\', \'padding\'], varargs=None, keywords=kwargs, defaults=[\'2\', \'None\', \'valid\'], "
+    argspec: "args=[\'self\', \'pool_size\', \'strides\', \'padding\', \'data_format\'], varargs=None, keywords=kwargs, defaults=[\'2\', \'None\', \'valid\', \'None\'], "
   }
   member_method {
     name: "add_loss"
@@ -105,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pool2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pool2-d.pbtxt
index 4f59e330c92f96101c65a9a24f66196e84587ccb..70d24ac75c4f850374fa8cffb881652f97e97d22 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pool2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pool2-d.pbtxt
@@ -1,10 +1,8 @@
 path: "tensorflow.keras.layers.MaxPool2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.MaxPooling2D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling.MaxPooling2D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling._Pooling2D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.Pooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -15,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -67,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -105,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pool3-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pool3-d.pbtxt
index c0ea0eb0505d20e70d641f2a646a060d7dbfabda..55b16564b30919fb48d97862e2e8cfc0fdda8de5 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pool3-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pool3-d.pbtxt
@@ -1,10 +1,8 @@
 path: "tensorflow.keras.layers.MaxPool3D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.MaxPooling3D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling.MaxPooling3D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling._Pooling3D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.Pooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -15,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -67,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -105,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pooling1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pooling1-d.pbtxt
index ca37ae51314516ae67c7725eb2ccd3d25154e2ac..a230b74c383eff93443d07e48f6818352302d9e7 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pooling1-d.pbtxt
@@ -1,10 +1,8 @@
 path: "tensorflow.keras.layers.MaxPooling1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.MaxPooling1D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling.MaxPooling1D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling._Pooling1D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.Pooling1D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -15,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -67,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -93,7 +83,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'pool_size\', \'strides\', \'padding\'], varargs=None, keywords=kwargs, defaults=[\'2\', \'None\', \'valid\'], "
+    argspec: "args=[\'self\', \'pool_size\', \'strides\', \'padding\', \'data_format\'], varargs=None, keywords=kwargs, defaults=[\'2\', \'None\', \'valid\', \'None\'], "
   }
   member_method {
     name: "add_loss"
@@ -105,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pooling2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pooling2-d.pbtxt
index 3ede2378347f5eddb0e8fae775a0200ea484d3f8..d98f7c39f546048b1483617b24f004d5c9759d14 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pooling2-d.pbtxt
@@ -1,10 +1,8 @@
 path: "tensorflow.keras.layers.MaxPooling2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.MaxPooling2D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling.MaxPooling2D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling._Pooling2D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.Pooling2D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -15,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -67,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -105,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pooling3-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pooling3-d.pbtxt
index d87e25a7ba8e7cce615431723b53a0106c2b5279..b2e96a4203758270afb8c225a05a481dbc329a84 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-max-pooling3-d.pbtxt
@@ -1,10 +1,8 @@
 path: "tensorflow.keras.layers.MaxPooling3D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.MaxPooling3D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling.MaxPooling3D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling._Pooling3D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.Pooling3D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -15,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -67,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -105,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-maximum.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-maximum.pbtxt
index e4df7b48ae6b41400375920a48ef8577bb69376e..0c45bbdf171ad9831e51d2b1ba952fab9eb3d0e5 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-maximum.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-maximum.pbtxt
@@ -3,7 +3,6 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.merge.Maximum\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.merge._Merge\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -14,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -66,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -104,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-multiply.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-multiply.pbtxt
index 6bf7c77743c31b6d74df35d827e9d5bc9a25d303..6423d83418aa40b57afb3d5ff22f4ec605183587 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-multiply.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-multiply.pbtxt
@@ -3,7 +3,6 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.merge.Multiply\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.merge._Merge\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -14,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -66,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -104,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-p-re-l-u.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-p-re-l-u.pbtxt
index c14be132b7e406c99841576be8d8fa9ab99aa816..6e17081375b7fbdcb000dfcbc0cd48ab072fe6e5 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-p-re-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-p-re-l-u.pbtxt
@@ -2,7 +2,6 @@ path: "tensorflow.keras.layers.PReLU"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.advanced_activations.PReLU\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -13,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -103,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-permute.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-permute.pbtxt
index 72ffbceae01da900778dba1ec14e646aa17b39e5..d01d371da596256873c3799b99c45db01c674ff9 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-permute.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-permute.pbtxt
@@ -2,7 +2,6 @@ path: "tensorflow.keras.layers.Permute"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.core.Permute\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -13,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -103,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-r-n-n.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-r-n-n.pbtxt
index d3e780c8b22ed580f61ffc3d9b2bad7278391402..d3f5508640e268df484af5adaf66621fa3d92d5b 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-r-n-n.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-r-n-n.pbtxt
@@ -2,7 +2,6 @@ path: "tensorflow.keras.layers.RNN"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.recurrent.RNN\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -13,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "states"
     mtype: "<type \'property\'>"
@@ -107,11 +98,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-repeat-vector.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-repeat-vector.pbtxt
index a27980a9d17397e558a4b732e3dc332a0c1e8432..44e1007f5420bbb8feda891901c138cb776c071e 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-repeat-vector.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-repeat-vector.pbtxt
@@ -2,7 +2,6 @@ path: "tensorflow.keras.layers.RepeatVector"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.core.RepeatVector\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -13,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -103,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-reshape.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-reshape.pbtxt
index 67f991276c6908ff54fd516e84533542a5f60528..8fc3ec33310f531a6ca6948dc20db67543cde69a 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-reshape.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-reshape.pbtxt
@@ -2,7 +2,6 @@ path: "tensorflow.keras.layers.Reshape"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.core.Reshape\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -13,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -103,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-conv1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-conv1-d.pbtxt
index fccea5e8af5ab81e712669ff1b2567d8bde8607e..457d27749504167f2865bc272251bd89b5d3297e 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-conv1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-conv1-d.pbtxt
@@ -1,11 +1,9 @@
 path: "tensorflow.keras.layers.SeparableConv1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.SeparableConv1D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional.SeparableConv1D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional._SeparableConv\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional._Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.SeparableConv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -16,10 +14,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -68,10 +62,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -106,11 +96,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-conv2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-conv2-d.pbtxt
index d20663bdb0bc2eea323d35b1e3d4d27122f50472..54eda8ee2121d4fadd73a33010202850d743cc65 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-conv2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-conv2-d.pbtxt
@@ -1,11 +1,9 @@
 path: "tensorflow.keras.layers.SeparableConv2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.SeparableConv2D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional.SeparableConv2D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional._SeparableConv\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional._Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.SeparableConv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -16,10 +14,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -68,10 +62,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -106,11 +96,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-convolution1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-convolution1-d.pbtxt
index 889fa0a1b58bbd3babd293b7b1b45915a9ee3ca4..711196554698b79c64e1c67bc875ce33c70563c6 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-convolution1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-convolution1-d.pbtxt
@@ -1,11 +1,9 @@
 path: "tensorflow.keras.layers.SeparableConvolution1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.SeparableConv1D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional.SeparableConv1D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional._SeparableConv\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional._Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.SeparableConv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -16,10 +14,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -68,10 +62,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -106,11 +96,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-convolution2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-convolution2-d.pbtxt
index c850f3fedc814b20f0f95cc3cf4fd5c973446b5b..815e34a48de542feb078d0002dbebbbf4d199e63 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-convolution2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-separable-convolution2-d.pbtxt
@@ -1,11 +1,9 @@
 path: "tensorflow.keras.layers.SeparableConvolution2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.SeparableConv2D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional.SeparableConv2D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional._SeparableConv\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional._Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.SeparableConv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -16,10 +14,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -68,10 +62,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -106,11 +96,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt
index 526d88ccba60eb25c68432e5baa03fd3a878f718..6614760e5e72556ac61c3788065b3faf2d286800 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt
@@ -2,7 +2,6 @@ path: "tensorflow.keras.layers.SimpleRNNCell"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.recurrent.SimpleRNNCell\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -13,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -103,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-simple-r-n-n.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-simple-r-n-n.pbtxt
index 7fddae34472411f49d42b4d65d12034d056ec818..bfcfd71ecddfb618ceb53969e8782f535786009b 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-simple-r-n-n.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-simple-r-n-n.pbtxt
@@ -3,7 +3,6 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.recurrent.SimpleRNN\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.recurrent.RNN\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -34,10 +33,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -114,10 +109,6 @@ tf_class {
     name: "recurrent_regularizer"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "states"
     mtype: "<type \'property\'>"
@@ -164,11 +155,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-softmax.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-softmax.pbtxt
index 5b9b62fc970238e49e6d4849285606d0a7908b23..9c4618c4e91ae288284b35c4a0c6bbbfe604d91d 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-softmax.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-softmax.pbtxt
@@ -2,7 +2,6 @@ path: "tensorflow.keras.layers.Softmax"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.advanced_activations.Softmax\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -13,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -103,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt
index 769da30999993fad05ae0f7c04e256e6cf01a774..9a0a19d2d52f34e61c273bd4d4a27c46940dd5bf 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt
@@ -2,9 +2,7 @@ path: "tensorflow.keras.layers.SpatialDropout1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.core.SpatialDropout1D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.core.Dropout\'>"
-  is_instance: "<class \'tensorflow.python.layers.core.Dropout\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -15,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -67,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -105,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt
index fca2e42a1519fcf3a9f0ec996c50b148b2df05fd..446f7122a6a2dc1d2f7377cef00d7b5b9a89cd3a 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt
@@ -2,9 +2,7 @@ path: "tensorflow.keras.layers.SpatialDropout2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.core.SpatialDropout2D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.core.Dropout\'>"
-  is_instance: "<class \'tensorflow.python.layers.core.Dropout\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -15,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -67,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -105,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt
index 36e8de09a967c5940bf8078234f5980a78ec8009..52a0485b5cea2be6f2f4d9b0ee31cb2388adcef7 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt
@@ -2,9 +2,7 @@ path: "tensorflow.keras.layers.SpatialDropout3D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.core.SpatialDropout3D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.core.Dropout\'>"
-  is_instance: "<class \'tensorflow.python.layers.core.Dropout\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -15,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -67,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -105,11 +95,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt
index a96f16fae99af9c30959d228202055e9aebfaf58..c82e7a192dfca8ca38832a53f9135125b4c34286 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt
@@ -2,7 +2,6 @@ path: "tensorflow.keras.layers.StackedRNNCells"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.recurrent.StackedRNNCells\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -13,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "state_size"
     mtype: "<type \'property\'>"
@@ -107,11 +98,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt
index e1cbd0e150ed890ae57c1725249d1340fc2cb663..9ccf251a18034371607cfdc6091f2282136feec9 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt
@@ -2,7 +2,6 @@ path: "tensorflow.keras.layers.ThresholdedReLU"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.advanced_activations.ThresholdedReLU\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -13,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -103,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-time-distributed.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-time-distributed.pbtxt
index f0d35728fb1c42d563ff0598dd84da51a766a764..e080a07799fe1b7ec5f73b4e7bf78053a2c9dd3c 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-time-distributed.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-time-distributed.pbtxt
@@ -3,7 +3,6 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.wrappers.TimeDistributed\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.wrappers.Wrapper\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -14,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -66,10 +61,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable"
     mtype: "<type \'property\'>"
@@ -108,11 +99,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-up-sampling1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-up-sampling1-d.pbtxt
index 74efaea6ddb22ec2fe9d41558978c183b0e06671..5fadca0b8386951976e8a6330bacf0dcf169e2d1 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-up-sampling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-up-sampling1-d.pbtxt
@@ -2,7 +2,6 @@ path: "tensorflow.keras.layers.UpSampling1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.UpSampling1D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -13,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -103,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-up-sampling2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-up-sampling2-d.pbtxt
index dc5bd5fd5319f9bbd601a3c4083ae566b47e1aaa..2d395bf7e87b8835bcf63d792d68dc3ac4083051 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-up-sampling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-up-sampling2-d.pbtxt
@@ -2,7 +2,6 @@ path: "tensorflow.keras.layers.UpSampling2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.UpSampling2D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -13,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -103,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-up-sampling3-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-up-sampling3-d.pbtxt
index e01ccfb74aead591f1018cdcbb1c888767ecdb20..18d58ec3b23f7d4fd13ff45f4c1d4d95e7722ed5 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-up-sampling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-up-sampling3-d.pbtxt
@@ -2,7 +2,6 @@ path: "tensorflow.keras.layers.UpSampling3D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.UpSampling3D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -13,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -103,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-wrapper.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-wrapper.pbtxt
index 7e6f90f7623677244865ac285c134dc79f7b9b69..6223cb2f3c1230a60f3cf3dd57a0e803cf4f15d9 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-wrapper.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-wrapper.pbtxt
@@ -2,7 +2,6 @@ path: "tensorflow.keras.layers.Wrapper"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.wrappers.Wrapper\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -13,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable"
     mtype: "<type \'property\'>"
@@ -107,11 +98,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-zero-padding1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-zero-padding1-d.pbtxt
index 4d0d402dad442ccf52267f5ce40b05400afbfbc7..e71bba6a7f1df713cd13b4a0249d001d56bb31b0 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-zero-padding1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-zero-padding1-d.pbtxt
@@ -2,7 +2,6 @@ path: "tensorflow.keras.layers.ZeroPadding1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.ZeroPadding1D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -13,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -103,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-zero-padding2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-zero-padding2-d.pbtxt
index b353a529bcf8e543d334fee57fca26ebc83036a4..aba6d8cb1f43bb070f9b17b5290afc5ce30246b5 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-zero-padding2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-zero-padding2-d.pbtxt
@@ -2,7 +2,6 @@ path: "tensorflow.keras.layers.ZeroPadding2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.ZeroPadding2D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -13,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -103,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-zero-padding3-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-zero-padding3-d.pbtxt
index 9fe1256e616dbca4f35101df160dc55bc68bfa8a..ce545ecc954e84e702fbd24047370a3417dc0fb1 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-zero-padding3-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-zero-padding3-d.pbtxt
@@ -2,7 +2,6 @@ path: "tensorflow.keras.layers.ZeroPadding3D"
 tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.ZeroPadding3D\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -13,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -65,10 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -103,11 +94,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.pbtxt
index 088c8e88e26f59f2753733252882f5e0e8287fb6..affc9bd09b1124acbf7ff84b97e53a419c9b6a92 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.pbtxt
@@ -116,6 +116,10 @@ tf_module {
     name: "Dense"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "DepthwiseConv2D"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "Dot"
     mtype: "<type \'type\'>"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.models.-model.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.models.-model.pbtxt
index 8ccf15f9ab0fcfa59907ff05a962a84d3d86ccb4..3ac285681f596194254079359c9405ca47e6a3df 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.models.-model.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.models.-model.pbtxt
@@ -3,7 +3,6 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.training.Model\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.network.Network\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -14,10 +13,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -74,10 +69,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "state_updates"
     mtype: "<type \'property\'>"
@@ -128,7 +119,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.models.-sequential.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.models.-sequential.pbtxt
index be12b0bd2ec509ff394eaa3f43db0b54badd7fba..51ba0c5043f63bb59d73f979f832d071273d4f82 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.models.-sequential.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.models.-sequential.pbtxt
@@ -4,7 +4,6 @@ tf_class {
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.training.Model\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.network.Network\'>"
   is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -15,10 +14,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -75,10 +70,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "state_updates"
     mtype: "<type \'property\'>"
@@ -133,7 +124,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'getter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-average-pooling1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.-average-pooling1-d.pbtxt
index 1c4f550d7f05b8be33326cb39d7a5f3bf663f5e6..38fd78a5a828c7d0da98c97fdc01f504397c6fe6 100644
--- a/tensorflow/tools/api/golden/tensorflow.layers.-average-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.layers.-average-pooling1-d.pbtxt
@@ -1,8 +1,10 @@
 path: "tensorflow.layers.AveragePooling1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.layers.pooling.AveragePooling1D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling._Pooling1D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.AveragePooling1D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.Pooling1D\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -25,6 +27,10 @@ tf_class {
     name: "input"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "input_shape"
     mtype: "<type \'property\'>"
@@ -53,6 +59,10 @@ tf_class {
     name: "output"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "output_shape"
     mtype: "<type \'property\'>"
@@ -95,7 +105,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -109,6 +123,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "compute_output_shape"
     argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
@@ -117,10 +135,22 @@ tf_class {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -133,6 +163,10 @@ tf_class {
     name: "get_output_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -141,4 +175,12 @@ tf_class {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-average-pooling2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.-average-pooling2-d.pbtxt
index d2db0952693f2989e6a9e8748a254eb4db483206..86a524cc91e10616cd049cec93843e419ec670c7 100644
--- a/tensorflow/tools/api/golden/tensorflow.layers.-average-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.layers.-average-pooling2-d.pbtxt
@@ -1,8 +1,10 @@
 path: "tensorflow.layers.AveragePooling2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.layers.pooling.AveragePooling2D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling._Pooling2D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.AveragePooling2D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.Pooling2D\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -25,6 +27,10 @@ tf_class {
     name: "input"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "input_shape"
     mtype: "<type \'property\'>"
@@ -53,6 +59,10 @@ tf_class {
     name: "output"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "output_shape"
     mtype: "<type \'property\'>"
@@ -95,7 +105,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -109,6 +123,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "compute_output_shape"
     argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
@@ -117,10 +135,22 @@ tf_class {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -133,6 +163,10 @@ tf_class {
     name: "get_output_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -141,4 +175,12 @@ tf_class {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-average-pooling3-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.-average-pooling3-d.pbtxt
index 34d9a9df281c09a2e2030daf74a2ceb8066085bb..8a811fe4561ac3790e43a5553ca04fca002e420d 100644
--- a/tensorflow/tools/api/golden/tensorflow.layers.-average-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.layers.-average-pooling3-d.pbtxt
@@ -1,8 +1,10 @@
 path: "tensorflow.layers.AveragePooling3D"
 tf_class {
   is_instance: "<class \'tensorflow.python.layers.pooling.AveragePooling3D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling._Pooling3D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.AveragePooling3D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.Pooling3D\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -25,6 +27,10 @@ tf_class {
     name: "input"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "input_shape"
     mtype: "<type \'property\'>"
@@ -53,6 +59,10 @@ tf_class {
     name: "output"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "output_shape"
     mtype: "<type \'property\'>"
@@ -95,7 +105,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -109,6 +123,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "compute_output_shape"
     argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
@@ -117,10 +135,22 @@ tf_class {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -133,6 +163,10 @@ tf_class {
     name: "get_output_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -141,4 +175,12 @@ tf_class {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-batch-normalization.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.-batch-normalization.pbtxt
index 21ad0efecf88c42a3a679910ddfe095585a7933a..3923e706be7a352d770bf309aecd1fadb2a05e81 100644
--- a/tensorflow/tools/api/golden/tensorflow.layers.-batch-normalization.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.layers.-batch-normalization.pbtxt
@@ -1,7 +1,9 @@
 path: "tensorflow.layers.BatchNormalization"
 tf_class {
   is_instance: "<class \'tensorflow.python.layers.normalization.BatchNormalization\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.normalization.BatchNormalization\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -24,6 +26,10 @@ tf_class {
     name: "input"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "input_shape"
     mtype: "<type \'property\'>"
@@ -52,6 +58,10 @@ tf_class {
     name: "output"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "output_shape"
     mtype: "<type \'property\'>"
@@ -94,7 +104,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -108,6 +122,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\', \'training\'], varargs=None, keywords=None, defaults=[\'False\'], "
   }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "compute_output_shape"
     argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
@@ -116,10 +134,22 @@ tf_class {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -132,6 +162,10 @@ tf_class {
     name: "get_output_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -140,4 +174,12 @@ tf_class {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-conv1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.-conv1-d.pbtxt
index ed38747c7671a267bb640ecb96a4c5fcc46c5edf..7a0a8a2a51295d9009f44e2ea126e8a4694147af 100644
--- a/tensorflow/tools/api/golden/tensorflow.layers.-conv1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.layers.-conv1-d.pbtxt
@@ -1,8 +1,10 @@
 path: "tensorflow.layers.Conv1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.layers.convolutional.Conv1D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional._Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv1D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -25,6 +27,10 @@ tf_class {
     name: "input"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "input_shape"
     mtype: "<type \'property\'>"
@@ -53,6 +59,10 @@ tf_class {
     name: "output"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "output_shape"
     mtype: "<type \'property\'>"
@@ -95,7 +105,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -109,6 +123,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "compute_output_shape"
     argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
@@ -117,10 +135,22 @@ tf_class {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -133,6 +163,10 @@ tf_class {
     name: "get_output_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -141,4 +175,12 @@ tf_class {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-conv2-d-transpose.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.-conv2-d-transpose.pbtxt
index ff453c6059477c20528fc768d93c65d208cdfc4a..7ed3a652519a6429ff429925c29f3c296a6d2958 100644
--- a/tensorflow/tools/api/golden/tensorflow.layers.-conv2-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.layers.-conv2-d-transpose.pbtxt
@@ -1,9 +1,11 @@
 path: "tensorflow.layers.Conv2DTranspose"
 tf_class {
   is_instance: "<class \'tensorflow.python.layers.convolutional.Conv2DTranspose\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional.Conv2D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional._Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv2DTranspose\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv2D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -26,6 +28,10 @@ tf_class {
     name: "input"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "input_shape"
     mtype: "<type \'property\'>"
@@ -54,6 +60,10 @@ tf_class {
     name: "output"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "output_shape"
     mtype: "<type \'property\'>"
@@ -96,7 +106,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -110,6 +124,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "compute_output_shape"
     argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
@@ -118,10 +136,22 @@ tf_class {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -134,6 +164,10 @@ tf_class {
     name: "get_output_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -142,4 +176,12 @@ tf_class {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-conv2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.-conv2-d.pbtxt
index 5583bd22dce18b0a0593b73bde509818b63b3f29..23831aa74f1c3dea99e6da407e5a63693f94e37a 100644
--- a/tensorflow/tools/api/golden/tensorflow.layers.-conv2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.layers.-conv2-d.pbtxt
@@ -1,8 +1,10 @@
 path: "tensorflow.layers.Conv2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.layers.convolutional.Conv2D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional._Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv2D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -25,6 +27,10 @@ tf_class {
     name: "input"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "input_shape"
     mtype: "<type \'property\'>"
@@ -53,6 +59,10 @@ tf_class {
     name: "output"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "output_shape"
     mtype: "<type \'property\'>"
@@ -95,7 +105,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -109,6 +123,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "compute_output_shape"
     argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
@@ -117,10 +135,22 @@ tf_class {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -133,6 +163,10 @@ tf_class {
     name: "get_output_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -141,4 +175,12 @@ tf_class {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-conv3-d-transpose.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.-conv3-d-transpose.pbtxt
index 63f0c32a7c8f7e530c76c64fa619102bc12f9ad9..9d41a6b09900d984706accd70a353cc26585d9b5 100644
--- a/tensorflow/tools/api/golden/tensorflow.layers.-conv3-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.layers.-conv3-d-transpose.pbtxt
@@ -1,9 +1,11 @@
 path: "tensorflow.layers.Conv3DTranspose"
 tf_class {
   is_instance: "<class \'tensorflow.python.layers.convolutional.Conv3DTranspose\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional.Conv3D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional._Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv3DTranspose\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv3D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -26,6 +28,10 @@ tf_class {
     name: "input"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "input_shape"
     mtype: "<type \'property\'>"
@@ -54,6 +60,10 @@ tf_class {
     name: "output"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "output_shape"
     mtype: "<type \'property\'>"
@@ -96,7 +106,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -110,6 +124,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "compute_output_shape"
     argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
@@ -118,10 +136,22 @@ tf_class {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -134,6 +164,10 @@ tf_class {
     name: "get_output_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -142,4 +176,12 @@ tf_class {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-conv3-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.-conv3-d.pbtxt
index b77726252ccca30a7c6555fb569eb65b69e34998..865fe08e63c81222395d125938c8830c02030733 100644
--- a/tensorflow/tools/api/golden/tensorflow.layers.-conv3-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.layers.-conv3-d.pbtxt
@@ -1,8 +1,10 @@
 path: "tensorflow.layers.Conv3D"
 tf_class {
   is_instance: "<class \'tensorflow.python.layers.convolutional.Conv3D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional._Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv3D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -25,6 +27,10 @@ tf_class {
     name: "input"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "input_shape"
     mtype: "<type \'property\'>"
@@ -53,6 +59,10 @@ tf_class {
     name: "output"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "output_shape"
     mtype: "<type \'property\'>"
@@ -95,7 +105,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -109,6 +123,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "compute_output_shape"
     argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
@@ -117,10 +135,22 @@ tf_class {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -133,6 +163,10 @@ tf_class {
     name: "get_output_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -141,4 +175,12 @@ tf_class {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-dense.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.-dense.pbtxt
index 92db9f6dcd2f77c4253eb77df4a26fb632b2a766..ee164aae204d3f6c09af79f7fbac825ce470098d 100644
--- a/tensorflow/tools/api/golden/tensorflow.layers.-dense.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.layers.-dense.pbtxt
@@ -1,7 +1,9 @@
 path: "tensorflow.layers.Dense"
 tf_class {
   is_instance: "<class \'tensorflow.python.layers.core.Dense\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.core.Dense\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -24,6 +26,10 @@ tf_class {
     name: "input"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "input_shape"
     mtype: "<type \'property\'>"
@@ -52,6 +58,10 @@ tf_class {
     name: "output"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "output_shape"
     mtype: "<type \'property\'>"
@@ -94,7 +104,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -108,6 +122,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "compute_output_shape"
     argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
@@ -116,10 +134,22 @@ tf_class {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -132,6 +162,10 @@ tf_class {
     name: "get_output_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -140,4 +174,12 @@ tf_class {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-dropout.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.-dropout.pbtxt
index 80fa846a24c9162d8521bdb4f098b9cd8e34aedb..8167dc79cdf9f83b9b97557638bf0702a1be86da 100644
--- a/tensorflow/tools/api/golden/tensorflow.layers.-dropout.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.layers.-dropout.pbtxt
@@ -1,7 +1,9 @@
 path: "tensorflow.layers.Dropout"
 tf_class {
   is_instance: "<class \'tensorflow.python.layers.core.Dropout\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.core.Dropout\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -24,6 +26,10 @@ tf_class {
     name: "input"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "input_shape"
     mtype: "<type \'property\'>"
@@ -52,6 +58,10 @@ tf_class {
     name: "output"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "output_shape"
     mtype: "<type \'property\'>"
@@ -94,7 +104,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -108,6 +122,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\', \'training\'], varargs=None, keywords=None, defaults=[\'False\'], "
   }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "compute_output_shape"
     argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
@@ -116,10 +134,22 @@ tf_class {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -132,6 +162,10 @@ tf_class {
     name: "get_output_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -140,4 +174,12 @@ tf_class {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-flatten.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.-flatten.pbtxt
index f63213b3dde40aa54b165c1c269c26fd2cd9e3b4..efa4419692993aa9975c1af2e647288ae9f38eba 100644
--- a/tensorflow/tools/api/golden/tensorflow.layers.-flatten.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.layers.-flatten.pbtxt
@@ -1,7 +1,9 @@
 path: "tensorflow.layers.Flatten"
 tf_class {
   is_instance: "<class \'tensorflow.python.layers.core.Flatten\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.core.Flatten\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -24,6 +26,10 @@ tf_class {
     name: "input"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "input_shape"
     mtype: "<type \'property\'>"
@@ -52,6 +58,10 @@ tf_class {
     name: "output"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "output_shape"
     mtype: "<type \'property\'>"
@@ -94,7 +104,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -108,6 +122,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "compute_output_shape"
     argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
@@ -116,10 +134,22 @@ tf_class {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -132,6 +162,10 @@ tf_class {
     name: "get_output_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -140,4 +174,12 @@ tf_class {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-input-spec.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.-input-spec.pbtxt
index 7c1d05cd2bfc8be978e82566e7a3086040978b4a..2ff89f0a6faef905bcafdcb36121f506e9a9977a 100644
--- a/tensorflow/tools/api/golden/tensorflow.layers.-input-spec.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.layers.-input-spec.pbtxt
@@ -1,6 +1,6 @@
 path: "tensorflow.layers.InputSpec"
 tf_class {
-  is_instance: "<class \'tensorflow.python.layers.base.InputSpec\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.InputSpec\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-layer.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.-layer.pbtxt
index 4e45b2d513bb72bb47433d72c310d6a34fbc0c01..b3a6dfdffa28d3628e09e6aab823534ba84edf16 100644
--- a/tensorflow/tools/api/golden/tensorflow.layers.-layer.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.layers.-layer.pbtxt
@@ -1,6 +1,7 @@
 path: "tensorflow.layers.Layer"
 tf_class {
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -23,6 +24,10 @@ tf_class {
     name: "input"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "input_shape"
     mtype: "<type \'property\'>"
@@ -51,6 +56,10 @@ tf_class {
     name: "output"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "output_shape"
     mtype: "<type \'property\'>"
@@ -81,7 +90,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'trainable\', \'name\', \'dtype\', \'activity_regularizer\'], varargs=None, keywords=kwargs, defaults=[\'True\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'trainable\', \'name\', \'dtype\'], varargs=None, keywords=kwargs, defaults=[\'True\', \'None\', \'None\'], "
   }
   member_method {
     name: "add_loss"
@@ -93,7 +102,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -107,6 +120,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
   }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "compute_output_shape"
     argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
@@ -115,10 +132,22 @@ tf_class {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -131,6 +160,10 @@ tf_class {
     name: "get_output_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -139,4 +172,12 @@ tf_class {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-max-pooling1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.-max-pooling1-d.pbtxt
index 19ec33fce775caa634e71e2295ac945a6f70ade9..cef396489dde24698cd9a63b6247292958cfec4e 100644
--- a/tensorflow/tools/api/golden/tensorflow.layers.-max-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.layers.-max-pooling1-d.pbtxt
@@ -1,8 +1,10 @@
 path: "tensorflow.layers.MaxPooling1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.layers.pooling.MaxPooling1D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling._Pooling1D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.MaxPooling1D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.Pooling1D\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -25,6 +27,10 @@ tf_class {
     name: "input"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "input_shape"
     mtype: "<type \'property\'>"
@@ -53,6 +59,10 @@ tf_class {
     name: "output"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "output_shape"
     mtype: "<type \'property\'>"
@@ -95,7 +105,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -109,6 +123,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "compute_output_shape"
     argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
@@ -117,10 +135,22 @@ tf_class {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -133,6 +163,10 @@ tf_class {
     name: "get_output_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -141,4 +175,12 @@ tf_class {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-max-pooling2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.-max-pooling2-d.pbtxt
index 76180c333a21c592a3b53bb445df9b12d3596552..565f0c7a79661f77d5987d671266ff69268b03b2 100644
--- a/tensorflow/tools/api/golden/tensorflow.layers.-max-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.layers.-max-pooling2-d.pbtxt
@@ -1,8 +1,10 @@
 path: "tensorflow.layers.MaxPooling2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.layers.pooling.MaxPooling2D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling._Pooling2D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.MaxPooling2D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.Pooling2D\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -25,6 +27,10 @@ tf_class {
     name: "input"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "input_shape"
     mtype: "<type \'property\'>"
@@ -53,6 +59,10 @@ tf_class {
     name: "output"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "output_shape"
     mtype: "<type \'property\'>"
@@ -95,7 +105,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -109,6 +123,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "compute_output_shape"
     argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
@@ -117,10 +135,22 @@ tf_class {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -133,6 +163,10 @@ tf_class {
     name: "get_output_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -141,4 +175,12 @@ tf_class {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-max-pooling3-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.-max-pooling3-d.pbtxt
index ded75c8ff09efc6746ddd2284f53d2c021cc473c..595ce2eeadfc95fe44895ffd976024aee80ee948 100644
--- a/tensorflow/tools/api/golden/tensorflow.layers.-max-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.layers.-max-pooling3-d.pbtxt
@@ -1,8 +1,10 @@
 path: "tensorflow.layers.MaxPooling3D"
 tf_class {
   is_instance: "<class \'tensorflow.python.layers.pooling.MaxPooling3D\'>"
-  is_instance: "<class \'tensorflow.python.layers.pooling._Pooling3D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.MaxPooling3D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.pooling.Pooling3D\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -25,6 +27,10 @@ tf_class {
     name: "input"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "input_shape"
     mtype: "<type \'property\'>"
@@ -53,6 +59,10 @@ tf_class {
     name: "output"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "output_shape"
     mtype: "<type \'property\'>"
@@ -95,7 +105,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -109,6 +123,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "compute_output_shape"
     argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
@@ -117,10 +135,22 @@ tf_class {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -133,6 +163,10 @@ tf_class {
     name: "get_output_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -141,4 +175,12 @@ tf_class {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-separable-conv1-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.-separable-conv1-d.pbtxt
index 3dbfa5453f8e0ebb02429df9c4cbdf98de6b8ced..ccca96f72248e390ca65db061d836ee58c8e3205 100644
--- a/tensorflow/tools/api/golden/tensorflow.layers.-separable-conv1-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.layers.-separable-conv1-d.pbtxt
@@ -1,9 +1,11 @@
 path: "tensorflow.layers.SeparableConv1D"
 tf_class {
   is_instance: "<class \'tensorflow.python.layers.convolutional.SeparableConv1D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional._SeparableConv\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional._Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.SeparableConv1D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.SeparableConv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -26,6 +28,10 @@ tf_class {
     name: "input"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "input_shape"
     mtype: "<type \'property\'>"
@@ -54,6 +60,10 @@ tf_class {
     name: "output"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "output_shape"
     mtype: "<type \'property\'>"
@@ -96,7 +106,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -110,6 +124,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "compute_output_shape"
     argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
@@ -118,10 +136,22 @@ tf_class {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -134,6 +164,10 @@ tf_class {
     name: "get_output_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -142,4 +176,12 @@ tf_class {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.layers.-separable-conv2-d.pbtxt b/tensorflow/tools/api/golden/tensorflow.layers.-separable-conv2-d.pbtxt
index ab171df1d1650e19836018f3316e6919f6d36def..1c99c9618254cadb1b4e95c7223ca9361e4fa861 100644
--- a/tensorflow/tools/api/golden/tensorflow.layers.-separable-conv2-d.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.layers.-separable-conv2-d.pbtxt
@@ -1,9 +1,11 @@
 path: "tensorflow.layers.SeparableConv2D"
 tf_class {
   is_instance: "<class \'tensorflow.python.layers.convolutional.SeparableConv2D\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional._SeparableConv\'>"
-  is_instance: "<class \'tensorflow.python.layers.convolutional._Conv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.SeparableConv2D\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.SeparableConv\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.layers.convolutional.Conv\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -26,6 +28,10 @@ tf_class {
     name: "input"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "input_shape"
     mtype: "<type \'property\'>"
@@ -54,6 +60,10 @@ tf_class {
     name: "output"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "output_shape"
     mtype: "<type \'property\'>"
@@ -96,7 +106,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -110,6 +124,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "compute_output_shape"
     argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
@@ -118,10 +136,22 @@ tf_class {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -134,6 +164,10 @@ tf_class {
     name: "get_output_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -142,4 +176,12 @@ tf_class {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/tensorflow.math.pbtxt b/tensorflow/tools/api/golden/tensorflow.math.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..897718c05e0d10a6f961f33b8c65f5dab1d03f5b
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.math.pbtxt
@@ -0,0 +1,7 @@
+path: "tensorflow.math"
+tf_module {
+  member_method {
+    name: "polyval"
+    argspec: "args=[\'coeffs\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-basic-l-s-t-m-cell.pbtxt b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-basic-l-s-t-m-cell.pbtxt
index 9c71a24d0500e2091e0ae94cc4dd7ed6b788a54f..f909cd875698bf65b1b005069c4d59f891b0cece 100644
--- a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-basic-l-s-t-m-cell.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-basic-l-s-t-m-cell.pbtxt
@@ -4,6 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.LayerRNNCell\'>"
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -26,6 +27,10 @@ tf_class {
     name: "input"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "input_shape"
     mtype: "<type \'property\'>"
@@ -54,6 +59,10 @@ tf_class {
     name: "output"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "output_shape"
     mtype: "<type \'property\'>"
@@ -104,7 +113,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -118,6 +131,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\', \'state\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "compute_output_shape"
     argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
@@ -126,10 +143,22 @@ tf_class {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -142,6 +171,10 @@ tf_class {
     name: "get_output_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -150,6 +183,14 @@ tf_class {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "zero_state"
     argspec: "args=[\'self\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-basic-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-basic-r-n-n-cell.pbtxt
index 9e19f96b7452616956fb7fd3ca62d8f4b25a2122..173d2eae63656ac86d11e9eb051b43489a00560f 100644
--- a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-basic-r-n-n-cell.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-basic-r-n-n-cell.pbtxt
@@ -4,6 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.LayerRNNCell\'>"
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -26,6 +27,10 @@ tf_class {
     name: "input"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "input_shape"
     mtype: "<type \'property\'>"
@@ -54,6 +59,10 @@ tf_class {
     name: "output"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "output_shape"
     mtype: "<type \'property\'>"
@@ -104,7 +113,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -118,6 +131,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\', \'state\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "compute_output_shape"
     argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
@@ -126,10 +143,22 @@ tf_class {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -142,6 +171,10 @@ tf_class {
     name: "get_output_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -150,6 +183,14 @@ tf_class {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "zero_state"
     argspec: "args=[\'self\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-device-wrapper.pbtxt b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-device-wrapper.pbtxt
index 7540aa62861895a7c41840476d4edb79785a77a9..3c3e38229738fec3b25f437a73f3a9d216d970af 100644
--- a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-device-wrapper.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-device-wrapper.pbtxt
@@ -3,6 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.DeviceWrapper\'>"
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -25,6 +26,10 @@ tf_class {
     name: "input"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "input_shape"
     mtype: "<type \'property\'>"
@@ -53,6 +58,10 @@ tf_class {
     name: "output"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "output_shape"
     mtype: "<type \'property\'>"
@@ -103,7 +112,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -117,6 +130,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
   }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "compute_output_shape"
     argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
@@ -125,10 +142,22 @@ tf_class {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -141,6 +170,10 @@ tf_class {
     name: "get_output_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -149,6 +182,14 @@ tf_class {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "zero_state"
     argspec: "args=[\'self\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-dropout-wrapper.pbtxt b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-dropout-wrapper.pbtxt
index fc1ff386690f9c7acb11d4cc0770e394f78350ad..db16660f1145b55c824c698653094977dd6c718b 100644
--- a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-dropout-wrapper.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-dropout-wrapper.pbtxt
@@ -3,6 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.DropoutWrapper\'>"
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -25,6 +26,10 @@ tf_class {
     name: "input"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "input_shape"
     mtype: "<type \'property\'>"
@@ -53,6 +58,10 @@ tf_class {
     name: "output"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "output_shape"
     mtype: "<type \'property\'>"
@@ -107,7 +116,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -121,6 +134,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
   }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "compute_output_shape"
     argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
@@ -129,10 +146,22 @@ tf_class {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -145,6 +174,10 @@ tf_class {
     name: "get_output_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -153,6 +186,14 @@ tf_class {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "zero_state"
     argspec: "args=[\'self\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-g-r-u-cell.pbtxt b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-g-r-u-cell.pbtxt
index 751122cfff3bf9c55dd9fa264fdf2e1960940724..d7f658aaee153652b93eef812197322a984b6d44 100644
--- a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-g-r-u-cell.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-g-r-u-cell.pbtxt
@@ -4,6 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.LayerRNNCell\'>"
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -26,6 +27,10 @@ tf_class {
     name: "input"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "input_shape"
     mtype: "<type \'property\'>"
@@ -54,6 +59,10 @@ tf_class {
     name: "output"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "output_shape"
     mtype: "<type \'property\'>"
@@ -104,7 +113,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -118,6 +131,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\', \'state\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "compute_output_shape"
     argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
@@ -126,10 +143,22 @@ tf_class {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -142,6 +171,10 @@ tf_class {
     name: "get_output_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -150,6 +183,14 @@ tf_class {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "zero_state"
     argspec: "args=[\'self\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-l-s-t-m-cell.pbtxt b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-l-s-t-m-cell.pbtxt
index 4b6313f395fd8fd4ec2af78365117620263e7a55..b9ab487c77ccfd8a668e891a2dba0770f1f91ea8 100644
--- a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-l-s-t-m-cell.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-l-s-t-m-cell.pbtxt
@@ -4,6 +4,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.LayerRNNCell\'>"
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -26,6 +27,10 @@ tf_class {
     name: "input"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "input_shape"
     mtype: "<type \'property\'>"
@@ -54,6 +59,10 @@ tf_class {
     name: "output"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "output_shape"
     mtype: "<type \'property\'>"
@@ -104,7 +113,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -118,6 +131,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\', \'state\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "compute_output_shape"
     argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
@@ -126,10 +143,22 @@ tf_class {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -142,6 +171,10 @@ tf_class {
     name: "get_output_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -150,6 +183,14 @@ tf_class {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "zero_state"
     argspec: "args=[\'self\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-multi-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-multi-r-n-n-cell.pbtxt
index 00e8c71140596ecea237ce05a09feff1fbb49001..b9e3d934759accd885036fa4c5a7013ef64736f3 100644
--- a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-multi-r-n-n-cell.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-multi-r-n-n-cell.pbtxt
@@ -3,6 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.MultiRNNCell\'>"
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -25,6 +26,10 @@ tf_class {
     name: "input"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "input_shape"
     mtype: "<type \'property\'>"
@@ -53,6 +58,10 @@ tf_class {
     name: "output"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "output_shape"
     mtype: "<type \'property\'>"
@@ -103,7 +112,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -117,6 +130,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\', \'state\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "compute_output_shape"
     argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
@@ -125,10 +142,22 @@ tf_class {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -141,6 +170,10 @@ tf_class {
     name: "get_output_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -149,6 +182,14 @@ tf_class {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "zero_state"
     argspec: "args=[\'self\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-r-n-n-cell.pbtxt
index 3852f90dd6c4a254e20e789bdeb7796d61cef6bc..75b5898c591cbe2b761c0f709159c5489cb8f76a 100644
--- a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-r-n-n-cell.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-r-n-n-cell.pbtxt
@@ -2,6 +2,7 @@ path: "tensorflow.nn.rnn_cell.RNNCell"
 tf_class {
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -24,6 +25,10 @@ tf_class {
     name: "input"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "input_shape"
     mtype: "<type \'property\'>"
@@ -52,6 +57,10 @@ tf_class {
     name: "output"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "output_shape"
     mtype: "<type \'property\'>"
@@ -90,7 +99,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'trainable\', \'name\', \'dtype\', \'activity_regularizer\'], varargs=None, keywords=kwargs, defaults=[\'True\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'trainable\', \'name\', \'dtype\'], varargs=None, keywords=kwargs, defaults=[\'True\', \'None\', \'None\'], "
   }
   member_method {
     name: "add_loss"
@@ -102,7 +111,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -116,6 +129,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
   }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "compute_output_shape"
     argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
@@ -124,10 +141,22 @@ tf_class {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -140,6 +169,10 @@ tf_class {
     name: "get_output_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -148,6 +181,14 @@ tf_class {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "zero_state"
     argspec: "args=[\'self\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt
index 8f3f0f7506ef49014b31cd4bc04f1cb1e0d696fc..fee0dc63b997f328a4e3d44040c4056de4128eb7 100644
--- a/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt
@@ -3,6 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.ResidualWrapper\'>"
   is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.keras._impl.keras.engine.base_layer.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
   member {
@@ -25,6 +26,10 @@ tf_class {
     name: "input"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "input_shape"
     mtype: "<type \'property\'>"
@@ -53,6 +58,10 @@ tf_class {
     name: "output"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "output_shape"
     mtype: "<type \'property\'>"
@@ -103,7 +112,11 @@ tf_class {
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "apply"
@@ -117,6 +130,10 @@ tf_class {
     name: "call"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
   }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "compute_output_shape"
     argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
@@ -125,10 +142,22 @@ tf_class {
     name: "count_params"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_input_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -141,6 +170,10 @@ tf_class {
     name: "get_output_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_output_shape_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -149,6 +182,14 @@ tf_class {
     name: "get_updates_for"
     argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "zero_state"
     argspec: "args=[\'self\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.pbtxt b/tensorflow/tools/api/golden/tensorflow.pbtxt
index 937044aece83e49549bf6aca938bf673203f392b..be64fd19d89de9d3c1ecad0c36f9874bb0e5aaba 100644
--- a/tensorflow/tools/api/golden/tensorflow.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.pbtxt
@@ -404,6 +404,10 @@ tf_module {
     name: "manip"
     mtype: "<type \'module\'>"
   }
+  member {
+    name: "math"
+    mtype: "<type \'module\'>"
+  }
   member {
     name: "metrics"
     mtype: "<type \'module\'>"
@@ -910,7 +914,7 @@ tf_module {
   }
   member_method {
     name: "decode_csv"
-    argspec: "args=[\'records\', \'record_defaults\', \'field_delim\', \'use_quote_delim\', \'name\', \'na_value\'], varargs=None, keywords=None, defaults=[\',\', \'True\', \'None\', \'\'], "
+    argspec: "args=[\'records\', \'record_defaults\', \'field_delim\', \'use_quote_delim\', \'name\', \'na_value\', \'select_cols\'], varargs=None, keywords=None, defaults=[\',\', \'True\', \'None\', \'\', \'None\'], "
   }
   member_method {
     name: "decode_json_example"
diff --git a/tensorflow/tools/api/golden/tensorflow.summary.-file-writer.pbtxt b/tensorflow/tools/api/golden/tensorflow.summary.-file-writer.pbtxt
index dcf747971b7b8bf243502b2388da635705b8ee3e..6b65b0ace3cf7740ab03390841c941592000d127 100644
--- a/tensorflow/tools/api/golden/tensorflow.summary.-file-writer.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.summary.-file-writer.pbtxt
@@ -5,7 +5,7 @@ tf_class {
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'logdir\', \'graph\', \'max_queue\', \'flush_secs\', \'graph_def\', \'filename_suffix\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'120\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'logdir\', \'graph\', \'max_queue\', \'flush_secs\', \'graph_def\', \'filename_suffix\', \'session\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'120\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "add_event"
diff --git a/tensorflow/tools/api/lib/BUILD b/tensorflow/tools/api/lib/BUILD
index 2d3b838957d60ffb5e827c6b43100d217cc5739e..3f4fb9104271539c431f02e21b7e30780a721fd7 100644
--- a/tensorflow/tools/api/lib/BUILD
+++ b/tensorflow/tools/api/lib/BUILD
@@ -26,15 +26,3 @@ py_library(
         "//tensorflow/python:util",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/tools/api/tests/BUILD b/tensorflow/tools/api/tests/BUILD
index 15bf1abb5f8f541c435be77b1a3c2f13382f2438..724b12cd4799eb76fe602c737c850e96e92faa58 100644
--- a/tensorflow/tools/api/tests/BUILD
+++ b/tensorflow/tools/api/tests/BUILD
@@ -23,7 +23,6 @@ py_test(
     ],
     srcs_version = "PY2AND3",
     deps = [
-        "//tensorflow:experimental_tensorflow_py",
         "//tensorflow:tensorflow_py",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:lib",
@@ -42,15 +41,3 @@ tf_cc_binary(
         "//tensorflow/core:op_gen_lib",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/tools/api/tests/api_compatibility_test.py b/tensorflow/tools/api/tests/api_compatibility_test.py
index 603b2a4327b94873b9908d5e0e114dcc4f7542dc..1ad6b6d1c0ae5ca1ac1329fd49e972840020e4c3 100644
--- a/tensorflow/tools/api/tests/api_compatibility_test.py
+++ b/tensorflow/tools/api/tests/api_compatibility_test.py
@@ -34,7 +34,6 @@ import sys
 import unittest
 
 import tensorflow as tf
-from tensorflow import experimental_api as api
 
 from google.protobuf import text_format
 
@@ -47,8 +46,6 @@ from tensorflow.tools.api.lib import python_object_to_proto_visitor
 from tensorflow.tools.common import public_api
 from tensorflow.tools.common import traverse
 
-if hasattr(tf, 'experimental_api'):
-  del tf.experimental_api
 
 # FLAGS defined at the bottom:
 FLAGS = None
@@ -237,49 +234,6 @@ class ApiCompatibilityTest(test.TestCase):
         verbose=FLAGS.verbose_diffs,
         update_goldens=FLAGS.update_goldens)
 
-  @unittest.skipUnless(
-      sys.version_info.major == 2,
-      'API compabitility test goldens are generated using python2.')
-  def testNewAPIBackwardsCompatibility(self):
-    # Extract all API stuff.
-    visitor = python_object_to_proto_visitor.PythonObjectToProtoVisitor()
-
-    public_api_visitor = public_api.PublicAPIVisitor(visitor)
-    public_api_visitor.do_not_descend_map['tf'].append('contrib')
-    public_api_visitor.do_not_descend_map['tf.GPUOptions'] = ['Experimental']
-    # TODO(annarev): Make slide_dataset available in API.
-    public_api_visitor.private_map['tf'] = ['slide_dataset']
-    traverse.traverse(api, public_api_visitor)
-
-    proto_dict = visitor.GetProtos()
-
-    # Read all golden files.
-    expression = os.path.join(
-        resource_loader.get_root_dir_with_all_resources(),
-        _KeyToFilePath('*'))
-    golden_file_list = file_io.get_matching_files(expression)
-
-    def _ReadFileToProto(filename):
-      """Read a filename, create a protobuf from its contents."""
-      ret_val = api_objects_pb2.TFAPIObject()
-      text_format.Merge(file_io.read_file_to_string(filename), ret_val)
-      return ret_val
-
-    golden_proto_dict = {
-        _FileNameToKey(filename): _ReadFileToProto(filename)
-        for filename in golden_file_list
-    }
-
-    # Diff them. Do not fail if called with update.
-    # If the test is run to update goldens, only report diffs but do not fail.
-    self._AssertProtoDictEquals(
-        golden_proto_dict,
-        proto_dict,
-        verbose=FLAGS.verbose_diffs,
-        update_goldens=False,
-        additional_missing_object_message=
-        'Check if tf_export decorator/call is missing for this symbol.')
-
 
 if __name__ == '__main__':
   parser = argparse.ArgumentParser()
diff --git a/tensorflow/tools/benchmark/BUILD b/tensorflow/tools/benchmark/BUILD
index 6ed2594e6abe169577066678e1bf4b9e2df4c4d3..566a172ea77fbc033496ef00a3415cff9ad8149a 100644
--- a/tensorflow/tools/benchmark/BUILD
+++ b/tensorflow/tools/benchmark/BUILD
@@ -90,12 +90,3 @@ tf_cc_binary(
     visibility = ["//visibility:public"],
     deps = [":benchmark_model_lib"],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = ["**/OWNERS"],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/tools/build_info/BUILD b/tensorflow/tools/build_info/BUILD
index cdc47076ced24698d0139c1c14d1660018b1a815..730741780550bfe3fbccd7e62f5f7d9788f0a9a9 100644
--- a/tensorflow/tools/build_info/BUILD
+++ b/tensorflow/tools/build_info/BUILD
@@ -9,18 +9,3 @@ exports_files(
         "gen_build_info.py",
     ],
 )
-
-# -----------------------------------------------------------------------------
-# Google-internal targets.  These must be at the end for syncrepo.
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/tools/ci_build/ci_sanity.sh b/tensorflow/tools/ci_build/ci_sanity.sh
index aeac085d30aef746366192361f249eb01f95e8da..9627475d84f261e2cbe22548764eaa4f6f59068b 100755
--- a/tensorflow/tools/ci_build/ci_sanity.sh
+++ b/tensorflow/tools/ci_build/ci_sanity.sh
@@ -431,7 +431,8 @@ cmd_status(){
 # out by default in TF WORKSPACE file.
 do_bazel_nobuild() {
   BUILD_TARGET="//tensorflow/..."
-  BUILD_TARGET="${BUILD_TARGET} -//tensorflow/contrib/lite/java/demo/app/src/main/..."
+  BUILD_TARGET="${BUILD_TARGET} -//tensorflow/contrib/lite/java/demo/app/..."
+  BUILD_TARGET="${BUILD_TARGET} -//tensorflow/contrib/lite/examples/android/..."
   BUILD_TARGET="${BUILD_TARGET} -//tensorflow/contrib/lite/schema/..."
   BUILD_CMD="bazel build --nobuild ${BAZEL_FLAGS} -- ${BUILD_TARGET}"
 
diff --git a/tensorflow/tools/ci_build/install/install_golang.sh b/tensorflow/tools/ci_build/install/install_golang.sh
index e1edd62cc505654b7266c212822561188bbc701c..124ad82e916fe70c0d26a7d09d27a9c510320c1e 100755
--- a/tensorflow/tools/ci_build/install/install_golang.sh
+++ b/tensorflow/tools/ci_build/install/install_golang.sh
@@ -16,7 +16,7 @@
 
 set -ex
 
-GOLANG_URL="https://storage.googleapis.com/golang/go1.9.2.linux-amd64.tar.gz"
+GOLANG_URL="https://storage.googleapis.com/golang/go1.10.linux-amd64.tar.gz"
 
 sudo mkdir -p /usr/local
 wget -q -O - "${GOLANG_URL}" | sudo tar -C /usr/local -xz
diff --git a/tensorflow/tools/ci_build/osx/cpu/run_py2_cc_core.sh b/tensorflow/tools/ci_build/osx/cpu/run_py2_cc_core.sh
index 338066131b5d4511ae9f0646a1269b182cf8e1fa..c7cc16e6699830da4dff6cd32136da65fb6a41af 100755
--- a/tensorflow/tools/ci_build/osx/cpu/run_py2_cc_core.sh
+++ b/tensorflow/tools/ci_build/osx/cpu/run_py2_cc_core.sh
@@ -33,6 +33,7 @@ yes "" | $PYTHON_BIN_PATH configure.py
 which bazel
 bazel test --test_tag_filters=-no_oss,-gpu,-benchmark-test,-nomac,-no_mac \
     --test_timeout 300,450,1200,3600 --config=opt \
+    --announce_rc \
     --test_size_filters=small,medium \
     --jobs=${N_JOBS} --build_tests_only --test_output=errors -k -- \
     //tensorflow/... -//tensorflow/compiler/... -//tensorflow/contrib/...
diff --git a/tensorflow/tools/ci_build/osx/cpu/run_py3_cc_core.sh b/tensorflow/tools/ci_build/osx/cpu/run_py3_cc_core.sh
index 920a261ae3c8d68ec0b0d311fd361e3843eebd86..7e0e81a1ebdc9e4ad4e76f6582892914cd1a5881 100755
--- a/tensorflow/tools/ci_build/osx/cpu/run_py3_cc_core.sh
+++ b/tensorflow/tools/ci_build/osx/cpu/run_py3_cc_core.sh
@@ -31,6 +31,7 @@ export PYTHON_BIN_PATH=$(which python3)
 yes "" | $PYTHON_BIN_PATH configure.py
 which bazel
 bazel test --test_tag_filters=-no_oss,-gpu,-benchmark-test,-nomac,-no_mac \
+    --announce_rc \
     --test_timeout 300,450,1200,3600 \
     --test_size_filters=small,medium \
     --jobs=${N_JOBS} --build_tests_only --test_output=errors -k -- \
diff --git a/tensorflow/tools/ci_build/windows/cpu/cmake/run_py.bat b/tensorflow/tools/ci_build/windows/cpu/cmake/run_py.bat
index 3c3b223a0044b7136ea4dee20fa72cd2fed3742a..30554a084c5689768665557d593b928fbd98d8cb 100644
--- a/tensorflow/tools/ci_build/windows/cpu/cmake/run_py.bat
+++ b/tensorflow/tools/ci_build/windows/cpu/cmake/run_py.bat
@@ -28,6 +28,9 @@ IF DEFINED TF_NIGHTLY (ECHO TF_NIGHTLY is set to %TF_NIGHTLY%) ELSE (SET TF_NIGH
 :: Set pip binary location. Do not override if it is set already.
 IF DEFINED PIP_EXE (ECHO PIP_EXE is set to %PIP_EXE%) ELSE (SET PIP_EXE="C:\Program Files\Anaconda3\Scripts\pip.exe")
 
+:: Install absl-py.
+%PIP_EXE% install --upgrade absl-py
+
 :: Run the CMAKE build to build the pip package.
 CALL %REPO_ROOT%\tensorflow\tools\ci_build\windows\cpu\cmake\run_build.bat
 if %errorlevel% neq 0 exit /b %errorlevel%
@@ -37,9 +40,6 @@ DIR %REPO_ROOT%\%BUILD_DIR%\tf_python\dist\ /S /B > wheel_filename_file
 set /p WHEEL_FILENAME=<wheel_filename_file
 del wheel_filename_file
 
-:: Install absl-py.
-%PIP_EXE% install --upgrade absl-py
-
 :: Install the pip package.
 echo Installing PIP package...
 %PIP_EXE% install --upgrade --no-deps %WHEEL_FILENAME% -v -v
diff --git a/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh b/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh
index 8b8ba31a0dda88ad3c43330e0208a9fa6a7d0276..5e9ae497e15321fd1621e5f36b819fbe45a5a706 100644
--- a/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh
+++ b/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh
@@ -42,6 +42,14 @@ source "tensorflow/tools/ci_build/windows/bazel/common_env.sh" \
 source "tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh" \
   || { echo "Failed to source bazel_test_lib.sh" >&2; exit 1; }
 
+skip_test=0
+
+for ARG in "$@"; do
+  if [[ "$ARG" == --skip_test ]]; then
+    skip_test=1
+  fi
+done
+
 run_configure_for_cpu_build
 
 # --define=override_eigen_strong_inline=true speeds up the compiling of conv_grad_ops_3d.cc and conv_ops_3d.cc
@@ -49,6 +57,10 @@ run_configure_for_cpu_build
 BUILD_OPTS="--define=override_eigen_strong_inline=true"
 bazel build -c opt $BUILD_OPTS tensorflow/tools/pip_package:build_pip_package || exit $?
 
+if [[ "$skip_test" == 1 ]]; then
+  exit 0
+fi
+
 # Create a python test directory to avoid package name conflict
 PY_TEST_DIR="py_test_dir"
 create_python_test_dir "${PY_TEST_DIR}"
@@ -65,4 +77,6 @@ bazel test -c opt $BUILD_OPTS -k --test_output=errors \
   --define=no_tensorflow_py_deps=true --test_lang_filters=py \
   --test_tag_filters=-no_pip,-no_windows,-no_oss \
   --build_tag_filters=-no_pip,-no_windows,-no_oss --build_tests_only \
-  //${PY_TEST_DIR}/tensorflow/python/...
+  --flaky_test_attempts=3 \
+  //${PY_TEST_DIR}/tensorflow/python/... \
+  //${PY_TEST_DIR}/tensorflow/contrib/...
diff --git a/tensorflow/tools/ci_build/windows/libtensorflow_gpu.sh b/tensorflow/tools/ci_build/windows/libtensorflow_gpu.sh
index 94276c6c5c9ce897ca24f03efe3d93e1ea1e00c9..7dfee8f371b8c4795fe748d1fd02ee8d884f18f9 100644
--- a/tensorflow/tools/ci_build/windows/libtensorflow_gpu.sh
+++ b/tensorflow/tools/ci_build/windows/libtensorflow_gpu.sh
@@ -41,7 +41,7 @@ run_configure_for_gpu_build
 # build_libtensorflow_tarball in ../builds/libtensorflow.sh
 # cannot be used on Windows since it relies on pkg_tar rules.
 # So we do something special here
-bazel build -c opt --copt=/arch:AVX \
+bazel build -c opt --copt=/arch:AVX --announce_rc \
   tensorflow:libtensorflow.so \
   tensorflow/tools/lib_package:clicenses_generate \
   tensorflow/java:libtensorflow_jni.so \
diff --git a/tensorflow/tools/common/BUILD b/tensorflow/tools/common/BUILD
index 316e5469e7afda74563cc186c58964664170c5da..b9032c046e93527fd0f41f183e49e4933029ec62 100644
--- a/tensorflow/tools/common/BUILD
+++ b/tensorflow/tools/common/BUILD
@@ -44,14 +44,3 @@ py_test(
         "//tensorflow/python:platform_test",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-)
diff --git a/tensorflow/tools/compatibility/BUILD b/tensorflow/tools/compatibility/BUILD
index 4f90c4d940670c43f65cc3f95971469627ab35c9..b7bfb29aae4fcaa55e01ba924f72cf79d2b09ad1 100644
--- a/tensorflow/tools/compatibility/BUILD
+++ b/tensorflow/tools/compatibility/BUILD
@@ -68,18 +68,3 @@ exports_files(
         "testdata/test_file_v0_11.py",
     ],
 )
-
-# -----------------------------------------------------------------------------
-# Google-internal targets.  These must be at the end for syncrepo.
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/tools/def_file_filter/BUILD b/tensorflow/tools/def_file_filter/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..e390e0fb05c1d881e5fbafb43ea7576347949439
--- /dev/null
+++ b/tensorflow/tools/def_file_filter/BUILD
@@ -0,0 +1,9 @@
+# Description:
+# Tools for filtering DEF file for TensorFlow on Windows
+#
+# On Windows, we use a DEF file generated by Bazel to export
+# symbols from the tensorflow dynamic library(_pywrap_tensorflow.dll).
+# The maximum number of symbols that can be exported per DLL is 64K,
+# so we have to filter some useless symbols through this python script.
+
+package(default_visibility = ["//visibility:public"])
diff --git a/tensorflow/tools/def_file_filter/BUILD.tpl b/tensorflow/tools/def_file_filter/BUILD.tpl
new file mode 100644
index 0000000000000000000000000000000000000000..3cb72f49797d8014e4df3f44155c967ab0e2f9b3
--- /dev/null
+++ b/tensorflow/tools/def_file_filter/BUILD.tpl
@@ -0,0 +1,15 @@
+# Description:
+# Tools for filtering DEF file for TensorFlow on Windows
+#
+# On Windows, we use a DEF file generated by Bazel to export
+# symbols from the tensorflow dynamic library(_pywrap_tensorflow.dll).
+# The maximum number of symbols that can be exported per DLL is 64K,
+# so we have to filter some useless symbols through this python script.
+
+package(default_visibility = ["//visibility:public"])
+
+py_binary(
+    name = "def_file_filter",
+    srcs = ["def_file_filter.py"],
+    srcs_version = "PY2AND3",
+)
diff --git a/tensorflow/tools/def_file_filter/def_file_filter.py.tpl b/tensorflow/tools/def_file_filter/def_file_filter.py.tpl
new file mode 100644
index 0000000000000000000000000000000000000000..8bdc03eb0f19fd6daae826727f429bc1255f0eca
--- /dev/null
+++ b/tensorflow/tools/def_file_filter/def_file_filter.py.tpl
@@ -0,0 +1,168 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""def_file_filter.py - tool to filter a windows def file.
+
+The def file can be used to export symbols from the tensorflow dll to enable
+tf.load_library().
+
+Because the linker allows only 64K symbols to be exported per dll
+we filter the symbols down to the essentials. The regular expressions
+we use for this are specific to tensorflow.
+
+TODO: this works fine but there is an issue with exporting
+'const char * const' and importing it from a user_ops. The problem is
+on the importing end and using __declspec(dllimport) works around it.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+import io
+import os
+import re
+import subprocess
+import sys
+import tempfile
+
+# External tools we use that come with visual studio sdk
+UNDNAME = "%{undname_bin_path}"
+
+# Exclude if matched
+EXCLUDE_RE = re.compile(r"RTTI|deleting destructor|::internal::")
+
+# Include if matched before exclude
+INCLUDEPRE_RE = re.compile(r"google::protobuf::internal::ExplicitlyConstructed|"
+                           r"google::protobuf::internal::ArenaImpl::AllocateAligned|" # for contrib/data/_prefetching_ops
+                           r"google::protobuf::internal::ArenaImpl::AddCleanup|" # for contrib/data/_prefetching_ops
+                           r"google::protobuf::Arena::OnArenaAllocation|" # for contrib/data/_prefetching_ops
+                           r"tensorflow::internal::LogMessage|"
+                           r"tensorflow::internal::LogString|"
+                           r"tensorflow::internal::CheckOpMessageBuilder|"
+                           r"tensorflow::internal::MakeCheckOpValueString|"
+                           r"tensorflow::internal::PickUnusedPortOrDie|"
+                           r"tensorflow::internal::ValidateDevice|"
+                           r"tensorflow::ops::internal::Enter|"
+                           r"tensorflow::strings::internal::AppendPieces|"
+                           r"tensorflow::strings::internal::CatPieces|"
+                           r"tensorflow::io::internal::JoinPathImpl")
+
+# Include if matched after exclude
+INCLUDE_RE = re.compile(r"^(TF_\w*)$|"
+                        r"^(TFE_\w*)$|"
+                        r"nsync::|"
+                        r"tensorflow::|"
+                        r"functor::|"
+                        r"perftools::gputools")
+
+# We want to identify data members explicitly in the DEF file, so that no one
+# can implicitly link against the DLL if they use one of the variables exported
+# from the DLL and the header they use does not decorate the symbol with
+# __declspec(dllimport). It is easier to detect what a data symbol does
+# NOT look like, so doing it with the below regex.
+DATA_EXCLUDE_RE = re.compile(r"[)(]|"
+                             r"vftable|"
+                             r"vbtable|"
+                             r"vcall|"
+                             r"RTTI|"
+                             r"protobuf::internal::ExplicitlyConstructed")
+
+def get_args():
+  """Parse command line."""
+  filename_list = lambda x: x.split(";")
+  parser = argparse.ArgumentParser()
+  parser.add_argument("--input", type=filename_list,
+                      help="paths to input def file",
+                      required=True)
+  parser.add_argument("--output", help="output deffile", required=True)
+  parser.add_argument("--target", help="name of the target", required=True)
+  args = parser.parse_args()
+  return args
+
+
+def main():
+  """main."""
+  args = get_args()
+
+  # Pipe dumpbin to extract all linkable symbols from libs.
+  # Good symbols are collected in candidates and also written to
+  # a temp file.
+  candidates = []
+  tmpfile = tempfile.NamedTemporaryFile(mode="w", delete=False)
+  for def_file_path in args.input:
+    def_file = open(def_file_path, 'r')
+    for line in def_file:
+      cols = line.split()
+      sym = cols[0]
+      tmpfile.file.write(sym + "\n")
+      candidates.append(sym)
+  tmpfile.file.close()
+
+  # Run the symbols through undname to get their undecorated name
+  # so we can filter on something readable.
+  with open(args.output, "w") as def_fp:
+    # track dupes
+    taken = set()
+
+    # Header for the def file.
+    def_fp.write("LIBRARY " + args.target + "\n")
+    def_fp.write("EXPORTS\n")
+    def_fp.write("\t ??1OpDef@tensorflow@@UEAA@XZ\n")
+
+    # Each symbols returned by undname matches the same position in candidates.
+    # We compare on undname but use the decorated name from candidates.
+    dupes = 0
+    proc = subprocess.Popen([UNDNAME, tmpfile.name], stdout=subprocess.PIPE)
+    for idx, line in enumerate(io.TextIOWrapper(proc.stdout, encoding="utf-8")):
+      decorated = candidates[idx]
+      if decorated in taken:
+        # Symbol is already in output, done.
+        dupes += 1
+        continue
+
+      if not INCLUDEPRE_RE.search(line):
+        if EXCLUDE_RE.search(line):
+          continue
+        if not INCLUDE_RE.search(line):
+          continue
+
+      if "deleting destructor" in line:
+        # Some of the symbols convered by INCLUDEPRE_RE export deleting
+        # destructor symbols, which is a bad idea.
+        # So we filter out such symbols here.
+        continue
+
+      if DATA_EXCLUDE_RE.search(line):
+        def_fp.write("\t" + decorated + "\n")
+      else:
+        def_fp.write("\t" + decorated + " DATA\n")
+      taken.add(decorated)
+    def_fp.close()
+
+  exit_code = proc.wait()
+  if exit_code != 0:
+    print("{} failed, exit={}".format(UNDNAME, exit_code))
+    return exit_code
+
+  os.unlink(tmpfile.name)
+
+  print("symbols={}, taken={}, dupes={}"
+        .format(len(candidates), len(taken), dupes))
+  return 0
+
+
+if __name__ == "__main__":
+  sys.exit(main())
diff --git a/tensorflow/tools/def_file_filter/def_file_filter_configure.bzl b/tensorflow/tools/def_file_filter/def_file_filter_configure.bzl
new file mode 100644
index 0000000000000000000000000000000000000000..47539b2423e602bb9771541ae5b01ba76c79f56f
--- /dev/null
+++ b/tensorflow/tools/def_file_filter/def_file_filter_configure.bzl
@@ -0,0 +1,56 @@
+"""Repository rule for def file filter autoconfiguration.
+
+This repository reuses Bazel's VC detect mechanism to find undname.exe,
+which is a tool used in def_file_filter.py.
+
+def_file_filter.py is for filtering the DEF file for TensorFlow on Windows.
+On Windows, we use a DEF file generated by Bazel to export symbols from the
+tensorflow dynamic library(_pywrap_tensorflow.dll). The maximum number of
+symbols that can be exported per DLL is 64K, so we have to filter some useless
+symbols through this python script.
+
+`def_file_filter_config` depends on the following environment variables:
+  * `BAZEL_VC`
+  * `BAZEL_VS`
+  * `VS90COMNTOOLS`
+  * `VS100COMNTOOLS`
+  * `VS110COMNTOOLS`
+  * `VS120COMNTOOLS`
+  * `VS140COMNTOOLS`
+"""
+
+load("@bazel_tools//tools/cpp:windows_cc_configure.bzl", "find_vc_path")
+load("@bazel_tools//tools/cpp:windows_cc_configure.bzl", "find_msvc_tool")
+load("@bazel_tools//tools/cpp:lib_cc_configure.bzl", "auto_configure_fail")
+
+def _def_file_filter_configure_impl(repository_ctx):
+  if repository_ctx.os.name.lower().find("windows") == -1:
+    repository_ctx.symlink(Label("//tensorflow/tools/def_file_filter:BUILD.tpl"), "BUILD")
+    repository_ctx.file("def_file_filter.py", "")
+    return
+  vc_path = find_vc_path(repository_ctx)
+  if vc_path == "visual-studio-not-found":
+    auto_configure_fail("Visual C++ build tools not found on your machine")
+  undname_bin_path = find_msvc_tool(repository_ctx, vc_path, "undname.exe").replace("\\", "\\\\")
+
+  repository_ctx.template(
+    "def_file_filter.py",
+    Label("//tensorflow/tools/def_file_filter:def_file_filter.py.tpl"),
+    {
+      "%{undname_bin_path}": undname_bin_path,
+    })
+  repository_ctx.symlink(Label("//tensorflow/tools/def_file_filter:BUILD.tpl"), "BUILD")
+
+
+def_file_filter_configure = repository_rule(
+    implementation = _def_file_filter_configure_impl,
+    environ = [
+        "BAZEL_VC",
+        "BAZEL_VS",
+        "VS90COMNTOOLS",
+        "VS100COMNTOOLS",
+        "VS110COMNTOOLS",
+        "VS120COMNTOOLS",
+        "VS140COMNTOOLS"
+    ],
+)
diff --git a/tensorflow/tools/dist_test/server/BUILD b/tensorflow/tools/dist_test/server/BUILD
index 865af8dd7b2af686dad852f35187f2d226533596..003a19a9abf470f58070bf44fc5608d1eb3634fe 100644
--- a/tensorflow/tools/dist_test/server/BUILD
+++ b/tensorflow/tools/dist_test/server/BUILD
@@ -37,15 +37,3 @@ py_test(
         "//tensorflow/python:client_testlib",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/tools/docker/BUILD b/tensorflow/tools/docker/BUILD
index 7d5ae0a94d8f969585d8fb8e57892c165e35ba47..849ba49f71994c3c188d8bc7751d9569c3ee73b3 100644
--- a/tensorflow/tools/docker/BUILD
+++ b/tensorflow/tools/docker/BUILD
@@ -13,15 +13,3 @@ py_binary(
     srcs_version = "PY2AND3",
     deps = ["//tensorflow:tensorflow_py"],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/tools/docker/Dockerfile b/tensorflow/tools/docker/Dockerfile
index 024cb40eb4b9380fa09bd0e371826783d1ebdc45..78cb4d250e84a4a165dd42db6845170c1751ffbe 100644
--- a/tensorflow/tools/docker/Dockerfile
+++ b/tensorflow/tools/docker/Dockerfile
@@ -47,7 +47,7 @@ RUN pip --no-cache-dir install \
     http://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.0.0-cp27-none-linux_x86_64.whl
 # --- ~ DO NOT EDIT OR DELETE BETWEEN THE LINES --- #
 
-# RUN ln -s /usr/bin/python3 /usr/bin/python#
+# RUN ln -s -f /usr/bin/python3 /usr/bin/python#
 
 # Set up our notebook config.
 COPY jupyter_notebook_config.py /root/.jupyter/
diff --git a/tensorflow/tools/docker/Dockerfile.devel b/tensorflow/tools/docker/Dockerfile.devel
index 11f476d12c086f70335d9a69d7f3b86b525b5623..b3dbe475d2589b1e99c5cea3eed5371b5facb097 100644
--- a/tensorflow/tools/docker/Dockerfile.devel
+++ b/tensorflow/tools/docker/Dockerfile.devel
@@ -38,6 +38,8 @@ RUN pip --no-cache-dir install \
         && \
     python -m ipykernel.kernelspec
 
+# RUN ln -s -f /usr/bin/python3 /usr/bin/python#
+
 # Set up our notebook config.
 COPY jupyter_notebook_config.py /root/.jupyter/
 
diff --git a/tensorflow/tools/docker/Dockerfile.devel-gpu b/tensorflow/tools/docker/Dockerfile.devel-gpu
index 1fcb6428b21b4ca495bef2b3249b6463e9ef0a10..bfb96da58d7f1e4a32938e7655cf664b4e395b50 100644
--- a/tensorflow/tools/docker/Dockerfile.devel-gpu
+++ b/tensorflow/tools/docker/Dockerfile.devel-gpu
@@ -47,6 +47,8 @@ RUN pip --no-cache-dir install \
         && \
     python -m ipykernel.kernelspec
 
+# RUN ln -s -f /usr/bin/python3 /usr/bin/python#
+
 # Set up our notebook config.
 COPY jupyter_notebook_config.py /root/.jupyter/
 
diff --git a/tensorflow/tools/docker/Dockerfile.gpu b/tensorflow/tools/docker/Dockerfile.gpu
index 625321e1235202f78a2d5e1a5b2d9d05e1e3f9ba..9e1708662e79746e54af4409756774a306990438 100644
--- a/tensorflow/tools/docker/Dockerfile.gpu
+++ b/tensorflow/tools/docker/Dockerfile.gpu
@@ -54,7 +54,7 @@ RUN pip --no-cache-dir install \
     http://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-0.0.0-cp27-none-linux_x86_64.whl
 # --- ~ DO NOT EDIT OR DELETE BETWEEN THE LINES --- #
 
-# RUN ln -s /usr/bin/python3 /usr/bin/python#
+# RUN ln -s -f /usr/bin/python3 /usr/bin/python#
 
 # Set up our notebook config.
 COPY jupyter_notebook_config.py /root/.jupyter/
diff --git a/tensorflow/tools/docker/notebooks/3_mnist_from_scratch.ipynb b/tensorflow/tools/docker/notebooks/3_mnist_from_scratch.ipynb
index 5585ebdcd366ec9db0c47004647970cb27c8bb75..824fe14560bb2c3bfb0729f9e5b5cffa63db19ca 100644
--- a/tensorflow/tools/docker/notebooks/3_mnist_from_scratch.ipynb
+++ b/tensorflow/tools/docker/notebooks/3_mnist_from_scratch.ipynb
@@ -1207,7 +1207,7 @@
    "source": [
     "# Training computation: logits + cross-entropy loss.\n",
     "logits = model(train_data_node, True)\n",
-    "loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(\n",
+    "loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(\n",
     "  labels=train_labels_node, logits=logits))\n",
     "\n",
     "# L2 regularization for the fully connected parameters.\n",
@@ -2031,7 +2031,7 @@
    "views": {}
   },
   "kernelspec": {
-   "display_name": "Python [default]",
+   "display_name": "Python 3",
    "language": "python",
    "name": "python3"
   },
@@ -2049,5 +2049,5 @@
   }
  },
  "nbformat": 4,
- "nbformat_minor": 0
+ "nbformat_minor": 1
 }
diff --git a/tensorflow/tools/docker/notebooks/BUILD b/tensorflow/tools/docker/notebooks/BUILD
index 89f473df4bfdda479fa25b8e10b84c4430105cc9..e9f26899c9afa305afa6ee686a038997a4e6fbe3 100644
--- a/tensorflow/tools/docker/notebooks/BUILD
+++ b/tensorflow/tools/docker/notebooks/BUILD
@@ -3,15 +3,3 @@ package(default_visibility = ["//visibility:private"])
 licenses(["notice"])  # Apache 2.0
 
 exports_files(["LICENSE"])
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/tools/docker/parameterized_docker_build.sh b/tensorflow/tools/docker/parameterized_docker_build.sh
index b4fba5b8f5e19c2fbb8c7261d8cf293757df503c..05de25f2cb11d76f223a31bc12329e6ab7368e8a 100755
--- a/tensorflow/tools/docker/parameterized_docker_build.sh
+++ b/tensorflow/tools/docker/parameterized_docker_build.sh
@@ -284,7 +284,7 @@ if [[ "${TF_DOCKER_BUILD_IS_DEVEL}" == "no" ]]; then
     if sed -i -e 's/python /python3 /g' "${DOCKERFILE}" && \
         sed -i -e 's/python-dev/python3-dev/g' "${DOCKERFILE}" && \
         sed -i -e 's/pip /pip3 /g' "${DOCKERFILE}" && \
-        sed -i -e 's^# RUN ln -s /usr/bin/python3 /usr/bin/python#^RUN ln -s /usr/bin/python3 /usr/bin/python^' "${DOCKERFILE}"
+        sed -i -e 's^# RUN ln -s -f /usr/bin/python3 /usr/bin/python#^RUN ln -s -f /usr/bin/python3 /usr/bin/python^' "${DOCKERFILE}"
     then
       echo "Modified Dockerfile for python version "\
 "${TF_DOCKER_BUILD_PYTHON_VERSION} at: ${DOCKERFILE}"
@@ -306,7 +306,7 @@ else
         sed -i -e 's^/tmp/pip^/tmp/pip3^g' "${DOCKERFILE}" && \
         sed -i -e 's/pip /pip3 /g' "${DOCKERFILE}" && \
         sed -i -e 's/ENV CI_BUILD_PYTHON python/ENV CI_BUILD_PYTHON python3/g' "${DOCKERFILE}" && \
-        sed -i -e 's^# RUN ln -s /usr/bin/python3 /usr/bin/python#^RUN ln -s /usr/bin/python3 /usr/bin/python^' "${DOCKERFILE}"
+        sed -i -e 's^# RUN ln -s -f /usr/bin/python3 /usr/bin/python#^RUN ln -s -f /usr/bin/python3 /usr/bin/python^' "${DOCKERFILE}"
     then
       echo "Modified Dockerfile further for python version ${TF_DOCKER_BUILD_PYTHON_VERSION} at: ${DOCKERFILE}"
     else
diff --git a/tensorflow/tools/docs/BUILD b/tensorflow/tools/docs/BUILD
index 8f10bc9e0ca3c947b8ca75663444309088e0513e..0c1fd0cf9dc4e91601ac50648757613ec08a7f38 100644
--- a/tensorflow/tools/docs/BUILD
+++ b/tensorflow/tools/docs/BUILD
@@ -37,7 +37,7 @@ py_library(
     srcs = ["parser.py"],
     srcs_version = "PY2AND3",
     visibility = ["//visibility:public"],
-    deps = ["@com_github_andreif_codegen"],
+    deps = ["@astor_archive//:astor"],
 )
 
 py_test(
@@ -142,14 +142,3 @@ py_test(
         "//tensorflow/python:client_testlib",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-)
diff --git a/tensorflow/tools/docs/build_docs_test.py b/tensorflow/tools/docs/build_docs_test.py
index ae293f6576456ecdbb8a4b1ee4e8e4f40482ad94..0cbf8b478fa8877c81dd4c06d2713ae2e72f8b58 100644
--- a/tensorflow/tools/docs/build_docs_test.py
+++ b/tensorflow/tools/docs/build_docs_test.py
@@ -19,7 +19,6 @@ from __future__ import division
 from __future__ import print_function
 
 import os
-import sys
 import textwrap
 
 import tensorflow as tf
@@ -39,10 +38,6 @@ class Flags(object):
 class BuildDocsTest(googletest.TestCase):
 
   def testBuildDocs(self):
-    if sys.version_info >= (3, 0):
-      print('Warning: Doc generation is not supported from python3.')
-      return
-
     doc_generator = generate_lib.DocGenerator()
 
     doc_generator.set_py_modules([('tf', tf), ('tfdbg', tf_debug)])
diff --git a/tensorflow/tools/docs/generate_lib.py b/tensorflow/tools/docs/generate_lib.py
index d22a465376f4f58164514e62d302524a43b0dd01..6185c9ddd7b2c044728daa6f14befee3c8add036 100644
--- a/tensorflow/tools/docs/generate_lib.py
+++ b/tensorflow/tools/docs/generate_lib.py
@@ -21,7 +21,6 @@ from __future__ import print_function
 import argparse
 import fnmatch
 import os
-import sys
 
 import six
 
@@ -134,8 +133,12 @@ def write_docs(output_dir, parser_config, yaml_toc, root_title='TensorFlow'):
     try:
       if not os.path.exists(directory):
         os.makedirs(directory)
-      with open(path, 'w') as f:
-        f.write(pretty_docs.build_md_page(page_info))
+      # This function returns raw bytes in PY2 or unicode in PY3.
+      text = pretty_docs.build_md_page(page_info)
+      if six.PY3:
+        text = text.encode('utf-8')
+      with open(path, 'wb') as f:
+        f.write(text)
     except OSError as e:
       print('Cannot write documentation for %s to %s: %s' % (full_name,
                                                              directory, e))
@@ -211,7 +214,6 @@ def _get_default_do_not_descend_map():
       'tf': ['cli', 'lib', 'wrappers'],
       'tf.contrib': [
           'compiler',
-          'distribute',
           'grid_rnn',
           # Block contrib.keras to de-clutter the docs
           'keras',
@@ -434,19 +436,19 @@ def _other_docs(src_dir, output_dir, reference_resolver, file_pattern='*.md'):
       full_out_path = os.path.join(output_dir, suffix)
       if not fnmatch.fnmatch(base_name, file_pattern):
         print('Copying un-matched file %s...' % suffix)
-        open(full_out_path, 'w').write(open(full_in_path).read())
+        open(full_out_path, 'wb').write(open(full_in_path, 'rb').read())
         continue
       if dirpath.endswith('/api_guides/python'):
         print('Processing Python guide %s...' % base_name)
         content = tag_updater.process(full_in_path)
       else:
         print('Processing doc %s...' % suffix)
-        content = open(full_in_path).read()
+        content = open(full_in_path, 'rb').read().decode('utf-8')
 
       content = reference_resolver.replace_references(content,
                                                       relative_path_to_root)
-      with open(full_out_path, 'w') as f:
-        f.write(content)
+      with open(full_out_path, 'wb') as f:
+        f.write(content.encode('utf-8'))
 
   print('Done.')
 
@@ -455,8 +457,6 @@ class DocGenerator(object):
   """Main entry point for generating docs."""
 
   def __init__(self):
-    if sys.version_info >= (3, 0):
-      sys.exit('Doc generation is not supported from python3.')
     self.argument_parser = argparse.ArgumentParser()
     self._py_modules = None
     self._private_map = _get_default_private_map()
diff --git a/tensorflow/tools/docs/generate_lib_test.py b/tensorflow/tools/docs/generate_lib_test.py
index 1ceaf31f1c3b83e2c2cb3c0d2022ce98781aed4b..ea6d28a02b1f3c07fe8783fd59e345dade1fc804 100644
--- a/tensorflow/tools/docs/generate_lib_test.py
+++ b/tensorflow/tools/docs/generate_lib_test.py
@@ -52,9 +52,6 @@ class DummyVisitor(object):
 class GenerateTest(googletest.TestCase):
 
   def test_write(self):
-    if sys.version_info >= (3, 0):
-      self.skipTest('Warning: Doc generation is not supported from python3.')
-
     module = sys.modules[__name__]
 
     index = {
diff --git a/tensorflow/tools/docs/parser.py b/tensorflow/tools/docs/parser.py
index d2a63ecc4960117eb64fcc4f94bf882d4a3f91dd..cec23b1a36ca7e6a94d850c393271fa8616a0717 100644
--- a/tensorflow/tools/docs/parser.py
+++ b/tensorflow/tools/docs/parser.py
@@ -26,7 +26,7 @@ import os
 import re
 import sys
 
-import codegen
+import astor
 import six
 
 from google.protobuf.message import Message as ProtoMessage
@@ -621,20 +621,20 @@ def _parse_md_docstring(py_object, relative_path_to_root, reference_resolver):
 def _get_arg_spec(func):
   """Extracts signature information from a function or functools.partial object.
 
-  For functions, uses `tf_inspect.getargspec`. For `functools.partial` objects,
-  corrects the signature of the underlying function to take into account the
-  removed arguments.
+  For functions, uses `tf_inspect.getfullargspec`. For `functools.partial`
+  objects, corrects the signature of the underlying function to take into
+  account the removed arguments.
 
   Args:
     func: A function whose signature to extract.
 
   Returns:
-    An `ArgSpec` namedtuple `(args, varargs, keywords, defaults)`, as returned
-    by `tf_inspect.getargspec`.
+    An `FullArgSpec` namedtuple `(args, varargs, varkw, defaults, etc.)`,
+    as returned by `tf_inspect.getfullargspec`.
   """
-  # getargspec does not work for functools.partial objects directly.
+  # getfullargspec does not work for functools.partial objects directly.
   if isinstance(func, functools.partial):
-    argspec = tf_inspect.getargspec(func.func)
+    argspec = tf_inspect.getfullargspec(func.func)
     # Remove the args from the original function that have been used up.
     first_default_arg = (
         len(argspec.args or []) - len(argspec.defaults or []))
@@ -657,12 +657,14 @@ def _get_arg_spec(func):
           argspec_defaults.pop(i-first_default_arg)
         else:
           first_default_arg -= 1
-    return tf_inspect.ArgSpec(args=argspec_args,
-                              varargs=argspec.varargs,
-                              keywords=argspec.keywords,
-                              defaults=tuple(argspec_defaults))
+    return tf_inspect.FullArgSpec(args=argspec_args,
+                                  varargs=argspec.varargs,
+                                  varkw=argspec.varkw,
+                                  defaults=tuple(argspec_defaults),
+                                  kwonlyargs=[], kwonlydefaults=None,
+                                  annotations={})
   else:  # Regular function or method, getargspec will work fine.
-    return tf_inspect.getargspec(func)
+    return tf_inspect.getfullargspec(func)
 
 
 def _remove_first_line_indent(string):
@@ -670,11 +672,14 @@ def _remove_first_line_indent(string):
   return '\n'.join([line[indent:] for line in string.split('\n')])
 
 
+PAREN_NUMBER_RE = re.compile("^\(([0-9.e-]+)\)")
+
+
 def _generate_signature(func, reverse_index):
   """Given a function, returns a list of strings representing its args.
 
   This function produces a list of strings representing the arguments to a
-  python function. It uses tf_inspect.getargspec, which
+  python function. It uses tf_inspect.getfullargspec, which
   does not generalize well to Python 3.x, which is more flexible in how *args
   and **kwargs are handled. This is not a problem in TF, since we have to remain
   compatible to Python 2.7 anyway.
@@ -725,7 +730,11 @@ def _generate_signature(func, reverse_index):
       if id(default) in reverse_index:
         default_text = reverse_index[id(default)]
       elif ast_default is not None:
-        default_text = codegen.to_source(ast_default)
+        default_text = (
+            astor.to_source(ast_default).rstrip('\n').replace('\t', '\\t')
+            .replace('\n', '\\n').replace('"""', "'"))
+        default_text = PAREN_NUMBER_RE.sub('\\1', default_text)
+
         if default_text != repr(default):
           # This may be an internal name. If so, handle the ones we know about.
           # TODO(wicke): This should be replaced with a lookup in the index.
@@ -758,8 +767,8 @@ def _generate_signature(func, reverse_index):
   # Add *args and *kwargs.
   if argspec.varargs:
     args_list.append('*' + argspec.varargs)
-  if argspec.keywords:
-    args_list.append('**' + argspec.keywords)
+  if argspec.varkw:
+    args_list.append('**' + argspec.varkw)
 
   return args_list
 
@@ -1138,7 +1147,8 @@ class _ClassPageInfo(object):
       # Remove builtin members that we never want to document.
       if short_name in ['__class__', '__base__', '__weakref__', '__doc__',
                         '__module__', '__dict__', '__abstractmethods__',
-                        '__slots__', '__getnewargs__']:
+                        '__slots__', '__getnewargs__', '__str__',
+                        '__repr__', '__hash__']:
         continue
 
       child_name = '.'.join([self.full_name, short_name])
@@ -1183,7 +1193,7 @@ class _ClassPageInfo(object):
         # obvious what they do, don't include them in the docs if there's no
         # docstring.
         if not child_doc.brief.strip() and short_name in [
-            '__str__', '__repr__', '__hash__', '__del__', '__copy__']:
+            '__del__', '__copy__']:
           print('Skipping %s, defined in %s, no docstring.' % (child_name,
                                                                defining_class))
           continue
diff --git a/tensorflow/tools/docs/parser_test.py b/tensorflow/tools/docs/parser_test.py
index fca5436ca5fadd1fb5da07d7523bb51c871164b5..d7757d78ed42a5307635776b3595153465ac8d71 100644
--- a/tensorflow/tools/docs/parser_test.py
+++ b/tensorflow/tools/docs/parser_test.py
@@ -398,7 +398,6 @@ class ParserTest(googletest.TestCase):
     self.assertIn('<code>test_function', docs)
 
   def test_argspec_for_functools_partial(self):
-
     # pylint: disable=unused-argument
     def test_function_for_partial1(arg1, arg2, kwarg1=1, kwarg2=2):
       pass
@@ -409,45 +408,67 @@ class ParserTest(googletest.TestCase):
 
     # pylint: disable=protected-access
     # Make sure everything works for regular functions.
-    expected = tf_inspect.ArgSpec(['arg1', 'arg2', 'kwarg1', 'kwarg2'], None,
-                                  None, (1, 2))
+    expected = tf_inspect.FullArgSpec(args=['arg1', 'arg2', 'kwarg1', 'kwarg2'],
+                                      varargs=None, varkw=None, defaults=(1, 2),
+                                      kwonlyargs=[], kwonlydefaults=None,
+                                      annotations={})
     self.assertEqual(expected, parser._get_arg_spec(test_function_for_partial1))
 
     # Make sure doing nothing works.
-    expected = tf_inspect.ArgSpec(['arg1', 'arg2', 'kwarg1', 'kwarg2'], None,
-                                  None, (1, 2))
+    expected = tf_inspect.FullArgSpec(args=['arg1', 'arg2', 'kwarg1', 'kwarg2'],
+                                      varargs=None, varkw=None, defaults=(1, 2),
+                                      kwonlyargs=[], kwonlydefaults=None,
+                                      annotations={})
     partial = functools.partial(test_function_for_partial1)
     self.assertEqual(expected, parser._get_arg_spec(partial))
 
     # Make sure setting args from the front works.
-    expected = tf_inspect.ArgSpec(['arg2', 'kwarg1', 'kwarg2'], None, None,
-                                  (1, 2))
+    expected = tf_inspect.FullArgSpec(args=['arg2', 'kwarg1', 'kwarg2'],
+                                      varargs=None, varkw=None, defaults=(1, 2),
+                                      kwonlyargs=[], kwonlydefaults=None,
+                                      annotations={})
     partial = functools.partial(test_function_for_partial1, 1)
     self.assertEqual(expected, parser._get_arg_spec(partial))
 
-    expected = tf_inspect.ArgSpec(['kwarg2',], None, None, (2,))
+    expected = tf_inspect.FullArgSpec(args=['kwarg2'],
+                                      varargs=None, varkw=None, defaults=(2,),
+                                      kwonlyargs=[], kwonlydefaults=None,
+                                      annotations={})
     partial = functools.partial(test_function_for_partial1, 1, 2, 3)
     self.assertEqual(expected, parser._get_arg_spec(partial))
 
     # Make sure setting kwargs works.
-    expected = tf_inspect.ArgSpec(['arg1', 'arg2', 'kwarg2'], None, None, (2,))
+    expected = tf_inspect.FullArgSpec(args=['arg1', 'arg2', 'kwarg2'],
+                                      varargs=None, varkw=None, defaults=(2,),
+                                      kwonlyargs=[], kwonlydefaults=None,
+                                      annotations={})
     partial = functools.partial(test_function_for_partial1, kwarg1=0)
     self.assertEqual(expected, parser._get_arg_spec(partial))
 
-    expected = tf_inspect.ArgSpec(['arg1', 'arg2', 'kwarg1'], None, None, (1,))
+    expected = tf_inspect.FullArgSpec(args=['arg1', 'arg2', 'kwarg1'],
+                                      varargs=None, varkw=None, defaults=(1,),
+                                      kwonlyargs=[], kwonlydefaults=None,
+                                      annotations={})
     partial = functools.partial(test_function_for_partial1, kwarg2=0)
     self.assertEqual(expected, parser._get_arg_spec(partial))
 
-    expected = tf_inspect.ArgSpec(['arg1'], None, None, ())
+    expected = tf_inspect.FullArgSpec(args=['arg1'],
+                                      varargs=None, varkw=None, defaults=(),
+                                      kwonlyargs=[], kwonlydefaults=None,
+                                      annotations={})
     partial = functools.partial(test_function_for_partial1,
                                 arg2=0, kwarg1=0, kwarg2=0)
     self.assertEqual(expected, parser._get_arg_spec(partial))
 
     # Make sure *args, *kwargs is accounted for.
-    expected = tf_inspect.ArgSpec([], 'my_args', 'my_kwargs', ())
+    expected = tf_inspect.FullArgSpec(args=[],
+                                      varargs='my_args', varkw='my_kwargs',
+                                      defaults=(),
+                                      kwonlyargs=[], kwonlydefaults=None,
+                                      annotations={})
     partial = functools.partial(test_function_for_partial2, 0, 1)
     self.assertEqual(expected, parser._get_arg_spec(partial))
-
+    
     # pylint: enable=protected-access
 
   def testSaveReferenceResolver(self):
@@ -524,10 +545,6 @@ class TestParseFunctionDetails(googletest.TestCase):
 class TestGenerateSignature(googletest.TestCase):
 
   def test_known_object(self):
-    if sys.version_info >= (3, 0):
-      print('Warning: Doc generation is not supported from python3.')
-      return
-
     known_object = object()
     reverse_index = {id(known_object): 'location.of.object.in.api'}
 
diff --git a/tensorflow/tools/docs/pretty_docs.py b/tensorflow/tools/docs/pretty_docs.py
index 543b5fa6fefcd8e8dca99ad7eac7cca76781ccd3..55ab5bdd49a427e680221f4864b3f31a65b12e8d 100644
--- a/tensorflow/tools/docs/pretty_docs.py
+++ b/tensorflow/tools/docs/pretty_docs.py
@@ -101,7 +101,7 @@ def _build_class_page(page_info):
 
     link_template = '[`{short_name}`]({url})'
     parts.append(', '.join(
-        link_template.format(**base.__dict__) for base in page_info.bases))
+        link_template.format(**base._asdict()) for base in page_info.bases))
 
   parts.append('\n\n')
 
@@ -159,7 +159,7 @@ def _build_class_page(page_info):
       h3 = ('<h3 id="{short_name}">'
             '<code>{short_name}</code>'
             '</h3>\n\n')
-      parts.append(h3.format(**method_info.__dict__))
+      parts.append(h3.format(**method_info._asdict()))
 
       if method_info.signature is not None:
         parts.append(_build_signature(method_info, use_full_name=False))
@@ -217,7 +217,7 @@ def _build_module_page(page_info):
     template = '[`{short_name}`]({url}) module'
 
     for item in page_info.modules:
-      parts.append(template.format(**item.__dict__))
+      parts.append(template.format(**item._asdict()))
 
       if item.doc.brief:
         parts.append(': ' + item.doc.brief)
@@ -229,7 +229,7 @@ def _build_module_page(page_info):
     template = '[`class {short_name}`]({url})'
 
     for item in page_info.classes:
-      parts.append(template.format(**item.__dict__))
+      parts.append(template.format(**item._asdict()))
 
       if item.doc.brief:
         parts.append(': ' + item.doc.brief)
@@ -241,7 +241,7 @@ def _build_module_page(page_info):
     template = '[`{short_name}(...)`]({url})'
 
     for item in page_info.functions:
-      parts.append(template.format(**item.__dict__))
+      parts.append(template.format(**item._asdict()))
 
       if item.doc.brief:
         parts.append(': ' + item.doc.brief)
@@ -254,7 +254,7 @@ def _build_module_page(page_info):
     parts.append('## Other Members\n\n')
 
     for item in page_info.other_members:
-      parts.append('`{short_name}`\n\n'.format(**item.__dict__))
+      parts.append('`{short_name}`\n\n'.format(**item._asdict()))
 
   return ''.join(parts)
 
diff --git a/tensorflow/tools/docs/py_guide_parser.py b/tensorflow/tools/docs/py_guide_parser.py
index 216353ecee377260efd5a19c8536ac41c17592a9..328f42d18f1efb0fd82725a4683abad2df0d5a19 100644
--- a/tensorflow/tools/docs/py_guide_parser.py
+++ b/tensorflow/tools/docs/py_guide_parser.py
@@ -44,7 +44,7 @@ class PyGuideParser(object):
 
   def process(self, full_path):
     """Read and process the file at `full_path`."""
-    md_string = open(full_path).read()
+    md_string = open(full_path, 'rb').read().decode('utf-8')
     self._lines = md_string.split('\n')
     seen = set()
 
diff --git a/tensorflow/tools/git/BUILD b/tensorflow/tools/git/BUILD
index 942ceab85fc8d40d9d4b67537d95204503af8bbe..daa17fbd501651540c4c90c6354eb0a5b2f2b7aa 100644
--- a/tensorflow/tools/git/BUILD
+++ b/tensorflow/tools/git/BUILD
@@ -9,18 +9,3 @@ licenses(["notice"])  # Apache 2.0
 exports_files(
     ["gen_git_source.py"],
 )
-
-# -----------------------------------------------------------------------------
-# Google-internal targets.  These must be at the end for syncrepo.
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/tools/git/gen_git_source.py b/tensorflow/tools/git/gen_git_source.py
index cbcdbf5b807a585865e2e3f19291e55388d55cb1..6a1f126131f6ca19220783813514d60299390d40 100755
--- a/tensorflow/tools/git/gen_git_source.py
+++ b/tensorflow/tools/git/gen_git_source.py
@@ -238,7 +238,7 @@ def generate(arglist):
   write_version_info(dest_file, git_version)
 
 
-def raw_generate(output_file):
+def raw_generate(output_file, source_dir):
   """Simple generator used for cmake/make build systems.
 
   This does not create any symlinks. It requires the build system
@@ -246,9 +246,10 @@ def raw_generate(output_file):
 
   Args:
     output_file: Output filename for the version info cc
+    source_dir: Base path of the source code
   """
 
-  git_version = get_git_version(".")
+  git_version = get_git_version(source_dir)
   write_version_info(output_file, git_version)
 
 
@@ -281,6 +282,11 @@ parser.add_argument(
     type=str,
     help="Generate version_info.cc (simpler version used for cmake/make)")
 
+parser.add_argument(
+    "--source_dir",
+    type=str,
+    help="Base path of the source code (used for cmake/make)")
+
 args = parser.parse_args()
 
 if args.configure is not None:
@@ -290,7 +296,10 @@ if args.configure is not None:
 elif args.generate is not None:
   generate(args.generate)
 elif args.raw_generate is not None:
-  raw_generate(args.raw_generate)
+  source_path = "."
+  if args.source_dir is not None:
+    source_path = args.source_dir
+  raw_generate(args.raw_generate, source_path)
 else:
   raise RuntimeError("--configure or --generate or --raw_generate "
                      "must be used")
diff --git a/tensorflow/tools/graph_transforms/BUILD b/tensorflow/tools/graph_transforms/BUILD
index 6e21aa28461819fb9f65642716536e37ada8f9bf..1ad1895269022331bfd8156721778f4d68a10ee7 100644
--- a/tensorflow/tools/graph_transforms/BUILD
+++ b/tensorflow/tools/graph_transforms/BUILD
@@ -313,14 +313,3 @@ tf_py_test(
     ],
     main = "python/transform_graph_test.py",
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-)
diff --git a/tensorflow/tools/graph_transforms/backports_test.cc b/tensorflow/tools/graph_transforms/backports_test.cc
index ab9a61afa7eb1680580c7e0c41f8ff1b47ef6742..80a954e062b06924c6048ac8b011dc1034706e8e 100644
--- a/tensorflow/tools/graph_transforms/backports_test.cc
+++ b/tensorflow/tools/graph_transforms/backports_test.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include "tensorflow/cc/ops/standard_ops.h"
 #include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
 #include "tensorflow/core/public/session.h"
@@ -191,7 +192,7 @@ TEST(BackportTensorArrayV3Test, TestBackportTensorArrayV3Subtypes) {
     std::map<string, const NodeDef*> node_lookup;
     MapNamesToNodes(result, &node_lookup);
     ASSERT_EQ(1, node_lookup.count("v3_node"));
-    EXPECT_TRUE(StringPiece(node_lookup.at("v3_node")->op()).ends_with("V2"));
+    EXPECT_TRUE(str_util::EndsWith(node_lookup.at("v3_node")->op(), "V2"));
   }
 }
 
diff --git a/tensorflow/tools/graph_transforms/fold_constants_lib.cc b/tensorflow/tools/graph_transforms/fold_constants_lib.cc
index 250f54e20fba6e24fe95741b1437ac3718ace6fb..85660f94a85dce29360525f7bb7474494b3f010f 100644
--- a/tensorflow/tools/graph_transforms/fold_constants_lib.cc
+++ b/tensorflow/tools/graph_transforms/fold_constants_lib.cc
@@ -283,6 +283,10 @@ Status FoldConstants(const GraphDef& input_graph_def,
     };
   }
 
+  TF_RETURN_IF_ERROR(context.GetOneInt64Parameter(
+      "max_constant_size_in_bytes", cf_opts.max_constant_size_in_bytes,
+      &cf_opts.max_constant_size_in_bytes));
+
   // Constant folding.
   bool was_mutated;
   TF_RETURN_IF_ERROR(ConstantFold(cf_opts, nullptr, Env::Default(), nullptr,
diff --git a/tensorflow/tools/graph_transforms/fold_constants_test.cc b/tensorflow/tools/graph_transforms/fold_constants_test.cc
index 41106de008d832a022290e6da38cca8ad6d23ffd..a082399a87dbaad913be421fe273ba89b6f7340e 100644
--- a/tensorflow/tools/graph_transforms/fold_constants_test.cc
+++ b/tensorflow/tools/graph_transforms/fold_constants_test.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
 #include "tensorflow/core/public/session.h"
@@ -209,10 +210,10 @@ class ConstantFoldingTest : public ::testing::Test {
     for (const NodeDef& node : graph_def.node()) {
       const StringPiece name(node.name());
       const int occurrence_count = folded_node_map.count(node.name());
-      if (name.ends_with("expect_removed")) {
+      if (str_util::EndsWith(name, "expect_removed")) {
         EXPECT_EQ(0, occurrence_count) << "node.name()=" << node.name();
       }
-      if (name.ends_with("expect_remains")) {
+      if (str_util::EndsWith(name, "expect_remains")) {
         EXPECT_EQ(1, occurrence_count) << "node.name()=" << node.name();
       }
     }
@@ -370,6 +371,46 @@ class ConstantFoldingTest : public ::testing::Test {
     EXPECT_EQ(0, node_map.count("b"));
     EXPECT_EQ(1, node_map.count("c"));
   }
+
+  void TestMaxConstantSizeInBytes() {
+    auto root = tensorflow::Scope::NewRootScope();
+
+    const int width = 100;
+
+    Tensor a_data(DT_FLOAT, TensorShape({width}));
+    test::FillIota<float>(&a_data, 1.0f);
+    Output a_const = ::tensorflow::ops::Const(
+        root.WithOpName("a_expect_remains"), Input::Initializer(a_data));
+
+    Tensor b_data(DT_FLOAT, TensorShape({width}));
+    test::FillIota<float>(&b_data, 1.0f);
+    Output b_const = ::tensorflow::ops::Const(
+        root.WithOpName("b_expect_remains"), Input::Initializer(b_data));
+
+    Output add = ::tensorflow::ops::Add(root.WithOpName("add_expect_remains"),
+                                        a_const, b_const);
+
+    Output placeholder = ::tensorflow::ops::Placeholder(
+        root.WithOpName("placeholder_expect_remains"), DT_FLOAT);
+
+    Output mul = ::tensorflow::ops::Mul(
+        root.WithOpName("output_expect_remains"), add, placeholder);
+
+    GraphDef graph_def;
+    TF_ASSERT_OK(root.ToGraphDef(&graph_def));
+
+    Tensor placeholder_tensor(DT_FLOAT, TensorShape({width}));
+    test::FillIota<float>(&placeholder_tensor, 1.0f);
+
+    // Setting the maximum constant size to 10 bytes should stop the constant
+    // folding at add(a, b) that would have yielded a constant of
+    // 100*sizeof(float) bytes.
+    graph_transforms::TransformFuncContext context;
+    context.params["max_constant_size_in_bytes"] = {"10"};
+    TestConstantFolding(graph_def,
+                        {{"placeholder_expect_remains", placeholder_tensor}},
+                        {}, {"output_expect_remains"}, context);
+  }
 };
 
 TEST_F(ConstantFoldingTest, TestSimpleAdd) { TestSimpleAdd(); }
@@ -394,5 +435,9 @@ TEST_F(ConstantFoldingTest, TestRemoveUnusedNodesMultipleOutputs) {
   TestRemoveUnusedNodesMultipleOutputs();
 }
 
+TEST_F(ConstantFoldingTest, TestMaxConstantSizeInBytes) {
+  TestMaxConstantSizeInBytes();
+}
+
 }  // namespace graph_transforms
 }  // namespace tensorflow
diff --git a/tensorflow/tools/graph_transforms/freeze_requantization_ranges.cc b/tensorflow/tools/graph_transforms/freeze_requantization_ranges.cc
index 2436c7e4a2dc5c8172de2a35abbfc551d6e410fd..f401723808c086bd69743b75b8b4d972e8ab0b83 100644
--- a/tensorflow/tools/graph_transforms/freeze_requantization_ranges.cc
+++ b/tensorflow/tools/graph_transforms/freeze_requantization_ranges.cc
@@ -40,8 +40,8 @@ Status ExtractMinMaxRecords(const string& log_file_name,
   for (const string& file_line : file_lines) {
     // We expect to find a line with components separated by semicolons, so to
     // start make sure that the basic structure is in place/
-    StringPiece line(file_line);
-    if (!line.contains(print_suffix + ";" + requant_prefix)) {
+    if (!str_util::StrContains(file_line,
+                               print_suffix + ";" + requant_prefix)) {
       continue;
     }
     std::vector<string> line_parts = str_util::Split(file_line, ';');
@@ -53,8 +53,7 @@ Status ExtractMinMaxRecords(const string& log_file_name,
     bool min_max_found = false;
     int min_max_index;
     for (int i = 1; i < line_parts.size(); ++i) {
-      StringPiece line_part(line_parts[i]);
-      if (line_part.starts_with(requant_prefix)) {
+      if (str_util::StartsWith(line_parts[i], requant_prefix)) {
         min_max_found = true;
         min_max_index = i;
       }
@@ -90,7 +89,7 @@ Status ExtractMinMaxRecords(const string& log_file_name,
       continue;
     }
     StringPiece name_string = line_parts[min_max_index - 1];
-    if (!name_string.ends_with(print_suffix)) {
+    if (!str_util::EndsWith(name_string, print_suffix)) {
       continue;
     }
     string name =
diff --git a/tensorflow/tools/graph_transforms/insert_logging.cc b/tensorflow/tools/graph_transforms/insert_logging.cc
index e1ee2b420b062937b5e50c10a05406df3cbd7977..377665448c244aeace78f231ba0c263613afd9a0 100644
--- a/tensorflow/tools/graph_transforms/insert_logging.cc
+++ b/tensorflow/tools/graph_transforms/insert_logging.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/graph/node_builder.h"
 #include "tensorflow/core/graph/subgraph.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/public/session.h"
 #include "tensorflow/core/util/command_line_flags.h"
@@ -101,7 +102,7 @@ Status InsertLogging(const GraphDef& input_graph_def,
     const bool op_matches = (ops.count(node.op()) > 0);
     bool prefix_matches = false;
     for (const string& prefix : prefixes) {
-      if (StringPiece(node.name()).starts_with(prefix)) {
+      if (str_util::StartsWith(node.name(), prefix)) {
         prefix_matches = true;
       }
     }
diff --git a/tensorflow/tools/graph_transforms/sparsify_gather.cc b/tensorflow/tools/graph_transforms/sparsify_gather.cc
index 701e350fc39d083665f5420e6b73510c182e12ce..cc82100148117c7846ba5781e1a97e172ad7f03c 100644
--- a/tensorflow/tools/graph_transforms/sparsify_gather.cc
+++ b/tensorflow/tools/graph_transforms/sparsify_gather.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/graph/node_builder.h"
 #include "tensorflow/core/graph/subgraph.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/public/session.h"
 #include "tensorflow/core/util/command_line_flags.h"
@@ -88,7 +89,7 @@ void CreateConstNode(const Tensor& tensor, const string& name,
 
 string GetMonolithicTensorKey(const string& tensor_slice_name) {
   std::vector<string> names = Split(tensor_slice_name, "/");
-  if (StringPiece(names[names.size() - 1]).starts_with("part_")) {
+  if (str_util::StartsWith(names[names.size() - 1], "part_")) {
     CHECK_GE(names.size(), 2);
     names.pop_back();
   }
@@ -102,8 +103,8 @@ Status ObtainTensorSlice(const GraphDef& input_graph_def,
   for (const auto& node : input_graph_def.node()) {
     std::vector<string> node_name_parts = Split(node.name(), "/");
     if (node_name_parts.size() == 2 &&
-        StringPiece(node_name_parts[0]).starts_with("save") &&
-        StringPiece(node_name_parts[1]).starts_with("Assign") &&
+        str_util::StartsWith(node_name_parts[0], "save") &&
+        str_util::StartsWith(node_name_parts[1], "Assign") &&
         node.input(0) == target_name) {
       restore_node_name = node.input(1);
       break;
diff --git a/tensorflow/tools/graph_transforms/transform_graph_test.cc b/tensorflow/tools/graph_transforms/transform_graph_test.cc
index bc2412fcbdba90731318eea1a2239aa914b35ffc..b276229aa44f747ee81ebcdfe204468929c9eb53 100644
--- a/tensorflow/tools/graph_transforms/transform_graph_test.cc
+++ b/tensorflow/tools/graph_transforms/transform_graph_test.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
 #include "tensorflow/core/public/session.h"
@@ -112,12 +113,11 @@ class TransformGraphTest : public ::testing::Test {
     graph_transforms::MapNamesToNodes(out_graph_def, &out_node_map);
 
     for (const NodeDef& node : out_graph_def.node()) {
-      const StringPiece name(node.name());
       const int occurrence_count = out_node_map.count(node.name());
-      if (name.ends_with("expect_removed")) {
+      if (str_util::EndsWith(node.name(), "expect_removed")) {
         EXPECT_EQ(0, occurrence_count) << "node.name()=" << node.name();
       }
-      if (name.ends_with("expect_remains")) {
+      if (str_util::EndsWith(node.name(), "expect_remains")) {
         EXPECT_EQ(1, occurrence_count) << "node.name()=" << node.name();
       }
     }
@@ -139,7 +139,7 @@ class TransformGraphTest : public ::testing::Test {
     Status no_such_status =
         TransformGraph({}, {}, {{"test_no_such_transform", {}}}, &graph_def);
     EXPECT_TRUE(
-        StringPiece(no_such_status.ToString()).contains("not recognized"));
+        str_util::StrContains(no_such_status.ToString(), "not recognized"));
   }
 
   void TestParseTransformParameters() {
diff --git a/tensorflow/tools/graph_transforms/transform_utils.cc b/tensorflow/tools/graph_transforms/transform_utils.cc
index 55f28a9e1d8d639a316c9bd121204d603217dea3..367048965d146d782267f23330a435ae72f7f49a 100644
--- a/tensorflow/tools/graph_transforms/transform_utils.cc
+++ b/tensorflow/tools/graph_transforms/transform_utils.cc
@@ -88,7 +88,7 @@ void NodeNamePartsFromInput(const string& input_name, string* prefix,
     *suffix = ":" + input_parts[1];
   }
   StringPiece node_name_piece(input_parts[0]);
-  if (node_name_piece.Consume("^")) {
+  if (str_util::ConsumePrefix(&node_name_piece, "^")) {
     *prefix = "^";
   } else {
     *prefix = "";
@@ -200,8 +200,7 @@ Status SortByExecutionOrder(const GraphDef& input_graph_def,
       // for merge only wait for one non-control input.
       int32 num_control_edges = 0;
       for (int i = 0; i < node_def.input_size(); ++i) {
-        StringPiece input_name(node_def.input(i));
-        if (input_name.starts_with("^")) {
+        if (str_util::StartsWith(node_def.input(i), "^")) {
           num_control_edges++;
         }
       }
@@ -504,7 +503,7 @@ Status RenameNodeInputs(const GraphDef& input_graph_def,
           const string& dest_name = input_to_rename.second;
           bool is_match;
           string match_name;
-          if (StringPiece(source_name).ends_with(":*")) {
+          if (str_util::EndsWith(source_name, ":*")) {
             is_match = true;
             string prefix;
             string unused_node_name;
diff --git a/tensorflow/tools/mlpbtxt/BUILD b/tensorflow/tools/mlpbtxt/BUILD
index f9f48c6500cee99dce1f5c9ffe6284e578e82669..89c683c8c422b7341517d80f7c55cceb1636a657 100644
--- a/tensorflow/tools/mlpbtxt/BUILD
+++ b/tensorflow/tools/mlpbtxt/BUILD
@@ -32,15 +32,3 @@ tf_cc_binary(
         "//tensorflow/core:op_gen_lib",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/tools/pip_package/BUILD b/tensorflow/tools/pip_package/BUILD
index 95cdf0bf3cdc76d5d10205dc4f97680cdfd8f8fe..376644718f427cf172353dc172c028ee6ec2e8fa 100644
--- a/tensorflow/tools/pip_package/BUILD
+++ b/tensorflow/tools/pip_package/BUILD
@@ -48,36 +48,66 @@ py_binary(
     deps = ["//tensorflow:tensorflow_py"],
 )
 
+COMMON_PIP_DEPS = [
+    ":licenses",
+    "MANIFEST.in",
+    "README",
+    "setup.py",
+    ":included_headers",
+    "//tensorflow:tensorflow_py",
+    "//tensorflow/contrib/autograph:autograph",
+    "//tensorflow/contrib/autograph/converters:converters",
+    "//tensorflow/contrib/autograph/converters:test_lib",
+    "//tensorflow/contrib/autograph/impl:impl",
+    "//tensorflow/contrib/autograph/pyct:pyct",
+    "//tensorflow/contrib/autograph/pyct/static_analysis:static_analysis",
+    "//tensorflow/contrib/boosted_trees:boosted_trees_pip",
+    "//tensorflow/contrib/cluster_resolver:cluster_resolver_pip",
+    "//tensorflow/contrib/data/python/kernel_tests:dataset_serialization_test",
+    "//tensorflow/contrib/data/python/ops:contrib_op_loader",
+    "//tensorflow/contrib/eager/python/examples:examples_pip",
+    "//tensorflow/contrib/eager/python:checkpointable_utils",
+    "//tensorflow/contrib/eager/python:evaluator",
+    "//tensorflow/contrib/gan:gan",
+    "//tensorflow/contrib/graph_editor:graph_editor_pip",
+    "//tensorflow/contrib/keras:keras",
+    "//tensorflow/contrib/labeled_tensor:labeled_tensor_pip",
+    "//tensorflow/contrib/nn:nn_py",
+    "//tensorflow/contrib/predictor:predictor_pip",
+    "//tensorflow/contrib/receptive_field:receptive_field_pip",
+    "//tensorflow/contrib/session_bundle:session_bundle_pip",
+    "//tensorflow/contrib/signal:signal_py",
+    "//tensorflow/contrib/signal:test_util",
+    "//tensorflow/contrib/slim:slim",
+    "//tensorflow/contrib/slim/python/slim/data:data_pip",
+    "//tensorflow/contrib/slim/python/slim/nets:nets_pip",
+    "//tensorflow/contrib/specs:specs",
+    "//tensorflow/contrib/summary:summary_test_util",
+    "//tensorflow/contrib/tensor_forest:init_py",
+    "//tensorflow/contrib/tensor_forest/hybrid:hybrid_pip",
+    "//tensorflow/contrib/timeseries:timeseries_pip",
+    "//tensorflow/contrib/tpu",
+    "//tensorflow/examples/tutorials/mnist:package",
+    "//tensorflow/python:distributed_framework_test_lib",
+    "//tensorflow/python:meta_graph_testdata",
+    "//tensorflow/python:spectral_ops_test_util",
+    "//tensorflow/python:util_example_parser_configuration",
+    "//tensorflow/python/debug:debug_pip",
+    "//tensorflow/python/eager:eager_pip",
+    "//tensorflow/python/kernel_tests/testdata:self_adjoint_eig_op_test_files",
+    "//tensorflow/python/saved_model:saved_model",
+    "//tensorflow/python/tools:tools_pip",
+    "//tensorflow/python:test_ops",
+    "//tensorflow/tools/dist_test/server:grpc_tensorflow_server",
+]
+
 # On Windows, python binary is a zip file of runfiles tree.
 # Add everything to its data dependency for generating a runfiles tree
 # for building the pip package on Windows.
 py_binary(
     name = "simple_console_for_windows",
     srcs = ["simple_console_for_windows.py"],
-    data = [
-        "MANIFEST.in",
-        "README",
-        "setup.py",
-        ":included_headers",
-        "//tensorflow/contrib/nn:nn_py",
-        "//tensorflow/contrib/session_bundle:session_bundle_pip",
-        "//tensorflow/contrib/signal:signal_py",
-        "//tensorflow/contrib/slim/python/slim/data:data_pip",
-        "//tensorflow/python:util_example_parser_configuration",
-        "//tensorflow/python/debug:debug_pip",
-        "//tensorflow/python/saved_model",
-        "//tensorflow/python:spectral_ops_test_util",
-        "//tensorflow/python/tools:tools_pip",
-        "//tensorflow/python/eager:eager_pip",
-        "//tensorflow/contrib/summary:summary_test_util",
-        # These targets don't build on Windows yet. Exclude them for now.
-        # "//tensorflow/contrib/slim",
-        # "//tensorflow/contrib/slim/python/slim/nets:nets_pip",
-        # "//tensorflow/contrib/specs",
-        # "//tensorflow/contrib/tensor_forest:init_py",
-        # "//tensorflow/contrib/tensor_forest/hybrid:hybrid_pip",
-        # "//tensorflow/examples/tutorials/mnist:package",
-    ],
+    data = COMMON_PIP_DEPS,
     srcs_version = "PY2AND3",
     deps = ["//tensorflow:tensorflow_py"],
 )
@@ -111,8 +141,10 @@ filegroup(
         "@kafka//:LICENSE",
         "@libxsmm_archive//:LICENSE",
         "@lmdb//:LICENSE",
+        "@local_config_nccl//:LICENSE",
         "@local_config_sycl//sycl:LICENSE.text",
         "@grpc//third_party/nanopb:LICENSE.txt",
+        "@grpc//third_party/address_sorting:LICENSE",
         "@nasm//:LICENSE",
         "@nsync//:LICENSE",
         "@pcre//:LICENCE",
@@ -126,8 +158,6 @@ filegroup(
         "@org_python_pypi_backports_weakref//:LICENSE",
     ] + if_mkl([
         "//third_party/mkl:LICENSE",
-    ]) + if_not_windows([
-        "@nccl_archive//:LICENSE.txt",
     ]) + tf_additional_license_deps(),
 )
 
@@ -137,63 +167,13 @@ sh_binary(
     data = select({
         "//tensorflow:windows": [":simple_console_for_windows"],
         "//tensorflow:windows_msvc": [":simple_console_for_windows"],
-        "//conditions:default": [
-            ":licenses",
-            "MANIFEST.in",
-            "README",
-            "setup.py",
-            ":included_headers",
+        "//conditions:default": COMMON_PIP_DEPS + [
             ":simple_console",
-            "//tensorflow:tensorflow_py",
-            "//tensorflow/contrib/boosted_trees:boosted_trees_pip",
-            "//tensorflow/contrib/cluster_resolver:cluster_resolver_pip",
-            "//tensorflow/contrib/data/python/kernel_tests:dataset_serialization_test",
-            "//tensorflow/contrib/data/python/ops:contrib_op_loader",
-            "//tensorflow/contrib/eager/python/examples:examples_pip",
-            "//tensorflow/contrib/eager/python:checkpointable_utils",
-            "//tensorflow/contrib/eager/python:evaluator",
-            "//tensorflow/contrib/gan:gan",
-            "//tensorflow/contrib/graph_editor:graph_editor_pip",
-            "//tensorflow/contrib/keras:keras",
-            "//tensorflow/contrib/labeled_tensor:labeled_tensor_pip",
             "//tensorflow/contrib/lite/python:interpreter_test_data",
             "//tensorflow/contrib/lite/python:tf_lite_py_pip",
             "//tensorflow/contrib/lite/toco:toco",
             "//tensorflow/contrib/lite/toco/python:toco_wrapper",
             "//tensorflow/contrib/lite/toco/python:toco_from_protos",
-            "//tensorflow/contrib/nn:nn_py",
-            "//tensorflow/contrib/predictor:predictor_pip",
-            "//tensorflow/contrib/autograph:autograph",
-            "//tensorflow/contrib/autograph/converters:converters",
-            "//tensorflow/contrib/autograph/converters:test_lib",
-            "//tensorflow/contrib/autograph/impl:impl",
-            "//tensorflow/contrib/autograph/pyct:pyct",
-            "//tensorflow/contrib/autograph/pyct/static_analysis:static_analysis",
-            "//tensorflow/contrib/receptive_field:receptive_field_pip",
-            "//tensorflow/contrib/session_bundle:session_bundle_pip",
-            "//tensorflow/contrib/signal:signal_py",
-            "//tensorflow/contrib/signal:test_util",
-            "//tensorflow/contrib/slim:slim",
-            "//tensorflow/contrib/slim/python/slim/data:data_pip",
-            "//tensorflow/contrib/slim/python/slim/nets:nets_pip",
-            "//tensorflow/contrib/specs:specs",
-            "//tensorflow/contrib/summary:summary_test_util",
-            "//tensorflow/contrib/tensor_forest:init_py",
-            "//tensorflow/contrib/tensor_forest/hybrid:hybrid_pip",
-            "//tensorflow/contrib/timeseries:timeseries_pip",
-            "//tensorflow/contrib/tpu",
-            "//tensorflow/examples/tutorials/mnist:package",
-            "//tensorflow/python:distributed_framework_test_lib",
-            "//tensorflow/python:meta_graph_testdata",
-            "//tensorflow/python:spectral_ops_test_util",
-            "//tensorflow/python:util_example_parser_configuration",
-            "//tensorflow/python/debug:debug_pip",
-            "//tensorflow/python/eager:eager_pip",
-            "//tensorflow/python/kernel_tests/testdata:self_adjoint_eig_op_test_files",
-            "//tensorflow/python/saved_model:saved_model",
-            "//tensorflow/python/tools:tools_pip",
-            "//tensorflow/python:test_ops",
-            "//tensorflow/tools/dist_test/server:grpc_tensorflow_server",
         ],
     }) + if_mkl(["//third_party/mkl:intel_binary_blob"]) + if_tensorrt([
         "//tensorflow/contrib/tensorrt:init_py",
diff --git a/tensorflow/tools/pip_package/build_pip_package.sh b/tensorflow/tools/pip_package/build_pip_package.sh
index dc31e4c5f703b29f464519d5f1fd54f9b5e11690..8f0cf8c3d194807b6c82f50b5ac8c7fe7527fea5 100755
--- a/tensorflow/tools/pip_package/build_pip_package.sh
+++ b/tensorflow/tools/pip_package/build_pip_package.sh
@@ -139,7 +139,9 @@ function main() {
     fi
     mkdir "${TMPDIR}/tensorflow/aux-bin"
     # Install toco as a binary in aux-bin.
-    cp bazel-bin/tensorflow/contrib/lite/toco/toco ${TMPDIR}/tensorflow/aux-bin/
+    # TODO(aselle): Re-enable this when we find a way to do it without doubling
+    # the whl size (over the limit).
+    # cp bazel-bin/tensorflow/contrib/lite/toco/toco ${TMPDIR}/tensorflow/aux-bin/
   fi
 
   # protobuf pip package doesn't ship with header files. Copy the headers
@@ -160,7 +162,9 @@ function main() {
 
   # Before we leave the top-level directory, make sure we know how to
   # call python.
-  source tools/python_bin_path.sh
+  if [[ -e tools/python_bin_path.sh ]]; then
+    source tools/python_bin_path.sh
+  fi
 
   pushd ${TMPDIR}
   rm -f MANIFEST
diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py
index 365e8d6b08d654138debd7acad5cf4aac5d07d55..6511a50b3bbacf8f6c78eacb5c5279202960d198 100644
--- a/tensorflow/tools/pip_package/setup.py
+++ b/tensorflow/tools/pip_package/setup.py
@@ -29,13 +29,12 @@ from setuptools.dist import Distribution
 # This version string is semver compatible, but incompatible with pip.
 # For pip, we will remove all '-' characters from this string, and use the
 # result for pip.
-_VERSION = '1.7.0-rc1'
+_VERSION = '1.7.0'
 
 REQUIRED_PACKAGES = [
     'absl-py >= 0.1.6',
     'astor >= 0.6.0',
     'gast >= 0.2.0',
-    'grpcio >= 1.8.6',
     'numpy >= 1.13.3',
     'six >= 1.10.0',
     'protobuf >= 3.4.0',
@@ -43,6 +42,12 @@ REQUIRED_PACKAGES = [
     'termcolor >= 1.1.0',
 ]
 
+if sys.byteorder == 'little':
+  # grpcio does not build correctly on big-endian machines due to lack of
+  # BoringSSL support.
+  # See https://github.com/tensorflow/tensorflow/issues/17882.
+  REQUIRED_PACKAGES.append('grpcio >= 1.8.6')
+
 project_name = 'tensorflow'
 if '--project_name' in sys.argv:
   project_name_idx = sys.argv.index('--project_name')
diff --git a/tensorflow/tools/proto_text/BUILD b/tensorflow/tools/proto_text/BUILD
index 39c4aac1e8b5dfb2582115881c7d10ca3cd04f68..ef7bfdd3c9e46a21e25a7cdfe812b9e37029188b 100644
--- a/tensorflow/tools/proto_text/BUILD
+++ b/tensorflow/tools/proto_text/BUILD
@@ -96,18 +96,3 @@ tf_cc_test(
         "//tensorflow/core:test_main",
     ],
 )
-
-# -----------------------------------------------------------------------------
-# Google-internal targets.  These must be at the end for syncrepo.
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/tools/quantization/BUILD b/tensorflow/tools/quantization/BUILD
index e99ad06a06294c4d037b76ea9450e51bd795e79d..17443a8617451cb9b09867e132855d6316d8e423 100644
--- a/tensorflow/tools/quantization/BUILD
+++ b/tensorflow/tools/quantization/BUILD
@@ -76,15 +76,3 @@ py_binary(
         "//tensorflow/python:platform",
     ],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/tools/test/BUILD b/tensorflow/tools/test/BUILD
index 159a8c1cfbdb793d05eda850afb54e860bf2614e..4b2026b9472b651f8e0571155dab8952d20aa8b2 100644
--- a/tensorflow/tools/test/BUILD
+++ b/tensorflow/tools/test/BUILD
@@ -92,15 +92,3 @@ tf_py_logged_benchmark(
     name = "rnn_op_benchmark",
     target = "//tensorflow/python/kernel_tests:rnn_test",
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/user_ops/BUILD b/tensorflow/user_ops/BUILD
index e8198efe2e534d261af21c83682a848589cf7916..71443cc41eb5ecdd23e1a47712633c77fcd7d395 100644
--- a/tensorflow/user_ops/BUILD
+++ b/tensorflow/user_ops/BUILD
@@ -50,15 +50,3 @@ tf_py_test(
     additional_deps = ["//tensorflow:tensorflow_py"],
     data = [":invalid_op.so"],
 )
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index 6ac98de43a1c648515277c0ff41ace5fdba5647b..5d1e9c93491a95f8c71206f3f7b89c0d487d0734 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -2,6 +2,7 @@
 
 load("//third_party/gpus:cuda_configure.bzl", "cuda_configure")
 load("//third_party/tensorrt:tensorrt_configure.bzl", "tensorrt_configure")
+load("//third_party:nccl/nccl_configure.bzl", "nccl_configure")
 load("//third_party/mkl:build_defs.bzl", "mkl_repository")
 load("//third_party/git:git_configure.bzl", "git_configure")
 load("//third_party/py:python_configure.bzl", "python_configure")
@@ -13,6 +14,8 @@ load("//third_party:repo.bzl", "tf_http_archive")
 load("//third_party/clang_toolchain:cc_configure_clang.bzl", "cc_download_clang_toolchain")
 load("@io_bazel_rules_closure//closure/private:java_import_external.bzl", "java_import_external")
 load("@io_bazel_rules_closure//closure:defs.bzl", "filegroup_external")
+load("//tensorflow/tools/def_file_filter:def_file_filter_configure.bzl",
+     "def_file_filter_configure")
 
 
 # Sanitize a dependency so that it works correctly from code that includes
@@ -29,10 +32,15 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   cc_download_clang_toolchain(name="local_config_download_clang")
   cuda_configure(name="local_config_cuda")
   tensorrt_configure(name="local_config_tensorrt")
+  nccl_configure(name="local_config_nccl")
   git_configure(name="local_config_git")
   sycl_configure(name="local_config_sycl")
   python_configure(name="local_config_python")
 
+  # For windows bazel build
+  # TODO: Remove def file filter when TensorFlow can export symbols properly on Windows.
+  def_file_filter_configure(name = "local_config_def_file_filter")
+
   # Point //external/local_config_arm_compiler to //external/arm_compiler
   arm_compiler_configure(
       name="local_config_arm_compiler",
@@ -42,7 +50,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   mkl_repository(
       name = "mkl_linux",
       urls = [
-          "https://mirror.bazel.build/intel/mkl-dnn/releases/download/v0.12/mklml_lnx_2018.0.1.20171227.tgz",
+          "https://mirror.bazel.build/github.com/intel/mkl-dnn/releases/download/v0.12/mklml_lnx_2018.0.1.20171227.tgz",
           "https://github.com/intel/mkl-dnn/releases/download/v0.12/mklml_lnx_2018.0.1.20171227.tgz",
       ],
       sha256 = "feacc3d82565c1231470359b42c696236fae873704e0b013436afba5fd4fd30f",
@@ -52,7 +60,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   mkl_repository(
       name = "mkl_windows",
       urls = [
-          "https://mirror.bazel.build/intel/mkl-dnn/releases/download/v0.12/mklml_win_2018.0.1.20171227.zip",
+          "https://mirror.bazel.build/github.com/intel/mkl-dnn/releases/download/v0.12/mklml_win_2018.0.1.20171227.zip",
           "https://github.com/intel/mkl-dnn/releases/download/v0.12/mklml_win_2018.0.1.20171227.zip"
       ],
       sha256 = "24bae8d7b22b431a654acadea43f2243c46ae6b1e5a73a4a936825f31d284ee4",
@@ -62,7 +70,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   mkl_repository(
       name = "mkl_darwin",
       urls = [
-          "https://mirror.bazel.build/intel/mkl-dnn/releases/download/v0.12/mklml_mac_2018.0.1.20171227.tgz",
+          "https://mirror.bazel.build/github.com/intel/mkl-dnn/releases/download/v0.12/mklml_mac_2018.0.1.20171227.tgz",
           "https://github.com/intel/mkl-dnn/releases/download/v0.12/mklml_mac_2018.0.1.20171227.tgz"
       ],
       sha256 = "0e954ec6fd3dc5e37f64c4043f6b5613dd687558da3df1028b3b7c29ff5cf77f",
@@ -99,11 +107,11 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   tf_http_archive(
       name = "eigen_archive",
       urls = [
-          "https://mirror.bazel.build/bitbucket.org/eigen/eigen/get/2355b229ea4c.tar.gz",
-          "https://bitbucket.org/eigen/eigen/get/2355b229ea4c.tar.gz",
+          "https://mirror.bazel.build/bitbucket.org/eigen/eigen/get/6913f0cf7d06.tar.gz",
+          "https://bitbucket.org/eigen/eigen/get/6913f0cf7d06.tar.gz",
       ],
-      sha256 = "0cadb31a35b514bf2dfd6b5d38205da94ef326ec6908fc3fd7c269948467214f",
-      strip_prefix = "eigen-eigen-2355b229ea4c",
+      sha256 = "791b836cacd03e20bae5bdd25f1c4a5505a0a9975ba94a61eb4e2631fbd1d53a",
+      strip_prefix = "eigen-eigen-6913f0cf7d06",
       build_file = clean_dep("//third_party:eigen.BUILD"),
       patch_file = clean_dep("//third_party:eigen_fix_cuda_compilation.patch")
   )
@@ -213,11 +221,11 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   tf_http_archive(
       name = "png_archive",
       urls = [
-          "https://mirror.bazel.build/github.com/glennrp/libpng/archive/v1.2.53.tar.gz",
-          "https://github.com/glennrp/libpng/archive/v1.2.53.tar.gz",
+          "https://mirror.bazel.build/github.com/glennrp/libpng/archive/v1.6.34.tar.gz",
+          "https://github.com/glennrp/libpng/archive/v1.6.34.tar.gz",
       ],
-      sha256 = "716c59c7dfc808a4c368f8ada526932be72b2fcea11dd85dc9d88b1df1dfe9c2",
-      strip_prefix = "libpng-1.2.53",
+      sha256 = "e45ce5f68b1d80e2cb9a2b601605b374bdf51e1798ef1c2c2bd62131dfcf9eef",
+      strip_prefix = "libpng-1.6.34",
       build_file = clean_dep("//third_party:png.BUILD"),
   )
 
@@ -307,18 +315,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       strip_prefix = "backports.weakref-1.0rc1/src",
       build_file = clean_dep("//third_party:backports_weakref.BUILD"),
   )
-
-  tf_http_archive(
-      name = "com_github_andreif_codegen",
-      urls = [
-          "https://mirror.bazel.build/github.com/andreif/codegen/archive/1.0.tar.gz",
-          "https://github.com/andreif/codegen/archive/1.0.tar.gz",
-      ],
-      sha256 = "2dadd04a2802de27e0fe5a19b76538f6da9d39ff244036afa00c1bba754de5ee",
-      strip_prefix = "codegen-1.0",
-      build_file = clean_dep("//third_party:codegen.BUILD"),
-  )
-
+  
   filegroup_external(
       name = "org_python_license",
       licenses = ["notice"],  # Python 2.0
@@ -430,13 +427,14 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   tf_http_archive(
       name = "grpc",
       urls = [
-          "https://mirror.bazel.build/github.com/grpc/grpc/archive/575bda39755b98d1f7099406bb57a6e3b2074874.tar.gz",
-          "https://github.com/grpc/grpc/archive/575bda39755b98d1f7099406bb57a6e3b2074874.tar.gz",
+          "https://mirror.bazel.build/github.com/grpc/grpc/archive/09386db3939cae1ac12e5f09b735adfa8958c68e.tar.gz",
+          "https://github.com/grpc/grpc/archive/09386db3939cae1ac12e5f09b735adfa8958c68e.tar.gz",
       ],
-      sha256 = "f08a5c8e265191b39cc74915b1bc1fd380d86cd0176c92b7cce30b6ac50514ad",
-      strip_prefix = "grpc-575bda39755b98d1f7099406bb57a6e3b2074874",
+      sha256 = "b857969c667c14f37faa507afc07a3f39a47fbf73203be889d55925622e7b317",
+      strip_prefix = "grpc-09386db3939cae1ac12e5f09b735adfa8958c68e",
   )
 
+
   tf_http_archive(
       name = "linenoise",
       sha256 = "7f51f45887a3d31b4ce4fa5965210a5e64637ceac12720cfce7954d6a2e812f7",
@@ -453,11 +451,11 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   tf_http_archive(
       name = "llvm",
       urls = [
-          "https://mirror.bazel.build/github.com/llvm-mirror/llvm/archive/1c3cdea2f181d8e14ee184466c5fb237f1b4cda8.tar.gz",
-          "https://github.com/llvm-mirror/llvm/archive/1c3cdea2f181d8e14ee184466c5fb237f1b4cda8.tar.gz",
+          "https://mirror.bazel.build/github.com/llvm-mirror/llvm/archive/7e78daafdd22f3f17720a103d29d89590534004e.tar.gz",
+          "https://github.com/llvm-mirror/llvm/archive/7e78daafdd22f3f17720a103d29d89590534004e.tar.gz",
       ],
-      sha256 = "1efbb9b05af88368be984d2f6526061d4a857181ef10f8841889a3a46869bb01",
-      strip_prefix = "llvm-1c3cdea2f181d8e14ee184466c5fb237f1b4cda8",
+      sha256 = "a6d94bd9de23515a1e3792a830421e3885977ea43d03427cdbe68f98cb7e0045",
+      strip_prefix = "llvm-7e78daafdd22f3f17720a103d29d89590534004e",
       build_file = clean_dep("//third_party/llvm:llvm.BUILD"),
   )
 
@@ -496,11 +494,11 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   tf_http_archive(
       name = "zlib_archive",
       urls = [
-          "https://mirror.bazel.build/zlib.net/zlib-1.2.8.tar.gz",
-          "http://zlib.net/fossils/zlib-1.2.8.tar.gz",
+          "https://mirror.bazel.build/zlib.net/zlib-1.2.11.tar.gz",
+          "https://zlib.net/zlib-1.2.11.tar.gz",
       ],
-      sha256 = "36658cb768a54c1d4dec43c3116c27ed893e88b02ecfcb44f2166f9c0b7f2a0d",
-      strip_prefix = "zlib-1.2.8",
+      sha256 = "c3e5e9fdd5004dcb542feda5ee4f0ff0744628baf8ed2dd5d66f8ca1197cb1a1",
+      strip_prefix = "zlib-1.2.11",
       build_file = clean_dep("//third_party:zlib.BUILD"),
   )
 
@@ -517,11 +515,11 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   tf_http_archive(
       name = "snappy",
       urls = [
-          "https://mirror.bazel.build/github.com/google/snappy/archive/1.1.4.tar.gz",
-          "https://github.com/google/snappy/archive/1.1.4.tar.gz",
+          "https://mirror.bazel.build/github.com/google/snappy/archive/1.1.7.tar.gz",
+          "https://github.com/google/snappy/archive/1.1.7.tar.gz",
       ],
-      sha256 = "2f7504c73d85bac842e893340333be8cb8561710642fc9562fccdd9d2c3fcc94",
-      strip_prefix = "snappy-1.1.4",
+      sha256 = "3dfa02e873ff51a11ee02b9ca391807f0c8ea0529a4924afa645fbf97163f9d4",
+      strip_prefix = "snappy-1.1.7",
       build_file = clean_dep("//third_party:snappy.BUILD"),
   )
 
@@ -533,7 +531,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       ],
       sha256 = "2ca86fb6179ecbff789cc67c836139c1bbc0324ed8c04643405a30bf26325176",
       strip_prefix = "nccl-03d856977ecbaac87e598c0c4bafca96761b9ac7",
-      build_file = clean_dep("//third_party:nccl.BUILD"),
+      build_file = clean_dep("//third_party:nccl/nccl_archive.BUILD"),
   )
 
   tf_http_archive(
@@ -705,6 +703,26 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       build_file = clean_dep("//third_party:tflite_mobilenet.BUILD"),
   )
 
+  tf_http_archive(
+      name = "tflite_mobilenet_ssd",
+      sha256 = "767057f2837a46d97882734b03428e8dd640b93236052b312b2f0e45613c1cf0",
+      urls = [
+          "https://mirror.bazel.build/storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_ssd_tflite_v1.zip",
+          "https://storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_ssd_tflite_v1.zip",
+      ],
+      build_file = str(Label("//third_party:tflite_mobilenet.BUILD")),
+  )
+
+  tf_http_archive(
+      name = "tflite_conv_actions_frozen",
+      sha256 = "d947b38cba389b5e2d0bfc3ea6cc49c784e187b41a071387b3742d1acac7691e",
+      urls = [
+          "https://mirror.bazel.build/storage.googleapis.com/download.tensorflow.org/models/tflite/conv_actions_tflite.zip",
+          "https://storage.googleapis.com/download.tensorflow.org/models/tflite/conv_actions_tflite.zip",
+      ],
+      build_file = str(Label("//third_party:tflite_mobilenet.BUILD")),
+  )
+
   tf_http_archive(
       name = "tflite_smartreply",
       sha256 = "8980151b85a87a9c1a3bb1ed4748119e4a85abd3cb5744d83da4d4bd0fbeef7c",
diff --git a/third_party/codegen.BUILD b/third_party/codegen.BUILD
deleted file mode 100644
index df436c81635a71421a67fa8d8c84eb8dfcc97d7b..0000000000000000000000000000000000000000
--- a/third_party/codegen.BUILD
+++ /dev/null
@@ -1,16 +0,0 @@
-# -*- mode: python; -*-
-#
-# Description:
-#   Extension to ast that allow ast -> python code generation.
-
-package(default_visibility = ["//visibility:public"])
-
-licenses(["notice"])  # New BSD
-
-exports_files(["LICENSE"])
-
-py_library(
-    name = "com_github_andreif_codegen",
-    srcs = glob(["codegen.py"]),
-    srcs_version = "PY2AND3",
-)
diff --git a/third_party/examples/eager/spinn/spinn.py b/third_party/examples/eager/spinn/spinn.py
index f8fb6ecb0ccc7f81040370a80c31d03daa659051..8a2b24aa4e284fd70c7148d26c3c4d6ccd04f98c 100644
--- a/third_party/examples/eager/spinn/spinn.py
+++ b/third_party/examples/eager/spinn/spinn.py
@@ -266,8 +266,7 @@ class SPINN(tf.keras.Model):
           trackings.append(tracking)
 
       if rights:
-        reducer_output = self.reducer(
-            lefts, right_in=rights, tracking=trackings)
+        reducer_output = self.reducer(lefts, rights, trackings)
         reduced = iter(reducer_output)
 
         for transition, stack in zip(trans, stacks):
@@ -388,10 +387,10 @@ class SNLIClassifier(tf.keras.Model):
 
     # Run the batch-normalized and dropout-processed word vectors through the
     # SPINN encoder.
-    premise = self.encoder(
-        premise_embed, transitions=premise_transition, training=training)
-    hypothesis = self.encoder(
-        hypothesis_embed, transitions=hypothesis_transition, training=training)
+    premise = self.encoder(premise_embed, premise_transition,
+                           training=training)
+    hypothesis = self.encoder(hypothesis_embed, hypothesis_transition,
+                              training=training)
 
     # Combine encoder outputs for premises and hypotheses into logits.
     # Then apply batch normalization and dropuout on the logits.
@@ -465,11 +464,10 @@ class SNLIClassifierTrainer(tfe.Checkpointable):
     """
     with tfe.GradientTape() as tape:
       tape.watch(self._model.variables)
-      # TODO(allenl): Allow passing Layer inputs as position arguments.
       logits = self._model(premise,
-                           premise_transition=premise_transition,
-                           hypothesis=hypothesis,
-                           hypothesis_transition=hypothesis_transition,
+                           premise_transition,
+                           hypothesis,
+                           hypothesis_transition,
                            training=True)
       loss = self.loss(labels, logits)
     gradients = tape.gradient(loss, self._model.variables)
@@ -533,9 +531,7 @@ def _evaluate_on_dataset(snli_data, batch_size, trainer, use_gpu):
       snli_data, batch_size):
     if use_gpu:
       label, prem, hypo = label.gpu(), prem.gpu(), hypo.gpu()
-    logits = trainer.model(
-        prem, premise_transition=prem_trans, hypothesis=hypo,
-        hypothesis_transition=hypo_trans, training=False)
+    logits = trainer.model(prem, prem_trans, hypo, hypo_trans, training=False)
     loss_val = trainer.loss(label, logits)
     batch_size = tf.shape(label)[0]
     mean_loss(loss_val, weights=batch_size.gpu() if use_gpu else batch_size)
@@ -639,11 +635,8 @@ def train_or_infer_spinn(embed,
       hypo, hypo_trans = inference_sentence_pair[1]
       hypo_trans = inference_sentence_pair[1][1]
       inference_logits = model(
-          tf.constant(prem),
-          premise_transition=tf.constant(prem_trans),
-          hypothesis=tf.constant(hypo),
-          hypothesis_transition=tf.constant(hypo_trans),
-          training=False)
+          tf.constant(prem), tf.constant(prem_trans),
+          tf.constant(hypo), tf.constant(hypo_trans), training=False)
       inference_logits = inference_logits[0][1:]
       max_index = tf.argmax(inference_logits)
       print("\nInference logits:")
diff --git a/third_party/hadoop/BUILD b/third_party/hadoop/BUILD
index 9e981544008e889e08195191308add04bf834df4..c3c5e428be083d01e56a6fdd30a71c32c74f695b 100644
--- a/third_party/hadoop/BUILD
+++ b/third_party/hadoop/BUILD
@@ -4,18 +4,6 @@ licenses(["notice"])  # Apache 2.0
 
 exports_files(["LICENSE.txt"])
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 cc_library(
     name = "hdfs",
     hdrs = ["hdfs.h"],
diff --git a/third_party/llvm/llvm.BUILD b/third_party/llvm/llvm.BUILD
index 28293a36593d8fa67a2d85631a0769e03d508354..075b46896ed868d8e2e1bcddf6d867974a248313 100644
--- a/third_party/llvm/llvm.BUILD
+++ b/third_party/llvm/llvm.BUILD
@@ -162,13 +162,6 @@ all_cmake_vars = select({
 })
 
 # Performs CMake variable substitutions on configuration header files.
-expand_cmake_vars(
-    name = "datatypes_gen",
-    src = "include/llvm/Support/DataTypes.h.cmake",
-    cmake_vars = all_cmake_vars,
-    dst = "include/llvm/Support/DataTypes.h",
-)
-
 expand_cmake_vars(
     name = "config_gen",
     src = "include/llvm/Config/config.h.cmake",
@@ -305,9 +298,7 @@ cc_binary(
     srcs = glob([
         "utils/TableGen/*.cpp",
         "utils/TableGen/*.h",
-    ]) + [
-        "lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h",
-    ],
+    ]),
     linkopts = [
         "-lm",
         "-ldl",
@@ -2014,7 +2005,6 @@ cc_library(
         "include/llvm/Support/WasmRelocs/*.def",
     ]) + [
         "include/llvm/BinaryFormat/MachO.def",
-        "include/llvm/Support/DataTypes.h",
         "include/llvm/Support/VCSRevision.h",
         "include/llvm/ExecutionEngine/ObjectMemoryBuffer.h",
     ],
diff --git a/third_party/mkl/BUILD b/third_party/mkl/BUILD
index 3262562bccca4f2a8b3da860cb38928f144994a9..c2adf578c703f5657e65a07f9ec9f3b43cd5ae87 100644
--- a/third_party/mkl/BUILD
+++ b/third_party/mkl/BUILD
@@ -24,7 +24,6 @@ load(
 
 filegroup(
     name = "LICENSE",
-    visibility = ["//visibility:public"],
     srcs = ["MKL_LICENSE"] + select({
         "@org_tensorflow//tensorflow:linux_x86_64": [
             "@mkl_linux//:LICENSE",
@@ -34,13 +33,13 @@ filegroup(
         ],
         "@org_tensorflow//tensorflow:windows": [
             "@mkl_windows//:LICENSE",
-        ]
-    })
+        ],
+    }),
+    visibility = ["//visibility:public"],
 )
 
 cc_library(
     name = "intel_binary_blob",
-
     visibility = ["//visibility:public"],
     deps = select({
         "@org_tensorflow//tensorflow:linux_x86_64": [
@@ -54,6 +53,6 @@ cc_library(
         "@org_tensorflow//tensorflow:windows": [
             "@mkl_windows//:mkl_headers",
             "@mkl_windows//:mkl_libs_windows",
-        ]
-    })
+        ],
+    }),
 )
diff --git a/third_party/mkl/mkl.BUILD b/third_party/mkl/mkl.BUILD
index 892221ec00295a694ab40868cd886e820768f78f..c3a71e4ff9b33a06a87f0f90978eaf3a718c7de6 100644
--- a/third_party/mkl/mkl.BUILD
+++ b/third_party/mkl/mkl.BUILD
@@ -21,7 +21,7 @@ cc_library(
     name = "mkl_libs_linux",
     srcs = [
         "lib/libiomp5.so",
-        "lib/libmklml_intel.so"
+        "lib/libmklml_intel.so",
     ],
     visibility = ["//visibility:public"],
 )
@@ -30,7 +30,7 @@ cc_library(
     name = "mkl_libs_darwin",
     srcs = [
         "lib/libiomp5.dylib",
-        "lib/libmklml.dylib"
+        "lib/libmklml.dylib",
     ],
     visibility = ["//visibility:public"],
 )
@@ -39,7 +39,7 @@ cc_library(
     name = "mkl_libs_windows",
     srcs = [
         "lib/libiomp5md.lib",
-        "lib/mklml.lib"
+        "lib/mklml.lib",
     ],
     visibility = ["//visibility:public"],
 )
diff --git a/third_party/mpi/BUILD b/third_party/mpi/BUILD
index ff3f437e92465c774c9906d6987f21c07e251b93..1d6ac2fceb2e7c55a13e80d5a64b61974e9eb15b 100644
--- a/third_party/mpi/BUILD
+++ b/third_party/mpi/BUILD
@@ -1,17 +1,5 @@
 licenses(["restricted"])
 
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
-
 load("//third_party/mpi:mpi.bzl", "mpi_hdr")
 load("//third_party/mpi:mpi.bzl", "if_mpi")
 
diff --git a/third_party/nccl/LICENSE b/third_party/nccl/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..146d9b765c5db44c2f5bea8fa5010eef5ec0c68f
--- /dev/null
+++ b/third_party/nccl/LICENSE
@@ -0,0 +1,203 @@
+Copyright 2018 The TensorFlow Authors.  All rights reserved.
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright 2018, The TensorFlow Authors.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
diff --git a/third_party/nccl.BUILD b/third_party/nccl/nccl_archive.BUILD
similarity index 95%
rename from third_party/nccl.BUILD
rename to third_party/nccl/nccl_archive.BUILD
index b2b8e188248f90805bc2904dca9111550a7dfed8..a05899e38d531c066c774302e4ffd75ce7e482e7 100644
--- a/third_party/nccl.BUILD
+++ b/third_party/nccl/nccl_archive.BUILD
@@ -43,6 +43,7 @@ cc_library(
         "-Iexternal/nccl_archive/src",
         "-O3",
     ] + cuda_default_copts(),
+    include_prefix = "third_party/nccl",
     linkopts = select({
         "@org_tensorflow//tensorflow:android": [
             "-pie",
@@ -61,6 +62,7 @@ cc_library(
             "-lrt",
         ],
     }),
+    strip_include_prefix = "src",
     visibility = ["//visibility:public"],
     deps = ["@local_config_cuda//cuda:cuda_headers"],
 )
diff --git a/third_party/nccl/nccl_configure.bzl b/third_party/nccl/nccl_configure.bzl
new file mode 100644
index 0000000000000000000000000000000000000000..9dfcb1836989d6c092739100e00e7000e6556c10
--- /dev/null
+++ b/third_party/nccl/nccl_configure.bzl
@@ -0,0 +1,172 @@
+# -*- Python -*-
+"""Repository rule for NCCL configuration.
+
+`nccl_configure` depends on the following environment variables:
+
+  * `TF_NCCL_VERSION`: The NCCL version.
+  * `NCCL_INSTALL_PATH`: The installation path of the NCCL library.
+"""
+
+load(
+    "//third_party/gpus:cuda_configure.bzl",
+    "auto_configure_fail",
+    "find_cuda_define",
+    "matches_version",
+)
+
+_NCCL_INSTALL_PATH = "NCCL_INSTALL_PATH"
+_TF_NCCL_VERSION = "TF_NCCL_VERSION"
+
+_DEFINE_NCCL_MAJOR = "#define NCCL_MAJOR"
+_DEFINE_NCCL_MINOR = "#define NCCL_MINOR"
+_DEFINE_NCCL_PATCH = "#define NCCL_PATCH"
+
+_NCCL_DUMMY_BUILD_CONTENT = """
+filegroup(
+  name = "LICENSE",
+  visibility = ["//visibility:public"],
+)
+
+cc_library(
+  name = "nccl",
+  visibility = ["//visibility:public"],
+)
+"""
+
+_NCCL_ARCHIVE_BUILD_CONTENT = """
+filegroup(
+  name = "LICENSE",
+  data = ["@nccl_archive//:LICENSE.txt"],
+  visibility = ["//visibility:public"],
+)
+
+alias(
+  name = "nccl",
+  actual = "@nccl_archive//:nccl",
+  visibility = ["//visibility:public"],
+)
+"""
+
+_NCCL_LOCAL_BUILD_TEMPLATE = """
+filegroup(
+  name = "LICENSE",
+  data = ["nccl/NCCL-SLA.txt"],
+  visibility = ["//visibility:public"],
+)
+
+cc_library(
+  name = "nccl",
+  srcs = ["nccl/lib/libnccl.so.%s"],
+  hdrs = ["nccl/include/nccl.h"],
+  include_prefix = "third_party/nccl",
+  strip_include_prefix = "nccl/include",
+  deps = [
+      "@local_config_cuda//cuda:cuda_headers",
+  ],
+  visibility = ["//visibility:public"],
+)
+"""
+
+
+def _find_nccl_header(repository_ctx, nccl_install_path):
+  """Finds the NCCL header on the system.
+
+  Args:
+    repository_ctx: The repository context.
+    nccl_install_path: The NCCL library install directory.
+
+  Returns:
+    The path to the NCCL header.
+  """
+  header_path = repository_ctx.path("%s/include/nccl.h" % nccl_install_path)
+  if not header_path.exists:
+    auto_configure_fail("Cannot find %s" % str(header_path))
+  return header_path
+
+
+def _check_nccl_version(repository_ctx, nccl_install_path, nccl_version):
+  """Checks whether the header file matches the specified version of NCCL.
+
+  Args:
+    repository_ctx: The repository context.
+    nccl_install_path: The NCCL library install directory.
+    nccl_version: The expected NCCL version.
+
+  Returns:
+    A string containing the library version of NCCL.
+  """
+  header_path = _find_nccl_header(repository_ctx, nccl_install_path)
+  header_dir = str(header_path.realpath.dirname)
+  major_version = find_cuda_define(repository_ctx, header_dir, "nccl.h",
+                                   _DEFINE_NCCL_MAJOR)
+  minor_version = find_cuda_define(repository_ctx, header_dir, "nccl.h",
+                                   _DEFINE_NCCL_MINOR)
+  patch_version = find_cuda_define(repository_ctx, header_dir, "nccl.h",
+                                   _DEFINE_NCCL_PATCH)
+  header_version = "%s.%s.%s" % (major_version, minor_version, patch_version)
+  if not matches_version(nccl_version, header_version):
+    auto_configure_fail(
+        ("NCCL library version detected from %s/nccl.h (%s) does not match " +
+         "TF_NCCL_VERSION (%s). To fix this rerun configure again.") %
+        (header_dir, header_version, nccl_version))
+
+
+def _find_nccl_lib(repository_ctx, nccl_install_path, nccl_version):
+  """Finds the given NCCL library on the system.
+
+  Args:
+    repository_ctx: The repository context.
+    nccl_install_path: The NCCL library installation directory.
+    nccl_version: The version of NCCL library files as returned
+      by _nccl_version.
+
+  Returns:
+    The path to the NCCL library.
+  """
+  lib_path = repository_ctx.path("%s/lib/libnccl.so.%s" % (nccl_install_path,
+                                                           nccl_version))
+  if not lib_path.exists:
+    auto_configure_fail("Cannot find NCCL library %s" % str(lib_path))
+  return lib_path
+
+
+def _nccl_configure_impl(repository_ctx):
+  """Implementation of the nccl_configure repository rule."""
+  if _TF_NCCL_VERSION not in repository_ctx.os.environ:
+    # Add a dummy build file to make bazel query happy.
+    repository_ctx.file("BUILD", _NCCL_DUMMY_BUILD_CONTENT)
+    return
+
+  nccl_version = repository_ctx.os.environ[_TF_NCCL_VERSION].strip()
+  if matches_version("1", nccl_version):
+    # Alias to GitHub target from @nccl_archive.
+    if not matches_version(nccl_version, "1.3"):
+      auto_configure_fail(
+          "NCCL from GitHub must use version 1.3 (got %s)" % nccl_version)
+    repository_ctx.file("BUILD", _NCCL_ARCHIVE_BUILD_CONTENT)
+  else:
+    # Create target for locally installed NCCL.
+    nccl_install_path = repository_ctx.os.environ[_NCCL_INSTALL_PATH].strip()
+    _check_nccl_version(repository_ctx, nccl_install_path, nccl_version)
+    repository_ctx.symlink(nccl_install_path, "nccl")
+    repository_ctx.file("BUILD", _NCCL_LOCAL_BUILD_TEMPLATE % nccl_version)
+
+
+nccl_configure = repository_rule(
+    implementation=_nccl_configure_impl,
+    environ=[
+        _NCCL_INSTALL_PATH,
+        _TF_NCCL_VERSION,
+    ],
+)
+"""Detects and configures the NCCL configuration.
+
+Add the following to your WORKSPACE FILE:
+
+```python
+nccl_configure(name = "local_config_nccl")
+```
+
+Args:
+  name: A unique name for this workspace rule.
+"""
diff --git a/third_party/png.BUILD b/third_party/png.BUILD
index 6a7ad719aa7d02855e0b319553ea8e86209553c1..76ab32d69c35055b3796b8f612133394758db330 100644
--- a/third_party/png.BUILD
+++ b/third_party/png.BUILD
@@ -9,15 +9,20 @@ cc_library(
     name = "png",
     srcs = [
         "png.c",
+        "pngdebug.h",
         "pngerror.c",
         "pngget.c",
+        "pnginfo.h",
+        "pnglibconf.h",
         "pngmem.c",
         "pngpread.c",
+        "pngpriv.h",
         "pngread.c",
         "pngrio.c",
         "pngrtran.c",
         "pngrutil.c",
         "pngset.c",
+        "pngstruct.h",
         "pngtrans.c",
         "pngwio.c",
         "pngwrite.c",
@@ -33,3 +38,10 @@ cc_library(
     visibility = ["//visibility:public"],
     deps = ["@zlib_archive//:zlib"],
 )
+
+genrule(
+    name = "snappy_stubs_public_h",
+    srcs = ["scripts/pnglibconf.h.prebuilt"],
+    outs = ["pnglibconf.h"],
+    cmd = "sed -e 's/PNG_ZLIB_VERNUM 0/PNG_ZLIB_VERNUM 0x12b0/' $< >$@",
+)
diff --git a/third_party/snappy.BUILD b/third_party/snappy.BUILD
index fd48ed8941e159a8d6176ef3f4e1982d6600e1c2..cc11f52d0eb3e04ad1fde6b2c8ba41e4baad5417 100644
--- a/third_party/snappy.BUILD
+++ b/third_party/snappy.BUILD
@@ -4,25 +4,12 @@ licenses(["notice"])  # BSD 3-Clause
 
 exports_files(["COPYING"])
 
-config_setting(
-    name = "windows",
-    values = {"cpu": "x64_windows"},
-    visibility = ["//visibility:public"],
-)
-
-config_setting(
-    name = "windows_msvc",
-    values = {"cpu": "x64_windows_msvc"},
-    visibility = ["//visibility:public"],
-)
-
 cc_library(
     name = "snappy",
     srcs = [
+        "config.h",
         "snappy.cc",
         "snappy.h",
-        "snappy-c.cc",
-        "snappy-c.h",
         "snappy-internal.h",
         "snappy-sinksource.cc",
         "snappy-sinksource.h",
@@ -32,30 +19,85 @@ cc_library(
     ],
     hdrs = ["snappy.h"],
     copts = select({
-        ":windows": [],
-        ":windows_msvc": [],
+        "@org_tensorflow//tensorflow:windows": [
+            "/DHAVE_CONFIG_H",
+            "/EHsc",
+        ],
+        "@org_tensorflow//tensorflow:windows_msvc": [
+            "/DHAVE_CONFIG_H",
+            "/EHsc",
+        ],
         "//conditions:default": [
+            "-DHAVE_CONFIG_H",
+            "-fno-exceptions",
+            "-Wno-sign-compare",
             "-Wno-shift-negative-value",
             "-Wno-implicit-function-declaration",
         ],
     }),
 )
 
+genrule(
+    name = "config_h",
+    outs = ["config.h"],
+    cmd = "\n".join([
+        "cat <<'EOF' >$@",
+        "#define HAVE_STDDEF_H 1",
+        "#define HAVE_STDINT_H 1",
+        "",
+        "#ifdef __has_builtin",
+        "#  if !defined(HAVE_BUILTIN_EXPECT) && __has_builtin(__builtin_expect)",
+        "#    define HAVE_BUILTIN_EXPECT 1",
+        "#  endif",
+        "#  if !defined(HAVE_BUILTIN_CTZ) && __has_builtin(__builtin_ctzll)",
+        "#    define HAVE_BUILTIN_CTZ 1",
+        "#  endif",
+        "#elif defined(__GNUC__) && (__GNUC__ > 3 || __GNUC__ == 3 && __GNUC_MINOR__ >= 4)",
+        "#  ifndef HAVE_BUILTIN_EXPECT",
+        "#    define HAVE_BUILTIN_EXPECT 1",
+        "#  endif",
+        "#  ifndef HAVE_BUILTIN_CTZ",
+        "#    define HAVE_BUILTIN_CTZ 1",
+        "#  endif",
+        "#endif",
+        "",
+        "#ifdef __has_include",
+        "#  if !defined(HAVE_BYTESWAP_H) && __has_include(<byteswap.h>)",
+        "#    define HAVE_BYTESWAP_H 1",
+        "#  endif",
+        "#  if !defined(HAVE_UNISTD_H) && __has_include(<unistd.h>)",
+        "#    define HAVE_UNISTD_H 1",
+        "#  endif",
+        "#  if !defined(HAVE_SYS_ENDIAN_H) && __has_include(<sys/endian.h>)",
+        "#    define HAVE_SYS_ENDIAN_H 1",
+        "#  endif",
+        "#  if !defined(HAVE_SYS_MMAN_H) && __has_include(<sys/mman.h>)",
+        "#    define HAVE_SYS_MMAN_H 1",
+        "#  endif",
+        "#  if !defined(HAVE_SYS_UIO_H) && __has_include(<sys/uio.h>)",
+        "#    define HAVE_SYS_UIO_H 1",
+        "#  endif",
+        "#endif",
+        "",
+        "#ifndef SNAPPY_IS_BIG_ENDIAN",
+        "#  ifdef __s390x__",
+        "#    define SNAPPY_IS_BIG_ENDIAN 1",
+        "#  elif defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__",
+        "#    define SNAPPY_IS_BIG_ENDIAN 1",
+        "#  endif",
+        "#endif",
+        "EOF",
+    ]),
+)
+
 genrule(
     name = "snappy_stubs_public_h",
     srcs = ["snappy-stubs-public.h.in"],
     outs = ["snappy-stubs-public.h"],
     cmd = ("sed " +
-           "-e 's/@ac_cv_have_stdint_h@/1/g' " +
-           "-e 's/@ac_cv_have_stddef_h@/1/g' " +
-           "-e 's/@ac_cv_have_stdint_h@/1/g' " +
-           select({
-               "@org_tensorflow//tensorflow:windows": "-e 's/@ac_cv_have_sys_uio_h@/0/g' ",
-               "@org_tensorflow//tensorflow:windows_msvc": "-e 's/@ac_cv_have_sys_uio_h@/0/g' ",
-               "//conditions:default": "-e 's/@ac_cv_have_sys_uio_h@/1/g' ",
-           }) +
-           "-e 's/@SNAPPY_MAJOR@/1/g' " +
-           "-e 's/@SNAPPY_MINOR@/1/g' " +
-           "-e 's/@SNAPPY_PATCHLEVEL@/4/g' " +
+           "-e 's/$${\\(.*\\)_01}/\\1/g' " +
+           "-e 's/$${SNAPPY_MAJOR}/1/g' " +
+           "-e 's/$${SNAPPY_MINOR}/1/g' " +
+           "-e 's/$${SNAPPY_PATCHLEVEL}/4/g' " +
            "$< >$@"),
 )
diff --git a/third_party/sycl/BUILD b/third_party/sycl/BUILD
index fbdf19f2054cf01aec44e3fcb13d0d0a2ff6f914..f631b6df06d13b4ecf09aed3d810f02b996f197e 100644
--- a/third_party/sycl/BUILD
+++ b/third_party/sycl/BUILD
@@ -1,15 +1,3 @@
 package(default_visibility = ["//visibility:public"])
 
 licenses(["notice"])  # Apache 2.0
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/third_party/sycl/sycl/BUILD b/third_party/sycl/sycl/BUILD
index bc1d18b7b572f7c5cec1241949d2c3eff3d8b16b..b045609954335ab160b52bb5faf3513fa9b9e1fd 100644
--- a/third_party/sycl/sycl/BUILD
+++ b/third_party/sycl/sycl/BUILD
@@ -5,15 +5,3 @@
 package(default_visibility = ["//visibility:public"])
 
 licenses(["notice"])  # Apache 2.0
-
-filegroup(
-    name = "all_files",
-    srcs = glob(
-        ["**/*"],
-        exclude = [
-            "**/METADATA",
-            "**/OWNERS",
-        ],
-    ),
-    visibility = ["//tensorflow:__subpackages__"],
-)
diff --git a/third_party/zlib.BUILD b/third_party/zlib.BUILD
index d164ee719c1fa4a304b82f223a432b9d087db827..e8048dd98adcca2ad6fa07fd582d2090901660e3 100644
--- a/third_party/zlib.BUILD
+++ b/third_party/zlib.BUILD
@@ -2,18 +2,6 @@ package(default_visibility = ["//visibility:public"])
 
 licenses(["notice"])  # BSD/MIT-like license (for zlib)
 
-config_setting(
-    name = "windows",
-    values = {"cpu": "x64_windows"},
-    visibility = ["//visibility:public"],
-)
-
-config_setting(
-    name = "windows_msvc",
-    values = {"cpu": "x64_windows_msvc"},
-    visibility = ["//visibility:public"],
-)
-
 cc_library(
     name = "zlib",
     srcs = [
@@ -45,8 +33,8 @@ cc_library(
     ],
     hdrs = ["zlib.h"],
     copts = select({
-        ":windows": [],
-        ":windows_msvc": [],
+        "@org_tensorflow//tensorflow:windows": [],
+        "@org_tensorflow//tensorflow:windows_msvc": [],
         "//conditions:default": [
             "-Wno-shift-negative-value",
             "-DZ_HAVE_UNISTD_H",